Correlation - Scatterplot Matrix

From Q
Jump to navigation Jump to search

Create a matrix where the lower triangle displays scatterplots of variable pairs, the diagonal displays variable histograms and the upper triangle displays correlations of variable pairs. Also known as a scatter plot matrix or SPLOM.

Shows a matrix where the lower triangle displays scatterplots of variable pairs, the diagonal displays variable histograms and the upper triangle displays correlations of variable pairs. Also known as a scatter plot matrix or SPLOM.

How to Create a Scatterplot Matrix

  1. Add the object by selecting from the menu Anything > Advanced Analysis > Correlation > Scatterplot MatrixCreate > Correlation > Scatterplot Matrix
  2. Under Inputs > Variables to plot select the variables to include in the matrix.

Examples

An example of a scatterplot matrix with 3 variables and fitted LOESS curves:

An example of a scatterplot matrix with categorical variables:

Options

The options in the Object Inspector are organized into two tabs: Inputs and Properties.

Inputs

Variables to plot The variables that will be shown in the matrix. These can be numeric, categorical or ordered categorical.

Variable names Whether to show variable names in the matrix, instead of variable labels.

Missing data Method for dealing with missing data. See Missing Data Options.

Error if missing data
Exclude cases with missing data
Use partial data

Fitted line

None No fitted line is shown in each scatterplot.
Straight A line of best fit (from linear regression) is shown in each scatterplot.
LOESS A line generated using fitted values from locally weighted polynomial regression is shown in each scatterplot.

Scatter point type The type of plot point to display.

Small dots
Dots
Big dots
Circles

Scatter point modifications

None No modifications.
Jitter Add noise to each point so that areas with closely positioned points appear denser.
Enlarge points with multiple observations Multiple points with identical positions will appear as an enlarged point. This is the default.

Properties

This tab contains options for formatting the size of the object, as well as the underlying R code used to create the visualization, and the JavaScript code use to customize the Object Inspector itself (see Object Inspector for more details about these options). Additional options are available by editing the code.

Acknowledgements

Uses the R package weights.

Code

form.setHeading("Scatterplot Matrix");
form.dropBox({name: "formInput", label: "Variables to plot",
    types: ["V:numeric, categorical, ordered categorical"],
    multi: true, min_inputs: 2, height: 8});
form.checkBox({label: "Variable names", name: "formNames", default_value: false, prompt: "Display names instead of labels"});
form.comboBox({label: "Missing data", 
    alternatives: ["Error if missing data", "Exclude cases with missing data", "Use partial data"], 
    name: "formMissing", default_value: "Use partial data",
    prompt: "Options for handling cases with missing data"});
form.comboBox({label: "Fitted line",
            alternatives: ["None", "Straight", "LOESS"],
            name: "formLine", 
            default_value: "None", prompt: "Draw a fitted line or curve"});
form.comboBox({label: "Scatter point type",
            alternatives: ["Small dots", "Dots", "Big dots", "Circles"],
            name: "formSymbol",
            default_value: "Dots"})
form.comboBox({label: "Scatter point modifications",
            alternatives: ["None", "Jitter", "Enlarge points with multiple observations"],
            name: "formPointMods",
            default_value: "Enlarge points with multiple observations",
            prompt: "Options for dealing with overlapping points"})
# Random seed used for jitter
seed <- 123

# Scatterplot
panel.scatter <- function(x, y, pch = par("pch"), ...)
{
    if (formPointMods == "None")
    {
        points(x, y, pch = pch, ...)
    }
    else if (formPointMods == "Jitter")
    {
        set.seed(seed)
        jitter.x <- x + rnorm(length(x)) * (max(x, na.rm = TRUE) - min(x, na.rm = TRUE)) * 0.01
        jitter.y <- y + rnorm(length(y)) * (max(y, na.rm = TRUE) - min(y, na.rm = TRUE)) * 0.01
        points(jitter.x, jitter.y, pch = pch, ...)
    }
    else if (formPointMods == "Enlarge points with multiple observations")
    {
        c.hash <- hash()
        x.hash <- hash()
        y.hash <- hash()
        for (i in seq(x))
        {
            key <- paste(x[i], y[i], collapse = ",")

            if (has.key(key, c.hash))
                c.hash[[key]] <- c.hash[[key]] + wgt[i]
            else
            {
                c.hash[[key]] <- wgt[i]
                x.hash[[key]] <- x[i]
                y.hash[[key]] <- y[i]
            }
        }
        cex <- pmax(1, 10 * sqrt(values(c.hash) / sum(wgt)))
        points(values(x.hash), values(y.hash), pch = pch, cex = cex, ...)
    }
    else
        stop(paste("Point setting not handled:", formPointMods))
  
    if (formLine == "Straight")
    {
        reg <- lm(y ~ x, weights = wgt)
        abline(reg, col = "red", ...)
    }
    else if (formLine == "LOESS")
    {
        ok <- is.finite(x) & is.finite(y)
        if (any(ok))
        {
            xok <- x[ok]
            yok <- y[ok]
            j <- order(xok)
            suppressWarnings(fitted.y <- loess(yok ~ xok, weights = wgt[ok])$fitted)
            lines(xok[j], fitted.y[j], col = "red", ...)
        }
    }
}

# Histograms for the diagonal
panel.hist <- function(x, ...)
{
    usr <- par("usr")
    on.exit(par(usr))
    par(usr = c(usr[1:2], 0, 1.5) )
    h <- weights::wtd.hist(x, plot = FALSE, weight = wgt)
    breaks <- h$breaks
    nB <- length(breaks)
    y <- h$counts
    y <- y / max(y)
    rect(breaks[-nB], 0, breaks[-1], y, col = "gray")
}

# Correlations for the upper triangle
panel.cor <- function(x, y, digits = 2, ...)
{
    usr <- par("usr")
    on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- weights::wtd.cors(x, y, weight = wgt)
    txt <- FormatAsReal(r, decimals = digits)
    text(0.5, 0.5, txt, cex = 1.2)
}

# Variable names/labels in the diagonal
panel.label <- function(x, y, labels, cex, font, ...)
{
    text(0.5, 0.85, labels, cex = 1.2)
}

library(hash)
library(flipData)
library(flipFormat)
library(flipTransformations)

dat <- AsNumeric(ProcessQVariables(QDataFrame(formInput)), binary = FALSE)
# Changing names to labels.
if (!formNames)
    names(dat) <- ExtractCommonPrefix(Labels(dat))$shortened.labels
wgt <- if (is.null(QCalibratedWeight)) {
    rep(1, nrow(dat))
} else {
    QCalibratedWeight
}
dat <- dat[QFilter, ]
wgt <- wgt[QFilter]

processed.data <- if (formMissing == "Error if missing data") {
    ErrorIfMissingDataFound(dat)
} else if (formMissing == "Exclude cases with missing data") {
    RemoveCasesWithAnyNA(dat)
} else if (formMissing == "Use partial data") {
    RemoveCasesWithAllNA(dat)
} else
    stop(c("Option not handled:", formMissing))
wgt <- wgt[row.names(dat) %in% rownames(processed.data)]

pch <- if (formSymbol == "Small dots") {
    "."
} else if (formSymbol == "Dots") {
    20
} else if (formSymbol == "Big dots") {
    19
} else if (formSymbol == "Circles") {
    "O"
} else
    stop(paste("Symbol not found:", formSymbol))

scatterplotmatrix <- pairs(processed.data,
                           pch = pch,
                           lower.panel = panel.scatter,
                           diag.panel = panel.hist,
                           upper.panel = panel.cor,
                           text.panel = panel.label,
                           gap = 0.5)