Create New Variables - Variable(s) with Outliers Removed

From Q
Jump to navigation Jump to search

Create new variable(s) with any outlying values in the selected variable(s) replaced with NaN

This QScript checks the selected data for outliers and creates new copies of the data with the outliers removed. Outliers are defined as values that are not within a certain number of standard deviations from the variable mean. The new copies of data will have the outlying values replaced with NaN. Data that does not contain outliers will not be copied.

Technical details

You will be asked to specify:

  1. The data to check.
  2. The cut-off value to use to define outliers. Respondents whose value is not within this many standard deviations of the mean of a variable will be considered outliers. The default value is 3 standard deviations.

A new folder will be created in the report tree that contains tables for the selected data and any new copies of data with the outliers removed.

The new copies of variables use a JavaScript formula that assigns a value of NaN to respondents with outlying values. The means and standard deviations are determined when this script is run. As a result, the definition of an outlier in variables where the outliers have been removed will not be updated if the underlying data changes.

How to apply this QScript

  • Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
  • Click on the QScript when it appears in the QScripts and Rules section of the search results.

OR

  • Select Automate > Browse Online Library.
  • Select this QScript from the list.

Customizing the QScript

This QScript is written in JavaScript and can be customized by copying and modifying the JavaScript.

Customizing QScripts in Q4.11 and more recent versions

  • Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
  • Hover your mouse over the QScript when it appears in the QScripts and Rules section of the search results.
  • Press Edit a Copy (bottom-left corner of the preview).
  • Modify the JavaScript (see QScripts for more detail on this).
  • Either:
    • Run the QScript, by pressing the blue triangle button.
    • Save the QScript and run it at a later time, using Automate > Run QScript (Macro) from File.

Customizing QScripts in older versions

  • Copy the JavaScript shown on this page.
  • Create a new text file, giving it a file extension of .QScript. See here for more information about how to do this.
  • Modify the JavaScript (see QScripts for more detail on this).
  • Run the file using Automate > Run QScript (Macro) from File.

JavaScript

// The script checks for outliers in selected variables, and creates new variables with the outliers
// removed if any are found.
 
includeWeb("QScript Selection Functions");
includeWeb("QScript Functions to Generate Outputs"); 
includeWeb("QScript Table Functions"); 

 
// Run the script, then tell the user whether it completed or was cancelled.
if (main()) {
    conditionallyEmptyLog("QScript finished.");
} else {
    log("QScript cancelled.");
}
 
// Entry point of the script.
//
// Asks the user for the questions to check and for the cut-off (a number of standard
// deviations), adds a summary table for every visible selected question to a new
// "Checked for outliers" group, and, for each question in which checkVariableForOutliers
// reports an outlier, builds a copy of the question whose out-of-range values are
// replaced with NaN.
//
// Returns false when no questions are selected or the selection fails validation,
// and true otherwise.
function main() {
    var SUMMARY_STATISTICS = ['Average', 'Standard Deviation', 'Minimum', 'Maximum', 'Base n', 'Missing n'];
    var TABLE_PAD = 20;

    // Appends a SUMMARY table showing table_question to a report group or page.
    function appendSummaryTable(container, table_question) {
        var table = container.appendTable();
        table.primary = table_question;
        table.secondary = "SUMMARY";
        table.cellStatistics = SUMMARY_STATISTICS.slice();
        return table;
    }

    // Converts the response from the prompt into a number. A blank response gives NaN,
    // so the user is asked again instead of a blank silently acting as a cut-off of 0.
    function parseCutOff(entered) {
        if (typeof entered === "string" && entered.replace(/\s/g, "") === "")
            return NaN;
        return Number(entered);
    }

    // Builds the formula for the copy of a variable. The bounds are inclusive so that the
    // formula agrees with checkVariableForOutliers, which only treats values strictly
    // outside the bounds as outliers; with exclusive bounds a constant variable
    // (standard deviation of 0) would be blanked entirely.
    function buildOutlierExpression(v_name, min_permissable, max_permissable) {
        // No usable bounds (for example fewer than two valid values), so nothing can be
        // classed as an outlier: copy the values unchanged.
        if (isNaN(min_permissable) || isNaN(max_permissable))
            return v_name;
        return "if (" + v_name + " >= " + min_permissable + " && " + v_name + " <= " + max_permissable + ") " + v_name + "; else NaN";
    }

    // Formats the list of names for the message logged in web mode.
    function makeWordList(words) {
        if (words.length == 1)
            return "New variable '" + words[0] + "' created. ";
        return "New variables '" + words.slice(0, words.length - 1).join("', '")
                 + "', and '" + words[words.length - 1] + "' created. ";
    }

    var web_mode = (!!Q.isOnTheWeb && Q.isOnTheWeb());
    var allowed_types = ["Numeric", "Numeric - Multi", "Numeric - Grid"];
    var questions = selectInputQuestions(allowed_types);
    if (!questions)
        return false;
    if (!areQuestionsValidAndNonEmpty(questions))
        return false;

    // Specify the number of standard deviations that defines an 'outlier'
    var number_sd_from_mean = NaN;
    while (isNaN(number_sd_from_mean)) {
        number_sd_from_mean = parseCutOff(prompt("Enter the cut-off value to use to identify outliers. Respondents whose value is not within this many standard deviations from the mean will be considered outliers.", 3));
        if (isNaN(number_sd_from_mean))
            alert('The cut-off value must be a number.');
    }

    // Begin the report (only used when not running on the web)
    var paragraphs = ["The tables below show data that has been checked for outliers.",
                      "Test: value is more than " + number_sd_from_mean + " standard deviations from the mean.",
                      "Where the data contains outliers, a copy of the data is made and it is labeled as 'OUTLIERS REMOVED'",
                      " "];

    // creating the tables in the report tree
    var group = project.report.appendGroup();
    group.name = "Checked for outliers";

    // Check each of the input variables and construct new variables where needed
    var outliers_found = false;
    var outlier_list = [];
    questions.forEach(function (question) {
        if (question.isHidden)
            return;

        var data_file = question.dataFile;
        var new_table;
        var new_table2;
        var page;
        var side_by_side;
        var max_width;
        var max_height;

        // Make a table for the original question
        if (!web_mode) {
            new_table = appendSummaryTable(group, question);
        } else {
            // Position both tables side by side, not possible with "TwoItems" page, so use left/height properties
            page = group.appendPage('Blank');
            page.name = question.name;
            new_table = appendSummaryTable(page, question);

            max_width = page.width - 2 * TABLE_PAD;
            max_height = page.height - 3 * TABLE_PAD;  // need space for description text
            if (new_table.height > page.height - TABLE_PAD) {  // place tall tables side-by-side
                side_by_side = true;
                new_table.left = TABLE_PAD;
                max_width = max_width - TABLE_PAD;  // account for middle space
                if (new_table.width > max_width / 2)
                    new_table.width = max_width / 2;
                if (new_table.height > max_height) {
                    new_table.top = TABLE_PAD;
                    new_table.height = max_height;
                }
            } else {  // place 2nd table underneath first
                side_by_side = false;
                new_table.top = TABLE_PAD;
                max_height = max_height - TABLE_PAD;  // account for middle space
                if (new_table.height > max_height / 2)
                    new_table.height = max_height / 2;
                if (new_table.width > max_width) {
                    new_table.left = TABLE_PAD;
                    new_table.width = max_width;
                }
            }
        }

        // Check the statistics for each variable in the question to determine
        // if there are any outliers
        var variables = question.variables;
        var outlier_data = variables.map(function (variable) {
            return checkVariableForOutliers(variable, number_sd_from_mean);
        });
        var outliers = outlier_data.some(function (obj) {
            return obj.hasOutliers;
        });
        if (!outliers)
            return;

        // Generate new variables for every variable in the question, because at least
        // one of them contains outliers
        outliers_found = true;
        outlier_list.push(question.name);
        var new_variables = [];
        for (var v = 0; v < variables.length; v++) {
            var v_name = variables[v].name;
            var expression = buildOutlierExpression(v_name, outlier_data[v].minPermissable, outlier_data[v].maxPermissable);
            try {
                new_variables.push(data_file.newJavaScriptVariable(expression, false, preventDuplicateVariableName(data_file, v_name + "_noOutliers"), variables[v].label, null));
            } catch (e) {
                log("Could not identify outliers in " + v_name + ": " + e);
                // forEach ignores the callback's return value, so this abandons the rest
                // of this question only; the remaining questions are still processed.
                return;
            }
        }
        var new_question = data_file.setQuestion(preventDuplicateQuestionName(data_file, question.name + " OUTLIERS REMOVED"), question.questionType, new_variables);
        insertAtHoverButtonIfShown(new_question);

        // Make a table for the copy with the outliers removed
        if (!web_mode) {
            new_table = appendSummaryTable(group, new_question);
            return;
        }

        new_table2 = appendSummaryTable(page, new_question);
        if (side_by_side) {  // place to right of tall table
            new_table2.left = new_table.width + 2 * TABLE_PAD;
            if (new_table2.width > max_width / 2)
                new_table2.width = max_width / 2;
            if (new_table2.height > max_height) {
                new_table2.top = TABLE_PAD;
                new_table2.height = max_height;
            }
        } else {  // place 2nd table underneath first
            new_table2.top = new_table.height + 2 * TABLE_PAD;
            if (new_table2.height > max_height / 2)
                new_table2.height = max_height / 2;
            if (new_table2.width > max_width) {
                new_table2.left = TABLE_PAD;
                new_table2.width = max_width;
            }
        }

        // Describe the cut-off that was actually used rather than assuming the default
        var descriptive_text = page.appendText();
        descriptive_text.text = "Respondents with values larger than " + number_sd_from_mean + " standard deviations from mean have been removed.";
        if (side_by_side)
            descriptive_text.top = 2 * TABLE_PAD + new_table.height;
        else
            descriptive_text.top = 3 * TABLE_PAD + new_table.height + new_table2.height;
    });

    if (!web_mode) {
        if (outliers_found) {
            paragraphs.push("Outliers found in:");
            paragraphs.push("");
            paragraphs = paragraphs.concat(outlier_list);
        } else
            paragraphs.push('No outliers found');

        simpleHTMLReport(paragraphs, "Checked for outliers", group, true, false);
    } else {
        if (outliers_found) {
            log(makeWordList(outlier_list)
                + "Tables showing the variables with and without outliers removed have been added to the bottom of the document."
                + " Respondents whose value are not within " + number_sd_from_mean
                + " standard deviations from the mean were considered outliers.");
        } else
            log("No outliers detected. Tables showing a summary of the original variables have been added to the bottom of the document.");
    }
    return true;
}

 
// Checks a variable for outliers.
// Works out whether any value of a variable lies further than number_sd_from_mean
// standard deviations from the variable's mean. The mean and the sample standard
// deviation (n - 1 denominator) are computed from variable.rawValues without weights;
// null, undefined and NaN entries are left out of the calculation.
//
// Returns an object with:
//   hasOutliers    - true when the smallest value is below, or the largest value is
//                    above, the permissible range
//   maxPermissable - mean plus number_sd_from_mean standard deviations
//   minPermissable - mean minus number_sd_from_mean standard deviations
function checkVariableForOutliers(variable, number_sd_from_mean) {
    var values = variable.rawValues;

    // A value takes part in the statistics only when it is neither missing nor NaN.
    function isUsable(value) {
        return !(value == null || isNaN(value));
    }

    // First pass: count, total and range of the usable values.
    var count = 0;
    var sum = 0;
    var smallest = Infinity;
    var largest = -Infinity;
    var idx;
    for (idx = 0; idx < values.length; idx++) {
        var current = values[idx];
        if (!isUsable(current))
            continue;
        sum += current;
        count++;
        if (current > largest)
            largest = current;
        if (current < smallest)
            smallest = current;
    }
    var average = sum / count;

    // Second pass: sum of squared deviations from the mean.
    var squared_deviations = 0;
    for (idx = 0; idx < values.length; idx++) {
        var observed = values[idx];
        if (isUsable(observed)) {
            var deviation = observed - average;
            squared_deviations += deviation * deviation;
        }
    }
    var std_dev = Math.sqrt(squared_deviations / (count - 1));

    var allowed_distance = number_sd_from_mean * std_dev;
    var lower_bound = average - allowed_distance;
    var upper_bound = average + allowed_distance;

    return {
        hasOutliers: smallest < lower_bound || largest > upper_bound,
        maxPermissable: upper_bound,
        minPermissable: lower_bound
    };
}

See also