Create New Variables - Variable(s) with Outliers Removed
Create new variable(s) with any outlying values in the selected variable(s) replaced with NaN
This QScript checks the selected data for outliers and creates new copies of the data with the outliers removed. Outliers are defined as values that are not within a certain number of standard deviations from the variable mean. The new copies of data will have the outlying values replaced with NaN. Data that does not contain outliers will not be copied.
Technical details
You will be asked to specify:
- The data to check.
- The cut-off value to use to define outliers. Respondents whose value is not within this many standard deviations of the mean of a variable will be considered outliers. The default value is 3 standard deviations.
A new folder will be created in the report tree that contains tables for the selected data and any new copies of data with the outliers removed.
The new copies of variables use a JavaScript formula to assign a value of NaN to respondents with outlying values. The means and standard deviations are determined when this script is run. As a result, the definition of an outlier in the variables with outliers removed will not be updated if the underlying data changes.
How to apply this QScript
- Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
- Click on the QScript when it appears in the QScripts and Rules section of the search results.
OR
- Select Automate > Browse Online Library.
- Select this QScript from the list.
Customizing the QScript
This QScript is written in JavaScript and can be customized by copying and modifying the JavaScript.
Customizing QScripts in Q4.11 and more recent versions
- Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
- Hover your mouse over the QScript when it appears in the QScripts and Rules section of the search results.
- Press Edit a Copy (bottom-left corner of the preview).
- Modify the JavaScript (see QScripts for more detail on this).
- Either:
- Run the QScript, by pressing the blue triangle button.
- Save the QScript and run it at a later time, using Automate > Run QScript (Macro) from File.
Customizing QScripts in older versions
JavaScript
// Looks for outliers in the selected variables. For any data that contains outliers, new
// variables are created with the outliers removed.
includeWeb("QScript Selection Functions");
includeWeb("QScript Functions to Generate Outputs");
includeWeb("QScript Table Functions");

// Run the script and report how it ended: main() returns a falsy value when it stops early.
if (main()) {
    conditionallyEmptyLog("QScript finished.");
} else {
    log("QScript cancelled.");
}
// Checks the selected numeric questions for outliers. For every question in which at least one
// variable has outliers, a copy of the question is created in which the outlying values are
// replaced with NaN, and summary tables of the original and the copy are added to a new report
// group.
//
// Returns false when no questions are selected or the selection is not valid, otherwise true.
function main() {
    // Padding used when positioning tables and text on a page (web mode only)
    var TABLE_PAD = 20;
    var SUMMARY_STATISTICS = ['Average', 'Standard Deviation', 'Minimum', 'Maximum', 'Base n', 'Missing n'];

    // Appends a SUMMARY table of `question` to `container` (a report group or a page).
    function appendSummaryTable(container, question) {
        var table = container.appendTable();
        table.primary = question;
        table.secondary = "SUMMARY";
        // Each table gets its own copy of the statistics list
        table.cellStatistics = SUMMARY_STATISTICS.slice();
        return table;
    }

    // Builds the "New variable(s) ... created. " sentence, quoting every name.
    function makeWordList(words) {
        if (words.length == 1)
            return "New variable '" + words[0] + "' created. ";
        return "New variables '" + words.slice(0, words.length - 1).join("', '")
            + "', and '" + words[words.length - 1] + "' created. ";
    }

    // Builds the JavaScript formula for the copy of one variable.
    // A variable without outliers is copied unchanged, so that values sitting exactly on a bound
    // and variables whose bounds could not be computed (NaN) are not wiped out.
    // For a variable with outliers the bounds are inclusive, which matches the test used by
    // checkVariableForOutliers (a value is an outlier only when strictly outside the bounds).
    function outliersRemovedExpression(variable_name, bounds) {
        if (!bounds.hasOutliers)
            return variable_name;
        return "if (" + variable_name + " >= " + bounds.minPermissable + " && "
            + variable_name + " <= " + bounds.maxPermissable + ") " + variable_name + "; else NaN";
    }

    var web_mode = (!!Q.isOnTheWeb && Q.isOnTheWeb());
    var allowed_types = ["Numeric", "Numeric - Multi", "Numeric - Grid"];
    var questions = selectInputQuestions(allowed_types);
    if (!questions)
        return false;
    if (!areQuestionsValidAndNonEmpty(questions))
        return false;

    // Specify the number of standard deviations that defines an 'outlier'
    var number_sd_from_mean;
    while (isNaN(number_sd_from_mean)) {
        number_sd_from_mean = prompt("Enter the cut-off value to use to identify outliers. Respondents whose value is not within this many standard deviations from the mean will be considered outliers.", 3);
        if (isNaN(number_sd_from_mean))
            alert('The cut-off value must be a number.');
    }

    // Begin the report (only used when not running on the web)
    var paragraphs = ["The tables below show data that has been checked for outliers.",
        "Test: value is more than " + number_sd_from_mean + " standard deviations from the mean.",
        "Where the data contains outliers, a copy of the data is made and it is labeled as 'OUTLIERS REMOVED'",
        " "];

    // Group in the report tree that receives the tables
    var group = project.report.appendGroup();
    group.name = "Checked for outliers";

    var outliers_found = false;
    var outlier_list = [];        // names of the original questions that contain outliers
    var new_question_names = [];  // names of the copies that were created

    // Check each of the input questions and construct new variables where needed
    questions.forEach(function (question) {
        if (question.isHidden)
            return;

        var data_file = question.dataFile;
        var original_table;
        var page;
        var side_by_side;
        var max_width;
        var max_height;

        // Make a table for the original question
        if (!web_mode) {
            original_table = appendSummaryTable(group, question);
        } else {
            // Both tables go on one page. This is not possible with a "TwoItems" page, so a
            // blank page is used and the tables are positioned with left/top/width/height.
            page = group.appendPage('Blank');
            page.name = question.name;
            original_table = appendSummaryTable(page, question);
            max_width = page.width - 2 * TABLE_PAD;
            max_height = page.height - 3 * TABLE_PAD; // need space for description text
            side_by_side = original_table.height > page.height - TABLE_PAD;
            if (side_by_side) { // tall table: the second table goes to its right
                original_table.left = TABLE_PAD;
                max_width = max_width - TABLE_PAD; // account for middle space
                if (original_table.width > max_width / 2)
                    original_table.width = max_width / 2;
                if (original_table.height > max_height) {
                    original_table.top = TABLE_PAD;
                    original_table.height = max_height;
                }
            } else { // the second table goes underneath the first
                original_table.top = TABLE_PAD;
                max_height = max_height - TABLE_PAD; // account for middle space
                if (original_table.height > max_height / 2)
                    original_table.height = max_height / 2;
                if (original_table.width > max_width) {
                    original_table.left = TABLE_PAD;
                    original_table.width = max_width;
                }
            }
        }

        // Check the statistics for each variable in the question to determine
        // if there are any outliers
        var outlier_data = question.variables.map(function (v) {
            return checkVariableForOutliers(v, number_sd_from_mean);
        });
        var has_outliers = outlier_data.some(function (obj) {
            return obj.hasOutliers;
        });
        if (!has_outliers)
            return;

        // Generate new variables for every variable in the question
        outliers_found = true;
        outlier_list.push(question.name);
        var variables = question.variables;
        var new_variables = [];
        for (var v = 0; v < variables.length; v++) {
            var v_name = variables[v].name;
            var expression = outliersRemovedExpression(v_name, outlier_data[v]);
            try {
                new_variables.push(data_file.newJavaScriptVariable(expression, false, preventDuplicateVariableName(data_file, v_name + "_noOutliers"), variables[v].label, null));
            } catch (e) {
                log("Could not idenitify outliers in " + v_name + ": " + e);
                // Leaves this forEach callback only: the rest of this question is skipped and
                // the remaining questions are still processed.
                return;
            }
        }
        var new_question = data_file.setQuestion(preventDuplicateQuestionName(data_file, question.name + " OUTLIERS REMOVED"), question.questionType, new_variables);
        new_question_names.push(new_question.name);
        insertAtHoverButtonIfShown(new_question);

        // Make a table for the copy
        if (!web_mode) {
            appendSummaryTable(group, new_question);
            return;
        }
        var copy_table = appendSummaryTable(page, new_question);
        if (side_by_side) { // place to right of tall table
            copy_table.left = original_table.width + 2 * TABLE_PAD;
            if (copy_table.width > max_width / 2)
                copy_table.width = max_width / 2;
            if (copy_table.height > max_height) {
                copy_table.top = TABLE_PAD;
                copy_table.height = max_height;
            }
        } else { // place 2nd table underneath first
            copy_table.top = original_table.height + 2 * TABLE_PAD;
            if (copy_table.height > max_height / 2)
                copy_table.height = max_height / 2;
            if (copy_table.width > max_width) {
                copy_table.left = TABLE_PAD;
                copy_table.width = max_width;
            }
        }
        // The description reports the cut-off that was actually entered
        var descriptive_text = page.appendText();
        descriptive_text.text = "Respondents with values more than " + number_sd_from_mean
            + " standard deviations from the mean have been removed.";
        if (side_by_side)
            descriptive_text.top = 2 * TABLE_PAD + original_table.height;
        else
            descriptive_text.top = 3 * TABLE_PAD + original_table.height + copy_table.height;
    });

    if (!web_mode) {
        if (outliers_found) {
            paragraphs.push("Outliers found in:");
            paragraphs.push("");
            paragraphs = paragraphs.concat(outlier_list);
        } else
            paragraphs.push('No outliers found');
        simpleHTMLReport(paragraphs, "Checked for outliers", group, true, false);
    } else if (new_question_names.length > 0) {
        log(makeWordList(new_question_names)
            + "Tables showing the variables with and without outliers removed have been added to the bottom of the document."
            + " Respondents whose value are not within " + number_sd_from_mean
            + " standard deviations from the mean were considered outliers.");
    } else if (!outliers_found) {
        log("No outliers detected. Tables showing a summary of the original variables have been added to the bottom of the document.");
    }
    // When outliers were found but no copy could be created, the failures have already been logged.
    return true;
}
// Decides whether a variable contains outliers.
//
// variable:             object whose rawValues list holds the observations; entries that are
//                       null/undefined or NaN are ignored.
// number_sd_from_mean:  how many standard deviations away from the mean a value may lie before
//                       it counts as an outlier.
//
// Returns an object with:
//   hasOutliers     true when the smallest or largest observation lies strictly outside the bounds
//   maxPermissable  mean + number_sd_from_mean * standard deviation
//   minPermissable  mean - number_sd_from_mean * standard deviation
// The mean and the (n - 1) standard deviation are unweighted.
function checkVariableForOutliers(variable, number_sd_from_mean) {
    var raw = variable.rawValues;

    // Collect the usable observations, in their original order
    var observations = [];
    for (var idx = 0; idx < raw.length; idx++) {
        var candidate = raw[idx];
        if (candidate == null || isNaN(candidate))
            continue;
        observations.push(candidate);
    }
    var count = observations.length;

    // First pass: total and range
    var sum = 0;
    var smallest = Infinity;
    var largest = -Infinity;
    observations.forEach(function (value) {
        sum += value;
        if (value > largest)
            largest = value;
        if (value < smallest)
            smallest = value;
    });
    var average = sum / count;

    // Second pass: sum of squared deviations from the mean
    var squared_deviations = 0;
    observations.forEach(function (value) {
        squared_deviations += (value - average) * (value - average);
    });
    var standard_deviation = Math.sqrt(squared_deviations / (count - 1));

    var lower_bound = average - number_sd_from_mean * standard_deviation;
    var upper_bound = average + number_sd_from_mean * standard_deviation;
    return {
        hasOutliers: smallest < lower_bound || largest > upper_bound,
        maxPermissable: upper_bound,
        minPermissable: lower_bound
    };
}
See also
- QScript for more general information about QScripts.
- QScript Examples Library for other examples.
- Online JavaScript Libraries for the libraries of functions that can be used when writing QScripts.
- QScript Reference for information about how QScript can manipulate the different elements of a project.
- JavaScript for information about the JavaScript programming language.
- Table JavaScript and Plot JavaScript for tools for using JavaScript to modify the appearance of tables and charts.