Preliminary Project Setup - Tables for Data Checking
Jump to navigation
Jump to search
Read through a data set for data that likely needs to be examined or corrected, and create tables with cells that may be of interest highlighted
This QScript scans through a data file looking for data that likely needs to be examined or corrected, and creates tables with cells that may be of interest highlighted.
Technical details
It examines all tables containing non-Text and non-Date data and:
- Identifies all tables containing cells with sample sizes of less than 30.
- Identifies all tables containing Don't Know responses.
- Identifies all tables containing blank labels.
- Identifies all tables containing options chosen by 5 or fewer people, or, less than 1% respondents
- Identifies all empty tables
- Identifies numeric data containing outliers, where a variable contains values that are more than (less than) 3 standard deviations above (below) the mean
- Identifies questions where the Base n is not the same for all variables
Identified tables have the relevant cells highlighted in yellow (via Rules or Table JavaScript in earlier versions of Q). Data containing outliers will be shown as histograms.
How to apply this QScript
- Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
- Click on the QScript when it appears in the QScripts and Rules section of the search results.
OR
- Select Automate > Browse Online Library.
- Select this QScript from the list.
Customizing the QScript
This QScript is written in JavaScript and can be customized by copying and modifying the JavaScript.
Customizing QScripts in Q4.11 and more recent versions
- Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
- Hover your mouse over the QScript when it appears in the QScripts and Rules section of the search results.
- Press Edit a Copy (bottom-left corner of the preview).
- Modify the JavaScript (see QScripts for more detail on this).
- Either:
- Run the QScript, by pressing the blue triangle button.
- Save the QScript and run it at a later time, using Automate > Run QScript (Macro) from File.
Customizing QScripts in older versions
JavaScript
// Read in commonly-used functions from the Wiki
includeWeb('QScript Utility Functions');
includeWeb('QScript Questionnaire Functions');
includeWeb('QScript Selection Functions');
includeWeb('QScript Value Attributes Functions');
includeWeb('QScript Functions to Generate Outputs');
includeWeb('QScript Data Reduction Functions');
includeWeb('QScript Functions for Combining Categories');
includeWeb('QScript Functions for Processing Arrays');
includeWeb('QScript Table Functions');
includeWeb('JavaScript Utilities');
includeWeb('QScript Functions to Generate Outputs');
main();
function main() {
var is_displayr = (!!Q.isOnTheWeb && Q.isOnTheWeb());
var structure_name = is_displayr ? "variable sets" : "questions";
var selected_datafile = requestOneDataFileFromProject(true, true, is_displayr);
var questions = getAllQuestionsByTypes([selected_datafile], ["Pick One","Pick One - Multi",
"Number","Number - Multi","Number - Grid","Pick Any","Pick Any - Compact","Pick Any - Grid"]);
questions = questions.filter(function (q) { return q.isValid; });
var num_questions = questions.length;
var check_questions = [];
var outlier_questions = [];
var inconsistent_base_questions = [];
var empty_questions = [];
var display_n = [];
var display_base_n = [];
// check for large tabs
var large_and_small = splitArrayIntoApplicableAndNotApplicable(questions, questionsYieldLargeTabs)
var too_large = large_and_small.applicable;
var skip_large = false;
if (too_large.length > 0) {
skip_large = askYesNo("Some " + structure_name +
" in your data set will produce very large tables that will be slow " +
" to calculate and check. Do you want to skip them?\r\n\r\n" +
too_large.map(function(q) { return q.name; }).join("\r\n"));
if (skip_large)
questions = large_and_small.notApplicable;
}
var temporary_table = project.report.appendTable();
var questions_containing_dk = [];
questions.forEach(function (current_question){
var q_type = current_question.questionType;
if (q_type.indexOf("Number") == 0)
if (questionHasOutliers(current_question, 3))
outlier_questions.push(current_question);
temporary_table.primary = current_question;
temporary_table.secondary = "SUMMARY";
var statistics = getStatisticsFromTable(temporary_table, ["Base n", "n","%","Row %"]);
if (statistics == null) {
empty_questions.push(current_question);
} else {
if (current_question.variables.length > 1 && !statisticIsConstant(statistics["Base n"])) {
inconsistent_base_questions.push(current_question);
} else if (containsDontKnow(current_question) || containsBlankLabels(current_question))
check_questions.push(current_question);
else {
if (statistics == null) {
check_questions.push(current_question);
} else {
var not_in_yet = true;
if (not_in_yet && statistics["Base n"] != null && minWithNaNasNegativeInfinity(statistics["Base n"]) < 30){
not_in_yet = false;
display_base_n.push(current_question.name);
}
if (not_in_yet && statistics["n"] != null && minWithNaNasNegativeInfinity(statistics["n"]) < 5){
not_in_yet = false;
display_n.push(current_question.name);
}
if (not_in_yet && statistics["%"] != null && minWithNaNasNegativeInfinity(statistics["%"]) < 1.0)
not_in_yet = false;
if(!not_in_yet)
check_questions.push(current_question);
}
}
}
});
temporary_table.deleteItem();
// Specify the table javascript expression to highlight cells
var expression = '// checks to see if a label contains one of the possible substrings (array)\r\n\
function containsSubstring(label, possible_substrings) {\r\n\
for (var i in possible_substrings)\r\n\
if((label).toLowerCase().indexOf(possible_substrings[i]) != -1) // checks if label contains any of the above\r\n\
return true;\r\n\
return false;\r\n\
}\r\n\
// checks to see if a label represents a don\'t know\r\n\
function isDontKnow(label) {\r\n\
if (containsSubstring(label,["dk","d.k.","don\'t know","dont know","unsure","un-sure","not sure","do not know","no idea","N/A","applicable","not use", "never use"]))\r\n\
return true;\r\n\
if (label == "NA" || label == "na") //removes labels that are NA\r\n\
return true;\r\n\
return false;\r\n\
}\r\n\
// Names of statistics to be checked in each table\r\n\
var stats_to_check = ["Base n", "n","%","Row %"];\r\n\
// Corresponding maximum value for each statistic\r\n\
var upper_limits_for_stats = [30, 5, 1.0, 1.0];\r\n\
var colors = table.cellColors;\r\n\
var num_stats = stats_to_check.length;\r\n\
var num_rows = table.numberRows;\r\n\
var num_cols = table.numberColumns;\r\n\
for (var j = 0; j < num_stats; j++) {\r\n\
// Check that the stat is available for this table\r\n\
if (table.availableStatistics.indexOf(stats_to_check[j]) > -1) {\r\n\
var values = table.get(stats_to_check[j]);\r\n\
for (var row = 0; row < num_rows; row++)\r\n\
for (var column = 0; column < num_cols; column++) { \r\n\
if (values[row][column] < upper_limits_for_stats[j])\r\n\
colors[row][column] = "Yellow";\r\n\
}\r\n\
}\r\n\
}\r\n\
// Check each cell to see if its row or column label looks like a Dont Know style response\r\n\
var row_labels = table.rowLabels;\r\n\
var col_labels = table.columnLabels;\r\n\
if (num_cols > 1) {\r\n\
for (var row = 0; row < num_rows; row++)\r\n\
for (var column = 0; column < num_cols; column++) { \r\n\
if (isDontKnow(row_labels[row]) || isDontKnow(col_labels[column]))\r\n\
colors[row][column] = "Yellow";\r\n\
}\r\n\
} else \r\n\
for (var row = 0; row < num_rows; row++)\r\n\
if (isDontKnow(row_labels[row]) || row_labels[row].search(/\\S/) == -1)\r\n\
colors[row][0] = "Yellow";\r\n\
// Store the modified cell colors.\r\n\
table.cellColors = colors;\r\n\
if (fileFormatVersion() > 8.12)\r\n\
form.setSummary("Highlight cells for checking")';
if (check_questions.length == 0 && outlier_questions.length == 0 && inconsistent_base_questions.length == 0)
log('No questions have been identified as being likely to require cleaning');
else {
var new_group = project.report.appendGroup();
if (!is_displayr)
new_group.name = "Data to Review";
else
new_group.name = "Tables For Data Checking";
// Add histograms for outliers
if (outlier_questions.length > 0) {
var outlier_group = new_group.appendGroup();
outlier_group.name = "Data that may contain outliers";
if (!is_displayr) {
outlier_questions.forEach(function (q) {
var new_histogram = outlier_group.appendPlot("Histogram");
new_histogram.primary = q;
});
}else {
outlier_questions.forEach(function (q) {
var page = outlier_group.appendPage("Blank");
page.name = q.name;
var new_histogram = page.appendPlot("Histogram");
new_histogram.primary = q;
});
}
}
if (empty_questions.length > 0)
generateSubgroupOfSummaryTables("Empty " + structure_name,
new_group, empty_questions)
// Make a table for each question to be checked. Add the table JavaScript (or Rule)
var num_questions = check_questions.length;
var new_table;
if (is_displayr) {
var base_group = new_group.appendGroup();
base_group.name = "Data with values needing checking";
}
if (fileFormatVersion() > 8.12)
var highlight_rule = project.rules.newCustomRule(expression);
for (var j = 0; j < num_questions; j++) {
var current_question = check_questions[j];
if (is_displayr) {
var page = base_group.appendPage("Blank");
page.name = current_question.name;
new_table = page.appendTable();
}else
new_table = new_group.appendTable();
new_table.primary = current_question;
var question_name = current_question.name;
if (display_n.indexOf(question_name) != -1)
addStatisticsToTableIfPossible(new_table, "n");
if (display_base_n.indexOf(question_name) != -1) {
addStatisticsToTableIfPossible(new_table, "Base n");
}
if (fileFormatVersion() > 8.12)
new_table.rules.add(highlight_rule);
}
if (fileFormatVersion() <= 8.12)
addTableJavaScriptToTablesInGroup(new_group, expression);
if (inconsistent_base_questions.length > 0) {
var base_group = new_group.appendGroup();
var base_n_str = is_displayr ? "Sample Size" : "Base n";
base_group.name = "Data with varying " + base_n_str;
inconsistent_base_questions.forEach(function (q) {
if (is_displayr) {
var page = base_group.appendPage("Blank");
page.name = q.name;
var new_table = page.appendTable();
}else
var new_table = base_group.appendTable();
new_table.primary = q;
addStatisticsToTableIfPossible(new_table, "Base n");
if (fileFormatVersion() > 8.12)
new_table.rules.add(highlight_rule);
});
if (fileFormatVersion() <= 8.12)
addTableJavaScriptToTablesInGroup(base_group, expression);
new_group.moveAfter(base_group, null);
}
if (!is_displayr) {
var message = ["Tables have been created, and any cells containing values that should be reviewed have been marked in yellow.\r\n"];
if (fileFormatVersion() > 8.12)
message.push("To remove this formatting select one of the tables, select Automate > Manage Rules, and delete the Rule called \'Highlight cells for checking\'.");
else
message.push("To remove this formatting you need to select the tables, select Edit > Table JavaScript and delete the JavaScript.");
}else
var message = ["Tables with cells that should be reviewed marked in yellow have been added to the folder 'Data with values needing checking'."];
if (inconsistent_base_questions.length > 0) {
if (!is_displayr)
message.push("");
message.push("Tables where the Base n varies between cells have been added to the folder 'Data with varying Base n'.")
}
if (outlier_questions.length > 0) {
if (!is_displayr)
message.push("");
message.push("Histograms showing " + structure_name +
" with outliers have been added to the folder called " +
"'Data that may contain outliers'.");
}
if (skip_large) {
if (!is_displayr)
message.push("");
var msg_str = is_displayr ? "Variable Set Structure to Numeric or Numeric - Multi" : "Question Type to Number or Number - Multi";
message.push("Some " + structure_name + " were skipped to " +
"avoid creating tables that are slow to calculate and " +
" check. You should review these " + structure_name +
" and consider changing the " + msg_str + " if appropriate:");
message.push(too_large.map(function(q) { return q.name; }).join("\r\n"));
}
var report = simpleHTMLReport(message, new_group.name, new_group, true, is_displayr)
conditionallyEmptyLog(message.join("\r\n"));
}
}
// Returns true if the input question contains outliers according to the
// specified number of standard deviations from the mean.
function questionHasOutliers(question, number_sd_from_mean) {
checkQuestionType(question, ["Number", "Number - Multi", "Number - Grid"]);
// constructing the outputs
var temp_table = project.report.appendTable();
temp_table.primary = question;
//numeric_questions.push(question);
temp_table.secondary = "SUMMARY";
temp_table.cellStatistics = ['Average', 'Standard Deviation','Minimum', 'Maximum', 'Base n', 'Missing n'];
var output;
try {
output = temp_table.calculateOutput();
} catch (e) {
temp_table.deleteItem();
return false; //Question is empty;
}
var n_columns = question.questionType == "Number - Grid" ? output.numberColumns - 1: 1; //excludes sums
var n_rows = Math.max(1, output.numberRows - 1);
// determining if there are outliers
var outliers = false;
var min_permissable;
var max_permissable;
var v_counter;
var means = output.get('Average');
var sds = output.get('Standard Deviation');
var mins = output.get('Minimum');
var maxes = output.get('Maximum');
for (var column = 0; column < n_columns; column++)
for (var row = 0; row < n_rows; row++) {
var mean = means[row][column];
var sd = sds[row][column];
min_permissable = mean - number_sd_from_mean * sd;
max_permissable = mean + number_sd_from_mean * sd;
var too_low = mins[row][column] < min_permissable;
var too_high = maxes[row][column] > max_permissable;
if (too_low || too_high)
outliers = true;
}
temp_table.deleteItem();
return outliers;
}
// Checks to see if a question contains a Don't Know option
function containsDontKnow(current_question) {
current_value_attributes = current_question.valueAttributes;
current_unique_values = current_question.uniqueValues;
num_vals = current_unique_values.length;
labels = valueLabels(current_question);
for (var k = 0; k < num_vals; k++) {
if (isDontKnow(labels[k]))
return true;
}
return false;
}
// Checks to see if a question has blank labels in the data reduction
function containsBlankLabels(question) {
var var_labels = question.variables.map(function (v) { return v.label; });
var attributes = question.valueAttributes;
var value_labels = question.uniqueValues.map(function (v) { return attributes.getLabel(v); });
return value_labels.concat(var_labels).filter(function (s) { return s.search(/\S/) == -1; }).length > 0;
}
function statisticIsConstant(stat_array) {
var flat_array = [].concat.apply([], stat_array);
var min = Math.min.apply(null, flat_array);
var max = Math.max.apply(null, flat_array);
return min == max;
}
function questionsYieldLargeTabs(row_q) {
var q_type = row_q.questionType;
if (row_q.dataReduction.rowLabels == null || row_q.dataReduction.columnLabels)
if (q_type == "Pick One - Multi" || q_type == "Number - Grid" || q_type == "Pick Any - Grid") {
return row_q.dataReduction.rowLabels.length * row_q.dataReduction.columnLabels.length > 10000;
} else {
return row_q.dataReduction.rowLabels.length > 10000;
}
}
See also
- QScript for more general information about QScripts.
- QScript Examples Library for other examples.
- Online JavaScript Libraries for the libraries of functions that can be used when writing QScripts.
- QScript Reference for information about how QScript can manipulate the different elements of a project.
- JavaScript for information about the JavaScript programming language.
- Table JavaScript and Plot JavaScript for tools for using JavaScript to modify the appearance of tables and charts.