Preliminary Project Setup - Check for Errors in Data File Construction

From Q
Jump to navigation Jump to search

Read through a data file and check to see if it contains common data file errors This QScript reads through a data file and checks to see if it contains common data file errors. This script should be run immediately after importing a data file (i.e., prior to any data cleaning being undertaken in Q).

Technical details

The script will generate a brief table of any issues that have been found, along with a list of the names of the variables that have issues, and any questions whose variables have issues will be added to your Report.

An error is, in the main, a common variation from SPSS Data File Specifications. More specifically, this QScript:

  • Checks that the Variable Types of the variables in the project appear sensible.
  • Checks if any Pick One - Multi questions should be set as Pick Any - Compact questions.
  • Checks for incorrect Missing Data settings in binary variables.
  • Checks for variables with blank labels.
  • Checks for Text variables storing multiple response data separated by commas.
  • Checks for the presence of an unique ID variable.
  • Checks for demographic variables with incomplete data.
  • Checks for variables containing only a single possible value.

How to apply this QScript

  • Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
  • Click on the QScript when it appears in the QScripts and Rules section of the search results.

OR

  • Select Automate > Browse Online Library.
  • Select this QScript from the list.

Customizing the QScript

This QScript is written in JavaScript and can be customized by copying and modifying the JavaScript.

Customizing QScripts in Q4.11 and more recent versions

  • Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
  • Hover your mouse over the QScript when it appears in the QScripts and Rules section of the search results.
  • Press Edit a Copy (bottom-left corner of the preview).
  • Modify the JavaScript (see QScripts for more detail on this).
  • Either:
    • Run the QScript, by pressing the blue triangle button.
    • Save the QScript and run it at a later time, using Automate > Run QScript (Macro) from File.

Customizing QScripts in older versions

  • Copy the JavaScript shown on this page.
  • Create a new text file, giving it a file extension of .QScript. See here for more information about how to do this.
  • Modify the JavaScript (see QScripts for more detail on this).
  • Run the file using Automate > Run QScript (Macro) from File.

JavaScript

includeWeb('QScript Functions to Generate Outputs');
includeWeb('QScript Value Attributes Functions');
includeWeb('QScript Selection Functions');
includeWeb('JavaScript Text Analysis Functions');
includeWeb("JavaScript Array Functions");
includeWeb("JavaScript Utilities");
 
var is_displayr = (!!Q.isOnTheWeb && Q.isOnTheWeb());
var data_file = requestOneDataFileFromProject(true, true, is_displayr);
var variables = data_file.variables; //gets all variables 
var n_variables = variables.length;
 
var vars_should_be_categorical = [];
var vars_should_be_numeric = [];
var vars_should_be_categorical_from_text = [];
var vars_should_be_numeric_from_text = [];
var binary_variables_with_incorrect_missing_data = [];
var empty_labels = [];
var text_with_numbers_and_commas = [];
var only_one_value = [];
 
 
var non_hidden_variables = variables.filter(function (variable) {return !variable.question.isHidden && !variable.question.isBanner && variable.question.isValid; });
 
non_hidden_variables.forEach(function (variable) {
    var variable_type = variable.variableType;
    var isCategorical = variable_type == "Categorical" || variable_type == "Ordered Categorical";
    var isNumeric = variable_type == "Numeric";
    var values = variable.uniqueValues;
    var n_values = values.length;
    var variable_name = variable.name;
    var variable_label = variable.label;
 
    // Check for variables with a single value
    if (variable_type != "Text" && n_values == 1)
        only_one_value.push(variable);

    // Check for empty variable labels
    if (variable_label.search(/\S/) == -1)
        empty_labels.push(variable);
 
    // Check the values and variable type
    if (variable_type != "Text"){
        var question_values = variable.question.uniqueValues;
        var n_question_values = question_values.length;
 
        var n_missing_values = numberMissingValues(variable);
        // Determine if the labels of the variable match the source values
        var labels_match_source_values = valueLabelsMatchValues(variable);
 
        // Assess whether the variable may be the wrong type
        if (isCategorical && labels_match_source_values && variable.question.questionType != "Pick Any")
            vars_should_be_numeric.push(variable);
        if (isNumeric && !labels_match_source_values)
            vars_should_be_categorical.push(variable);
 
        // Check missing data settings for binary variables
        if (n_values == 2 && n_missing_values > 0 && variable_type == "Categorical" && n_question_values == 2)
            binary_variables_with_incorrect_missing_data.push(variable);
    }
    else if (n_values > 1) {
 
        // Check for text containing numbers separated by commas 
        if (textContainsNumbersSeparatedByCommas(variable))
            text_with_numbers_and_commas.push(variable);
 
        if (n_values <= 20 // Only a few different responses
            && !isOther(variable_label)  // Not obviously an 'Other/Specify'
            && !containsSubstring(variable_label.toLowerCase(), ["text"]) // Not designated as 'text' in the label
            && (variable_name.indexOf('_') > -1 && data_file.getVariablesByName(variable_name.split('_')[0]).length > 0)) // Isn't obviously part of a multi
               vars_should_be_categorical_from_text.push(variable);
        else {
            var is_numeric = true;
            var current_value;
            for (var j = 1; j < n_values; j++) {
                current_value = values[j];
                if (isNaN(parseFloat(current_value)) || !isFinite(current_value)) {
                    is_numeric = false;
                    break;
                }
            }
            if (is_numeric)
                vars_should_be_numeric_from_text.push(variable);
        } 
    }
});
 
// Check for Pick Any - Compact questions
var pick_any_compact_questions = [];
data_file.questions.forEach(function (q) {
    if (q.questionType == 'Pick One - Multi' && !q.isHidden) {
        var number_non_missing_values = q.uniqueValues.length - numberMissingValues(q);
        if (hasMissingSelected(q) && number_non_missing_values > 5 && isMaxMultiFormat(q))
            pick_any_compact_questions.push(q);
    }
});
 
// Check for incomplete demographic variables
var incomplete_demographics = checkForIncompleteCases(data_file);
 
// Identify the questions associated with the problem variables
var qs_with_vars_should_be_categorical = getUniqueQuestionsForVariables(vars_should_be_categorical);
var qs_with_vars_should_be_numeric = getUniqueQuestionsForVariables(vars_should_be_numeric);
var qs_with_vars_should_be_numeric_from_text = getUniqueQuestionsForVariables(vars_should_be_numeric_from_text);
var qs_with_vars_should_be_categorical_from_text = getUniqueQuestionsForVariables(vars_should_be_categorical_from_text);
var qs_with_binary_variables_with_incorrect_missing_data = getUniqueQuestionsForVariables(binary_variables_with_incorrect_missing_data);
var qs_with_empty_labels = getUniqueQuestionsForVariables(empty_labels);
var qs_with_variables_with_single_value = getUniqueQuestionsForVariables(only_one_value);
 
var total_errors = qs_with_vars_should_be_categorical.length
    + qs_with_vars_should_be_numeric.length
    + qs_with_vars_should_be_numeric_from_text.length
    + qs_with_vars_should_be_categorical_from_text.length
    + pick_any_compact_questions.length
    + qs_with_binary_variables_with_incorrect_missing_data.length
    + qs_with_empty_labels.length
    + text_with_numbers_and_commas.length
    + incomplete_demographics.length
    + qs_with_variables_with_single_value.length;
 
// Create the logs
 
// Check for ID variable
var ID_var = parseIDVariables(data_file);
 
var summary_log = "";
var variable_list = "";
 
if (total_errors > 0) {
    var base_group = project.report.appendGroup();
    base_group.name = "Possible Data File Errors";
    var group_name;
 
    if (ID_var.variable == null || !ID_var.unique)
        log("The data does not appear to contain an ID variable with unique values for each respondent.");
    else 
        log("An ID variable has been found and it contains unique values for each respondent: " + ID_var.variable.label + "\r\nSelect this variable in the Case IDs menu in the Data tab.");
    log('\r\n');
 
    var prod_str = is_displayr ? "Displayr" : "Q";
    summary_log += "This data file may have been constructed containing some errors.  Please review the folder called \"Possible Data File Errors\" "
        + "for tables illustrating specific types of problems.  Please note that although " + prod_str + " has many tools to help you fix errors in the "
        + "construction of the data file, it is usually more productive to fix the errors at the time of creating the data file.  The table "
        + "below summarizes the key issues identified.\r\n\r\n\r\n";
 
    function addError(questions, group_name, summary_prefix, list_prefix, variables) {
        if (questions.length > 0) {
            if (!is_displayr)
                summary_log += summary_prefix + questions.length + '\r\n';
            else
                summary_log += questions.length + " " + summary_prefix + '\r\n';
            generateSubgroupOfSummaryTables(group_name, base_group, questions);
            variable_list += list_prefix + "\r\n\r\n" + variableNameList(variables) + "\r\n\r\n";
        }
    }
    
    var q_str = is_displayr ? "Variable Sets" : "Questions";
    var pom_str = is_displayr ? "Nominal - Multi" : "Pick One - Multi";
    var pac_str = is_displayr ? "Binary - Multi (Compact)" : "Pick Any - Compact";

    var msg = q_str + " with numeric variables that may be categorical";
    addError(qs_with_vars_should_be_categorical,
             "Variable type may be Categorical",
             is_displayr ? msg : msg + " \t\t\t|\t",
             "Numeric variables that may be categorical:",
             vars_should_be_categorical);
    
    msg = q_str + " with categorical variables that may be numeric"; 
    addError(qs_with_vars_should_be_numeric,
             "Variable type may be Numeric",
             is_displayr ? msg : msg + " \t\t\t|\t",
             "Categorical variables that may be numeric:",
             vars_should_be_numeric);
    
    msg = q_str + " with binary variables that have incorrect missing data selections";
    addError(qs_with_binary_variables_with_incorrect_missing_data,
             "Binary variables with incorrect Missing Data selection",
             is_displayr ? msg : msg + "\t|\t",
             "Binary variables with incorrect missing data selections:",
             binary_variables_with_incorrect_missing_data);
    
    msg = q_str + " with text variables that may be categorical";
    addError(qs_with_vars_should_be_categorical_from_text,
             "Text variables that may be Categorical",
             is_displayr ? msg : msg + " \t\t\t|\t",
             "Text variables that may be categorical:",
             vars_should_be_categorical_from_text);
    
    msg = q_str + " with text variables that may be numeric";
    addError(qs_with_vars_should_be_numeric_from_text,
             "Text variables that may be Numeric",
             is_displayr ? msg : msg + " \t\t\t\t|\t",
             "Text variables that may be numeric:",
             vars_should_be_numeric_from_text);
    
    msg = "Text variables with numbers separated by commas or semicolons";
    addError(text_with_numbers_and_commas.map(function (v) { return v.question; } ),
             msg,
             is_displayr ? msg : msg + "\t\t|\t",
             msg + ":",
             text_with_numbers_and_commas);
    
    msg = q_str + " with blank labels";
    addError(qs_with_empty_labels,
             msg,
             is_displayr ? msg : msg + "\t\t\t\t\t\t|\t",
             "Variables with blank labels:",
             empty_labels);
    
    msg = q_str + " with variables having a single value";
    addError(qs_with_variables_with_single_value,
             msg,
             is_displayr? msg : msg + "\t\t\t\t|\t",
             "Variables with a single value:",
             only_one_value);
    
    msg = pom_str + " " + q_str.toLowerCase() + " that may be " + pac_str;
    addError(pick_any_compact_questions,
             msg,
             is_displayr? msg : msg + "\t\t\t|\t",
             msg + ":",
             pick_any_compact_questions);

 
    if (text_with_numbers_and_commas.length > 0)
        summary_log += '\r\nWhen Text variables contain numbers separated by commas it indicates that multiple response data has been stored in a single variable. \r\n';
    if (qs_with_empty_labels.length > 0)
        summary_log += '\r\nBlank labels can occur when the question wordings for these questions are so long that the program that has been used to create the data file has left out the row labels.\r\n';
 
    if (incomplete_demographics.length > 0) {
        summary_log += '\r\nThe following demographic variables have missing values, suggesting that the data file may include incomplete cases: \r\n\r\n'
        incomplete_demographics.forEach(function (v) {
            summary_log += v.label + '\r\n';
        });
        generateSubgroupOfSummaryTables("Incomplete demographics", base_group, incomplete_demographics.map(function (v) { return v.question; }));
    }
 
    log(summary_log);
    log('\r\n');
    log('Lists of Variables and ' + q_str + ' with errors\r\n\r\n');
    log(variable_list);
} else if (ID_var.variable == null || !ID_var.unique) {
    log("The data does not appear to contain an ID variable with unique values for each respondent.");
    log("Otherwise the data file appears to be well constructed.");
} else {
    log("This data file appears to be well constructed.")
    log("An ID variable has been found and it contains unique values for each respondent: " + ID_var.variable.label + "\r\nSelect this variable in the Case IDs menu in the Data tab.");
}
 
 
 
// This function generates a Pick Any question whose rows
// count the number of respondents with non-missing values
// in each row of the input question. The input question
// should be a Pick One - Multi question.
function generateQuestionToCheckSampleSizes(question) {
    var data_file = question.dataFile;
    var variable_array = question.variables;
    // Generate JavaScript variables to use to determine the 
    // proportion of non-missing responses in each variable
    // from 'variable_array'
    var checking_variables = [];
    for (var j = 0; j < variable_array.length; j++) {
        var current_variable = variable_array[j];
        var current_name = current_variable.name;
        var formula = "!isNaN(" + current_name + ");";
        var new_name = current_name + "check";
        var new_label = current_variable.label + "check";
        var new_variable = data_file.newJavaScriptVariable(formula, false, new_name, new_label, data_file.variables[data_file.variables.length -1]);
        new_variable.variableType = "Categorical";
        new_variable.question.questionType = "Pick Any";
        checking_variables.push(new_variable);
    }
    var new_q = data_file.setQuestion(question.name + 'temp for checking', 'Pick Any', checking_variables);
    new_q.valueAttributes.setCountThisValue(1, true);
    new_q.valueAttributes.setCountThisValue(0, false);
    return new_q;
}
 
// The function returns true if:
//
// 1. The n from the last row (priot to the NET row) is smaller than the 
//    n from the first row
// 2. The n does not increase between consecutive rows when moving from
//    the top row to the last row of the table.
// 3. The raw data in the question does not feature respondents with missing 
//    data in an earlier variable but non-missing data in a later variable.
function isMaxMultiFormat(question) {
    if (question.variables.length < 2)
        return false;
    // Check sample sizes decreasing in the table
    var checking_question = generateQuestionToCheckSampleSizes(question);      
    var temp_group = project.report.appendGroup();
    var temp_table = temp_group.appendTable();
    temp_table.primary = checking_question;
    var output = temp_table.calculateOutput();
    var counts = output.get('n');
    var row_sample_sizes = [];
    for (var j = 0; j < output.numberRows - 1; j++) {
        row_sample_sizes.push(counts[j][0]);
    }
 
    var sample_sizes_decreasing = true;
    if (row_sample_sizes[row_sample_sizes.length-1] < row_sample_sizes[0]) {
        for (var j = 1; j < row_sample_sizes.length; j++) {
            if (row_sample_sizes[j] > row_sample_sizes[j-1]) {
                sample_sizes_decreasing = false;
                break;
            }
        }
    } else
        sample_sizes_decreasing = false;
    temp_group.deleteItem();
    deleteQuestion(checking_question);
    // Check the raw data to determine if this is truly max-multi.
    // Respondents in a max-multi will not have missing data in
    // an earlier variable and have non-missing data in a later one.
    if (sample_sizes_decreasing) {
        var attributes = question.valueAttributes;
        var missing_source_values = question.uniqueValues.filter(function(v) { return !isNaN(v) && attributes.getIsMissingData(v); });
        var nans_before_values = false;
        var raw_data = question.variables.map(function(v) { return v.rawValues; });
        var transposed_data = Q.transpose(raw_data);
        for (var j = 0; j < raw_data[0].length; j++) {
            var respondent_data = transposed_data[j];
            var first_missing_index = respondent_data.map(function(x) { return missing_source_values.indexOf(x) > -1 || isNaN(x);} ).indexOf(true);
            if (first_missing_index > -1 && first_missing_index < respondent_data.length - 1) {        
                for (var k = first_missing_index + 1; k < respondent_data.length; k++) {
                    if (!isNaN(respondent_data[k]) && missing_source_values.indexOf(respondent_data[k]) == -1 ) {
                        nans_before_values = true;
                        break;
                    }
                }
            }
            if (nans_before_values) 
                break;
        }
        return !nans_before_values;
    } else
        return false;
}
 
// Delete variables from a question. Variables must be constructed.
function deleteQuestion(question) {
    var variable_array = question.variables;
    for (var j = 0; j < variable_array.length; j++)
        variable_array[j].deleteVariable();
}
 
// Check the value attributes for one or more categories
// are selected as missing data. Max-multi format data
// must have at least one category selected as Missing Data.
function hasMissingSelected(question) {
    var current_value_attributes = question.valueAttributes;
    var current_unique_values = question.uniqueValues;
    var has_missing = false;
    for (var j = 0; j < current_unique_values.length; j++) {
        var v = current_unique_values[j];
        if (current_value_attributes.getIsMissingData(v)) {
            has_missing = true;
            break;
        }
    }
    return has_missing;
}
 
// This function returns true if all non-missing value labels are identical to their source values.
// This means that the non-missing value labels are numeric, and this variable probably should be numeric.
function valueLabelsMatchValues(variable) {
    var values = variable.uniqueValues;
    var labels = valueLabels(variable);
    var value_attributes = variable.valueAttributes;
    var num_vals = values.length;
    var labels_match_values = true;
    for (var j = 0; j < num_vals; j++) {
        if (values[j].toString() != labels[j] && !value_attributes.getIsMissingData(values[j])) {
            labels_match_values = false;
            break;
        }
    }
    return labels_match_values;
}
 
function getUniqueQuestionsForVariables(variable_array) {
    return uniqueQObjectsInArray(variable_array.map(function (v) { return v.question; } ));
}
 
function variableNameList(variable_array) {
    return variable_array.map(function (v) { return v.name; }).join('\r\n');
}
 
function uniqueQObjectsInArray(q_object_array){
    var unique_elements = [];
    for (var j = 0; j < q_object_array.length; j++) {
        var duplicate = false;
        var cur_ob = q_object_array[j];
        for (var k = 0; k < unique_elements.length; k++) {
            if (cur_ob.equals(unique_elements[k])) {
                duplicate = true;
                break;
            }
        }
        if (!duplicate)
            unique_elements.push(cur_ob);
    }
    return unique_elements;
}
 
 
function variableHasUniqueValues(variable) {
    return !arrayHasDuplicateElements(variable.rawValues);
}
 
function addVariablesWithNamesMatchingStrings(variable_array, data_file, string_array) {
    string_array.forEach(function (v) {
        var cur_var = data_file.getVariableByName(v);
        if (cur_var != null)
            variable_array.push(cur_var);
    });
}
 
function variableLabelContainsString(variable, string) {
    return variable.label.indexOf(string) > -1;
}
 
 
function checkForIncompleteCases(data_file) {
    var variables = data_file.variables.filter(function (v) { return !v.question.isHidden; });
 
    var demographic_strings = ["age", "gender", "sex", "location"];
 
    // Check only the first and last variables in the file.
    var demographic_index_range = Math.min(10, Math.floor(variables.length / 2));
 
    // Identify potential demographic variables
    var candidate_variables = [];
 
    var add_demographic_var = function (cur_var) {
        if (cur_var.question.questionType == "Pick One") {
            demographic_strings.forEach(function (s) {
                if (cur_var.name.indexOf(s) > -1 || cur_var.label.indexOf(s) > -1)
                    candidate_variables.push(cur_var);
            });
        }
    };
 
    for (var j = 0; j < demographic_index_range; j++) {
        add_demographic_var(variables[j]);
        add_demographic_var(variables[variables.length - 1 - j]);
    }
 
    // Check for missing values in candidate variables
    var vars_with_incompletes = candidate_variables.filter(function  (v) {
        var unique_values = v.uniqueValues;
        return unique_values.filter(function (x) { return isNaN(x); }).length > 0;
    });
 
    return vars_with_incompletes;
}
 
 
 
function parseIDVariables(data_file) {
 
    var variables = data_file.variables;
 
    // Build a list of variables that are candidate ID variables based on name or label.
    // List is constructed in order of preference.
    var candidate_variables = [];
 
    // Top Priority Variable Names - ID
    var top_names = ["ID", "id"];
    addVariablesWithNamesMatchingStrings(candidate_variables, data_file, top_names);
 
    // Variable labels that contain "ID" (case sensitive)
    candidate_variables = candidate_variables.concat(variables.filter(function (v) { return variableLabelContainsString(v, "ID"); } ));
 
    var secondary_name_strings = ["respid", "identifier", "unique", "record", "response", "id", "case", "respondent", "serial"];
    secondary_name_strings.forEach(function (s) {
        candidate_variables = candidate_variables.concat(data_file.getVariablesByName(s));
    });
 
    var secondary_label_strings = ["Identifier", "identifier", "ResponseID", "Response ID", "Unique", "unique"];
    secondary_label_strings.forEach(function (s) {
        candidate_variables = candidate_variables.concat(variables.filter(function (v) { return variableLabelContainsString(v, s); } ));
    });
 
    // Return the first variable in the preferential list that has unique values
    for (var j = 0; j < candidate_variables.length; j++)
        if (variableHasUniqueValues(candidate_variables[j]))
            return { variable: candidate_variables[j], unique: true };
 
    if (candidate_variables.length > 0)
        return { variable: candidate_variables[0], unique: false };
 
    return {variable: null, unique: false};
}
 
// Determine if a text variable contains responses in the format
// 1,2,3
// Which tend to refer to multiple response data that has been
// stored in a single variable.
// Returns true if at least one response of this type is found
// and all responses are either of this type or are numbers (without any commas)
function textContainsNumbersSeparatedByCommas(text_variable) {
    var raw_values = text_variable.rawValues;
    var number_only_count = 0;
    var number_comma_count = 0;
    for (var j = 0; j < raw_values.length; j++) {
        var s = raw_values[j];
        var processed_string = s.split(/[\,\;]\s*/);
        var substrings_are_numeric = processed_string.filter(function (x) {
            return isNumber(x) || x.trim() == "";
        });
        if (processed_string.length == 1 && substrings_are_numeric.length == 1)
            number_only_count++;
        else if (processed_string.length == substrings_are_numeric.length)
            number_comma_count++;
        else
            return false;
    }
 
    return number_comma_count > 0 && number_comma_count + number_only_count == raw_values.length;
}

See also