Text Analysis - Save Variable(s) - Categories

From Q
Jump to navigation Jump to search

Save variables containing the categories to the data set. Where there are multiple input variables, a separate set of variables is added for each input variable.

This QScript (feature) saves the categories from an Automatic Categorization - List of Items, Automatic Categorization - Unstructured Text, Automatic Categorization - Entity Extraction or Text Analysis - Advanced - Setup Text Analysis output (Insert > Text Analysis > Automatic Categorization > List of Items/Unstructured Text/Entity Extraction or Insert > Text Analysis > Advanced > Setup Text Analysis) as a Pick One or Pick Any - Compact question (a Nominal or Binary Multi - Compact variable set).

Technical Details

The variables created by this QScript (feature) may become invalid, and need to be deleted and recreated, if the output from Automatic Categorization - List of Items, Automatic Categorization - Unstructured Text, Automatic Categorization - Entity Extraction or Text Analysis - Advanced - Setup Text Analysis (Insert > Text Analysis > Automatic Categorization > List of Items/Unstructured Text/Entity Extraction or Insert > Text Analysis > Advanced > Setup Text Analysis) has changed, either because the input text variable was modified or because the input settings were modified.

This QScript requires Q version 5.5.1 or later to run.

Code

includeWeb("QScript R Output Functions");

// Run the script. main() is declared further down in this file; function
// declarations are hoisted, so it is callable here.
main();

/**
 * Saves the categories of the selected Text Analysis R output as new
 * variables (an R question) in the output's data file.
 *
 * The selected output is dispatched on its R class:
 *   - "categorizedlist":         one "Pick Any - Compact" question per input variable.
 *   - "AutomaticCategorization": one "Pick One" / "Pick Any" question.
 *   - "TextClassifier":          one "Pick One" / "Pick Any" question built from `$predicted`.
 *   - "EntityExtraction":        one question per entity type.
 *   - "wordBag":                 one "Pick Any - Compact" question.
 *
 * The R code of each generated question embeds a validity check (hash or text
 * label comparison) that calls stop() when the source output has changed.
 *
 * Outside web mode a table showing each new question is appended to the
 * group of the selected output.
 *
 * @returns {boolean} false when the Q version is too old, nothing suitable is
 *     selected, the output has no data file, there is nothing to save, or the
 *     R question could not be created (a message is logged in each case);
 *     true otherwise.
 */
function main() {
    if (Q.fileFormatVersion() < 14.11) // require Q 5.5.1 or after in order to call selected_item.data
    {
        log("Q 5.5.1 or later is required to run this script, please update your version of Q.");
        return false;
    }

    var script_name = "Save Variable(s) - Categories";
    var analysis_name = "Categories";
    var variable_prefix = "categ";

    var bad_selection_message = "Select a List Categorization, Automatic Categorization, Entity Extraction or Setup Text Analysis output.";
    // Q.isOnTheWeb is only called when it exists.
    var web_mode = (!!Q.isOnTheWeb && Q.isOnTheWeb());

    var selected_item = getSelectedROutputFromPage(["categorizedlist", "AutomaticCategorization", "EntityExtraction", "TextClassifier", "wordBag"]);
    if (selected_item === null) {
        log(bad_selection_message);
        return false;
    }

    var data_file = getDataFileFromItemDependants(selected_item);

    if (data_file == null)
    {
        log("'Save variables' cannot be applied to an output with no data file.");
        return false;
    }

    // Find last variable, which we will place the generated variables after
    var last_variable = getLastVariable(getVariables(selected_item.dependants(false)));

    // Read the optional size-limit controls; fall back to R's "Inf" (no limit)
    // when a control does not exist on the selected output.
    var form_max_levels = selected_item.getInput("formMaxLevels");
    if (form_max_levels === null) {
        form_max_levels = "Inf";
    }
    var form_max_mentions = selected_item.getInput("formMaxMentions");
    if (form_max_mentions === null) {
        form_max_mentions = "Inf";
    }

    // Save the variables depending on the R output class.
    // NOTE: values returned by selected_item.data.get()/getAttribute() are
    // compared with loose equality (==) on purpose; they may be one-element
    // arrays rather than scalars, which == coerces but === would not match.
    if (selected_item.outputClasses.indexOf("categorizedlist") > -1)
    {
        var variable_names = selected_item.data.get("variable.names");
        for (var j = 0; j < variable_names.length; j++)
        {
            var expression = "categorizedlist = " +
                             stringToRName(selected_item.referenceName) +
                          "\nif (categorizedlist$hash != '" + selected_item.data.get("hash") + "')\n" +
                          "    stop('The Text Analysis output used to create these variables has changed " +
                          "and as a result these variables are no longer valid. Please delete these variables and rerun the " +
                          script_name + " script on the Text Analysis output.')\n" +
                          "# Look to replace this with single javascript code outside the loop\n" +
                          "form.max.levels <- " + form_max_levels + "\n" +
                          "form.max.mentions <- " + form_max_mentions + "\n" +
                          "# Temporary fix\n" +
                          stringToRName(selected_item.referenceName) + "$n.cases <- length(" + stringToRName(selected_item.referenceName) + "$subset)\n" +
                          "flipTextAnalysis::SaveVariablesCategories(" + stringToRName(selected_item.referenceName) + ",\n" +
                          "                                          variable.name = categorizedlist$variable.names[" + (j + 1) + "],\n" +
                          "                                          form.max.levels = " + form_max_levels + ",\n" +
                          "                                          form.max.mentions = " + form_max_mentions + ")";
            var new_q_name = preventDuplicateQuestionName(data_file, analysis_name + " from " + variable_names[j]);
            var temp_var_name = randomVariableName(16); // temporary name, random to (almost) guarantee uniqueness

            try {
                var new_r_question = data_file.newRQuestion(expression, new_q_name, temp_var_name, last_variable);
            } catch (e) {
                if (/Can only convert tabular results to an R/.test(e)) {
                    log(analysis_name + " could not be created from this item: The output variable is too large. " +
                        "The variable size can be reduced by increasing the required size of each category before saving (increasing the value in the 'Minimum category size' control)");
                } else {
                    log(analysis_name + " could not be created from this item: " + e);
                }
                return false;
            }

            // Replace temporary variable names
            nameSequentialVariables(new_r_question.variables, variable_prefix);

            new_r_question.questionType = "Pick Any - Compact";

            // In Q, create a table showing the new question
            if (!web_mode) {
                var t = selected_item.group.appendTable();
                t.primary = new_r_question;
            }

            // The next question is inserted after the one just created so the
            // questions keep the order of the input variables.
            last_variable = new_r_question.variables[new_r_question.variables.length - 1];
        }
    }
    else if (selected_item.outputClasses.indexOf("AutomaticCategorization") > -1)
    {
        // Single quotes are stripped from the label on both the JavaScript and
        // the R side so the label can be embedded in a single-quoted R string.
        var valid_check = "categorization = " + stringToRName(selected_item.referenceName) +
            "\nif (gsub('\\u00a0', ' ', gsub('\\'', '', categorization$text.label, fixed = TRUE), fixed = TRUE) != '" + selected_item.data.get("text.label").toString().replace(/'/g, "") + "')\n" +
                          "    stop('The input text variable used for the Text Analysis has changed " +
                          "and as a result this variable is no longer valid. Please delete this variable and rerun the " +
                          script_name + " script on the Text Analysis output.')\n";
        var pick_any_output = false;
        var existing_cat = selected_item.getInput("formExistingCat");
        var expression;
        if (existing_cat !== null) {
            // With the existing-categories control present, `$predicted` is
            // saved; a data.frame prediction is saved as a "Pick Any" question.
            var predicted_type = selected_item.data.getAttribute("predicted", "class");
            pick_any_output = predicted_type == "data.frame";
            expression = valid_check +
                         "categorization$predicted";
        } else {
            expression = valid_check +
                         "categorization$categorization";
        }
        var new_q_name = preventDuplicateQuestionName(data_file, analysis_name + " from " +
                                                          selected_item.data.get("text.label"));
        var temp_var_name = randomVariableName(16); // temporary name, random to (almost) guarantee uniqueness

        try {
            var new_r_question = data_file.newRQuestion(expression, new_q_name, temp_var_name, last_variable);
        } catch (e) {
            if (/Can only convert tabular results to an R/.test(e)) {
                var number_categories = selected_item.getInput("formCategories");
                var size_msg = number_categories > 1 ? " The variable size could possibly be reduced by decreasing the number of categories from " + number_categories + " to a smaller number." : "";
                log(analysis_name + " could not be created from this item: The output variable is too large. " + size_msg);
            } else {
                log(analysis_name + " could not be created from this item: " + e);
            }
            return false;
        }

        // Replace temporary variable names
        nameSequentialVariables(new_r_question.variables, variable_prefix);

        new_r_question.variables.forEach(function(v) {v.variableType = "Categorical";});

        new_r_question.questionType = pick_any_output ? "Pick Any" : "Pick One";

        if (pick_any_output) {
            new_r_question.needsCheckValuesToCount = false;
        }

        // In Q, create a table showing the new question
        if (!web_mode) {
            var t = selected_item.group.appendTable();
            t.primary = new_r_question;
            project.report.setSelectedRaw([t]);
        }
    }
    else if (selected_item.outputClasses.indexOf("TextClassifier") > -1)
    {
        var valid_check = "categorization = " + stringToRName(selected_item.referenceName) +
                          "\nif (gsub('\\u00a0', ' ', gsub('\\'', '', categorization$text.label, fixed = TRUE), fixed = TRUE) != '" + selected_item.data.get("text.label").toString().replace(/'/g, "") + "')\n" +
                          "    stop('The input text variable used for the Text Analysis has changed " +
                          "and as a result this variable is no longer valid. Please delete this variable and rerun the " +
                          script_name + " script on the Text Analysis output.')\n";
        var predicted_type = selected_item.data.getAttribute("predicted", "class");
        var pick_any_output = predicted_type == "data.frame";
        var expression = valid_check +
                             "categorization$predicted";
        var new_q_name = preventDuplicateQuestionName(data_file, analysis_name + " from " +
                                                          selected_item.data.get("text.label"));
        var temp_var_name = randomVariableName(16); // temporary name, random to (almost) guarantee uniqueness

        try {
            var new_r_question = data_file.newRQuestion(expression, new_q_name, temp_var_name, last_variable);
        } catch (e) {
            log(analysis_name + " could not be created from this item: " + e);
            return false;
        }

        // Replace temporary variable names
        nameSequentialVariables(new_r_question.variables, variable_prefix);

        new_r_question.variables.forEach(function(v) {v.variableType = "Categorical";});

        new_r_question.questionType = pick_any_output ? "Pick Any" : "Pick One";

        if (pick_any_output) {
            new_r_question.needsCheckValuesToCount = false;
        }

        // In Q, create a table showing the new question
        if (!web_mode) {
            var t = selected_item.group.appendTable();
            t.primary = new_r_question;
            project.report.setSelectedRaw([t]);
        }
    }
    else if (selected_item.outputClasses.indexOf("EntityExtraction") > -1)
    {
        var entity_variables = selected_item.data.get("entity.variables");

        // A single boolean in place of the variables means there is nothing
        // to save; explain why to the user and stop.
        if (entity_variables.length === 1 && typeof entity_variables[0] === "boolean") {
            var empty_reason = selected_item.data.get("entity.variables.empty.reason");
            var reason;
            if (empty_reason == "output") {
                reason = "there are no extracted entities in the output.";
            } else if (empty_reason == "min") {
                var entity_counts = selected_item.data.get("entity.counts");
                var entity_names = selected_item.data.getAttribute("entity.counts", "names");
                var min_cases_to_save = selected_item.data.get("min.cases.to.save");
                // Largest count and the (first) entity type that has it.
                var largest_count = entity_counts.reduce(function(x, y) {
                    return (x > y) ? x : y;
                });
                var largest_name = entity_names[entity_counts.indexOf(largest_count)];
                reason = "the entity type with the most number of extractions, '" + largest_name + "', contains " + largest_count + " named entities. However," +
                    " the minimum number of entities required to save is set at " + min_cases_to_save + ". Set 'Minimum number of cases to save' to " +
                    largest_count + " or lower to save the extracted named entities as variables.";
            } else if (empty_reason == "remove") {
                reason = "the only entities in the output have been removed with the user specified remove entities from extraction settings.";
            } else {
                // No explanation is available: report only the error instead of
                // also logging a sentence that ends in "since undefined".
                log("Error: unknown reason for no variables to save");
                return false;
            }
            log("No entity variables have been saved since " + reason);
            return false;
        }
        var entity_R_type = selected_item.data.getAttribute("entity.variables", "data.type");
        var entity_type_names = selected_item.data.getAttribute("entity.variables", "names");
        // Iterate in reverse: every question is inserted directly after the
        // same last_variable, so reverse insertion leaves them in forward order.
        for (var i = entity_variables.length - 1; i >= 0; i--) {
            var r_entity_variables = entity_type_names[i].replace(/ /g, ".").toLowerCase() + ".variables";
            var expression = r_entity_variables + " <- " + stringToRName(selected_item.referenceName) + '$entity.variables[["' + entity_type_names[i] + '"]]\n' +
                             "n.levels <- attr(" + r_entity_variables + ", \"length.original.levels\")\n" +
                             "levels.exceeded <- !is.null(n.levels)\n" +
                             "form.max.levels <- " + form_max_levels + "\n" +
                             "if (levels.exceeded)\n" +
                             "{\n" +
                             "\twarning.msg <- paste0(\" has \", n.levels, \" categories (number of different \",\n" +
                             "\t                      \"extracted entities identified in the text). For performance reasons, the number of \",\n" +
                             "\t                      \"extracted entities has been reduced to the top \", form.max.levels, \" most \",\n" +
                             "\t                      \"popular entities with ties broken by alphabetical order if \",\n" +
                             "\t                      \"necessary. If you wish to relax this setting and save more \",\n" +
                             "\t                      \"entities of this type, increase the number to save by changing the value in \",\n" +
                             "\t                      \"the 'Maximum number of unique entity levels to save' control. However, \",\n" +
                             "\t                      \"performance might suffer.\")\n" +
                             "} else\n" +
                             "\twarning.msg <- NULL\n" +
                             "mentions.exceeded <- isTRUE(attr(" + r_entity_variables + ", \"mentions.truncated\"))\n" +
                             "n.mentions <- attr(" + r_entity_variables + ", \"original.max.mentions\")\n" +
                             "n.levels <- attr(" + r_entity_variables + ", \"mention.adjusted.levels\")\n" +
                             "if (mentions.exceeded && !is.null(n.mentions))\n" +
                             "{\n" +
                             "\toutput.msg <- paste0(\", the number of saved variables per case has been truncated to the first \",\n" +
                             "\t                     " + form_max_mentions + ", \" observed entities.\")\n" +
                             "\tif (!is.null(n.levels) && n.levels < form.max.levels)\n" +
                             "\toutput.msg <- paste0(output.msg, \" This truncation has reduced the number of extracted entities \",\n" +
                             "\t                     \"further from \", form.max.levels, \" categories to \", n.levels,\n" +
                             "\t                     \" categories.\")\n" +
                             "\toutput.msg <- paste0(output.msg, \" If you wish to relax this setting and save more entities per \",\n" +
                             "\t                     \"case, increase the value in the 'Maximum number of entities \",\n" +
                             "\t                     \"per case to save'.\")\n" +
                             "\tif (levels.exceeded)\n" +
                             "\t\twarning.msg <- paste0(warning.msg, \" Also, there is at least one case with \", n.mentions,\n" +
                             "\t\t                      \" identified entities. To keep the variable size manageable\", output.msg)\n" +
                             "\telse\n" +
                             "\t\twarning.msg <- paste0(\", has at least one case with \", n.mentions, \" identified entities. \",\n" +
                             "\t\t                      \"For performance reasons and to keep the variable \",\n" +
                             "\t\t                      \"size manageable\", output.msg)\n" +
                             "}\n" +
                             "if (levels.exceeded || mentions.exceeded)\n" +
                             "{\n" +
                             "\tpre.warning <- paste0(\"The " + entity_type_names[i] + " entities from the Entity Extraction output, " + selected_item.referenceName + "\")\n" +
                             "\twarning(pre.warning, warning.msg)\n" +
                             "}\n" +
                             r_entity_variables;
            var new_q_name = preventDuplicateQuestionName(data_file, entity_type_names[i] + " Entities from " +
                                                      selected_item.data.get("text.label"));
            var temp_var_name = randomVariableName(16); // temporary name, random to (almost) guarantee uniqueness

            try {
                var new_r_question = data_file.newRQuestion(expression, new_q_name, temp_var_name, last_variable);
                if (entity_R_type[i] === "list") {
                    new_r_question.questionType = "Pick Any - Compact";
                }
            } catch (e) {
                if (/Can only convert tabular results to an R/.test(e)) {
                    var size_msg = selected_item.getInput("formMaxLevels") === null ? "." : " The variable size can be reduced by decreasing the maximum number of entity levels.";
                    log(analysis_name + " could not be created from this item: The output variable is too large" + size_msg);
                } else {
                    log(analysis_name + " could not be created from this item: " + e);
                }
                return false;
            }

            // Replace temporary variable names
            nameSequentialVariables(new_r_question.variables, variable_prefix);

            // In Q, create a table showing the new question
            if (!web_mode) {
                var t = selected_item.group.appendTable();
                t.primary = new_r_question;
                project.report.setSelectedRaw([t]);
            }
        }
    }
    else if (selected_item.outputClasses.indexOf("wordBag") > -1)
    {
        var tokenized_text = selected_item.data.get("transformed.tokenized");
        var n_gram_max = selected_item.getInput("formNGramMax");
        var form_min_freq = selected_item.getInput("formminfreq");
        var flattened_tokenized = tokenized_text.flat();
        if (flattened_tokenized.length === 0)
        {
            var base_message = "Variables could not be created from this item, there are no terms in the text after transformation.";
            var extra_message = "";
            if (form_min_freq > 1)
            {
                extra_message = " Terms might be found if the minimum frequency control is reduced from " + form_min_freq + " to a smaller value.";
            }
            log(base_message + extra_message);
            return false;
        }
        // If wordBag is new, check the hash, otherwise compute the variables regardless for an older wordBag output.
        // (An exception while reading the hash is treated as "older output".)
        var valid_check;
        try {
            valid_check = "\nif (wordBag$hash != '" + selected_item.data.get("hash") + "')\n" +
                          "    stop('The Text Analysis output used to create these variables has changed " +
                          "and as a result these variables are no longer valid. Please delete these variables and rerun the " +
                          script_name + " script on the Text Analysis output.')\n";
        } catch (e) {
            valid_check = "";
        }
        var freq_statement = "The output size can be reduced by increasing the Minimum Frequency value from " + form_min_freq + " to a larger value.";
        var n_gram_reduce_statement = n_gram_max > 1 ? " Also consider reducing the Maximum n for n-gram identification value from " + n_gram_max + " to a smaller value." : "";

        var expression = "wordBag <- " + stringToRName(selected_item.referenceName) + "\n" +
                         valid_check +
                         "flipTextAnalysis::SaveVariablesCategories(" + stringToRName(selected_item.referenceName) + ",\n" +
                         "                                          form.max.levels = " + form_max_levels + ",\n" +
                         "                                          form.max.mentions = " + form_max_mentions + ")";
        var new_q_name = preventDuplicateQuestionName(data_file, analysis_name + " from " +
                                                          selected_item.data.getAttribute("original.text", "label"));
        var temp_var_name = randomVariableName(16); // temporary name, random to (almost) guarantee uniqueness

        try {
            var new_r_question = data_file.newRQuestion(expression, new_q_name, temp_var_name, last_variable);
        } catch (e) {
            if (/Can only convert tabular results to an R/.test(e)) {
                log(analysis_name + " could not be created from this item: The output variable is too large. " +
                    freq_statement + n_gram_reduce_statement);
            } else {
                log(analysis_name + " could not be created from this item: " + e);
            }
            return false;
        }

        // Replace temporary variable names
        nameSequentialVariables(new_r_question.variables, variable_prefix);

        new_r_question.questionType = "Pick Any - Compact";

        // In Q, create a table showing the new question
        if (!web_mode) {
            var t = selected_item.group.appendTable();
            t.primary = new_r_question;
            project.report.setSelectedRaw([t]);
        }
    }
    return true;
}