Text Analysis - Save Variable(s) - First Category
Save a variable to the data set containing the first category mentioned. Where there are multiple input categories, the first category of each will be saved as a separate variable.
This QScript saves the first categories from an Automatic Categorization - List of Items, Automatic Categorization - Unstructured Text, Automatic Categorization - Entity Extraction or Text Analysis - Advanced - Setup Text Analysis output as a Pick One or Pick One - Multi question.
Technical Details
The variables created from this QScript may become invalid and need to be deleted and recreated if the output from Automatic Categorization - List of Items, Automatic Categorization - Unstructured Text, Automatic Categorization - Entity Extraction or Text Analysis - Advanced - Setup Text Analysis has changed, either due to the input text variable being modified or the input settings being modified.
This QScript requires Q version 5.5.1 or later to be executed.
Code
// Load the shared "QScript R Output Functions" library before running.
// NOTE(review): presumably this is where helpers such as getSelectedROutputFromPage
// and getDataFileFromItemDependants (called by main) come from — confirm.
includeWeb("QScript R Output Functions");
// Run the script; main() returns true on success and false when it logs a problem.
main();
/**
 * Saves the "first category" of the selected text analysis R output as new
 * R question(s) in the output's data file.
 *
 * Supported output classes, checked in this order: "categorizedlist",
 * "AutomaticCategorization", "TextClassifier", "EntityExtraction" and
 * "wordBag". Every problem is reported through log().
 *
 * @returns {boolean} true when the question(s) were created, false when the
 *     script stopped early (old Q version, bad selection, no data file,
 *     nothing to save, or a failure while creating an R question).
 */
function main() {
    // require Q 5.5.1 or after in order to call selected_item.data
    if (Q.fileFormatVersion() < 14.11) {
        log("Q 5.5.1 or later is required to run this script, please update your version of Q.");
        return false;
    }

    var script_name = "Save Variable(s) - First Category";
    var analysis_name = "First Categories";
    var variable_prefix = "first.categ";
    var bad_selection_message = "Select a List Categorization, Automatic Categorization, Entity Extraction or Setup Text Analysis output.";
    var creation_failed_prefix = analysis_name + " could not be created from this item: ";
    var too_large_pattern = /Can only convert tabular results to an R/;
    var web_mode = (!!Q.isOnTheWeb && Q.isOnTheWeb());

    var selected_item = getSelectedROutputFromPage(["categorizedlist", "AutomaticCategorization", "EntityExtraction", "TextClassifier", "wordBag"]);
    if (selected_item === null) {
        log(bad_selection_message);
        return false;
    }

    var data_file = getDataFileFromItemDependants(selected_item);
    if (data_file == null) {
        log("'Save variables' cannot be applied to an output with no data file.");
        return false;
    }

    // Find last variable, which we will place the generated variables after
    var last_variable = getLastVariable(getVariables(selected_item.dependants(false)));

    // Use the output's maximum-levels control when it exists; otherwise do not
    // limit the levels ("Inf" is spliced into the R code as the R constant Inf).
    var max_levels_input = selected_item.getInput("formMaxLevels");
    var has_max_levels_control = max_levels_input != null;
    var form_max_levels = has_max_levels_control ? max_levels_input : "Inf";

    var r_item_name = stringToRName(selected_item.referenceName);

    var new_r_question;
    if (hasOutputClass("categorizedlist")) {
        new_r_question = saveFromCategorizedList();
    } else if (hasOutputClass("AutomaticCategorization")) {
        new_r_question = saveFromAutomaticCategorization();
    } else if (hasOutputClass("TextClassifier")) {
        new_r_question = saveFromTextClassifier();
    } else if (hasOutputClass("EntityExtraction")) {
        // Entity extraction creates (and shows) one question per entity type itself.
        return saveFromEntityExtraction();
    } else if (hasOutputClass("wordBag")) {
        new_r_question = saveFromWordBag();
    } else {
        // Defensive: no supported class matched, so there is nothing to save.
        log(bad_selection_message);
        return false;
    }

    if (new_r_question === null) {
        return false;
    }
    // In Q, select the table showing the new question
    showInNewTable(new_r_question);
    return true;

    // ---- helpers (function declarations are hoisted; they share main's locals) ----

    // True when the selected output carries the given R class name.
    function hasOutputClass(class_name) {
        return selected_item.outputClasses.indexOf(class_name) > -1;
    }

    // R stop() statement raised when the text input no longer matches the saved variable(s).
    function staleInputStop(plural) {
        var subject = plural ?
            " stop('The input text variables used for the Text Analysis have changed " +
            "and as a result these variables are no longer valid. Please delete these variables and rerun the " :
            " stop('The input text variable used for the Text Analysis has changed " +
            "and as a result this variable is no longer valid. Please delete this variable and rerun the ";
        return subject + script_name + " script on the Text Analysis output.')\n";
    }

    // R preamble shared by the AutomaticCategorization and TextClassifier outputs:
    // binds the output to `categorization` and stops if its text label has changed.
    function categorizationStaleCheck(text_label) {
        return "categorization = " + r_item_name +
            "\nif (gsub('\\u00a0', ' ', gsub('\\'', '', categorization$text.label, fixed = TRUE), fixed = TRUE) != '" +
            text_label.toString().replace(/'/g, "") + "')\n" +
            staleInputStop(false);
    }

    // Creates the R question after last_variable under a temporary variable name.
    // Returns the question, or null after logging why it could not be created.
    // describe_too_large (optional) supplies the message used when Q reports that
    // the result cannot be converted to a variable.
    function createQuestion(expression, question_name, describe_too_large) {
        // temporary name, random to (almost) guarantee uniqueness
        var temp_var_name = randomVariableName(16);
        try {
            return data_file.newRQuestion(expression, question_name, temp_var_name, last_variable);
        } catch (e) {
            if (describe_too_large && too_large_pattern.test(e)) {
                log(creation_failed_prefix + describe_too_large());
            } else {
                log(creation_failed_prefix + e);
            }
            return null;
        }
    }

    // In Q (not on the web), appends a table showing the question and selects it.
    function showInNewTable(question) {
        if (web_mode) {
            return;
        }
        var t = selected_item.group.appendTable();
        t.primary = question;
        project.report.setSelectedRaw([t]);
    }

    // List categorization: one variable per input text variable.
    function saveFromCategorizedList() {
        var variable_names = selected_item.data.get("variable.names");
        var expression = "categorizedlist <- " + r_item_name +
            "\nif (paste0(gsub('\\u00a0', ' ', gsub('\\'', '', categorizedlist$variable.names, fixed = TRUE), fixed = TRUE), collapse = '') != '" +
            variable_names.join("").replace(/'/g, "") + "')\n" +
            staleInputStop(true) +
            "flipTextAnalysis::SaveVariablesFirstCategory(" + r_item_name + ",\n" +
            " form.max.levels = " + form_max_levels + ")";
        var question_name = preventDuplicateQuestionName(data_file, analysis_name + " from " + selected_item.referenceName);
        var question = createQuestion(expression, question_name, function () {
            return "The output variable is too large. " +
                "The variable size can be reduced by increasing the required size of each category before saving (increasing the value in the 'Minimum category size' control)";
        });
        if (question === null) {
            return null;
        }
        // Replace temporary variable names
        nameSequentialVariables(question.variables, variable_prefix);
        question.questionType = question.variables.length > 1 ? "Pick One - Multi" : "Pick One";
        return question;
    }

    // Automatic categorization of unstructured text: a single categorical variable.
    function saveFromAutomaticCategorization() {
        var text_label = selected_item.data.get("text.label");
        var expression = categorizationStaleCheck(text_label) + "categorization$categorization";
        var question_name = preventDuplicateQuestionName(data_file, analysis_name + " from " + text_label);
        var question = createQuestion(expression, question_name, function () {
            var number_categories = selected_item.getInput("formCategories");
            var size_msg = number_categories > 1 ? " The variable size could possibly be reduced by decreasing the number of categories from " + number_categories + " to a smaller number." : "";
            return "The output variable is too large." + size_msg;
        });
        if (question === null) {
            return null;
        }
        // Replace temporary variable names
        nameSequentialVariables(question.variables, variable_prefix);
        question.variables[0].variableType = "Categorical";
        question.questionType = "Pick One";
        return question;
    }

    // Text classifier: a single categorical variable taken from the predictions.
    function saveFromTextClassifier() {
        var text_label = selected_item.data.get("text.label");
        var expression = categorizationStaleCheck(text_label);
        // Loose equality kept on purpose: the attribute may arrive as a
        // single-element array rather than a plain string (TODO confirm).
        var multiple_input = selected_item.data.getAttribute("predicted", "class") == "data.frame";
        if (multiple_input) {
            // One column per category: keep the name of the first column holding
            // the row maximum, or NA when the row is all missing or sums to zero.
            expression += "predicted <- categorization$predicted\n" +
                "category.levels <- names(predicted)\n" +
                "factor(apply(predicted, 1, function(x) {\n" +
                "\tif(all(is.na(x)) || sum(x, na.rm = TRUE) == 0) NA else category.levels[which.max(x)]\n" +
                "}))";
        } else {
            expression += "categorization$predicted";
        }
        var question_name = preventDuplicateQuestionName(data_file, analysis_name + " from " + text_label);
        var question = createQuestion(expression, question_name, null);
        if (question === null) {
            return null;
        }
        // Replace temporary variable names
        nameSequentialVariables(question.variables, variable_prefix);
        question.variables[0].variableType = "Categorical";
        question.questionType = "Pick One";
        return question;
    }

    // Explains why an Entity Extraction output has no variables to save;
    // returns null when the recorded reason is not recognised.
    function describeMissingEntities() {
        // Loose equality kept on purpose: see saveFromTextClassifier.
        var empty_reason = selected_item.data.get("entity.variables.empty.reason");
        if (empty_reason == "output") {
            return "there are no extracted entities in the output.";
        }
        if (empty_reason == "min") {
            var entity_counts = selected_item.data.get("entity.counts");
            var entity_names = selected_item.data.getAttribute("entity.counts", "names");
            var min_cases_to_save = selected_item.data.get("min.cases.to.save");
            var largest_count = entity_counts.reduce(function (x, y) {
                return (x > y) ? x : y;
            });
            var largest_name = entity_names[entity_counts.indexOf(largest_count)];
            return "the entity type with the most number of extractions, '" + largest_name + "', contains " + largest_count + " named entities. However," +
                " the minimum number of entities required to save is set at " + min_cases_to_save + ". Set 'Minimum number of cases to save' to " +
                largest_count + " or lower to save the extracted named entities as variables.";
        }
        if (empty_reason == "remove") {
            return "the only entities in the output have been removed with the user specified remove entities from extraction settings.";
        }
        return null;
    }

    // Entity extraction: one question per entity type. Returns true on success,
    // false when nothing could be saved or a question could not be created.
    function saveFromEntityExtraction() {
        var entity_variables = selected_item.data.get("entity.variables");
        // A lone boolean in place of the variables marks "nothing to save".
        if (entity_variables.length === 1 && typeof entity_variables[0] === "boolean") {
            var reason = describeMissingEntities();
            if (reason === null) {
                log("Error: unknown reason for no variables to save for this Entity Extraction output");
            } else {
                log("No entity variables have been saved since " + reason);
            }
            return false;
        }

        var entity_type_names = selected_item.data.getAttribute("entity.variables", "names");
        var entity_R_type = selected_item.data.getAttribute("entity.variables", "data.type");
        var too_large_suffix = has_max_levels_control ? " The variable size can be reduced by decreasing the maximum number of entity levels." : ".";

        function describeTooLarge() {
            return "The output variable is too large" + too_large_suffix;
        }

        // Walk the entity types from last to first: every question is inserted
        // directly after last_variable, so this presumably leaves them in their
        // original order (confirm against newRQuestion's insertion behaviour).
        for (var i = entity_variables.length - 1; i >= 0; i--) {
            var type_name = entity_type_names[i];
            var r_entity_variables = stringToRName(type_name.replace(/ /g, ".").toLowerCase() + ".variables");
            var expression = r_entity_variables + " <- " + r_item_name + '$entity.variables[["' + type_name + '"]]' +
                // For a "list" type only the first column (first entity per case) is kept.
                (entity_R_type[i] === "list" ? '[, 1]\n' : '\n') +
                // Define form.max.levels in the R code: the warning below refers to
                // it, and it would otherwise be an undefined object in R.
                "form.max.levels <- " + form_max_levels + "\n" +
                "unique.entities <- unique(" + r_entity_variables + ")\n" +
                "n.levels <- length(unique.entities)\n" +
                "length.original.levels <- attr(" + r_entity_variables + ", \"length.original.levels\")\n" +
                "if (!is.null(length.original.levels))\n" +
                "{\n" +
                "\twarning(\"In the first " + type_name + " entities extracted in each case for the Entity Extraction R output, '" + selected_item.referenceName + "', there are \", n.levels, \" different entities overall (number of unique observed entities of this type extracted from the text). \",\n" +
                "\t \"For performance reasons, the number of overall extracted entities in this entity type has been reduced to the top \", form.max.levels, \" most popular levels \",\n" +
                "\t \"with ties broken by alphabetical order if necessary. If you wish to relax this setting and save more levels, \",\n" +
                "\t \"increase the number to save by changing the value in the 'Maximum number of entities to save' control in '" + selected_item.referenceName + "'. \",\n" +
                "\t \"However, performance might suffer.\")\n" +
                "}\n" +
                "tab <- sort(table(" + r_entity_variables + "), decreasing = TRUE)\n" +
                "unobserved.levels <- tab == 0\n" +
                "factor(" + r_entity_variables + ", levels = names(which(!unobserved.levels)))";
            var question_name = preventDuplicateQuestionName(data_file, type_name + " Entities from " +
                selected_item.data.get("text.label"));
            var question = createQuestion(expression, question_name, describeTooLarge);
            if (question === null) {
                return false;
            }
            // Replace temporary variable names
            nameSequentialVariables(question.variables, variable_prefix);
            showInNewTable(question);
        }
        return true;
    }

    // Setup Text Analysis (wordBag) output.
    function saveFromWordBag() {
        var tokenized_text = selected_item.data.get("transformed.tokenized");
        var n_gram_max = selected_item.getInput("formNGramMax");
        var form_min_freq = selected_item.getInput("formminfreq");
        if (tokenized_text.flat().length === 0) {
            var extra_message = form_min_freq > 1 ?
                " Terms might be found if the minimum frequency control is reduced from " + form_min_freq + " to a smaller value." : "";
            log("Variables could not be created from this item, there are no terms in the text after transformation." + extra_message);
            return null;
        }
        var original_label = selected_item.data.getAttribute("original.text", "label");
        var expression = "wordBag <- " + r_item_name + "\n" +
            "\nif (gsub('\\u00a0', ' ', gsub('\\'', '', attr(wordBag$original.text, 'label'), fixed = TRUE), fixed = TRUE) != '" + original_label.toString().replace(/'/g, "") + "')\n" +
            staleInputStop(false) +
            "flipTextAnalysis::SaveVariablesFirstCategory(" + r_item_name + ",\n" +
            " form.max.levels = " + form_max_levels + ")";
        var question_name = preventDuplicateQuestionName(data_file, analysis_name + " from " + original_label);
        var question = createQuestion(expression, question_name, function () {
            var freq_statement = "The output size can be reduced by increasing the Minimum Frequency value from " + form_min_freq + " to a larger value.";
            var n_gram_reduce_statement = n_gram_max > 1 ? " Also consider reducing the Maximum n for n-gram identification value from " + n_gram_max + " to a smaller value." : "";
            return "The output variable is too large. " + freq_statement + n_gram_reduce_statement;
        });
        if (question === null) {
            return null;
        }
        // Replace temporary variable names
        nameSequentialVariables(question.variables, variable_prefix);
        return question;
    }
}
Displayr - Anything Menu
Extensions
Q Technical Reference
Q Technical Reference
Q Technical Reference > Setting Up Data > Creating New Variables
Q Technical Reference > Updating and Automation > Automation Online Library
Q Technical Reference > Updating and Automation > JavaScript > QScript > QScript Examples Library > QScript Online Library
R Online Library
Text Analysis
Text Analysis - Advanced
User Interface > Create Text Analysis