QScript Functions for Saving Text Analysis Variables
Jump to navigation
Jump to search
This page is currently under construction, or it refers to features which are under development and not yet available for use.
This page is under construction. Its contents are only visible to developers!
includeWeb('QScript Utility Functions');
includeWeb('QScript R Output Functions');
// In all occurrences below, item_and_data_settings is an object that has the following elements
// - selectedItem - The Text Analysis R Output to save the variables from.
// - formMaxLevels - The integer stating the maximum number of levels in the category,
// - formMaxMentions - The integer stating the maximum number of mentions,
// - dataFile - The data file to save the variables,
// - lastVariable - The position to save the variables,
// - analysisName - The name used in the new question names and in log messages,
// - variablePrefix - The prefix for the new variable names ('categ' selects the all-categories variants),
// Entry point for saving variables from a List Categorization output.
// A variablePrefix of 'categ' routes to saveListCatAll; anything else routes to
// saveListCatFirst. The chosen function's result is returned unchanged.
function saveListCategorizationVariables(item_and_data_settings) {
    if (item_and_data_settings.variablePrefix === 'categ') {
        return saveListCatAll(item_and_data_settings);
    }
    return saveListCatFirst(item_and_data_settings);
}
// Saves the "all categories" variables of a List Categorization output.
// One R question is created per entry of the output's 'variable.names' data, working from
// the last entry to the first and passing the same lastVariable position every time.
// Each generated R expression stops with an error if the output's hash no longer matches
// the hash read while building the expression.
// Returns the array of created questions, or false (after logging the reason) as soon as
// a question cannot be created.
function saveListCatAll(item_and_data_settings) {
    const settings = item_and_data_settings;
    const item = settings.selectedItem;
    const file = settings.dataFile;
    const max_levels = settings.formMaxLevels;
    const max_mentions = settings.formMaxMentions;
    const insert_after = settings.lastVariable;
    const label = settings.analysisName;
    const prefix = settings.variablePrefix;
    const input_names = item.data.get('variable.names');
    const created = [];
    const r_object = 'categorizedlist';
    const r_source = stringToRName(item.referenceName);

    let idx = input_names.length;
    while (idx-- > 0) {
        // R indexing is 1-based, hence idx + 1.
        const r_name_lookup = `${r_object}$variable.names[${idx + 1}]`;
        const r_code = `
${r_object} <- ${r_source}
if (categorizedlist$hash != "${item.data.get('hash')}")
stop("The Text Analysis output used to create these variables has changed ",
"and as a result these variables are no longer valid. ",
"Please delete these variables and rerun the Save Variable(s) - Categories ",
"option on the Text Analysis output.")
# Look to replace this with single javascript code outside the loop
form.max.levels <- ${max_levels}
form.max.mentions <- ${max_mentions}
flipTextAnalysis::SaveVariablesCategories(${r_source},
variable.name = ${r_name_lookup},
form.max.levels = form.max.levels,
form.max.mentions = form.max.mentions)`;
        const question_name = preventDuplicateQuestionName(file, `${label} from ${input_names[idx]}`);
        // Temporary name, random to (almost) guarantee uniqueness.
        const placeholder = randomVariableName(16);

        let question;
        try {
            question = file.newRQuestion(r_code, question_name, placeholder, insert_after);
        } catch (e) {
            let message = label + ' could not be created from this item: ';
            if (/Can only convert tabular results to an R/.test(e)) {
                message += 'The output variable is too large. ' +
                    'The variable size can be reduced by increasing the required size of each category before ' +
                    'saving (increasing the value in the \'Minimum category size\' control)';
            } else {
                message += e;
            }
            log(message);
            return false;
        }

        // Swap the temporary variable names for sequential, prefixed ones.
        nameSequentialVariables(question.variables, prefix);
        question.questionType = 'Pick Any - Compact';
        created.push(question);
    }
    return created;
}
// Saves the "first category" variable(s) of a List Categorization output as one R question.
// The generated R expression stops with an error if the output's variable names (with
// non-breaking spaces replaced) no longer match the names read while building it.
// The question becomes 'Pick One - Multi' when it has more than one variable, else 'Pick One'.
// Returns a one-element array holding the new question, or false after logging a failure.
function saveListCatFirst(item_and_data_settings) {
    const {
        selectedItem: item,
        dataFile: file,
        formMaxLevels: max_levels,
        lastVariable: insert_after,
        analysisName: label,
        variablePrefix: prefix
    } = item_and_data_settings;
    const r_object = 'categorizedlist';
    const names_check = removeNBSPForR(`${r_object}$variable.names`, item.data.get('variable.names'));
    const r_code = `
${r_object} <- ${stringToRName(item.referenceName)}
if (${names_check.lefthandside} != "${names_check.righthandside}")
stop("The input text variables used for the Text Analysis have changed ",
"and as a result these variables are no longer valid. Please delete ",
"these variables and rerun the Save Variable(s) - First Category ",
"option on the Text Analysis output.")
flipTextAnalysis::SaveVariablesFirstCategory(${stringToRName(item.referenceName)},
form.max.levels = ${max_levels})`;
    const question_name = preventDuplicateQuestionName(file, `${label} from ${item.referenceName}`);
    // Temporary name, random to (almost) guarantee uniqueness.
    const placeholder = randomVariableName(16);

    let question;
    try {
        question = file.newRQuestion(r_code, question_name, placeholder, insert_after);
    } catch (e) {
        let message = label + ' could not be created from this item: ';
        if (/Can only convert tabular results to an R/.test(e)) {
            message += 'The output variable is too large. ' +
                'The variable size can be reduced by increasing the required size of each category before ' +
                'saving (increasing the value in the \'Minimum category size\' control)';
        } else {
            message += e;
        }
        log(message);
        return false;
    }

    // Swap the temporary variable names for sequential, prefixed ones.
    nameSequentialVariables(question.variables, prefix);
    if (question.variables.length > 1) {
        question.questionType = 'Pick One - Multi';
    } else {
        question.questionType = 'Pick One';
    }
    return [question];
}
// Entry point for saving variables from an Automatic Categorization output.
// Picks saveAutoCatAll when variablePrefix is 'categ' and saveAutoCatFirst otherwise,
// then returns the result of calling it with the same settings.
function saveAutomaticCategorizationVariables(item_and_data_settings) {
    const save_all = item_and_data_settings.variablePrefix === 'categ';
    const saver = save_all ? saveAutoCatAll : saveAutoCatFirst;
    return saver(item_and_data_settings);
}
// Saves the "all categories" variable(s) of an Automatic Categorization output as one R question.
// When the output's 'formExistingCat' input is not null the question returns
// categorization$predicted, and is typed 'Pick Any' if the 'class' attribute of 'predicted'
// compares equal to 'data.frame'; otherwise it returns categorization$categorization and is
// typed 'Pick One'. Every variable is set to 'Categorical'.
// Returns a one-element array holding the new question, or false after logging a failure.
function saveAutoCatAll(item_and_data_settings) {
    const settings = item_and_data_settings;
    const item = settings.selectedItem;
    const file = settings.dataFile;
    const insert_after = settings.lastVariable;
    const label = settings.analysisName;
    const prefix = settings.variablePrefix;
    const r_object = 'categorization';
    const label_check = removeNBSPForR(`${r_object}$text.label`, item.data.get('text.label'));
    let r_code = `
${r_object} <- ${stringToRName(item.referenceName)}
if (${label_check.lefthandside} != "${label_check.righthandside}")
stop("The input text variable used for the Text Analysis has changed ",
"and as a result this variable is no longer valid. Please delete this ",
"variable and rerun the Save Variable(s) - Categories option on the ",
"Text Analysis output.")
`;
    let is_pick_any = false;
    if (item.getInput('formExistingCat') === null) {
        r_code += `${r_object}$categorization`;
    } else {
        // Loose equality is intentional: it is what the comparison has always used.
        is_pick_any = item.data.getAttribute('predicted', 'class') == 'data.frame';
        r_code += `${r_object}$predicted`;
    }
    const question_name = preventDuplicateQuestionName(file, label + " from " +
        item.data.get("text.label"));
    // Temporary name, random to (almost) guarantee uniqueness.
    const placeholder = randomVariableName(16);

    let question;
    try {
        question = file.newRQuestion(r_code, question_name, placeholder, insert_after);
    } catch (e) {
        let message = label + ' could not be created from this item: ';
        if (/Can only convert tabular results to an R/.test(e)) {
            const category_count = item.getInput('formCategories');
            const hint = category_count > 1
                ? ' The variable size could possibly be reduced by decreasing the number of categories ' +
                  'from ' + category_count + ' to a smaller number.'
                : '';
            message += 'The output variable is too large. ' + hint;
        } else {
            message += e;
        }
        log(message);
        return false;
    }

    // Swap the temporary variable names for sequential, prefixed ones.
    nameSequentialVariables(question.variables, prefix);
    question.variables.forEach((variable) => {
        variable.variableType = 'Categorical';
    });
    if (is_pick_any) {
        question.questionType = 'Pick Any';
        question.needsCheckValuesToCount = false;
    } else {
        question.questionType = 'Pick One';
    }
    return [question];
}
// Saves the "first category" variable of an Automatic Categorization output as one R question
// that returns categorization$categorization. The generated R expression stops with an error
// if the output's text label no longer matches the label read while building it.
// The first variable is set to 'Categorical' and the question to 'Pick One'.
// Returns a one-element array holding the new question, or false after logging a failure.
function saveAutoCatFirst(item_and_data_settings) {
    const {
        selectedItem: item,
        dataFile: file,
        lastVariable: insert_after,
        analysisName: label,
        variablePrefix: prefix
    } = item_and_data_settings;
    const r_object = 'categorization';
    const text_label = item.data.get('text.label');
    const label_check = removeNBSPForR(r_object + '$text.label', text_label);
    const r_code = `
${r_object} <- ${stringToRName(item.referenceName)}
if (${label_check.lefthandside} != "${label_check.righthandside}")
stop("The input text variable used for the Text Analysis has changed ",
"and as a result this variable is no longer valid. Please delete ",
"this variable and rerun the Save Variable(s) - First Categories ",
"option on the Text Analysis output.")
` + 'categorization$categorization';
    const question_name = preventDuplicateQuestionName(file, label + ' from ' + text_label);
    // Temporary name, random to (almost) guarantee uniqueness.
    const placeholder = randomVariableName(16);

    let question;
    try {
        question = file.newRQuestion(r_code, question_name, placeholder, insert_after);
    } catch (e) {
        let message = label + ' could not be created from this item: ';
        if (/Can only convert tabular results to an R/.test(e)) {
            const category_count = item.getInput('formCategories');
            let hint = '';
            if (category_count > 1) {
                hint = ' The variable size could possibly be reduced by decreasing the number of categories ' +
                    'from ' + category_count + ' to a smaller number.';
            }
            message += 'The output variable is too large. ' + hint;
        } else {
            message += e;
        }
        log(message);
        return false;
    }

    // Swap the temporary variable names for sequential, prefixed ones.
    nameSequentialVariables(question.variables, prefix);
    question.variables[0].variableType = 'Categorical';
    question.questionType = 'Pick One';
    return [question];
}
// Entry point for saving variables from a Text Classifier output.
// A variablePrefix of 'categ' routes to saveTextClassifierAll; any other value routes to
// saveTextClassifierFirst. The chosen function's result is returned unchanged.
function saveTextClassifierVariables(item_and_data_settings) {
    switch (item_and_data_settings.variablePrefix) {
    case 'categ':
        return saveTextClassifierAll(item_and_data_settings);
    default:
        return saveTextClassifierFirst(item_and_data_settings);
    }
}
// Saves the "all categories" variable(s) of a Text Classifier output as one R question that
// returns categorization$predicted. The generated R expression stops with an error if the
// output's text label no longer matches the label read while building it.
// Every variable is set to 'Categorical'. The question is 'Pick Any' when the 'class'
// attribute of 'predicted' compares equal to 'data.frame', and 'Pick One' otherwise.
// Returns a one-element array holding the new question, or false after logging a failure.
function saveTextClassifierAll(item_and_data_settings) {
    const settings = item_and_data_settings;
    const item = settings.selectedItem;
    const file = settings.dataFile;
    const insert_after = settings.lastVariable;
    const label = settings.analysisName;
    const prefix = settings.variablePrefix;
    const r_object = 'categorization';
    const text_label = item.data.get('text.label');
    const label_check = removeNBSPForR(r_object + '$text.label', text_label);
    const r_code = `
${r_object} <- ${stringToRName(item.referenceName)}
if (${label_check.lefthandside} != "${label_check.righthandside}")
stop("The input text variable used for the Text Analysis has changed ",
"and as a result this variable is no longer valid. Please delete ",
"this variable and rerun the Save Variable(s) - Categories option ",
"on the Text Analysis output.")
categorization$predicted
`;
    const question_name = preventDuplicateQuestionName(file, label + ' from ' + text_label);
    // Temporary name, random to (almost) guarantee uniqueness.
    const placeholder = randomVariableName(16);

    let question;
    try {
        question = file.newRQuestion(r_code, question_name, placeholder, insert_after);
    } catch (e) {
        log(label + ' could not be created from this item: ' + e);
        return false;
    }

    // Swap the temporary variable names for sequential, prefixed ones.
    nameSequentialVariables(question.variables, prefix);
    question.variables.forEach(function (variable) {
        variable.variableType = 'Categorical';
    });
    // Loose equality is intentional: it is what the comparison has always used.
    const is_pick_any = item.data.getAttribute('predicted', 'class') == 'data.frame';
    if (is_pick_any) {
        question.questionType = 'Pick Any';
        question.needsCheckValuesToCount = false;
    } else {
        question.questionType = 'Pick One';
    }
    return [question];
}
// Saves the "first category" variable of a Text Classifier output as one R question.
// The generated R expression stops with an error if the output's text label no longer
// matches the label read while building it. When the 'class' attribute of 'predicted'
// compares equal to 'data.frame', the R code reduces each row of categorization$predicted to
// a single category name (NA when the row is all NA or sums to zero) and returns a factor;
// otherwise categorization$predicted is returned as-is.
// The first variable is set to 'Categorical' and the question to 'Pick One'.
// Returns a one-element array holding the new question, or false after logging a failure.
function saveTextClassifierFirst(item_and_data_settings) {
    const selected_item = item_and_data_settings.selectedItem;
    const data_file = item_and_data_settings.dataFile;
    const last_variable = item_and_data_settings.lastVariable;
    const analysis_name = item_and_data_settings.analysisName;
    const variable_prefix = item_and_data_settings.variablePrefix;
    const text_analysis = 'categorization';
    const var_NBSP_removed = removeNBSPForR(text_analysis + '$text.label', selected_item.data.get('text.label'));
    // The stop() text is shown to the user, so the option name must be spelled correctly.
    let expression = `
${text_analysis} <- ${stringToRName(selected_item.referenceName)}
if (${var_NBSP_removed.lefthandside} != "${var_NBSP_removed.righthandside}")
stop("The input text variable used for the Text Analysis has changed ",
"and as a result this variable is no longer valid. Please delete ",
"this variable and rerun the Save Variable(s) - First Category ",
"option on the Text Analysis output.")
`;
    const predicted_type = selected_item.data.getAttribute('predicted', 'class');
    // Loose equality is kept on purpose: it is what the comparison has always used.
    const multiple_input = predicted_type == 'data.frame';
    if (multiple_input) {
        expression += `
predicted <- categorization$predicted
category.levels <- names(predicted)
firstResponse <- function(x) if (all(is.na(x)) || sum(x, na.rm = TRUE) == 0) NA else category.levels[which.max(x)]
new.var <- apply(predicted, 1L, firstResponse)
factor(new.var)
`;
    } else {
        expression += 'categorization$predicted';
    }
    const new_q_name = preventDuplicateQuestionName(data_file,
        `${analysis_name} from ${selected_item.data.get('text.label')}`);
    // Temporary name, random to (almost) guarantee uniqueness.
    const temp_var_name = randomVariableName(16);
    let new_r_question;
    try {
        new_r_question = data_file.newRQuestion(expression, new_q_name, temp_var_name, last_variable);
    } catch (e) {
        log(analysis_name + ' could not be created from this item: ' + e);
        return false;
    }
    // Replace temporary variable names
    nameSequentialVariables(new_r_question.variables, variable_prefix);
    new_r_question.variables[0].variableType = 'Categorical';
    new_r_question.questionType = 'Pick One';
    return [new_r_question];
}
// Entry point for saving variables from an Entity Extraction output.
// Calls saveEntityExtractionAll when variablePrefix is 'categ', otherwise
// saveEntityExtractionFirst, and returns that call's result.
function saveEntityExtractionVariables(item_and_data_settings) {
    const wants_all = item_and_data_settings.variablePrefix === 'categ';
    if (!wants_all) {
        return saveEntityExtractionFirst(item_and_data_settings);
    }
    return saveEntityExtractionAll(item_and_data_settings);
}
// Saves the "all categories" variables of an Entity Extraction output, creating one R
// question per entity type (taken from the 'names' attribute of 'entity.variables'), working
// from the last type to the first and passing the same lastVariable position every time.
//
// If 'entity.variables' is a single boolean there is nothing to save: the reason given by
// 'entity.variables.empty.reason' is logged and false is returned.
//
// Each generated R expression returns the entity variables for one type and raises an R
// warning when the number of levels or the number of mentions per case was truncated.
// Questions whose 'data.type' attribute entry is 'list' are set to 'Pick Any - Compact'.
//
// Returns the array of created questions, or false (after logging) as soon as one question
// cannot be created.
function saveEntityExtractionAll(item_and_data_settings) {
    const selected_item = item_and_data_settings.selectedItem;
    const data_file = item_and_data_settings.dataFile;
    const form_max_levels = item_and_data_settings.formMaxLevels;
    const form_max_mentions = item_and_data_settings.formMaxMentions;
    const last_variable = item_and_data_settings.lastVariable;
    const analysis_name = item_and_data_settings.analysisName;
    const variable_prefix = item_and_data_settings.variablePrefix;
    const entity_variables = selected_item.data.get('entity.variables');
    if (entity_variables.length === 1 && typeof entity_variables[0] === 'boolean') {
        let reason;
        const empty_reason = selected_item.data.get('entity.variables.empty.reason');
        if (empty_reason == 'output') {
            reason = 'there are no extracted entities in the output.';
        } else if (empty_reason == 'min') {
            const entity_counts = selected_item.data.get('entity.counts');
            const entity_names = selected_item.data.getAttribute('entity.counts', 'names');
            const min_cases_to_save = selected_item.data.get('min.cases.to.save');
            const largest_count = entity_counts.reduce(function(x, y) {
                return (x > y) ? x : y;
            });
            const largest_name = entity_names[entity_counts.indexOf(largest_count)];
            reason = 'the entity type with the most number of extractions, \'' + largest_name + '\', contains ' +
                largest_count + ' named entities. However, the minimum number of entities required to save ' +
                'is set at ' + min_cases_to_save + '. Set \'Minimum number of cases to save\' to ' +
                largest_count + ' or lower to save the extracted named entities as variables.';
        } else if (empty_reason == 'remove') {
            reason = 'the only entities in the output have been removed with the user specified remove entities ' +
                'from extraction settings.';
        } else {
            log('Error: unknown reason for no variables to save');
            return false;
        }
        log('No entity variables have been saved since ' + reason);
        return false;
    }
    const saved_variables = [];
    const entity_R_type = selected_item.data.getAttribute('entity.variables', 'data.type');
    const entity_type_names = selected_item.data.getAttribute('entity.variables', 'names');
    const input_r_name = stringToRName(selected_item.referenceName);
    for (let i = entity_variables.length - 1; i >= 0; i--) {
        // Name of the temporary R variable; passed through stringToRName in the same way as
        // saveEntityExtractionFirst does for the same kind of name.
        const r_entity_vars = stringToRName(entity_type_names[i].replace(/ /g, '.').toLowerCase() + '.variables');
        // Notes on the R code below:
        // - the "relax this setting" sentence is appended to out.msg itself so that it
        //   actually reaches the warning text;
        // - pre.warn interpolates the entity type and output name with ${...}.
        const expr = `
${r_entity_vars} <- ${input_r_name}$entity.variables[["${entity_type_names[i]}"]]
n.levels <- attr(${r_entity_vars}, "length.original.levels")
levels.exceeded <- !is.null(n.levels)
form.max.levels <- ${form_max_levels}
if (levels.exceeded)
{
warn.msg <- paste0(" has ", n.levels, " categories (number of different ",
"extracted entities identified in the text). For performance reasons",
", the number of extracted entities has been reduced to the top ",
form.max.levels, " most popular entities with ties broken by ",
"alphabetical order if necessary. If you wish to relax this setting ",
"and save more entities of this type, increase the number to save by ",
"changing the value in the \\"Maximum number of unique entity levels ",
"to save\\" control. However, performance might suffer.")
} else
warn.msg <- NULL
mentions.exceeded <- isTRUE(attr(${r_entity_vars}, "mentions.truncated"))
n.mentions <- attr(${r_entity_vars}, "original.max.mentions")
n.levels <- attr(${r_entity_vars}, "mention.adjusted.levels")
if (mentions.exceeded && !is.null(n.mentions))
{
out.msg <- paste0(", the number of saved variables per case has been truncated to the ",
"first ${form_max_mentions} observed entities.")
if (!is.null(n.levels) && n.levels < form.max.levels)
out.msg <- paste0(out.msg, " This truncation has reduced the number of extracted ",
"entities further from ", form.max.levels, " categories to ", n.levels,
" categories.")
out.msg <- paste0(out.msg, " If you wish to relax this setting and save more entities ",
"per case, increase the value in the \\"Maximum number of entities ",
"per case to save\\".")
if (levels.exceeded)
warn.msg <- paste0(warn.msg, " Also, there is at least one case with ", n.mentions,
" identified entities. To keep the variable size manageable", out.msg)
else
warn.msg <- paste0(", has at least one case with ", n.mentions, " identified entities. ",
"For performance reasons and to keep the variable ",
"size manageable", out.msg)
}
if (levels.exceeded || mentions.exceeded)
{
pre.warn <- paste0("The ${entity_type_names[i]} entities from the Entity Extraction ",
"output, ${selected_item.referenceName}")
warning(pre.warn, warn.msg)
}
${r_entity_vars}`;
        const new_q_name = preventDuplicateQuestionName(data_file, entity_type_names[i] + ' Entities from ' +
            selected_item.data.get('text.label'));
        // Temporary name, random to (almost) guarantee uniqueness.
        const temp_var_name = randomVariableName(16);
        let new_r_question;
        try {
            new_r_question = data_file.newRQuestion(expr, new_q_name, temp_var_name, last_variable);
            if (entity_R_type[i] === 'list') {
                new_r_question.questionType = 'Pick Any - Compact';
            }
        } catch (e) {
            if (/Can only convert tabular results to an R/.test(e)) {
                let size_msg;
                if (selected_item.getInput('formMaxLevels') === null) {
                    size_msg = '.';
                } else {
                    size_msg = ' The variable size can be reduced by decreasing the maximum number of entity levels.';
                }
                log(analysis_name + ' could not be created from this item: ' +
                    'The output variable is too large' + size_msg);
            } else {
                log(analysis_name + ' could not be created from this item: ' + e);
            }
            return false;
        }
        // Replace temporary variable names
        nameSequentialVariables(new_r_question.variables, variable_prefix);
        saved_variables.push(new_r_question);
    }
    return saved_variables;
}
// Saves the "first category" variables of an Entity Extraction output, creating one R
// question per entity type (taken from the 'names' attribute of 'entity.variables'), working
// from the last type to the first and passing the same lastVariable position every time.
//
// If 'entity.variables' is a single boolean there is nothing to save: the reason given by
// 'entity.variables.empty.reason' is logged and false is returned.
//
// For entity types whose 'data.type' attribute entry is 'list', only the first column of the
// entity variables is used. Each generated R expression returns a factor whose levels are the
// observed entities ordered by decreasing frequency, and raises an R warning when the
// "length.original.levels" attribute is present.
//
// Returns the array of created questions, or false (after logging) as soon as one question
// cannot be created.
function saveEntityExtractionFirst(item_and_data_settings) {
    const selected_item = item_and_data_settings.selectedItem;
    const data_file = item_and_data_settings.dataFile;
    const last_variable = item_and_data_settings.lastVariable;
    const analysis_name = item_and_data_settings.analysisName;
    const variable_prefix = item_and_data_settings.variablePrefix;
    // The R warning below reports form.max.levels, so it has to be defined in the R code.
    // 'Inf' is the fallback when the caller supplies no value.
    let form_max_levels = item_and_data_settings.formMaxLevels;
    if (form_max_levels === undefined || form_max_levels === null) {
        form_max_levels = 'Inf';
    }
    const entity_variables = selected_item.data.get('entity.variables');
    if (entity_variables.length === 1 && typeof entity_variables[0] === 'boolean') {
        let reason;
        const empty_reason = selected_item.data.get('entity.variables.empty.reason');
        if (empty_reason == 'output') {
            reason = 'there are no extracted entities in the output.';
        } else if (empty_reason == 'min') {
            const entity_counts = selected_item.data.get('entity.counts');
            const entity_names = selected_item.data.getAttribute('entity.counts', 'names');
            const min_cases_to_save = selected_item.data.get('min.cases.to.save');
            const largest_count = entity_counts.reduce(function(x, y) {
                return (x > y) ? x : y;
            });
            const largest_name = entity_names[entity_counts.indexOf(largest_count)];
            reason = 'the entity type with the most number of extractions, \'' + largest_name + '\', contains ' +
                largest_count + ' named entities. However, the minimum number of entities required to save ' +
                'is set at ' + min_cases_to_save + '. Set \'Minimum number of cases to save\' to ' +
                largest_count + ' or lower to save the extracted named entities as variables.';
        } else if (empty_reason == 'remove') {
            reason = 'the only entities in the output have been removed with the user specified remove entities ' +
                'from extraction settings.';
        } else {
            log('Error: unknown reason for no variables to save for this Entity Extraction output');
            return false;
        }
        log('No entity variables have been saved since ' + reason);
        return false;
    }
    const saved_variables = [];
    const entity_type_names = selected_item.data.getAttribute('entity.variables', 'names');
    const entity_R_type = selected_item.data.getAttribute('entity.variables', 'data.type');
    const r_input_name = stringToRName(selected_item.referenceName);
    for (let i = entity_variables.length - 1; i >= 0; i--) {
        const r_entity_vars = stringToRName(entity_type_names[i].replace(/ /g, '.').toLowerCase() + '.variables');
        // Keep only the first column when the entity variables are stored as a list type.
        const subscript_idx = entity_R_type[i] === 'list' ? '[, 1]' : '';
        const expr = `
${r_entity_vars} <- ${r_input_name}$entity.variables[["${entity_type_names[i]}"]]${subscript_idx}
form.max.levels <- ${form_max_levels}
unique.entities <- unique(${r_entity_vars})
n.levels <- length(unique.entities)
length.original.levels <- attr(${r_entity_vars}, "length.original.levels")
if (!is.null(length.original.levels))
{
warning("In the first ${entity_type_names[i]} entities extracted in each case for the ",
"Entity Extraction R output, \\"${selected_item.referenceName}\\", there are ",
n.levels, " different entities overall (number of unique observed entities of this ",
"type extracted from the text). For performance reasons, the number of overall ",
"extracted entities in this entity type has been reduced to the top ", form.max.levels,
" most popular levels with ties broken by alphabetical order if necessary. If you wish ",
"to relax this setting and save more levels, increase the number to save by changing ",
"the value in the \\"Maximum number of entities to save\\" control ",
"in \\"${selected_item.referenceName}\\". However, performance might suffer.")
}
tab <- sort(table(${r_entity_vars}), decreasing = TRUE)
unobserved.levels <- tab == 0
factor(${r_entity_vars}, levels = names(which(!unobserved.levels)))
`;
        const new_q_name = preventDuplicateQuestionName(data_file, entity_type_names[i] + ' Entities from ' +
            selected_item.data.get('text.label'));
        // Temporary name, random to (almost) guarantee uniqueness.
        const temp_var_name = randomVariableName(16);
        let new_r_question;
        try {
            new_r_question = data_file.newRQuestion(expr, new_q_name, temp_var_name, last_variable);
        } catch (e) {
            if (/Can only convert tabular results to an R/.test(e)) {
                // Declared locally: assigning an undeclared name would create a global
                // (or throw in strict mode).
                let size_msg;
                if (selected_item.getInput('formMaxLevels') === null) {
                    size_msg = '.';
                } else {
                    size_msg = ' The variable size can be reduced by decreasing the maximum number of entity levels.';
                }
                log(analysis_name + ' could not be created from this item: ' +
                    'The output variable is too large' + size_msg);
            } else {
                log(analysis_name + ' could not be created from this item: ' + e);
            }
            return false;
        }
        // Replace temporary variable names
        nameSequentialVariables(new_r_question.variables, variable_prefix);
        saved_variables.push(new_r_question);
    }
    return saved_variables;
}
// Entry point for saving variables from a wordBag output.
// A variablePrefix of 'categ' routes to saveWordBagAll; anything else routes to
// saveWordBagFirst. The chosen function's result is returned unchanged.
function saveWordBagVariables(item_and_data_settings) {
    const handler = (item_and_data_settings.variablePrefix === 'categ')
        ? saveWordBagAll
        : saveWordBagFirst;
    return handler(item_and_data_settings);
}
// Saves the "all categories" variables of a wordBag output as one 'Pick Any - Compact'
// R question built with flipTextAnalysis::SaveVariablesCategories.
// Nothing is saved (false is returned after logging) when the flattened
// 'transformed.tokenized' data is empty.
// The generated R expression includes a hash check; if building that check throws, the
// check is left out of the expression.
// Returns a one-element array holding the new question, or false after logging a failure.
function saveWordBagAll(item_and_data_settings) {
    const settings = item_and_data_settings;
    const item = settings.selectedItem;
    const file = settings.dataFile;
    const max_levels = settings.formMaxLevels;
    const max_mentions = settings.formMaxMentions;
    const insert_after = settings.lastVariable;
    const label = settings.analysisName;
    const prefix = settings.variablePrefix;
    const tokens = item.data.get('transformed.tokenized');
    const max_n_gram = item.getInput('formNGramMax');
    const min_freq = item.getInput('formminfreq');

    if (tokens.flat().length === 0) {
        const current_min_freq = item.getInput('formminfreq');
        const suggestion = current_min_freq > 1
            ? ' Terms might be found if the minimum frequency control is reduced from ' + current_min_freq +
              ' to a smaller value.'
            : '';
        log('Variables could not be created from this item, there are no terms in the text after transformation.' +
            suggestion);
        return false;
    }

    const r_object = 'wordBag';
    // If wordBag is new, check the hash, otherwise compute the variables regardless for an older wordBag output.
    let hash_check;
    try {
        hash_check = `
if (${r_object}$hash != "${item.data.get('hash')}")
stop("The Text Analysis output used to create these variables has changed ",
"and as a result these variables are no longer valid. Please delete ",
"these variables and rerun the Save Variable(s) - Categories option ",
"on the Text Analysis output.")
`;
    } catch (e) {
        hash_check = '';
    }

    const freq_hint = 'The output size can be reduced by increasing the Minimum Frequency value from ' +
        min_freq + ' to a larger value.';
    const n_gram_hint = max_n_gram === 1
        ? ''
        : ' Also consider reducing the Maximum n for n-gram identification value from ' +
          max_n_gram + ' to a smaller value.';

    const r_code = `${r_object} <- ${stringToRName(item.referenceName)}\n` +
        hash_check +
        `flipTextAnalysis::SaveVariablesCategories(${stringToRName(item.referenceName)},\n` +
        ` form.max.levels = ${max_levels},\n` +
        ` form.max.mentions = ${max_mentions})`;
    const question_name = preventDuplicateQuestionName(file, label + ' from ' +
        item.data.getAttribute('original.text', 'label'));
    // Temporary name, random to (almost) guarantee uniqueness.
    const placeholder = randomVariableName(16);

    let question;
    try {
        question = file.newRQuestion(r_code, question_name, placeholder, insert_after);
    } catch (e) {
        let message = label + ' could not be created from this item: ';
        if (/Can only convert tabular results to an R/.test(e)) {
            message += 'The output variable is too large. ' + freq_hint + n_gram_hint;
        } else {
            message += e;
        }
        log(message);
        return false;
    }

    // Swap the temporary variable names for sequential, prefixed ones.
    nameSequentialVariables(question.variables, prefix);
    question.questionType = 'Pick Any - Compact';
    return [question];
}
// Saves the "first category" variable of a wordBag output as one R question built with
// flipTextAnalysis::SaveVariablesFirstCategory.
// Nothing is saved (false is returned after logging) when the flattened
// 'transformed.tokenized' data is empty.
// The generated R expression stops with an error if the label attribute of the output's
// original text no longer matches the label read while building it.
// The question type is not set here.
// Returns a one-element array holding the new question, or false after logging a failure.
function saveWordBagFirst(item_and_data_settings) {
    const settings = item_and_data_settings;
    const item = settings.selectedItem;
    const file = settings.dataFile;
    const max_levels = settings.formMaxLevels;
    const insert_after = settings.lastVariable;
    const label = settings.analysisName;
    const prefix = settings.variablePrefix;
    const tokens = item.data.get('transformed.tokenized');
    const max_n_gram = item.getInput('formNGramMax');
    const min_freq = item.getInput('formminfreq');

    if (tokens.flat().length === 0) {
        const current_min_freq = item.getInput('formminfreq');
        const suggestion = current_min_freq > 1
            ? ' Terms might be found if the minimum frequency control is reduced from ' +
              current_min_freq + ' to a smaller value.'
            : '';
        log('Variables could not be created from this item, there are no terms in the text after transformation.' +
            suggestion);
        return false;
    }

    const freq_hint = 'The output size can be reduced by increasing the Minimum Frequency value from ' +
        min_freq + ' to a larger value.';
    const n_gram_hint = max_n_gram === 1
        ? ''
        : ' Also consider reducing the Maximum n for n-gram identification value from ' +
          max_n_gram + ' to a smaller value.';

    const text_label = item.data.getAttribute('original.text', 'label');
    const r_object = 'wordBag';
    const label_check = removeNBSPForR('attr(' + r_object + '$original.text, "label")', text_label);
    const r_source = stringToRName(item.referenceName);
    const r_code = [
        `${r_object} <- ${r_source}`,
        `if (${label_check.lefthandside} != "${label_check.righthandside}")`,
        ' stop("The input text variable used for the Text Analysis has changed ' +
            'and as a result this variable is no longer valid. Please delete this variable and rerun the ' +
            'Save Variable(s) - First Category option on the Text Analysis output.")',
        `flipTextAnalysis::SaveVariablesFirstCategory(${r_source},`,
        ` form.max.levels = ${max_levels})`
    ].join('\n');
    const question_name = preventDuplicateQuestionName(file, label + ' from ' +
        item.data.getAttribute('original.text', 'label'));
    // Temporary name, random to (almost) guarantee uniqueness.
    const placeholder = randomVariableName(16);

    let question;
    try {
        question = file.newRQuestion(r_code, question_name, placeholder, insert_after);
    } catch (e) {
        let message = label + ' could not be created from this item: ';
        if (/Can only convert tabular results to an R/.test(e)) {
            message += 'The output variable is too large. ' + freq_hint + n_gram_hint;
        } else {
            message += e;
        }
        log(message);
        return false;
    }

    // Swap the temporary variable names for sequential, prefixed ones.
    nameSequentialVariables(question.variables, prefix);
    return [question];
}
// R output classes that the "Save Variable(s)" scripts accept as the selected item.
const TEXT_ANALYSIS_CLASSES = ['categorizedlist', 'AutomaticCategorization', 'EntityExtraction',
                               'TextClassifier', 'wordBag'];

// Main entry point for the "Save Variable(s) - ..." scripts.
//
// script_name is the name of the calling script; the text after 'Save Variable(s) - ' selects
// between saving all categories ('Categories') and saving the first category (anything else).
//
// Steps: find the selected Text Analysis output and its data file (logging and returning if
// either is missing), work out the variable to insert after, read the optional
// 'formMaxLevels' / 'formMaxMentions' inputs (defaulting to 'Inf'), dispatch on the output's
// class, and - when not running in Displayr - append one table per saved question and select
// the last table created.
function saveTextAnalysisVariables(script_name) {
    const selected_item = getSelectedROutputFromPage(TEXT_ANALYSIS_CLASSES);
    if (selected_item == null) {
        log('Select a List Categorization, Automatic Categorization, Entity Extraction ' +
            'or Setup Text Analysis output first before using this feature.');
        return;
    }
    const data_file = getDataFileFromItemDependants(selected_item);
    if (data_file == null) {
        log("'Save variables' cannot be applied to an output with no data file.");
        return;
    }
    // Find last variable, which we will place the generated variables after
    const dependants = selected_item.dependants(false).filter(removeErroneousDependant);
    const last_variable = getLastVariable(getVariables(dependants));
    const categories_to_save = script_name.replace('Save Variable(s) - ', '');
    const all_categories = categories_to_save === 'Categories';
    const analysis_name = all_categories ? 'Categories' : 'First Categories';
    const variable_prefix = all_categories ? 'categ' : 'first.categ';
    const output_classes = selected_item.outputClasses;
    // Check if the Token controls exists and pass it in if it does.
    // Both defaults are declared separately: a chained `let a = b = 'Inf'` would assign to
    // an undeclared `b`, creating a global (or throwing in strict mode).
    let form_max_levels = 'Inf';
    let form_max_mentions = 'Inf';
    const max_levels_input = selected_item.getInput('formMaxLevels');
    if (max_levels_input) {
        form_max_levels = max_levels_input;
    }
    const max_mentions_input = selected_item.getInput('formMaxMentions');
    if (max_mentions_input) {
        form_max_mentions = max_mentions_input;
    }
    const item_and_data_settings = {selectedItem: selected_item, formMaxLevels: form_max_levels,
                                    formMaxMentions: form_max_mentions, dataFile: data_file,
                                    lastVariable: last_variable, categories: categories_to_save,
                                    analysisName: analysis_name, variablePrefix: variable_prefix};
    let saved_variables;
    if (output_classes.indexOf('categorizedlist') > -1) {
        saved_variables = saveListCategorizationVariables(item_and_data_settings);
    } else if (output_classes.indexOf('AutomaticCategorization') > -1) {
        saved_variables = saveAutomaticCategorizationVariables(item_and_data_settings);
    } else if (output_classes.indexOf('EntityExtraction') > -1) {
        saved_variables = saveEntityExtractionVariables(item_and_data_settings);
    } else if (output_classes.indexOf('TextClassifier') > -1) {
        saved_variables = saveTextClassifierVariables(item_and_data_settings);
    } else { // Must be a wordBag
        saved_variables = saveWordBagVariables(item_and_data_settings);
    }
    // In Q, create a table showing the new question
    if (!inDisplayr() && typeof saved_variables !== 'boolean') {
        let last_table;
        saved_variables.forEach(q => {
            last_table = selected_item.group.appendTable();
            last_table.primary = q;
        });
        // Nothing to select when no question was saved.
        if (last_table !== undefined) {
            project.report.setSelectedRaw([last_table]);
        }
    }
}
// Builds the two sides of an R string comparison that ignores non-breaking spaces.
//
// lhs_text - R code (as a string) that evaluates to the text to check.
// rhs_text - The expected text: an array of strings (joined without a separator) or a
//            single string.
//
// Returns an object with:
// - lefthandside: R code that replaces non-breaking spaces in lhs_text with ordinary spaces
//   and collapses the result into one string.
// - righthandside: the expected text with non-breaking spaces replaced by ordinary spaces and
//   with backslashes and double quotes escaped, ready to be placed between double quotes in
//   R code.
function removeNBSPForR(lhs_text, rhs_text) {
    const expected = typeof rhs_text === 'string' ? rhs_text : rhs_text.join('');
    const escaped = expected
        .replace(/\u00a0/g, ' ')
        // Backslashes must be escaped before quotes, otherwise the backslashes added for the
        // quotes would themselves be doubled.
        .replace(/\\/g, '\\\\')
        .replace(/"/g, '\\"');
    return {lefthandside: 'paste0(gsub("\\u00a0", " ", ' + lhs_text + ', fixed = TRUE), collapse = "")',
            righthandside: escaped};
}