diff --git a/facebook/qsf-tools/generate-codebook.R b/facebook/qsf-tools/generate-codebook.R index f4e9cb3c3..72786e8d4 100644 --- a/facebook/qsf-tools/generate-codebook.R +++ b/facebook/qsf-tools/generate-codebook.R @@ -4,7 +4,7 @@ ## ## Usage: ## -## Rscript generate-codebook.R path/to/qsf path/to/codebook +## Rscript generate-codebook.R [UMD/CMU] path/to/qsf path/to/codebook suppressPackageStartupMessages({ library(tidyverse) @@ -17,81 +17,32 @@ suppressPackageStartupMessages({ process_qsf <- function(path_to_qsf, + survey_version=c("CMU", "UMD"), path_to_shortname_map="./static/item_shortquestion_map.csv", path_to_replacement_map="./static/item_replacement_map.csv") { + survey_version <- match.arg(survey_version) q <- read_json(path_to_qsf) wave <- get_wave(path_to_qsf) - # get the survey elements with flow logic (should be one per block randomization branch) - ii_flow <- q$SurveyElements %>% - map_chr("Element") %>% - {. == "FL"} %>% - which() - ii_block_randomizer <- q$SurveyElements[ii_flow] %>% - map(~ .x$Payload$Flow) %>% - map(~ map(.x,~ .x$Type == "BlockRandomizer")) %>% - unlist() %>% - which() - random_block_ids <- q$SurveyElements[ii_flow] %>% - map(~ .x$Payload$Flow) %>% - map(~ .x[ii_block_randomizer]) %>% - map(~ map(.x,~ .x$Flow)) %>% - map(~ map(.x,~ map(.x,~ .x$ID))) %>% - unlist() - - block_id_item_map <- get_block_item_map(q) - block_id_item_map <- block_id_item_map %>% filter(BlockID %in% random_block_ids) %>% - select(-BlockID) - - # get the survey elements that are questions: - ii_questions <- q$SurveyElements %>% - map_chr("Element") %>% - {. == "SQ"} %>% - which() - - # get the questions that were shown to respondents - shown_items <- get_shown_items(q) - ii_shown <- q$SurveyElements[ii_questions] %>% - map_chr(~ .x$Payload$QuestionID) %>% - {. %in% shown_items} %>% - which() + displayed_questions <- subset_qsf_to_displayed(q) - # subset qsf to valid elements - displayed_questions <- q$SurveyElements[ii_questions][ii_shown] - - # Qualtrics auto-assigned question IDs + # get Qualtrics auto-assigned question IDs qids <- displayed_questions %>% map_chr(~ .x$Payload$QuestionID) - # the items are how we will match these to the survey data: - items <- displayed_questions %>% - map_chr(~ .x$Payload$DataExportTag) - - # B13 was originally named incorrectly. Rename manually as needed - items[items == "B13 "] <- "B13" - # V2a in Wave 13 was originally named incorrectly. Rename manually as needed - if (wave == 13) { - items[items == "V2a"] <- "V2d" - } + # get item names + item_names <- displayed_questions %>% + map_chr(~ .x$Payload$DataExportTag) %>% + patch_item_names(survey_version, wave) - # get the text of the question: + # get question text: questions <- displayed_questions %>% map_chr(~ .x$Payload$QuestionText) - # get the type of question: - type_map <- c(MC = "Multiple choice", TE = "Text", Matrix = "Matrix") - qtype <- displayed_questions %>% - map_chr(~ .x$Payload$QuestionType) %>% - {type_map[.]} - - ii_multiselect <- displayed_questions %>% - map_chr(~ .x$Payload$Selector) %>% - {. == "MAVR"} %>% - which() - qtype[ii_multiselect] <- "Multiselect" - qtype[items == "A5"] <- "Matrix" # this will be treated like C10 - - # get the choices (for MC and Matrix): + # get question types + qtype <- get_question_formats(displayed_questions, item_names, survey_version) + + # get choices for multiple choice (MC) and Matrix items: choices <- displayed_questions %>% map(~ .x$Payload$Choices) %>% map(~ map(.x, "Display")) @@ -122,8 +73,8 @@ process_qsf <- function(path_to_qsf, map(which) %>% map(names) %>% unlist() - other_text_items <- paste(items[ii_other_text_option], text_elem, "TEXT", sep="_") %>% - setNames(items[ii_other_text_option]) + other_text_items <- paste(item_names[ii_other_text_option], text_elem, "TEXT", sep="_") %>% + setNames(item_names[ii_other_text_option]) # some questions port the choices from other questions ii_carryforward <- displayed_questions %>% @@ -189,7 +140,7 @@ process_qsf <- function(path_to_qsf, paste(c(qid, selectable_text, option_code), collapse="") }, .x)) %>% # Replace QID with question number (A2, etc) - map(~ gsubfn("(QID[0-9]+)", function(qid) {items[qids == qid]}, .x)) %>% + map(~ gsubfn("(QID[0-9]+)", function(qid) {item_names[qids == qid]}, .x)) %>% # Collapse logic into a single string. map(~ paste(.x, collapse=" ")) @@ -220,7 +171,7 @@ process_qsf <- function(path_to_qsf, unlist() # format all qsf content lists into a single tibble - qdf <- tibble(variable = items, + qdf <- tibble(variable = item_names, question = questions, type = qtype, choices = choices, @@ -229,8 +180,9 @@ process_qsf <- function(path_to_qsf, response_option_randomization = response_option_randomization) # Add on module randomization + block_id_item_map <- map_qids_to_module(q) block_id_item_map <- block_id_item_map %>% - left_join(data.frame(qid=qids, item=items), by=c("Questions"="qid")) + left_join(data.frame(qid=qids, item=item_names), by=c("Questions"="qid")) qdf <- qdf %>% left_join(block_id_item_map, by=c(variable="item")) %>% rename(group_of_respondents_item_was_shown_to = BlockName) @@ -289,16 +241,21 @@ process_qsf <- function(path_to_qsf, select(new) %>% unnest(new) - # A5 and C10 are special cases b/c of they are matrix of text entry questions: - # also C10 needs an extra _1. - matrix_items <- matrix_items %>% - mutate(variable = if_else(str_starts(variable, "C10"), paste0(variable, "_1"), variable), - type = if_else(str_starts(variable, "A5|C10"), "Text", type), - choices = if_else(str_starts(variable, "A5|C10"), list(list()), choices)) + # Custom matrix formatting + if (survey_version == "CMU") { + # A5 and C10 are special cases b/c they are matrices of text entry questions: + # also C10 needs an extra _1. + matrix_items <- matrix_items %>% + mutate(variable = if_else(str_starts(variable, "C10"), paste0(variable, "_1"), variable), + type = if_else(str_starts(variable, "A5|C10"), "Text", type), + choices = if_else(str_starts(variable, "A5|C10"), list(list()), choices)) + } else if (survey_version == "UMD") { + # pass + } qdf <- bind_rows(nonmatrix_items, matrix_items) - # indicate which items have replaced old items. + # indicate which new items have replaced old items. replaces_map <- read_csv(path_to_replacement_map, col_types = cols(new_item = col_character(), old_item = col_character() @@ -465,11 +422,12 @@ add_qsf_to_codebook <- function(path_to_qsf, path_to_codebook) { args <- commandArgs(TRUE) -if (length(args) != 2) { - stop("Usage: Rscript generate-codebook.R path/to/qsf path/to/codebook") +if (length(args) != 3) { + stop("Usage: Rscript generate-codebook.R [UMD/CMU] path/to/qsf path/to/codebook") } -path_to_qsf <- args[1] -path_to_codebook <- args[2] +survey_version <- args[1] +path_to_qsf <- args[2] +path_to_codebook <- args[3] invisible(add_qsf_to_codebook(path_to_qsf, path_to_codebook)) diff --git a/facebook/qsf-tools/list-matrix-items.R b/facebook/qsf-tools/list-matrix-items.R new file mode 100644 index 000000000..fbbafd113 --- /dev/null +++ b/facebook/qsf-tools/list-matrix-items.R @@ -0,0 +1,49 @@ +#!/usr/bin/env Rscript + +## Print a list of survey questions that we handle as matrices. +## +## Usage: +## +## Rscript list-matrix-items.R [UMD/CMU] path/to/qsf + +suppressPackageStartupMessages({ + library(jsonlite) + library(tidyverse) + source("qsf-utils.R") +}) + + +print_matrix_items <- function(path_to_qsf, survey_version=c("CMU", "UMD")) { + survey_version <- match.arg(survey_version) + q <- read_json(path_to_qsf) + wave <- get_wave(path_to_qsf) + + displayed_questions <- subset_qsf_to_displayed(q) + + # Get survey item names + item_names <- displayed_questions %>% + map_chr(~ .x$Payload$DataExportTag) %>% + patch_item_names(survey_version, wave) + + # Get survey item formats + qtype <- get_question_formats(displayed_questions, item_names, survey_version) + + qdf <- tibble(variable = item_names, + type = qtype) + + matrix_items <- qdf %>% filter(type == "Matrix") %>% pull(variable) + message("Wave ", wave, " has ", length(matrix_items), " matrix items: ", paste(matrix_items, collapse=", ")) + + return(NULL) +} + +args <- commandArgs(TRUE) + +if (length(args) != 2) { + stop("Usage: Rscript list-matrix-items.R [UMD/CMU] path/to/qsf") +} + +survey_version <- args[1] +path_to_qsf <- args[2] + +invisible(print_matrix_items(path_to_qsf, survey_version)) diff --git a/facebook/qsf-tools/qsf-utils.R b/facebook/qsf-tools/qsf-utils.R index 8e68ef81c..f7cdcca43 100644 --- a/facebook/qsf-tools/qsf-utils.R +++ b/facebook/qsf-tools/qsf-utils.R @@ -7,10 +7,10 @@ #' #' @return list of Qualtrics Question IDs (QIDs) of items shown to respondents get_shown_items <- function(qsf) { - block_out <- Filter(function(elem) { elem[["Element"]] == "BL" }, qsf$SurveyElements)[[1]]$Payload + all_blocks <- Filter(function(elem) { elem[["Element"]] == "BL" }, qsf$SurveyElements)[[1]]$Payload shown_items <- list() - for (block in block_out) { + for (block in all_blocks) { if (block$Type == "Trash") { next } @@ -27,10 +27,10 @@ get_shown_items <- function(qsf) { get_block_item_map <- function(qsf) { - block_out <- Filter(function(elem) { elem[["Element"]] == "BL" }, qsf$SurveyElements)[[1]]$Payload + all_blocks <- Filter(function(elem) { elem[["Element"]] == "BL" }, qsf$SurveyElements)[[1]]$Payload items <- list() - for (block in block_out) { + for (block in all_blocks) { if (block$Type == "Trash") { next } @@ -56,7 +56,7 @@ get_block_item_map <- function(qsf) { get_wave <- function(path_to_qsf) { qsf_name_pattern <- "(.*Wave_)([0-9]*([.][0-9])?)([.]qsf)$" if (!grepl(qsf_name_pattern, path_to_qsf)) { - stop("qsf filename should be of the format 'Survey_of_COVID-Like_Illness_-_Wave_XX.qsf'") + stop("qsf filename should be of the format 'Wave_XX.qsf' where 'XX' is an integer or float") } wave <- as.numeric( @@ -65,3 +65,109 @@ get_wave <- function(path_to_qsf) { return(wave) } + +#' Create mapping of QIDs to module name +#' +#' @param qsf contents of QSF file in JSON format +#' +#' @return dataframe with `BlockName` (module name) and `Questions` (QIDs) columns +map_qids_to_module <- function(qsf) { + # get the survey elements with flow logic (should be one per block randomization branch) + ii_flow <- qsf$SurveyElements %>% + map_chr("Element") %>% + {. == "FL"} %>% + which() + ii_block_randomizer <- qsf$SurveyElements[ii_flow] %>% + map(~ .x$Payload$Flow) %>% + map(~ map(.x,~ .x$Type == "BlockRandomizer")) %>% + unlist() %>% + which() + random_block_ids <- qsf$SurveyElements[ii_flow] %>% + map(~ .x$Payload$Flow) %>% + map(~ .x[ii_block_randomizer]) %>% + map(~ map(.x,~ .x$Flow)) %>% + map(~ map(.x,~ map(.x,~ .x$ID))) %>% + unlist() + + block_id_item_map <- get_block_item_map(qsf) + block_id_item_map <- block_id_item_map %>% filter(BlockID %in% random_block_ids) %>% + select(-BlockID) + + return(block_id_item_map) +} + +#' Get only questions that were shown to respondents, using definition in `get_shown_items` +#' +#' @param qsf contents of QSF file in JSON format +#' +#' @return QSF subsetted to only displayed questions +subset_qsf_to_displayed <- function(qsf) { + # get the survey elements that are questions: + ii_questions <- qsf$SurveyElements %>% + map_chr("Element") %>% + {. == "SQ"} %>% + which() + + # get the questions that were shown to respondents + shown_items <- get_shown_items(qsf) + ii_shown <- qsf$SurveyElements[ii_questions] %>% + map_chr(~ .x$Payload$QuestionID) %>% + {. %in% shown_items} %>% + which() + + # subset qsf to valid elements + displayed_questions <- qsf$SurveyElements[ii_questions][ii_shown] + + return(displayed_questions) +} + +#' Replace erroneous question names +#' +#' @param item_names character vector of survey question names +#' @param survey_version either "UMD" or "CMU" +#' @param wave integer or float survey version +#' +#' @return character vector of repaired survey question names +patch_item_names <- function(item_names, survey_version, wave) { + if (survey_version == "CMU") { + # B13 was originally named incorrectly. + item_names[item_names == "B13 "] <- "B13" + # V2a in Wave 13 was originally named incorrectly. + if (wave == 13) { + item_names[item_names == "V2a"] <- "V2d" + } + } else if (survey_version == "UMD") { + # pass + } + + return(item_names) +} + +#' Fetch and customize question format types. +#' +#' @param qsf contents of QSF file in JSON format +#' @param item_names character vector of survey question names +#' @param survey_version either "UMD" or "CMU" +#' +#' @return character vector of repaired survey question names +get_question_formats <- function(qsf, item_names, survey_version){ + type_map <- c(MC = "Multiple choice", TE = "Text", Matrix = "Matrix") + + qtype <- qsf %>% + map_chr(~ .x$Payload$QuestionType) %>% + {type_map[.]} + + ii_multiselect <- qsf %>% + map_chr(~ .x$Payload$Selector) %>% + {. == "MAVR"} %>% + which() + qtype[ii_multiselect] <- "Multiselect" + + if (survey_version == "CMU") { + qtype[item_names == "A5"] <- "Matrix" # this will be treated like C10 + } else if (survey_version == "UMD") { + # pass + } + + return(qtype) +}