diff --git a/facebook/qsf-tools/combine_changelogs_eu.R b/facebook/qsf-tools/combine_changelogs_eu.R index 1af5aeea5..f999e439d 100644 --- a/facebook/qsf-tools/combine_changelogs_eu.R +++ b/facebook/qsf-tools/combine_changelogs_eu.R @@ -22,7 +22,7 @@ combine_changelogs <- function(path_to_changelog_eu, old_version = col_double() )) %>% mutate( - eu_version = "EU" + eu_noneu = "EU" ) changelog_noneu <- read_csv(path_to_changelog_noneu, col_types = cols( @@ -31,21 +31,21 @@ combine_changelogs <- function(path_to_changelog_eu, old_version = col_double() )) %>% mutate( - eu_version = "Non-EU" + eu_noneu = "Non-EU" ) # Using rbind here to raise an error if columns differ between the existing # changelog and the new wave data. changelog_with_duplicates <- rbind(changelog_eu, changelog_noneu) - count_duplicated_rows <- changelog_with_duplicates %>% group_by(across(c(-eu_version))) %>% summarize(count = n()) + count_duplicated_rows <- changelog_with_duplicates %>% group_by(across(c(-eu_noneu))) %>% summarize(count = n()) changelog <- changelog_with_duplicates %>% left_join(count_duplicated_rows) - changelog$eu_version[changelog$count == 2] <- "Both" + changelog$eu_noneu[changelog$count == 2] <- "Both" # Sort so that items with missing type (non-Qualtrics fields) are at the top. # Drop duplicates. changelog <- changelog %>% - arrange(variable_name, eu_version) %>% + arrange(variable_name, eu_noneu) %>% select(-count) %>% distinct() diff --git a/facebook/qsf-tools/combine_codebooks_eu.R b/facebook/qsf-tools/combine_codebooks_eu.R index 020da8b23..6109dd8a5 100644 --- a/facebook/qsf-tools/combine_codebooks_eu.R +++ b/facebook/qsf-tools/combine_codebooks_eu.R @@ -21,7 +21,7 @@ combine_codebooks <- function(path_to_codebook_eu, version = col_double() )) %>% mutate( - eu_version = "EU" + eu_noneu = "EU" ) codebook_noneu <- read_csv(path_to_codebook_noneu, col_types = cols( @@ -29,22 +29,22 @@ combine_codebooks <- function(path_to_codebook_eu, version = col_double() )) %>% mutate( - eu_version = "Non-EU" + eu_noneu = "Non-EU" ) # Using rbind here to raise an error if columns differ between the existing # codebook and the new wave data. codebook_with_duplicates <- rbind(codebook_eu, codebook_noneu) - count_duplicated_rows <- codebook_with_duplicates %>% group_by(across(c(-eu_version))) %>% summarize(count = n()) + count_duplicated_rows <- codebook_with_duplicates %>% group_by(across(c(-eu_noneu))) %>% summarize(count = n()) codebook <- codebook_with_duplicates %>% left_join(count_duplicated_rows) - codebook$eu_version[codebook$count == 2] <- "Both" + codebook$eu_noneu[codebook$count == 2] <- "Both" # Sort so that items with missing type (non-Qualtrics fields) are at the top. # Drop duplicates. codebook <- codebook %>% - arrange(!is.na(.data$question_type), variable, version, eu_version) %>% - select(-count) %>% + arrange(!is.na(.data$question_type), variable, version, eu_noneu) %>% + select(-count, -replaces) %>% distinct() return(codebook) diff --git a/facebook/qsf-tools/generate-changelog.R b/facebook/qsf-tools/generate-changelog.R index f874a4e66..3733fee01 100644 --- a/facebook/qsf-tools/generate-changelog.R +++ b/facebook/qsf-tools/generate-changelog.R @@ -7,434 +7,426 @@ ## ## Usage: ## -## Rscript generate-changelog.R UMD|CMU path/to/codebook path/to/diff/or/diff/directory path/to/output/changelog [path/to/old/changelog] +## Rscript generate-changelog-with-codebook.R UMD|CMU path/to/codebook path/to/output/changelog [path/to/old/changelog] suppressPackageStartupMessages({ library(tidyverse) - library(purrr) - library(stringr) - source("qsf-utils.R") }) +# "old" = new +WAVE_COMPARE_MAP <- list( + "UMD" = c( + "1" = 2, + "2" = 3, + "3" = 4, + "4" = 5, + "5" = 6, + "6" = 7, + "7" = 8, + "8" = 9, + "9" = 10, + "10" = 11, + "11" = 12, + "12" = 13 + ), + "CMU" = c( + "1" = 2, + "2" = 3, + "3" = 4, + "4" = 5, + "5" = 6, + "6" = 7, + "7" = 8, + "8" = 10, + "10" = 11, + "11" = 12, + "12" = 13 + ) +) + +DIFF_COLS <- c( + "question", + "matrix_subquestion", + "response_options", + "display_logic", + "response_option_randomization", + "respondent_group" +) + +CHANGE_TYPE_MAP <- c( + added = "Item added", + removed = "Item removed", + question = "Question wording changed", + display_logic = "Display logic changed", + response_options = "Answer choices changed", + matrix_subquestion = "Matrix subquestion text changed", + response_option_randomization = "Answer choice order changed", + respondent_group = "Respondent group changed" +) + generate_changelog <- function(path_to_codebook, - path_to_diff, path_to_changelog, path_to_old_changelog, - survey_version, - rename_map_file="item_rename_map.csv") { + survey_version) { # Get the codebook. Contains details about each question (text, answer # choices, display logic) by wave. - codebook <- get_codebook(path_to_codebook) + codebook_raw <- read_csv(path_to_codebook, col_types = cols( + .default = col_character(), + version = col_double() + )) - # Get the diffs + rationale. Contains info about which items changed between - # waves, plus a description of what changed and why. - qsf_diff <- get_diff(path_to_diff) + local_compare_map <- WAVE_COMPARE_MAP[[survey_version]] + # Add new-old wave mapping columns. Drop unused rows. + codebook <- codebook_raw %>% + mutate( + old_version = version, + new_version = local_compare_map[as.character(version)] + ) - if (!("notes" %in% names(qsf_diff)) && is.null(path_to_old_changelog)) { - warning("rationales must be provided either in the diff or via an old version of the changelog") - qsf_diff$notes <- NA_character_ - } + codebook <- full_join( + # with old columns + codebook %>% + rename_with(function(cols) { + map_chr(cols, ~ rename_col(.x, "old")) + }) %>% + select(-replaces) %>% + mutate(x_exists = TRUE), + # with new columns + codebook %>% + rename_with(function(cols) { + map_chr(cols, ~ rename_col(.x, "new")) + }) %>% + select(-replaces, -old_version, -new_version) %>% + mutate(y_exists = TRUE), + by = c("new_version" = "version", "variable" = "variable") + ) %>% + mutate( + description = coalesce(description.x, description.y), + question_type = coalesce(question_type.x, question_type.y), + ) %>% + select(-version, -description.x, -description.y, -question_type.x, -question_type.y) %>% + rename(variable_name = variable) - qsf_diff <- expand_out_matrix_subquestions(qsf_diff) + # Drop obs where both old and new info is missing -- these are metavariables + # that we report in the microdata, like "weight" and "StartDate" + codebook <- codebook %>% + filter( + !(is.na(old_question) & + is.na(old_display_logic) & + is.na(old_response_option_randomization) & + is.na(old_respondent_group) & + is.na(new_question) & + is.na(new_display_logic) & + is.na(new_response_option_randomization) & + is.na(new_respondent_group)) + ) - # Rename items as necessary - path_to_rename_map <- localize_static_filepath(rename_map_file, survey_version) - qsf_diff <- qsf_diff %>% - rowwise() %>% - mutate( - # Don't rename items that have been removed. Renaming is based on `new_wave`, - # but removed items are not actually present in `new_wave`, just the `old_wave`. - variable_name = patch_item_names( - variable_name, path_to_rename_map, new_wave, change_type != "Item removed" - ) + # Fill in version where missing + codebook$new_version <- coalesce(codebook$new_version, map_dbl(codebook$old_version, ~ local_compare_map[as.character(.x)])) + codebook$old_version <- coalesce(codebook$old_version, map_dbl(codebook$new_version, ~ get_old_version(.x, local_compare_map) %>% as.double())) + + # Drop obs where version is not in names or values of the wave mapping (i.e. 12.5) + codebook <- codebook %>% + filter( + new_version %in% c(local_compare_map, names(local_compare_map)), + old_version %in% c(local_compare_map, names(local_compare_map)) ) - result <- prepare_matrix_base_questions_for_join(qsf_diff, codebook) - qsf_diff <- result$diff - vars_not_in_codebook <- result$vars_not_in_codebook - - changelog <- make_changelog_from_codebook_and_diff(qsf_diff, codebook, vars_not_in_codebook) - changelog <- add_rationales_from_old_changelog(changelog, path_to_old_changelog) - check_missing_rationales(changelog) + ## Find differences. + result <- list() - write_excel_csv(changelog %>% rename(new_version=new_wave, old_version=old_wave), path_to_changelog, quote="needed") -} - -# Read codebook from path. Drop fields we don't use in the changelog. -get_codebook <- function(path_to_codebook) { - codebook <- read_csv(path_to_codebook, col_types = cols( - .default = col_character(), - version = col_double() - )) %>% - rename(question_text = question, matrix_subquestion_text = matrix_subquestion) %>% - select( - -replaces, -description, -question_type, - -response_option_randomization, -respondent_group + # Any item where x (old fields) does not exist but y does has been "added" + added_items <- codebook %>% + filter( + is.na(x_exists) & y_exists ) + codebook <- anti_join(codebook, added_items) - return(codebook) -} - -# Try to load `path_to_diff`. Check if it is a single CSV or a directory -# containing a set of CSVs. -get_diff <- function(path_to_diff) { - if (dir.exists(path_to_diff)) { - # Load all CSVs from a directory - csvs <- list.files(path_to_diff, pattern = "*.csv$", full.names = TRUE) - qsf_diff <- list() - for (csv in csvs) { - curr_diff <- read_csv(csv, col_types = cols( - .default = col_character(), - new_wave = col_double(), - old_wave = col_double() - )) - - old_wave_vars <- curr_diff %>% - filter(change_type != "Item added") %>% - distinct(item) %>% - pull() - new_wave_vars <- curr_diff %>% - filter(change_type != "Item removed") %>% - distinct(item) %>% - pull() - if (all(c("C0_matrix", "C0_likert") %in% old_wave_vars) || all(c("C0_matrix", "C0_likert") %in% new_wave_vars)) { - stop("Only one of 'C0_matrix' and 'C0_likert' can be present at once in file ", csv) - } - if (all(c("B13_profile", "B13_likert") %in% old_wave_vars) || all(c("B13_profile", "B13_likert") %in% new_wave_vars)) { - stop("Only one of 'B13_profile' and 'B13_likert' can be present at once in file ", csv) - } - if (all(c("B14_profile", "B14_likert") %in% old_wave_vars) || all(c("B14_profile", "B14_likert") %in% new_wave_vars)) { - stop("Only one of 'B14_profile' and 'B14_likert' can be present at once in file ", csv) - } - if (all(c("B12a_profile", "B12a_likert") %in% old_wave_vars) || all(c("B12a_profile", "B12a_likert") %in% new_wave_vars)) { - stop("Only one of 'B12a_profile' and 'B12a_likert' can be present at once in file ", csv) - } - if (all(c("B12b_profile", "B12b_likert") %in% old_wave_vars) || all(c("B12b_profile", "B12b_likert") %in% new_wave_vars)) { - stop("Only one of 'B12b_profile' and 'B12b_likert' can be present at once in file ", csv) - } - if (all(c("B1b_matrix", "B1b_likert") %in% old_wave_vars) || all(c("B1b_matrix", "B1b_likert") %in% new_wave_vars)) { - stop("Only one of 'B1b_matrix' and 'B1b_likert' can be present at once in file ", csv) - } - - qsf_diff[[csv]] <- curr_diff + # Process added items + added_items <- added_items %>% + mutate( + change_type = CHANGE_TYPE_MAP["added"] + ) %>% + select(-x_exists, -y_exists) + + combos <- added_items %>% + filter(question_type == "Matrix" | !is.na(new_matrix_base_name) | !is.na(new_matrix_subquestion)) %>% + distinct(old_version, new_matrix_base_name) + + for (i in seq_len(nrow(combos))) { + wave = combos[i,] %>% pull(old_version) + base_name = combos[i,] %>% pull(new_matrix_base_name) + tmp <- added_items %>% + filter( + old_version == wave, new_matrix_base_name == base_name + ) + added_items <- anti_join(added_items, tmp) + if (nrow(filter(codebook_raw, version == wave, matrix_base_name == base_name)) == 0) { + # Dedup subqs so only report base question once + tmp <- tmp %>% + group_by(old_matrix_base_name, new_matrix_base_name, new_version, old_version) %>% + mutate( + variable_name = new_matrix_base_name, + old_matrix_subquestion = NA, + new_matrix_subquestion = "Differ by subquestion", + old_response_options = case_when( + length(unique(old_response_options)) == 1 ~ old_response_options, + TRUE ~ "Differ by subquestion" + ), + new_response_options = case_when( + length(unique(new_response_options)) == 1 ~ new_response_options, + TRUE ~ "Differ by subquestion" + ) + ) %>% + slice_head() %>% + ungroup() + } else { + tmp <- mutate(tmp, change_type = "Matrix subquestion added to existing item") } - qsf_diff <- purrr::reduce(qsf_diff, rbind) %>% - rename(variable_name = item) %>% - select(-contains("qid")) - } else if (file.exists(path_to_diff)) { - # Load a single file - qsf_diff <- read_csv(path_to_diff, col_types = cols( - .default = col_character(), - new_wave = col_double(), - old_wave = col_double() - )) %>% - rename(variable_name = item) %>% - select(-contains("qid")) - } else { - stop(path_to_diff, " is not a valid file or directory") + added_items <- rbind(added_items, tmp) } - - return(qsf_diff) -} -# Turn any item listed as an `impacted_subquestion` into its own observation. -# Other fields are set to be the same as the base question (e.g. the base -# question is E1 for matrix subquestion E1_1). -expand_out_matrix_subquestions <- function(qsf_diff) { - # The diff only lists base name for matrix questions that changed. For - # example, `variable_name` is "Z1" if any matrix subquestion ("Z1_1", "Z1_2", - # etc) changed. The modified subquestions are listed in column - # `impacted_subquestions`. - # - # Since the codebook lists matrix subquestions separately, we need to split up - # the `impacted_subquestions` such that each subquestion is its own - # observation. This will allow us to join the codebook onto the diff. - nonmatrix_changes <- qsf_diff %>% - filter(is.na(impacted_subquestions)) %>% - select(-impacted_subquestions) - # Separately process any obs with non-missing `impacted_subquestions.` - matrix_changes <- qsf_diff %>% - filter(!is.na(impacted_subquestions)) %>% - # If multiple matrix subquestions changed, list each separately. - rowwise() %>% - mutate(new = list( - tibble( - new_wave = new_wave, - old_wave = old_wave, - change_type = change_type, - variable_name = str_split(impacted_subquestions, ", ") %>% unlist(), - notes = notes - ) - )) %>% - select(new) %>% - unnest(new) + result[["added"]] <- added_items - # Combine matrix and non-matrix subquestions. Use rbind to warn if our columns - # differ. - qsf_diff <- rbind(nonmatrix_changes, matrix_changes) %>% - arrange(new_wave, old_wave) - - return(qsf_diff) -} - -# Matrix base questions (e.g. the base question is E1 for matrix subquestion -# E1_1) exist in diffs but not in the codebook. To be able to join them between -# the two dfs, create a variable name mapping specifically for use in the join -# operation. -# -# A matrix base question is mapped to the first associated subquestion instance -# for a particular wave. The first subquestion is used for convenience and -# reproducibility; subquestion-specific fields are set to `NA`. -prepare_matrix_base_questions_for_join <- function(qsf_diff, codebook) { - # If variable_name from the qsf_diff is not also listed as a variable in - # the codebook, try adding an underscore to the end and looking for a variable - # in the codebook that starts with that string. The matrix_subquestion_text - # field of the match should be populated, although we want to ignore it and - # fill with NA instead. - vars_not_in_codebook <- setdiff( - qsf_diff %>% distinct(variable_name) %>% pull(), - codebook %>% distinct(variable) %>% pull() - ) - - # Add an underscore to the unmatched variable names to create a regex pattern - matrix_prefixes <- paste0(vars_not_in_codebook, "_") - names(matrix_prefixes) <- vars_not_in_codebook - - # A subset of UMD variables need manual mapping. - if ("C0_matrix" %in% names(matrix_prefixes)) { - matrix_prefixes["C0_matrix"] <- "C0_" - } - if ("C0_likert" %in% names(matrix_prefixes)) { - matrix_prefixes["C0_likert"] <- "C0_" - } + # Any item where x (old fields) exists but y does not has been "removed" + removed_items <- codebook %>% + filter( + x_exists & is.na(y_exists) + ) + codebook <- anti_join(codebook, removed_items) %>% + select(-x_exists, -y_exists) - if ("B13_profile" %in% names(matrix_prefixes)) { - matrix_prefixes["B13_profile"] <- "B13_" - } - if ("B13_likert" %in% names(matrix_prefixes)) { - matrix_prefixes["B13_likert"] <- "B13_" - } + # Process removed items. + removed_items <- removed_items %>% + mutate( + change_type = CHANGE_TYPE_MAP["removed"] + ) %>% + select(-x_exists, -y_exists) - if ("B14_profile" %in% names(matrix_prefixes)) { - matrix_prefixes["B14_profile"] <- "B14_" - } - if ("B14_likert" %in% names(matrix_prefixes)) { - matrix_prefixes["B14_likert"] <- "B14_" - } + combos <- removed_items %>% + filter(question_type == "Matrix" | !is.na(old_matrix_base_name) | !is.na(old_matrix_subquestion)) %>% + distinct(new_version, old_matrix_base_name) - if ("B12a_profile" %in% names(matrix_prefixes)) { - matrix_prefixes["B12a_profile"] <- "B12a_" - } - if ("B12a_likert" %in% names(matrix_prefixes)) { - matrix_prefixes["B12a_likert"] <- "B12a_" + for (i in seq_len(nrow(combos))) { + wave = combos[i,] %>% pull(new_version) + base_name = combos[i,] %>% pull(old_matrix_base_name) + tmp <- removed_items %>% + filter( + new_version == wave, old_matrix_base_name == base_name + ) + removed_items <- anti_join(removed_items, tmp) + if (nrow(filter(codebook_raw, version == wave, matrix_base_name == base_name)) == 0) { + # Dedup subqs so only report base question once + tmp <- tmp %>% + group_by(old_matrix_base_name, new_matrix_base_name, new_version, old_version) %>% + mutate( + variable_name = old_matrix_base_name, + old_matrix_subquestion = "Differ by subquestion", + new_matrix_subquestion = NA, + old_response_options = case_when( + length(unique(old_response_options)) == 1 ~ old_response_options, + TRUE ~ "Differ by subquestion" + ), + new_response_options = case_when( + length(unique(new_response_options)) == 1 ~ new_response_options, + TRUE ~ "Differ by subquestion" + ) + ) %>% + slice_head() %>% + ungroup() + } else { + tmp <- mutate(tmp, change_type = "Matrix subquestion removed from existing item") + } + removed_items <- rbind(removed_items, tmp) } - if ("B12b_profile" %in% names(matrix_prefixes)) { - matrix_prefixes["B12b_profile"] <- "B12b_" - } - if ("B12b_likert" %in% names(matrix_prefixes)) { - matrix_prefixes["B12b_likert"] <- "B12b_" - } + result[["removed"]] <- removed_items - if ("B1b_matrix" %in% names(matrix_prefixes)) { - matrix_prefixes["B1b_matrix"] <- "B1b_" - } - if ("B1b_likert" %in% names(matrix_prefixes)) { - matrix_prefixes["B1b_likert"] <- "B1b_" + + # Do all other comparisons + for (col in DIFF_COLS) { + new_col <- paste("new", col, sep="_") + old_col <- paste("old", col, sep="_") + items_not_identical <- find_col_differences(codebook, new_col, old_col) + + changed <- codebook %>% + filter(items_not_identical) %>% + mutate(change_type = CHANGE_TYPE_MAP[col]) + if (col == "question") { + # Drop obs if the change is due to trivial formatting, e.g. nbsp + changed <- changed %>% + mutate( + new_question_wo_formatting = str_replace_all(new_question, " ", " "), + old_question_wo_formatting = str_replace_all(old_question, " ", " ") + ) %>% + filter(new_question_wo_formatting != old_question_wo_formatting) %>% + select(-new_question_wo_formatting, -old_question_wo_formatting) + } + result[[col]] <- changed } - # First matrix item match by wave and matrix base question. - map_matrix_prefix_to_first_match <- codebook %>% - mutate( - join_variable = case_when( - # Create the basename for matrix items. - !is.na(matrix_subquestion_text) ~ strsplit(variable, "_") %>% - # Get all but last underscore-delimited chunk - purrr::map(~ .x[1:(length(.x) - 1)]) %>% - # Combine all but the last chunk with underscores. - purrr::map(~ paste0(.x, collapse="_") %>% paste0("_")) %>% - unlist(), - TRUE ~ variable - ) - ) %>% - filter(join_variable %in% matrix_prefixes) %>% - group_by(version, join_variable) %>% - slice_head() %>% - select(version, variable, join_variable) + changelog <- bind_rows(result) - # Add the regex patterns onto the diff. - qsf_diff <- qsf_diff %>% - mutate( - join_variable = case_when( - variable_name %in% vars_not_in_codebook ~ matrix_prefixes[variable_name], - TRUE ~ variable_name - ) - ) %>% - left_join( - map_matrix_prefix_to_first_match %>% rename_with(function(column_names) { - paste("new", column_names, sep = "_") - }), - by=c("new_wave" = "new_version", "join_variable"="new_join_variable") - ) %>% - left_join( - map_matrix_prefix_to_first_match %>% rename_with(function(column_names) { - paste("old", column_names, sep = "_") - }), - by=c("old_wave" = "old_version", "join_variable"="old_join_variable") - ) %>% - rename( - join_variable_new_wave = new_variable, - join_variable_old_wave = old_variable - ) %>% - mutate( - join_variable_new_wave = coalesce(join_variable_new_wave, variable_name), - join_variable_old_wave = coalesce(join_variable_old_wave, variable_name) - ) %>% - select(-join_variable) - - return(list("diff" = qsf_diff, "vars_not_in_codebook" = vars_not_in_codebook)) -} - -# Join codebook onto diff and modify columns to make the changelog. -make_changelog_from_codebook_and_diff <- function(qsf_diff, codebook, vars_not_in_codebook) { - # Create changelog by joining codebook onto annotated diff. - changelog <- qsf_diff %>% - # Add info about new version of question - left_join( - codebook %>% rename_with(function(column_names) { - paste("new", column_names, sep = "_") - }), - by=c("new_wave" = "new_version", "join_variable_new_wave" = "new_variable") + ## Don't report all matrix subquestions when the change is shared between all + ## of them, just report the base item. + # Group by matrix_base_name, change_type, and wave, as long as the change_type is relevant and matrix_base_name is not NA. + # Keep only one obs for each group. + # Set var name in kept obs to matrix_base_name for generality and to be able to join rationales on. + combos <- changelog %>% + filter((question_type == "Matrix" | !is.na(old_matrix_base_name) | !is.na(old_matrix_subquestion)) & + change_type %in% c( + "Question wording changed", + "Display logic changed", + "Answer choices changed", ## TODO: needs special logic, because Matrix subquestions can actually have different answer choices. Not needed for UMD + "Answer choice order changed", ## TODO: needs special logic, because Matrix subquestions can actually have different answer choices. Not needed for UMD + "Respondent group changed" + ) ) %>% - # Add info about previous version of question - left_join( - codebook %>% rename_with(function(column_names) { - paste("old", column_names, sep = "_") - }), - by=c("old_wave" = "old_version", "join_variable_old_wave" = "old_variable") - ) %>% - select( - new_wave, - old_wave, - variable_name, - change_type, - new_question_text, - new_matrix_subquestion_text, - new_response_options, - new_display_logic, - old_question_text, - old_matrix_subquestion_text, - old_response_options, - old_display_logic, - notes - ) %>% - # If item is a matrix question where something other than the matrix - # subquestions changed between waves, drop matrix_subquestion_text fields, - # which are relevant for only a single subquestion. - mutate( - new_matrix_subquestion_text = case_when( - variable_name %in% vars_not_in_codebook ~ NA_character_, - TRUE ~ new_matrix_subquestion_text - ), - old_matrix_subquestion_text = case_when( - variable_name %in% vars_not_in_codebook ~ NA_character_, - TRUE ~ old_matrix_subquestion_text - ) - ) %>% - # When an item was added, all `old_` fields should be empty; when an item - # was removed, all `new_` fields should be empty. - mutate( - old_question_text = case_when( - change_type == "Item added" ~ NA_character_, - TRUE ~ old_question_text - ), - old_matrix_subquestion_text = case_when( - change_type == "Item added" ~ NA_character_, - TRUE ~ old_matrix_subquestion_text - ), - old_response_options = case_when( - change_type == "Item added" ~ NA_character_, - TRUE ~ old_response_options - ), - old_display_logic = case_when( - change_type == "Item added" ~ NA_character_, - TRUE ~ old_display_logic - ), - new_question_text = case_when( - change_type == "Item removed" ~ NA_character_, - TRUE ~ new_question_text - ), - new_matrix_subquestion_text = case_when( - change_type == "Item removed" ~ NA_character_, - TRUE ~ new_matrix_subquestion_text - ), - new_response_options = case_when( - change_type == "Item removed" ~ NA_character_, - TRUE ~ new_response_options - ), - new_display_logic = case_when( - change_type == "Item removed" ~ NA_character_, - TRUE ~ new_display_logic + distinct(new_version, old_version, new_matrix_base_name, old_matrix_base_name, change_type) + + SPECIAL_HANDLING <- list( + "Answer choices changed" = list("new_response_options", "old_response_options"), + "Answer choices order changed" = list("new_response_option_randomization", "old_response_option_randomization") + ) + for (i in seq_len(nrow(combos))) { + new_v <- combos[i,] %>% pull(new_version) + old_v <- combos[i,] %>% pull(old_version) + new_base <- combos[i,] %>% pull(new_matrix_base_name) + old_base <- combos[i,] %>% pull(old_matrix_base_name) + change <- combos[i,] %>% pull(change_type) + + tmp <- changelog %>% + filter( + new_version == new_v, + old_version == old_v, + new_matrix_base_name == new_base, + old_matrix_base_name == old_base, + change_type == change ) - ) + changelog <- anti_join(changelog, tmp) + + combine_flag <- FALSE + if (change %in% names(SPECIAL_HANDLING)) { + # See if the changed column is the same for all obs. Check if all matrix + # subquestions are listed. + new_col <- SPECIAL_HANDLING[[change]][[1]] + old_col <- SPECIAL_HANDLING[[change]][[2]] + if ( + length(unique(tmp[[new_col]])) == 1 && + length(unique(tmp[[old_col]])) == 1 && + ( + nrow(tmp) == codebook_raw %>% filter(version == old_v, matrix_base_name == old_base) %>% nrow() || + nrow(tmp) == codebook_raw %>% filter(version == new_v, matrix_base_name == new_base) %>% nrow() + ) + ) { + combine_flag <- TRUE + } + } else { + combine_flag <- TRUE + } + + if (combine_flag) { + tmp <- tmp %>% + slice_head() %>% + mutate( + variable_name = case_when( + old_matrix_base_name != new_matrix_base_name ~ paste(old_matrix_base_name, new_matrix_base_name, sep="/"), + TRUE ~ old_matrix_base_name + ), + old_matrix_subquestion = NA, + new_matrix_subquestion = NA + ) + } + + changelog <- rbind(changelog, tmp) + } - return(changelog) -} - -# Add old rationales, if available, to new changelog -add_rationales_from_old_changelog <- function(changelog, path_to_old_changelog) { - # If path_to_old_changelog is provided, prefer it over existing notes column. - if (!is.null(path_to_old_changelog)) { + ## Join old rationales on. + # TODO: The first time this happens using this new script, need to manually map + # some rationales for "Matrix subquestions changed", since previously this tag + # would include added and removed subquestions. + if (is.null(path_to_old_changelog)) { + warning("rationales will be empty; an old version of the changelog was not provided") + changelog$notes <- NA_character_ + } else { old_changelog <- read_csv(path_to_old_changelog, col_types = cols( .default = col_character(), - new_wave = col_double(), - old_wave = col_double() + new_version = col_double(), + old_version = col_double() )) %>% - select(new_wave, old_wave, variable_name, change_type, notes) + select(new_version, old_version, variable_name, change_type, notes) changelog <- changelog %>% - select(-notes) %>% - left_join(old_changelog, by=c("new_wave", "old_wave", "variable_name", "change_type")) + left_join(old_changelog, by=c("new_version", "old_version", "variable_name", "change_type")) } - return(changelog) + write_excel_csv( + changelog %>% + rename( + new_question_text = new_question, + old_question_text = old_question, + new_matrix_subquestion_text = new_matrix_subquestion, + old_matrix_subquestion_text = old_matrix_subquestion + ) %>% + select( + new_version, + old_version, + variable_name, + description, + change_type, + new_matrix_base_name, + new_question_text, + new_matrix_subquestion_text, + new_response_options, + new_display_logic, + new_response_option_randomization, + new_respondent_group, + old_matrix_base_name, + old_question_text, + old_matrix_subquestion_text, + old_response_options, + old_display_logic, + old_response_option_randomization, + old_respondent_group, + notes + ) %>% + arrange(new_version, old_version), + path_to_changelog, quote="needed" + ) } -check_missing_rationales <- function(changelog) { - if (any(is.na(changelog$notes))) { - vars_missing_rationales <- changelog %>% - filter(is.na(notes) | notes == "") %>% - pull(variable_name) - waves <- changelog %>% - filter(is.na(notes) | notes == "") %>% - pull(new_wave) - change_types <- changelog %>% - filter(is.na(notes) | notes == "") %>% - pull(change_type) - warning( - "variables ", paste0(vars_missing_rationales, " (new_wave ", waves, ", ", change_types, ")", collapse = ", "), - " are missing rationales" - ) +rename_col <- function(col, prefix) { + if (col %in% c(DIFF_COLS, "matrix_base_name")) { + paste(prefix, col, sep = "_") + } else { + col } - - return(NULL) } +find_col_differences <- function(codebook, new_col, old_col) { + codebook[[old_col]] != codebook[[new_col]] +} + +get_old_version <- function(new_version, compare_map) { + ifelse(new_version %in% compare_map, compare_map[compare_map == new_version] %>% names(), NA_character_) +} + + args <- commandArgs(TRUE) -if (!(length(args) %in% c(4, 5))) { - stop("Usage: Rscript generate-changelog.R UMD|CMU path/to/codebook path/to/diff/directory path/to/output/changelog [path/to/old/changelog]") +if (!(length(args) %in% c(3, 4))) { + stop("Usage: Rscript generate-changelog-with-codebook.R UMD|CMU path/to/codebook path/to/output/changelog [path/to/old/changelog]") } survey_version <- args[1] path_to_codebook <- args[2] -path_to_diff <- args[3] -path_to_changelog <- args[4] +path_to_changelog <- args[3] -if (length(args) == 5) { - path_to_old_changelog <- args[5] +if (length(args) == 4) { + path_to_old_changelog <- args[4] } else { path_to_old_changelog <- NULL } -invisible(generate_changelog(path_to_codebook, path_to_diff, path_to_changelog, path_to_old_changelog, survey_version)) +invisible(generate_changelog(path_to_codebook, path_to_changelog, path_to_old_changelog, survey_version)) + diff --git a/facebook/qsf-tools/generate-codebook.R b/facebook/qsf-tools/generate-codebook.R index 213aa200a..26f81948e 100644 --- a/facebook/qsf-tools/generate-codebook.R +++ b/facebook/qsf-tools/generate-codebook.R @@ -212,6 +212,7 @@ process_qsf <- function(path_to_qsf, " 'DisplayLogic' would be overwritten.") } display_logic[inpage_ii] <- inpage_logic[inpage_ii] + display_logic_raw <- display_logic display_logic <- display_logic %>% map(~ .x$`0`) %>% @@ -233,7 +234,7 @@ process_qsf <- function(path_to_qsf, curr_map <- recode_map[qids == qid][[1]] if ( !is.null(curr_map) ) { - option_code <- curr_map[names(curr_map) == option_code] + option_code <- ifelse(option_code %in% names(curr_map), curr_map[[which(names(curr_map) == option_code)]], option_code) } paste(c(qid, selectable_text, option_code), collapse="") @@ -244,14 +245,12 @@ process_qsf <- function(path_to_qsf, map(~ paste(.x, collapse=" ")) # Handle questions that use a fixed condition ("If False", "If True") - ii_boolean_displaylogic <- (displayed_questions %>% - map(~ .x$Payload$DisplayLogic) %>% + ii_boolean_displaylogic <- (display_logic_raw %>% map(~ .x$`0`) %>% map(~ map(.x, "LogicType") %>% unlist()) == "BooleanValue") %>% which() - display_logic[ii_boolean_displaylogic] <- displayed_questions[ii_boolean_displaylogic] %>% - map(~ .x$Payload$DisplayLogic) %>% + display_logic[ii_boolean_displaylogic] <- display_logic_raw[ii_boolean_displaylogic] %>% map(~ .x$`0`) %>% map(~ paste( map(.x, "Value") @@ -260,8 +259,7 @@ process_qsf <- function(path_to_qsf, # Collapse logic into a single string. map(~ paste(.x, collapse="")) - logic_type <- displayed_questions %>% - map(~ .x$Payload$DisplayLogic) %>% + logic_type <- display_logic_raw %>% map(~ .x$`0`$Type) display_logic <- paste(logic_type, display_logic) %>% @@ -269,6 +267,11 @@ process_qsf <- function(path_to_qsf, map(~ gsub(" $", "", .x)) %>% unlist() + # Hard-code display logic for UMD V15a. + if (survey_version == "UMD" && wave == 12) { + display_logic[which(item_names == "V15a")] <- "If V1/SelectableChoice/1 Is NotSelected" + } + # format all qsf content lists into a single tibble qdf <- tibble(variable = item_names, question = questions, @@ -341,9 +344,10 @@ process_qsf <- function(path_to_qsf, # separate matrix subquestions into separate fields (to match exported data) nonmatrix_items <- qdf %>% filter(question_type != "Matrix") %>% + mutate(matrix_base_name = NA_character_) %>% select(-matrix_subquestion_field_names) - has_resp_by_subq <- qdf %>% + has_response_by_subq <- qdf %>% filter(question_type == "Matrix") %>% pull(response_options) %>% map_lgl(~ all(map_lgl(.x, ~ inherits(.x, "list"))) && @@ -351,10 +355,11 @@ process_qsf <- function(path_to_qsf, matrix_items <- qdf %>% filter(question_type == "Matrix") %>% - filter(!has_resp_by_subq) %>% + filter(!has_response_by_subq) %>% rowwise() %>% mutate(new = list( - tibble(variable = unlist(matrix_subquestion_field_names), + tibble(matrix_base_name = variable, + variable = unlist(matrix_subquestion_field_names), question = question, matrix_subquestion = unlist(matrix_subquestions), question_type = question_type, @@ -367,13 +372,15 @@ process_qsf <- function(path_to_qsf, )) %>% select(new) %>% unnest(new) + matrix_items_resp_by_subq <- qdf %>% filter(question_type == "Matrix") %>% - filter(has_resp_by_subq) %>% + filter(has_response_by_subq) %>% rowwise() %>% mutate(new = list( - tibble(variable = unlist(matrix_subquestion_field_names), + tibble(matrix_base_name = variable, + variable = unlist(matrix_subquestion_field_names), question = question, matrix_subquestion = unlist(matrix_subquestions), question_type = question_type, @@ -387,7 +394,8 @@ process_qsf <- function(path_to_qsf, select(new) %>% unnest(new) - matrix_items <- rbind(matrix_items, matrix_items_resp_by_subq) + matrix_items <- rbind(matrix_items, matrix_items_resp_by_subq) %>% + select(variable, matrix_base_name, everything()) # Custom matrix formatting if (survey_version == "CMU") { @@ -418,6 +426,7 @@ process_qsf <- function(path_to_qsf, ) %>% select(wave, variable, + matrix_base_name, replaces, description, question, @@ -446,7 +455,11 @@ process_qsf <- function(path_to_qsf, qdf <- rbind(qdf, other_text_items) qdf$response_options[qdf$question_type == "Text"] <- NA - + # Drop occasional start and end square brackets from matrix response options. + qdf <- qdf %>% + mutate( + response_options = map_chr(response_options, ~ remove_brackets(.x)) + ) # Quality checks stopifnot(length(qdf$variable) == length(unique(qdf$variable))) @@ -461,6 +474,14 @@ process_qsf <- function(path_to_qsf, return(qdf %>% rename(version = wave)) } +remove_brackets <- function(response_set) { + if ( !is.na(response_set) && startsWith(response_set, "[") && endsWith(response_set, "]") ) { + str_sub(response_set, 2, -2) + } else { + response_set + } +} + #' Append the parsed and formatted info from the QSF to the existing codebook #' #' @param qdf dataframe containing parsed QSF data @@ -505,7 +526,7 @@ add_qdf_to_codebook <- function(qdf, qdf <- qdf %>% select(-replaces) } } - + # Using rbind here to raise an error if columns differ between the existing # codebook and the new wave data. # Sort so that items with missing question_type (non-Qualtrics fields) are at the top.