From 8528a470c88d7ce40730cc1a30cd85aad790ec67 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 31 May 2022 19:31:14 -0400 Subject: [PATCH 01/13] report base question for matrix subqs --- facebook/qsf-tools/generate-codebook.R | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/facebook/qsf-tools/generate-codebook.R b/facebook/qsf-tools/generate-codebook.R index 213aa200a..4182f093c 100644 --- a/facebook/qsf-tools/generate-codebook.R +++ b/facebook/qsf-tools/generate-codebook.R @@ -341,6 +341,7 @@ process_qsf <- function(path_to_qsf, # separate matrix subquestions into separate fields (to match exported data) nonmatrix_items <- qdf %>% filter(question_type != "Matrix") %>% + mutate(variable_base_name = NA_character_) %>% select(-matrix_subquestion_field_names) has_resp_by_subq <- qdf %>% @@ -354,7 +355,8 @@ process_qsf <- function(path_to_qsf, filter(!has_resp_by_subq) %>% rowwise() %>% mutate(new = list( - tibble(variable = unlist(matrix_subquestion_field_names), + tibble(variable_base_name = variable, + variable = unlist(matrix_subquestion_field_names), question = question, matrix_subquestion = unlist(matrix_subquestions), question_type = question_type, @@ -367,13 +369,15 @@ process_qsf <- function(path_to_qsf, )) %>% select(new) %>% unnest(new) + matrix_items_resp_by_subq <- qdf %>% filter(question_type == "Matrix") %>% filter(has_resp_by_subq) %>% rowwise() %>% mutate(new = list( - tibble(variable = unlist(matrix_subquestion_field_names), + tibble(variable_base_name = variable, + variable = unlist(matrix_subquestion_field_names), question = question, matrix_subquestion = unlist(matrix_subquestions), question_type = question_type, @@ -387,7 +391,8 @@ process_qsf <- function(path_to_qsf, select(new) %>% unnest(new) - matrix_items <- rbind(matrix_items, matrix_items_resp_by_subq) + matrix_items <- rbind(matrix_items, matrix_items_resp_by_subq) %>% + select(variable, variable_base_name, everything()) # Custom matrix formatting if (survey_version == "CMU") { @@ -418,6 +423,7 @@ process_qsf <- function(path_to_qsf, ) %>% select(wave, variable, + variable_base_name, replaces, description, question, From 1c9bc0e2f9c7ec67d94398979405db1b9f28f3a1 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 31 May 2022 19:31:43 -0400 Subject: [PATCH 02/13] add new changelog approach --- .../generate-changelog-with-codebook.R | 301 ++++++++++++++++++ 1 file changed, 301 insertions(+) create mode 100644 facebook/qsf-tools/generate-changelog-with-codebook.R diff --git a/facebook/qsf-tools/generate-changelog-with-codebook.R b/facebook/qsf-tools/generate-changelog-with-codebook.R new file mode 100644 index 000000000..b69ed541e --- /dev/null +++ b/facebook/qsf-tools/generate-changelog-with-codebook.R @@ -0,0 +1,301 @@ +#!/usr/bin/env Rscript + +## Combine the codebook and one or more diffs into a single file showing and +## rationalizing changes between waves. The diffs can be annotated, containing a +## `notes` column with rationales for the changes, or the rationales from a +## previous changelog version can be used. +## +## Usage: +## +## Rscript generate-changelog-with-codebook.R UMD|CMU path/to/codebook path/to/output/changelog [path/to/old/changelog] + +suppressPackageStartupMessages({ + library(tidyverse) +}) + +# "old" = new +WAVE_COMPARE_MAP <- list( + "UMD" = c( + "1" = 2, + "2" = 3, + "3" = 4, + "4" = 5, + "5" = 6, + "6" = 7, + "7" = 8, + "8" = 9, + "9" = 10, + "10" = 11, + "11" = 12, + "12" = 13 + ), + "CMU" = c( + "1" = 2, + "2" = 3, + "3" = 4, + "4" = 5, + "5" = 6, + "6" = 7, + "7" = 8, + "8" = 10, + "10" = 11, + "11" = 12, + "12" = 13 + ) +) + +DIFF_COLS <- c( + "question", + "matrix_subquestion", + "response_options", + "display_logic", + "response_option_randomization", + "respondent_group" +) + +CHANGE_TYPE_MAP <- c( + added = "Item added", + removed = "Item removed", + question = "Question wording changed", + display_logic = "Display logic changed", + response_options = "Answer choices changed", + matrix_subquestion = "Matrix subquestions changed", + response_option_randomization = "Answer choice order changed", + respondent_group = "Respondent group changed" +) + + +generate_changelog <- function(path_to_codebook, + path_to_changelog, + path_to_old_changelog, + survey_version) { + # Get the codebook. Contains details about each question (text, answer + # choices, display logic) by wave. + codebook <- codebook <- read_csv(path_to_codebook, col_types = cols( + .default = col_character(), + version = col_double() + )) + + local_compare_map <- WAVE_COMPARE_MAP[[survey_version]] + # Add new-old wave mapping columns. Drop unused rows. + codebook <- codebook %>% + mutate( + old_version = version, + new_version = local_compare_map[as.character(version)] + ) %>% + # If new_version is missing, the survey wavey of that obs doesn't have an + # new wave to compare against. + filter( + !is.na(new_version) + ) + + codebook <- full_join( + # with old columns + codebook %>% + rename_with(function(cols) { + map_chr(cols, ~ rename_col(.x, "old")) + }) %>% + select(-replaces), + # with new columns + codebook %>% + rename_with(function(cols) { + map_chr(cols, ~ rename_col(.x, "new")) + }) %>% + select(-description, -question_type, -replaces, -old_version, -new_version), + by = c("new_version" = "version", "variable" = "variable") + ) %>% + select(-version) %>% + rename(variable_name = variable) + + ## Find differences. + result <- list() + + # Drop obs where both old and new info is missing -- these are metavariables + # that we report in the microdata, like "weight" and "StartDate" + codebook <- codebook %>% + filter( + !(is.na(old_question) & + is.na(old_display_logic) & + is.na(old_response_option_randomization) & + is.na(old_respondent_group) & + is.na(new_question) & + is.na(new_display_logic) & + is.na(new_response_option_randomization) & + is.na(new_respondent_group)) + ) + + # Any item with missing "old_*" fields has been added. + result[["added"]] <- codebook %>% + filter( + is.na(old_question), + is.na(old_display_logic), + is.na(old_response_option_randomization), + is.na(old_respondent_group), + !is.na(new_question), + !is.na(new_display_logic), + !is.na(new_response_option_randomization), + !is.na(new_respondent_group) + ) %>% + mutate( + change_type = CHANGE_TYPE_MAP["added"], + old_version = map_chr( + new_version, + ~ ifelse( + length(names(local_compare_map[local_compare_map == .x])) == 0, + NA_character_, + names(local_compare_map[local_compare_map == .x]) + ) + ) %>% as.integer() + ) %>% + filter(!is.na(old_version)) + codebook <- codebook %>% + filter( + !(is.na(old_question) & + is.na(old_display_logic) & + is.na(old_response_option_randomization) & + is.na(old_respondent_group) & + !is.na(new_question) & + !is.na(new_display_logic) & + !is.na(new_response_option_randomization) & + !is.na(new_respondent_group)) + ) + + # Any item with missing "new_*" fields has been removed. + result[["removed"]] <- codebook %>% + filter( + !is.na(old_question), + !is.na(old_display_logic), + !is.na(old_response_option_randomization), + !is.na(old_respondent_group), + is.na(new_question), + is.na(new_display_logic), + is.na(new_response_option_randomization), + is.na(new_respondent_group) + ) %>% + mutate( + change_type = CHANGE_TYPE_MAP["removed"] + ) + codebook <- codebook %>% + filter( + !(!is.na(old_question) & + !is.na(old_display_logic) & + !is.na(old_response_option_randomization) & + !is.na(old_respondent_group) & + is.na(new_question) & + is.na(new_display_logic) & + is.na(new_response_option_randomization) & + is.na(new_respondent_group)) + ) + + # Do all other comparisons + for (col in DIFF_COLS) { + new_col <- paste("new", col, sep="_") + old_col <- paste("old", col, sep="_") + items_not_identical <- find_col_differences(codebook, new_col, old_col) + + changed <- codebook %>% + filter(items_not_identical) %>% + mutate(change_type = CHANGE_TYPE_MAP[col]) + if (col == "question") { + # Drop obs if the change is due to trivial formatting, e.g. nbsp + changed <- changed %>% + mutate( + new_question_wo_formatting = str_replace_all(new_question, " ", " "), + old_question_wo_formatting = str_replace_all(old_question, " ", " ") + ) %>% + filter(new_question_wo_formatting != old_question_wo_formatting) %>% + select(-new_question_wo_formatting, -old_question_wo_formatting) + } + result[[col]] <- changed + } + + changelog <- bind_rows(result) + + ## Don't report all matrix subquestions when the change is shared between all + ## of them, just report the base item. + + # Group by variable_base_name and change_type, as long as change is not "Matrix subquestion changed" and variable_base_name is not NA. + # Keep only one obs for each group. + # Set var name in kept obs to variable_base_name for generality and to be able to join rationales on. + + ## Join old rationales on. + # TODO: The first time this happens using this new script, need to manually map + # some rationales for "Matrix subquestions changed", since previously this tag + # would include added and removed subquestions. + if (is.null(path_to_old_changelog)) { + warning("rationales will be empty unless an old version of the changelog is provided") + changelog$notes <- NA_character_ + } else { + old_changelog <- read_csv(path_to_old_changelog, col_types = cols( + .default = col_character(), + new_version = col_double(), + old_version = col_double() + )) %>% + select(new_version, old_version, variable_name, change_type, notes) + changelog <- changelog %>% + left_join(old_changelog, by=c("new_version", "old_version", "variable_name", "change_type")) + } + + write_excel_csv( + changelog %>% + rename( + new_question_text = new_question, + old_question_text = old_question, + new_matrix_subquestion_text = new_matrix_subquestion, + old_matrix_subquestion_text = old_matrix_subquestion + ) %>% + select( + new_version, + old_version, + variable_name, + description, + change_type, + new_question_text, + new_matrix_subquestion_text, + new_response_options, + new_display_logic, + new_response_option_randomization, + new_respondent_group, + old_question_text, + old_matrix_subquestion_text, + old_response_options, + old_display_logic, + old_response_option_randomization, + old_respondent_group, + notes + ) %>% + arrange(new_version, old_version), + path_to_changelog, quote="needed" + ) +} + +rename_col <- function(col, prefix) { + if (col %in% DIFF_COLS) { + paste(prefix, col, sep = "_") + } else { + col + } +} + +find_col_differences <- function(codebook, new_col, old_col) { + codebook[[old_col]] != codebook[[new_col]] +} + +args <- commandArgs(TRUE) + +if (!(length(args) %in% c(3, 4))) { + stop("Usage: Rscript generate-changelog-with-codebook.R UMD|CMU path/to/codebook path/to/output/changelog [path/to/old/changelog]") +} + +survey_version <- args[1] +path_to_codebook <- args[2] +path_to_changelog <- args[3] + +if (length(args) == 4) { + path_to_old_changelog <- args[4] +} else { + path_to_old_changelog <- NULL +} + +invisible(generate_changelog(path_to_codebook, path_to_changelog, path_to_old_changelog, survey_version)) + From 86468fbf91fa53356344386b49d5f5183f869c08 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 1 Jun 2022 18:43:34 -0400 Subject: [PATCH 03/13] fixes to new changelog code --- .../generate-changelog-with-codebook.R | 209 ++++++++++++++---- facebook/qsf-tools/generate-codebook.R | 16 +- 2 files changed, 173 insertions(+), 52 deletions(-) diff --git a/facebook/qsf-tools/generate-changelog-with-codebook.R b/facebook/qsf-tools/generate-changelog-with-codebook.R index b69ed541e..2afebdfbc 100644 --- a/facebook/qsf-tools/generate-changelog-with-codebook.R +++ b/facebook/qsf-tools/generate-changelog-with-codebook.R @@ -71,22 +71,17 @@ generate_changelog <- function(path_to_codebook, survey_version) { # Get the codebook. Contains details about each question (text, answer # choices, display logic) by wave. - codebook <- codebook <- read_csv(path_to_codebook, col_types = cols( + codebook_raw <- read_csv(path_to_codebook, col_types = cols( .default = col_character(), version = col_double() )) - + browser() local_compare_map <- WAVE_COMPARE_MAP[[survey_version]] # Add new-old wave mapping columns. Drop unused rows. - codebook <- codebook %>% + codebook <- codebook_raw %>% mutate( old_version = version, new_version = local_compare_map[as.character(version)] - ) %>% - # If new_version is missing, the survey wavey of that obs doesn't have an - # new wave to compare against. - filter( - !is.na(new_version) ) codebook <- full_join( @@ -95,21 +90,24 @@ generate_changelog <- function(path_to_codebook, rename_with(function(cols) { map_chr(cols, ~ rename_col(.x, "old")) }) %>% - select(-replaces), + select(-replaces) %>% + mutate(x_exists = TRUE), # with new columns codebook %>% rename_with(function(cols) { map_chr(cols, ~ rename_col(.x, "new")) }) %>% - select(-description, -question_type, -replaces, -old_version, -new_version), + select(-replaces, -old_version, -new_version) %>% + mutate(y_exists = TRUE), by = c("new_version" = "version", "variable" = "variable") ) %>% - select(-version) %>% + mutate( + description = coalesce(description.x, description.y), + question_type = coalesce(question_type.x, question_type.y), + ) %>% + select(-version, -description.x, -description.y, -question_type.x, -question_type.y) %>% rename(variable_name = variable) - ## Find differences. - result <- list() - # Drop obs where both old and new info is missing -- these are metavariables # that we report in the microdata, like "weight" and "StartDate" codebook <- codebook %>% @@ -124,18 +122,29 @@ generate_changelog <- function(path_to_codebook, is.na(new_respondent_group)) ) - # Any item with missing "old_*" fields has been added. - result[["added"]] <- codebook %>% + # Fill in version where missing + codebook$new_version <- coalesce(codebook$new_version, map_dbl(codebook$old_version, ~ local_compare_map[as.character(.x)])) + codebook$old_version <- coalesce(codebook$old_version, map_dbl(codebook$new_version, ~ get_old_version(.x, local_compare_map) %>% as.double())) + + # Drop obs where version is not in names or values of the wave mapping (i.e. 12.5) + codebook <- codebook %>% filter( - is.na(old_question), - is.na(old_display_logic), - is.na(old_response_option_randomization), - is.na(old_respondent_group), - !is.na(new_question), - !is.na(new_display_logic), - !is.na(new_response_option_randomization), - !is.na(new_respondent_group) - ) %>% + new_version %in% c(local_compare_map, names(local_compare_map)), + old_version %in% c(local_compare_map, names(local_compare_map)) + ) + + ## Find differences. + result <- list() + browser() + # Any item where x (old fields) does not exist but y does has been "added" + added_items <- codebook %>% + filter( + is.na(x_exists) & y_exists + ) + codebook <- anti_join(codebook, added_items) + + # Process added items + added_items <- added_items %>% mutate( change_type = CHANGE_TYPE_MAP["added"], old_version = map_chr( @@ -147,21 +156,52 @@ generate_changelog <- function(path_to_codebook, ) ) %>% as.integer() ) %>% - filter(!is.na(old_version)) - codebook <- codebook %>% - filter( - !(is.na(old_question) & - is.na(old_display_logic) & - is.na(old_response_option_randomization) & - is.na(old_respondent_group) & - !is.na(new_question) & - !is.na(new_display_logic) & - !is.na(new_response_option_randomization) & - !is.na(new_respondent_group)) - ) - + select(-x_exists, -y_exists) + + + combos <- added_items %>% + filter(question_type == "Matrix") %>% + distinct(old_version, new_matrix_base_name) + + for (i in seq_len(nrow(combos))) { + browser() + wave = combos[i,] %>% pull(old_version) + base_name = combos[i,] %>% pull(new_matrix_base_name) + tmp <- added_items %>% + filter( + old_version == wave, new_matrix_base_name == base_name + ) + added_items <- anti_join(added_items, tmp) + if (nrow(filter(codebook_raw, version == wave, matrix_base_name == base_name)) == 0) { + # Dedup subqs so only report base question once + tmp <- tmp %>% + group_by(old_matrix_base_name, new_matrix_base_name, new_version, old_version) %>% + mutate( + variable_name = new_matrix_base_name, + old_matrix_subquestion = NA, + new_matrix_subquestion = "Differ by subquestion", + old_response_options = case_when( + length(unique(old_response_options)) == 1 ~ old_response_options, + TRUE ~ "Differ by subquestion" + ), + new_response_options = case_when( + length(unique(new_response_options)) == 1 ~ new_response_options, + TRUE ~ "Differ by subquestion" + ) + ) %>% + slice_head() %>% + ungroup() + } else { + tmp <- mutate(tmp, change_type = "Matrix subquestion added to existing item") + } + added_items <- rbind(added_items, tmp) + } + browser() + result[["added"]] <- added_items + + # Any item with missing "new_*" fields has been removed. - result[["removed"]] <- codebook %>% + removed_items <- codebook %>% filter( !is.na(old_question), !is.na(old_display_logic), @@ -187,6 +227,48 @@ generate_changelog <- function(path_to_codebook, is.na(new_respondent_group)) ) + combos <- removed_items %>% + filter(question_type == "Matrix") %>% + distinct(new_version, old_matrix_base_name) + + for (i in seq_len(nrow(combos))) { + wave = combos[i,] %>% pull(new_version) + base_name = combos[i,] %>% pull(old_matrix_base_name) + tmp <- removed_items %>% + filter( + new_version == wave, old_matrix_base_name == base_name + ) + removed_items <- anti_join(removed_items, tmp) + if (nrow(filter(codebook_raw, version == wave, matrix_base_name == base_name)) == 0) { + # Dedup subqs so only report base question once + tmp <- tmp %>% + group_by(old_matrix_base_name, new_matrix_base_name, new_version, old_version) %>% + mutate( + variable_name = old_matrix_base_name, + old_matrix_subquestion = "Differ by subquestion", + new_matrix_subquestion = NA, + old_response_options = case_when( + length(unique(old_response_options)) == 1 ~ old_response_options, + TRUE ~ "Differ by subquestion" + ), + new_response_options = case_when( + length(unique(new_response_options)) == 1 ~ new_response_options, + TRUE ~ "Differ by subquestion" + ) + ) %>% + slice_head() %>% + ungroup() + } else { + tmp <- mutate(tmp, change_type = "Matrix subquestion removed from existing item") + } + removed_items <- rbind(removed_items, tmp) + } + + result[["removed"]] <- removed_items + + + + # Do all other comparisons for (col in DIFF_COLS) { new_col <- paste("new", col, sep="_") @@ -213,17 +295,49 @@ generate_changelog <- function(path_to_codebook, ## Don't report all matrix subquestions when the change is shared between all ## of them, just report the base item. - - # Group by variable_base_name and change_type, as long as change is not "Matrix subquestion changed" and variable_base_name is not NA. + # Group by matrix_base_name, change_type, and wave, as long as the change_type is relevant and matrix_base_name is not NA. # Keep only one obs for each group. - # Set var name in kept obs to variable_base_name for generality and to be able to join rationales on. + # Set var name in kept obs to matrix_base_name for generality and to be able to join rationales on. + dedup_matrix_changes <- changelog %>% + filter( + (!is.na(old_matrix_base_name) | ! is.na(new_matrix_base_name)) & + change_type %in% c( + "Question wording changed", + "Display logic changed", + # "Answer choices changed", ## TODO: needs special logic, because Matrix subquestions can actually have different answer choices. Not needed for UMD + # "Answer choice order changed", ## TODO: needs special logic, because Matrix subquestions can actually have different answer choices. Not needed for UMD + "Respondent group changed" + ) + ) %>% + group_by(old_matrix_base_name, new_matrix_base_name, new_version, old_version, change_type) %>% + slice_head() %>% + ungroup() %>% + mutate( + variable_name = case_when( + old_matrix_base_name != new_matrix_base_name ~ paste(old_matrix_base_name, new_matrix_base_name, sep="/"), + TRUE ~ old_matrix_base_name + ), + old_matrix_subquestion = NA, + new_matrix_subquestion = NA + ) + changelog <- changelog %>% + filter( !( + (!is.na(old_matrix_base_name) | ! is.na(new_matrix_base_name)) & + change_type %in% c( + "Question wording changed", + "Display logic changed", + "Respondent group changed" + ) + ) + ) + changelog <- rbind(changelog, dedup_matrix_changes) ## Join old rationales on. # TODO: The first time this happens using this new script, need to manually map # some rationales for "Matrix subquestions changed", since previously this tag # would include added and removed subquestions. if (is.null(path_to_old_changelog)) { - warning("rationales will be empty unless an old version of the changelog is provided") + warning("rationales will be empty; an old version of the changelog was not provided") changelog$notes <- NA_character_ } else { old_changelog <- read_csv(path_to_old_changelog, col_types = cols( @@ -250,12 +364,14 @@ generate_changelog <- function(path_to_codebook, variable_name, description, change_type, + new_matrix_base_name, new_question_text, new_matrix_subquestion_text, new_response_options, new_display_logic, new_response_option_randomization, new_respondent_group, + old_matrix_base_name, old_question_text, old_matrix_subquestion_text, old_response_options, @@ -270,7 +386,7 @@ generate_changelog <- function(path_to_codebook, } rename_col <- function(col, prefix) { - if (col %in% DIFF_COLS) { + if (col %in% c(DIFF_COLS, "matrix_base_name")) { paste(prefix, col, sep = "_") } else { col @@ -281,6 +397,11 @@ find_col_differences <- function(codebook, new_col, old_col) { codebook[[old_col]] != codebook[[new_col]] } +get_old_version <- function(new_version, compare_map) { + ifelse(new_version %in% compare_map, compare_map[compare_map == new_version] %>% names(), NA_character_) +} + + args <- commandArgs(TRUE) if (!(length(args) %in% c(3, 4))) { diff --git a/facebook/qsf-tools/generate-codebook.R b/facebook/qsf-tools/generate-codebook.R index 4182f093c..b22f08aac 100644 --- a/facebook/qsf-tools/generate-codebook.R +++ b/facebook/qsf-tools/generate-codebook.R @@ -341,10 +341,10 @@ process_qsf <- function(path_to_qsf, # separate matrix subquestions into separate fields (to match exported data) nonmatrix_items <- qdf %>% filter(question_type != "Matrix") %>% - mutate(variable_base_name = NA_character_) %>% + mutate(matrix_base_name = NA_character_) %>% select(-matrix_subquestion_field_names) - has_resp_by_subq <- qdf %>% + has_response_by_subq <- qdf %>% filter(question_type == "Matrix") %>% pull(response_options) %>% map_lgl(~ all(map_lgl(.x, ~ inherits(.x, "list"))) && @@ -352,10 +352,10 @@ process_qsf <- function(path_to_qsf, matrix_items <- qdf %>% filter(question_type == "Matrix") %>% - filter(!has_resp_by_subq) %>% + filter(!has_response_by_subq) %>% rowwise() %>% mutate(new = list( - tibble(variable_base_name = variable, + tibble(matrix_base_name = variable, variable = unlist(matrix_subquestion_field_names), question = question, matrix_subquestion = unlist(matrix_subquestions), @@ -373,10 +373,10 @@ process_qsf <- function(path_to_qsf, matrix_items_resp_by_subq <- qdf %>% filter(question_type == "Matrix") %>% - filter(has_resp_by_subq) %>% + filter(has_response_by_subq) %>% rowwise() %>% mutate(new = list( - tibble(variable_base_name = variable, + tibble(matrix_base_name = variable, variable = unlist(matrix_subquestion_field_names), question = question, matrix_subquestion = unlist(matrix_subquestions), @@ -392,7 +392,7 @@ process_qsf <- function(path_to_qsf, unnest(new) matrix_items <- rbind(matrix_items, matrix_items_resp_by_subq) %>% - select(variable, variable_base_name, everything()) + select(variable, matrix_base_name, everything()) # Custom matrix formatting if (survey_version == "CMU") { @@ -423,7 +423,7 @@ process_qsf <- function(path_to_qsf, ) %>% select(wave, variable, - variable_base_name, + matrix_base_name, replaces, description, question, From 74bfa192d51284bc1f5112087bf4ddb3f0c773ad Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 1 Jun 2022 19:01:17 -0400 Subject: [PATCH 04/13] simplify filters with antijoins; remove test code --- .../generate-changelog-with-codebook.R | 55 ++++++------------- 1 file changed, 16 insertions(+), 39 deletions(-) diff --git a/facebook/qsf-tools/generate-changelog-with-codebook.R b/facebook/qsf-tools/generate-changelog-with-codebook.R index 2afebdfbc..070173b42 100644 --- a/facebook/qsf-tools/generate-changelog-with-codebook.R +++ b/facebook/qsf-tools/generate-changelog-with-codebook.R @@ -59,7 +59,7 @@ CHANGE_TYPE_MAP <- c( question = "Question wording changed", display_logic = "Display logic changed", response_options = "Answer choices changed", - matrix_subquestion = "Matrix subquestions changed", + matrix_subquestion = "Matrix subquestion text changed", response_option_randomization = "Answer choice order changed", respondent_group = "Respondent group changed" ) @@ -75,7 +75,7 @@ generate_changelog <- function(path_to_codebook, .default = col_character(), version = col_double() )) - browser() + local_compare_map <- WAVE_COMPARE_MAP[[survey_version]] # Add new-old wave mapping columns. Drop unused rows. codebook <- codebook_raw %>% @@ -135,7 +135,7 @@ generate_changelog <- function(path_to_codebook, ## Find differences. result <- list() - browser() + # Any item where x (old fields) does not exist but y does has been "added" added_items <- codebook %>% filter( @@ -146,25 +146,16 @@ generate_changelog <- function(path_to_codebook, # Process added items added_items <- added_items %>% mutate( - change_type = CHANGE_TYPE_MAP["added"], - old_version = map_chr( - new_version, - ~ ifelse( - length(names(local_compare_map[local_compare_map == .x])) == 0, - NA_character_, - names(local_compare_map[local_compare_map == .x]) - ) - ) %>% as.integer() + change_type = CHANGE_TYPE_MAP["added"] ) %>% select(-x_exists, -y_exists) - combos <- added_items %>% filter(question_type == "Matrix") %>% distinct(old_version, new_matrix_base_name) for (i in seq_len(nrow(combos))) { - browser() + wave = combos[i,] %>% pull(old_version) base_name = combos[i,] %>% pull(new_matrix_base_name) tmp <- added_items %>% @@ -196,36 +187,24 @@ generate_changelog <- function(path_to_codebook, } added_items <- rbind(added_items, tmp) } - browser() + result[["added"]] <- added_items - # Any item with missing "new_*" fields has been removed. + # Any item where x (old fields) exists but y does not has been "removed" removed_items <- codebook %>% filter( - !is.na(old_question), - !is.na(old_display_logic), - !is.na(old_response_option_randomization), - !is.na(old_respondent_group), - is.na(new_question), - is.na(new_display_logic), - is.na(new_response_option_randomization), - is.na(new_respondent_group) - ) %>% + x_exists & is.na(y_exists) + ) + codebook <- anti_join(codebook, removed_items) %>% + select(-x_exists, -y_exists) + + # Process removed items. + removed_items <- removed_items %>% mutate( change_type = CHANGE_TYPE_MAP["removed"] - ) - codebook <- codebook %>% - filter( - !(!is.na(old_question) & - !is.na(old_display_logic) & - !is.na(old_response_option_randomization) & - !is.na(old_respondent_group) & - is.na(new_question) & - is.na(new_display_logic) & - is.na(new_response_option_randomization) & - is.na(new_respondent_group)) - ) + ) %>% + select(-x_exists, -y_exists) combos <- removed_items %>% filter(question_type == "Matrix") %>% @@ -267,8 +246,6 @@ generate_changelog <- function(path_to_codebook, result[["removed"]] <- removed_items - - # Do all other comparisons for (col in DIFF_COLS) { new_col <- paste("new", col, sep="_") From de5c478338253b69b448a052b419a7bbe950c968 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 3 Jun 2022 17:15:40 -0400 Subject: [PATCH 05/13] filters --- facebook/qsf-tools/generate-changelog-with-codebook.R | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/facebook/qsf-tools/generate-changelog-with-codebook.R b/facebook/qsf-tools/generate-changelog-with-codebook.R index 070173b42..db071b1ab 100644 --- a/facebook/qsf-tools/generate-changelog-with-codebook.R +++ b/facebook/qsf-tools/generate-changelog-with-codebook.R @@ -151,11 +151,10 @@ generate_changelog <- function(path_to_codebook, select(-x_exists, -y_exists) combos <- added_items %>% - filter(question_type == "Matrix") %>% + filter(question_type == "Matrix" | !is.na(new_matrix_base_name) | !is.na(new_matrix_subquestion)) %>% distinct(old_version, new_matrix_base_name) for (i in seq_len(nrow(combos))) { - wave = combos[i,] %>% pull(old_version) base_name = combos[i,] %>% pull(new_matrix_base_name) tmp <- added_items %>% @@ -190,7 +189,7 @@ generate_changelog <- function(path_to_codebook, result[["added"]] <- added_items - + browser() # Any item where x (old fields) exists but y does not has been "removed" removed_items <- codebook %>% filter( @@ -207,7 +206,7 @@ generate_changelog <- function(path_to_codebook, select(-x_exists, -y_exists) combos <- removed_items %>% - filter(question_type == "Matrix") %>% + filter(question_type == "Matrix" | !is.na(old_matrix_base_name) | !is.na(old_matrix_subquestion)) %>% distinct(new_version, old_matrix_base_name) for (i in seq_len(nrow(combos))) { @@ -296,7 +295,7 @@ generate_changelog <- function(path_to_codebook, ), old_matrix_subquestion = NA, new_matrix_subquestion = NA - ) + ) changelog <- changelog %>% filter( !( (!is.na(old_matrix_base_name) | ! is.na(new_matrix_base_name)) & From c3c2faeb46707cf6b31bec09467bc179078246a7 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 3 Jun 2022 17:17:06 -0400 Subject: [PATCH 06/13] dedup matrix items for answer choice changes --- .../generate-changelog-with-codebook.R | 98 +++++++++++++------ 1 file changed, 66 insertions(+), 32 deletions(-) diff --git a/facebook/qsf-tools/generate-changelog-with-codebook.R b/facebook/qsf-tools/generate-changelog-with-codebook.R index db071b1ab..3733fee01 100644 --- a/facebook/qsf-tools/generate-changelog-with-codebook.R +++ b/facebook/qsf-tools/generate-changelog-with-codebook.R @@ -189,7 +189,6 @@ generate_changelog <- function(path_to_codebook, result[["added"]] <- added_items - browser() # Any item where x (old fields) exists but y does not has been "removed" removed_items <- codebook %>% filter( @@ -274,39 +273,74 @@ generate_changelog <- function(path_to_codebook, # Group by matrix_base_name, change_type, and wave, as long as the change_type is relevant and matrix_base_name is not NA. # Keep only one obs for each group. # Set var name in kept obs to matrix_base_name for generality and to be able to join rationales on. - dedup_matrix_changes <- changelog %>% - filter( - (!is.na(old_matrix_base_name) | ! is.na(new_matrix_base_name)) & - change_type %in% c( - "Question wording changed", - "Display logic changed", - # "Answer choices changed", ## TODO: needs special logic, because Matrix subquestions can actually have different answer choices. Not needed for UMD - # "Answer choice order changed", ## TODO: needs special logic, because Matrix subquestions can actually have different answer choices. Not needed for UMD - "Respondent group changed" - ) + combos <- changelog %>% + filter((question_type == "Matrix" | !is.na(old_matrix_base_name) | !is.na(old_matrix_subquestion)) & + change_type %in% c( + "Question wording changed", + "Display logic changed", + "Answer choices changed", ## TODO: needs special logic, because Matrix subquestions can actually have different answer choices. Not needed for UMD + "Answer choice order changed", ## TODO: needs special logic, because Matrix subquestions can actually have different answer choices. Not needed for UMD + "Respondent group changed" + ) ) %>% - group_by(old_matrix_base_name, new_matrix_base_name, new_version, old_version, change_type) %>% - slice_head() %>% - ungroup() %>% - mutate( - variable_name = case_when( - old_matrix_base_name != new_matrix_base_name ~ paste(old_matrix_base_name, new_matrix_base_name, sep="/"), - TRUE ~ old_matrix_base_name - ), - old_matrix_subquestion = NA, - new_matrix_subquestion = NA - ) - changelog <- changelog %>% - filter( !( - (!is.na(old_matrix_base_name) | ! is.na(new_matrix_base_name)) & - change_type %in% c( - "Question wording changed", - "Display logic changed", - "Respondent group changed" - ) - ) + distinct(new_version, old_version, new_matrix_base_name, old_matrix_base_name, change_type) + + SPECIAL_HANDLING <- list( + "Answer choices changed" = list("new_response_options", "old_response_options"), + "Answer choices order changed" = list("new_response_option_randomization", "old_response_option_randomization") ) - changelog <- rbind(changelog, dedup_matrix_changes) + for (i in seq_len(nrow(combos))) { + new_v <- combos[i,] %>% pull(new_version) + old_v <- combos[i,] %>% pull(old_version) + new_base <- combos[i,] %>% pull(new_matrix_base_name) + old_base <- combos[i,] %>% pull(old_matrix_base_name) + change <- combos[i,] %>% pull(change_type) + + tmp <- changelog %>% + filter( + new_version == new_v, + old_version == old_v, + new_matrix_base_name == new_base, + old_matrix_base_name == old_base, + change_type == change + ) + changelog <- anti_join(changelog, tmp) + + combine_flag <- FALSE + if (change %in% names(SPECIAL_HANDLING)) { + # See if the changed column is the same for all obs. Check if all matrix + # subquestions are listed. + new_col <- SPECIAL_HANDLING[[change]][[1]] + old_col <- SPECIAL_HANDLING[[change]][[2]] + if ( + length(unique(tmp[[new_col]])) == 1 && + length(unique(tmp[[old_col]])) == 1 && + ( + nrow(tmp) == codebook_raw %>% filter(version == old_v, matrix_base_name == old_base) %>% nrow() || + nrow(tmp) == codebook_raw %>% filter(version == new_v, matrix_base_name == new_base) %>% nrow() + ) + ) { + combine_flag <- TRUE + } + } else { + combine_flag <- TRUE + } + + if (combine_flag) { + tmp <- tmp %>% + slice_head() %>% + mutate( + variable_name = case_when( + old_matrix_base_name != new_matrix_base_name ~ paste(old_matrix_base_name, new_matrix_base_name, sep="/"), + TRUE ~ old_matrix_base_name + ), + old_matrix_subquestion = NA, + new_matrix_subquestion = NA + ) + } + + changelog <- rbind(changelog, tmp) + } ## Join old rationales on. # TODO: The first time this happens using this new script, need to manually map From 8cfd2d89962d4665b25ee1a264dc3aed84e0855e Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 3 Jun 2022 18:34:01 -0400 Subject: [PATCH 07/13] remove occasional brackets around response options --- facebook/qsf-tools/generate-codebook.R | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/facebook/qsf-tools/generate-codebook.R b/facebook/qsf-tools/generate-codebook.R index b22f08aac..0b7dea879 100644 --- a/facebook/qsf-tools/generate-codebook.R +++ b/facebook/qsf-tools/generate-codebook.R @@ -452,7 +452,11 @@ process_qsf <- function(path_to_qsf, qdf <- rbind(qdf, other_text_items) qdf$response_options[qdf$question_type == "Text"] <- NA - + # Drop occasional start and end square brackets from matrix response options. + qdf <- qdf %>% + mutate( + response_options = map_chr(response_options, ~ remove_brackets(.x)) + ) # Quality checks stopifnot(length(qdf$variable) == length(unique(qdf$variable))) @@ -467,6 +471,14 @@ process_qsf <- function(path_to_qsf, return(qdf %>% rename(version = wave)) } +remove_brackets <- function(response_set) { + if ( !is.na(response_set) && startsWith(response_set, "[") && endsWith(response_set, "]") ) { + str_sub(response_set, 2, -2) + } else { + response_set + } +} + #' Append the parsed and formatted info from the QSF to the existing codebook #' #' @param qdf dataframe containing parsed QSF data @@ -511,7 +523,7 @@ add_qdf_to_codebook <- function(qdf, qdf <- qdf %>% select(-replaces) } } - + # Using rbind here to raise an error if columns differ between the existing # codebook and the new wave data. # Sort so that items with missing question_type (non-Qualtrics fields) are at the top. From 637d5e7b491776a9e75162cd3d3175f3d21d9c21 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 6 Jun 2022 11:51:08 -0400 Subject: [PATCH 08/13] robustify display logic recoding --- facebook/qsf-tools/generate-codebook.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/facebook/qsf-tools/generate-codebook.R b/facebook/qsf-tools/generate-codebook.R index 0b7dea879..3265a7cf9 100644 --- a/facebook/qsf-tools/generate-codebook.R +++ b/facebook/qsf-tools/generate-codebook.R @@ -233,7 +233,7 @@ process_qsf <- function(path_to_qsf, curr_map <- recode_map[qids == qid][[1]] if ( !is.null(curr_map) ) { - option_code <- curr_map[names(curr_map) == option_code] + option_code <- ifelse(option_code %in% names(curr_map), curr_map[[which(names(curr_map) == option_code)]], option_code) } paste(c(qid, selectable_text, option_code), collapse="") From 4a63603bac935a54755b56ed5d4c19ed69137ff6 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 6 Jun 2022 12:25:30 -0400 Subject: [PATCH 09/13] include "If" for items with in-page logic --- facebook/qsf-tools/generate-codebook.R | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/facebook/qsf-tools/generate-codebook.R b/facebook/qsf-tools/generate-codebook.R index 3265a7cf9..95e37ceef 100644 --- a/facebook/qsf-tools/generate-codebook.R +++ b/facebook/qsf-tools/generate-codebook.R @@ -212,6 +212,7 @@ process_qsf <- function(path_to_qsf, " 'DisplayLogic' would be overwritten.") } display_logic[inpage_ii] <- inpage_logic[inpage_ii] + display_logic_raw <- display_logic display_logic <- display_logic %>% map(~ .x$`0`) %>% @@ -244,14 +245,12 @@ process_qsf <- function(path_to_qsf, map(~ paste(.x, collapse=" ")) # Handle questions that use a fixed condition ("If False", "If True") - ii_boolean_displaylogic <- (displayed_questions %>% - map(~ .x$Payload$DisplayLogic) %>% + ii_boolean_displaylogic <- (display_logic_raw %>% map(~ .x$`0`) %>% map(~ map(.x, "LogicType") %>% unlist()) == "BooleanValue") %>% which() - display_logic[ii_boolean_displaylogic] <- displayed_questions[ii_boolean_displaylogic] %>% - map(~ .x$Payload$DisplayLogic) %>% + display_logic[ii_boolean_displaylogic] <- display_logic_raw[ii_boolean_displaylogic] %>% map(~ .x$`0`) %>% map(~ paste( map(.x, "Value") @@ -260,8 +259,7 @@ process_qsf <- function(path_to_qsf, # Collapse logic into a single string. map(~ paste(.x, collapse="")) - logic_type <- displayed_questions %>% - map(~ .x$Payload$DisplayLogic) %>% + logic_type <- display_logic_raw %>% map(~ .x$`0`$Type) display_logic <- paste(logic_type, display_logic) %>% From 8e725b93f86068276d77f22b68e749fbd8313839 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 6 Jun 2022 12:31:53 -0400 Subject: [PATCH 10/13] hard-code UMD V15a display logic --- facebook/qsf-tools/generate-codebook.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/facebook/qsf-tools/generate-codebook.R b/facebook/qsf-tools/generate-codebook.R index 95e37ceef..26f81948e 100644 --- a/facebook/qsf-tools/generate-codebook.R +++ b/facebook/qsf-tools/generate-codebook.R @@ -267,6 +267,11 @@ process_qsf <- function(path_to_qsf, map(~ gsub(" $", "", .x)) %>% unlist() + # Hard-code display logic for UMD V15a. + if (survey_version == "UMD" && wave == 12) { + display_logic[which(item_names == "V15a")] <- "If V1/SelectableChoice/1 Is NotSelected" + } + # format all qsf content lists into a single tibble qdf <- tibble(variable = item_names, question = questions, From 3db535b281c2a6e387a1881519753e968c6185df Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 6 Jun 2022 12:40:27 -0400 Subject: [PATCH 11/13] rm diff-based changelog --- .../generate-changelog-with-codebook.R | 432 ---------- facebook/qsf-tools/generate-changelog.R | 738 +++++++++--------- 2 files changed, 365 insertions(+), 805 deletions(-) delete mode 100644 facebook/qsf-tools/generate-changelog-with-codebook.R diff --git a/facebook/qsf-tools/generate-changelog-with-codebook.R b/facebook/qsf-tools/generate-changelog-with-codebook.R deleted file mode 100644 index 3733fee01..000000000 --- a/facebook/qsf-tools/generate-changelog-with-codebook.R +++ /dev/null @@ -1,432 +0,0 @@ -#!/usr/bin/env Rscript - -## Combine the codebook and one or more diffs into a single file showing and -## rationalizing changes between waves. The diffs can be annotated, containing a -## `notes` column with rationales for the changes, or the rationales from a -## previous changelog version can be used. -## -## Usage: -## -## Rscript generate-changelog-with-codebook.R UMD|CMU path/to/codebook path/to/output/changelog [path/to/old/changelog] - -suppressPackageStartupMessages({ - library(tidyverse) -}) - -# "old" = new -WAVE_COMPARE_MAP <- list( - "UMD" = c( - "1" = 2, - "2" = 3, - "3" = 4, - "4" = 5, - "5" = 6, - "6" = 7, - "7" = 8, - "8" = 9, - "9" = 10, - "10" = 11, - "11" = 12, - "12" = 13 - ), - "CMU" = c( - "1" = 2, - "2" = 3, - "3" = 4, - "4" = 5, - "5" = 6, - "6" = 7, - "7" = 8, - "8" = 10, - "10" = 11, - "11" = 12, - "12" = 13 - ) -) - -DIFF_COLS <- c( - "question", - "matrix_subquestion", - "response_options", - "display_logic", - "response_option_randomization", - "respondent_group" -) - -CHANGE_TYPE_MAP <- c( - added = "Item added", - removed = "Item removed", - question = "Question wording changed", - display_logic = "Display logic changed", - response_options = "Answer choices changed", - matrix_subquestion = "Matrix subquestion text changed", - response_option_randomization = "Answer choice order changed", - respondent_group = "Respondent group changed" -) - - -generate_changelog <- function(path_to_codebook, - path_to_changelog, - path_to_old_changelog, - survey_version) { - # Get the codebook. Contains details about each question (text, answer - # choices, display logic) by wave. - codebook_raw <- read_csv(path_to_codebook, col_types = cols( - .default = col_character(), - version = col_double() - )) - - local_compare_map <- WAVE_COMPARE_MAP[[survey_version]] - # Add new-old wave mapping columns. Drop unused rows. - codebook <- codebook_raw %>% - mutate( - old_version = version, - new_version = local_compare_map[as.character(version)] - ) - - codebook <- full_join( - # with old columns - codebook %>% - rename_with(function(cols) { - map_chr(cols, ~ rename_col(.x, "old")) - }) %>% - select(-replaces) %>% - mutate(x_exists = TRUE), - # with new columns - codebook %>% - rename_with(function(cols) { - map_chr(cols, ~ rename_col(.x, "new")) - }) %>% - select(-replaces, -old_version, -new_version) %>% - mutate(y_exists = TRUE), - by = c("new_version" = "version", "variable" = "variable") - ) %>% - mutate( - description = coalesce(description.x, description.y), - question_type = coalesce(question_type.x, question_type.y), - ) %>% - select(-version, -description.x, -description.y, -question_type.x, -question_type.y) %>% - rename(variable_name = variable) - - # Drop obs where both old and new info is missing -- these are metavariables - # that we report in the microdata, like "weight" and "StartDate" - codebook <- codebook %>% - filter( - !(is.na(old_question) & - is.na(old_display_logic) & - is.na(old_response_option_randomization) & - is.na(old_respondent_group) & - is.na(new_question) & - is.na(new_display_logic) & - is.na(new_response_option_randomization) & - is.na(new_respondent_group)) - ) - - # Fill in version where missing - codebook$new_version <- coalesce(codebook$new_version, map_dbl(codebook$old_version, ~ local_compare_map[as.character(.x)])) - codebook$old_version <- coalesce(codebook$old_version, map_dbl(codebook$new_version, ~ get_old_version(.x, local_compare_map) %>% as.double())) - - # Drop obs where version is not in names or values of the wave mapping (i.e. 12.5) - codebook <- codebook %>% - filter( - new_version %in% c(local_compare_map, names(local_compare_map)), - old_version %in% c(local_compare_map, names(local_compare_map)) - ) - - ## Find differences. - result <- list() - - # Any item where x (old fields) does not exist but y does has been "added" - added_items <- codebook %>% - filter( - is.na(x_exists) & y_exists - ) - codebook <- anti_join(codebook, added_items) - - # Process added items - added_items <- added_items %>% - mutate( - change_type = CHANGE_TYPE_MAP["added"] - ) %>% - select(-x_exists, -y_exists) - - combos <- added_items %>% - filter(question_type == "Matrix" | !is.na(new_matrix_base_name) | !is.na(new_matrix_subquestion)) %>% - distinct(old_version, new_matrix_base_name) - - for (i in seq_len(nrow(combos))) { - wave = combos[i,] %>% pull(old_version) - base_name = combos[i,] %>% pull(new_matrix_base_name) - tmp <- added_items %>% - filter( - old_version == wave, new_matrix_base_name == base_name - ) - added_items <- anti_join(added_items, tmp) - if (nrow(filter(codebook_raw, version == wave, matrix_base_name == base_name)) == 0) { - # Dedup subqs so only report base question once - tmp <- tmp %>% - group_by(old_matrix_base_name, new_matrix_base_name, new_version, old_version) %>% - mutate( - variable_name = new_matrix_base_name, - old_matrix_subquestion = NA, - new_matrix_subquestion = "Differ by subquestion", - old_response_options = case_when( - length(unique(old_response_options)) == 1 ~ old_response_options, - TRUE ~ "Differ by subquestion" - ), - new_response_options = case_when( - length(unique(new_response_options)) == 1 ~ new_response_options, - TRUE ~ "Differ by subquestion" - ) - ) %>% - slice_head() %>% - ungroup() - } else { - tmp <- mutate(tmp, change_type = "Matrix subquestion added to existing item") - } - added_items <- rbind(added_items, tmp) - } - - result[["added"]] <- added_items - - # Any item where x (old fields) exists but y does not has been "removed" - removed_items <- codebook %>% - filter( - x_exists & is.na(y_exists) - ) - codebook <- anti_join(codebook, removed_items) %>% - select(-x_exists, -y_exists) - - # Process removed items. - removed_items <- removed_items %>% - mutate( - change_type = CHANGE_TYPE_MAP["removed"] - ) %>% - select(-x_exists, -y_exists) - - combos <- removed_items %>% - filter(question_type == "Matrix" | !is.na(old_matrix_base_name) | !is.na(old_matrix_subquestion)) %>% - distinct(new_version, old_matrix_base_name) - - for (i in seq_len(nrow(combos))) { - wave = combos[i,] %>% pull(new_version) - base_name = combos[i,] %>% pull(old_matrix_base_name) - tmp <- removed_items %>% - filter( - new_version == wave, old_matrix_base_name == base_name - ) - removed_items <- anti_join(removed_items, tmp) - if (nrow(filter(codebook_raw, version == wave, matrix_base_name == base_name)) == 0) { - # Dedup subqs so only report base question once - tmp <- tmp %>% - group_by(old_matrix_base_name, new_matrix_base_name, new_version, old_version) %>% - mutate( - variable_name = old_matrix_base_name, - old_matrix_subquestion = "Differ by subquestion", - new_matrix_subquestion = NA, - old_response_options = case_when( - length(unique(old_response_options)) == 1 ~ old_response_options, - TRUE ~ "Differ by subquestion" - ), - new_response_options = case_when( - length(unique(new_response_options)) == 1 ~ new_response_options, - TRUE ~ "Differ by subquestion" - ) - ) %>% - slice_head() %>% - ungroup() - } else { - tmp <- mutate(tmp, change_type = "Matrix subquestion removed from existing item") - } - removed_items <- rbind(removed_items, tmp) - } - - result[["removed"]] <- removed_items - - - # Do all other comparisons - for (col in DIFF_COLS) { - new_col <- paste("new", col, sep="_") - old_col <- paste("old", col, sep="_") - items_not_identical <- find_col_differences(codebook, new_col, old_col) - - changed <- codebook %>% - filter(items_not_identical) %>% - mutate(change_type = CHANGE_TYPE_MAP[col]) - if (col == "question") { - # Drop obs if the change is due to trivial formatting, e.g. nbsp - changed <- changed %>% - mutate( - new_question_wo_formatting = str_replace_all(new_question, " ", " "), - old_question_wo_formatting = str_replace_all(old_question, " ", " ") - ) %>% - filter(new_question_wo_formatting != old_question_wo_formatting) %>% - select(-new_question_wo_formatting, -old_question_wo_formatting) - } - result[[col]] <- changed - } - - changelog <- bind_rows(result) - - ## Don't report all matrix subquestions when the change is shared between all - ## of them, just report the base item. - # Group by matrix_base_name, change_type, and wave, as long as the change_type is relevant and matrix_base_name is not NA. - # Keep only one obs for each group. - # Set var name in kept obs to matrix_base_name for generality and to be able to join rationales on. - combos <- changelog %>% - filter((question_type == "Matrix" | !is.na(old_matrix_base_name) | !is.na(old_matrix_subquestion)) & - change_type %in% c( - "Question wording changed", - "Display logic changed", - "Answer choices changed", ## TODO: needs special logic, because Matrix subquestions can actually have different answer choices. Not needed for UMD - "Answer choice order changed", ## TODO: needs special logic, because Matrix subquestions can actually have different answer choices. Not needed for UMD - "Respondent group changed" - ) - ) %>% - distinct(new_version, old_version, new_matrix_base_name, old_matrix_base_name, change_type) - - SPECIAL_HANDLING <- list( - "Answer choices changed" = list("new_response_options", "old_response_options"), - "Answer choices order changed" = list("new_response_option_randomization", "old_response_option_randomization") - ) - for (i in seq_len(nrow(combos))) { - new_v <- combos[i,] %>% pull(new_version) - old_v <- combos[i,] %>% pull(old_version) - new_base <- combos[i,] %>% pull(new_matrix_base_name) - old_base <- combos[i,] %>% pull(old_matrix_base_name) - change <- combos[i,] %>% pull(change_type) - - tmp <- changelog %>% - filter( - new_version == new_v, - old_version == old_v, - new_matrix_base_name == new_base, - old_matrix_base_name == old_base, - change_type == change - ) - changelog <- anti_join(changelog, tmp) - - combine_flag <- FALSE - if (change %in% names(SPECIAL_HANDLING)) { - # See if the changed column is the same for all obs. Check if all matrix - # subquestions are listed. - new_col <- SPECIAL_HANDLING[[change]][[1]] - old_col <- SPECIAL_HANDLING[[change]][[2]] - if ( - length(unique(tmp[[new_col]])) == 1 && - length(unique(tmp[[old_col]])) == 1 && - ( - nrow(tmp) == codebook_raw %>% filter(version == old_v, matrix_base_name == old_base) %>% nrow() || - nrow(tmp) == codebook_raw %>% filter(version == new_v, matrix_base_name == new_base) %>% nrow() - ) - ) { - combine_flag <- TRUE - } - } else { - combine_flag <- TRUE - } - - if (combine_flag) { - tmp <- tmp %>% - slice_head() %>% - mutate( - variable_name = case_when( - old_matrix_base_name != new_matrix_base_name ~ paste(old_matrix_base_name, new_matrix_base_name, sep="/"), - TRUE ~ old_matrix_base_name - ), - old_matrix_subquestion = NA, - new_matrix_subquestion = NA - ) - } - - changelog <- rbind(changelog, tmp) - } - - ## Join old rationales on. - # TODO: The first time this happens using this new script, need to manually map - # some rationales for "Matrix subquestions changed", since previously this tag - # would include added and removed subquestions. - if (is.null(path_to_old_changelog)) { - warning("rationales will be empty; an old version of the changelog was not provided") - changelog$notes <- NA_character_ - } else { - old_changelog <- read_csv(path_to_old_changelog, col_types = cols( - .default = col_character(), - new_version = col_double(), - old_version = col_double() - )) %>% - select(new_version, old_version, variable_name, change_type, notes) - changelog <- changelog %>% - left_join(old_changelog, by=c("new_version", "old_version", "variable_name", "change_type")) - } - - write_excel_csv( - changelog %>% - rename( - new_question_text = new_question, - old_question_text = old_question, - new_matrix_subquestion_text = new_matrix_subquestion, - old_matrix_subquestion_text = old_matrix_subquestion - ) %>% - select( - new_version, - old_version, - variable_name, - description, - change_type, - new_matrix_base_name, - new_question_text, - new_matrix_subquestion_text, - new_response_options, - new_display_logic, - new_response_option_randomization, - new_respondent_group, - old_matrix_base_name, - old_question_text, - old_matrix_subquestion_text, - old_response_options, - old_display_logic, - old_response_option_randomization, - old_respondent_group, - notes - ) %>% - arrange(new_version, old_version), - path_to_changelog, quote="needed" - ) -} - -rename_col <- function(col, prefix) { - if (col %in% c(DIFF_COLS, "matrix_base_name")) { - paste(prefix, col, sep = "_") - } else { - col - } -} - -find_col_differences <- function(codebook, new_col, old_col) { - codebook[[old_col]] != codebook[[new_col]] -} - -get_old_version <- function(new_version, compare_map) { - ifelse(new_version %in% compare_map, compare_map[compare_map == new_version] %>% names(), NA_character_) -} - - -args <- commandArgs(TRUE) - -if (!(length(args) %in% c(3, 4))) { - stop("Usage: Rscript generate-changelog-with-codebook.R UMD|CMU path/to/codebook path/to/output/changelog [path/to/old/changelog]") -} - -survey_version <- args[1] -path_to_codebook <- args[2] -path_to_changelog <- args[3] - -if (length(args) == 4) { - path_to_old_changelog <- args[4] -} else { - path_to_old_changelog <- NULL -} - -invisible(generate_changelog(path_to_codebook, path_to_changelog, path_to_old_changelog, survey_version)) - diff --git a/facebook/qsf-tools/generate-changelog.R b/facebook/qsf-tools/generate-changelog.R index f874a4e66..3733fee01 100644 --- a/facebook/qsf-tools/generate-changelog.R +++ b/facebook/qsf-tools/generate-changelog.R @@ -7,434 +7,426 @@ ## ## Usage: ## -## Rscript generate-changelog.R UMD|CMU path/to/codebook path/to/diff/or/diff/directory path/to/output/changelog [path/to/old/changelog] +## Rscript generate-changelog-with-codebook.R UMD|CMU path/to/codebook path/to/output/changelog [path/to/old/changelog] suppressPackageStartupMessages({ library(tidyverse) - library(purrr) - library(stringr) - source("qsf-utils.R") }) +# "old" = new +WAVE_COMPARE_MAP <- list( + "UMD" = c( + "1" = 2, + "2" = 3, + "3" = 4, + "4" = 5, + "5" = 6, + "6" = 7, + "7" = 8, + "8" = 9, + "9" = 10, + "10" = 11, + "11" = 12, + "12" = 13 + ), + "CMU" = c( + "1" = 2, + "2" = 3, + "3" = 4, + "4" = 5, + "5" = 6, + "6" = 7, + "7" = 8, + "8" = 10, + "10" = 11, + "11" = 12, + "12" = 13 + ) +) + +DIFF_COLS <- c( + "question", + "matrix_subquestion", + "response_options", + "display_logic", + "response_option_randomization", + "respondent_group" +) + +CHANGE_TYPE_MAP <- c( + added = "Item added", + removed = "Item removed", + question = "Question wording changed", + display_logic = "Display logic changed", + response_options = "Answer choices changed", + matrix_subquestion = "Matrix subquestion text changed", + response_option_randomization = "Answer choice order changed", + respondent_group = "Respondent group changed" +) + generate_changelog <- function(path_to_codebook, - path_to_diff, path_to_changelog, path_to_old_changelog, - survey_version, - rename_map_file="item_rename_map.csv") { + survey_version) { # Get the codebook. Contains details about each question (text, answer # choices, display logic) by wave. - codebook <- get_codebook(path_to_codebook) + codebook_raw <- read_csv(path_to_codebook, col_types = cols( + .default = col_character(), + version = col_double() + )) - # Get the diffs + rationale. Contains info about which items changed between - # waves, plus a description of what changed and why. - qsf_diff <- get_diff(path_to_diff) + local_compare_map <- WAVE_COMPARE_MAP[[survey_version]] + # Add new-old wave mapping columns. Drop unused rows. + codebook <- codebook_raw %>% + mutate( + old_version = version, + new_version = local_compare_map[as.character(version)] + ) - if (!("notes" %in% names(qsf_diff)) && is.null(path_to_old_changelog)) { - warning("rationales must be provided either in the diff or via an old version of the changelog") - qsf_diff$notes <- NA_character_ - } + codebook <- full_join( + # with old columns + codebook %>% + rename_with(function(cols) { + map_chr(cols, ~ rename_col(.x, "old")) + }) %>% + select(-replaces) %>% + mutate(x_exists = TRUE), + # with new columns + codebook %>% + rename_with(function(cols) { + map_chr(cols, ~ rename_col(.x, "new")) + }) %>% + select(-replaces, -old_version, -new_version) %>% + mutate(y_exists = TRUE), + by = c("new_version" = "version", "variable" = "variable") + ) %>% + mutate( + description = coalesce(description.x, description.y), + question_type = coalesce(question_type.x, question_type.y), + ) %>% + select(-version, -description.x, -description.y, -question_type.x, -question_type.y) %>% + rename(variable_name = variable) - qsf_diff <- expand_out_matrix_subquestions(qsf_diff) + # Drop obs where both old and new info is missing -- these are metavariables + # that we report in the microdata, like "weight" and "StartDate" + codebook <- codebook %>% + filter( + !(is.na(old_question) & + is.na(old_display_logic) & + is.na(old_response_option_randomization) & + is.na(old_respondent_group) & + is.na(new_question) & + is.na(new_display_logic) & + is.na(new_response_option_randomization) & + is.na(new_respondent_group)) + ) - # Rename items as necessary - path_to_rename_map <- localize_static_filepath(rename_map_file, survey_version) - qsf_diff <- qsf_diff %>% - rowwise() %>% - mutate( - # Don't rename items that have been removed. Renaming is based on `new_wave`, - # but removed items are not actually present in `new_wave`, just the `old_wave`. - variable_name = patch_item_names( - variable_name, path_to_rename_map, new_wave, change_type != "Item removed" - ) + # Fill in version where missing + codebook$new_version <- coalesce(codebook$new_version, map_dbl(codebook$old_version, ~ local_compare_map[as.character(.x)])) + codebook$old_version <- coalesce(codebook$old_version, map_dbl(codebook$new_version, ~ get_old_version(.x, local_compare_map) %>% as.double())) + + # Drop obs where version is not in names or values of the wave mapping (i.e. 12.5) + codebook <- codebook %>% + filter( + new_version %in% c(local_compare_map, names(local_compare_map)), + old_version %in% c(local_compare_map, names(local_compare_map)) ) - result <- prepare_matrix_base_questions_for_join(qsf_diff, codebook) - qsf_diff <- result$diff - vars_not_in_codebook <- result$vars_not_in_codebook - - changelog <- make_changelog_from_codebook_and_diff(qsf_diff, codebook, vars_not_in_codebook) - changelog <- add_rationales_from_old_changelog(changelog, path_to_old_changelog) - check_missing_rationales(changelog) + ## Find differences. + result <- list() - write_excel_csv(changelog %>% rename(new_version=new_wave, old_version=old_wave), path_to_changelog, quote="needed") -} - -# Read codebook from path. Drop fields we don't use in the changelog. -get_codebook <- function(path_to_codebook) { - codebook <- read_csv(path_to_codebook, col_types = cols( - .default = col_character(), - version = col_double() - )) %>% - rename(question_text = question, matrix_subquestion_text = matrix_subquestion) %>% - select( - -replaces, -description, -question_type, - -response_option_randomization, -respondent_group + # Any item where x (old fields) does not exist but y does has been "added" + added_items <- codebook %>% + filter( + is.na(x_exists) & y_exists ) + codebook <- anti_join(codebook, added_items) - return(codebook) -} - -# Try to load `path_to_diff`. Check if it is a single CSV or a directory -# containing a set of CSVs. -get_diff <- function(path_to_diff) { - if (dir.exists(path_to_diff)) { - # Load all CSVs from a directory - csvs <- list.files(path_to_diff, pattern = "*.csv$", full.names = TRUE) - qsf_diff <- list() - for (csv in csvs) { - curr_diff <- read_csv(csv, col_types = cols( - .default = col_character(), - new_wave = col_double(), - old_wave = col_double() - )) - - old_wave_vars <- curr_diff %>% - filter(change_type != "Item added") %>% - distinct(item) %>% - pull() - new_wave_vars <- curr_diff %>% - filter(change_type != "Item removed") %>% - distinct(item) %>% - pull() - if (all(c("C0_matrix", "C0_likert") %in% old_wave_vars) || all(c("C0_matrix", "C0_likert") %in% new_wave_vars)) { - stop("Only one of 'C0_matrix' and 'C0_likert' can be present at once in file ", csv) - } - if (all(c("B13_profile", "B13_likert") %in% old_wave_vars) || all(c("B13_profile", "B13_likert") %in% new_wave_vars)) { - stop("Only one of 'B13_profile' and 'B13_likert' can be present at once in file ", csv) - } - if (all(c("B14_profile", "B14_likert") %in% old_wave_vars) || all(c("B14_profile", "B14_likert") %in% new_wave_vars)) { - stop("Only one of 'B14_profile' and 'B14_likert' can be present at once in file ", csv) - } - if (all(c("B12a_profile", "B12a_likert") %in% old_wave_vars) || all(c("B12a_profile", "B12a_likert") %in% new_wave_vars)) { - stop("Only one of 'B12a_profile' and 'B12a_likert' can be present at once in file ", csv) - } - if (all(c("B12b_profile", "B12b_likert") %in% old_wave_vars) || all(c("B12b_profile", "B12b_likert") %in% new_wave_vars)) { - stop("Only one of 'B12b_profile' and 'B12b_likert' can be present at once in file ", csv) - } - if (all(c("B1b_matrix", "B1b_likert") %in% old_wave_vars) || all(c("B1b_matrix", "B1b_likert") %in% new_wave_vars)) { - stop("Only one of 'B1b_matrix' and 'B1b_likert' can be present at once in file ", csv) - } - - qsf_diff[[csv]] <- curr_diff + # Process added items + added_items <- added_items %>% + mutate( + change_type = CHANGE_TYPE_MAP["added"] + ) %>% + select(-x_exists, -y_exists) + + combos <- added_items %>% + filter(question_type == "Matrix" | !is.na(new_matrix_base_name) | !is.na(new_matrix_subquestion)) %>% + distinct(old_version, new_matrix_base_name) + + for (i in seq_len(nrow(combos))) { + wave = combos[i,] %>% pull(old_version) + base_name = combos[i,] %>% pull(new_matrix_base_name) + tmp <- added_items %>% + filter( + old_version == wave, new_matrix_base_name == base_name + ) + added_items <- anti_join(added_items, tmp) + if (nrow(filter(codebook_raw, version == wave, matrix_base_name == base_name)) == 0) { + # Dedup subqs so only report base question once + tmp <- tmp %>% + group_by(old_matrix_base_name, new_matrix_base_name, new_version, old_version) %>% + mutate( + variable_name = new_matrix_base_name, + old_matrix_subquestion = NA, + new_matrix_subquestion = "Differ by subquestion", + old_response_options = case_when( + length(unique(old_response_options)) == 1 ~ old_response_options, + TRUE ~ "Differ by subquestion" + ), + new_response_options = case_when( + length(unique(new_response_options)) == 1 ~ new_response_options, + TRUE ~ "Differ by subquestion" + ) + ) %>% + slice_head() %>% + ungroup() + } else { + tmp <- mutate(tmp, change_type = "Matrix subquestion added to existing item") } - qsf_diff <- purrr::reduce(qsf_diff, rbind) %>% - rename(variable_name = item) %>% - select(-contains("qid")) - } else if (file.exists(path_to_diff)) { - # Load a single file - qsf_diff <- read_csv(path_to_diff, col_types = cols( - .default = col_character(), - new_wave = col_double(), - old_wave = col_double() - )) %>% - rename(variable_name = item) %>% - select(-contains("qid")) - } else { - stop(path_to_diff, " is not a valid file or directory") + added_items <- rbind(added_items, tmp) } - - return(qsf_diff) -} -# Turn any item listed as an `impacted_subquestion` into its own observation. -# Other fields are set to be the same as the base question (e.g. the base -# question is E1 for matrix subquestion E1_1). -expand_out_matrix_subquestions <- function(qsf_diff) { - # The diff only lists base name for matrix questions that changed. For - # example, `variable_name` is "Z1" if any matrix subquestion ("Z1_1", "Z1_2", - # etc) changed. The modified subquestions are listed in column - # `impacted_subquestions`. - # - # Since the codebook lists matrix subquestions separately, we need to split up - # the `impacted_subquestions` such that each subquestion is its own - # observation. This will allow us to join the codebook onto the diff. - nonmatrix_changes <- qsf_diff %>% - filter(is.na(impacted_subquestions)) %>% - select(-impacted_subquestions) - # Separately process any obs with non-missing `impacted_subquestions.` - matrix_changes <- qsf_diff %>% - filter(!is.na(impacted_subquestions)) %>% - # If multiple matrix subquestions changed, list each separately. - rowwise() %>% - mutate(new = list( - tibble( - new_wave = new_wave, - old_wave = old_wave, - change_type = change_type, - variable_name = str_split(impacted_subquestions, ", ") %>% unlist(), - notes = notes - ) - )) %>% - select(new) %>% - unnest(new) + result[["added"]] <- added_items - # Combine matrix and non-matrix subquestions. Use rbind to warn if our columns - # differ. - qsf_diff <- rbind(nonmatrix_changes, matrix_changes) %>% - arrange(new_wave, old_wave) - - return(qsf_diff) -} - -# Matrix base questions (e.g. the base question is E1 for matrix subquestion -# E1_1) exist in diffs but not in the codebook. To be able to join them between -# the two dfs, create a variable name mapping specifically for use in the join -# operation. -# -# A matrix base question is mapped to the first associated subquestion instance -# for a particular wave. The first subquestion is used for convenience and -# reproducibility; subquestion-specific fields are set to `NA`. -prepare_matrix_base_questions_for_join <- function(qsf_diff, codebook) { - # If variable_name from the qsf_diff is not also listed as a variable in - # the codebook, try adding an underscore to the end and looking for a variable - # in the codebook that starts with that string. The matrix_subquestion_text - # field of the match should be populated, although we want to ignore it and - # fill with NA instead. - vars_not_in_codebook <- setdiff( - qsf_diff %>% distinct(variable_name) %>% pull(), - codebook %>% distinct(variable) %>% pull() - ) - - # Add an underscore to the unmatched variable names to create a regex pattern - matrix_prefixes <- paste0(vars_not_in_codebook, "_") - names(matrix_prefixes) <- vars_not_in_codebook - - # A subset of UMD variables need manual mapping. - if ("C0_matrix" %in% names(matrix_prefixes)) { - matrix_prefixes["C0_matrix"] <- "C0_" - } - if ("C0_likert" %in% names(matrix_prefixes)) { - matrix_prefixes["C0_likert"] <- "C0_" - } + # Any item where x (old fields) exists but y does not has been "removed" + removed_items <- codebook %>% + filter( + x_exists & is.na(y_exists) + ) + codebook <- anti_join(codebook, removed_items) %>% + select(-x_exists, -y_exists) - if ("B13_profile" %in% names(matrix_prefixes)) { - matrix_prefixes["B13_profile"] <- "B13_" - } - if ("B13_likert" %in% names(matrix_prefixes)) { - matrix_prefixes["B13_likert"] <- "B13_" - } + # Process removed items. + removed_items <- removed_items %>% + mutate( + change_type = CHANGE_TYPE_MAP["removed"] + ) %>% + select(-x_exists, -y_exists) - if ("B14_profile" %in% names(matrix_prefixes)) { - matrix_prefixes["B14_profile"] <- "B14_" - } - if ("B14_likert" %in% names(matrix_prefixes)) { - matrix_prefixes["B14_likert"] <- "B14_" - } + combos <- removed_items %>% + filter(question_type == "Matrix" | !is.na(old_matrix_base_name) | !is.na(old_matrix_subquestion)) %>% + distinct(new_version, old_matrix_base_name) - if ("B12a_profile" %in% names(matrix_prefixes)) { - matrix_prefixes["B12a_profile"] <- "B12a_" - } - if ("B12a_likert" %in% names(matrix_prefixes)) { - matrix_prefixes["B12a_likert"] <- "B12a_" + for (i in seq_len(nrow(combos))) { + wave = combos[i,] %>% pull(new_version) + base_name = combos[i,] %>% pull(old_matrix_base_name) + tmp <- removed_items %>% + filter( + new_version == wave, old_matrix_base_name == base_name + ) + removed_items <- anti_join(removed_items, tmp) + if (nrow(filter(codebook_raw, version == wave, matrix_base_name == base_name)) == 0) { + # Dedup subqs so only report base question once + tmp <- tmp %>% + group_by(old_matrix_base_name, new_matrix_base_name, new_version, old_version) %>% + mutate( + variable_name = old_matrix_base_name, + old_matrix_subquestion = "Differ by subquestion", + new_matrix_subquestion = NA, + old_response_options = case_when( + length(unique(old_response_options)) == 1 ~ old_response_options, + TRUE ~ "Differ by subquestion" + ), + new_response_options = case_when( + length(unique(new_response_options)) == 1 ~ new_response_options, + TRUE ~ "Differ by subquestion" + ) + ) %>% + slice_head() %>% + ungroup() + } else { + tmp <- mutate(tmp, change_type = "Matrix subquestion removed from existing item") + } + removed_items <- rbind(removed_items, tmp) } - if ("B12b_profile" %in% names(matrix_prefixes)) { - matrix_prefixes["B12b_profile"] <- "B12b_" - } - if ("B12b_likert" %in% names(matrix_prefixes)) { - matrix_prefixes["B12b_likert"] <- "B12b_" - } + result[["removed"]] <- removed_items - if ("B1b_matrix" %in% names(matrix_prefixes)) { - matrix_prefixes["B1b_matrix"] <- "B1b_" - } - if ("B1b_likert" %in% names(matrix_prefixes)) { - matrix_prefixes["B1b_likert"] <- "B1b_" + + # Do all other comparisons + for (col in DIFF_COLS) { + new_col <- paste("new", col, sep="_") + old_col <- paste("old", col, sep="_") + items_not_identical <- find_col_differences(codebook, new_col, old_col) + + changed <- codebook %>% + filter(items_not_identical) %>% + mutate(change_type = CHANGE_TYPE_MAP[col]) + if (col == "question") { + # Drop obs if the change is due to trivial formatting, e.g. nbsp + changed <- changed %>% + mutate( + new_question_wo_formatting = str_replace_all(new_question, " ", " "), + old_question_wo_formatting = str_replace_all(old_question, " ", " ") + ) %>% + filter(new_question_wo_formatting != old_question_wo_formatting) %>% + select(-new_question_wo_formatting, -old_question_wo_formatting) + } + result[[col]] <- changed } - # First matrix item match by wave and matrix base question. - map_matrix_prefix_to_first_match <- codebook %>% - mutate( - join_variable = case_when( - # Create the basename for matrix items. - !is.na(matrix_subquestion_text) ~ strsplit(variable, "_") %>% - # Get all but last underscore-delimited chunk - purrr::map(~ .x[1:(length(.x) - 1)]) %>% - # Combine all but the last chunk with underscores. - purrr::map(~ paste0(.x, collapse="_") %>% paste0("_")) %>% - unlist(), - TRUE ~ variable - ) - ) %>% - filter(join_variable %in% matrix_prefixes) %>% - group_by(version, join_variable) %>% - slice_head() %>% - select(version, variable, join_variable) + changelog <- bind_rows(result) - # Add the regex patterns onto the diff. - qsf_diff <- qsf_diff %>% - mutate( - join_variable = case_when( - variable_name %in% vars_not_in_codebook ~ matrix_prefixes[variable_name], - TRUE ~ variable_name - ) - ) %>% - left_join( - map_matrix_prefix_to_first_match %>% rename_with(function(column_names) { - paste("new", column_names, sep = "_") - }), - by=c("new_wave" = "new_version", "join_variable"="new_join_variable") - ) %>% - left_join( - map_matrix_prefix_to_first_match %>% rename_with(function(column_names) { - paste("old", column_names, sep = "_") - }), - by=c("old_wave" = "old_version", "join_variable"="old_join_variable") - ) %>% - rename( - join_variable_new_wave = new_variable, - join_variable_old_wave = old_variable - ) %>% - mutate( - join_variable_new_wave = coalesce(join_variable_new_wave, variable_name), - join_variable_old_wave = coalesce(join_variable_old_wave, variable_name) - ) %>% - select(-join_variable) - - return(list("diff" = qsf_diff, "vars_not_in_codebook" = vars_not_in_codebook)) -} - -# Join codebook onto diff and modify columns to make the changelog. -make_changelog_from_codebook_and_diff <- function(qsf_diff, codebook, vars_not_in_codebook) { - # Create changelog by joining codebook onto annotated diff. - changelog <- qsf_diff %>% - # Add info about new version of question - left_join( - codebook %>% rename_with(function(column_names) { - paste("new", column_names, sep = "_") - }), - by=c("new_wave" = "new_version", "join_variable_new_wave" = "new_variable") + ## Don't report all matrix subquestions when the change is shared between all + ## of them, just report the base item. + # Group by matrix_base_name, change_type, and wave, as long as the change_type is relevant and matrix_base_name is not NA. + # Keep only one obs for each group. + # Set var name in kept obs to matrix_base_name for generality and to be able to join rationales on. + combos <- changelog %>% + filter((question_type == "Matrix" | !is.na(old_matrix_base_name) | !is.na(old_matrix_subquestion)) & + change_type %in% c( + "Question wording changed", + "Display logic changed", + "Answer choices changed", ## TODO: needs special logic, because Matrix subquestions can actually have different answer choices. Not needed for UMD + "Answer choice order changed", ## TODO: needs special logic, because Matrix subquestions can actually have different answer choices. Not needed for UMD + "Respondent group changed" + ) ) %>% - # Add info about previous version of question - left_join( - codebook %>% rename_with(function(column_names) { - paste("old", column_names, sep = "_") - }), - by=c("old_wave" = "old_version", "join_variable_old_wave" = "old_variable") - ) %>% - select( - new_wave, - old_wave, - variable_name, - change_type, - new_question_text, - new_matrix_subquestion_text, - new_response_options, - new_display_logic, - old_question_text, - old_matrix_subquestion_text, - old_response_options, - old_display_logic, - notes - ) %>% - # If item is a matrix question where something other than the matrix - # subquestions changed between waves, drop matrix_subquestion_text fields, - # which are relevant for only a single subquestion. - mutate( - new_matrix_subquestion_text = case_when( - variable_name %in% vars_not_in_codebook ~ NA_character_, - TRUE ~ new_matrix_subquestion_text - ), - old_matrix_subquestion_text = case_when( - variable_name %in% vars_not_in_codebook ~ NA_character_, - TRUE ~ old_matrix_subquestion_text - ) - ) %>% - # When an item was added, all `old_` fields should be empty; when an item - # was removed, all `new_` fields should be empty. - mutate( - old_question_text = case_when( - change_type == "Item added" ~ NA_character_, - TRUE ~ old_question_text - ), - old_matrix_subquestion_text = case_when( - change_type == "Item added" ~ NA_character_, - TRUE ~ old_matrix_subquestion_text - ), - old_response_options = case_when( - change_type == "Item added" ~ NA_character_, - TRUE ~ old_response_options - ), - old_display_logic = case_when( - change_type == "Item added" ~ NA_character_, - TRUE ~ old_display_logic - ), - new_question_text = case_when( - change_type == "Item removed" ~ NA_character_, - TRUE ~ new_question_text - ), - new_matrix_subquestion_text = case_when( - change_type == "Item removed" ~ NA_character_, - TRUE ~ new_matrix_subquestion_text - ), - new_response_options = case_when( - change_type == "Item removed" ~ NA_character_, - TRUE ~ new_response_options - ), - new_display_logic = case_when( - change_type == "Item removed" ~ NA_character_, - TRUE ~ new_display_logic + distinct(new_version, old_version, new_matrix_base_name, old_matrix_base_name, change_type) + + SPECIAL_HANDLING <- list( + "Answer choices changed" = list("new_response_options", "old_response_options"), + "Answer choices order changed" = list("new_response_option_randomization", "old_response_option_randomization") + ) + for (i in seq_len(nrow(combos))) { + new_v <- combos[i,] %>% pull(new_version) + old_v <- combos[i,] %>% pull(old_version) + new_base <- combos[i,] %>% pull(new_matrix_base_name) + old_base <- combos[i,] %>% pull(old_matrix_base_name) + change <- combos[i,] %>% pull(change_type) + + tmp <- changelog %>% + filter( + new_version == new_v, + old_version == old_v, + new_matrix_base_name == new_base, + old_matrix_base_name == old_base, + change_type == change ) - ) + changelog <- anti_join(changelog, tmp) + + combine_flag <- FALSE + if (change %in% names(SPECIAL_HANDLING)) { + # See if the changed column is the same for all obs. Check if all matrix + # subquestions are listed. + new_col <- SPECIAL_HANDLING[[change]][[1]] + old_col <- SPECIAL_HANDLING[[change]][[2]] + if ( + length(unique(tmp[[new_col]])) == 1 && + length(unique(tmp[[old_col]])) == 1 && + ( + nrow(tmp) == codebook_raw %>% filter(version == old_v, matrix_base_name == old_base) %>% nrow() || + nrow(tmp) == codebook_raw %>% filter(version == new_v, matrix_base_name == new_base) %>% nrow() + ) + ) { + combine_flag <- TRUE + } + } else { + combine_flag <- TRUE + } + + if (combine_flag) { + tmp <- tmp %>% + slice_head() %>% + mutate( + variable_name = case_when( + old_matrix_base_name != new_matrix_base_name ~ paste(old_matrix_base_name, new_matrix_base_name, sep="/"), + TRUE ~ old_matrix_base_name + ), + old_matrix_subquestion = NA, + new_matrix_subquestion = NA + ) + } + + changelog <- rbind(changelog, tmp) + } - return(changelog) -} - -# Add old rationales, if available, to new changelog -add_rationales_from_old_changelog <- function(changelog, path_to_old_changelog) { - # If path_to_old_changelog is provided, prefer it over existing notes column. - if (!is.null(path_to_old_changelog)) { + ## Join old rationales on. + # TODO: The first time this happens using this new script, need to manually map + # some rationales for "Matrix subquestions changed", since previously this tag + # would include added and removed subquestions. + if (is.null(path_to_old_changelog)) { + warning("rationales will be empty; an old version of the changelog was not provided") + changelog$notes <- NA_character_ + } else { old_changelog <- read_csv(path_to_old_changelog, col_types = cols( .default = col_character(), - new_wave = col_double(), - old_wave = col_double() + new_version = col_double(), + old_version = col_double() )) %>% - select(new_wave, old_wave, variable_name, change_type, notes) + select(new_version, old_version, variable_name, change_type, notes) changelog <- changelog %>% - select(-notes) %>% - left_join(old_changelog, by=c("new_wave", "old_wave", "variable_name", "change_type")) + left_join(old_changelog, by=c("new_version", "old_version", "variable_name", "change_type")) } - return(changelog) + write_excel_csv( + changelog %>% + rename( + new_question_text = new_question, + old_question_text = old_question, + new_matrix_subquestion_text = new_matrix_subquestion, + old_matrix_subquestion_text = old_matrix_subquestion + ) %>% + select( + new_version, + old_version, + variable_name, + description, + change_type, + new_matrix_base_name, + new_question_text, + new_matrix_subquestion_text, + new_response_options, + new_display_logic, + new_response_option_randomization, + new_respondent_group, + old_matrix_base_name, + old_question_text, + old_matrix_subquestion_text, + old_response_options, + old_display_logic, + old_response_option_randomization, + old_respondent_group, + notes + ) %>% + arrange(new_version, old_version), + path_to_changelog, quote="needed" + ) } -check_missing_rationales <- function(changelog) { - if (any(is.na(changelog$notes))) { - vars_missing_rationales <- changelog %>% - filter(is.na(notes) | notes == "") %>% - pull(variable_name) - waves <- changelog %>% - filter(is.na(notes) | notes == "") %>% - pull(new_wave) - change_types <- changelog %>% - filter(is.na(notes) | notes == "") %>% - pull(change_type) - warning( - "variables ", paste0(vars_missing_rationales, " (new_wave ", waves, ", ", change_types, ")", collapse = ", "), - " are missing rationales" - ) +rename_col <- function(col, prefix) { + if (col %in% c(DIFF_COLS, "matrix_base_name")) { + paste(prefix, col, sep = "_") + } else { + col } - - return(NULL) } +find_col_differences <- function(codebook, new_col, old_col) { + codebook[[old_col]] != codebook[[new_col]] +} + +get_old_version <- function(new_version, compare_map) { + ifelse(new_version %in% compare_map, compare_map[compare_map == new_version] %>% names(), NA_character_) +} + + args <- commandArgs(TRUE) -if (!(length(args) %in% c(4, 5))) { - stop("Usage: Rscript generate-changelog.R UMD|CMU path/to/codebook path/to/diff/directory path/to/output/changelog [path/to/old/changelog]") +if (!(length(args) %in% c(3, 4))) { + stop("Usage: Rscript generate-changelog-with-codebook.R UMD|CMU path/to/codebook path/to/output/changelog [path/to/old/changelog]") } survey_version <- args[1] path_to_codebook <- args[2] -path_to_diff <- args[3] -path_to_changelog <- args[4] +path_to_changelog <- args[3] -if (length(args) == 5) { - path_to_old_changelog <- args[5] +if (length(args) == 4) { + path_to_old_changelog <- args[4] } else { path_to_old_changelog <- NULL } -invisible(generate_changelog(path_to_codebook, path_to_diff, path_to_changelog, path_to_old_changelog, survey_version)) +invisible(generate_changelog(path_to_codebook, path_to_changelog, path_to_old_changelog, survey_version)) + From 22989941fed27abaa12821ba5b16244cf6b64518 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 6 Jun 2022 13:46:56 -0400 Subject: [PATCH 12/13] rename UMD eu_version field to eu_noneu --- facebook/qsf-tools/combine_changelogs_eu.R | 10 +++++----- facebook/qsf-tools/combine_codebooks_eu.R | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/facebook/qsf-tools/combine_changelogs_eu.R b/facebook/qsf-tools/combine_changelogs_eu.R index 1af5aeea5..f999e439d 100644 --- a/facebook/qsf-tools/combine_changelogs_eu.R +++ b/facebook/qsf-tools/combine_changelogs_eu.R @@ -22,7 +22,7 @@ combine_changelogs <- function(path_to_changelog_eu, old_version = col_double() )) %>% mutate( - eu_version = "EU" + eu_noneu = "EU" ) changelog_noneu <- read_csv(path_to_changelog_noneu, col_types = cols( @@ -31,21 +31,21 @@ combine_changelogs <- function(path_to_changelog_eu, old_version = col_double() )) %>% mutate( - eu_version = "Non-EU" + eu_noneu = "Non-EU" ) # Using rbind here to raise an error if columns differ between the existing # changelog and the new wave data. changelog_with_duplicates <- rbind(changelog_eu, changelog_noneu) - count_duplicated_rows <- changelog_with_duplicates %>% group_by(across(c(-eu_version))) %>% summarize(count = n()) + count_duplicated_rows <- changelog_with_duplicates %>% group_by(across(c(-eu_noneu))) %>% summarize(count = n()) changelog <- changelog_with_duplicates %>% left_join(count_duplicated_rows) - changelog$eu_version[changelog$count == 2] <- "Both" + changelog$eu_noneu[changelog$count == 2] <- "Both" # Sort so that items with missing type (non-Qualtrics fields) are at the top. # Drop duplicates. changelog <- changelog %>% - arrange(variable_name, eu_version) %>% + arrange(variable_name, eu_noneu) %>% select(-count) %>% distinct() diff --git a/facebook/qsf-tools/combine_codebooks_eu.R b/facebook/qsf-tools/combine_codebooks_eu.R index 020da8b23..3e99dc83b 100644 --- a/facebook/qsf-tools/combine_codebooks_eu.R +++ b/facebook/qsf-tools/combine_codebooks_eu.R @@ -21,7 +21,7 @@ combine_codebooks <- function(path_to_codebook_eu, version = col_double() )) %>% mutate( - eu_version = "EU" + eu_noneu = "EU" ) codebook_noneu <- read_csv(path_to_codebook_noneu, col_types = cols( @@ -29,21 +29,21 @@ combine_codebooks <- function(path_to_codebook_eu, version = col_double() )) %>% mutate( - eu_version = "Non-EU" + eu_noneu = "Non-EU" ) # Using rbind here to raise an error if columns differ between the existing # codebook and the new wave data. codebook_with_duplicates <- rbind(codebook_eu, codebook_noneu) - count_duplicated_rows <- codebook_with_duplicates %>% group_by(across(c(-eu_version))) %>% summarize(count = n()) + count_duplicated_rows <- codebook_with_duplicates %>% group_by(across(c(-eu_noneu))) %>% summarize(count = n()) codebook <- codebook_with_duplicates %>% left_join(count_duplicated_rows) - codebook$eu_version[codebook$count == 2] <- "Both" + codebook$eu_noneu[codebook$count == 2] <- "Both" # Sort so that items with missing type (non-Qualtrics fields) are at the top. # Drop duplicates. codebook <- codebook %>% - arrange(!is.na(.data$question_type), variable, version, eu_version) %>% + arrange(!is.na(.data$question_type), variable, version, eu_noneu) %>% select(-count) %>% distinct() From ccb5f9142c8a08ed82ab7a0edb5966012e6788c1 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 6 Jun 2022 13:47:47 -0400 Subject: [PATCH 13/13] drop replaces field from UMD --- facebook/qsf-tools/combine_codebooks_eu.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/facebook/qsf-tools/combine_codebooks_eu.R b/facebook/qsf-tools/combine_codebooks_eu.R index 3e99dc83b..6109dd8a5 100644 --- a/facebook/qsf-tools/combine_codebooks_eu.R +++ b/facebook/qsf-tools/combine_codebooks_eu.R @@ -44,7 +44,7 @@ combine_codebooks <- function(path_to_codebook_eu, # Drop duplicates. codebook <- codebook %>% arrange(!is.na(.data$question_type), variable, version, eu_noneu) %>% - select(-count) %>% + select(-count, -replaces) %>% distinct() return(codebook)