Skip to content

Commit b712169

Browse files
authored
Merge pull request #1624 from cmu-delphi/ndefries/qsf-umd-bugs
[CTIS qsf tools] Changelog bugs from round 3 feedback
2 parents ea89d3b + ca5209c commit b712169

12 files changed

+312
-52
lines changed
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#!/usr/bin/env Rscript
2+
3+
## Combine an EU and a non-EU codebook (UMD only), adding in a column indicating
4+
## whether a given field was included in just the EU version, just the non-EU
5+
## version, or in both.
6+
##
7+
## Usage:
8+
##
9+
## Rscript append-old-changelog-umd.R path/to/output/changelog path/to/old/changelog
10+
11+
12+
suppressPackageStartupMessages({
13+
library(tidyverse)
14+
})
15+
16+
add_rationales_from_old_changelog <- function(path_to_changelog, path_to_old_changelog) {
17+
# If path_to_old_changelog is provided, prefer it over existing notes column.
18+
19+
changelog <- read_csv(path_to_changelog, col_types = cols(
20+
.default = col_character(),
21+
new_version = col_double(),
22+
old_version = col_double()
23+
))
24+
old_changelog <- read_csv(path_to_old_changelog, col_types = cols(
25+
.default = col_character(),
26+
new_version = col_double(),
27+
old_version = col_double()
28+
)) %>%
29+
select(new_version, old_version, variable_name, change_type,eu_version, notes)
30+
changelog <- changelog %>%
31+
select(-notes) %>%
32+
left_join(old_changelog, by=c("new_version", "old_version", "variable_name", "change_type","eu_version"))
33+
34+
35+
write_excel_csv(changelog, path_to_changelog, quote="needed")
36+
}
37+
38+
args <- commandArgs(TRUE)
39+
40+
if (!(length(args) %in% c(2))) {
41+
stop("Usage: Rscript append-old-changelog-umd.R path/to/output/changelog path/to/old/changelog")
42+
}
43+
44+
path_to_changelog <- args[1]
45+
path_to_old_changelog <- args[2]
46+
add_rationales_from_old_changelog(path_to_changelog, path_to_old_changelog)

facebook/qsf-tools/combine_changelogs_eu.R

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,17 @@ combine_changelogs <- function(path_to_changelog_eu,
1818

1919
changelog_eu <- read_csv(path_to_changelog_eu, col_types = cols(
2020
.default = col_character(),
21-
new_wave = col_double(),
22-
old_wave = col_double()
21+
new_version = col_double(),
22+
old_version = col_double()
2323
)) %>%
2424
mutate(
2525
eu_version = "EU"
2626
)
2727

2828
changelog_noneu <- read_csv(path_to_changelog_noneu, col_types = cols(
2929
.default = col_character(),
30-
new_wave = col_double(),
31-
old_wave = col_double()
30+
new_version = col_double(),
31+
old_version = col_double()
3232
)) %>%
3333
mutate(
3434
eu_version = "Non-EU"

facebook/qsf-tools/combine_codebooks_eu.R

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
##
77
## Usage:
88
##
9-
## Rscript combine_codebooks_eu.R.R path/to/eu/codebook path/to/noneu/codebook path/to/combined/codebook
9+
## Rscript combine_codebooks_eu.R path/to/eu/codebook path/to/noneu/codebook path/to/combined/codebook
1010

1111
suppressPackageStartupMessages({
1212
library(tidyverse)
@@ -18,15 +18,15 @@ combine_codebooks <- function(path_to_codebook_eu,
1818

1919
codebook_eu <- read_csv(path_to_codebook_eu, col_types = cols(
2020
.default = col_character(),
21-
wave = col_double()
21+
version = col_double()
2222
)) %>%
2323
mutate(
2424
eu_version = "EU"
2525
)
2626

2727
codebook_noneu <- read_csv(path_to_codebook_noneu, col_types = cols(
2828
.default = col_character(),
29-
wave = col_double()
29+
version = col_double()
3030
)) %>%
3131
mutate(
3232
eu_version = "Non-EU"
@@ -43,7 +43,7 @@ combine_codebooks <- function(path_to_codebook_eu,
4343
# Sort so that items with missing type (non-Qualtrics fields) are at the top.
4444
# Drop duplicates.
4545
codebook <- codebook %>%
46-
arrange(!is.na(.data$question_type), variable, wave, eu_version) %>%
46+
arrange(!is.na(.data$question_type), variable, version, eu_version) %>%
4747
select(-count) %>%
4848
distinct()
4949

facebook/qsf-tools/generate-changelog.R

Lines changed: 128 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,8 @@ generate_changelog <- function(path_to_codebook,
3131
# waves, plus a description of what changed and why.
3232
qsf_diff <- get_diff(path_to_diff)
3333

34-
if (!("notes" %in% names(qsf_diff))) {
35-
if (is.null(path_to_old_changelog)) {
36-
stop("rationales must be provided either in the diff or via an old version of the changelog")
37-
}
34+
if (!("notes" %in% names(qsf_diff)) && is.null(path_to_old_changelog)) {
35+
warning("rationales must be provided either in the diff or via an old version of the changelog")
3836
qsf_diff$notes <- NA_character_
3937
}
4038

@@ -60,14 +58,14 @@ generate_changelog <- function(path_to_codebook,
6058
changelog <- add_rationales_from_old_changelog(changelog, path_to_old_changelog)
6159
check_missing_rationales(changelog)
6260

63-
write_excel_csv(changelog, path_to_changelog, quote="needed")
61+
write_excel_csv(changelog %>% rename(new_version=new_wave, old_version=old_wave), path_to_changelog, quote="needed")
6462
}
6563

6664
# Read codebook from path. Drop fields we don't use in the changelog.
6765
get_codebook <- function(path_to_codebook) {
6866
codebook <- read_csv(path_to_codebook, col_types = cols(
6967
.default = col_character(),
70-
wave = col_double()
68+
version = col_double()
7169
)) %>%
7270
rename(question_text = question, matrix_subquestion_text = matrix_subquestion) %>%
7371
select(
@@ -77,7 +75,7 @@ get_codebook <- function(path_to_codebook) {
7775

7876
return(codebook)
7977
}
80-
78+
8179
# Try to load `path_to_diff`. Check if it is a single CSV or a directory
8280
# containing a set of CSVs.
8381
get_diff <- function(path_to_diff) {
@@ -86,11 +84,40 @@ get_diff <- function(path_to_diff) {
8684
csvs <- list.files(path_to_diff, pattern = "*.csv$", full.names = TRUE)
8785
qsf_diff <- list()
8886
for (csv in csvs) {
89-
qsf_diff[[csv]] <- read_csv(csv, col_types = cols(
87+
curr_diff <- read_csv(csv, col_types = cols(
9088
.default = col_character(),
9189
new_wave = col_double(),
9290
old_wave = col_double()
9391
))
92+
93+
old_wave_vars <- curr_diff %>%
94+
filter(change_type != "Item added") %>%
95+
distinct(item) %>%
96+
pull()
97+
new_wave_vars <- curr_diff %>%
98+
filter(change_type != "Item removed") %>%
99+
distinct(item) %>%
100+
pull()
101+
if (all(c("C0_matrix", "C0_likert") %in% old_wave_vars) || all(c("C0_matrix", "C0_likert") %in% new_wave_vars)) {
102+
stop("Only one of 'C0_matrix' and 'C0_likert' can be present at once in file ", csv)
103+
}
104+
if (all(c("B13_profile", "B13_likert") %in% old_wave_vars) || all(c("B13_profile", "B13_likert") %in% new_wave_vars)) {
105+
stop("Only one of 'B13_profile' and 'B13_likert' can be present at once in file ", csv)
106+
}
107+
if (all(c("B14_profile", "B14_likert") %in% old_wave_vars) || all(c("B14_profile", "B14_likert") %in% new_wave_vars)) {
108+
stop("Only one of 'B14_profile' and 'B14_likert' can be present at once in file ", csv)
109+
}
110+
if (all(c("B12a_profile", "B12a_likert") %in% old_wave_vars) || all(c("B12a_profile", "B12a_likert") %in% new_wave_vars)) {
111+
stop("Only one of 'B12a_profile' and 'B12a_likert' can be present at once in file ", csv)
112+
}
113+
if (all(c("B12b_profile", "B12b_likert") %in% old_wave_vars) || all(c("B12b_profile", "B12b_likert") %in% new_wave_vars)) {
114+
stop("Only one of 'B12b_profile' and 'B12b_likert' can be present at once in file ", csv)
115+
}
116+
if (all(c("B1b_matrix", "B1b_likert") %in% old_wave_vars) || all(c("B1b_matrix", "B1b_likert") %in% new_wave_vars)) {
117+
stop("Only one of 'B1b_matrix' and 'B1b_likert' can be present at once in file ", csv)
118+
}
119+
120+
qsf_diff[[csv]] <- curr_diff
94121
}
95122
qsf_diff <- purrr::reduce(qsf_diff, rbind) %>%
96123
rename(variable_name = item) %>%
@@ -169,23 +196,72 @@ prepare_matrix_base_questions_for_join <- function(qsf_diff, codebook) {
169196
qsf_diff %>% distinct(variable_name) %>% pull(),
170197
codebook %>% distinct(variable) %>% pull()
171198
)
199+
172200
# Add an underscore to the unmatched variable names to create a regex pattern
173201
matrix_prefixes <- paste0(vars_not_in_codebook, "_")
174202
names(matrix_prefixes) <- vars_not_in_codebook
203+
204+
# A subset of UMD variables need manual mapping.
205+
if ("C0_matrix" %in% names(matrix_prefixes)) {
206+
matrix_prefixes["C0_matrix"] <- "C0_"
207+
}
208+
if ("C0_likert" %in% names(matrix_prefixes)) {
209+
matrix_prefixes["C0_likert"] <- "C0_"
210+
}
211+
212+
if ("B13_profile" %in% names(matrix_prefixes)) {
213+
matrix_prefixes["B13_profile"] <- "B13_"
214+
}
215+
if ("B13_likert" %in% names(matrix_prefixes)) {
216+
matrix_prefixes["B13_likert"] <- "B13_"
217+
}
218+
219+
if ("B14_profile" %in% names(matrix_prefixes)) {
220+
matrix_prefixes["B14_profile"] <- "B14_"
221+
}
222+
if ("B14_likert" %in% names(matrix_prefixes)) {
223+
matrix_prefixes["B14_likert"] <- "B14_"
224+
}
225+
226+
if ("B12a_profile" %in% names(matrix_prefixes)) {
227+
matrix_prefixes["B12a_profile"] <- "B12a_"
228+
}
229+
if ("B12a_likert" %in% names(matrix_prefixes)) {
230+
matrix_prefixes["B12a_likert"] <- "B12a_"
231+
}
232+
233+
if ("B12b_profile" %in% names(matrix_prefixes)) {
234+
matrix_prefixes["B12b_profile"] <- "B12b_"
235+
}
236+
if ("B12b_likert" %in% names(matrix_prefixes)) {
237+
matrix_prefixes["B12b_likert"] <- "B12b_"
238+
}
239+
240+
if ("B1b_matrix" %in% names(matrix_prefixes)) {
241+
matrix_prefixes["B1b_matrix"] <- "B1b_"
242+
}
243+
if ("B1b_likert" %in% names(matrix_prefixes)) {
244+
matrix_prefixes["B1b_likert"] <- "B1b_"
245+
}
175246

176247
# First matrix item match by wave and matrix base question.
177248
map_matrix_prefix_to_first_match <- codebook %>%
178249
mutate(
179250
join_variable = case_when(
180251
# Create the basename for matrix items.
181-
grepl("_", variable) ~ strsplit(variable, "_") %>% purrr::map_chr(~ .x[1]) %>% paste0("_"),
252+
!is.na(matrix_subquestion_text) ~ strsplit(variable, "_") %>%
253+
# Get all but last underscore-delimited chunk
254+
purrr::map(~ .x[1:(length(.x) - 1)]) %>%
255+
# Combine all but the last chunk with underscores.
256+
purrr::map(~ paste0(.x, collapse="_") %>% paste0("_")) %>%
257+
unlist(),
182258
TRUE ~ variable
183259
)
184260
) %>%
185261
filter(join_variable %in% matrix_prefixes) %>%
186-
group_by(wave, join_variable) %>%
262+
group_by(version, join_variable) %>%
187263
slice_head() %>%
188-
select(wave, variable, join_variable)
264+
select(version, variable, join_variable)
189265

190266
# Add the regex patterns onto the diff.
191267
qsf_diff <- qsf_diff %>%
@@ -199,13 +275,13 @@ prepare_matrix_base_questions_for_join <- function(qsf_diff, codebook) {
199275
map_matrix_prefix_to_first_match %>% rename_with(function(column_names) {
200276
paste("new", column_names, sep = "_")
201277
}),
202-
by=c("new_wave" = "new_wave", "join_variable"="new_join_variable")
278+
by=c("new_wave" = "new_version", "join_variable"="new_join_variable")
203279
) %>%
204280
left_join(
205281
map_matrix_prefix_to_first_match %>% rename_with(function(column_names) {
206282
paste("old", column_names, sep = "_")
207283
}),
208-
by=c("old_wave" = "old_wave", "join_variable"="old_join_variable")
284+
by=c("old_wave" = "old_version", "join_variable"="old_join_variable")
209285
) %>%
210286
rename(
211287
join_variable_new_wave = new_variable,
@@ -216,7 +292,7 @@ prepare_matrix_base_questions_for_join <- function(qsf_diff, codebook) {
216292
join_variable_old_wave = coalesce(join_variable_old_wave, variable_name)
217293
) %>%
218294
select(-join_variable)
219-
295+
220296
return(list("diff" = qsf_diff, "vars_not_in_codebook" = vars_not_in_codebook))
221297
}
222298

@@ -229,14 +305,14 @@ make_changelog_from_codebook_and_diff <- function(qsf_diff, codebook, vars_not_i
229305
codebook %>% rename_with(function(column_names) {
230306
paste("new", column_names, sep = "_")
231307
}),
232-
by=c("new_wave" = "new_wave", "join_variable_new_wave" = "new_variable")
308+
by=c("new_wave" = "new_version", "join_variable_new_wave" = "new_variable")
233309
) %>%
234310
# Add info about previous version of question
235311
left_join(
236312
codebook %>% rename_with(function(column_names) {
237313
paste("old", column_names, sep = "_")
238314
}),
239-
by=c("old_wave" = "old_wave", "join_variable_old_wave" = "old_variable")
315+
by=c("old_wave" = "old_version", "join_variable_old_wave" = "old_variable")
240316
) %>%
241317
select(
242318
new_wave,
@@ -265,6 +341,42 @@ make_changelog_from_codebook_and_diff <- function(qsf_diff, codebook, vars_not_i
265341
variable_name %in% vars_not_in_codebook ~ NA_character_,
266342
TRUE ~ old_matrix_subquestion_text
267343
)
344+
) %>%
345+
# When an item was added, all `old_` fields should be empty; when an item
346+
# was removed, all `new_` fields should be empty.
347+
mutate(
348+
old_question_text = case_when(
349+
change_type == "Item added" ~ NA_character_,
350+
TRUE ~ old_question_text
351+
),
352+
old_matrix_subquestion_text = case_when(
353+
change_type == "Item added" ~ NA_character_,
354+
TRUE ~ old_matrix_subquestion_text
355+
),
356+
old_response_options = case_when(
357+
change_type == "Item added" ~ NA_character_,
358+
TRUE ~ old_response_options
359+
),
360+
old_display_logic = case_when(
361+
change_type == "Item added" ~ NA_character_,
362+
TRUE ~ old_display_logic
363+
),
364+
new_question_text = case_when(
365+
change_type == "Item removed" ~ NA_character_,
366+
TRUE ~ new_question_text
367+
),
368+
new_matrix_subquestion_text = case_when(
369+
change_type == "Item removed" ~ NA_character_,
370+
TRUE ~ new_matrix_subquestion_text
371+
),
372+
new_response_options = case_when(
373+
change_type == "Item removed" ~ NA_character_,
374+
TRUE ~ new_response_options
375+
),
376+
new_display_logic = case_when(
377+
change_type == "Item removed" ~ NA_character_,
378+
TRUE ~ new_display_logic
379+
)
268380
)
269381

270382
return(changelog)

0 commit comments

Comments
 (0)