Skip to content

Commit 1b19769

Browse files
authored
Merge pull request #1405 from cmu-delphi/new-CID-lists
[CTIS] Create new set of CID lists to send to FB
2 parents 9d3981b + 19f6f23 commit 1b19769

18 files changed

+310
-90
lines changed

facebook/Makefile

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ PYTHON:=env/bin/python
1212
QUALTRICS=$(shell $(PYTHON) -m delphi_utils get input_dir)
1313
WEIGHTS=$(shell $(PYTHON) -m delphi_utils get weights_in_dir)
1414
CIDS=$(shell $(PYTHON) -m delphi_utils get weights_out_dir)
15+
CIDS_EXP=$(shell $(PYTHON) -m delphi_utils get experimental_weights_out_dir)
1516
INDIVIDUAL=$(shell $(PYTHON) -m delphi_utils get individual_dir)
1617
INDIVIDUAL_RACEETH=$(shell $(PYTHON) -m delphi_utils get individual_raceeth_dir)
1718
ARCHIVE=$(shell $(PYTHON) -m delphi_utils get archive_dir)
@@ -25,7 +26,9 @@ SFTP_OPTIONS=$(shell $(PYTHON) -m delphi_utils get sftp_options)
2526
MAX_WEIGHTED=ls -1 $(WEIGHTS) | grep dap | tail -1 | sed 's/_.*//;s/-//g;'
2627

2728
ANTIJOIN:="antijoin.cids.sorted.txt"
29+
ANTIJOIN_EXP:="antijoin.experimental.cids.sorted.txt"
2830
CIDS_DEST:="fb-interchange/cmu_respondent_ids"
31+
CIDS_EXP_DEST:="fb-interchange/cmu_respondent_ww_ids"
2932
INDIVID_DEST:="fb-public-results/"
3033
INDIVID_RACEETH_DEST:="protected-race-ethnicity-data/"
3134
RAW_DEST:="raw"
@@ -59,7 +62,7 @@ tidy: receiving
5962
mv scratch/*.tgz tidy/
6063

6164
clean:
62-
rm -f $(RECEIVING)/*.csv $(INDIVIDUAL)/*.csv $(INDIVIDUAL_RACEETH)/*.csv $(CIDS)/*.csv
65+
rm -f $(RECEIVING)/*.csv $(INDIVIDUAL)/*.csv $(INDIVIDUAL_RACEETH)/*.csv $(CIDS)/*.csv $(CIDS_EXP)/*.csv
6366

6467
clean-archive:
6568
rm -f $(ARCHIVE)/*.Rds
@@ -78,6 +81,9 @@ install: install-python install-R
7881
$(CIDS):
7982
[ -f $(CIDS) ] || mkdir -p $(CIDS)
8083

84+
$(CIDS_EXP):
85+
[ -f $(CIDS_EXP) ] || mkdir -p $(CIDS_EXP)
86+
8187
init-qualtrics:
8288
grep '"token": "..*"' params.json
8389

@@ -133,14 +139,14 @@ dev: delphiFacebook_1.0.tar.gz
133139
lib:
134140
R -e 'roxygen2::roxygenise("delphiFacebook")'
135141

136-
run-R: $(CIDS)
142+
run-R: $(CIDS) $(CIDS_EXP)
137143
rm -rf tmp
138144
time Rscript run.R 2>&1 |tee tmp
139145
grep "run_facebook completed successfully" tmp
140146
grep "scheduled core" tmp ; \
141147
[ "$$?" -eq 1 ]
142148

143-
pipeline: scratch init-qualtrics params.json $(WEIGHTS) run-R post-cids post-individual post-individual-raceeth post-done tidy
149+
pipeline: scratch init-qualtrics params.json $(WEIGHTS) run-R post-cids post-experimental-cids post-individual post-individual-raceeth post-done tidy
144150
grep $(TODAY) params.json
145151
[ -f $(YESTERDAY) ] && rm $(YESTERDAY) || true
146152
touch $@
@@ -184,6 +190,28 @@ post-cids: $(TODAY) $(CIDS)
184190
echo "SUCCESS: $(DRY_MESSAGE)Posted `echo $${POST} | wc -w` cid files" >> $(MESSAGES)
185191
touch $@
186192

193+
post-experimental-cids: $(TODAY) $(CIDS_EXP)
194+
rm -rf tmp
195+
touch $(ANTIJOIN_EXP)
196+
POST=`find $(CIDS_EXP) -maxdepth 1 -newer $(TODAY) -name "cvid_cids_*.csv"`; \
197+
[ -n "$${POST}" ]; \
198+
LC_ALL=C find $(CIDS_EXP) -maxdepth 1 -daystart -mtime +0 -name "cvid_cids*.csv" -exec sort -u -o ${ANTIJOIN_EXP} {} +; \
199+
BATCH=""; \
200+
for f in $${POST}; do \
201+
LC_ALL=C comm -23 <(LC_ALL=C sort $$f) ${ANTIJOIN_EXP} >tmp; \
202+
diff -q tmp $$f || mv $$f $$f.bak; \
203+
mv tmp $$f; \
204+
ncids=`wc -l $$f | awk '{print $$1}'`; \
205+
if [[ $$ncids == "0" ]]; then \
206+
echo "ERROR: 0 CIDs reported for $$f"; \
207+
exit 73; \
208+
fi; \
209+
BATCH="$${BATCH}put $$f ${CIDS_EXP_DEST}\n"; \
210+
done; \
211+
$(SFTP_POST); \
212+
echo "SUCCESS: $(DRY_MESSAGE)Posted `echo $${POST} | wc -w` experimental cid files" >> $(MESSAGES)
213+
touch $@
214+
187215
post-individual: $(TODAY) $(INDIVIDUAL)
188216
POST=`find $(INDIVIDUAL) -maxdepth 1 -newer $(TODAY) -name "cvid_responses_*.csv"`; \
189217
[ -n "$${POST}" ]; \
@@ -210,10 +238,12 @@ post-individual-raceeth: $(TODAY) $(INDIVIDUAL_RACEETH)
210238
echo "SUCCESS: $(DRY_MESSAGE)Posted `echo $${POST} | wc -w` race-ethnicity microresponse files" >> $(MESSAGES)
211239
touch $@
212240

213-
post-done: post-cids
241+
post-done: post-cids post-experimental-cids
214242
touch $(YESTERDAY).done
215243
BATCH="put $(YESTERDAY).done $(CIDS_DEST)\n"; \
216244
$(SFTP_POST)
245+
BATCH="put $(YESTERDAY).done $(CIDS_EXP_DEST)\n"; \
246+
$(SFTP_POST)
217247
echo "SUCCESS: $(DRY_MESSAGE)Posted $(YESTERDAY).done" >> $(MESSAGES)
218248

219249
validate-covidcast:

facebook/delphiFacebook/NAMESPACE

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ export(end_of_prev_full_month)
1212
export(end_of_prev_full_week)
1313
export(filter_complete_responses)
1414
export(filter_data_for_aggregation)
15+
export(filter_module_complete_responses)
1516
export(filter_responses)
1617
export(floor_epiweek)
1718
export(get_filenames_in_range)
@@ -51,6 +52,7 @@ export(update_archive)
5152
export(update_params)
5253
export(verify_aggs)
5354
export(write_cid)
55+
export(write_cid_experimental_wrapper)
5456
export(write_contingency_tables)
5557
export(write_data_api)
5658
export(write_individual)
@@ -121,4 +123,5 @@ importFrom(stringi,stri_trans_tolower)
121123
importFrom(stringi,stri_trim)
122124
importFrom(tibble,add_column)
123125
importFrom(tibble,tribble)
126+
importFrom(utils,tail)
124127
useDynLib(delphiFacebook, .registration = TRUE)

facebook/delphiFacebook/R/contingency_variables.R

Lines changed: 0 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -4,70 +4,6 @@
44
## input data is always from only one wave of the survey -- they do not deal
55
## with inputs that have multiple waves mingled in one data frame.
66

7-
#' Gender
8-
#'
9-
#' @param input_data input data frame of raw survey data
10-
#' @param wave integer indicating survey version
11-
#'
12-
#' @return augmented data frame
13-
code_gender <- function(input_data, wave) {
14-
if ("D1" %in% names(input_data)) {
15-
input_data$gender <- case_when(
16-
input_data$D1 == 1 ~ "Male",
17-
input_data$D1 == 2 ~ "Female",
18-
input_data$D1 == 3 ~ "Other",
19-
input_data$D1 == 4 ~ "Other",
20-
input_data$D1 == 5 ~ NA_character_,
21-
TRUE ~ NA_character_
22-
)
23-
} else {
24-
input_data$gender <- NA_character_
25-
}
26-
27-
return(input_data)
28-
}
29-
30-
#' Age-related fields
31-
#'
32-
#' @param input_data input data frame of raw survey data
33-
#' @param wave integer indicating survey version
34-
#'
35-
#' @return augmented data frame
36-
code_age <- function(input_data, wave) {
37-
if ("D2" %in% names(input_data)) {
38-
input_data$agefull <- case_when(
39-
input_data$D2 == 1 ~ "18-24",
40-
input_data$D2 == 2 ~ "25-34",
41-
input_data$D2 == 3 ~ "35-44",
42-
input_data$D2 == 4 ~ "45-54",
43-
input_data$D2 == 5 ~ "55-64",
44-
input_data$D2 == 6 ~ "65-74",
45-
input_data$D2 == 7 ~ "75plus",
46-
TRUE ~ NA_character_
47-
)
48-
49-
# Condensed age categories
50-
input_data$age <- case_when(
51-
input_data$D2 == 1 ~ "18-24",
52-
input_data$D2 == 2 ~ "25-44",
53-
input_data$D2 == 3 ~ "25-44",
54-
input_data$D2 == 4 ~ "45-64",
55-
input_data$D2 == 5 ~ "45-64",
56-
input_data$D2 == 6 ~ "65plus",
57-
input_data$D2 == 7 ~ "65plus",
58-
TRUE ~ NA_character_
59-
)
60-
61-
input_data$age65plus <- input_data$age == "65plus"
62-
} else {
63-
input_data$agefull <- NA_character_
64-
input_data$age <- NA_character_
65-
input_data$age65plus <- NA
66-
}
67-
68-
return(input_data)
69-
}
70-
717
#' Occupation
728
#'
739
#' @param input_data input data frame of raw survey data

facebook/delphiFacebook/R/responses.R

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,8 @@ load_response_one <- function(input_filename, params, contingency_run) {
194194
input_data <- code_schooling(input_data, wave)
195195
input_data <- code_beliefs(input_data, wave)
196196
input_data <- code_news_and_info(input_data, wave)
197+
input_data <- code_gender(input_data, wave)
198+
input_data <- code_age(input_data, wave)
197199

198200
if (!is.null(params$produce_individual_raceeth) && params$produce_individual_raceeth) {
199201
input_data <- code_race_ethnicity(input_data, wave)
@@ -227,8 +229,6 @@ load_response_one <- function(input_filename, params, contingency_run) {
227229
if (contingency_run) {
228230
## Create additional fields for aggregations.
229231
# Demographic grouping variables
230-
input_data <- code_gender(input_data, wave)
231-
input_data <- code_age(input_data, wave)
232232
input_data <- code_race_ethnicity(input_data, wave)
233233
input_data <- code_occupation(input_data, wave)
234234
input_data <- code_education(input_data, wave)
@@ -546,6 +546,8 @@ module_assignment <- function(input_data, wave) {
546546
input_data$FL_23_DO == "ModuleB" ~ "B",
547547
TRUE ~ NA_character_
548548
)
549+
} else {
550+
input_data$module <- NA_character_
549551
}
550552

551553
return(input_data)
@@ -752,3 +754,48 @@ filter_complete_responses <- function(data_full, params)
752754

753755
return(data_full)
754756
}
757+
758+
#' Filter responses to those that are "module-complete". Splits by module assignment
759+
#'
760+
#' Inclusion criteria:
761+
#'
762+
#' * answered age consent
763+
#' * CID/token IS NOT missing
764+
#' * distribution source (ie previews) IS NOT irregular
765+
#' * start date IS IN range, pacific time
766+
#' * Date is in [`params$start_date - params$backfill_days`, `end_date`],
767+
#' inclusive.
768+
#' * answered minimum of 2 additional questions, where to "answer" a numeric
769+
#' open-ended question (A2, A2b, B2b, Q40, C10_1_1, C10_2_1, C10_3_1, C10_4_1,
770+
#' D3, D4, D5) means to provide any number (floats okay) and to "answer" a radio
771+
#' button question is to provide a selection.
772+
#' * reached the end of the survey (i.e. sees the "Thank you" message)
773+
#' * answered age and gender questions
774+
#'
775+
#' Most of these criteria are handled by `filter_responses()` and
776+
#' `filter_complete_responses()` above; this function need only handle the last
777+
#' two criteria.
778+
#'
779+
#' @param data_full data frame of responses
780+
#' @param params named list of configuration options from `read_params()`,
781+
#' containing `start_date`, `backfill_days`, and `end_date`
782+
#'
783+
#' @importFrom dplyr filter
784+
#' @importFrom rlang .data
785+
#' @export
786+
filter_module_complete_responses <- function(data_full, params)
787+
{
788+
date_col <- if ("day" %in% names(data_full)) { "day" } else { "Date" }
789+
data_full <- rename(data_full, Date = .data$date) %>%
790+
filter_complete_responses(params) %>%
791+
filter(!is.na(.data$age),
792+
!is.na(.data$gender),
793+
.data$Finished == 1) %>%
794+
select(date_col, .data$token, .data$module)
795+
796+
data_a <- filter(data_full, .data$module == "A")
797+
data_b <- filter(data_full, .data$module == "B")
798+
799+
return(list(a = data_a, b = data_b))
800+
}
801+

facebook/delphiFacebook/R/run.R

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ run_facebook <- function(params)
4242
data_full <- join_weights(data_full, params, weights = "full")$df
4343
msg_df("full data to share with research partners", data_full)
4444

45+
# create module-complete data used to create CID lists separately for each module
46+
data_module_complete <- filter_module_complete_responses(input_data, params)
47+
data_module_complete_a <- data_module_complete[["a"]]
48+
data_module_complete_b <- data_module_complete[["b"]]
49+
4550
## Set default number of cores for mclapply to the total available number,
4651
## because we are greedy and this will typically run on a server.
4752
if (params$parallel) {
@@ -60,6 +65,11 @@ run_facebook <- function(params)
6065
{
6166
write_cid(data_full, "full", params)
6267
write_cid(data_agg, "part_a", params)
68+
69+
write_cid_experimental_wrapper(data_full, "full", params, "")
70+
write_cid_experimental_wrapper(data_agg, "part_a", params, "")
71+
write_cid_experimental_wrapper(data_module_complete_a, "module_complete", params, "modul_a_")
72+
write_cid_experimental_wrapper(data_module_complete_b, "module_complete", params, "modul_b_")
6373
}
6474
if ( "archive" %in% params$output )
6575
{

facebook/delphiFacebook/R/variables.R

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1032,3 +1032,68 @@ code_race_ethnicity <- function(input_data, wave) {
10321032

10331033
return(input_data)
10341034
}
1035+
1036+
#' Gender
1037+
#'
1038+
#' @param input_data input data frame of raw survey data
1039+
#' @param wave integer indicating survey version
1040+
#'
1041+
#' @return augmented data frame
1042+
code_gender <- function(input_data, wave) {
1043+
if ("D1" %in% names(input_data)) {
1044+
input_data$gender <- case_when(
1045+
input_data$D1 == 1 ~ "Male",
1046+
input_data$D1 == 2 ~ "Female",
1047+
input_data$D1 == 3 ~ "Other",
1048+
input_data$D1 == 4 ~ "Other",
1049+
input_data$D1 == 5 ~ NA_character_,
1050+
TRUE ~ NA_character_
1051+
)
1052+
} else {
1053+
input_data$gender <- NA_character_
1054+
}
1055+
1056+
return(input_data)
1057+
}
1058+
1059+
#' Age-related fields
1060+
#'
1061+
#' @param input_data input data frame of raw survey data
1062+
#' @param wave integer indicating survey version
1063+
#'
1064+
#' @return augmented data frame
1065+
code_age <- function(input_data, wave) {
1066+
if ("D2" %in% names(input_data)) {
1067+
input_data$agefull <- case_when(
1068+
input_data$D2 == 1 ~ "18-24",
1069+
input_data$D2 == 2 ~ "25-34",
1070+
input_data$D2 == 3 ~ "35-44",
1071+
input_data$D2 == 4 ~ "45-54",
1072+
input_data$D2 == 5 ~ "55-64",
1073+
input_data$D2 == 6 ~ "65-74",
1074+
input_data$D2 == 7 ~ "75plus",
1075+
TRUE ~ NA_character_
1076+
)
1077+
1078+
# Condensed age categories
1079+
input_data$age <- case_when(
1080+
input_data$D2 == 1 ~ "18-24",
1081+
input_data$D2 == 2 ~ "25-44",
1082+
input_data$D2 == 3 ~ "25-44",
1083+
input_data$D2 == 4 ~ "45-64",
1084+
input_data$D2 == 5 ~ "45-64",
1085+
input_data$D2 == 6 ~ "65plus",
1086+
input_data$D2 == 7 ~ "65plus",
1087+
TRUE ~ NA_character_
1088+
)
1089+
1090+
input_data$age65plus <- input_data$age == "65plus"
1091+
} else {
1092+
input_data$agefull <- NA_character_
1093+
input_data$age <- NA_character_
1094+
input_data$age65plus <- NA
1095+
}
1096+
1097+
return(input_data)
1098+
}
1099+

0 commit comments

Comments
 (0)