1
1
# ' Return params file as an R list
2
2
# '
3
3
# ' Reads a parameters file. Copies global params to contingency params if not
4
- # ' already defined.
4
+ # ' already defined. Uses current date as end_date if not provided.
5
5
# '
6
6
# ' @param path path to the parameters file; if not present, will try to copy the file
7
7
# ' "params.json.template"
@@ -17,15 +17,21 @@ read_contingency_params <- function(path = "params.json", template_path = "param
17
17
contingency_params $ start_time <- ymd_hms(
18
18
sprintf(" %s 00:00:00" , contingency_params $ start_date ), tz = tz_to
19
19
)
20
+
21
+ # Fill in end_date, if missing, with current date.
22
+ contingency_params $ end_date <- if_else(
23
+ is.null(contingency_params $ end_date ), as.character(Sys.Date()), contingency_params $ end_date
24
+ )
25
+
20
26
contingency_params $ end_time <- ymd_hms(
21
27
sprintf(" %s 23:59:59" , contingency_params $ end_date ), tz = tz_to
22
28
)
23
29
24
30
global_params <- c(" archive_days" , " backfill_days" , " static_dir" , " cache_dir" ,
25
31
" archive_dir" , " weights_in_dir" , " input_dir" , " debug" ,
26
- " parallel" )
32
+ " parallel" , " qualtrics " )
27
33
for (param in global_params ) {
28
- if ( is.null(contingency_params [[param ]]) ) {
34
+ if ( is.null(contingency_params [[param ]]) & ! is.null( params [[ param ]]) ) {
29
35
contingency_params [[param ]] <- params [[param ]]
30
36
}
31
37
}
@@ -55,36 +61,45 @@ read_contingency_params <- function(path = "params.json", template_path = "param
55
61
# '
56
62
# ' @export
57
63
update_params <- function (params ) {
58
- # Fill in end_time, if missing, with current time.
59
- if (is.null(params $ end_time )) {
60
- params $ end_time <- Sys.time()
61
- }
62
-
63
64
# Construct aggregate date range.
64
65
if ( ! is.null(params $ start_date ) ) {
66
+ # If start_date is provided, use start/end dates exactly as given.
65
67
date_range <- list (params $ start_time , params $ end_time )
66
68
} else {
67
69
# If start_date is not provided, assume want to use preceding full time period.
68
70
date_range <- get_range_prev_full_period(
69
- as_date(params $ end_date )
70
- , params $ aggregate_range
71
+ as_date(params $ end_date ), params $ aggregate_range
71
72
)
72
73
}
73
74
74
- params $ input <- get_filenames_in_range(date_range [[1 ]], date_range [[2 ]], params )
75
- if ( length(params [[" input" ]]) == 0 || all(is.na(params [[" input" ]])) ) {
75
+ if ( is.null(params [[" input" ]]) || length(params $ input ) == 0 ) {
76
+ # If params$input empty or not provided, fetch filenames from input_dir.
77
+ params $ input <- get_sparse_filenames(date_range [[1 ]], date_range [[2 ]], params )
78
+ } else {
79
+ # If input files provided, subset to those in desired date range.
80
+ params $ input <- get_filenames_in_range(date_range [[1 ]], date_range [[2 ]], params )
81
+ }
82
+
83
+ # Overwrites contents of file of the same name.
84
+ writeLines(params $ input , " contingency_input.txt" )
85
+
86
+ if ( length(params [[" input" ]]) == 0 || all(is.na(params $ input )) ) {
76
87
stop(" no input files to read in" )
77
88
}
78
89
79
90
params $ start_time <- date_range [[1 ]]
80
91
params $ end_time <- date_range [[2 ]]
81
92
params $ start_date <- as_date(date_range [[1 ]])
82
93
params $ end_date <- as_date(date_range [[2 ]])
83
-
94
+
84
95
return (params )
85
96
}
86
97
87
98
# ' Get relevant input data file names from `input_dir`.
99
+ # '
100
+ # ' Only include files containing data that falls at least somewhat between start
101
+ # ' and end dates, and is from an allowed ("active") survey and not a "dormant"
102
+ # ' survey.
88
103
# '
89
104
# ' @param start_date Start of desired date range
90
105
# ' @param end_date End of desired date range
@@ -101,17 +116,33 @@ get_filenames_in_range <- function(start_date, end_date, params) {
101
116
start_date <- as_date(start_date ) - days(params $ backfill_days )
102
117
end_date <- as_date(end_date )
103
118
104
- if ( is.null(params $ input ) | length(params $ input ) == 0 ) {
105
- date_pattern <- " ^[0-9]{4}-[0-9]{2}-[0-9]{2}.*[.]csv$"
106
- youtube_pattern <- " .*YouTube[.]csv$"
119
+ if ( is.null(params [[" input" ]]) || length(params $ input ) == 0 ) {
120
+ # # Keep all files from active surveys that appear in the input dir.
121
+
122
+ if ( ! is.null(params [[" qualtrics" ]]) ) {
123
+ include_patterns <- names(params $ qualtrics $ surveys $ active )
124
+ include_patterns <- gsub(" " , " _" , include_patterns , fixed = TRUE )
125
+
126
+ exclude_patterns <- names(params $ qualtrics $ surveys $ dormant )
127
+ exclude_patterns <- gsub(" " , " _" , exclude_patterns , fixed = TRUE )
128
+ } else {
129
+ # If no active/dormant survey info provided, use basic patterns to
130
+ # include/exclude survey files.
131
+ include_patterns <- c(" ^[0-9]{4}-[0-9]{2}-[0-9]{2}.*[.]csv$" )
132
+ exclude_patterns <- c(" .*YouTube[.]csv$" )
133
+ }
107
134
108
135
filenames <- list.files(path = params $ input_dir )
109
- filenames <- filenames [grepl(date_pattern , filenames ) & ! grepl(youtube_pattern , filenames )]
136
+
137
+ include_map <- grepl(paste(include_patterns , collapse = " |" ), filenames )
138
+ exclude_map <- grepl(paste(exclude_patterns , collapse = " |" ), filenames )
139
+ filenames <- filenames [include_map & ! exclude_map ]
110
140
} else {
111
141
filenames <- params $ input
112
142
}
113
143
114
- file_end_dates <- as_date(substr(filenames , 1 , 10 ))
144
+ # Filenames are formatted as "{generation date}.{start date}.{end date}.{survey name}_-_{survey version}.csv".
145
+ file_end_dates <- as_date(substr(filenames , 23 , 32 ))
115
146
file_start_dates <- as_date(substr(filenames , 12 , 21 ))
116
147
117
148
# Only keep files with data that falls at least somewhat between the desired
@@ -123,6 +154,37 @@ get_filenames_in_range <- function(start_date, end_date, params) {
123
154
return (filenames )
124
155
}
125
156
157
+ # ' Get sparse list of input data files from `input_dir`.
158
+ # '
159
+ # ' Finds every fourth + last file by date.
160
+ # '
161
+ # ' @param start_date Start of desired date range
162
+ # ' @param end_date End of desired date range
163
+ # ' @param params Params object produced by read_params
164
+ # '
165
+ # ' @return Character vector of filenames
166
+ # '
167
+ # ' @importFrom lubridate as_date
168
+ # '
169
+ # ' @export
170
+ get_sparse_filenames <- function (start_date , end_date , params ) {
171
+ if (params $ use_input_asis ) { return (params $ input ) }
172
+
173
+ filenames <- get_filenames_in_range(start_date , end_date , params )
174
+
175
+ file_end_dates <- as_date(substr(filenames , 23 , 32 ))
176
+ unique_file_end_dates <- sort(unique(file_end_dates ))
177
+
178
+ # Use every fourth date. Always keep last date.
179
+ keep_inds <- unique(c(
180
+ seq(1 , length(unique_file_end_dates ), 4L ),
181
+ length(unique_file_end_dates )))
182
+ keep_dates <- unique_file_end_dates [keep_inds ]
183
+ filenames <- filenames [file_end_dates %in% keep_dates ]
184
+
185
+ return (filenames )
186
+ }
187
+
126
188
# ' Check user-set aggregations for basic validity and add a few necessary cols.
127
189
# '
128
190
# ' @param aggregations Data frame with columns `name`, `var_weight`, `metric`,
0 commit comments