Commit 635fbbe (parent: 68fedd2)

remove funcs that fetch missing dates from local files. update docstrings. add num days to fetch param
2 files changed: +40 -89 lines

2 files changed

+40
-89
lines changed

google_symptoms/delphi_google_symptoms/pull.py

Lines changed: 37 additions & 88 deletions
@@ -1,11 +1,7 @@
 """Retrieve data and wrangle into appropriate format."""
 # -*- coding: utf-8 -*-
 import re
-
 from datetime import date, datetime, timedelta  # pylint: disable=unused-import
-from os import listdir, makedirs
-from os.path import isfile, join, exists
-
 import pandas_gbq
 from google.oauth2 import service_account
 import numpy as np
@@ -98,80 +94,40 @@ def preprocess(df, level):
     return df


-def get_missing_dates(receiving_dir, export_start_date):
-    """Produce list of dates to retrieve data for.
+def get_date_range(export_start_date, retrieve_days_before_now):
+    """Produce date range to retrieve data for.

-    Date list is created based on dates seen in already exported CSVs.
+    Calculate start of date range as a static offset from the end date
+    ("now"). Pad date range by an additional 7 days before the earliest
+    date to produce data for calculating smoothed estimates.

     Parameters
     ----------
-    receiving_dir: str
-        path to output directory
-    export_start_date: date
-        first date to retrieve data for
-
-    Returns
-    -------
-    list
-    """
-    if not exists(receiving_dir):
-        makedirs(receiving_dir)
-
-    OUTPUT_NAME_PATTERN = re.compile("^[0-9]{8}_.*[.]csv")
-    existing_output_files = [f for f in listdir(receiving_dir) if isfile(
-        join(receiving_dir, f)) and OUTPUT_NAME_PATTERN.match(f)]
-
-    existing_output_dates = {datetime.strptime(f[0:8], "%Y%m%d").date()
-                             for f in existing_output_files}
-    expected_dates = {date.date() for date in pd.date_range(
-        start=export_start_date,
-        end=date.today(),
-        freq='D')}
-
-    missing_dates = list(expected_dates.difference(existing_output_dates))
-
-    return missing_dates
-
-
-def get_all_dates(receiving_dir, export_start_date):
-    """Pad missing dates with enough extra days to do smoothing.
-
-    Using the missing_dates list as reference, creates a new list of dates
-    spanning 6 days before the earliest date in missing_dates to today. This
-    pads missing_dates with enough prior days to produce smoothed estimates
-    starting on min(missing_dates) and fills in any gaps in missing_dates.
-
-    Parameters
-    ----------
-    receiving_dir: str
-        path to output directory
     export_start_date: date
         first date to retrieve data for
+    retrieve_days_before_now: int
+        number of days before end date ("now") to export

     Returns
     -------
     list
     """
     PAD_DAYS = 7

-    missing_dates = get_missing_dates(receiving_dir, export_start_date)
-    if len(missing_dates) == 0:
-        return missing_dates
-
-    # Calculate list start date to avoid getting data before the
-    # user-set start date. Convert both dates/datetimes to date to avoid error
-    # from trying to compare different types.
+    end_date = date.today()
+    # Don't fetch data before the user-set start date. Convert both
+    # dates/datetimes to date to avoid error from trying to compare
+    # different types.
     start_date = max(
-        min(missing_dates) - timedelta(days=PAD_DAYS - 1),
+        end_date - timedelta(days=retrieve_days_before_now),
         export_start_date.date()
     )

-    retrieve_dates = {date.date() for date in pd.date_range(
-        start=start_date,
-        end=date.today(),
-        freq='D')}
+    retrieve_dates = [
+        start_date - timedelta(days=PAD_DAYS - 1),
+        end_date]

-    return list(retrieve_dates)
+    return retrieve_dates


 def format_dates_for_query(date_list):
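A minimal sketch of the new date-range logic, assuming today as the end date, a hypothetical export start date far in the past, and the 14-day default that run.py falls back to; the values are illustrative, not taken from the indicator's real configuration:

    from datetime import date, datetime, timedelta

    PAD_DAYS = 7                                  # same padding constant as above
    export_start_date = datetime(2020, 2, 20)     # hypothetical user-set start date
    retrieve_days_before_now = 14                 # matches run.py's default num_export_days

    end_date = date.today()
    start_date = max(end_date - timedelta(days=retrieve_days_before_now),
                     export_start_date.date())
    retrieve_dates = [start_date - timedelta(days=PAD_DAYS - 1), end_date]
    print(retrieve_dates)

With a start date far in the past, the range begins 14 + (PAD_DAYS - 1) = 20 days before today; the extra 6 days of history are only there to warm up the 7-day smoothed estimates.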
@@ -187,30 +143,22 @@ def format_dates_for_query(date_list):

     Returns
     -------
-    str: "timestamp("YYYY-MM-DD"), ..."
+    list[str]: ["YYYY-MM-DD", "YYYY-MM-DD"]
     """
-    earliest_available_symptom_search_year = 2017
-
-    filtered_date_strings = [datetime.strftime(date, "%Y-%m-%d")
-                             for date in date_list
-                             if date.year >= earliest_available_symptom_search_year]
-
-    # Convert list of dates into list of BigQuery-compatible timestamps.
-    query_string = 'timestamp("' + \
-        '"), timestamp("'.join(filtered_date_strings) + '")'
-
-    return query_string
+    formatted_date_strings = [datetime.strftime(date, "%Y-%m-%d")
+                              for date in date_list]
+    return formatted_date_strings


-def produce_query(level, date_string):
+def produce_query(level, date_range):
     """Create query string.

     Parameters
     ----------
     level: str
         "county" or "state"
-    date_string: str
-        "timestamp(date), ..." where timestamps are BigQuery-compatible
+    date_range: list[str]
+        ["YYYY-MM-DD", "YYYY-MM-DD"] where dates are BigQuery-compatible.

     Returns
     -------
@@ -225,7 +173,7 @@ def produce_query(level, date_string):
         date,
         {symptom_cols}
     from `bigquery-public-data.covid19_symptom_search.{symptom_table}`
-    where timestamp(date) in ({date_list}) and
+    where timestamp(date) between timestamp("{start_date}") and timestamp("{end_date}") and
         country_region_code = "US"
     """
     base_level_table = {"state": "symptom_search_sub_region_1_daily",
@@ -235,12 +183,13 @@
     query = base_query.format(
         symptom_cols=", ".join(colname_map.keys()),
         symptom_table=base_level_table[level],
-        date_list=date_string)
+        start_date=date_range[0],
+        end_date=date_range[1])

     return query


-def pull_gs_data_one_geolevel(level, date_string):
+def pull_gs_data_one_geolevel(level, date_range):
     """Pull latest data for a single geo level.

     Fetch data and transform it into the appropriate format, as described in
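To make the new BETWEEN filter concrete, here is a hedged sketch of how the two formatted dates are interpolated into the query. Only the state-level table name and the overall query shape come from the diff above; the symptom column names and date values are placeholders:

    base_query = """
    select
        date,
        {symptom_cols}
    from `bigquery-public-data.covid19_symptom_search.{symptom_table}`
    where timestamp(date) between timestamp("{start_date}") and timestamp("{end_date}") and
        country_region_code = "US"
    """

    date_range = ["2021-05-01", "2021-05-21"]     # e.g. output of format_dates_for_query
    query = base_query.format(
        symptom_cols="symptom_a, symptom_b",      # placeholder column names
        symptom_table="symptom_search_sub_region_1_daily",
        start_date=date_range[0],
        end_date=date_range[1])
    print(query)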
@@ -261,14 +210,14 @@ def pull_gs_data_one_geolevel(level, date_string):
     ----------
     level: str
         "county" or "state"
-    date_string: str
-        "timestamp("YYYY-MM-DD"), ..." where timestamps are BigQuery-compatible
+    date_range: list[str]
+        ["YYYY-MM-DD", "YYYY-MM-DD"] where dates are BigQuery-compatible.

     Returns
     -------
     pd.DataFrame
     """
-    query = produce_query(level, date_string)
+    query = produce_query(level, date_range)

     df = pandas_gbq.read_gbq(query, progress_bar_type=None)

@@ -301,7 +250,7 @@ def initialize_credentials(path_to_credentials):
     pandas_gbq.context.project = credentials.project_id


-def pull_gs_data(path_to_credentials, receiving_dir, export_start_date):
+def pull_gs_data(path_to_credentials, export_start_date, num_export_days):
     """Pull latest dataset for each geo level and combine.

     PS: No information for PR
@@ -312,29 +261,29 @@
         Path to BigQuery API key and service account json file
     level: str
         "county" or "state"
-    receiving_dir: str
-        path to output directory
     export_start_date: date
         first date to retrieve data for
+    num_export_days: int
+        number of days before end date ("now") to export

     Returns
     -------
     dict: {"county": pd.DataFrame, "state": pd.DataFrame}
     """
     # Fetch and format dates we want to attempt to retrieve
-    retrieve_dates = get_all_dates(receiving_dir, export_start_date)
-    retrieve_dates_dict = format_dates_for_query(retrieve_dates)
+    retrieve_dates = get_date_range(export_start_date, num_export_days)
+    retrieve_dates = format_dates_for_query(retrieve_dates)

     initialize_credentials(path_to_credentials)

     # Create dictionary for state and county level data
     dfs = {}

     # For state level data
-    dfs["state"] = pull_gs_data_one_geolevel("state", retrieve_dates_dict)
+    dfs["state"] = pull_gs_data_one_geolevel("state", retrieve_dates)

     # For county level data
-    dfs["county"] = pull_gs_data_one_geolevel("county", retrieve_dates_dict)
+    dfs["county"] = pull_gs_data_one_geolevel("county", retrieve_dates)

     # Add District of Columbia as county
     try:
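Taken together, the reworked pull_gs_data body reduces to the flow below. This is only a compressed sketch using the functions defined in this file; the credentials path and argument values are placeholders, and the real function also post-processes the county frame (e.g. adding DC).

    from datetime import datetime

    export_start_date = datetime(2020, 2, 20)     # hypothetical
    num_export_days = 14                          # hypothetical

    # Compute the two-element [start, end] range once, format it, then reuse
    # it for both geo levels.
    retrieve_dates = get_date_range(export_start_date, num_export_days)
    retrieve_dates = format_dates_for_query(retrieve_dates)

    initialize_credentials("path/to/bigquery_credentials.json")

    dfs = {level: pull_gs_data_one_geolevel(level, retrieve_dates)
           for level in ("state", "county")}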

google_symptoms/delphi_google_symptoms/run.py

Lines changed: 3 additions & 1 deletion
@@ -32,14 +32,16 @@ def run_module():
     export_start_date = datetime.strptime(
         params["export_start_date"], "%Y-%m-%d")
     export_dir = params["export_dir"]
+    num_export_days = params.get("num_export_days", 14)

     logger = get_structured_logger(
         __name__, filename=params.get("log_filename"),
         log_exceptions=params.get("log_exceptions", True))

     # Pull GS data
     dfs = pull_gs_data(params["path_to_bigquery_credentials"],
-                       export_dir, export_start_date)
+                       export_start_date,
+                       num_export_days)
     gmpr = geomap.GeoMapper()

     for geo_res in GEO_RESOLUTIONS:
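The new setting is optional and defaults to 14 when absent. A hedged Python illustration of the parameter keys that run_module() reads; the values shown are made up, not the indicator's shipped defaults:

    params = {
        "export_start_date": "2020-02-20",             # illustrative value
        "export_dir": "./receiving",                   # illustrative value
        "path_to_bigquery_credentials": "creds.json",  # illustrative value
        # "num_export_days": 30,                       # optional override
    }

    num_export_days = params.get("num_export_days", 14)   # falls back to 14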
