Skip to content

Commit 4b2e7c5

Browse files
authored
Merge pull request #1760 from cmu-delphi/ndefries/backcorr-input-format
[Backfill corrections] Align daily and rollup file formats; make dates portable
2 parents 4641604 + f3a2e33 commit 4b2e7c5

File tree

4 files changed

+27
-15
lines changed

4 files changed

+27
-15
lines changed

changehc/delphi_changehc/backfill.py

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,18 @@ def store_backfill_file(df, _end_date, backfill_dir, numtype, geo, weekday):
4646
'num', 'den']
4747
backfilldata = backfilldata.loc[backfilldata["time_value"] >= _start_date,
4848
selected_columns]
49+
50+
backfilldata["lag"] = [(_end_date - x).days for x in backfilldata["time_value"]]
51+
backfilldata["time_value"] = backfilldata.time_value.dt.strftime("%Y-%m-%d")
52+
backfilldata["issue_date"] = datetime.strftime(_end_date, "%Y-%m-%d")
53+
54+
backfilldata = backfilldata.astype({
55+
"time_value": "string",
56+
"issue_date": "string",
57+
"fips": "string",
58+
"state_id": "string"
59+
})
60+
4961
path = backfill_dir + \
5062
"/changehc_%s_as_of_%s.parquet"%(numtype, datetime.strftime(_end_date, "%Y%m%d"))
5163
# Store intermediate file into the backfill folder
@@ -109,9 +121,6 @@ def get_date(file_link):
109121
pdList = []
110122
for fn in new_files:
111123
df = pd.read_parquet(fn, engine='pyarrow')
112-
issue_date = get_date(fn)
113-
df["issue_date"] = issue_date
114-
df["lag"] = [(issue_date - x).days for x in df["time_value"]]
115124
pdList.append(df)
116125
merged_file = pd.concat(pdList).sort_values(["time_value", "fips"])
117126
path = backfill_dir + "/changehc_%s_from_%s_to_%s.parquet"%(

changehc/tests/test_backfill.py

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,7 @@
4040
class TestBackfill:
4141

4242
def test_store_backfill_file(self):
43-
43+
4444
fn = "changehc_covid_as_of_20200101.parquet"
4545
dropdate = datetime(2020, 1, 1)
4646
numtype = "covid"
@@ -69,7 +69,7 @@ def test_store_backfill_file(self):
6969
backfill_df = pd.read_parquet(backfill_dir + "/"+ fn, engine='pyarrow')
7070

7171
selected_columns = ['time_value', 'fips', 'state_id',
72-
'num', 'den']
72+
'num', 'den', 'lag', 'issue_date']
7373
assert set(selected_columns) == set(backfill_df.columns)
7474

7575
os.remove(backfill_dir + "/" + fn)
@@ -114,9 +114,6 @@ def test_merge_backfill_file(self):
114114
if "from" in file:
115115
continue
116116
df = pd.read_parquet(file, engine='pyarrow')
117-
issue_date = datetime.strptime(file[-16:-8], "%Y%m%d")
118-
df["issue_date"] = issue_date
119-
df["lag"] = [(issue_date - x).days for x in df["time_value"]]
120117
pdList.append(df)
121118
os.remove(file)
122119
new_files = glob.glob(backfill_dir + "/changehc_%s*.parquet"%numtype)

quidel_covidtest/delphi_quidel_covidtest/backfill.py

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,17 @@ def store_backfill_file(df, _end_date, backfill_dir):
5656
'num_age_0_17', 'den_age_0_17']
5757
backfilldata = backfilldata.loc[backfilldata["time_value"] >= _start_date,
5858
selected_columns]
59+
backfilldata["lag"] = [(_end_date - x).days for x in backfilldata["time_value"]]
60+
backfilldata["time_value"] = backfilldata.time_value.dt.strftime("%Y-%m-%d")
61+
backfilldata["issue_date"] = datetime.strftime(_end_date, "%Y-%m-%d")
62+
63+
backfilldata = backfilldata.astype({
64+
"time_value": "string",
65+
"issue_date": "string",
66+
"fips": "string",
67+
"state_id": "string"
68+
})
69+
5970
path = backfill_dir + \
6071
"/quidel_covidtest_as_of_%s.parquet"%datetime.strftime(_end_date, "%Y%m%d")
6172
# Store intermediate file into the backfill folder
@@ -108,9 +119,6 @@ def get_date(file_link):
108119
pdList = []
109120
for fn in new_files:
110121
df = pd.read_parquet(fn, engine='pyarrow')
111-
issue_date = get_date(fn)
112-
df["issue_date"] = issue_date
113-
df["lag"] = [(issue_date - x).days for x in df["time_value"]]
114122
pdList.append(df)
115123
merged_file = pd.concat(pdList).sort_values(["time_value", "fips"])
116124
path = backfill_dir + "/quidel_covidtest_from_%s_to_%s.parquet"%(

quidel_covidtest/tests/test_backfill.py

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,8 @@ def test_store_backfill_file(self):
4949
'num_age_18_49', 'den_age_18_49',
5050
'num_age_50_64', 'den_age_50_64',
5151
'num_age_65plus', 'den_age_65plus',
52-
'num_age_0_17', 'den_age_0_17']
52+
'num_age_0_17', 'den_age_0_17',
53+
'lag', 'issue_date']
5354
assert set(selected_columns) == set(backfill_df.columns)
5455

5556
os.remove(backfill_dir + "/" + fn)
@@ -86,9 +87,6 @@ def test_merge_backfill_file(self):
8687
if "from" in file:
8788
continue
8889
df = pd.read_parquet(file, engine='pyarrow')
89-
issue_date = datetime.strptime(file[-16:-8], "%Y%m%d")
90-
df["issue_date"] = issue_date
91-
df["lag"] = [(issue_date - x).days for x in df["time_value"]]
9290
pdList.append(df)
9391
os.remove(file)
9492
new_files = glob.glob(backfill_dir + "/quidel_covidtest*.parquet")

0 commit comments

Comments
 (0)