
Commit 410b791

Merge pull request #1265 from cmu-delphi/change-timestamp-to-date
Change geomapper class to use "timestamp" as the default name for date_col's
2 parents 3b84dbf + 4aa202a commit 410b791

23 files changed: +69 −84 lines changed

_delphi_utils_python/delphi_utils/geomap.py

Lines changed: 4 additions & 4 deletions

@@ -183,7 +183,7 @@ def megacounty_creation(
        thr_win_len,
        thr_col="visits",
        fips_col="fips",
-       date_col="date",
+       date_col="timestamp",
        mega_col="megafips",
    ):
        """Create megacounty column.

@@ -340,7 +340,7 @@ def replace_geocode(
        new_code,
        from_col=None,
        new_col=None,
-       date_col="date",
+       date_col="timestamp",
        data_cols=None,
        dropna=True,
    ):

@@ -366,7 +366,7 @@ def replace_geocode(
        new_code: {'fips', 'zip', 'state_code', 'state_id', 'state_name', 'hrr', 'msa',
            'hhs'}
            Specifies the geocode type of the data in new_col.
-       date_col: str or None, default "date"
+       date_col: str or None, default "timestamp"
            Specify which column contains the date values. Used for value aggregation.
            If None, then the aggregation is done only on geo_id.
        data_cols: list, default None

@@ -457,7 +457,7 @@ def fips_to_megacounty(
        thr_win_len,
        thr_col="visits",
        fips_col="fips",
-       date_col="date",
+       date_col="timestamp",
        mega_col="megafips",
        count_cols=None,
    ):
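As a usage sketch (the DataFrame below is invented for illustration; GeoMapper and replace_geocode are the real delphi_utils API shown in the diff above), callers whose frames use a "timestamp" column can now omit date_col entirely:

    import pandas as pd
    from delphi_utils import GeoMapper

    gm = GeoMapper()
    df = pd.DataFrame({
        "zip": ["45140", "95616", "95618"],
        "timestamp": [pd.Timestamp("2018-01-01")] * 3,  # matches the new default
        "count": [99, 345, 456],
    })
    # date_col="timestamp" is now the default, so it no longer needs to be passed;
    # frames that still carry a "date" column must pass date_col="date" explicitly
    state_df = gm.replace_geocode(df, "zip", "state_id", new_col="geo_id")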

_delphi_utils_python/tests/test_geomap.py

Lines changed: 16 additions & 16 deletions
@@ -15,47 +15,47 @@ class TestGeoMapper:
    fips_data = pd.DataFrame(
        {
            "fips": ["01123", "02340", "98633", "18181"],
-           "date": [pd.Timestamp("2018-01-01")] * 4,
+           "timestamp": [pd.Timestamp("2018-01-01")] * 4,
            "count": [2, 0, 20, 10021],
            "total": [4, 0, 400, 100001],
        }
    )
    fips_data_2 = pd.DataFrame(
        {
            "fips": ["01123", "02340", "02002", "18633", "18181"],
-           "date": [pd.Timestamp("2018-01-01")] * 5,
+           "timestamp": [pd.Timestamp("2018-01-01")] * 5,
            "count": [2, 1, 20, np.nan, 10021],
            "total": [4, 1, 400, np.nan, 100001],
        }
    )
    fips_data_3 = pd.DataFrame(
        {
            "fips": ["48059", "48253", "48441", "72003", "72005", "10999"],
-           "date": [pd.Timestamp("2018-01-01")] * 3 + [pd.Timestamp("2018-01-03")] * 3,
+           "timestamp": [pd.Timestamp("2018-01-01")] * 3 + [pd.Timestamp("2018-01-03")] * 3,
            "count": [1, 2, 3, 4, 8, 5],
            "total": [2, 4, 7, 11, 100, 10],
        }
    )
    fips_data_4 = pd.DataFrame(
        {
            "fips": ["01123", "48253", "72003", "18181"],
-           "date": [pd.Timestamp("2018-01-01")] * 4,
+           "timestamp": [pd.Timestamp("2018-01-01")] * 4,
            "count": [2, 1, np.nan, 10021],
            "total": [4, 1, np.nan, 100001],
        }
    )
    fips_data_5 = pd.DataFrame(
        {
            "fips": [1123, 48253, 72003, 18181],
-           "date": [pd.Timestamp("2018-01-01")] * 4,
+           "timestamp": [pd.Timestamp("2018-01-01")] * 4,
            "count": [2, 1, np.nan, 10021],
            "total": [4, 1, np.nan, 100001],
        }
    )
    zip_data = pd.DataFrame(
        {
            "zip": ["45140", "95616", "95618"] * 2,
-           "date": [pd.Timestamp("2018-01-01")] * 3 + [pd.Timestamp("2018-01-03")] * 3,
+           "timestamp": [pd.Timestamp("2018-01-01")] * 3 + [pd.Timestamp("2018-01-03")] * 3,
            "count": [99, 345, 456, 100, 344, 442],
        }
    )

@@ -66,15 +66,15 @@ class TestGeoMapper:
        pd.DataFrame(
            {
                "fips": ["01001"] * len(jan_month),
-               "date": jan_month,
+               "timestamp": jan_month,
                "count": np.arange(len(jan_month)),
                "visits": np.arange(len(jan_month)),
            }
        ),
        pd.DataFrame(
            {
                "fips": ["01002"] * len(jan_month),
-               "date": jan_month,
+               "timestamp": jan_month,
                "count": np.arange(len(jan_month)),
                "visits": 2 * np.arange(len(jan_month)),
            }

@@ -86,15 +86,15 @@ class TestGeoMapper:
        pd.DataFrame(
            {
                "fips": ["01001"] * len(jan_month),
-               "date": jan_month,
+               "timestamp": jan_month,
                "count": np.arange(len(jan_month)),
                "_thr_col_roll": np.arange(len(jan_month)),
            }
        ),
        pd.DataFrame(
            {
                "fips": [11001] * len(jan_month),
-               "date": jan_month,
+               "timestamp": jan_month,
                "count": np.arange(len(jan_month)),
                "_thr_col_roll": np.arange(len(jan_month)),
            }

@@ -112,7 +112,7 @@ class TestGeoMapper:
                84000013,
                84090002,
            ],
-           "date": [pd.Timestamp("2018-01-01")] * 3
+           "timestamp": [pd.Timestamp("2018-01-01")] * 3
            + [pd.Timestamp("2018-01-03")] * 3
            + [pd.Timestamp("2018-01-01")],
            "count": [1, 2, 3, 4, 8, 5, 20],

@@ -245,7 +245,7 @@ def test_add_geocode(self, geomapper):
            new_data,
            pd.DataFrame().from_dict(
                {
-                   "date": {0: pd.Timestamp("2018-01-01 00:00:00")},
+                   "timestamp": {0: pd.Timestamp("2018-01-01 00:00:00")},
                    "NATION": {0: "us"},
                    "count": {0: 10024.0},
                    "total": {0: 100006.0},

@@ -259,7 +259,7 @@ def test_add_geocode(self, geomapper):
            new_data,
            pd.DataFrame().from_dict(
                {
-                   "date": {
+                   "timestamp": {
                        0: pd.Timestamp("2018-01-01"),
                        1: pd.Timestamp("2018-01-03"),
                    },

@@ -280,7 +280,7 @@ def test_add_geocode(self, geomapper):
        assert geomapper.add_geocode(self.fips_data_3, "fips", "hrr", dropna=False).isna().any().any()

        # fips -> zip (date_col=None check)
-       new_data = geomapper.replace_geocode(self.fips_data_5.drop(columns=["date"]), "fips", "hrr", date_col=None)
+       new_data = geomapper.replace_geocode(self.fips_data_5.drop(columns=["timestamp"]), "fips", "hrr", date_col=None)
        pd.testing.assert_frame_equal(
            new_data,
            pd.DataFrame().from_dict(

@@ -293,7 +293,7 @@ def test_add_geocode(self, geomapper):
        )

        # fips -> hhs
-       new_data = geomapper.replace_geocode(self.fips_data_3.drop(columns=["date"]),
+       new_data = geomapper.replace_geocode(self.fips_data_3.drop(columns=["timestamp"]),
                                             "fips", "hhs", date_col=None)
        pd.testing.assert_frame_equal(
            new_data,

@@ -313,7 +313,7 @@ def test_add_geocode(self, geomapper):
            new_data,
            pd.DataFrame().from_dict(
                {
-                   "date": {0: pd.Timestamp("2018-01-01"), 1: pd.Timestamp("2018-01-01"),
+                   "timestamp": {0: pd.Timestamp("2018-01-01"), 1: pd.Timestamp("2018-01-01"),
                             2: pd.Timestamp("2018-01-03"), 3: pd.Timestamp("2018-01-03")},
                    "hhs": {0: "5", 1: "9", 2: "5", 3: "9"},
                    "count": {0: 99.0, 1: 801.0, 2: 100.0, 3: 786.0},

_template_python/delphi_NAME/run.py

Lines changed: 1 addition & 4 deletions
@@ -47,10 +47,7 @@ def run_module(params):
        ## aggregate & smooth
        ## TODO: add num/prop variations if needed
        for sensor, smoother, geo in product(SIGNALS, SMOOTHERS, GEOS):
-           df = mapper.replace_geocode(
-               all_data, "zip", geo,
-               new_col="geo_id",
-               date_col="timestamp")
+           df = mapper.replace_geocode(all_data, "zip", geo, new_col="geo_id")
            ## TODO: recompute sample_size, se here if not NA
            df["val"] = df[["geo_id", "val"]].groupby("geo_id")["val"].transform(
                smoother[0].smooth

changehc/delphi_changehc/config.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ class Config:
        FLU_LIKE_COL = "Flu-like"
        COVID_LIKE_COL = "Covid-like"
        COUNT_COLS = [COVID_COL,DENOM_COL,FLU_COL,MIXED_COL,FLU_LIKE_COL,COVID_LIKE_COL]
-       DATE_COL = "date"
+       DATE_COL = "timestamp"
        GEO_COL = "fips"
        ID_COLS = [DATE_COL] + [GEO_COL]
        FILT_COLS = ID_COLS + COUNT_COLS

changehc/delphi_changehc/update_sensor.py

Lines changed: 1 addition & 1 deletion
@@ -232,7 +232,7 @@ def update_sensor(self,
        # sample size is never shared
        df["sample_size"] = np.nan
        # conform to naming expected by create_export_csv()
-       df = df.reset_index().rename(columns={"date": "timestamp", "rate": "val"})
+       df = df.reset_index().rename(columns={"rate": "val"})
        # df.loc[~df['incl'], ["val", "se"]] = np.nan  # update to this line after nancodes get merged in
        df = df[df['incl']]
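Why dropping the "date" rename is safe here: the sensor frame's index level is now already named "timestamp", so reset_index() surfaces a column with the name create_export_csv() expects. A minimal sketch with invented data:

    import pandas as pd

    df = pd.DataFrame(
        {"rate": [0.1, 0.2], "incl": [True, True]},
        index=pd.MultiIndex.from_tuples(
            [("01001", pd.Timestamp("2020-03-01")),
             ("01001", pd.Timestamp("2020-03-02"))],
            names=["fips", "timestamp"],
        ),
    )
    df = df.reset_index().rename(columns={"rate": "val"})
    # df.columns now includes "fips", "timestamp", "val" -- no "date" rename needed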

changehc/tests/test_load_data.py

Lines changed: 6 additions & 6 deletions
@@ -45,7 +45,7 @@ def test_base_unit(self):

    def test_denom_columns(self):
        assert "fips" in self.denom_data.index.names
-       assert "date" in self.denom_data.index.names
+       assert "timestamp" in self.denom_data.index.names

        expected_denom_columns = ["Denominator"]
        for col in expected_denom_columns:

@@ -54,7 +54,7 @@ def test_denom_columns(self):

    def test_claims_columns(self):
        assert "fips" in self.covid_data.index.names
-       assert "date" in self.covid_data.index.names
+       assert "timestamp" in self.covid_data.index.names

        expected_covid_columns = ["COVID"]
        for col in expected_covid_columns:

@@ -63,7 +63,7 @@ def test_claims_columns(self):

    def test_combined_columns(self):
        assert "fips" in self.combined_data.index.names
-       assert "date" in self.combined_data.index.names
+       assert "timestamp" in self.combined_data.index.names

        expected_combined_columns = ["num", "den"]
        for col in expected_combined_columns:

@@ -75,16 +75,16 @@ def test_edge_values(self):
        for data in [self.denom_data,
                     self.covid_data,
                     self.combined_data]:
-           assert data.index.get_level_values('date').max() >= Config.FIRST_DATA_DATE
-           assert data.index.get_level_values('date').min() < DROP_DATE
+           assert data.index.get_level_values("timestamp").max() >= Config.FIRST_DATA_DATE
+           assert data.index.get_level_values("timestamp").min() < DROP_DATE

    def test_fips_values(self):
        for data in [self.denom_data,
                     self.covid_data,
                     self.combined_data]:
            assert (
                len(data.index.get_level_values(
-                   'fips').unique()) <= len(self.gmpr.get_geo_values("fips"))
+                   "fips").unique()) <= len(self.gmpr.get_geo_values("fips"))
            )

    def test_combined_fips_values(self):

changehc/tests/test_update_sensor.py

Lines changed: 4 additions & 4 deletions
@@ -42,7 +42,7 @@ class TestCHCSensorUpdator:
        "num": [0, 100, 200, 300, 400, 500, 600, 100, 200, 300, 400, 500, 600],
        "fips": ['01001'] * 7 + ['04007'] * 6,
        "den": [1000] * 7 + [2000] * 6,
-       "date": [pd.Timestamp(f'03-{i}-2020') for i in range(1, 14)]}).set_index(["fips","date"])
+       "timestamp": [pd.Timestamp(f'03-{i}-2020') for i in range(1, 14)]}).set_index(["fips","timestamp"])

    def test_shift_dates(self):
        """Tests that dates in the data are shifted according to the burn-in and lag."""

@@ -88,7 +88,7 @@ def test_geo_reindex(self):
        "num": [0, 100, 200, 300, 400, 500, 600, 100, 200, 300, 400, 500, 600],
        "fips": ['01001'] * 7 + ['04007'] * 6,
        "den": [1000] * 7 + [2000] * 6,
-       "date": [pd.Timestamp(f'03-{i}-2020') for i in range(1, 14)]})
+       "timestamp": [pd.Timestamp(f'03-{i}-2020') for i in range(1, 14)]})
        data_frame = su_inst.geo_reindex(test_data)
        assert data_frame.shape[0] == multiple*len(su_inst.fit_dates)
        assert (data_frame.sum() == (4200,19000)).all()

@@ -118,8 +118,8 @@ def test_update_sensor(self):
        "num": [0, 100, 200, 300, 400, 500, 600, 100, 200, 300, 400, 500, 600] * 2,
        "fips": ["01001"] * 13 + ["42003"] * 13,
        "den": [30, 50, 50, 10, 1, 5, 5, 50, 50, 50, 0, 0, 0] * 2,
-       "date": list(pd.date_range("20200301", "20200313")) * 2
-       }).set_index(["fips", "date"])
+       "timestamp": list(pd.date_range("20200301", "20200313")) * 2
+       }).set_index(["fips", "timestamp"])
        su_inst.update_sensor(small_test_data, td.name)
        for f in os.listdir(td.name):
            outputs[f] = pd.read_csv(os.path.join(td.name, f))

claims_hosp/delphi_claims_hosp/config.py

Lines changed: 8 additions & 6 deletions
@@ -32,8 +32,13 @@ class Config:
        # data columns
        CLAIMS_COUNT_COLS = ["Denominator", "Covid_like"]
        CLAIMS_DATE_COL = "ServiceDate"
-       CLAIMS_RENAME_COLS = {"Pat HRR ID": "hrr", "ServiceDate": "date",
-                             "PatCountyFIPS": "fips", "PatAgeGroup": "age_group"}
+       FIPS_COL = "fips"
+       DATE_COL = "timestamp"
+       AGE_COL = "age_group"
+       HRR_COL = "hrr"
+
+       CLAIMS_RENAME_COLS = {"Pat HRR ID": HRR_COL, "ServiceDate": DATE_COL,
+                             "PatCountyFIPS": FIPS_COL, "PatAgeGroup": AGE_COL}
        CLAIMS_DTYPES = {
            "ServiceDate": str,
            "PatCountyFIPS": str,

@@ -43,10 +48,7 @@ class Config:
            "Pat HRR ID": str,
        }

-       FIPS_COL = "fips"
-       DATE_COL = "date"
-       AGE_COL = "age_group"
-       HRR_COL = "hrr"
+

        SMOOTHER_BANDWIDTH = 100  # bandwidth for the linear left Gaussian filter
        MIN_DEN = 100  # number of total visits needed to produce a sensor
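The reordering matters because a class body executes top to bottom: FIPS_COL, DATE_COL, AGE_COL, and HRR_COL must be bound before CLAIMS_RENAME_COLS can reference them. A stripped-down sketch of the pattern:

    # stripped-down sketch of the Config pattern above
    class Config:
        DATE_COL = "timestamp"
        HRR_COL = "hrr"
        # works only because DATE_COL and HRR_COL are already bound above
        CLAIMS_RENAME_COLS = {"ServiceDate": DATE_COL, "Pat HRR ID": HRR_COL}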

claims_hosp/delphi_claims_hosp/load_data.py

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ def load_claims_data(claims_filepath, dropdate, base_geo):
        ), "Claims counts must be nonnegative"

        # aggregate age groups (so data is unique by date and base geography)
-       claims_data = claims_data.groupby([base_geo, "date"]).sum()
+       claims_data = claims_data.groupby([base_geo, Config.DATE_COL]).sum()
        claims_data.dropna(inplace=True)  # drop rows with any missing entries

        return claims_data
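The groupby now keys on Config.DATE_COL ("timestamp"). A sketch of the age-group aggregation with invented data:

    import pandas as pd

    claims_data = pd.DataFrame({
        "fips": ["01001", "01001"],                     # same county...
        "timestamp": [pd.Timestamp("2020-05-01")] * 2,  # ...same day, two age groups
        "Denominator": [10, 20],
        "Covid_like": [1, 2],
    })
    # collapses the age-group rows so each (fips, timestamp) pair appears exactly once
    claims_data = claims_data.groupby(["fips", "timestamp"]).sum()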

claims_hosp/delphi_claims_hosp/update_indicator.py

Lines changed: 2 additions & 2 deletions
@@ -120,11 +120,11 @@ def geo_reindex(self, data):
            return False

        unique_geo_ids = pd.unique(data_frame[self.geo])
-       data_frame.set_index([self.geo, 'date'], inplace=True)
+       data_frame.set_index([self.geo, "timestamp"], inplace=True)

        # for each location, fill in all missing dates with 0 values
        multiindex = pd.MultiIndex.from_product((unique_geo_ids, self.fit_dates),
-                                               names=[self.geo, "date"])
+                                               names=[self.geo, Config.DATE_COL])
        assert (
            len(multiindex) <= (GeoConstants.MAX_GEO[self.geo] * len(self.fit_dates))
        ), "more loc-date pairs than maximum number of geographies x number of dates"

claims_hosp/tests/test_indicator.py

Lines changed: 2 additions & 2 deletions
@@ -56,7 +56,7 @@ def test_backwards_pad(self):
    def test_fit_fips(self):
        date_range = pd.date_range("2020-05-01", "2020-05-20")
        all_fips = self.fips_data.fips.unique()
-       loc_index_fips_data = self.fips_data.set_index(["fips", "date"])
+       loc_index_fips_data = self.fips_data.set_index(["fips", "timestamp"])
        sample_fips = nr.choice(all_fips, 10)

        for fips in sample_fips:

@@ -79,7 +79,7 @@ def test_fit_fips(self):
    def test_fit_hrrs(self):
        date_range = pd.date_range("2020-05-01", "2020-05-20")
        all_hrrs = self.hrr_data.hrr.unique()
-       loc_index_hrr_data = self.hrr_data.set_index(["hrr", "date"])
+       loc_index_hrr_data = self.hrr_data.set_index(["hrr", "timestamp"])
        sample_hrrs = nr.choice(all_hrrs, 10)

        for hrr in sample_hrrs:

claims_hosp/tests/test_load_data.py

Lines changed: 8 additions & 8 deletions
@@ -34,8 +34,8 @@ def test_base_unit(self):
    def test_claims_columns(self):
        assert "hrr" in self.hrr_claims_data.index.names
        assert "fips" in self.fips_claims_data.index.names
-       assert "date" in self.hrr_claims_data.index.names
-       assert "date" in self.fips_claims_data.index.names
+       assert "timestamp" in self.hrr_claims_data.index.names
+       assert "timestamp" in self.fips_claims_data.index.names

        expected_claims_columns = ["Denominator", "Covid_like"]
        for col in expected_claims_columns:

@@ -47,8 +47,8 @@ def test_claims_columns(self):
    def test_data_columns(self):
        assert "hrr" in self.hrr_data.columns
        assert "fips" in self.fips_data.columns
-       assert "date" in self.hrr_data.columns
-       assert "date" in self.fips_data.columns
+       assert "timestamp" in self.hrr_data.columns
+       assert "timestamp" in self.fips_data.columns

        expected_columns = ["num", "den"]
        for col in expected_columns:

@@ -57,12 +57,12 @@ def test_data_columns(self):

    def test_edge_values(self):
        for data in [self.hrr_claims_data, self.fips_claims_data]:
-           assert data.index.get_level_values('date').max() >= Config.FIRST_DATA_DATE
-           assert data.index.get_level_values('date').min() < DROP_DATE
+           assert data.index.get_level_values("timestamp").max() >= Config.FIRST_DATA_DATE
+           assert data.index.get_level_values("timestamp").min() < DROP_DATE

        for data in [self.hrr_data, self.fips_data]:
-           assert data.date.max() >= Config.FIRST_DATA_DATE
-           assert data.date.min() < DROP_DATE
+           assert data["timestamp"].max() >= Config.FIRST_DATA_DATE
+           assert data["timestamp"].min() < DROP_DATE

    def test_hrrs_values(self):
        assert len(self.hrr_data.hrr.unique()) <= CONSTANTS.NUM_HRRS
