Commit 5931e21

Merge pull request #68 from cmu-delphi/jingjing/jhu_with_unassigned
update code for unassigned cases/deaths
2 parents 50f9e95 + cc203c6

4 files changed: +109 additions, -25 deletions


jhu/delphi_jhu/geo.py

Lines changed: 28 additions & 3 deletions
```diff
@@ -89,6 +89,10 @@
 
 FIPS_TO_STATE = {v: k.lower() for k, v in STATE_TO_FIPS.items()}
 
+
+# Fake FIPS codes for unassigned cases/deaths, mapped to megacounty FIPS
+JHU_FAKE_FIPS_TO_MEGA_FIPS = {f'900{x}' : f'{x}000' for x in STATE_TO_FIPS.values()}
+
 
 def fips_to_state(fips: str) -> str:
     """Wrapper that handles exceptions to the FIPS scheme in the JHU data.

@@ -148,7 +152,7 @@ def disburse(df: pd.DataFrame, pooled_fips: str, fips_list: list):
     return df
 
 
-def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
+def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str):
     """
     Maps a DataFrame df, which contains data at the county resolution, and
     aggregate it to the geographic resolution geo_res.

@@ -162,22 +166,38 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
         ('county', 'state', 'msa', 'hrr').
     map_df: pd.DataFrame
         Loaded from static file "fips_prop_pop.csv".
+    sensor: str
+        Sensor type. Valid options:
+        ("new_counts", "cumulative_counts",
+         "incidence", "cumulative_prop")
 
     Returns
     -------
     pd.DataFrame
         Columns: geo_id, timestamp, ...
     """
     VALID_GEO_RES = ("county", "state", "msa", "hrr")
+    # It is not clear how to calculate proportions for unassigned cases/deaths
+    PROP_SENSORS = ("incidence", "cumulative_prop")
     if geo_res not in VALID_GEO_RES:
         raise ValueError(f"geo_res must be one of {VALID_GEO_RES}")
-    df = df.copy()
+
+    df_mega = df[df['fips'].astype(int) >= 90001].copy()
+    df_mega['geo_id'] = df_mega['fips'].apply(lambda x: JHU_FAKE_FIPS_TO_MEGA_FIPS[x])
+
+    df = df[df['fips'].astype(int) < 90001].copy()
+
     if geo_res == "county":
         df["geo_id"] = df["fips"]
+        if sensor not in PROP_SENSORS:
+            df = df.append(df_mega)
     elif geo_res == "state":
         # Grab first two digits of fips
         # Map state fips to us postal code
-        df["geo_id"] = df["fips"].apply(fips_to_state)
+        df["geo_id"] = df["fips"]
+        # Add unassigned cases/deaths
+        df = df.append(df_mega)
+        df["geo_id"] = df["geo_id"].apply(fips_to_state)
     elif geo_res in ("msa", "hrr"):
         # Disburse Dukes & Nantucket to individual counties
         df = disburse(df, DN_FIPS, DN_COUNTY_FIPS)

@@ -200,8 +220,13 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
         merged["new_counts"] = merged["new_counts"] * merged["pop_prop"]
         merged["population"] = merged["population"] * merged["pop_prop"]
         df = merged.drop(["zip", "pop_prop", "hrrnum", "cbsa_id"], axis=1)
+        # if sensor not in PROP_SENSORS:
+        #     df_mega["geo_id"] = df_mega["geo_id"].apply(fips_to_state)
+        #     df = df.append(df_mega)
     df = df.drop("fips", axis=1)
     df = df.groupby(["geo_id", "timestamp"]).sum().reset_index()
+
+    # Value would be negative for megacounties, which would not be considered in the main function
     df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE
     df["cumulative_prop"] = df["cumulative_counts"] / df["population"] * INCIDENCE_BASE
     return df
```
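To make the new mapping concrete: JHU files unassigned cases/deaths under fake FIPS codes of the form 900XX, where XX is a state's two-digit FIPS prefix, and the comprehension above routes them to Delphi's XX000 megacounty codes. A minimal self-contained sketch, using a three-state stand-in for the module's full STATE_TO_FIPS table:

```python
# Illustrative stand-in; the real module defines all states and territories.
STATE_TO_FIPS = {"AL": "01", "GA": "13", "RI": "44"}

# Same comprehension as in the diff: fake FIPS 900XX -> megacounty FIPS XX000.
JHU_FAKE_FIPS_TO_MEGA_FIPS = {f'900{x}': f'{x}000' for x in STATE_TO_FIPS.values()}

assert JHU_FAKE_FIPS_TO_MEGA_FIPS == {
    "90001": "01000",  # unassigned Alabama      -> Alabama megacounty
    "90013": "13000",  # unassigned Georgia      -> Georgia megacounty
    "90044": "44000",  # unassigned Rhode Island -> Rhode Island megacounty
}
```

Megacounty codes of the form XX000 are what fips_to_state already understands, which is why the state branch can simply append df_mega before applying fips_to_state.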

jhu/delphi_jhu/pull.py

Lines changed: 6 additions & 2 deletions
```diff
@@ -62,7 +62,7 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFrame:
     MIN_FIPS = 1000
     MAX_FIPS = 57000
     EXTRA_FIPS = (
-        72, # Puerto Rico (provided as the entire state)
+        72,     # Puerto Rico (provided as the entire state)
         70002,  # Kansas City, MO
         70003,  # Dukes and Nantucket Counties, MA
     )

@@ -79,9 +79,13 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFrame:
             & (df["FIPS"] < MAX_FIPS)
         )  # "Uncategorized", etc.
         | df["FIPS"].isin(EXTRA_FIPS)
+        # Get fake FIPS codes for unassigned cases
+        | np.logical_and(df['FIPS'] >= 90001,
+                         df['FIPS'] <= 90056)
     ]
     # Merge in population LOWERCASE, consistent across confirmed and deaths
-    df = pd.merge(df, pop_df, on="FIPS")
+    # Set population to NaN for the fake FIPS codes
+    df = pd.merge(df, pop_df, on="FIPS", how='left')
 
     # Manual correction for PR
     df.loc[df["FIPS"] == 72, "FIPS"] = 72000
```

jhu/delphi_jhu/run.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -35,7 +35,7 @@
 ]
 SMOOTHERS = [
     "unsmoothed",
-    "seven_day_average",
+    #"seven_day_average",
 ]
 SENSOR_NAME_MAP = {
     "new_counts": ("incidence_num", False),

@@ -84,7 +84,7 @@ def run_module():
             print(geo_res, metric, sensor, smoother)
             df = dfs[metric]
             # Aggregate to appropriate geographic resolution
-            df = geo_map(df, geo_res, map_df)
+            df = geo_map(df, geo_res, map_df, sensor)
             df["val"] = SMOOTHERS_MAP[smoother][0](df[sensor].values)
             df["se"] = np.nan
             df["sample_size"] = np.nan
```

jhu/tests/test_geo.py

Lines changed: 73 additions & 18 deletions
```diff
@@ -25,6 +25,13 @@ def test_normal(self):
         assert fips_to_state("12003") == "fl"
         assert fips_to_state("50103") == "vt"
         assert fips_to_state("15003") == "hi"
+
+    def test_mega(self):
+
+        assert fips_to_state("01000") == "al"
+        assert fips_to_state("13000") == "ga"
+        assert fips_to_state("44000") == "ri"
+        assert fips_to_state("12000") == "fl"
 
 
 class TestDisburse:

@@ -74,15 +81,27 @@ def test_county(self):
             }
         )
 
-        new_df = geo_map(df, "county", MAP_DF)
+        df_mega = pd.DataFrame(
+            {
+                "fips": ["90013", "90001"],
+                "timestamp": ["2020-02-15", "2020-02-15"],
+                "new_counts": [8, 2],
+                "cumulative_counts": [80, 12],
+                "population": [np.nan, np.nan],
+            }
+        )
+
+        df = df.append(df_mega)
+
+        new_df = geo_map(df, "county", MAP_DF, 'new_counts')
 
         exp_incidence = df["new_counts"] / df["population"] * 100000
         exp_cprop = df["cumulative_counts"] / df["population"] * 100000
-
-        assert set(new_df["geo_id"].values) == set(df["fips"].values)
+
+        assert set(new_df["geo_id"].values) == set(['01000', '13000', '48027', '50103', '53003'])
         assert set(new_df["timestamp"].values) == set(df["timestamp"].values)
-        assert set(new_df["incidence"].values) == set(exp_incidence.values)
-        assert set(new_df["cumulative_prop"].values) == set(exp_cprop.values)
+        assert set(new_df["incidence"].values) - set(exp_incidence.values) == set([np.Inf])
+        assert set(new_df["cumulative_prop"].values) - set(exp_cprop.values) == set([np.Inf])
 
     def test_state(self):

@@ -95,19 +114,31 @@ def test_state(self):
                 "population": [100, 2100, 300, 25],
             }
         )
+
+        df_mega = pd.DataFrame(
+            {
+                "fips": ["90013", "90001", "04000", "25000"],
+                "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15", "2020-02-15"],
+                "new_counts": [8, 2, 5, 10],
+                "cumulative_counts": [80, 12, 30, 100],
+                "population": [np.nan, np.nan, np.nan, np.nan],
+            }
+        )
+
+        df = df.append(df_mega)
 
-        new_df = geo_map(df, "state", MAP_DF)
+        new_df = geo_map(df, "state", MAP_DF, 'new_counts')
 
-        exp_incidence = np.array([27, 13]) / np.array([2500, 25]) * 100000
-        exp_cprop = np.array([165, 60]) / np.array([2500, 25]) * 100000
+        exp_incidence = np.array([27 + 5, 13 + 10]) / np.array([2500, 25]) * 100000
+        exp_cprop = np.array([165 + 30, 60 + 100]) / np.array([2500, 25]) * 100000
 
-        assert (new_df["geo_id"].values == ["az", "ma"]).all()
-        assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all()
-        assert (new_df["new_counts"].values == [27, 13]).all()
-        assert (new_df["cumulative_counts"].values == [165, 60]).all()
-        assert (new_df["population"].values == [2500, 25]).all()
-        assert (new_df["incidence"].values == exp_incidence).all()
-        assert (new_df["cumulative_prop"].values == exp_cprop).all()
+        assert set(new_df["geo_id"].values) == set(["az", "ma", "al", "ga"])
+        assert set(new_df["timestamp"].values) == set(["2020-02-15"])
+        assert set(new_df["new_counts"].values) == set([32, 23, 2, 8])
+        assert set(new_df["cumulative_counts"].values) == set([195, 160, 12, 80])
+        assert set(new_df["population"].values) == set([2500, 25, 0])
+        assert set(new_df["incidence"].values) - set(exp_incidence) == set([np.Inf])
+        assert set(new_df["cumulative_prop"].values) - set(exp_cprop) == set([np.Inf])
 
     def test_hrr(self):

@@ -121,7 +152,19 @@ def test_hrr(self):
             }
         )
 
-        new_df = geo_map(df, "hrr", MAP_DF)
+        # df_mega = pd.DataFrame(
+        #     {
+        #         "fips": ["90013", "90001"],
+        #         "timestamp": ["2020-02-15", "2020-02-15"],
+        #         "new_counts": [8, 2],
+        #         "cumulative_counts": [80, 12],
+        #         "population": [np.nan, np.nan],
+        #     }
+        # )
+
+        # df = df.append(df_mega)
+
+        new_df = geo_map(df, "hrr", MAP_DF, 'new_counts')
 
         exp_incidence = np.array([13, 27]) / np.array([25, 2500]) * 100000
         exp_cprop = np.array([60, 165]) / np.array([25, 2500]) * 100000

@@ -145,8 +188,20 @@ def test_msa(self):
                 "population": [100, 2100, 300, 25],
             }
         )
-
-        new_df = geo_map(df, "msa", MAP_DF)
+
+        # df_mega = pd.DataFrame(
+        #     {
+        #         "fips": ["90013", "90001"],
+        #         "timestamp": ["2020-02-15", "2020-02-15"],
+        #         "new_counts": [8, 2],
+        #         "cumulative_counts": [80, 12],
+        #         "population": [np.nan, np.nan],
+        #     }
+        # )
+
+        # df = df.append(df_mega)
+
+        new_df = geo_map(df, "msa", MAP_DF, 'new_counts')
 
         exp_incidence = np.array([2, 13]) / np.array([300, 25]) * 100000
         exp_cprop = np.array([45, 60]) / np.array([300, 25]) * 100000
```
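The np.Inf values these tests expect are not JHU-specific; they fall out of two pandas defaults. groupby(...).sum() skips NaN, so the megacounties' NaN populations sum to 0, and dividing a positive count by zero then yields inf. A minimal reproduction with toy data:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "geo_id": ["01000", "01000"],      # megacounty rows with unknown population
    "new_counts": [2, 3],
    "population": [np.nan, np.nan],
})

summed = df.groupby("geo_id").sum()    # NaN is skipped, so population sums to 0.0
incidence = summed["new_counts"] / summed["population"] * 100000

assert summed["population"].iloc[0] == 0.0
assert np.isinf(incidence.iloc[0])     # 5 / 0 -> inf, as the asserts above expect
```

This is also why geo_map excludes the megacounty rows from the proportion-type sensors listed in PROP_SENSORS.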
