
Commit 5ff04c0

Jingjing Tang and krivard committed
Add handling of unassigned cases/deaths to jhu
Co-authored-by: krivard <[email protected]>
1 parent 6cb7310 commit 5ff04c0

File tree: 6 files changed, +230 -26 lines changed

jhu/delphi_jhu/geo.py

Lines changed: 28 additions & 3 deletions

@@ -89,6 +89,10 @@
 
 FIPS_TO_STATE = {v: k.lower() for k, v in STATE_TO_FIPS.items()}
 
+# Fake fips to States
+
+JHU_FAKE_FIPS_TO_MEGA_FIPS = {f'900{x}' : f'{x}000' for x in STATE_TO_FIPS.values()}
+
 
 def fips_to_state(fips: str) -> str:
     """Wrapper that handles exceptions to the FIPS scheme in the JHU data.

@@ -148,7 +152,7 @@ def disburse(df: pd.DataFrame, pooled_fips: str, fips_list: list):
     return df
 
 
-def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
+def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str):
     """
     Maps a DataFrame df, which contains data at the county resolution, and
     aggregate it to the geographic resolution geo_res.

@@ -162,22 +166,38 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
         ('county', 'state', 'msa', 'hrr').
     map_df: pd.DataFrame
         Loaded from static file "fips_prop_pop.csv".
+    sensor: str
+        sensor type. Valid options:
+        ("new_counts", "cumulative_counts",
+         "incidence", "cumulative_prop")
 
     Returns
     -------
     pd.DataFrame
         Columns: geo_id, timestamp, ...
     """
     VALID_GEO_RES = ("county", "state", "msa", "hrr")
+    # It is not clear how to calculate the proportion for unassigned cases/deaths
+    PROP_SENSORS = ("incidence", "cumulative_prop")
     if geo_res not in VALID_GEO_RES:
         raise ValueError(f"geo_res must be one of {VALID_GEO_RES}")
-    df = df.copy()
+
+    df_mega = df[df['fips'].astype(int) >= 90001].copy()
+    df_mega['geo_id'] = df_mega['fips'].apply(lambda x: JHU_FAKE_FIPS_TO_MEGA_FIPS[x])
+
+    df = df[df['fips'].astype(int) < 90001].copy()
+
     if geo_res == "county":
         df["geo_id"] = df["fips"]
+        if sensor not in PROP_SENSORS:
+            df = df.append(df_mega)
     elif geo_res == "state":
         # Grab first two digits of fips
         # Map state fips to us postal code
-        df["geo_id"] = df["fips"].apply(fips_to_state)
+        df["geo_id"] = df["fips"]
+        # Add unassigned cases/deaths
+        df = df.append(df_mega)
+        df["geo_id"] = df["geo_id"].apply(fips_to_state)
     elif geo_res in ("msa", "hrr"):
         # Disburse Dukes & Nantucket to individual counties
         df = disburse(df, DN_FIPS, DN_COUNTY_FIPS)

@@ -200,8 +220,13 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
         merged["new_counts"] = merged["new_counts"] * merged["pop_prop"]
         merged["population"] = merged["population"] * merged["pop_prop"]
         df = merged.drop(["zip", "pop_prop", "hrrnum", "cbsa_id"], axis=1)
+        # if sensor not in PROP_SENSORS:
+        #     df_mega["geo_id"] = df_mega["geo_id"].apply(fips_to_state)
+        #     df = df.append(df_mega)
     df = df.drop("fips", axis=1)
     df = df.groupby(["geo_id", "timestamp"]).sum().reset_index()
+
+    # Value would be negative for megacounties, which would not be considered in the main function
     df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE
     df["cumulative_prop"] = df["cumulative_counts"] / df["population"] * INCIDENCE_BASE
     return df
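The pseudo-FIPS scheme above works as follows: JHU files unassigned cases/deaths under codes of the form 900XX, where XX is the state FIPS, and JHU_FAKE_FIPS_TO_MEGA_FIPS reroutes each such code to the corresponding XX000 megacounty. A minimal sketch of the mapping, using a hypothetical three-state subset in place of the module's full STATE_TO_FIPS table:

# Toy subset standing in for the module's STATE_TO_FIPS (assumption: real table covers all states).
STATE_TO_FIPS = {"AL": "01", "GA": "13", "RI": "44"}

# Same comprehension as in the diff: fake FIPS 900XX -> megacounty FIPS XX000.
JHU_FAKE_FIPS_TO_MEGA_FIPS = {f'900{x}': f'{x}000' for x in STATE_TO_FIPS.values()}

print(JHU_FAKE_FIPS_TO_MEGA_FIPS)
# {'90001': '01000', '90013': '13000', '90044': '44000'}

Per the PROP_SENSORS gate, megacounty rows are withheld for the proportion sensors ("incidence", "cumulative_prop"), since unassigned counts have no population denominator.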

jhu/delphi_jhu/pull.py

Lines changed: 6 additions & 2 deletions

@@ -62,7 +62,7 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFr
     MIN_FIPS = 1000
     MAX_FIPS = 57000
     EXTRA_FIPS = (
-        72, # Puerto Rico (provided as the entire state)
+        72,    # Puerto Rico (provided as the entire state)
         70002, # Kansas City, MO
         70003, # Dukes and Nantucket Counties, MA
     )

@@ -79,9 +79,13 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFr
             & (df["FIPS"] < MAX_FIPS)
         ) # "Uncategorized", etc.
         | df["FIPS"].isin(EXTRA_FIPS)
+        # Get Fake FIPS for unassigned cases
+        | np.logical_and(df['FIPS'] >= 90001,
+                         df['FIPS'] <= 90056)
     ]
     # Merge in population LOWERCASE, consistent across confirmed and deaths
-    df = pd.merge(df, pop_df, on="FIPS")
+    # Set population as NAN for fake fips
+    df = pd.merge(df, pop_df, on="FIPS", how='left')
 
     # Manual correction for PR
     df.loc[df["FIPS"] == 72, "FIPS"] = 72000
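The switch from an inner merge to how='left' is what keeps the fake-FIPS rows alive: they have no entry in pop_df, and an inner merge would silently drop them. A small self-contained sketch (toy frames and numbers, not the indicator's real population data):

import pandas as pd

df = pd.DataFrame({"FIPS": [1001, 90001], "new_counts": [5, 2]})  # 90001 = fake FIPS
pop_df = pd.DataFrame({"FIPS": [1001], "population": [55869]})    # no row for 90001

inner = pd.merge(df, pop_df, on="FIPS")             # old behavior: fake-FIPS row dropped
left = pd.merge(df, pop_df, on="FIPS", how="left")  # new behavior: kept, population NaN
print(len(inner), len(left))          # 1 2
print(left["population"].tolist())    # [55869.0, nan]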

jhu/delphi_jhu/run.py

Lines changed: 1 addition & 1 deletion

@@ -77,7 +77,7 @@ def run_module():
         print(geo_res, metric, sensor, smoother)
         df = dfs[metric]
         # Aggregate to appropriate geographic resolution
-        df = geo_map(df, geo_res, map_df)
+        df = geo_map(df, geo_res, map_df, sensor)
         df["val"] = SMOOTHERS_MAP[smoother][0](df[sensor].values)
         df["se"] = np.nan
         df["sample_size"] = np.nan

jhu/tests/receiving/.gitignore

Lines changed: 120 additions & 0 deletions

@@ -0,0 +1,120 @@
+# You should hard commit a prototype for this file, but we
+# want to avoid accidental adding of API tokens and other
+# private data parameters
+params.json
+
+# Do not commit output files
+receiving/*.csv
+
+# Remove macOS files
+.DS_Store
+
+# virtual environment
+dview/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+coverage.xml
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+.static_storage/
+.media/
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/

jhu/tests/test_geo.py

Lines changed: 74 additions & 19 deletions

@@ -25,6 +25,13 @@ def test_normal(self):
         assert fips_to_state("12003") == "fl"
         assert fips_to_state("50103") == "vt"
         assert fips_to_state("15003") == "hi"
+
+    def test_mega(self):
+
+        assert fips_to_state("01000") == "al"
+        assert fips_to_state("13000") == "ga"
+        assert fips_to_state("44000") == "ri"
+        assert fips_to_state("12000") == "fl"
 
 
 class TestDisburse:

@@ -60,7 +67,7 @@ def test_incorrect_geo(self):
         )
 
         with pytest.raises(ValueError):
-            geo_map(df, "département", MAP_DF)
+            geo_map(df, "département", MAP_DF, 'new_counts')
 
     def test_county(self):
 

@@ -74,15 +81,27 @@ def test_county(self):
             }
         )
 
-        new_df = geo_map(df, "county", MAP_DF)
+        df_mega = pd.DataFrame(
+            {
+                "fips": ["90013", "90001"],
+                "timestamp": ["2020-02-15", "2020-02-15"],
+                "new_counts": [8, 2],
+                "cumulative_counts": [80, 12],
+                "population": [np.nan, np.nan],
+            }
+        )
+
+        df = df.append(df_mega)
+
+        new_df = geo_map(df, "county", MAP_DF, 'new_counts')
 
         exp_incidence = df["new_counts"] / df["population"] * 100000
         exp_cprop = df["cumulative_counts"] / df["population"] * 100000
-
-        assert set(new_df["geo_id"].values) == set(df["fips"].values)
+
+        assert set(new_df["geo_id"].values) == set(['01000', '13000', '48027', '50103', '53003'])
         assert set(new_df["timestamp"].values) == set(df["timestamp"].values)
-        assert set(new_df["incidence"].values) == set(exp_incidence.values)
-        assert set(new_df["cumulative_prop"].values) == set(exp_cprop.values)
+        assert set(new_df["incidence"].values) - set(exp_incidence.values) == set([np.Inf])
+        assert set(new_df["cumulative_prop"].values) - set(exp_cprop.values) == set([np.Inf])
 
     def test_state(self):
 

@@ -95,19 +114,31 @@ def test_state(self):
                 "population": [100, 2100, 300, 25],
             }
         )
+
+        df_mega = pd.DataFrame(
+            {
+                "fips": ["90013", "90001", "04000", "25000"],
+                "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15", "2020-02-15"],
+                "new_counts": [8, 2, 5, 10],
+                "cumulative_counts": [80, 12, 30, 100],
+                "population": [np.nan, np.nan, np.nan, np.nan],
+            }
+        )
+
+        df = df.append(df_mega)
 
-        new_df = geo_map(df, "state", MAP_DF)
+        new_df = geo_map(df, "state", MAP_DF, 'new_counts')
 
-        exp_incidence = np.array([27, 13]) / np.array([2500, 25]) * 100000
-        exp_cprop = np.array([165, 60]) / np.array([2500, 25]) * 100000
+        exp_incidence = np.array([27 + 5, 13 + 10]) / np.array([2500, 25]) * 100000
+        exp_cprop = np.array([165 + 30, 60 + 100]) / np.array([2500, 25]) * 100000
 
-        assert (new_df["geo_id"].values == ["az", "ma"]).all()
-        assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all()
-        assert (new_df["new_counts"].values == [27, 13]).all()
-        assert (new_df["cumulative_counts"].values == [165, 60]).all()
-        assert (new_df["population"].values == [2500, 25]).all()
-        assert (new_df["incidence"].values == exp_incidence).all()
-        assert (new_df["cumulative_prop"].values == exp_cprop).all()
+        assert set(new_df["geo_id"].values) == set(["az", "ma", "al", "ga"])
+        assert set(new_df["timestamp"].values) == set(["2020-02-15"])
+        assert set(new_df["new_counts"].values) == set([32, 23, 2, 8])
+        assert set(new_df["cumulative_counts"].values) == set([195, 160, 12, 80])
+        assert set(new_df["population"].values) == set([2500, 25, 0])
+        assert set(new_df["incidence"].values) - set(exp_incidence) == set([np.Inf])
+        assert set(new_df["cumulative_prop"].values) - set(exp_cprop) == set([np.Inf])
 
     def test_hrr(self):
 

@@ -121,7 +152,19 @@ def test_hrr(self):
             }
         )
 
-        new_df = geo_map(df, "hrr", MAP_DF)
+        # df_mega = pd.DataFrame(
+        #     {
+        #         "fips": ["90013", "90001"],
+        #         "timestamp": ["2020-02-15", "2020-02-15"],
+        #         "new_counts": [8, 2],
+        #         "cumulative_counts": [80, 12],
+        #         "population": [np.nan, np.nan],
+        #     }
+        # )
+
+        # df = df.append(df_mega)
+
+        new_df = geo_map(df, "hrr", MAP_DF, 'new_counts')
 
         exp_incidence = np.array([13, 27]) / np.array([25, 2500]) * 100000
         exp_cprop = np.array([60, 165]) / np.array([25, 2500]) * 100000

@@ -145,8 +188,20 @@ def test_msa(self):
                 "population": [100, 2100, 300, 25],
             }
         )
-
-        new_df = geo_map(df, "msa", MAP_DF)
+
+        # df_mega = pd.DataFrame(
+        #     {
+        #         "fips": ["90013", "90001"],
+        #         "timestamp": ["2020-02-15", "2020-02-15"],
+        #         "new_counts": [8, 2],
+        #         "cumulative_counts": [80, 12],
+        #         "population": [np.nan, np.nan],
+        #     }
+        # )
+
+        # df = df.append(df_mega)
+
+        new_df = geo_map(df, "msa", MAP_DF, 'new_counts')
 
         exp_incidence = np.array([2, 13]) / np.array([300, 25]) * 100000
         exp_cprop = np.array([45, 60]) / np.array([300, 25]) * 100000
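The set-difference assertions against set([np.Inf]) follow from how geo_map aggregates: the appended megacounty rows carry population NaN, pandas' groupby(...).sum() skips NaN and sums to 0.0, and dividing a positive count by that zero yields inf (this is also why the state test expects 0 among the populations). A sketch of the mechanism, with a toy frame in place of the test fixtures:

import numpy as np
import pandas as pd

df = pd.DataFrame({"geo_id": ["01000", "01000"],
                   "new_counts": [8.0, 2.0],
                   "population": [np.nan, np.nan]})

agg = df.groupby("geo_id").sum().reset_index()
print(agg["population"].iloc[0])                        # 0.0 (NaN skipped in the sum)
print((agg["new_counts"] / agg["population"]).iloc[0])  # inf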

jhu/tests/test_smooth.py

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@ def test_output_files_smoothed(self, run_as_module):
 
         smoothed = pd.read_csv(
             join("receiving",
-                 f"{dates[-1]}_state_confirmed_7dav_cumulative_num.csv")
+                 f"{dates[-1]}_state_wip_confirmed_7dav_cumul_num.csv")
         )
 
         raw = pd.concat([
