
update code for unassigned cases/deaths #68

Merged: 18 commits, Jun 19, 2020
Changes shown from 8 commits

31 changes: 28 additions & 3 deletions jhu/delphi_jhu/geo.py
@@ -89,6 +89,10 @@

FIPS_TO_STATE = {v: k.lower() for k, v in STATE_TO_FIPS.items()}

# Map fake FIPS codes for unassigned cases to state mega-county FIPS codes

JHU_FAKE_FIPS_TO_MEGA_FIPS = {f'900{x}' : f'{x}000' for x in STATE_TO_FIPS.values()}
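# For example, with Alabama's state FIPS '01' this yields the entry
# '90001': '01000', so JHU's fake FIPS 90001 (unassigned cases in Alabama)
# maps onto the Alabama mega-county FIPS 01000.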


def fips_to_state(fips: str) -> str:
"""Wrapper that handles exceptions to the FIPS scheme in the JHU data.
@@ -148,7 +152,7 @@ def disburse(df: pd.DataFrame, pooled_fips: str, fips_list: list):
return df


def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str):
"""
Maps a DataFrame df, which contains data at the county resolution, and
aggregates it to the geographic resolution geo_res.
@@ -162,22 +166,38 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
('county', 'state', 'msa', 'hrr').
map_df: pd.DataFrame
Loaded from static file "fips_prop_pop.csv".
sensor: str
Sensor type. Valid options:
("new_counts", "cumulative_counts",
"incidence", "cumulative_prop")

Returns
-------
pd.DataFrame
Columns: geo_id, timestamp, ...
"""
VALID_GEO_RES = ("county", "state", "msa", "hrr")
# It is not clear how to calculate proportions for unassigned cases/deaths, so megacounties are excluded from proportion sensors
PROP_SENSORS = ("incidence", "cumulative_prop")
if geo_res not in VALID_GEO_RES:
raise ValueError(f"geo_res must be one of {VALID_GEO_RES}")
df = df.copy()

df_mega = df[df['fips'].astype(int) >= 90001].copy()
df_mega['geo_id'] = df_mega['fips'].apply(lambda x: JHU_FAKE_FIPS_TO_MEGA_FIPS[x])

df = df[df['fips'].astype(int) < 90001].copy()

if geo_res == "county":
df["geo_id"] = df["fips"]
if sensor not in PROP_SENSORS:
df = df.append(df_mega)
elif geo_res == "state":
# Grab first two digits of fips
# Map state fips to us postal code
df["geo_id"] = df["fips"].apply(fips_to_state)
df["geo_id"] = df["fips"]
# Add unassigned cases/deaths
df = df.append(df_mega)
df["geo_id"] = df["geo_id"].apply(fips_to_state)
elif geo_res in ("msa", "hrr"):
# Disburse Dukes & Nantucket to individual counties
df = disburse(df, DN_FIPS, DN_COUNTY_FIPS)
@@ -200,8 +220,13 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
merged["new_counts"] = merged["new_counts"] * merged["pop_prop"]
merged["population"] = merged["population"] * merged["pop_prop"]
df = merged.drop(["zip", "pop_prop", "hrrnum", "cbsa_id"], axis=1)
if sensor not in PROP_SENSORS:
df_mega["geo_id"] = df_mega["geo_id"].apply(fips_to_state)
df = df.append(df_mega)
df = df.drop("fips", axis=1)
df = df.groupby(["geo_id", "timestamp"]).sum().reset_index()

# Proportion values are not valid for megacounties and are not used in the main function
df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE
df["cumulative_prop"] = df["cumulative_counts"] / df["population"] * INCIDENCE_BASE
return df
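
For intuition, here is a minimal, self-contained sketch of the state-level flow added above (illustration only: the county FIPS, counts, and populations are made up, and a two-character slice stands in for fips_to_state):

import numpy as np
import pandas as pd

# Illustration of the megacounty handling in geo_map (state resolution).
df = pd.DataFrame({
    "fips": ["13027", "90013"],                  # a Georgia county + Georgia unassigned
    "timestamp": ["2020-02-15", "2020-02-15"],
    "new_counts": [5, 8],
    "cumulative_counts": [30, 80],
    "population": [2100, np.nan],                # population is NaN for fake FIPS
})
df_mega = df[df["fips"].astype(int) >= 90001].copy()
df_mega["geo_id"] = df_mega["fips"].map({"90013": "13000"})  # JHU_FAKE_FIPS_TO_MEGA_FIPS
df = df[df["fips"].astype(int) < 90001].copy()
df["geo_id"] = df["fips"]
df = df.append(df_mega)                          # pre-pandas-2.0 API, as in the diff
df["geo_id"] = df["geo_id"].str[:2]              # stand-in for fips_to_state; both rows -> "13"
df = df.drop("fips", axis=1)
out = df.groupby(["geo_id", "timestamp"]).sum().reset_index()
# out has one row: new_counts == 13 and cumulative_counts == 110, i.e. the
# unassigned counts fold into the state's totals, while the NaN population is
# skipped by the sum and the state population stays 2100.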
8 changes: 6 additions & 2 deletions jhu/delphi_jhu/pull.py
@@ -62,7 +62,7 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFrame:
MIN_FIPS = 1000
MAX_FIPS = 57000
EXTRA_FIPS = (
72, # Puerto Rico (provided as the entire state)
72, # Puerto Rico (provided as the entire state)
70002, # Kansas City, MO
70003, # Dukes and Nantucket Counties, MA
)
@@ -79,9 +79,13 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFrame:
& (df["FIPS"] < MAX_FIPS)
) # "Uncategorized", etc.
| df["FIPS"].isin(EXTRA_FIPS)
# Keep fake FIPS codes (90001-90056) for unassigned cases
| np.logical_and(df['FIPS'] >= 90001,
df['FIPS'] <= 90056)
]
# Merge in population LOWERCASE, consistent across confirmed and deaths
df = pd.merge(df, pop_df, on="FIPS")
# Left merge leaves population as NaN for fake FIPS codes
df = pd.merge(df, pop_df, on="FIPS", how='left')

# Manual correction for PR
df.loc[df["FIPS"] == 72, "FIPS"] = 72000
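
A minimal sketch (with made-up numbers) of why the merge switches to how='left': the fake FIPS codes have no rows in pop_df, so an inner merge would drop them, while a left merge keeps them with population left as NaN:

import pandas as pd

# Illustration only; the population figure is invented.
df = pd.DataFrame({"FIPS": [1001, 90001], "cumulative_counts": [10, 3]})
pop_df = pd.DataFrame({"FIPS": [1001], "population": [55000]})

inner = pd.merge(df, pop_df, on="FIPS")             # old behavior: the 90001 row is dropped
left = pd.merge(df, pop_df, on="FIPS", how="left")  # new behavior: 90001 kept, population NaN
assert 90001 not in inner["FIPS"].values
assert left.loc[left["FIPS"] == 90001, "population"].isna().all()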
2 changes: 1 addition & 1 deletion jhu/delphi_jhu/run.py
@@ -77,7 +77,7 @@ def run_module():
print(geo_res, metric, sensor, smoother)
df = dfs[metric]
# Aggregate to appropriate geographic resolution
df = geo_map(df, geo_res, map_df)
df = geo_map(df, geo_res, map_df, sensor)
df["val"] = SMOOTHERS_MAP[smoother][0](df[sensor].values)
df["se"] = np.nan
df["sample_size"] = np.nan
121 changes: 90 additions & 31 deletions jhu/tests/test_geo.py
@@ -25,6 +25,13 @@ def test_normal(self):
assert fips_to_state("12003") == "fl"
assert fips_to_state("50103") == "vt"
assert fips_to_state("15003") == "hi"

def test_mega(self):

assert fips_to_state("01000") == "al"
assert fips_to_state("13000") == "ga"
assert fips_to_state("44000") == "ri"
assert fips_to_state("12000") == "fl"


class TestDisburse:
@@ -74,15 +81,27 @@ def test_county(self):
}
)

new_df = geo_map(df, "county", MAP_DF)
df_mega = pd.DataFrame(
{
"fips": ["90013", "90001"],
"timestamp": ["2020-02-15", "2020-02-15"],
"new_counts": [8, 2],
"cumulative_counts": [80, 12],
"population": [np.nan, np.nan],
}
)

df = df.append(df_mega)

new_df = geo_map(df, "county", MAP_DF, 'new_counts')

exp_incidence = df["new_counts"] / df["population"] * 100000
exp_cprop = df["cumulative_counts"] / df["population"] * 100000

assert set(new_df["geo_id"].values) == set(df["fips"].values)
assert set(new_df["geo_id"].values) == set(['01000', '13000', '48027', '50103', '53003'])
assert set(new_df["timestamp"].values) == set(df["timestamp"].values)
assert set(new_df["incidence"].values) == set(exp_incidence.values)
assert set(new_df["cumulative_prop"].values) == set(exp_cprop.values)
assert set(new_df["incidence"].values) - set(exp_incidence.values) == set([np.Inf])
assert set(new_df["cumulative_prop"].values) - set(exp_cprop.values) == set([np.Inf])

def test_state(self):

@@ -95,19 +114,31 @@ def test_state(self):
"population": [100, 2100, 300, 25],
}
)

df_mega = pd.DataFrame(
{
"fips": ["90013", "90001", "04000", "25000"],
"timestamp": ["2020-02-15", "2020-02-15", "2020-02-15", "2020-02-15"],
"new_counts": [8, 2, 5, 10],
"cumulative_counts": [80, 12, 30, 100],
"population": [np.nan, np.nan, np.nan, np.nan],
}
)

df = df.append(df_mega)

new_df = geo_map(df, "state", MAP_DF)
new_df = geo_map(df, "state", MAP_DF, 'new_counts')

exp_incidence = np.array([27, 13]) / np.array([2500, 25]) * 100000
exp_cprop = np.array([165, 60]) / np.array([2500, 25]) * 100000
exp_incidence = np.array([27 + 5, 13 + 10]) / np.array([2500, 25]) * 100000
exp_cprop = np.array([165 + 30, 60 + 100]) / np.array([2500, 25]) * 100000

assert (new_df["geo_id"].values == ["az", "ma"]).all()
assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all()
assert (new_df["new_counts"].values == [27, 13]).all()
assert (new_df["cumulative_counts"].values == [165, 60]).all()
assert (new_df["population"].values == [2500, 25]).all()
assert (new_df["incidence"].values == exp_incidence).all()
assert (new_df["cumulative_prop"].values == exp_cprop).all()
assert set(new_df["geo_id"].values) == set(["az", "ma", "al", "ga"])
assert set(new_df["timestamp"].values) == set(["2020-02-15"])
assert set(new_df["new_counts"].values) == set([32, 23, 2, 8])
assert set(new_df["cumulative_counts"].values) == set([195, 160, 80, 12])
assert set(new_df["population"].values) == set([2500, 25, 0])
assert set(new_df["incidence"].values) - set(exp_incidence) == set([np.Inf])
assert set(new_df["cumulative_prop"].values) - set(exp_cprop) == set([np.Inf])

def test_hrr(self):

@@ -121,18 +152,32 @@ def test_hrr(self):
}
)

new_df = geo_map(df, "hrr", MAP_DF)
df_mega = pd.DataFrame(
{
"fips": ["90013", "90001"],
"timestamp": ["2020-02-15", "2020-02-15"],
"new_counts": [8, 2],
"cumulative_counts": [80, 12],
"population": [np.nan, np.nan],
}
)

df = df.append(df_mega)

new_df = geo_map(df, "hrr", MAP_DF, 'new_counts')

exp_incidence = np.array([13, 27]) / np.array([25, 2500]) * 100000
exp_cprop = np.array([60, 165]) / np.array([25, 2500]) * 100000

assert (new_df["geo_id"].values == [110, 147]).all()
assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all()
assert new_df["new_counts"].values == pytest.approx([13.0, 27.0])
assert new_df["cumulative_counts"].values == pytest.approx([60, 165])
assert new_df["population"].values == pytest.approx([25, 2500])
assert new_df["incidence"].values == pytest.approx(exp_incidence)
assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop)
assert set(new_df["geo_id"].values) == set([110, 147, "al", "ga"])
assert set(new_df["timestamp"].values) == set(["2020-02-15"])
assert new_df["new_counts"].values == pytest.approx([13.0, 27.0, 2, 8])
assert new_df["cumulative_counts"].values == pytest.approx([60, 165, 12, 80])
assert new_df["population"].values == pytest.approx([25, 2500, 0, 0])
assert new_df["incidence"].values == pytest.approx(np.append(exp_incidence,
[np.Inf, np.Inf]))
assert new_df["cumulative_prop"].values == pytest.approx(np.append(exp_cprop,
[np.Inf, np.Inf]))

def test_msa(self):

@@ -145,16 +190,30 @@ def test_msa(self):
"population": [100, 2100, 300, 25],
}
)

df_mega = pd.DataFrame(
{
"fips": ["90013", "90001"],
"timestamp": ["2020-02-15", "2020-02-15"],
"new_counts": [8, 2],
"cumulative_counts": [80, 12],
"population": [np.nan, np.nan],
}
)

df = df.append(df_mega)

new_df = geo_map(df, "msa", MAP_DF)
new_df = geo_map(df, "msa", MAP_DF, 'new_counts')

exp_incidence = np.array([2, 13]) / np.array([300, 25]) * 100000
exp_cprop = np.array([45, 60]) / np.array([300, 25]) * 100000

assert (new_df["geo_id"].values == [31420, 49340]).all()
assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all()
assert new_df["new_counts"].values == pytest.approx([2.0, 13.0])
assert new_df["cumulative_counts"].values == pytest.approx([45, 60])
assert new_df["population"].values == pytest.approx([300, 25])
assert new_df["incidence"].values == pytest.approx(exp_incidence)
assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop)
assert set(new_df["geo_id"].values) == set([31420, 49340, "al", "ga"])
assert set(new_df["timestamp"]) == set(["2020-02-15"])
assert new_df["new_counts"].values == pytest.approx([2.0, 13.0, 2.0, 8.0])
assert new_df["cumulative_counts"].values == pytest.approx([45, 60, 12, 80])
assert new_df["population"].values == pytest.approx([300, 25, 0, 0])
assert new_df["incidence"].values == pytest.approx(np.append(exp_incidence,
[np.Inf, np.Inf]))
assert new_df["cumulative_prop"].values == pytest.approx(np.append(exp_cprop,
[np.Inf, np.Inf]))