Skip to content

Commit b39d55f

Browse files
Jingjing Tang and krivard
Jingjing Tang
authored and committed
updated code for using geo utils
1 parent 6a7fd65 commit b39d55f

File tree

3 files changed

+70
-109
lines changed

3 files changed

+70
-109
lines changed
Lines changed: 45 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -1,77 +1,49 @@
11
"""Contains geographic mapping tools."""
2-
3-
def zip_to_msa(data, map_df):
4-
"""Map from zipcode to MSA (along with parent state).
5-
Args:
6-
data: dataframe at the day-zip resolution.
7-
Returns:
8-
tuple, a dataframe at day-msa, with parent state column, and their string keys
2+
from delphi_utils import GeoMapper
3+
4+
date_col = "timestamp"
5+
data_cols = ['totalTest', 'numUniqueDevices', 'positiveTest', "population"]
6+
gmpr = GeoMapper() # Use geo utils
7+
GEO_KEY_DICT = {
8+
"county": "fips",
9+
"msa": "msa",
10+
"hrr": "hrr",
11+
"state": "state_id"
12+
}
13+
def geo_map(geo_res, df):
14+
data = df.copy()
15+
geo_key = GEO_KEY_DICT[geo_res]
16+
# Add population for each zipcode
17+
data = gmpr.add_population_column(data, "zip")
18+
# zip -> geo_res
19+
data = gmpr.replace_geocode(data, "zip", geo_key,
20+
date_col=date_col, data_cols=data_cols)
21+
if geo_res == "state":
22+
return data
23+
# Add parent state
24+
data = add_parent_state(data, geo_res, geo_key)
25+
return data, geo_key
26+
27+
def add_parent_state(data, geo_res, geo_key):
928
"""
10-
# zip -> msa
11-
zip_map = map_df[["zip", "cbsa_id"]].dropna().drop_duplicates()
12-
# forget about the rest of the zips that aren't in MSA
13-
data = data.merge(zip_map, how="left", on="zip").dropna().drop(columns=["zip"], axis=1)
14-
15-
# msa + parent state
16-
# msa_map has mapping from msa to state, going by the state with the largest
17-
# population (since a msa may span multiple states)
18-
msa_map = map_df[["cbsa_id", "state_id", "population"]]
19-
msa_map = msa_map.groupby(["cbsa_id"]).max().reset_index()
20-
data = data.merge(msa_map, how="left", on="cbsa_id").drop(
21-
columns=["population"]).dropna()
22-
data = data.groupby(["timestamp", "cbsa_id", "state_id"]).sum().reset_index()
23-
data["cbsa_id"] = data["cbsa_id"].apply(lambda x: str(int(x)).zfill(5))
24-
25-
return data, "cbsa_id"
26-
27-
def zip_to_hrr(data, map_df):
28-
"""Map from zipcode to HRR (along with parent state).
29-
Args:
30-
data: dataframe at the day-zip resolution.
31-
Returns:
32-
tuple, a dataframe at day-msa, with parent state column, and their string keys
29+
- map from msa/hrr to state, going by the state with the largest
30+
population (since a msa/hrr may span multiple states)
31+
- map from county to the corresponding state
3332
"""
34-
# zip -> msa
35-
zip_map = map_df[["zip", "hrrnum"]].dropna().drop_duplicates()
36-
# forget about the rest of the zips that aren't in MSA
37-
data = data.merge(zip_map, how="left", on="zip").dropna().drop(columns=["zip"], axis=1)
38-
39-
# msa + parent state
40-
# msa_map has mapping from msa to state, going by the state with the largest
41-
# population (since a msa may span multiple states)
42-
msa_map = map_df[["hrrnum", "state_id", "population"]]
43-
msa_map = msa_map.groupby(["hrrnum"]).max().reset_index()
44-
data = data.merge(msa_map, how="left", on="hrrnum").drop(
33+
fips_to_state = gmpr._load_crosswalk(from_code="fips", to_code="state")
34+
if geo_res == "county":
35+
mix_map = fips_to_state[["fips", "state_id"]]
36+
else:
37+
fips_to_geo_res = gmpr._load_crosswalk(from_code="fips", to_code=geo_res)
38+
mix_map = fips_to_geo_res[["fips", geo_res]].merge(
39+
fips_to_state[["fips", "state_id"]],
40+
on="fips",
41+
how="inner")
42+
mix_map = gmpr.add_population_column(mix_map, "fips").groupby(
43+
geo_res).max().reset_index().drop(
44+
["fips", "population"], axis = 1)
45+
# Merge the info of parent state to the data
46+
data = data.merge(mix_map, how="left", on=geo_key).drop(
4547
columns=["population"]).dropna()
46-
data = data.groupby(["timestamp", "hrrnum", "state_id"]).sum().reset_index()
47-
data["hrrnum"] = data["hrrnum"].astype(int)
48-
49-
return data, "hrrnum"
50-
51-
def zip_to_county(data, map_df):
52-
"""Aggregate zip codes to the county resolution, along with its parent state.
53-
Args:
54-
data: dataframe aggregated to the day-zip resolution
55-
Returns:
56-
dataframe at the day-county resolution and parent state, with their string keys
57-
"""
58-
# zip -> county + parent state (county has unique state)
59-
zip_map = map_df[["fips", "zip", "state_id"]].dropna().drop_duplicates()
60-
data = data.merge(zip_map, how="left", on="zip").drop(columns=["zip"]).dropna()
61-
data = data.groupby(["timestamp", "fips", "state_id"]).sum().reset_index()
62-
data["fips"] = data["fips"].apply(lambda x: str(int(x)).zfill(5))
63-
64-
return data, "fips"
65-
66-
def zip_to_state(data, map_df):
67-
"""Aggregate zip codes to the state resolution.
68-
Args:
69-
data: dataframe aggregated to the day-zip resolution
70-
Returns:
71-
dataframe at the day-state resolution, with the state key
72-
"""
73-
zip_map = map_df[["zip", "state_id"]].dropna().drop_duplicates()
74-
data = data.merge(zip_map, how="left", on="zip").drop(
75-
columns=["zip"]).dropna()
76-
data = data.groupby(["timestamp", "state_id"]).sum().reset_index()
77-
return data
48+
data = data.groupby(["timestamp", geo_key, "state_id"]).sum().reset_index()
49+
return data

quidel_covidtest/delphi_quidel_covidtest/run.py

Lines changed: 12 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import pandas as pd
1010
from delphi_utils import read_params, add_prefix
1111

12-
from .geo_maps import (zip_to_msa, zip_to_hrr, zip_to_county, zip_to_state)
12+
from .geo_maps import geo_map
1313
from .pull import (pull_quidel_covidtest,
1414
check_export_start_date,
1515
check_export_end_date,
@@ -29,12 +29,8 @@ def run_module():
2929
params = read_params()
3030
cache_dir = params["cache_dir"]
3131
export_dir = params["export_dir"]
32-
static_file_dir = params["static_file_dir"]
3332
export_start_date = params["export_start_date"]
3433
export_end_date = params["export_end_date"]
35-
map_df = pd.read_csv(
36-
join(static_file_dir, "fips_prop_pop.csv"), dtype={"fips": int}
37-
)
3834

3935
# Pull data and update export date
4036
df, _end_date = pull_quidel_covidtest(params)
@@ -50,14 +46,14 @@ def run_module():
5046

5147
# State Level
5248
data = df.copy()
53-
state_groups = zip_to_state(data, map_df).groupby("state_id")
49+
state_groups = geo_map("state", data).groupby("state_id")
5450

5551
# Add prefix, if required
5652
sensors = add_prefix(SENSORS,
5753
wip_signal=read_params()["wip_signal"],
5854
prefix="wip_")
5955
smoothers = SMOOTHERS.copy()
60-
56+
6157
for sensor in sensors:
6258
# For State Level
6359
print("state", sensor)
@@ -78,21 +74,15 @@ def run_module():
7874

7975
# County/HRR/MSA level
8076
for geo_res in GEO_RESOLUTIONS:
81-
print(geo_res, sensor)
82-
data = df.copy()
83-
if geo_res == COUNTY:
84-
data, res_key = zip_to_county(data, map_df)
85-
elif geo_res == MSA:
86-
data, res_key = zip_to_msa(data, map_df)
87-
else:
88-
data, res_key = zip_to_hrr(data, map_df)
89-
90-
res_df = generate_sensor_for_other_geores(
91-
state_groups, data, res_key, smooth=smoothers[sensor][1],
92-
device=smoothers[sensor][0], first_date=first_date,
93-
last_date=last_date)
94-
export_csv(res_df, geo_res, sensor, receiving_dir=export_dir,
95-
start_date=export_start_date, end_date=export_end_date)
77+
geo_data, res_key = geo_map(geo_res, data)
78+
for sensor in sensors:
79+
print(geo_res, sensor)
80+
res_df = generate_sensor_for_other_geores(
81+
state_groups, geo_data, res_key, smooth=smoothers[sensor][1],
82+
device=smoothers[sensor][0], first_date=first_date,
83+
last_date=last_date)
84+
export_csv(res_df, geo_res, sensor, receiving_dir=export_dir,
85+
start_date=export_start_date, end_date=export_end_date)
9686

9787
# Export the cache file if the pipeline runs successfully.
9888
# Otherwise, don't update the cache file

quidel_covidtest/tests/test_geo_maps.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,9 @@
66
import numpy as np
77

88

9-
from delphi_quidel_covidtest.geo_maps import (zip_to_msa, zip_to_hrr,
10-
zip_to_county, zip_to_state)
9+
from delphi_quidel_covidtest.geo_maps import geo_map
1110

1211

13-
map_df = pd.read_csv(
14-
join("../static", "fips_prop_pop.csv"), dtype={"fips": int}
15-
)
16-
1712
class TestGeoMap:
1813
def test_county(self):
1914

@@ -24,10 +19,11 @@ def test_county(self):
2419
"2020-06-15", "2020-06-15", "2020-06-15"],
2520
"totalTest": [100, 50, 200, 200, 250, 500],
2621
"positiveTest": [10, 8, 15, 5, 20, 50],
22+
"numUniqueDevices": [2, 1, 1, 1, 1, 1]
2723
}
2824
)
2925

30-
new_df, res_key = zip_to_county(df, map_df)
26+
new_df, res_key = geo_map("county", df)
3127

3228
assert res_key == 'fips'
3329
assert set(new_df["fips"].values) == set(['25027', '53011', '48439'])
@@ -44,10 +40,11 @@ def test_state(self):
4440
"2020-06-15", "2020-06-15", "2020-06-15"],
4541
"totalTest": [100, 50, 200, 200, 250, 500],
4642
"positiveTest": [10, 8, 15, 5, 20, 50],
43+
"numUniqueDevices": [2, 1, 1, 1, 1, 1]
4744
}
4845
)
4946

50-
new_df = zip_to_state(df, map_df)
47+
new_df = geo_map("state", df)
5148

5249
assert set(new_df["state_id"].values) == set(['ma', 'tx', 'wa'])
5350
assert set(new_df["timestamp"].values) == set(df["timestamp"].values)
@@ -63,12 +60,13 @@ def test_hrr(self):
6360
"2020-06-15", "2020-06-15", "2020-06-15"],
6461
"totalTest": [100, 50, 200, 200, 250, 500],
6562
"positiveTest": [10, 8, 15, 5, 20, 50],
63+
"numUniqueDevices": [2, 1, 1, 1, 1, 1]
6664
}
6765
)
6866

69-
new_df, res_key = zip_to_hrr(df, map_df)
67+
new_df, res_key = geo_map("hrr", df)
7068

71-
assert set(new_df["hrrnum"].values) == set([16, 231, 340, 344, 394])
69+
assert set(new_df["hrr"].values) == set(["16", "231", "340", "344", "394"])
7270
assert set(new_df["timestamp"].values) == set(df["timestamp"].values)
7371
assert set(new_df["totalTest"].values) == set([500, 100, 250, 50, 400])
7472
assert set(new_df["positiveTest"].values) == set([50, 10, 20, 8, 20])
@@ -77,18 +75,19 @@ def test_msa(self):
7775

7876
df = pd.DataFrame(
7977
{
80-
"zip": [1607, 73716, 73719, 76010, 74435, 74936],
78+
"zip": [1607, 73716, 73719, 76010, 74945, 74936],
8179
"timestamp": ["2020-06-15", "2020-06-15", "2020-06-15",
8280
"2020-06-15", "2020-06-15", "2020-06-15"],
8381
"totalTest": [100, 50, 200, 200, 250, 500],
8482
"positiveTest": [10, 8, 15, 5, 20, 50],
83+
"numUniqueDevices": [2, 1, 1, 1, 1, 1]
8584
}
8685
)
8786

88-
new_df, res_key = zip_to_msa(df, map_df)
87+
new_df, res_key = geo_map("msa", df)
8988

90-
assert res_key == 'cbsa_id'
91-
assert set(new_df["cbsa_id"].values) == set(['19100', '22900', '49340'])
89+
assert res_key == 'msa'
90+
assert set(new_df["msa"].values) == set(['19100', '22900', '49340'])
9291
assert set(new_df["timestamp"].values) == set(df["timestamp"].values)
9392
assert set(new_df["totalTest"].values) == set([200, 750, 100])
9493
assert set(new_df["positiveTest"].values) == set([5, 70, 10])

0 commit comments

Comments (0)