Skip to content

Commit 9e10f00

Browse files
committed
Add population functions, zip to state, msa
1 parent 14f3085 commit 9e10f00

File tree

2 files changed

+168
-28
lines changed

2 files changed

+168
-28
lines changed

_delphi_utils_python/delphi_utils/geomap.py

Lines changed: 122 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ class GeoMapper:
6464
6565
The GeoMapper instance loads crosswalk tables from the package data_dir. The
6666
crosswalk tables are assumed to have been built using the geo_data_proc.py script
67-
in data_proc/geomap. If a mapping between codes is NOT one to many, then the table has
67+
in data_proc/geomap. If a mapping between codes is NOT one to many, then the table has
6868
just two columns. If the mapping IS one to many, then a third column, the weight column,
6969
exists (e.g. zip, fips, weight; satisfying (sum(weights) where zip==ZIP) == 1).
7070
@@ -73,7 +73,7 @@ class GeoMapper:
7373
- load_* : load a crosswalk table into the instance (e.g. zip to fips).
7474
- convert_* : add a new column to a dataframe by joining with a crosswalk table
7575
- *_to_* : replace a geo code column with another, using weighted sum aggregation where
76-
necessary (e.g. (sum(weights*count_column) groupby fips) would convert zip
76+
necessary (e.g. (sum(weights*count_column) groupby fips) would convert zip
7777
level data to fips level data)
7878
"""
7979

@@ -111,26 +111,40 @@ def load_crosswalk(self, from_code, to_code):
111111
("fips", "hrr"),
112112
]:
113113
self.crosswalks[from_code][to_code] = pd.read_csv(
114-
stream, dtype={from_code: str, to_code: str, "weight": float,},
114+
stream,
115+
dtype={
116+
from_code: str,
117+
to_code: str,
118+
"weight": float,
119+
},
115120
)
116121
# Unweighted crosswalks
117122
elif (from_code, to_code) in [
118123
("zip", "hrr"),
119124
("fips", "msa"),
120125
]:
121126
self.crosswalks[from_code][to_code] = pd.read_csv(
122-
stream, dtype={from_code: str, to_code: str},
127+
stream,
128+
dtype={from_code: str, to_code: str},
123129
)
124130
# Special table of state codes, state IDs, and state names
125131
elif (from_code, to_code) == ("state", "state"):
126132
self.crosswalks[from_code][to_code] = pd.read_csv(
127133
stream,
128-
dtype={"state_code": str, "state_id": str, "state_name": str,},
134+
dtype={
135+
"state_code": str,
136+
"state_id": str,
137+
"state_name": str,
138+
},
129139
)
130140
# Population tables
131141
elif (from_code, to_code) in [("fips", "pop"), ("zip", "pop")]:
132142
self.crosswalks[from_code][to_code] = pd.read_csv(
133-
stream, dtype={from_code: str, "pop": int,},
143+
stream,
144+
dtype={
145+
from_code: str,
146+
"pop": int,
147+
},
134148
)
135149
return self.crosswalks[from_code][to_code]
136150

@@ -200,7 +214,7 @@ def add_new_code(self, df, from_code, new_code, from_col=None, new_col=None):
200214
201215
Parameters
202216
---------
203-
df: pd.DataFrame
217+
df: pd.DataFrame
204218
Input dataframe.
205219
from_code: {'fips', 'zip', 'jhu_uid', 'state_code', 'state_id', 'state_name'}
206220
Specifies the geocode type of the data in from_col.
@@ -231,7 +245,9 @@ def add_new_code(self, df, from_code, new_code, from_col=None, new_col=None):
231245
crosswalk = self.load_crosswalk(from_code=from_code, to_code=new_code)
232246
crosswalk = crosswalk.rename(columns={from_code: from_col, new_code: new_col})
233247

234-
df = df.merge(crosswalk, left_on=from_col, right_on=from_col, how="left").dropna(subset=[new_col])
248+
df = df.merge(
249+
crosswalk, left_on=from_col, right_on=from_col, how="left"
250+
).dropna(subset=[new_col])
235251

236252
# Drop extra state columns
237253
state_codes = ["state_code", "state_id", "state_name"]
@@ -255,7 +271,7 @@ def convert_to_new_code(
255271
256272
Parameters
257273
---------
258-
df: pd.DataFrame
274+
df: pd.DataFrame
259275
Input dataframe.
260276
from_col: str
261277
Name of the column in data to match and remove.
@@ -311,7 +327,7 @@ def convert_fips_to_state_code(
311327
312328
Parameters
313329
---------
314-
data: pd.DataFrame
330+
data: pd.DataFrame
315331
Input dataframe.
316332
fips_col: str
317333
Name of FIPS column to convert in data.
@@ -361,13 +377,13 @@ def convert_fips_to_msa(
361377
362378
Parameters
363379
---------
364-
data: pd.DataFrame
380+
data: pd.DataFrame
365381
Input data.
366382
fips_col: str
367383
Name of dataframe column containing fips codes.
368384
date_col: str
369385
Name of dataframe column containing the dates.
370-
count_cols: str
386+
count_cols: str
371387
Name of dataframe column containing the data. If None (default) all non fips/date are used.
372388
msa_col: str
373389
Name of dataframe column to contain the msa codes.
@@ -387,8 +403,7 @@ def convert_fips_to_msa(
387403
data = data.merge(msa_table, left_on=fips_col, right_on="fips", how="left")
388404

389405
# Megacounty codes are 1, followed by up to 4 leading zeros, and ending with
390-
# two digits of the state's FIPS code.
391-
# TODO: Does this need to be improved?
406+
# two digits of the state's FIPS code.
392407
if create_mega:
393408
data_st = data.loc[data[msa_col].isna(), fips_col]
394409
data.loc[data[msa_col].isna(), msa_col] = "1" + data_st.astype(str).str[
@@ -404,7 +419,7 @@ def convert_fips_to_zip(
404419
405420
Parameters
406421
---------
407-
data: pd.DataFrame
422+
data: pd.DataFrame
408423
Input data.
409424
fips_col: str
410425
Name of dataframe column containing fips codes.
@@ -424,7 +439,9 @@ def convert_fips_to_zip(
424439
data[fips_col] = data[fips_col].astype(str).str.zfill(5)
425440

426441
cross = df.rename(columns={"zip": zip_col, "weight": weight_col})
427-
data = data.merge(cross, left_on=fips_col, right_on="fips", how="left").dropna(subset=[zip_col])
442+
data = data.merge(cross, left_on=fips_col, right_on="fips", how="left").dropna(
443+
subset=[zip_col]
444+
)
428445
return data
429446

430447
def convert_state_code_to_state_id(
@@ -582,6 +599,53 @@ def zip_to_state_code(
582599
data = data.groupby([date_col, state_code_col], dropna=False).sum()
583600
return data.reset_index()
584601

602+
def convert_zip_to_state_id(
    self,
    data,
    zip_col="zip",
    state_id_col="state_id",
    date_col="date",
    count_cols=None,
):
    """Add a state id column to a zip-level dataframe.

    Left-joins data with the zip -> state crosswalk. Every crosswalk row
    that matches a zip code yields one output row, so an input row may be
    repeated. The remaining crosswalk columns (the "weight" column read by
    zip_to_state_id) are carried along. Zip codes without a match are kept
    with missing values in the joined columns.

    Parameters
    ---------
    data: pd.DataFrame
        Input data. The caller's frame is not modified.
    zip_col: str
        Name of dataframe column containing zip codes.
    state_id_col: str
        Name of dataframe column to contain the state ids.
    date_col: str
        Name of dataframe column containing the dates.
    count_cols: list of str
        Names of the data columns to keep. If None (default) all columns
        of data are kept.

    Returns
    --------
    data: pd.DataFrame
        A new dataframe with the state id column (and the crosswalk weight)
        appended.
    """
    # `columns=` is required: a bare mapping passed to rename() relabels the
    # index and would silently ignore the requested column names.
    crosswalk = (
        self.load_crosswalk(from_code="zip", to_code="state")
        .drop(columns=["state_code", "state_name"])
        .rename(columns={"zip": zip_col, "state_id": state_id_col})
    )

    # Always work on a copy so the zero-padding below never leaks into the
    # caller's dataframe.
    if count_cols:
        data = data[[zip_col, date_col] + list(count_cols)].copy()
    else:
        data = data.copy()

    # Non-string zip codes are zero-padded to five characters so they can
    # match the string-typed crosswalk keys.
    if not is_string_dtype(data[zip_col]):
        data[zip_col] = data[zip_col].astype(str).str.zfill(5)

    return data.merge(crosswalk, on=zip_col, how="left")
623+
624+
def zip_to_state_id(
    self,
    data,
    zip_col="zip",
    state_id_col="state_id",
    date_col="date",
    count_cols=None,
):
    """Translate dataframe from zip to state id, aggregating weighted counts.

    The data is first joined to the zip -> state crosswalk with
    convert_zip_to_state_id. Each count column is then multiplied by the
    crosswalk weight and summed per (date, state id) pair.

    Parameters
    ---------
    data: pd.DataFrame
        Input data.
    zip_col: str
        Name of dataframe column containing zip codes.
    state_id_col: str
        Name of dataframe column to contain the state ids.
    date_col: str
        Name of dataframe column containing the dates.
    count_cols: list of str
        Names of the data columns to aggregate. If None (default) all
        columns other than the zip, date, state id and weight columns are
        used.

    Returns
    --------
    data: pd.DataFrame
        One row per (date, state id) pair. Because the groupby uses
        dropna=False, rows with a missing state id are grouped together
        rather than discarded.
    """
    data = self.convert_zip_to_state_id(
        data,
        zip_col=zip_col,
        state_id_col=state_id_col,
        date_col=date_col,
        count_cols=count_cols,
    )
    # Drop the column the caller named, not a hard-coded "zip".
    data = data.drop(columns=zip_col)

    if count_cols is None:
        excluded = {date_col, state_id_col, "weight"}
        count_cols = [col for col in data.columns if col not in excluded]
    else:
        count_cols = list(count_cols)

    # Weight every count by the share of the zip code lying in the state.
    data[count_cols] = data[count_cols].multiply(data["weight"], axis=0)
    data = data.drop(columns="weight")
    data = data.groupby([date_col, state_id_col], dropna=False).sum()
    return data.reset_index()
648+
585649
def fips_to_state_id(
586650
self,
587651
data,
@@ -594,13 +658,13 @@ def fips_to_state_id(
594658
595659
Parameters
596660
---------
597-
data: pd.DataFrame
661+
data: pd.DataFrame
598662
Input data.
599663
fips_col: str
600664
Name of dataframe column containing fips codes.
601665
date_col: str
602666
Name of dataframe column containing the dates.
603-
count_cols: str
667+
count_cols: str
604668
Name of dataframe column containing the data. If None (default) all non fips/date are used.
605669
state_id_col: str
606670
Name of dataframe column to contain the state codes.
@@ -629,20 +693,20 @@ def fips_to_msa(
629693
msa_col="msa",
630694
):
631695
"""Translate dataframe from fips to metropolitan statistical area (msa).
632-
696+
633697
The encoding we use is based on the most recent Census Bureau release of CBSA (March 2020)
634698
All counties not mapped to MSAs have msa encoded as 000XX where XX is the fips state code
635699
To see how the crosswalk table is derived look at _delphi_utils_python/data_proc/geomap/*
636700
637701
Parameters
638702
---------
639-
data: pd.DataFrame
703+
data: pd.DataFrame
640704
Input data.
641705
fips_col: str
642706
Name of dataframe column containing fips codes.
643707
date_col: str
644708
Name of dataframe column containing the dates.
645-
count_cols: str
709+
count_cols: str
646710
Name of dataframe column containing the data. If None (default) all non fips/date are used.
647711
msa_col: str
648712
Name of dataframe column to contain the msa codes.
@@ -661,9 +725,9 @@ def fips_to_msa(
661725
data.drop(fips_col, axis=1, inplace=True)
662726
data.dropna(axis=0, subset=[msa_col], inplace=True)
663727
if date_col:
664-
data = data.groupby([date_col, msa_col], dropna=False).sum()
728+
data = data.groupby([date_col, msa_col]).sum()
665729
else:
666-
data = data.groupby(msa_col, dropna=False).sum()
730+
data = data.groupby(msa_col).sum()
667731
return data.reset_index()
668732

669733
def zip_to_fips(
@@ -699,9 +763,9 @@ def zip_to_fips(
699763
data.drop([zip_col, "weight"], axis=1, inplace=True)
700764

701765
if date_col:
702-
data = data.groupby([date_col, fips_col], dropna=False).sum()
766+
data = data.groupby([date_col, fips_col]).sum()
703767
else:
704-
data = data.groupby(fips_col, dropna=False).sum()
768+
data = data.groupby(fips_col).sum()
705769
return data.reset_index()
706770

707771
def fips_to_megacounty(
@@ -750,7 +814,7 @@ def fips_to_megacounty(
750814
)
751815
data.set_index([fips_col, date_col], inplace=True)
752816
data = data.join(mega_data)
753-
data = data.reset_index().groupby([date_col, mega_col], dropna=False).sum()
817+
data = data.reset_index().groupby([date_col, mega_col]).sum()
754818
return data.reset_index()
755819

756820
def zip_to_hrr(
@@ -835,7 +899,7 @@ def jhu_uid_to_fips(
835899
data.dropna(subset=[fips_col], axis=0, inplace=True)
836900
data[count_cols] = data[count_cols].multiply(data["weight"], axis=0)
837901
data.drop([jhu_col, "weight"], axis=1, inplace=True)
838-
data = data.groupby([date_col, fips_col], dropna=False).sum()
902+
data = data.groupby([date_col, fips_col]).sum()
839903
return data.reset_index()
840904

841905
def fips_to_zip(
@@ -864,7 +928,7 @@ def fips_to_zip(
864928
data.drop(fips_col, axis=1, inplace=True)
865929
data[count_cols] = data[count_cols].multiply(data["weight"], axis=0)
866930
data.drop("weight", axis=1, inplace=True)
867-
data = data.groupby([date_col, zip_col], dropna=False).sum()
931+
data = data.groupby([date_col, zip_col]).sum()
868932
return data.reset_index()
869933

870934
def fips_to_hrr(
@@ -901,3 +965,33 @@ def fips_to_hrr(
901965
hrr_col=hrr_col,
902966
)
903967
return data
968+
969+
def add_population_column(self, data, geocode_type, geocode_col=None):
    """
    Appends a population column to a dataframe, based on the FIPS or ZIP code.

    Rows whose geocode has no entry in the population table are dropped.

    Parameters
    ---------
    data: pd.DataFrame
        The dataframe with a FIPS or ZIP code column. It is not modified.
    geocode_type: {"fips", "zip"}
        The type of the geocode contained in geocode_col.
    geocode_col: str, default None
        The name of the column containing the geocodes. If None, uses the geocode_type
        as the name.

    Returns
    --------
    data_with_pop: pd.DataFrame
        A dataframe with an integer "population" column appended.

    Raises
    --------
    ValueError
        If geocode_type is not "fips" or "zip".
    """
    if geocode_type not in ("fips", "zip"):
        raise ValueError(
            f"geocode_type must be 'fips' or 'zip', got {geocode_type!r}"
        )

    geocode_col = geocode_type if geocode_col is None else geocode_col
    # Rename the crosswalk key to the caller's column name so the merge does
    # not leave a duplicate geocode column in the output.
    pop_df = self.load_crosswalk(from_code=geocode_type, to_code="pop").rename(
        columns={geocode_type: geocode_col}
    )
    # merge() returns a new frame, so no explicit copy of data is needed.
    data_with_pop = (
        data.merge(pop_df, on=geocode_col, how="left")
        .dropna(subset=["pop"])
        .rename(columns={"pop": "population"})
    )
    # The left join introduces NaN (float) for unmatched rows; once those
    # are dropped the column can be cast back to int.
    data_with_pop["population"] = data_with_pop["population"].astype(int)
    return data_with_pop

_delphi_utils_python/tests/test_geomap.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,3 +208,49 @@ def test_fips_to_hrr(self):
208208
gmpr = GeoMapper()
209209
new_data = gmpr.fips_to_hrr(self.fips_data_3)
210210
assert new_data.shape == (2,4)
211+
212+
213+
def test_convert_zip_to_msa(self):
    """convert_zip_to_msa attaches msa codes and weights that preserve the total count."""
    gmpr = GeoMapper()
    new_data = gmpr.convert_zip_to_msa(self.zip_data)
    assert new_data["msa"][2] == "40900"
    weighted_total = new_data["count"].multiply(new_data["weight"]).sum()
    # The weighted sum is floating point, so compare with a tolerance (as the
    # sibling tests do) instead of exact equality.
    assert np.allclose(weighted_total, self.zip_data["count"].sum())
218+
219+
def test_zip_to_msa(self):
    """zip_to_msa aggregates the zip fixture into 6 rows without losing counts."""
    aggregated = GeoMapper().zip_to_msa(self.zip_data)
    row_count = aggregated.shape[0]
    assert row_count == 6
    expected_total = self.zip_data["count"].sum()
    assert np.allclose(aggregated["count"].sum(), expected_total)
224+
225+
def test_convert_zip_to_state_code(self):
    """convert_zip_to_state_code yields 12 rows whose weighted counts keep the total."""
    converted = GeoMapper().convert_zip_to_state_code(self.zip_data)
    row_count = converted.shape[0]
    assert row_count == 12
    weighted_total = converted["count"].multiply(converted["weight"]).sum()
    assert np.allclose(weighted_total, self.zip_data["count"].sum())
230+
231+
def test_zip_to_state_code(self):
    """zip_to_state_code aggregates the zip fixture into 4 rows without losing counts."""
    aggregated = GeoMapper().zip_to_state_code(self.zip_data)
    row_count = aggregated.shape[0]
    assert row_count == 4
    expected_total = self.zip_data["count"].sum()
    assert np.allclose(aggregated["count"].sum(), expected_total)
236+
237+
def test_convert_zip_to_state_id(self):
    """convert_zip_to_state_id yields 12 rows whose weighted counts keep the total."""
    converted = GeoMapper().convert_zip_to_state_id(self.zip_data)
    row_count = converted.shape[0]
    assert row_count == 12
    weighted_total = converted["count"].multiply(converted["weight"]).sum()
    assert np.allclose(weighted_total, self.zip_data["count"].sum())
242+
243+
def test_zip_to_state_id(self):
    """zip_to_state_id aggregates the zip fixture into 4 rows without losing counts."""
    aggregated = GeoMapper().zip_to_state_id(self.zip_data)
    row_count = aggregated.shape[0]
    assert row_count == 4
    expected_total = self.zip_data["count"].sum()
    assert np.allclose(aggregated["count"].sum(), expected_total)
248+
249+
def test_add_population_column(self):
    """add_population_column appends the expected populations for FIPS and ZIP data."""
    gmpr = GeoMapper()
    # Build zero-padded string codes on copies: assigning into the fixtures
    # reached through self would change the data the other tests read.
    # NOTE(review): presumably these fixtures are class-level attributes - confirm.
    fips_data = self.fips_data_3.assign(
        fips=self.fips_data_3["fips"].astype(str).str.zfill(5)
    )
    zip_data = self.zip_data.assign(
        zip=self.zip_data["zip"].astype(str).str.zfill(5)
    )
    new_data = gmpr.add_population_column(fips_data, "fips")
    assert new_data["population"].sum() == 268155
    new_data = gmpr.add_population_column(zip_data, "zip")
    assert new_data["population"].sum() == 255160

0 commit comments

Comments
 (0)