@@ -64,7 +64,7 @@ class GeoMapper:
64
64
65
65
The GeoMapper instance loads crosswalk tables from the package data_dir. The
66
66
crosswalk tables are assumed to have been built using the geo_data_proc.py script
67
- in data_proc/geomap. If a mapping between codes is NOT one to many, then the table has
67
+ in data_proc/geomap. If a mapping between codes is NOT one to many, then the table has
68
68
just two columns. If the mapping IS one to many, then a third column, the weight column,
69
69
exists (e.g. zip, fips, weight; satisfying (sum(weights) where zip==ZIP) == 1).
70
70
@@ -73,7 +73,7 @@ class GeoMapper:
73
73
- load_* : load a crosswalk table into the instance (e.g. zip to fips).
74
74
- convert_* : add a new column to a dataframe by joining with a crosswalk table
75
75
- *_to_* : replace a geo code column with another, using weighted sum aggregation where
76
- necessary (e.g. (sum(weights*count_column) groupby fips) would convert zip
76
+ necessary (e.g. (sum(weights*count_column) groupby fips) would convert zip
77
77
level data to fips level data)
78
78
"""
79
79
@@ -111,26 +111,40 @@ def load_crosswalk(self, from_code, to_code):
111
111
("fips" , "hrr" ),
112
112
]:
113
113
self .crosswalks [from_code ][to_code ] = pd .read_csv (
114
- stream , dtype = {from_code : str , to_code : str , "weight" : float ,},
114
+ stream ,
115
+ dtype = {
116
+ from_code : str ,
117
+ to_code : str ,
118
+ "weight" : float ,
119
+ },
115
120
)
116
121
# Unweighted crosswalks
117
122
elif (from_code , to_code ) in [
118
123
("zip" , "hrr" ),
119
124
("fips" , "msa" ),
120
125
]:
121
126
self .crosswalks [from_code ][to_code ] = pd .read_csv (
122
- stream , dtype = {from_code : str , to_code : str },
127
+ stream ,
128
+ dtype = {from_code : str , to_code : str },
123
129
)
124
130
# Special table of state codes, state IDs, and state names
125
131
elif (from_code , to_code ) == ("state" , "state" ):
126
132
self .crosswalks [from_code ][to_code ] = pd .read_csv (
127
133
stream ,
128
- dtype = {"state_code" : str , "state_id" : str , "state_name" : str ,},
134
+ dtype = {
135
+ "state_code" : str ,
136
+ "state_id" : str ,
137
+ "state_name" : str ,
138
+ },
129
139
)
130
140
# Population tables
131
141
elif (from_code , to_code ) in [("fips" , "pop" ), ("zip" , "pop" )]:
132
142
self .crosswalks [from_code ][to_code ] = pd .read_csv (
133
- stream , dtype = {from_code : str , "pop" : int ,},
143
+ stream ,
144
+ dtype = {
145
+ from_code : str ,
146
+ "pop" : int ,
147
+ },
134
148
)
135
149
return self .crosswalks [from_code ][to_code ]
136
150
@@ -200,7 +214,7 @@ def add_new_code(self, df, from_code, new_code, from_col=None, new_col=None):
200
214
201
215
Parameters
202
216
---------
203
- df: pd.DataFrame
217
+ df: pd.DataFrame
204
218
Input dataframe.
205
219
from_code: {'fips', 'zip', 'jhu_uid', 'state_code', 'state_id', 'state_name'}
206
220
Specifies the geocode type of the data in from_col.
@@ -231,7 +245,9 @@ def add_new_code(self, df, from_code, new_code, from_col=None, new_col=None):
231
245
crosswalk = self .load_crosswalk (from_code = from_code , to_code = new_code )
232
246
crosswalk = crosswalk .rename (columns = {from_code : from_col , new_code : new_col })
233
247
234
- df = df .merge (crosswalk , left_on = from_col , right_on = from_col , how = "left" ).dropna (subset = [new_col ])
248
+ df = df .merge (
249
+ crosswalk , left_on = from_col , right_on = from_col , how = "left"
250
+ ).dropna (subset = [new_col ])
235
251
236
252
# Drop extra state columns
237
253
state_codes = ["state_code" , "state_id" , "state_name" ]
@@ -255,7 +271,7 @@ def convert_to_new_code(
255
271
256
272
Parameters
257
273
---------
258
- df: pd.DataFrame
274
+ df: pd.DataFrame
259
275
Input dataframe.
260
276
from_col: str
261
277
Name of the column in data to match and remove.
@@ -311,7 +327,7 @@ def convert_fips_to_state_code(
311
327
312
328
Parameters
313
329
---------
314
- data: pd.DataFrame
330
+ data: pd.DataFrame
315
331
Input dataframe.
316
332
fips_col: str
317
333
Name of FIPS column to convert in data.
@@ -361,13 +377,13 @@ def convert_fips_to_msa(
361
377
362
378
Parameters
363
379
---------
364
- data: pd.DataFrame
380
+ data: pd.DataFrame
365
381
Input data.
366
382
fips_col: str
367
383
Name of dataframe column containing fips codes.
368
384
date_col: str
369
385
Name of dataframe column containing the dates.
370
- count_cols: str
386
+ count_cols: str
371
387
Name of dataframe column containing the data. If None (default) all non fips/date are used.
372
388
msa_col: str
373
389
Name of dataframe column to contain the msa codes.
@@ -387,8 +403,7 @@ def convert_fips_to_msa(
387
403
data = data .merge (msa_table , left_on = fips_col , right_on = "fips" , how = "left" )
388
404
389
405
# Megacounty codes are 1, followed by up to 4 leading zeros, and ending with
390
- # two digits of the state's FIPS code.
391
- # TODO: Does this need to be improved?
406
+ # two digits of the state's FIPS code.
392
407
if create_mega :
393
408
data_st = data .loc [data [msa_col ].isna (), fips_col ]
394
409
data .loc [data [msa_col ].isna (), msa_col ] = "1" + data_st .astype (str ).str [
@@ -404,7 +419,7 @@ def convert_fips_to_zip(
404
419
405
420
Parameters
406
421
---------
407
- data: pd.DataFrame
422
+ data: pd.DataFrame
408
423
Input data.
409
424
fips_col: str
410
425
Name of dataframe column containing fips codes.
@@ -424,7 +439,9 @@ def convert_fips_to_zip(
424
439
data [fips_col ] = data [fips_col ].astype (str ).str .zfill (5 )
425
440
426
441
cross = df .rename (columns = {"zip" : zip_col , "weight" : weight_col })
427
- data = data .merge (cross , left_on = fips_col , right_on = "fips" , how = "left" ).dropna (subset = [zip_col ])
442
+ data = data .merge (cross , left_on = fips_col , right_on = "fips" , how = "left" ).dropna (
443
+ subset = [zip_col ]
444
+ )
428
445
return data
429
446
430
447
def convert_state_code_to_state_id (
@@ -582,6 +599,53 @@ def zip_to_state_code(
582
599
data = data .groupby ([date_col , state_code_col ], dropna = False ).sum ()
583
600
return data .reset_index ()
584
601
602
def convert_zip_to_state_id(
    self,
    data,
    zip_col="zip",
    state_id_col="state_id",
    date_col="date",
    count_cols=None,
):
    """Append a state id column to a dataframe keyed by zip code.

    The zip -> state crosswalk is loaded, its ``state_code`` and
    ``state_name`` columns are discarded, and the remainder is left-joined
    onto ``data`` using the zip code as the key. Rows whose zip code has no
    match in the crosswalk are kept, with missing values in the joined
    columns.

    Parameters
    ----------
    data: pd.DataFrame
        Input data.
    zip_col: str
        Name of the column in ``data`` containing the zip codes. Non-string
        columns are converted to strings left-padded with zeros to width 5.
    state_id_col: str
        Name the state id column should have in the output.
    date_col: str
        Name of the column in ``data`` containing the dates. Only used to
        select columns when ``count_cols`` is given.
    count_cols: list of str, str, or None
        Data columns to keep. If None (default) or empty, every column of
        ``data`` is kept.

    Returns
    -------
    data: pd.DataFrame
        A new dataframe; the input is not modified. It holds the selected
        input columns followed by the remaining crosswalk columns (the state
        id column and, presumably, a ``weight`` column -- zip_to_state_id
        relies on one being present).
    """
    crosswalk = self.load_crosswalk(from_code="zip", to_code="state")
    # `columns=` is required here: a bare mapping passed to rename() targets
    # the index labels, which would silently ignore state_id_col.
    crosswalk = crosswalk.drop(columns=["state_code", "state_name"]).rename(
        columns={"zip": zip_col, "state_id": state_id_col}
    )

    if isinstance(count_cols, str):
        count_cols = [count_cols]

    # Always work on a copy so the zero-padding below never leaks back into
    # the caller's dataframe.
    if count_cols:
        data = data[[zip_col, date_col] + list(count_cols)].copy()
    else:
        data = data.copy()

    if not is_string_dtype(data[zip_col]):
        data[zip_col] = data[zip_col].astype(str).str.zfill(5)

    # The crosswalk key was renamed to zip_col above, so the join honours a
    # custom zip column name and produces no duplicate key column.
    return data.merge(crosswalk, on=zip_col, how="left")
623
+
624
def zip_to_state_id(
    self,
    data,
    zip_col="zip",
    state_id_col="state_id",
    date_col="date",
    count_cols=None,
):
    """Aggregate zip level data to state id level data.

    The data is joined with the zip -> state crosswalk via
    convert_zip_to_state_id, every count column is multiplied by the
    crosswalk ``weight`` column, and the weighted counts are summed per
    (date, state id) pair.

    Parameters
    ----------
    data: pd.DataFrame
        Input data.
    zip_col: str
        Name of the column in ``data`` containing the zip codes.
    state_id_col: str
        Name of the column that will contain the state ids in the output.
    date_col: str
        Name of the column in ``data`` containing the dates.
    count_cols: list of str, str, or None
        Data columns to aggregate. If None (default), every column other than
        the zip, date, state id and weight columns is used.

    Returns
    -------
    data: pd.DataFrame
        One row per (date, state id) group with the weighted sums of the
        count columns. Groups whose key is missing are kept
        (``dropna=False``).
    """
    if isinstance(count_cols, str):
        count_cols = [count_cols]

    data = self.convert_zip_to_state_id(
        data,
        zip_col=zip_col,
        state_id_col=state_id_col,
        date_col=date_col,
        count_cols=count_cols,
    )
    # Remove the zip key under whichever name it carries; a leftover string
    # column would otherwise take part in the sum below.
    data = data.drop(columns=list({zip_col, "zip"}), errors="ignore")

    if count_cols is None:
        # Keep the dataframe's own column order instead of going through a set.
        non_count = {date_col, state_id_col, "weight"}
        count_cols = [col for col in data.columns if col not in non_count]

    data[count_cols] = data[count_cols].multiply(data["weight"], axis=0)
    data = data.drop(columns="weight")
    data = data.groupby([date_col, state_id_col], dropna=False).sum()
    return data.reset_index()
648
+
585
649
def fips_to_state_id (
586
650
self ,
587
651
data ,
@@ -594,13 +658,13 @@ def fips_to_state_id(
594
658
595
659
Parameters
596
660
---------
597
- data: pd.DataFrame
661
+ data: pd.DataFrame
598
662
Input data.
599
663
fips_col: str
600
664
Name of dataframe column containing fips codes.
601
665
date_col: str
602
666
Name of dataframe column containing the dates.
603
- count_cols: str
667
+ count_cols: str
604
668
Name of dataframe column containing the data. If None (default) all non fips/date are used.
605
669
state_id_col: str
606
670
Name of dataframe column to contain the state codes.
@@ -629,20 +693,20 @@ def fips_to_msa(
629
693
msa_col = "msa" ,
630
694
):
631
695
"""Translate dataframe from fips to metropolitan statistical area (msa).
632
-
696
+
633
697
The encoding we use is based on the most recent Census Bureau release of CBSA (March 2020)
634
698
All counties not mapped to MSAs have msa encoded as 000XX where XX is the fips state code
635
699
To see how the crosswalk table is derived look at _delphi_utils_python/data_proc/geomap/*
636
700
637
701
Parameters
638
702
---------
639
- data: pd.DataFrame
703
+ data: pd.DataFrame
640
704
Input data.
641
705
fips_col: str
642
706
Name of dataframe column containing fips codes.
643
707
date_col: str
644
708
Name of dataframe column containing the dates.
645
- count_cols: str
709
+ count_cols: str
646
710
Name of dataframe column containing the data. If None (default) all non fips/date are used.
647
711
msa_col: str
648
712
Name of dataframe column to contain the msa codes.
@@ -661,9 +725,9 @@ def fips_to_msa(
661
725
data .drop (fips_col , axis = 1 , inplace = True )
662
726
data .dropna (axis = 0 , subset = [msa_col ], inplace = True )
663
727
if date_col :
664
- data = data .groupby ([date_col , msa_col ], dropna = False ).sum ()
728
+ data = data .groupby ([date_col , msa_col ]).sum ()
665
729
else :
666
- data = data .groupby (msa_col , dropna = False ).sum ()
730
+ data = data .groupby (msa_col ).sum ()
667
731
return data .reset_index ()
668
732
669
733
def zip_to_fips (
@@ -699,9 +763,9 @@ def zip_to_fips(
699
763
data .drop ([zip_col , "weight" ], axis = 1 , inplace = True )
700
764
701
765
if date_col :
702
- data = data .groupby ([date_col , fips_col ], dropna = False ).sum ()
766
+ data = data .groupby ([date_col , fips_col ]).sum ()
703
767
else :
704
- data = data .groupby (fips_col , dropna = False ).sum ()
768
+ data = data .groupby (fips_col ).sum ()
705
769
return data .reset_index ()
706
770
707
771
def fips_to_megacounty (
@@ -750,7 +814,7 @@ def fips_to_megacounty(
750
814
)
751
815
data .set_index ([fips_col , date_col ], inplace = True )
752
816
data = data .join (mega_data )
753
- data = data .reset_index ().groupby ([date_col , mega_col ], dropna = False ).sum ()
817
+ data = data .reset_index ().groupby ([date_col , mega_col ]).sum ()
754
818
return data .reset_index ()
755
819
756
820
def zip_to_hrr (
@@ -835,7 +899,7 @@ def jhu_uid_to_fips(
835
899
data .dropna (subset = [fips_col ], axis = 0 , inplace = True )
836
900
data [count_cols ] = data [count_cols ].multiply (data ["weight" ], axis = 0 )
837
901
data .drop ([jhu_col , "weight" ], axis = 1 , inplace = True )
838
- data = data .groupby ([date_col , fips_col ], dropna = False ).sum ()
902
+ data = data .groupby ([date_col , fips_col ]).sum ()
839
903
return data .reset_index ()
840
904
841
905
def fips_to_zip (
@@ -864,7 +928,7 @@ def fips_to_zip(
864
928
data .drop (fips_col , axis = 1 , inplace = True )
865
929
data [count_cols ] = data [count_cols ].multiply (data ["weight" ], axis = 0 )
866
930
data .drop ("weight" , axis = 1 , inplace = True )
867
- data = data .groupby ([date_col , zip_col ], dropna = False ).sum ()
931
+ data = data .groupby ([date_col , zip_col ]).sum ()
868
932
return data .reset_index ()
869
933
870
934
def fips_to_hrr (
@@ -901,3 +965,33 @@ def fips_to_hrr(
901
965
hrr_col = hrr_col ,
902
966
)
903
967
return data
968
+
969
def add_population_column(self, data, geocode_type, geocode_col=None):
    """Append a population column to a dataframe, based on the FIPS or ZIP code.

    Rows whose geocode has no entry in the population table are dropped.

    Parameters
    ----------
    data: pd.DataFrame
        The dataframe with a geocode (FIPS or ZIP) column.
    geocode_type: {"fips", "zip"}
        The type of the geocode contained in geocode_col.
    geocode_col: str, default None
        The name of the column containing the geocodes. If None, uses the
        geocode_type as the name.

    Returns
    -------
    data_with_pop: pd.DataFrame
        A new dataframe with an integer "population" column appended; the
        input dataframe is left untouched.
    """
    if geocode_col is None:
        geocode_col = geocode_type

    # Rename the population table's key to match the data's geocode column so
    # the join adds only "population", even when geocode_col differs from
    # geocode_type (no duplicate key column, no _x/_y suffixes).
    pop_df = self.load_crosswalk(from_code=geocode_type, to_code="pop").rename(
        columns={geocode_type: geocode_col, "pop": "population"}
    )

    # merge() already returns a new frame, so no explicit copy is needed.
    data_with_pop = data.merge(pop_df, on=geocode_col, how="left").dropna(
        subset=["population"]
    )
    # The left join introduces NaN (float) for unmatched rows; once those are
    # dropped the column can be restored to integers.
    data_with_pop["population"] = data_with_pop["population"].astype(int)
    return data_with_pop
0 commit comments