def create_fips_zip_crosswalk():
    """Build (weighted) crosswalk tables for FIPS to ZIP and ZIP to FIPS."""
-    pop_df = pd.read_csv(FIPS_BY_ZIP_POP_URL)
-
+    pop_df = pd.read_csv(FIPS_BY_ZIP_POP_URL).rename(columns={"POPPT": "pop"})
    # Create the FIPS column by combining the state and county codes
-    state_codes = pop_df["STATE"].astype(str).str.zfill(2)
-    county_codes = pop_df["COUNTY"].astype(str).str.zfill(3)
-    pop_df["fips"] = state_codes + county_codes
-
+    pop_df["fips"] = pop_df["STATE"].astype(str).str.zfill(2) + pop_df["COUNTY"].astype(str).str.zfill(3)
    # Create the ZIP column by adding leading zeros to the ZIP
    pop_df["zip"] = pop_df["ZCTA5"].astype(str).str.zfill(5)
-
-    # Pare down the dataframe to just the relevant columns: zip, fips, and population
-    pop_df = pop_df[["zip", "fips", "POPPT"]].rename(columns={"POPPT": "pop"})
+    pop_df = pop_df[["zip", "fips", "pop"]]

    # Find the population fractions (the heaviest computation, takes about a minute)
    # Note that the denominator in the fractions is the source population
@@ -77,54 +71,39 @@ def create_fips_zip_crosswalk():
    zip_fips: DataFrame = pop_df.groupby("zip", as_index=False).apply(lambda g: g["pop"] / g["pop"].sum())

    # Rename and write to file
-    fips_zip = fips_zip.reset_index(level=["fips", "zip"]).rename(columns={"pop": "weight"})
-    fips_zip = fips_zip[fips_zip["weight"] > 0.0]
+    fips_zip = fips_zip.reset_index(level=["fips", "zip"]).rename(columns={"pop": "weight"}).query("weight > 0.0")
    fips_zip.sort_values(["fips", "zip"]).to_csv(join(OUTPUT_DIR, FIPS_ZIP_OUT_FILENAME), index=False)

-    zip_fips = zip_fips.reset_index(level=["fips", "zip"]).rename(columns={"pop": "weight"})
-    zip_fips = zip_fips[zip_fips["weight"] > 0.0]
+    zip_fips = zip_fips.reset_index(level=["fips", "zip"]).rename(columns={"pop": "weight"}).query("weight > 0.0")
    zip_fips.sort_values(["zip", "fips"]).to_csv(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME), index=False)
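
For intuition, here is a minimal sketch of what the weighting produces (made-up numbers, not real Census figures): each (fips, zip) row gets the share of its source FIPS population living in that ZIP, so weights sum to 1 per source code.

    import pandas as pd

    toy = pd.DataFrame({"fips": ["01001", "01001", "01003"],
                        "zip": ["36067", "36703", "36507"],
                        "pop": [600, 400, 1000]})
    # Share of each source FIPS population that falls in each ZIP
    toy["weight"] = toy["pop"] / toy.groupby("fips")["pop"].transform("sum")
    # -> weights 0.6 and 0.4 for 01001, and 1.0 for 01003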
87
79
88
80
89
81
def create_zip_hsa_hrr_crosswalk ():
90
82
"""Build a crosswalk table for ZIP to HSA and for ZIP to HRR."""
91
- zipped_csv = ZipFile (BytesIO (requests .get (ZIP_HSA_HRR_URL ).content ))
92
- zip_df = pd .read_csv (zipped_csv .open (ZIP_HSA_HRR_FILENAME ))
83
+ with ZipFile (BytesIO (requests .get (ZIP_HSA_HRR_URL ).content )) as zipped_csv :
84
+ zip_df = pd .read_csv (zipped_csv .open (ZIP_HSA_HRR_FILENAME ))
93
85
94
- # Build the HSA table
95
86
hsa_df = zip_df [["zipcode18" , "hsanum" ]].rename (columns = {"zipcode18" : "zip" , "hsanum" : "hsa" })
87
+ hsa_df ["zip" ] = hsa_df ["zip" ].astype (str ).str .zfill (5 )
88
+ hsa_df ["hsa" ] = hsa_df ["hsa" ].astype (str )
89
+ hsa_df .sort_values (["zip" , "hsa" ]).to_csv (join (OUTPUT_DIR , ZIP_HSA_OUT_FILENAME ), index = False )
96
90
97
- # Build the HRR table
98
91
hrr_df = zip_df [["zipcode18" , "hrrnum" ]].rename (columns = {"zipcode18" : "zip" , "hrrnum" : "hrr" })
99
-
100
- # Convert to zero-padded strings
101
92
hrr_df ["zip" ] = hrr_df ["zip" ].astype (str ).str .zfill (5 )
102
93
hrr_df ["hrr" ] = hrr_df ["hrr" ].astype (str )
103
- hsa_df ["zip" ] = hsa_df ["zip" ].astype (str ).str .zfill (5 )
104
- hsa_df ["hsa" ] = hsa_df ["hsa" ].astype (str )
105
-
106
- hsa_df .sort_values (["zip" , "hsa" ]).to_csv (join (OUTPUT_DIR , ZIP_HSA_OUT_FILENAME ), index = False )
107
94
hrr_df .sort_values (["zip" , "hrr" ]).to_csv (join (OUTPUT_DIR , ZIP_HRR_OUT_FILENAME ), index = False )
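
The repeated astype(str).str.zfill(5) matters because ZIPs parsed as integers drop their leading zeros; a quick sketch with toy values:

    import pandas as pd

    zips = pd.Series([501, 2108, 90210])  # read as ints, leading zeros lost
    zips.astype(str).str.zfill(5).tolist()  # ['00501', '02108', '90210']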


def create_fips_msa_crosswalk():
    """Build a crosswalk table for FIPS to MSA."""
-    msa_cols = {
-        "CBSA Code": int,
-        "Metropolitan/Micropolitan Statistical Area": str,
-        "FIPS State Code": str,
-        "FIPS County Code": str,
-    }
-    # The following line requires the xlrd package.
-    msa_df = pd.read_excel(FIPS_MSA_URL, skiprows=2, skipfooter=4, usecols=msa_cols.keys(), dtype=msa_cols)
-
-    metro_bool = msa_df["Metropolitan/Micropolitan Statistical Area"] == "Metropolitan Statistical Area"
-    msa_df = msa_df[metro_bool]
+    # Requires xlrd.
+    msa_df = pd.read_excel(FIPS_MSA_URL, skiprows=2, skipfooter=4, dtype={"CBSA Code": int, "Metropolitan/Micropolitan Statistical Area": str, "FIPS State Code": str, "FIPS County Code": str}).rename(columns={"CBSA Code": "msa"})
+    msa_df = msa_df[msa_df["Metropolitan/Micropolitan Statistical Area"] == "Metropolitan Statistical Area"]

    # Combine state and county codes into a single FIPS code
    msa_df["fips"] = msa_df["FIPS State Code"].str.cat(msa_df["FIPS County Code"])

-    msa_df.rename(columns={"CBSA Code": "msa"}).sort_values(["fips", "msa"]).to_csv(join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME), columns=["fips", "msa"], index=False)
+    msa_df.sort_values(["fips", "msa"]).to_csv(join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME), columns=["fips", "msa"], index=False)
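
Series.str.cat concatenates elementwise, so the zero-padded state and county codes combine into a five-digit FIPS; a sketch with toy codes:

    import pandas as pd

    state = pd.Series(["01", "42"])
    county = pd.Series(["001", "003"])
    state.str.cat(county).tolist()  # ['01001', '42003']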


def create_jhu_uid_fips_crosswalk():
@@ -177,23 +156,19 @@ def create_jhu_uid_fips_crosswalk():
            {"jhu_uid": "84070020", "fips": "49000", "weight": 1.0},
        ]
    )
+    # Map the Unassigned category to a custom megaFIPS XX000
    unassigned_states = pd.DataFrame(
-        [
-            # Map the Unassigned category to a custom megaFIPS XX000
-            {"jhu_uid": str(x), "fips": str(x)[-2:].ljust(5, "0"), "weight": 1.0}
-            for x in range(84090001, 84090057)
-        ]
+        {"jhu_uid": str(x), "fips": str(x)[-2:].ljust(5, "0"), "weight": 1.0}
+        for x in range(84090001, 84090057)
    )
+    # Map the Out of State category to a custom megaFIPS XX000
    out_of_state = pd.DataFrame(
-        [
-            # Map the Out of State category to a custom megaFIPS XX000
-            {"jhu_uid": str(x), "fips": str(x)[-2:].ljust(5, "0"), "weight": 1.0}
-            for x in range(84080001, 84080057)
-        ]
+        {"jhu_uid": str(x), "fips": str(x)[-2:].ljust(5, "0"), "weight": 1.0}
+        for x in range(84080001, 84080057)
    )
+    # Map the Unassigned and Out of State categories to the custom megaFIPS 72000
    puerto_rico_unassigned = pd.DataFrame(
        [
-            # Map the Unassigned and Out of State categories to the cusom megaFIPS 72000
            {"jhu_uid": "63072888", "fips": "72000", "weight": 1.0},
            {"jhu_uid": "63072999", "fips": "72000", "weight": 1.0},
        ]
@@ -206,35 +181,29 @@ def create_jhu_uid_fips_crosswalk():
    )
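
The str(x)[-2:].ljust(5, "0") idiom takes the last two digits of a JHU UID as a state FIPS code and right-pads it into a state-level megaFIPS (contrast the left-padding zfill used for real county codes); a quick sketch:

    str(84090004)[-2:].ljust(5, "0")  # '04000', the megaFIPS for state FIPS 04 (Arizona)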

    jhu_df = pd.read_csv(JHU_FIPS_URL, dtype={"UID": str, "FIPS": str}).query("Country_Region == 'US'")
-    jhu_df = jhu_df.rename(columns={"UID": "jhu_uid", "FIPS": "fips"}).dropna(subset=["fips"])[["jhu_uid", "fips"]]
+    jhu_df = jhu_df.rename(columns={"UID": "jhu_uid", "FIPS": "fips"}).dropna(subset=["fips"])

    # FIPS codes that are just two digits long should be zero-filled on the right.
    # These are US state codes (XX) and the territories Guam (66), Northern Mariana Islands (69),
    # Virgin Islands (78), and Puerto Rico (72).
-    fips_st = jhu_df["fips"].str.len() <= 2
-    jhu_df.loc[fips_st, "fips"] = jhu_df.loc[fips_st, "fips"].str.ljust(5, "0")
+    fips_territories = jhu_df["fips"].str.len() <= 2
+    jhu_df.loc[fips_territories, "fips"] = jhu_df.loc[fips_territories, "fips"].str.ljust(5, "0")

    # Drop the JHU UIDs that were hand-modified
    manual_correction_ids = pd.concat([hand_additions, unassigned_states, out_of_state, puerto_rico_unassigned, cruise_ships])["jhu_uid"]
-    dup_ind = jhu_df["jhu_uid"].isin(manual_correction_ids)
-    jhu_df.drop(jhu_df.index[dup_ind], inplace=True)
+    jhu_df.drop(jhu_df.index[jhu_df["jhu_uid"].isin(manual_correction_ids)], inplace=True)

    # Add weights of 1.0 to everything not in the hand additions, then merge in the hand additions
    # Finally, zero-fill FIPS
    jhu_df["weight"] = 1.0
    jhu_df = pd.concat([jhu_df, hand_additions, unassigned_states, out_of_state, puerto_rico_unassigned])
    jhu_df["fips"] = jhu_df["fips"].astype(int).astype(str).str.zfill(5)
-    jhu_df.sort_values(["jhu_uid", "fips"]).to_csv(join(OUTPUT_DIR, JHU_FIPS_OUT_FILENAME), index=False)
+    jhu_df.sort_values(["jhu_uid", "fips"]).to_csv(join(OUTPUT_DIR, JHU_FIPS_OUT_FILENAME), columns=["jhu_uid", "fips", "weight"], index=False)


def create_state_codes_crosswalk():
    """Build a State ID -> State Name -> State code crosswalk file."""
-    column_rename_map = {
-        "STATE": "state_code",
-        "STUSAB": "state_id",
-        "STATE_NAME": "state_name",
-    }
-    df = pd.read_csv(STATE_CODES_URL, delimiter="|").drop(columns="STATENS").rename(columns=column_rename_map)
+    df = pd.read_csv(STATE_CODES_URL, delimiter="|").drop(columns="STATENS").rename(columns={"STATE": "state_code", "STUSAB": "state_id", "STATE_NAME": "state_name"})
    df["state_code"] = df["state_code"].astype(str).str.zfill(2)
    df["state_id"] = df["state_id"].astype(str).str.lower()
@@ -259,7 +228,6 @@ def create_state_codes_crosswalk():
        ]
    )
    df = pd.concat((df, territories))
-
    df.sort_values("state_code").to_csv(join(OUTPUT_DIR, STATE_OUT_FILENAME), index=False)

@@ -288,8 +256,7 @@ def create_state_hhs_crosswalk():
    hhs_state_pairs.append((9, "Northern Mariana Islands"))

    # Make dataframe
-    hhs_df = pd.DataFrame(hhs_state_pairs, columns=["hhs", "state_name"])
-    hhs_df["hhs"] = hhs_df["hhs"].astype(str)
+    hhs_df = pd.DataFrame(hhs_state_pairs, columns=["hhs", "state_name"], dtype=str)

    ss_df = ss_df.merge(hhs_df, on="state_name", how="left").dropna()
    ss_df.sort_values("state_code").to_csv(join(OUTPUT_DIR, STATE_HHS_OUT_FILENAME), columns=["state_code", "hhs"], index=False)
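
Passing dtype=str at construction replaces the separate astype call, since the integer region numbers are coerced immediately; a minimal sketch with one toy pair:

    import pandas as pd

    hhs_df = pd.DataFrame([(1, "Maine")], columns=["hhs", "state_name"], dtype=str)
    hhs_df.loc[0, "hhs"]  # '1' -- the region number is already a string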
@@ -319,13 +286,12 @@ def create_fips_population_table():
    census_pop = census_pop.reset_index(drop=True)

    # Get the file with Puerto Rico populations
-    df_pr = pd.read_csv(FIPS_PUERTO_RICO_POPULATION_URL)
+    df_pr = pd.read_csv(FIPS_PUERTO_RICO_POPULATION_URL).rename(columns={"POPPT": "pop"})
    df_pr["fips"] = df_pr["STATE"].astype(str).str.zfill(2) + df_pr["COUNTY"].astype(str).str.zfill(3)
-    df_pr = df_pr.rename(columns={"POPPT": "pop"})[["fips", "pop"]]
+    df_pr = df_pr[["fips", "pop"]]
    # Create the Puerto Rico megaFIPS
    df_pr = df_pr[df_pr["fips"].isin([str(x) for x in range(72000, 72999)])]
    df_pr = pd.concat([df_pr, pd.DataFrame([{"fips": "72000", "pop": df_pr["pop"].sum()}])])
-
    # Fill the missing Puerto Rico data with 2010 information
    df_pr = df_pr.groupby("fips").sum().reset_index()
    df_pr = df_pr[~df_pr["fips"].isin(census_pop["fips"])]
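
The "72000" megaFIPS row simply carries the Puerto Rico statewide total alongside the county rows; a toy sketch of that concat step (made-up counts):

    import pandas as pd

    df_pr = pd.DataFrame({"fips": ["72001", "72003"], "pop": [18000, 23000]})
    mega = pd.DataFrame([{"fips": "72000", "pop": df_pr["pop"].sum()}])
    pd.concat([df_pr, mega])  # adds a '72000' row with pop 41000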
@@ -354,8 +320,7 @@ def create_state_population_table():

    census_pop = pd.read_csv(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME), dtype={"fips": str, "pop": int})
    state: DataFrame = pd.read_csv(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME), dtype=str)
-    combined = state.merge(census_pop, on="fips")
-    state_pop = combined.groupby(["state_code", "state_id", "state_name"], as_index=False).sum()
+    state_pop = state.merge(census_pop, on="fips").groupby(["state_code", "state_id", "state_name"], as_index=False).sum()
    state_pop.sort_values("state_code").to_csv(join(OUTPUT_DIR, STATE_POPULATION_OUT_FILENAME), index=False)

@@ -369,8 +334,7 @@ def create_hhs_population_table():

    state_pop = pd.read_csv(join(OUTPUT_DIR, STATE_POPULATION_OUT_FILENAME), dtype={"state_code": str, "hhs": int}, usecols=["state_code", "pop"])
    state_hhs = pd.read_csv(join(OUTPUT_DIR, STATE_HHS_OUT_FILENAME), dtype=str)
-    combined = state_pop.merge(state_hhs, on="state_code")
-    hhs_pop = combined.groupby("hhs", as_index=False).sum()
+    hhs_pop = state_pop.merge(state_hhs, on="state_code").groupby("hhs", as_index=False).sum()
    hhs_pop.sort_values("hhs").to_csv(join(OUTPUT_DIR, HHS_POPULATION_OUT_FILENAME), index=False)
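
The merge-then-groupby pattern used here (and in create_state_population_table above) rolls state populations up to HHS regions; a minimal sketch with toy populations for Connecticut (09) and Maine (23), both in region 1 (selecting just pop to keep the toy output clean):

    import pandas as pd

    state_pop = pd.DataFrame({"state_code": ["09", "23"], "pop": [3_600_000, 1_360_000]})
    state_hhs = pd.DataFrame({"state_code": ["09", "23"], "hhs": ["1", "1"]})
    state_pop.merge(state_hhs, on="state_code").groupby("hhs", as_index=False)["pop"].sum()
    #   hhs      pop
    # 0   1  4960000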