1
1
"""
2
2
Authors: Dmitry Shemetov @dshemetov, James Sharpnack @jsharpna
3
+
4
+ Intended execution:
5
+
6
+ cd _delphi_utils/data_proc/geomap
7
+ chmod u+x geo_data_proc.py
8
+ python geo_data_proc.py
3
9
"""
4
10
5
11
from io import BytesIO
6
12
from os import remove , listdir
7
13
from os .path import join , isfile
8
14
from zipfile import ZipFile
15
+ from pandas .core .frame import DataFrame
9
16
10
17
import requests
11
18
import pandas as pd
49
56
50
57
51
58
def create_fips_zip_crosswalk ():
52
- """
53
- Creates the (weighted) crosswalk tables between FIPS to ZIP and ZIP to FIPS
54
- from source.
55
- """
59
+ """Build (weighted) crosswalk tables for FIPS to ZIP and ZIP to FIPS."""
56
60
pop_df = pd .read_csv (FIPS_BY_ZIP_POP_URL )
57
61
58
62
# Create the FIPS column by combining the state and county codes
@@ -69,8 +73,8 @@ def create_fips_zip_crosswalk():
69
73
# Find the population fractions (the heaviest computation, takes about a minute)
70
74
# Note that the denominator in the fractions is the source population
71
75
pop_df .set_index (["fips" , "zip" ], inplace = True )
72
- fips_zip = pop_df .groupby ("fips" , as_index = False ).apply (lambda g : g ["pop" ] / g ["pop" ].sum ())
73
- zip_fips = pop_df .groupby ("zip" , as_index = False ).apply (lambda g : g ["pop" ] / g ["pop" ].sum ())
76
+ fips_zip : DataFrame = pop_df .groupby ("fips" , as_index = False ).apply (lambda g : g ["pop" ] / g ["pop" ].sum ())
77
+ zip_fips : DataFrame = pop_df .groupby ("zip" , as_index = False ).apply (lambda g : g ["pop" ] / g ["pop" ].sum ())
74
78
75
79
# Rename and write to file
76
80
fips_zip = fips_zip .reset_index (level = ["fips" , "zip" ]).rename (columns = {"pop" : "weight" })
@@ -83,7 +87,7 @@ def create_fips_zip_crosswalk():
83
87
84
88
85
89
def create_zip_hsa_hrr_crosswalk ():
86
- """Creates the crosswalk table from ZIP to HSA and from ZIP to HRR from source ."""
90
+ """Build a crosswalk table for ZIP to HSA and for ZIP to HRR."""
87
91
zipped_csv = ZipFile (BytesIO (requests .get (ZIP_HSA_HRR_URL ).content ))
88
92
zip_df = pd .read_csv (zipped_csv .open (ZIP_HSA_HRR_FILENAME ))
89
93
@@ -104,33 +108,27 @@ def create_zip_hsa_hrr_crosswalk():
104
108
105
109
106
110
def create_fips_msa_crosswalk():
    """Build a crosswalk table for FIPS to MSA."""
    msa_cols = {
        "CBSA Code": int,
        "Metropolitan/Micropolitan Statistical Area": str,
        "FIPS State Code": str,
        "FIPS County Code": str,
    }
    # The following line requires the xlrd package.
    msa_df = pd.read_excel(
        FIPS_MSA_URL,
        skiprows=2,
        skipfooter=4,
        usecols=msa_cols.keys(),
        dtype=msa_cols,
    )

    # Keep only metropolitan statistical areas; micropolitan rows are dropped.
    is_metro = msa_df["Metropolitan/Micropolitan Statistical Area"] == "Metropolitan Statistical Area"
    msa_df = msa_df[is_metro]

    # A county FIPS code is the 2-digit state code followed by the 3-digit county code.
    msa_df["fips"] = msa_df["FIPS State Code"].str.cat(msa_df["FIPS County Code"])
    msa_df = msa_df.rename(columns={"CBSA Code": "msa"})

    out_path = join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME)
    msa_df.sort_values(["fips", "msa"]).to_csv(out_path, columns=["fips", "msa"], index=False)
130
128
131
129
132
130
def create_jhu_uid_fips_crosswalk ():
133
- """Creates the crosswalk table from JHU UID to FIPS from source ."""
131
+ """Build a crosswalk table from JHU UID to FIPS."""
134
132
# These are hand modifications that need to be made to the translation
135
133
# between JHU UID and FIPS. See below for the special cases information
136
134
# https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html#geographical-exceptions
@@ -207,7 +205,8 @@ def create_jhu_uid_fips_crosswalk():
207
205
]
208
206
)
209
207
210
- jhu_df = pd .read_csv (JHU_FIPS_URL , dtype = {"UID" : str , "FIPS" : str }).query ("Country_Region == 'US'" )[["UID" , "FIPS" ]].rename (columns = {"UID" : "jhu_uid" , "FIPS" : "fips" }).dropna (subset = ["fips" ])
208
+ jhu_df = pd .read_csv (JHU_FIPS_URL , dtype = {"UID" : str , "FIPS" : str }).query ("Country_Region == 'US'" )
209
+ jhu_df = jhu_df .rename (columns = {"UID" : "jhu_uid" , "FIPS" : "fips" }).dropna (subset = ["fips" ])[["jhu_uid" , "fips" ]]
211
210
212
211
# FIPS Codes that are just two digits long should be zero filled on the right.
213
212
# These are US state codes (XX) and the territories Guam (66), Northern Mariana Islands (69),
@@ -216,7 +215,8 @@ def create_jhu_uid_fips_crosswalk():
216
215
jhu_df .loc [fips_st , "fips" ] = jhu_df .loc [fips_st , "fips" ].str .ljust (5 , "0" )
217
216
218
217
# Drop the JHU UIDs that were hand-modified
219
- dup_ind = jhu_df ["jhu_uid" ].isin (pd .concat ([hand_additions , unassigned_states , out_of_state , puerto_rico_unassigned , cruise_ships ])["jhu_uid" ].values )
218
+ manual_correction_ids = pd .concat ([hand_additions , unassigned_states , out_of_state , puerto_rico_unassigned , cruise_ships ])["jhu_uid" ]
219
+ dup_ind = jhu_df ["jhu_uid" ].isin (manual_correction_ids )
220
220
jhu_df .drop (jhu_df .index [dup_ind ], inplace = True )
221
221
222
222
# Add weights of 1.0 to everything not in hand additions, then merge in hand-additions
@@ -228,13 +228,13 @@ def create_jhu_uid_fips_crosswalk():
228
228
229
229
230
230
def create_state_codes_crosswalk ():
231
- """Create the State ID -> State Name -> State code crosswalk file."""
231
+ """Build a State ID -> State Name -> State code crosswalk file."""
232
232
column_rename_map = {
233
233
"STATE" : "state_code" ,
234
234
"STUSAB" : "state_id" ,
235
235
"STATE_NAME" : "state_name" ,
236
236
}
237
- df = pd .read_csv (STATE_CODES_URL , delimiter = "|" ).drop (columns = "STATENS" ).rename (column_rename_map )
237
+ df = pd .read_csv (STATE_CODES_URL , delimiter = "|" ).drop (columns = "STATENS" ).rename (columns = column_rename_map )
238
238
df ["state_code" ] = df ["state_code" ].astype (str ).str .zfill (2 )
239
239
df ["state_id" ] = df ["state_id" ].astype (str ).str .lower ()
240
240
@@ -264,9 +264,7 @@ def create_state_codes_crosswalk():
264
264
265
265
266
266
def create_state_hhs_crosswalk ():
267
- """
268
- Create the state to hhs crosswalk.
269
- """
267
+ """Build a state to HHS crosswalk."""
270
268
if not isfile (join (OUTPUT_DIR , STATE_OUT_FILENAME )):
271
269
create_state_codes_crosswalk ()
272
270
@@ -293,19 +291,18 @@ def create_state_hhs_crosswalk():
293
291
hhs_df = pd .DataFrame (hhs_state_pairs , columns = ["hhs" , "state_name" ])
294
292
hhs_df ["hhs" ] = hhs_df ["hhs" ].astype (str )
295
293
296
- ss_df .merge (hhs_df , on = "state_name" , how = "left" ).dropna ()[["state_code" , "hhs" ]].sort_values ("state_code" ).to_csv (join (OUTPUT_DIR , STATE_HHS_OUT_FILENAME ), index = False )
294
+ ss_df = ss_df .merge (hhs_df , on = "state_name" , how = "left" ).dropna ()
295
+ ss_df .sort_values ("state_code" ).to_csv (join (OUTPUT_DIR , STATE_HHS_OUT_FILENAME ), columns = ["state_code" , "hhs" ], index = False )
297
296
298
297
299
298
def create_fips_population_table ():
300
- """
301
- Build a table of populations by FIPS county codes. Uses US Census Bureau population
302
- data from 2019, supplemented with 2010 population data for Puerto Rico, and a few
303
- small counties.
299
+ """Build a table of populations by FIPS county codes.
300
+
301
+ Uses US Census Bureau population data from 2020, with 2010 population data for Puerto Rico and a few exceptions.
304
302
"""
305
303
census_pop = pd .read_csv (FIPS_POPULATION_URL , encoding = "ISO-8859-1" )
306
304
census_pop ["fips" ] = census_pop .apply (lambda x : f"{ x ['STATE' ]:02d} { x ['COUNTY' ]:03d} " , axis = 1 )
307
- census_pop ["pop" ] = census_pop ["POPESTIMATE2020" ]
308
- census_pop = census_pop [["fips" , "pop" ]]
305
+ census_pop = census_pop .rename (columns = {"POPESTIMATE2020" : "pop" })[["fips" , "pop" ]]
309
306
310
307
# Set population for Dukes and Nantucket combo county
311
308
dukes_pop = int (census_pop .loc [census_pop ["fips" ] == "25007" , "pop" ])
@@ -324,8 +321,7 @@ def create_fips_population_table():
324
321
# Get the file with Puerto Rico populations
325
322
df_pr = pd .read_csv (FIPS_PUERTO_RICO_POPULATION_URL )
326
323
df_pr ["fips" ] = df_pr ["STATE" ].astype (str ).str .zfill (2 ) + df_pr ["COUNTY" ].astype (str ).str .zfill (3 )
327
- df_pr ["pop" ] = df_pr ["POPPT" ]
328
- df_pr = df_pr [["fips" , "pop" ]]
324
+ df_pr = df_pr .rename (columns = {"POPPT" : "pop" })[["fips" , "pop" ]]
329
325
# Create the Puerto Rico megaFIPS
330
326
df_pr = df_pr [df_pr ["fips" ].isin ([str (x ) for x in range (72000 , 72999 )])]
331
327
df_pr = pd .concat ([df_pr , pd .DataFrame ([{"fips" : "72000" , "pop" : df_pr ["pop" ].sum ()}])])
@@ -349,22 +345,28 @@ def create_fips_population_table():
349
345
350
346
351
347
def create_state_population_table():
    """Build a state population table."""
    # Regenerate prerequisite tables if they are missing on disk.
    if not isfile(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME)):
        create_fips_population_table()

    if not isfile(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME)):
        derive_fips_state_crosswalk()

    fips_pop = pd.read_csv(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME), dtype={"fips": str, "pop": int})
    fips_state = pd.read_csv(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME), dtype=str)

    # Attach populations to state info, then aggregate counties up to states.
    # NOTE(review): the merged frame still carries the str "fips" column, so
    # .sum() also concatenates those strings into the output — confirm this is
    # intended before relying on the extra column.
    merged = fips_state.merge(fips_pop, on="fips")
    state_pop = merged.groupby(["state_code", "state_id", "state_name"], as_index=False).sum()
    state_pop = state_pop.sort_values("state_code")
    state_pop.to_csv(join(OUTPUT_DIR, STATE_POPULATION_OUT_FILENAME), index=False)
361
360
362
361
363
362
def create_hhs_population_table ():
363
+ """Build an HHS population table."""
364
364
if not isfile (join (OUTPUT_DIR , STATE_POPULATION_OUT_FILENAME )):
365
365
create_state_population_table ()
366
+
366
367
if not isfile (join (OUTPUT_DIR , STATE_HHS_OUT_FILENAME )):
367
368
create_state_hhs_crosswalk ()
369
+
368
370
state_pop = pd .read_csv (join (OUTPUT_DIR , STATE_POPULATION_OUT_FILENAME ), dtype = {"state_code" : str , "hhs" : int }, usecols = ["state_code" , "pop" ])
369
371
state_hhs = pd .read_csv (join (OUTPUT_DIR , STATE_HHS_OUT_FILENAME ), dtype = str )
370
372
combined = state_pop .merge (state_hhs , on = "state_code" )
@@ -373,18 +375,17 @@ def create_hhs_population_table():
373
375
374
376
375
377
def create_nation_population_table():
    """Build a nation population table."""
    # The national total is derived from the county-level population table.
    if not isfile(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME)):
        create_fips_population_table()

    county_pop = pd.read_csv(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME), dtype={"fips": str, "pop": int})
    total = county_pop["pop"].sum()
    nation_pop = pd.DataFrame({"nation": ["us"], "pop": [total]})
    nation_pop.to_csv(join(OUTPUT_DIR, NATION_POPULATION_OUT_FILENAME), index=False)
381
385
382
386
383
387
def derive_zip_population_table ():
384
- """
385
- Builds a table of populations by ZIP code. Combines the tble of populations by
386
- FIPS code with the FIPS to ZIP code mapping.
387
- """
388
+ """Build a table of populations by ZIP code by translating from FIPS populations."""
388
389
if not isfile (join (OUTPUT_DIR , FIPS_POPULATION_OUT_FILENAME )):
389
390
create_fips_population_table ()
390
391
@@ -402,8 +403,7 @@ def derive_zip_population_table():
402
403
403
404
404
405
def derive_fips_hrr_crosswalk ():
405
- """Derives a crosswalk file from FIPS to HRR through FIPZ -> ZIP -> HRR
406
- from the crosswalk files made by the functions above."""
406
+ """Derive a crosswalk file from FIPS to HRR through FIPS -> ZIP -> HRR."""
407
407
if not isfile (join (OUTPUT_DIR , FIPS_ZIP_OUT_FILENAME )):
408
408
create_fips_zip_crosswalk ()
409
409
@@ -413,29 +413,25 @@ def derive_fips_hrr_crosswalk():
413
413
fz_df = pd .read_csv (join (OUTPUT_DIR , FIPS_ZIP_OUT_FILENAME ), dtype = {"fips" : str , "zip" : str , "weight" : float })
414
414
zh_df = pd .read_csv (join (OUTPUT_DIR , ZIP_HRR_OUT_FILENAME ), dtype = {"zip" : str , "hrr" : str })
415
415
416
- fz_df .merge (zh_df , on = "zip" , how = "left" ).drop (columns = "zip" ).groupby (["fips" , "hrr" ]).sum ().reset_index ().sort_values (["fips" , "hrr" ]).to_csv (join (OUTPUT_DIR , FIPS_HRR_OUT_FILENAME ), index = False )
416
+ fz_df = fz_df .merge (zh_df , on = "zip" , how = "left" ).drop (columns = "zip" ).groupby (["fips" , "hrr" ]).sum ().reset_index ()
417
+ fz_df .sort_values (["fips" , "hrr" ]).to_csv (join (OUTPUT_DIR , FIPS_HRR_OUT_FILENAME ), index = False )
417
418
418
419
419
420
def derive_fips_state_crosswalk():
    """Derive a crosswalk between FIPS county codes and state information (number, abbreviation, name)."""
    fips_pop = pd.read_csv(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME), dtype={"fips": str, "pop": int})

    # Add a "megaFIPS" (state code + "000") row per state; its population is
    # deliberately NaN since it is a synthetic aggregate code.
    state_prefixes = fips_pop.fips.str[:2].unique()
    megafips = pd.DataFrame({"fips": [prefix + "000" for prefix in state_prefixes], "pop": np.nan})
    fips_pop = pd.concat([fips_pop, megafips])

    state_codes = pd.read_csv(join(OUTPUT_DIR, STATE_OUT_FILENAME), dtype={"state_code": str, "state_id": str, "state_name": str})

    # The first two FIPS digits identify the state; join state info on them.
    fips_pop["state_code"] = fips_pop["fips"].str[:2]
    crosswalk = fips_pop.merge(state_codes, on="state_code", how="left").drop(columns="pop")
    crosswalk = crosswalk.sort_values(["fips", "state_code"])
    crosswalk.to_csv(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME), index=False)
432
431
433
432
434
433
def derive_zip_msa_crosswalk ():
435
- """
436
- Derives a crosswalk file from ZIP to MSA through ZIP -> FIPS -> HRR
437
- from the crosswalk files made by the functions above.
438
- """
434
+ """Derive a crosswalk file from ZIP to MSA through ZIP -> FIPS -> HRR."""
439
435
if not isfile (join (OUTPUT_DIR , ZIP_FIPS_OUT_FILENAME )):
440
436
create_fips_zip_crosswalk ()
441
437
@@ -445,32 +441,31 @@ def derive_zip_msa_crosswalk():
445
441
zf_df = pd .read_csv (join (OUTPUT_DIR , ZIP_FIPS_OUT_FILENAME ), dtype = {"zip" : str , "fips" : str , "weight" : float })
446
442
fm_df = pd .read_csv (join (OUTPUT_DIR , FIPS_MSA_OUT_FILENAME ), dtype = {"fips" : str , "msa" : str })
447
443
448
- zf_df .merge (fm_df , on = "fips" ).drop (columns = "fips" ).groupby (["msa" , "zip" ]).sum ().reset_index ().sort_values (["zip" , "msa" ]).to_csv (join (OUTPUT_DIR , ZIP_MSA_OUT_FILENAME ), index = False )
444
+ zf_df = zf_df .merge (fm_df , on = "fips" ).drop (columns = "fips" ).groupby (["msa" , "zip" ]).sum ().reset_index ()
445
+ zf_df .sort_values (["zip" , "msa" ]).to_csv (join (OUTPUT_DIR , ZIP_MSA_OUT_FILENAME ), index = False )
449
446
450
447
451
448
def derive_zip_to_state_code():
    """Derive a crosswalk between ZIP codes and state information (number, abbreviation, name)."""
    # Regenerate prerequisite tables if they are missing on disk.
    if not isfile(join(OUTPUT_DIR, STATE_OUT_FILENAME)):
        create_state_codes_crosswalk()

    if not isfile(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME)):
        create_fips_zip_crosswalk()

    state_df = pd.read_csv(join(OUTPUT_DIR, STATE_OUT_FILENAME), dtype={"state_code": str, "state_id": str, "state_name": str})
    zip_fips = pd.read_csv(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME), dtype={"zip": str, "fips": str})

    # The first two FIPS digits identify the state; join state info on them,
    # then drop the intermediate FIPS column.
    zip_fips["state_code"] = zip_fips["fips"].str[:2]
    result = zip_fips.merge(state_df, left_on="state_code", right_on="state_code", how="left")
    result = result.drop(columns=["fips"]).sort_values(["zip", "state_code"])
    result.to_csv(join(OUTPUT_DIR, ZIP_STATE_CODE_OUT_FILENAME), index=False)
466
462
467
463
468
464
def derive_fips_hhs_crosswalk ():
469
- """
470
- Builds a crosswalk between FIPS county codes and HHS regions.
471
- """
465
+ """Derive a crosswalk between FIPS county codes and HHS regions."""
472
466
if not isfile (join (OUTPUT_DIR , STATE_HHS_OUT_FILENAME )):
473
467
create_state_hhs_crosswalk ()
468
+
474
469
if not isfile (join (OUTPUT_DIR , FIPS_POPULATION_OUT_FILENAME )):
475
470
create_fips_population_table ()
476
471
@@ -481,22 +476,23 @@ def derive_fips_hhs_crosswalk():
481
476
state_hhs = pd .read_csv (join (OUTPUT_DIR , STATE_HHS_OUT_FILENAME ), dtype = {"state_code" : str , "hhs" : str })
482
477
483
478
fips_pop ["state_code" ] = fips_pop ["fips" ].str [:2 ]
484
- fips_pop .merge (state_hhs , on = "state_code" , how = "left" ).drop (columns = ["state_code" , "pop" ]).sort_values (["fips" , "hhs" ]).to_csv (join (OUTPUT_DIR , FIPS_HHS_FILENAME ), index = False )
479
+ fips_pop = fips_pop .merge (state_hhs , on = "state_code" , how = "left" ).drop (columns = ["state_code" , "pop" ])
480
+ fips_pop .sort_values (["fips" , "hhs" ]).to_csv (join (OUTPUT_DIR , FIPS_HHS_FILENAME ), index = False )
485
481
486
482
487
483
def derive_zip_hhs_crosswalk():
    """Derive a crosswalk between zip code and HHS regions."""
    # Regenerate prerequisite tables if they are missing on disk.
    if not isfile(join(OUTPUT_DIR, STATE_HHS_OUT_FILENAME)):
        create_state_hhs_crosswalk()

    if not isfile(join(OUTPUT_DIR, ZIP_STATE_CODE_OUT_FILENAME)):
        derive_zip_to_state_code()

    zip_state = pd.read_csv(join(OUTPUT_DIR, ZIP_STATE_CODE_OUT_FILENAME), dtype={"zip": str, "pop": int, "state_code": str})
    state_hhs = pd.read_csv(join(OUTPUT_DIR, STATE_HHS_OUT_FILENAME), dtype={"state_code": str, "hhs": str})

    # Map each ZIP to its HHS region via the state code, then drop the
    # intermediate state columns.
    merged = zip_state.merge(state_hhs, on="state_code", how="left")
    merged = merged.drop(columns=["state_code", "state_id", "state_name"])
    merged.sort_values(["zip", "hhs"]).to_csv(join(OUTPUT_DIR, ZIP_HHS_FILENAME), index=False)
500
496
501
497
502
498
def clear_dir (dir_path : str ):
@@ -524,4 +520,4 @@ def clear_dir(dir_path: str):
524
520
derive_fips_state_crosswalk ()
525
521
derive_zip_population_table ()
526
522
derive_fips_hhs_crosswalk ()
527
- derive_zip_hhs_crosswalk ()
523
+ derive_zip_hhs_crosswalk ()
0 commit comments