33
33
FIPS_PUERTO_RICO_POPULATION_URL = "https://www2.census.gov/geo/docs/maps-data/data/rel/zcta_county_rel_10.txt?"
34
34
STATE_HHS_FILE = "hhs.txt"
35
35
ZIP_POP_MISSING_FILE = "zip_pop_filling.csv"
36
+ CHNG_COUNTY_GROUPS_FILE = "chng_county_groups.csv"
36
37
37
38
# Out files
38
39
FIPS_STATE_OUT_FILENAME = "fips_state_table.csv"
39
40
FIPS_MSA_OUT_FILENAME = "fips_msa_table.csv"
40
41
FIPS_HRR_OUT_FILENAME = "fips_hrr_table.csv"
41
42
FIPS_ZIP_OUT_FILENAME = "fips_zip_table.csv"
42
43
FIPS_HHS_FILENAME = "fips_hhs_table.csv"
44
+ FIPS_CHNGFIPS_OUT_FILENAME = "fips_chng-fips_table.csv"
43
45
FIPS_POPULATION_OUT_FILENAME = "fips_pop.csv"
44
46
47
+ CHNGFIPS_STATE_OUT_FILENAME = "chng-fips_state_table.csv"
45
48
ZIP_HSA_OUT_FILENAME = "zip_hsa_table.csv"
46
49
ZIP_HRR_OUT_FILENAME = "zip_hrr_table.csv"
47
50
ZIP_FIPS_OUT_FILENAME = "zip_fips_table.csv"
@@ -475,6 +478,176 @@ def derive_zip_hhs_crosswalk():
475
478
zip_state .sort_values (["zip" , "hhs" ]).to_csv (join (OUTPUT_DIR , ZIP_HHS_FILENAME ), index = False )
476
479
477
480
481
def derive_fips_chngfips_crosswalk():
    """Write the county FIPS -> CHNG FIPS crosswalk table to OUTPUT_DIR.

    Counties listed in the CHNG county groups file are mapped to a code built
    from the zero-padded state FIPS, the letter "g", and the zero-padded group
    number. Every other county found in the FIPS-to-state table is mapped to
    its own FIPS code. The FIPS-to-state table is generated first if it is not
    already on disk.
    """
    fips_state_path = join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME)
    if not isfile(fips_state_path):
        derive_fips_state_crosswalk()

    # Refresh the county groups file before reading it back.
    assign_county_groups()
    groups_wide = pd.read_csv(CHNG_COUNTY_GROUPS_FILE, dtype="string", index_col=False)

    # Spread the pipe-separated county FIPS codes over one column per member.
    member_columns = groups_wide.fips_list.str.split("|", expand=True)
    groups_wide = pd.concat([groups_wide, member_columns], axis=1)
    groups_wide = groups_wide.drop(columns="fips_list")

    # Reshape to one row per (group, member county); groups with fewer members
    # than the widest one leave missing cells, which are discarded.
    groups_long = pd.melt(
        groups_wide,
        id_vars=["state_fips", "group"],
        var_name="county_num",
        value_name="fips",
    )
    groups_long = groups_long.drop(columns="county_num").dropna()

    # Zero-pad each piece, then glue state code and group id into one code.
    state_part = groups_long["state_fips"].str.zfill(2)
    group_part = groups_long["group"].str.zfill(2)
    grouped_fips = pd.DataFrame(
        {
            "fips": groups_long["fips"].str.zfill(5).astype("string"),
            "chng-fips": state_part + "g" + group_part,
        }
    )

    # Counties outside every CHNG group keep their ordinary FIPS code.
    fips_to_state = pd.read_csv(fips_state_path, dtype="string", index_col=False)
    ungrouped = list(set(fips_to_state.fips).difference(grouped_fips.fips))
    ungrouped_fips = pd.DataFrame({"fips": ungrouped, "chng-fips": ungrouped}, dtype="string")

    # Stack grouped and ungrouped counties and write them in sorted order.
    crosswalk = pd.concat([grouped_fips, ungrouped_fips])
    crosswalk = crosswalk.sort_values(["fips", "chng-fips"])
    crosswalk.to_csv(join(OUTPUT_DIR, FIPS_CHNGFIPS_OUT_FILENAME), index=False)
528
+
529
+
530
def derive_chngfips_state_crosswalk():
    """Write the CHNG FIPS -> state crosswalk table to OUTPUT_DIR.

    The FIPS -> CHNG FIPS table is left-joined to the FIPS -> state table on
    `fips`; the county column is then dropped and duplicate rows removed, so
    each CHNG FIPS code appears once per distinct state row. Either input
    table is generated first if it is not already on disk.
    """
    fips_state_path = join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME)
    fips_chngfips_path = join(OUTPUT_DIR, FIPS_CHNGFIPS_OUT_FILENAME)

    # Build prerequisites on demand.
    if not isfile(fips_state_path):
        derive_fips_state_crosswalk()
    if not isfile(fips_chngfips_path):
        derive_fips_chngfips_crosswalk()

    fips_to_group = pd.read_csv(fips_chngfips_path, dtype="string", index_col=False)
    fips_to_state = pd.read_csv(fips_state_path, dtype="string", index_col=False)

    # Attach state columns to every county, then collapse counties away.
    state_by_fips = fips_to_state.set_index("fips")
    group_to_state = fips_to_group.join(state_by_fips, on="fips", how="left")
    group_to_state = group_to_state.drop(columns="fips").drop_duplicates()
    group_to_state = group_to_state.sort_values(["chng-fips", "state_code"])

    group_to_state.to_csv(join(OUTPUT_DIR, CHNGFIPS_STATE_OUT_FILENAME), index=False)
550
+
551
+
552
def fetch_county_groups_spreadsheet():
    """Download the CHNG county groupings sheet and return its key columns.

    Returns:
        A DataFrame with two columns: `state_fips` (the sheet's "state FIPS"
        column cast to int) and `fips_list` (the sheet's "county FIPS
        grouping" column, read as strings).
    """
    # County mapping file is derived from
    # https://docs.google.com/spreadsheets/d/1PEce4CjjHbRM1Z5xEMNI6Xsq_b2kkCh0/edit#gid=871427657
    sheet_id = "1PEce4CjjHbRM1Z5xEMNI6Xsq_b2kkCh0"
    sheet_name = "groupings"
    # Request sheet in CSV format via tag in URL. The URL must not contain
    # whitespace around the interpolated values.
    url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"

    # Read everything as strings and discard columns that are entirely empty.
    county_groups = pd.read_csv(url, dtype="string", index_col=False).dropna(how="all", axis=1)
    county_groups["state FIPS"] = county_groups["state FIPS"].astype(int)

    # Counties belonging to each group are listed (as FIPS codes) in the "county
    # FIPS grouping" column, concatenated and separated by the pipe "|".
    new_names = {
        "state FIPS": "state_fips",
        "county FIPS grouping": "fips_list",
    }

    # Select with an explicit list rather than a dict view so the column
    # selection does not depend on how pandas treats `dict_values`.
    return county_groups.rename(columns=new_names)[list(new_names.values())]
579
+
580
+
581
def assign_county_groups():
    """Assign per-state group numbers to CHNG county groupings and save them.

    The groupings are fetched from the spreadsheet. When no groupings file
    exists at CHNG_COUNTY_GROUPS_FILE, each grouping gets a 1-indexed group
    number within its state (in fetched row order) and the result is written
    to that file, stably sorted by `state_fips`. When the file exists and
    every fetched `fips_list` is already in it, the function returns without
    writing anything.

    Raises:
        NotImplementedError: if the groupings file exists and the spreadsheet
            contains `fips_list` values that are not present in it.
    """
    county_groups = fetch_county_groups_spreadsheet()

    # If a county groups mapping file already exists in `data_proc/geomap`, we
    # have to be careful to not reassign a group number to a different group.
    # Group numbers must remain fixed, even if a given county group is no longer
    # being used.
    if isfile(CHNG_COUNTY_GROUPS_FILE):
        old_county_groups = pd.read_csv(CHNG_COUNTY_GROUPS_FILE, dtype="string", index_col=False)
        old_county_groups["group"] = old_county_groups["group"].astype(int)
        old_county_groups["state_fips"] = old_county_groups["state_fips"].astype(int)

        # Remove rows from county_groups if that `fips_list` value already
        # exists in old_county_groups.
        county_groups = county_groups[
            ~county_groups.fips_list.isin(old_county_groups.fips_list)
        ]

        # If grouping file has no new rows, no need to process again.
        if county_groups.empty:
            return

        # Grouping spreadsheet contains rows not seen in old, on-disk county
        # groupings file. Combining the two is delicate. While the code below
        # appears to work, it has not been formally tested and could be
        # invalid for even small changes to the format of the input county
        # groupings file.
        raise NotImplementedError(
            "Can't combine old and new county groupings automatically, "
            "code below is not tested or robust to changes in input format. "
            "We recommend manually working with the code below and the new "
            "data in a REPL."
        )

        # NOTE: everything from here to the end of this branch is intentionally
        # unreachable. It is kept as the reference procedure that the error
        # message above points to.

        # Assign an incrementing integer to be the group id of each remaining
        # county grouping within a state using the given sort order.
        county_groups["group"] = county_groups.groupby("state_fips").cumcount() + 1

        # Find max group number by state in old_county_groups, join on, and
        # add max group number to group number.
        max_group_by_state = old_county_groups.groupby(
            "state_fips"
        ).group.max(
        ).reset_index(
        ).rename(
            columns={"group": "max_group"}
        )
        county_groups = county_groups.join(
            max_group_by_state.set_index("state_fips"),
            how="left",
            on="state_fips"
        ).assign(
            # A state with no previous groups has no max_group after the left
            # join; treat that as 0 so its new groups start at 1 instead of
            # becoming NaN.
            group=lambda x: x.group + x.max_group.fillna(0).astype(int)
        ).drop(
            ["max_group"], axis=1
        )

        # Combine old_county_groups and county_groups
        county_groups = pd.concat([old_county_groups, county_groups])
    else:
        # Group numbers are 1-indexed.
        county_groups["group"] = county_groups.groupby("state_fips").cumcount() + 1

    # A stable sort keeps the within-state row order, and therefore the group
    # numbering order, intact in the saved file.
    county_groups.sort_values(
        ["state_fips"], kind="stable"
    ).to_csv(
        CHNG_COUNTY_GROUPS_FILE, index=False
    )
649
+
650
+
478
651
def clear_dir (dir_path : str ):
479
652
for fname in listdir (dir_path ):
480
653
remove (join (dir_path , fname ))
@@ -501,3 +674,5 @@ def clear_dir(dir_path: str):
501
674
derive_zip_population_table ()
502
675
derive_fips_hhs_crosswalk ()
503
676
derive_zip_hhs_crosswalk ()
677
+ derive_fips_chngfips_crosswalk ()
678
+ derive_chngfips_state_crosswalk ()
0 commit comments