@@ -483,12 +483,7 @@ def derive_fips_chngfips_crosswalk():
483
483
if not isfile (join (OUTPUT_DIR , FIPS_STATE_OUT_FILENAME )):
484
484
derive_fips_state_crosswalk ()
485
485
486
- # County mapping file is derived from
487
- # https://docs.google.com/spreadsheets/d/1PEce4CjjHbRM1Z5xEMNI6Xsq_b2kkCh0/edit#gid=871427657.
488
- # We assign an incrementing integer to be the group id of each county
489
- # grouping within the given state via:
490
- #
491
- # county_groups["group"] = (county_groups.groupby("state_fips").cumcount() + 1).astype("string")
486
+ assign_county_groups ()
492
487
county_groups = pd .read_csv (LOWPOP_COUNTY_GROUPS_FILE , dtype = "string" , index_col = False
493
488
).drop (columns = "fips_list" )
494
489
@@ -548,6 +543,99 @@ def derive_chngfips_state_crosswalk():
548
543
group_to_state .to_csv (join (OUTPUT_DIR , CHNGFIPS_STATE_OUT_FILENAME ), index = False )
549
544
550
545
546
def fetch_county_groups_spreadsheet():
    """Download the low-population county groupings sheet as a DataFrame.

    County mapping file is derived from
    https://docs.google.com/spreadsheets/d/1PEce4CjjHbRM1Z5xEMNI6Xsq_b2kkCh0/edit#gid=871427657

    Returns
    -------
    pd.DataFrame
        One row per county group, with columns `state_fips` (int),
        `fips_list` (string) and `county1`, `county2`, ... (string), in that
        order.
    """
    sheet_id = "1PEce4CjjHbRM1Z5xEMNI6Xsq_b2kkCh0"
    sheet_name = "groupings"
    # Request sheet in CSV format via tag in URL. The URL must not contain any
    # whitespace, otherwise the HTTP request is rejected as malformed.
    url = (
        f"https://docs.google.com/spreadsheets/d/{sheet_id}"
        f"/gviz/tq?tqx=out:csv&sheet={sheet_name}"
    )

    # Read everything as strings so FIPS codes keep their leading zeros, and
    # drop columns that are entirely empty.
    county_groups = pd.read_csv(
        url, dtype="string", index_col=False
    ).dropna(
        how="all", axis=1
    )
    county_groups["state FIPS"] = county_groups["state FIPS"].astype(int)

    # Counties belonging to each group are listed (as FIPS codes) in the "county
    # FIPS grouping" column, concatenated and separated by the pipe "|". Each
    # included FIPS code is also listed in its own unnamed column. County groups
    # can contain varying numbers of counties, and if CHNG provides new or updated
    # county groupings the number of unnamed columns listing included counties
    # could change. Use a general approach to find all of them.
    county_cols = [
        colname for colname in county_groups.columns if colname.startswith("Unnamed: ")
    ]

    new_names = {
        "state FIPS": "state_fips",
        "county FIPS grouping": "fips_list",
        **{colname: f"county{i}" for i, colname in enumerate(county_cols, start=1)},
    }

    # Keep only the renamed columns, in the order declared above. Selecting
    # with an explicit list avoids relying on how pandas treats a dict view.
    return county_groups.rename(columns=new_names)[list(new_names.values())]
580
+
581
+
582
def assign_county_groups():
    """Assign stable group ids to county groupings and write them to disk.

    Fetches the current county groupings via
    `fetch_county_groups_spreadsheet()` and writes the result to
    `LOWPOP_COUNTY_GROUPS_FILE`, sorted by `state_fips`.

    If `LOWPOP_COUNTY_GROUPS_FILE` already exists, previously assigned group
    numbers are kept as-is; only groupings whose `fips_list` is not yet in the
    file receive new numbers, continuing after the largest number already used
    in their state. If there are no new groupings the file is left untouched.
    """
    county_groups = fetch_county_groups_spreadsheet()

    # If a `lowpop_county_groups.csv` already exists in `data_proc/geomap`, we
    # have to be careful to not reassign a group number to a different group.
    # Group numbers must remain fixed, even if a given county group is no longer
    # being used.
    if isfile(LOWPOP_COUNTY_GROUPS_FILE):
        old_county_groups = pd.read_csv(
            LOWPOP_COUNTY_GROUPS_FILE, dtype="string", index_col=False
        )
        old_county_groups["group"] = old_county_groups["group"].astype(int)
        old_county_groups["state_fips"] = old_county_groups["state_fips"].astype(int)

        # Remove rows from county_groups if that `fips_list` value already
        # exists in old_county_groups. Copy so that adding the `group` column
        # below writes to an independent frame rather than a slice.
        county_groups = county_groups[
            ~county_groups["fips_list"].isin(old_county_groups["fips_list"])
        ].copy()

        # If grouping file has no new rows, no need to process again.
        if county_groups.empty:
            return

        # Largest group number already used in each state. A state that does
        # not appear in the existing file has no entry here, so its offset
        # falls back to 0 and numbering starts at 1 instead of becoming NaN.
        max_group_by_state = old_county_groups.groupby("state_fips")["group"].max()
        group_offset = (
            county_groups["state_fips"].map(max_group_by_state).fillna(0).astype(int)
        )

        # Assign an incrementing integer to be the group id of each remaining
        # county grouping within a state using the given sort order, continuing
        # after the state's existing maximum.
        county_groups["group"] = (
            county_groups.groupby("state_fips").cumcount() + 1 + group_offset
        )

        # Combine old_county_groups and county_groups
        county_groups = pd.concat([old_county_groups, county_groups])
    else:
        county_groups["group"] = county_groups.groupby("state_fips").cumcount() + 1

    # A stable sort keeps the within-state order of the rows.
    county_groups.sort_values(
        ["state_fips"], kind="stable"
    ).to_csv(
        LOWPOP_COUNTY_GROUPS_FILE, index=False
    )
637
+
638
+
551
639
def clear_dir (dir_path : str ):
552
640
for fname in listdir (dir_path ):
553
641
remove (join (dir_path , fname ))
0 commit comments