Skip to content

Commit bb7df1a

Browse files
committed
generate local county mapping from CHNG spreadsheet
1 parent 1a1bba7 commit bb7df1a

File tree

1 file changed

+94
-6
lines changed

1 file changed

+94
-6
lines changed

_delphi_utils_python/data_proc/geomap/geo_data_proc.py

Lines changed: 94 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -483,12 +483,7 @@ def derive_fips_chngfips_crosswalk():
483483
if not isfile(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME)):
484484
derive_fips_state_crosswalk()
485485

486-
# County mapping file is derived from
487-
# https://docs.google.com/spreadsheets/d/1PEce4CjjHbRM1Z5xEMNI6Xsq_b2kkCh0/edit#gid=871427657.
488-
# We assign an incrementing integer to be the group id of each county
489-
# grouping within the given state via:
490-
#
491-
# county_groups["group"] = (county_groups.groupby("state_fips").cumcount() + 1).astype("string")
486+
assign_county_groups()
492487
county_groups = pd.read_csv(LOWPOP_COUNTY_GROUPS_FILE, dtype="string", index_col=False
493488
).drop(columns = "fips_list")
494489

@@ -548,6 +543,99 @@ def derive_chngfips_state_crosswalk():
548543
group_to_state.to_csv(join(OUTPUT_DIR, CHNGFIPS_STATE_OUT_FILENAME), index=False)
549544

550545

546+
def fetch_county_groups_spreadsheet():
    """Download the CHNG county-grouping sheet and return it as a tidy frame.

    The county mapping is derived from
    https://docs.google.com/spreadsheets/d/1PEce4CjjHbRM1Z5xEMNI6Xsq_b2kkCh0/edit#gid=871427657

    Returns:
        A DataFrame with columns ``state_fips`` (cast to int), ``fips_list``
        and ``county1`` ... ``countyN``, in that order. All columns other
        than ``state_fips`` are read with the pandas "string" dtype.
    """
    sheet_id = "1PEce4CjjHbRM1Z5xEMNI6Xsq_b2kkCh0"
    sheet_name = "groupings"
    # The `tqx=out:csv` tag in the URL asks for the sheet in CSV format.
    url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"

    sheet = pd.read_csv(url, dtype="string", index_col=False)
    # Discard columns that contain no values at all.
    sheet = sheet.dropna(how="all", axis=1)
    sheet["state FIPS"] = sheet["state FIPS"].astype(int)

    # Each group's member counties are listed (as FIPS codes) in the "county
    # FIPS grouping" column, joined by the pipe "|". Every member FIPS code is
    # also listed in its own unnamed column. Groups vary in size, and new or
    # updated CHNG groupings could change how many unnamed columns exist, so
    # discover them by name prefix rather than hard-coding a count.
    unnamed_cols = [name for name in sheet.columns if name.startswith("Unnamed: ")]

    renames = {
        "state FIPS": "state_fips",
        "county FIPS grouping": "fips_list",
    }
    for position, name in enumerate(unnamed_cols, start=1):
        renames[name] = "county" + str(position)

    # Rename, then keep only the renamed columns in mapping order.
    renamed = sheet.rename(columns=renames)
    return renamed[renames.values()]
580+
581+
582+
def assign_county_groups():
    """Create or extend the low-population county groups file.

    Fetches the current county groupings via
    ``fetch_county_groups_spreadsheet()`` and assigns each grouping an integer
    ``group`` id that increments within its ``state_fips``. The result is
    written to ``LOWPOP_COUNTY_GROUPS_FILE``, sorted (stably) by state.

    If ``LOWPOP_COUNTY_GROUPS_FILE`` already exists, previously assigned group
    numbers are never changed or reused: only groupings whose ``fips_list`` is
    not yet present in the file are numbered, continuing after the largest
    existing group number for their state. If there are no new groupings the
    file is left untouched.

    Returns:
        None. The only output is the CSV written to
        ``LOWPOP_COUNTY_GROUPS_FILE``.
    """
    county_groups = fetch_county_groups_spreadsheet()

    if isfile(LOWPOP_COUNTY_GROUPS_FILE):
        # Group numbers must remain fixed, even if a given county group is no
        # longer being used, so start from what has already been assigned.
        old_county_groups = pd.read_csv(
            LOWPOP_COUNTY_GROUPS_FILE, dtype="string", index_col=False
        )
        old_county_groups["group"] = old_county_groups["group"].astype(int)
        old_county_groups["state_fips"] = old_county_groups["state_fips"].astype(int)

        # Keep only groupings whose `fips_list` is not already on file. Take
        # an explicit copy so the column assignment below operates on an
        # independent frame rather than on a slice of the fetched data.
        is_known = county_groups["fips_list"].isin(old_county_groups["fips_list"])
        county_groups = county_groups[~is_known].copy()

        # If the spreadsheet has no new rows, there is nothing to process.
        if county_groups.empty:
            return

        # Largest group number already used in each state. A state that has
        # no entry in the existing file maps to NaN; treat that as 0 so its
        # first new group is numbered 1 instead of becoming NaN.
        max_group_by_state = old_county_groups.groupby("state_fips")["group"].max()
        offsets = (
            county_groups["state_fips"]
            .map(max_group_by_state)
            .fillna(0)
            .astype(int)
        )

        # Number the new groupings within each state in the given sort order,
        # continuing after the state's existing maximum.
        county_groups["group"] = (
            county_groups.groupby("state_fips").cumcount() + 1 + offsets
        )

        # Combine the existing groups with the newly numbered ones.
        county_groups = pd.concat([old_county_groups, county_groups])
    else:
        # No existing file: number every grouping from 1 within its state.
        county_groups["group"] = county_groups.groupby("state_fips").cumcount() + 1

    # A stable sort keeps existing groups ahead of new ones within a state.
    county_groups.sort_values(
        ["state_fips"], kind="stable"
    ).to_csv(
        LOWPOP_COUNTY_GROUPS_FILE, index=False
    )
637+
638+
551639
def clear_dir(dir_path: str):
    """Remove every entry that `listdir` reports directly inside `dir_path`."""
    # Resolve each entry name to a path under `dir_path` before deleting it.
    targets = [join(dir_path, entry) for entry in listdir(dir_path)]
    for target in targets:
        remove(target)

0 commit comments

Comments
 (0)