Skip to content

Commit 3943bd4

Browse files
authored
Merge pull request #1787 from cmu-delphi/ndefries/geomapper/popsafe-county-level
Add `popsafe-fips` to geomapper
2 parents ed530f0 + f785b22 commit 3943bd4

File tree

8 files changed

+12673
-44
lines changed

8 files changed

+12673
-44
lines changed

_delphi_utils_python/data_proc/geomap/chng_county_groups.csv

Lines changed: 402 additions & 0 deletions
Large diffs are not rendered by default.

_delphi_utils_python/data_proc/geomap/geo_data_proc.py

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,15 +33,18 @@
3333
FIPS_PUERTO_RICO_POPULATION_URL = "https://www2.census.gov/geo/docs/maps-data/data/rel/zcta_county_rel_10.txt?"
3434
STATE_HHS_FILE = "hhs.txt"
3535
ZIP_POP_MISSING_FILE = "zip_pop_filling.csv"
36+
CHNG_COUNTY_GROUPS_FILE = "chng_county_groups.csv"
3637

3738
# Out files
3839
FIPS_STATE_OUT_FILENAME = "fips_state_table.csv"
3940
FIPS_MSA_OUT_FILENAME = "fips_msa_table.csv"
4041
FIPS_HRR_OUT_FILENAME = "fips_hrr_table.csv"
4142
FIPS_ZIP_OUT_FILENAME = "fips_zip_table.csv"
4243
FIPS_HHS_FILENAME = "fips_hhs_table.csv"
44+
FIPS_CHNGFIPS_OUT_FILENAME = "fips_chng-fips_table.csv"
4345
FIPS_POPULATION_OUT_FILENAME = "fips_pop.csv"
4446

47+
CHNGFIPS_STATE_OUT_FILENAME = "chng-fips_state_table.csv"
4548
ZIP_HSA_OUT_FILENAME = "zip_hsa_table.csv"
4649
ZIP_HRR_OUT_FILENAME = "zip_hrr_table.csv"
4750
ZIP_FIPS_OUT_FILENAME = "zip_fips_table.csv"
@@ -475,6 +478,176 @@ def derive_zip_hhs_crosswalk():
475478
zip_state.sort_values(["zip", "hhs"]).to_csv(join(OUTPUT_DIR, ZIP_HHS_FILENAME), index=False)
476479

477480

481+
def derive_fips_chngfips_crosswalk():
    """Write the county FIPS -> CHNG FIPS crosswalk table.

    Counties listed in the county groups file are mapped to a group code
    made of the 2-digit state FIPS, the letter "g", and the 2-digit group
    number. Every other county found in the FIPS -> state table maps to
    its own FIPS code.

    The FIPS -> state table is generated first if it is not already on disk,
    and the county groups file is refreshed via `assign_county_groups()`.
    The result is written to `FIPS_CHNGFIPS_OUT_FILENAME` in `OUTPUT_DIR`.
    """
    fips_state_path = join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME)
    if not isfile(fips_state_path):
        derive_fips_state_crosswalk()

    assign_county_groups()
    raw_groups = pd.read_csv(CHNG_COUNTY_GROUPS_FILE, dtype="string", index_col=False)

    # `fips_list` holds pipe-separated county FIPS codes; spread them out so
    # that each member county of a group gets its own column.
    member_columns = raw_groups.fips_list.str.split("|", expand=True)
    wide_groups = pd.concat([raw_groups, member_columns], axis=1)
    wide_groups = wide_groups.drop(columns="fips_list")

    # Reshape to one row per (state, group, county). Groups with fewer
    # members than the widest group leave missing cells, which are dropped.
    members = pd.melt(
        wide_groups,
        id_vars=["state_fips", "group"],
        var_name="county_num",
        value_name="fips",
    )
    members = members.drop(columns="county_num").dropna()

    # Restore the leading zeros of each code component.
    members["state_fips"] = members["state_fips"].str.zfill(2)
    members["group"] = members["group"].str.zfill(2)
    members["fips"] = members["fips"].str.zfill(5).astype("string")

    # A CHNG FIPS code for a grouped county is "<state>g<group>".
    members["chng-fips"] = members["state_fips"] + "g" + members["group"]
    grouped = members[["fips", "chng-fips"]]

    fips_to_state = pd.read_csv(fips_state_path, dtype="string", index_col=False)

    # Counties that belong to no group keep their ordinary FIPS code as
    # their CHNG FIPS code.
    ungrouped_fips = list(set(fips_to_state.fips).difference(set(grouped.fips)))
    ungrouped = pd.DataFrame(
        {"fips": ungrouped_fips, "chng-fips": ungrouped_fips},
        dtype="string",
    )

    # Stack grouped and ungrouped counties and write them in sorted order.
    crosswalk = pd.concat([grouped, ungrouped])
    crosswalk = crosswalk.sort_values(["fips", "chng-fips"])
    crosswalk.to_csv(join(OUTPUT_DIR, FIPS_CHNGFIPS_OUT_FILENAME), index=False)
528+
529+
530+
def derive_chngfips_state_crosswalk():
    """Write the CHNG FIPS -> state crosswalk table.

    Joins the FIPS -> CHNG FIPS table with the FIPS -> state table on the
    county FIPS code, drops the county column, and removes duplicate rows so
    that each CHNG FIPS code appears once per distinct set of state columns.
    Either input table is generated first if it is not already on disk. The
    result is written to `CHNGFIPS_STATE_OUT_FILENAME` in `OUTPUT_DIR`.
    """
    fips_state_path = join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME)
    if not isfile(fips_state_path):
        derive_fips_state_crosswalk()

    fips_chngfips_path = join(OUTPUT_DIR, FIPS_CHNGFIPS_OUT_FILENAME)
    if not isfile(fips_chngfips_path):
        derive_fips_chngfips_crosswalk()

    fips_to_group = pd.read_csv(fips_chngfips_path, dtype="string", index_col=False)
    fips_to_state = pd.read_csv(fips_state_path, dtype="string", index_col=False)

    # Attach the state columns to every county, then collapse counties that
    # share a CHNG FIPS code into a single row.
    state_by_fips = fips_to_state.set_index("fips")
    with_state = fips_to_group.join(state_by_fips, on="fips", how="left")
    group_to_state = with_state.drop(columns="fips").drop_duplicates()
    group_to_state = group_to_state.sort_values(["chng-fips", "state_code"])

    group_to_state.to_csv(join(OUTPUT_DIR, CHNGFIPS_STATE_OUT_FILENAME), index=False)
550+
551+
552+
def fetch_county_groups_spreadsheet():
    """Download the county groupings sheet and return it as a DataFrame.

    The county mapping is derived from
    https://docs.google.com/spreadsheets/d/1PEce4CjjHbRM1Z5xEMNI6Xsq_b2kkCh0/edit#gid=871427657

    Returns a DataFrame with two columns:
      - `state_fips`: the sheet's "state FIPS" column, cast to int.
      - `fips_list`: the sheet's "county FIPS grouping" column, in which the
        FIPS codes of the counties in a group are concatenated with "|".
    """
    sheet_id = "1PEce4CjjHbRM1Z5xEMNI6Xsq_b2kkCh0"
    sheet_name = "groupings"
    # The `tqx=out:csv` tag in the URL requests the sheet in CSV format.
    url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"

    sheet = pd.read_csv(url, dtype="string", index_col=False)
    # Discard columns that contain no data at all.
    sheet = sheet.dropna(how="all", axis=1)
    sheet["state FIPS"] = sheet["state FIPS"].astype(int)

    # Map the sheet's column headers onto the names used by the rest of the
    # pipeline; only these columns are kept.
    column_renames = {
        "state FIPS": "state_fips",
        "county FIPS grouping": "fips_list",
    }
    renamed = sheet.rename(columns=column_renames)

    return renamed[column_renames.values()]
579+
580+
581+
def assign_county_groups():
    """Assign per-state group numbers to county groupings and save them.

    Fetches the county groupings spreadsheet and writes it, with a `group`
    column added, to `CHNG_COUNTY_GROUPS_FILE`, sorted stably by
    `state_fips`.

    If the groups file does not exist yet, each grouping gets a 1-indexed
    group number within its state, following the spreadsheet's row order.

    If the groups file already exists, groupings whose `fips_list` is already
    present in it are ignored. When nothing new remains the function returns
    without rewriting the file; otherwise it raises, because previously
    assigned group numbers must never be reassigned and the merge logic has
    not been validated.

    Raises:
        NotImplementedError: the spreadsheet contains groupings that are not
            in the existing on-disk groups file.
    """
    county_groups = fetch_county_groups_spreadsheet()

    # If a county groups mapping file already exists in `data_proc/geomap`, we
    # have to be careful to not reassign a group number to a different group.
    # Group numbers must remain fixed, even if a given county group is no longer
    # being used.
    if isfile(CHNG_COUNTY_GROUPS_FILE):
        old_county_groups = pd.read_csv(CHNG_COUNTY_GROUPS_FILE, dtype="string", index_col=False)
        old_county_groups.group = old_county_groups.group.astype(int)
        old_county_groups.state_fips = old_county_groups.state_fips.astype(int)

        # Remove rows from county_groups if that `fips_list` value already
        # exists in old_county_groups.
        county_groups = county_groups[
            ~county_groups.fips_list.isin(old_county_groups.fips_list)
        ]

        # If grouping file has no new rows, no need to process again.
        if county_groups.empty:
            return
        # Grouping spreadsheet contains rows not seen in old, on-disk county
        # groupings file. Combining the two is delicate. While the code below
        # appears to work, it has not been formally tested and could be
        # invalid for even small changes to the format of the input county
        # groupings file.
        else:
            # Adjacent string literals are concatenated verbatim, so each
            # piece needs its own trailing space to keep the sentences apart.
            raise NotImplementedError(
                "Can't combine old and new county groupings automatically, "
                "code below is not tested or robust to changes in input format. "
                "We recommend manually working with the code below and the new "
                "data in a REPL."
            )

        # NOTE: everything from here to the end of this `if` branch is
        # unreachable (both arms above exit the function). It is kept on
        # purpose as the starting point referred to by the error message.

        # Assign an incrementing integer to be the group id of each remaining
        # county grouping within a state using the given sort order.
        county_groups["group"] = county_groups.groupby("state_fips").cumcount() + 1

        # Find max group number by state in old_county_groups, join on, and
        # add max group number to group number.
        max_group_by_state = old_county_groups.groupby(
            "state_fips"
        ).group.max(
        ).reset_index(
        ).rename(
            columns = {"group": "max_group"}
        )
        county_groups = county_groups.join(
            max_group_by_state.set_index("state_fips"),
            how="left",
            on="state_fips"
        ).assign(
            group = lambda x: x.group + x.max_group
        ).drop(
            ["max_group"], axis=1
        )

        # Combine old_county_groups and county_groups
        county_groups = pd.concat([old_county_groups, county_groups])
    else:
        # Group numbers are 1-indexed.
        county_groups["group"] = county_groups.groupby("state_fips").cumcount() + 1

    # A stable sort keeps the within-state row order, and therefore the
    # group numbering order, intact in the written file.
    county_groups.sort_values(
        ["state_fips"], kind="stable"
    ).to_csv(
        CHNG_COUNTY_GROUPS_FILE, index=False
    )
649+
650+
478651
def clear_dir(dir_path: str):
479652
for fname in listdir(dir_path):
480653
remove(join(dir_path, fname))
@@ -501,3 +674,5 @@ def clear_dir(dir_path: str):
501674
derive_zip_population_table()
502675
derive_fips_hhs_crosswalk()
503676
derive_zip_hhs_crosswalk()
677+
derive_fips_chngfips_crosswalk()
678+
derive_chngfips_state_crosswalk()

0 commit comments

Comments
 (0)