cmu-delphi
diff --git a/‎_delphi_utils_python/data_proc/geomap/geo_data_proc.py
Lines changed: 73 additions & 18 deletions b/‎_delphi_utils_python/data_proc/geomap/geo_data_proc.py
Lines changed: 73 additions & 18 deletions
@@ -24,8 +24,8 @@
 FIPS_MSA_URL = "https://www2.census.gov/programs-surveys/metro-micro/geographies/reference-files/2018/delineation-files/list1_Sep_2018.xls"
 JHU_FIPS_URL = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv"
 STATE_CODES_URL = "http://www2.census.gov/geo/docs/reference/state.txt?#"
-
-# fips, zip, jhu_uid
+FIPS_POPULATION_URL = "https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv"
+FIPS_PUERTO_RICO_POPULATION_URL = "https://www2.census.gov/geo/docs/maps-data/data/rel/zcta_county_rel_10.txt?"
 
 # Out files
 FIPS_STATE_OUT_FILENAME = "fips_state_table.csv"
@@ -63,20 +63,6 @@ def create_fips_zip_crosswalk():
     # Pare down the dataframe to just the relevant columns: zip, fips, and population
     pop_df = pop_df[["zip", "fips", "POPPT"]].rename(columns={"POPPT": "pop"})
 
-    # Find the populations by FIPS and ZIP and write them to files
-    (
-        pop_df[["fips", "pop"]]
-        .groupby("fips")
-        .sum()
-        .to_csv(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME))
-    )
-    (
-        pop_df[["zip", "pop"]]
-        .groupby("zip")
-        .sum()
-        .to_csv(join(OUTPUT_DIR, ZIP_POPULATION_OUT_FILENAME))
-    )
-
     # Find the population fractions (the heaviest computation, takes about a minute)
     # Note that the denominator in the fractions is the source population
     pop_df.set_index(["fips", "zip"], inplace=True)
@@ -255,7 +241,7 @@ def create_jhu_uid_fips_crosswalk():
     fips_st = jhu_df["fips"].str.len() <= 2
     jhu_df.loc[fips_st, "fips"] = jhu_df.loc[fips_st, "fips"].str.ljust(5, "0")
 
-    # Drop the FIPS codes in JHU that were hand-modified
+    # Drop the two FIPS codes in JHU (see above) that diverged from the official FIPS map
     dup_ind = jhu_df["fips"].isin(hand_additions["fips"].values) | jhu_df["fips"].isin(
         ["02158", "46102"]
     )
@@ -360,14 +346,81 @@ def create_state_hhs_crosswalk():
     )
 
 
+def create_fips_population_table():
+    census_pop = pd.read_csv(FIPS_POPULATION_URL, encoding="ISO-8859-1")
+    census_pop["fips"] = census_pop.apply(
+        lambda x: f"{x['STATE']:02d}{x['COUNTY']:03d}", axis=1
+    )
+    census_pop["pop"] = census_pop["POPESTIMATE2019"]
+    census_pop = census_pop[["fips", "pop"]]
+    census_pop = pd.concat(
+        [
+            census_pop,
+            pd.DataFrame(
+                {
+                    "fips": ["70002", "70003"],
+                    "pop": [0, 0],
+                }
+            ),
+        ]
+    )
+    census_pop = census_pop.reset_index(drop=True)
+
+    # Set population for Dukes and Nantucket
+    DN_FIPS = "70002"
+    DUKES_FIPS = "25007"
+    NANTU_FIPS = "25019"
+
+    census_pop.loc[census_pop["fips"] == DN_FIPS, "pop"] = (
+        census_pop.loc[census_pop["fips"] == DUKES_FIPS, "pop"].values
+        + census_pop.loc[census_pop["fips"] == NANTU_FIPS, "pop"].values
+    )
+
+    # Set population for Kansas City
+    census_pop.loc[census_pop["fips"] == "70003", "pop"] = 491918  # via Google
+
+    # Get the file with Puerto Rico populations (and a few counties other small counties)
+    df_pr = pd.read_csv(FIPS_PUERTO_RICO_POPULATION_URL)
+    df_pr["fips"] = df_pr["STATE"].astype(str).str.zfill(2) + df_pr["COUNTY"].astype(str).str.zfill(3)
+    df_pr["pop"] = df_pr["POPPT"]
+    df_pr = df_pr[["fips", "pop"]]
+    # Fill the missing data with 2010 information
+    df_pr = df_pr.groupby("fips").sum().reset_index()
+    df_pr = df_pr[~df_pr["fips"].isin(census_pop["fips"])]
+    census_pop_pr = pd.concat([census_pop, df_pr])
+    census_pop_pr.to_csv(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME), index=False)
+
+
+def derive_zip_population_table():
+    if not isfile(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME)):
+        create_fips_population_table()
+
+    if not isfile(join(OUTPUT_DIR, FIPS_ZIP_OUT_FILENAME)):
+        create_fips_zip_crosswalk()
+
+    census_pop = pd.read_csv(
+        join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME), dtype={"fips": str, "pop": int}
+    )
+    fz_df = pd.read_csv(
+        join(OUTPUT_DIR, FIPS_ZIP_OUT_FILENAME),
+        dtype={"fips": str, "zip": str, "weight": float},
+    )
+
+    df = census_pop.merge(fz_df, on="fips", how="left")
+    df["pop"] = df["pop"].multiply(df["weight"], axis=0)
+    df = df.drop(columns=["fips", "weight"]).groupby("zip").sum().dropna().reset_index()
+    df["pop"] = df["pop"].astype(int)
+    df.to_csv(join(OUTPUT_DIR, ZIP_POPULATION_OUT_FILENAME), index=False)
+
+
 def derive_fips_hrr_crosswalk():
     """Derives a crosswalk file from FIPS to HRR through FIPZ -> ZIP -> HRR
     from the crosswalk files made by the functions above."""
     if not isfile(join(OUTPUT_DIR, FIPS_ZIP_OUT_FILENAME)):
         create_fips_zip_crosswalk()
 
     if not isfile(join(OUTPUT_DIR, ZIP_HRR_OUT_FILENAME)):
-        create_fips_zip_crosswalk()
+        create_zip_hsa_hrr_crosswalk()
 
     fz_df = pd.read_csv(
         join(OUTPUT_DIR, FIPS_ZIP_OUT_FILENAME),
@@ -461,8 +514,10 @@ def derive_zip_to_state_code():
     create_jhu_uid_fips_crosswalk()
     create_state_codes_crosswalk()
     create_state_hhs_crosswalk()
+    create_fips_population_table()
 
     derive_fips_hrr_crosswalk()
     derive_zip_msa_crosswalk()
     derive_zip_to_state_code()
     derive_fips_state_crosswalk()
+    derive_zip_population_table()