cmu-delphi
diff --git a/_delphi_utils_python/data_proc/geomap/geo_data_proc.py b/_delphi_utils_python/data_proc/geomap/geo_data_proc.py
Lines changed: 146 additions & 90 deletions
@@ -1,6 +1,4 @@
-"""Needed to process the geo files to get from xls file to a simpler csv.
-pip install xlrd
-
+"""
 Author: James Sharpnack @jsharpna
 Refactored by: Dmitry Shemetov @dshemetov
 """
@@ -54,45 +52,57 @@ def create_fips_zip_crosswalk():
     pop_df = pd.read_csv(FIPS_BY_ZIP_POP_URL)
 
     # Create the FIPS column by combining the state and county codes
-    pop_df["fips"] = pop_df["STATE"].astype(str).str.zfill(2) + pop_df["COUNTY"].astype(
-        str
-    ).str.zfill(3)
+    state_codes = pop_df["STATE"].astype(str).str.zfill(2)
+    county_codes = pop_df["COUNTY"].astype(str).str.zfill(3)
+    pop_df["fips"] = state_codes + county_codes
 
     # Create the ZIP column by adding leading zeros to the ZIP
     pop_df["zip"] = pop_df["ZCTA5"].astype(str).str.zfill(5)
 
     # Pare down the dataframe to just the relevant columns: zip, fips, and population
     pop_df = pop_df[["zip", "fips", "POPPT"]].rename(columns={"POPPT": "pop"})
 
-    # Find the populations by FIPS and ZIP
-    pop_fips = pop_df[["fips", "pop"]].groupby("fips").sum()
-    pop_zip = pop_df[["zip", "pop"]].groupby("zip").sum()
-    pop_fips.to_csv(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME))
-    pop_zip.to_csv(join(OUTPUT_DIR, ZIP_POPULATION_OUT_FILENAME))
+    # Find the populations by FIPS and ZIP and write them to files
+    (
+        pop_df[["fips", "pop"]]
+        .groupby("fips")
+        .sum()
+        .to_csv(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME))
+    )
+    (
+        pop_df[["zip", "pop"]]
+        .groupby("zip")
+        .sum()
+        .to_csv(join(OUTPUT_DIR, ZIP_POPULATION_OUT_FILENAME))
+    )
 
     # Find the population fractions (the heaviest computation, takes about a minute)
-    # Note that the denominator in the fractions is the target code population
+    # Note that the denominator in the fractions is the source population
     pop_df.set_index(["fips", "zip"], inplace=True)
-    fips_zip = pop_df.groupby("zip", as_index=False).apply(
+    fips_zip = pop_df.groupby("fips", as_index=False).apply(
         lambda g: g["pop"] / g["pop"].sum()
     )
-    zip_fips = pop_df.groupby("fips", as_index=False).apply(
+    zip_fips = pop_df.groupby("zip", as_index=False).apply(
         lambda g: g["pop"] / g["pop"].sum()
     )
 
     # Rename and write to file
-    fips_zip.reset_index(level=["fips", "zip"]).rename(
-        columns={"pop": "weight"}
-    ).to_csv(join(OUTPUT_DIR, FIPS_ZIP_OUT_FILENAME), index=False)
-    zip_fips.reset_index(level=["fips", "zip"]).rename(
-        columns={"pop": "weight"}
-    ).to_csv(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME), index=False)
+    (
+        fips_zip.reset_index(level=["fips", "zip"])
+        .rename(columns={"pop": "weight"})
+        .to_csv(join(OUTPUT_DIR, FIPS_ZIP_OUT_FILENAME), index=False)
+    )
+    (
+        zip_fips.reset_index(level=["fips", "zip"])
+        .rename(columns={"pop": "weight"})
+        .to_csv(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME), index=False)
+    )
 
 
 def create_zip_hsa_hrr_crosswalk():
     """Creates the crosswalk table from ZIP to HSA and from ZIP to HRR from source."""
-    zipped_csv = BytesIO(requests.get(ZIP_HSA_HRR_URL).content)
-    zip_df = pd.read_csv(ZipFile(zipped_csv).open(ZIP_HSA_HRR_FILENAME))
+    zipped_csv = ZipFile(BytesIO(requests.get(ZIP_HSA_HRR_URL).content))
+    zip_df = pd.read_csv(zipped_csv.open(ZIP_HSA_HRR_FILENAME))
 
     # Build the HSA table
     hsa_df = zip_df[["zipcode18", "hsanum"]].rename(
@@ -122,8 +132,13 @@ def create_fips_msa_crosswalk():
         "FIPS State Code": str,
         "FIPS County Code": str,
     }
-    msa_df = pd.read_excel(
-        FIPS_MSA_URL, skiprows=2, skipfooter=4, usecols=msa_cols.keys(), dtype=msa_cols,
+    # The following line requires the xlrd package.
+    msa_df = pd.read_excel( 
+        FIPS_MSA_URL,
+        skiprows=2,
+        skipfooter=4,
+        usecols=msa_cols.keys(),
+        dtype=msa_cols,
     )
 
     metro_bool = (
@@ -134,9 +149,10 @@ def create_fips_msa_crosswalk():
 
     # Combine state and county codes into a single FIPS code
     msa_df["fips"] = msa_df["FIPS State Code"].str.cat(msa_df["FIPS County Code"])
-    msa_df.rename(columns={"CBSA Code": "msa"}, inplace=True)
-    msa_df = msa_df[["fips", "msa"]]
-    msa_df.to_csv(join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME), index=False)
+
+    msa_df.rename(columns={"CBSA Code": "msa"})[["fips", "msa"]].to_csv(
+        join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME), index=False
+    )
 
 
 def create_jhu_uid_fips_crosswalk():
@@ -147,37 +163,72 @@ def create_jhu_uid_fips_crosswalk():
     hand_additions = pd.DataFrame(
         [
             # Split aggregation of Dukes and Nantucket, Massachusetts
-            {"jhu_uid": 84070002, "fips": "25007", "weight": 16535/(16535 + 10172)}, # Population: 16535
-            {"jhu_uid": 84070002, "fips": "25019", "weight": 10172/(16535 + 10172)}, # 10172
+            {
+                "jhu_uid": 84070002,
+                "fips": "25007",
+                "weight": 16535 / (16535 + 10172),
+            },  # Population: 16535
+            {
+                "jhu_uid": 84070002,
+                "fips": "25019",
+                "weight": 10172 / (16535 + 10172),
+            },  # 10172
             # Kansas City, Missouri
-            {"jhu_uid": 84070003, "fips": "29095", "weight": 674158 / 1084897}, # Population: 674158
-            {"jhu_uid": 84070003, "fips": "29165", "weight": 89322 / 1084897}, # 89322
-            {"jhu_uid": 84070003, "fips": "29037", "weight": 99478 / 1084897}, # 99478
-            {"jhu_uid": 84070003, "fips": "29047", "weight": 221939 / 1084897}, # 221939
+            {
+                "jhu_uid": 84070003,
+                "fips": "29095",
+                "weight": 674158 / 1084897,
+            },  # Population: 674158
+            {"jhu_uid": 84070003, "fips": "29165", "weight": 89322 / 1084897},  # 89322
+            {"jhu_uid": 84070003, "fips": "29037", "weight": 99478 / 1084897},  # 99478
+            {
+                "jhu_uid": 84070003,
+                "fips": "29047",
+                "weight": 221939 / 1084897,
+            },  # 221939
             # Kusilvak, Alaska
             {"jhu_uid": 84002158, "fips": "02270", "weight": 1.0},
             # Oglala Lakota
             {"jhu_uid": 84046102, "fips": "46113", "weight": 1.0},
             # Split aggregation of New York County (populations from JHU documentation)
-            {"jhu_uid": 84036061, "fips": "36005", "weight": 1418207/8336817}, # Population: 1,418,207
-            {"jhu_uid": 84036061, "fips": "36047", "weight": 2559903/8336817}, # 2,559,903
-            {"jhu_uid": 84036061, "fips": "36061", "weight": 1628706/8336817}, # 1,628,706
-            {"jhu_uid": 84036061, "fips": "36081", "weight": 2253858/8336817}, # 2,253,858
-            {"jhu_uid": 84036061, "fips": "36085", "weight": 476143/8336817}, # 476,143
+            {
+                "jhu_uid": 84036061,
+                "fips": "36005",
+                "weight": 1418207 / 8336817,
+            },  # Population: 1,418,207
+            {
+                "jhu_uid": 84036061,
+                "fips": "36047",
+                "weight": 2559903 / 8336817,
+            },  # 2,559,903
+            {
+                "jhu_uid": 84036061,
+                "fips": "36061",
+                "weight": 1628706 / 8336817,
+            },  # 1,628,706
+            {
+                "jhu_uid": 84036061,
+                "fips": "36081",
+                "weight": 2253858 / 8336817,
+            },  # 2,253,858
+            {
+                "jhu_uid": 84036061,
+                "fips": "36085",
+                "weight": 476143 / 8336817,
+            },  # 476,143
             # Aggregate Utah into a "State FIPS"
-            {'jhu_uid': 84070015, 'fips': "49000", 'weight': 1.},
-            {'jhu_uid': 84070016, 'fips': "49000", 'weight': 1.},
-            {'jhu_uid': 84070017, 'fips': "49000", 'weight': 1.},
-            {'jhu_uid': 84070018, 'fips': "49000", 'weight': 1.},
-            {'jhu_uid': 84070019, 'fips': "49000", 'weight': 1.},
-            {'jhu_uid': 84070020, 'fips': "49000", 'weight': 1.}
+            {"jhu_uid": 84070015, "fips": "49000", "weight": 1.0},
+            {"jhu_uid": 84070016, "fips": "49000", "weight": 1.0},
+            {"jhu_uid": 84070017, "fips": "49000", "weight": 1.0},
+            {"jhu_uid": 84070018, "fips": "49000", "weight": 1.0},
+            {"jhu_uid": 84070019, "fips": "49000", "weight": 1.0},
+            {"jhu_uid": 84070020, "fips": "49000", "weight": 1.0},
         ]
     )
 
-    jhu_df = pd.read_csv(JHU_FIPS_URL, dtype={"UID": str, "FIPS": str})
-    jhu_df = jhu_df.query("Country_Region == 'US'")
     jhu_df = (
-        jhu_df[["UID", "FIPS"]]
+        pd.read_csv(JHU_FIPS_URL, dtype={"UID": str, "FIPS": str})
+        .query("Country_Region == 'US'")[["UID", "FIPS"]]
         .rename(columns={"UID": "jhu_uid", "FIPS": "fips"})
         .dropna(subset=["fips"])
     )
@@ -186,7 +237,9 @@ def create_jhu_uid_fips_crosswalk():
     # These are Guam (66), Northern Mariana Islands (69), Virgin Islands (78),
     # and Puerto Rico (72).
     fips_st = jhu_df["fips"].str.len() <= 2
-    jhu_df.loc[fips_st, "fips"] = jhu_df.loc[fips_st, "fips"].astype(str).str.ljust(5, '0')
+    jhu_df.loc[fips_st, "fips"] = (
+        jhu_df.loc[fips_st, "fips"].astype(str).str.ljust(5, "0")
+    )
 
     # Drop the JHU UIDs that were hand-modified
     dup_ind = jhu_df["jhu_uid"].isin(hand_additions["jhu_uid"].values)
@@ -206,22 +259,23 @@ def create_jhu_uid_fips_crosswalk():
 
 def create_state_codes_crosswalk():
     """Creat the State ID -> State Name -> State code crosswalk file."""
-    df = pd.read_csv(
-        "http://www2.census.gov/geo/docs/reference/state.txt?#", delimiter="|"
-    )
-    df = df.drop(columns="STATENS").rename(
-        columns={
-            "STATE": "state_code",
-            "STUSAB": "state_id",
-            "STATE_NAME": "state_name",
-        }
+    df = (
+        pd.read_csv(STATE_CODES_URL, delimiter="|")
+        .drop(columns="STATENS")
+        .rename(
+            columns={
+                "STATE": "state_code",
+                "STUSAB": "state_id",
+                "STATE_NAME": "state_name",
+            }
+        )
     )
     df["state_code"] = df["state_code"].astype(str).str.zfill(2)
     df.to_csv(join(OUTPUT_DIR, STATE_OUT_FILENAME), index=False)
 
 
 def derive_fips_hrr_crosswalk():
-    """Derives a crosswalk file from FIPS to HRR through FIPZ -> ZIP -> HRR 
+    """Derives a crosswalk file from FIPS to HRR through FIPS -> ZIP -> HRR
     from the crosswalk files made by the functions above."""
     if not isfile(join(OUTPUT_DIR, FIPS_ZIP_OUT_FILENAME)):
         create_fips_zip_crosswalk()
@@ -235,22 +289,17 @@ def derive_fips_hrr_crosswalk():
     )
     zh_df = pd.read_csv(
         join(OUTPUT_DIR, ZIP_HRR_OUT_FILENAME),
-        dtype={"fips": str, "zip": str, "weight": float},
+        dtype={"zip": str, "hrr": str},
     )
 
-    df = fz_df.join(zh_df.set_index("zip"), on="zip")
-    df = df.drop(columns="zip")
-    df = df.reset_index().set_index(["fips", "hrr"])
-    df = df.groupby(["hrr"], as_index=False).apply(
-        lambda g: g["weight"] / g["weight"].sum()
+    (
+        fz_df.merge(zh_df, on="zip", how="left")
+        .drop(columns="zip")
+        .groupby(["fips", "hrr"])
+        .sum()
+        .reset_index()
+        .to_csv(join(OUTPUT_DIR, FIPS_HRR_OUT_FILENAME), index=False)
     )
-    df = df.reset_index(level=["fips", "hrr"])
-
-    # Cast back to str
-    df["hrr"] = df["hrr"].astype(int).astype(str)
-    df["fips"] = df["fips"].astype(str).str.zfill(5)
-
-    df.to_csv(join(OUTPUT_DIR, FIPS_HRR_OUT_FILENAME), index=False)
 
 
 def derive_fips_state_crosswalk():
@@ -263,34 +312,38 @@ def derive_fips_state_crosswalk():
     )
 
     fips_pop["state_code"] = fips_pop["fips"].str[:2]
-    fips_pop = fips_pop.merge(state_codes, on="state_code", how="left")
-    fips_pop = fips_pop.drop(columns="pop")
-
-    fips_pop.to_csv(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME), index=False)
+    (
+        fips_pop.merge(state_codes, on="state_code", how="left")
+        .drop(columns="pop")
+        .to_csv(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME), index=False)
+    )
 
 
 def derive_zip_msa_crosswalk():
-    """Derives a crosswalk file from ZIP to MSA through ZIP -> FIPS -> HRR 
+    """Derives a crosswalk file from ZIP to MSA through ZIP -> FIPS -> MSA
     from the crosswalk files made by the functions above."""
     if not isfile(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME)):
         create_fips_zip_crosswalk()
 
     if not isfile(join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME)):
         create_fips_msa_crosswalk()
 
-    zf_df = pd.read_csv(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME))
-    fm_df = pd.read_csv(join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME))
+    zf_df = pd.read_csv(
+        join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME),
+        dtype={"zip": str, "fips": str, "weight": float},
+    )
+    fm_df = pd.read_csv(
+        join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME), dtype={"fips": str, "msa": str}
+    )
 
-    df = zf_df.join(fm_df.set_index("fips"), on="fips")
-    df = df.drop(columns="fips")
-    df = df.set_index(["zip", "msa"])
-    df = df.groupby(["msa"], as_index=False).apply(
-        lambda g: g["weight"] / g["weight"].sum()
+    (
+        zf_df.merge(fm_df, on="fips")
+        .drop(columns="fips")
+        .groupby(["msa", "zip"])
+        .sum()
+        .reset_index()
+        .to_csv(join(OUTPUT_DIR, ZIP_MSA_OUT_FILENAME), index=False)
     )
-    df = df.reset_index(level=["zip", "msa"])
-    df["zip"] = df["zip"].astype(str).str.zfill(5)
-    df["msa"] = df["msa"].astype(int).astype(str)
-    df.to_csv(join(OUTPUT_DIR, ZIP_MSA_OUT_FILENAME), index=False)
 
 
 def derive_zip_to_state_code():
@@ -306,10 +359,13 @@ def derive_zip_to_state_code():
     zf_cf = pd.read_csv(
         join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME), dtype={"zip": str, "fips": str}
     )
+
     zf_cf["state_code"] = zf_cf["fips"].str[:2]
-    df = zf_cf.merge(sdf, left_on="state_code", right_on="state_code", how="left")
-    df = df.drop(columns=["fips"])
-    df.to_csv(join(OUTPUT_DIR, ZIP_STATE_CODE_OUT_FILENAME), index=False)
+    (
+        zf_cf.merge(sdf, left_on="state_code", right_on="state_code", how="left")
+        .drop(columns=["fips"])
+        .to_csv(join(OUTPUT_DIR, ZIP_STATE_CODE_OUT_FILENAME), index=False)
+    )
 
 
 if __name__ == "__main__":