Skip to content

Commit b39d55f

Browse files
Jingjing Tang and krivard
Jingjing Tang
authored and committed
updated code for using geo utils
1 parent 6a7fd65 commit b39d55f

File tree

3 files changed

+70
-109
lines changed

3 files changed

+70
-109
lines changed
Lines changed: 45 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -1,77 +1,49 @@
11
"""Contains geographic mapping tools."""
2-
3-
def zip_to_msa(data, map_df):
4-
"""Map from zipcode to MSA (along with parent state).
5-
Args:
6-
data: dataframe at the day-zip resolution.
7-
Returns:
8-
tuple, a dataframe at day-msa, with parent state column, and their string keys
2+
from delphi_utils import GeoMapper
3+
4+
date_col = "timestamp"
5+
data_cols = ['totalTest', 'numUniqueDevices', 'positiveTest', "population"]
6+
gmpr = GeoMapper() # Use geo utils
7+
GEO_KEY_DICT = {
8+
"county": "fips",
9+
"msa": "msa",
10+
"hrr": "hrr",
11+
"state": "state_id"
12+
}
13+
def geo_map(geo_res, df):
14+
data = df.copy()
15+
geo_key = GEO_KEY_DICT[geo_res]
16+
# Add population for each zipcode
17+
data = gmpr.add_population_column(data, "zip")
18+
# zip -> geo_res
19+
data = gmpr.replace_geocode(data, "zip", geo_key,
20+
date_col=date_col, data_cols=data_cols)
21+
if geo_res == "state":
22+
return data
23+
# Add parent state
24+
data = add_parent_state(data, geo_res, geo_key)
25+
return data, geo_key
26+
27+
def add_parent_state(data, geo_res, geo_key):
928
"""
10-
# zip -> msa
11-
zip_map = map_df[["zip", "cbsa_id"]].dropna().drop_duplicates()
12-
# forget about the rest of the zips that aren't in MSA
13-
data = data.merge(zip_map, how="left", on="zip").dropna().drop(columns=["zip"], axis=1)
14-
15-
# msa + parent state
16-
# msa_map has mapping from msa to state, going by the state with the largest
17-
# population (since a msa may span multiple states)
18-
msa_map = map_df[["cbsa_id", "state_id", "population"]]
19-
msa_map = msa_map.groupby(["cbsa_id"]).max().reset_index()
20-
data = data.merge(msa_map, how="left", on="cbsa_id").drop(
21-
columns=["population"]).dropna()
22-
data = data.groupby(["timestamp", "cbsa_id", "state_id"]).sum().reset_index()
23-
data["cbsa_id"] = data["cbsa_id"].apply(lambda x: str(int(x)).zfill(5))
24-
25-
return data, "cbsa_id"
26-
27-
def zip_to_hrr(data, map_df):
28-
"""Map from zipcode to HRR (along with parent state).
29-
Args:
30-
data: dataframe at the day-zip resolution.
31-
Returns:
32-
tuple, a dataframe at day-msa, with parent state column, and their string keys
29+
- map from msa/hrr to state, going by the state with the largest
30+
population (since a msa/hrr may span multiple states)
31+
- map from county to the corresponding state
3332
"""
34-
# zip -> msa
35-
zip_map = map_df[["zip", "hrrnum"]].dropna().drop_duplicates()
36-
# forget about the rest of the zips that aren't in MSA
37-
data = data.merge(zip_map, how="left", on="zip").dropna().drop(columns=["zip"], axis=1)
38-
39-
# msa + parent state
40-
# msa_map has mapping from msa to state, going by the state with the largest
41-
# population (since a msa may span multiple states)
42-
msa_map = map_df[["hrrnum", "state_id", "population"]]
43-
msa_map = msa_map.groupby(["hrrnum"]).max().reset_index()
44-
data = data.merge(msa_map, how="left", on="hrrnum").drop(
33+
fips_to_state = gmpr._load_crosswalk(from_code="fips", to_code="state")
34+
if geo_res == "county":
35+
mix_map = fips_to_state[["fips", "state_id"]]
36+
else:
37+
fips_to_geo_res = gmpr._load_crosswalk(from_code="fips", to_code=geo_res)
38+
mix_map = fips_to_geo_res[["fips", geo_res]].merge(
39+
fips_to_state[["fips", "state_id"]],
40+
on="fips",
41+
how="inner")
42+
mix_map = gmpr.add_population_column(mix_map, "fips").groupby(
43+
geo_res).max().reset_index().drop(
44+
["fips", "population"], axis = 1)
45+
# Merge the info of parent state to the data
46+
data = data.merge(mix_map, how="left", on=geo_key).drop(
4547
columns=["population"]).dropna()
46-
data = data.groupby(["timestamp", "hrrnum", "state_id"]).sum().reset_index()
47-
data["hrrnum"] = data["hrrnum"].astype(int)
48-
49-
return data, "hrrnum"
50-
51-
def zip_to_county(data, map_df):
52-
"""Aggregate zip codes to the county resolution, along with its parent state.
53-
Args:
54-
data: dataframe aggregated to the day-zip resolution
55-
Returns:
56-
dataframe at the day-county resolution and parent state, with their string keys
57-
"""
58-
# zip -> county + parent state (county has unique state)
59-
zip_map = map_df[["fips", "zip", "state_id"]].dropna().drop_duplicates()
60-
data = data.merge(zip_map, how="left", on="zip").drop(columns=["zip"]).dropna()
61-
data = data.groupby(["timestamp", "fips", "state_id"]).sum().reset_index()
62-
data["fips"] = data["fips"].apply(lambda x: str(int(x)).zfill(5))
63-
64-
return data, "fips"
65-
66-
def zip_to_state(data, map_df):
67-
"""Aggregate zip codes to the state resolution.
68-
Args:
69-
data: dataframe aggregated to the day-zip resolution
70-
Returns:
71-
dataframe at the day-state resolution, with the state key
72-
"""
73-
zip_map = map_df[["zip", "state_id"]].dropna().drop_duplicates()
74-
data = data.merge(zip_map, how="left", on="zip").drop(
75-
columns=["zip"]).dropna()
76-
data = data.groupby(["timestamp", "state_id"]).sum().reset_index()
77-
return data
48+
data = data.groupby(["timestamp", geo_key, "state_id"]).sum().reset_index()
49+
return data

quidel_covidtest/delphi_quidel_covidtest/run.py

Lines changed: 12 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import pandas as pd
1010
from delphi_utils import read_params, add_prefix
1111

12-
from .geo_maps import (zip_to_msa, zip_to_hrr, zip_to_county, zip_to_state)
12+
from .geo_maps import geo_map
1313
from .pull import (pull_quidel_covidtest,
1414
check_export_start_date,
1515
check_export_end_date,
@@ -29,12 +29,8 @@ def run_module():
2929
params = read_params()
3030
cache_dir = params["cache_dir"]
3131
export_dir = params["export_dir"]
32-
static_file_dir = params["static_file_dir"]
3332
export_start_date = params["export_start_date"]
3433
export_end_date = params["export_end_date"]
35-
map_df = pd.read_csv(
36-
join(static_file_dir, "fips_prop_pop.csv"), dtype={"fips": int}
37-
)
3834

3935
# Pull data and update export date
4036
df, _end_date = pull_quidel_covidtest(params)
@@ -50,14 +46,14 @@ def run_module():
5046

5147
# State Level
5248
data = df.copy()
53-
state_groups = zip_to_state(data, map_df).groupby("state_id")
49+
state_groups = geo_map("state", data).groupby("state_id")
5450

5551
# Add prefix, if required
5652
sensors = add_prefix(SENSORS,
5753
wip_signal=read_params()["wip_signal"],
5854
prefix="wip_")
5955
smoothers = SMOOTHERS.copy()
60-
56+
6157
for sensor in sensors:
6258
# For State Level
6359
print("state", sensor)
@@ -78,21 +74,15 @@ def run_module():
7874

7975
# County/HRR/MSA level
8076
for geo_res in GEO_RESOLUTIONS:
81-
print(geo_res, sensor)
82-
data = df.copy()
83-
if geo_res == COUNTY:
84-
data, res_key = zip_to_county(data, map_df)
85-
elif geo_res == MSA:
86-
data, res_key = zip_to_msa(data, map_df)
87-
else:
88-
data, res_key = zip_to_hrr(data, map_df)
89-
90-
res_df = generate_sensor_for_other_geores(
91-
state_groups, data, res_key, smooth=smoothers[sensor][1],
92-
device=smoothers[sensor][0], first_date=first_date,
93-
last_date=last_date)
94-
export_csv(res_df, geo_res, sensor, receiving_dir=export_dir,
95-
start_date=export_start_date, end_date=export_end_date)
77+
geo_data, res_key = geo_map(geo_res, data)
78+
for sensor in sensors:
79+
print(geo_res, sensor)
80+
res_df = generate_sensor_for_other_geores(
81+
state_groups, geo_data, res_key, smooth=smoothers[sensor][1],
82+
device=smoothers[sensor][0], first_date=first_date,
83+
last_date=last_date)
84+
export_csv(res_df, geo_res, sensor, receiving_dir=export_dir,
85+
start_date=export_start_date, end_date=export_end_date)
9686

9787
# Export the cache file if the pipeline runs successfully.
9888
# Otherwise, don't update the cache file

quidel_covidtest/tests/test_geo_maps.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,9 @@
66
import numpy as np
77

88

9-
from delphi_quidel_covidtest.geo_maps import (zip_to_msa, zip_to_hrr,
10-
zip_to_county, zip_to_state)
9+
from delphi_quidel_covidtest.geo_maps import geo_map
1110

1211

13-
map_df = pd.read_csv(
14-
join("../static", "fips_prop_pop.csv"), dtype={"fips": int}
15-
)
16-
1712
class TestGeoMap:
1813
def test_county(self):
1914

@@ -24,10 +19,11 @@ def test_county(self):
2419
"2020-06-15", "2020-06-15", "2020-06-15"],
2520
"totalTest": [100, 50, 200, 200, 250, 500],
2621
"positiveTest": [10, 8, 15, 5, 20, 50],
22+
"numUniqueDevices": [2, 1, 1, 1, 1, 1]
2723
}
2824
)
2925

30-
new_df, res_key = zip_to_county(df, map_df)
26+
new_df, res_key = geo_map("county", df)
3127

3228
assert res_key == 'fips'
3329
assert set(new_df["fips"].values) == set(['25027', '53011', '48439'])
@@ -44,10 +40,11 @@ def test_state(self):
4440
"2020-06-15", "2020-06-15", "2020-06-15"],
4541
"totalTest": [100, 50, 200, 200, 250, 500],
4642
"positiveTest": [10, 8, 15, 5, 20, 50],
43+
"numUniqueDevices": [2, 1, 1, 1, 1, 1]
4744
}
4845
)
4946

50-
new_df = zip_to_state(df, map_df)
47+
new_df = geo_map("state", df)
5148

5249
assert set(new_df["state_id"].values) == set(['ma', 'tx', 'wa'])
5350
assert set(new_df["timestamp"].values) == set(df["timestamp"].values)
@@ -63,12 +60,13 @@ def test_hrr(self):
6360
"2020-06-15", "2020-06-15", "2020-06-15"],
6461
"totalTest": [100, 50, 200, 200, 250, 500],
6562
"positiveTest": [10, 8, 15, 5, 20, 50],
63+
"numUniqueDevices": [2, 1, 1, 1, 1, 1]
6664
}
6765
)
6866

69-
new_df, res_key = zip_to_hrr(df, map_df)
67+
new_df, res_key = geo_map("hrr", df)
7068

71-
assert set(new_df["hrrnum"].values) == set([16, 231, 340, 344, 394])
69+
assert set(new_df["hrr"].values) == set(["16", "231", "340", "344", "394"])
7270
assert set(new_df["timestamp"].values) == set(df["timestamp"].values)
7371
assert set(new_df["totalTest"].values) == set([500, 100, 250, 50, 400])
7472
assert set(new_df["positiveTest"].values) == set([50, 10, 20, 8, 20])
@@ -77,18 +75,19 @@ def test_msa(self):
7775

7876
df = pd.DataFrame(
7977
{
80-
"zip": [1607, 73716, 73719, 76010, 74435, 74936],
78+
"zip": [1607, 73716, 73719, 76010, 74945, 74936],
8179
"timestamp": ["2020-06-15", "2020-06-15", "2020-06-15",
8280
"2020-06-15", "2020-06-15", "2020-06-15"],
8381
"totalTest": [100, 50, 200, 200, 250, 500],
8482
"positiveTest": [10, 8, 15, 5, 20, 50],
83+
"numUniqueDevices": [2, 1, 1, 1, 1, 1]
8584
}
8685
)
8786

88-
new_df, res_key = zip_to_msa(df, map_df)
87+
new_df, res_key = geo_map("msa", df)
8988

90-
assert res_key == 'cbsa_id'
91-
assert set(new_df["cbsa_id"].values) == set(['19100', '22900', '49340'])
89+
assert res_key == 'msa'
90+
assert set(new_df["msa"].values) == set(['19100', '22900', '49340'])
9291
assert set(new_df["timestamp"].values) == set(df["timestamp"].values)
9392
assert set(new_df["totalTest"].values) == set([200, 750, 100])
9493
assert set(new_df["positiveTest"].values) == set([5, 70, 10])

0 commit comments

Comments (0)