Extend geocode utility to actually support the state to state mappings #310

Merged
merged 11 commits into from
Oct 26, 2020
27 changes: 24 additions & 3 deletions _delphi_utils_python/delphi_utils/geomap.py
@@ -77,6 +77,11 @@ class GeoMapper:
==========
The main GeoMapper object loads and stores crosswalk dataframes on-demand.
When replacing geocodes with new ones, an aggregation step is performed on the data columns
to merge entries (i.e. in the case of a many-to-one mapping or a weighted mapping). This
requires a specification of the data columns, which are assumed to be all the columns that
are not the geocode columns or the date column specified in date_col.
Example 1: to add a new column with a new geocode, possibly with weights:
> gmpr = GeoMapper()
> df = gmpr.add_geocode(df, "fips", "zip", from_col="fips", new_col="geo_id",
@@ -305,7 +310,12 @@ def add_geocode(
)

# state codes are all stored in one table
if new_code in state_codes:
if from_code in state_codes and new_code in state_codes:
crosswalk = self._load_crosswalk(from_code="state", to_code="state")
crosswalk = crosswalk.rename(
columns={from_code: from_col, new_code: new_col}
)
elif new_code in state_codes:
crosswalk = self._load_crosswalk(from_code=from_code, to_code="state")
crosswalk = crosswalk.rename(
columns={from_code: from_col, new_code: new_col}
@@ -322,9 +332,13 @@ def replace_geocode(
df = df.merge(crosswalk, left_on=from_col, right_on=from_col, how="left")

# Drop extra state columns
if new_code in state_codes:
if new_code in state_codes and from_code not in state_codes:
state_codes.remove(new_code)
df.drop(columns=state_codes, inplace=True)
elif new_code in state_codes and from_code in state_codes:
state_codes.remove(new_code)
state_codes.remove(from_code)
df.drop(columns=state_codes, inplace=True)

return df
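The new state-to-state branch can be hard to follow in the diff alone; here is a minimal pandas sketch of the merge-then-drop logic, using a toy crosswalk (the values and the three-column subset are hypothetical stand-ins, not the real crosswalk file):

```python
import pandas as pd

# Toy version of the single state table (hypothetical values).
state_crosswalk = pd.DataFrame({
    "state_code": ["06", "36"],
    "state_id": ["ca", "ny"],
    "state_name": ["California", "New York"],
})

df = pd.DataFrame({"state_id": ["ca", "ny"], "count": [1, 2]})

# state -> state: merge the full table, then drop every state column
# except the from/to pair, mirroring the elif branch above.
state_codes = ["state_code", "state_id", "state_name"]
from_code, new_code = "state_id", "state_name"
extra = [c for c in state_codes if c not in (from_code, new_code)]
out = df.merge(state_crosswalk, on=from_code, how="left").drop(columns=extra)
print(out.columns.tolist())  # ['state_id', 'count', 'state_name']
```

The key point is that both the from and the to column survive the drop, which is exactly what the single-state-table `elif` above has to special-case.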

@@ -361,6 +375,9 @@ def replace_geocode(
new_code: {'fips', 'zip', 'state_code', 'state_id', 'state_name', 'hrr', 'msa',
'hhs_region_number'}
Specifies the geocode type of the data in new_col.
date_col: str or None, default "date"
Specify which column contains the date values. Used for value aggregation.
If None, then the aggregation is done only on geo_id.
data_cols: list, default None
A list of data column names to aggregate when doing a weighted coding. If set to
None, then all the columns are used except for date_col and new_col.
@@ -389,7 +406,11 @@
# Multiply and aggregate (this automatically zeros NAs)
df[data_cols] = df[data_cols].multiply(df["weight"], axis=0)
df.drop("weight", axis=1, inplace=True)
df = df.groupby([date_col, new_col]).sum().reset_index()

if date_col is not None:
df = df.groupby([date_col, new_col]).sum().reset_index()
else:
df = df.groupby([new_col]).sum().reset_index()
return df
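As a quick sketch of the branch above, here is the weight-then-aggregate step on a toy frame (column names and values are hypothetical), showing how the date_col=None path groups on the geocode column alone:

```python
import pandas as pd

df = pd.DataFrame({
    "geo_id": ["a", "a", "b"],
    "count": [10.0, 20.0, 5.0],
    "weight": [0.5, 0.5, 1.0],
})
data_cols = ["count"]

# Multiply and aggregate (mirrors the weighted path above).
df[data_cols] = df[data_cols].multiply(df["weight"], axis=0)
df = df.drop(columns="weight")

date_col = None
if date_col is not None:
    df = df.groupby([date_col, "geo_id"]).sum().reset_index()
else:
    df = df.groupby(["geo_id"]).sum().reset_index()
print(df)  # a -> 15.0, b -> 5.0
```

Without a date column the sum collapses all rows per geo_id, which is why the docstring warns that aggregation then happens only on geo_id.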

def add_population_column(self, data, geocode_type, geocode_col=None):
34 changes: 30 additions & 4 deletions _delphi_utils_python/tests/test_geomap.py
@@ -278,11 +278,13 @@ def test_zip_to_state_id(self):
def test_add_population_column(self):
gmpr = GeoMapper()
new_data = gmpr.add_population_column(self.fips_data_3, "fips")
assert new_data["population"].sum() == 274963
assert new_data.shape == (5, 5)
new_data = gmpr.add_population_column(self.zip_data, "zip")
assert new_data["population"].sum() == 274902
Contributor

any reason not to keep both of these checks?

Also, if it's a 5x5 output it may be simpler just to do a direct df comparison on values

Contributor Author

This is an old test I don't see anymore. Pull again?

Contributor

I was just asking why it was deleted

Contributor Author

Ah. I decided to move away from tests based on data-derived population counts. I figured the tests should catch whether the underlying logic or arithmetic breaks, not whether the data file changed. I'm open to reasons for keeping it, though.

assert new_data.shape == (6, 5)
with pytest.raises(ValueError):
new_data = gmpr.add_population_column(self.zip_data, "hrr")
new_data = gmpr.add_population_column(self.fips_data_5, "fips")
assert new_data.shape == (4, 5)
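The reviewer's suggestion of a direct frame comparison could look like the sketch below (toy fips codes and population counts, not the real crosswalk; the `actual` frame stands in for a gmpr.add_population_column result):

```python
import pandas as pd

# Compare the whole frame instead of asserting on shape or a summed total:
# any change in values, dtypes, or column order then fails loudly.
expected = pd.DataFrame({
    "fips": ["01001", "01003"],
    "population": [100, 200],  # hypothetical counts
})
actual = expected.copy()  # stand-in for gmpr.add_population_column(...)
pd.testing.assert_frame_equal(actual, expected)
```

The tradeoff raised in the thread still applies: a full-frame comparison pins the test to the data file, while shape-based assertions only pin it to the logic.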

def test_add_geocode(self):
gmpr = GeoMapper()
@@ -382,13 +384,20 @@ def test_add_geocode(self):
new_data2 = gmpr.add_geocode(new_data, "state_code", "hhs_region_number")
assert new_data2["hhs_region_number"].unique().size == 2

# state_name -> state_id
new_data = gmpr.replace_geocode(self.zip_data, "zip", "state_name")
new_data2 = gmpr.add_geocode(new_data, "state_name", "state_id")
assert new_data2.shape == (4, 5)
new_data2 = gmpr.replace_geocode(new_data, "state_name", "state_id", new_col="abbr")
assert "abbr" in new_data2.columns

# fips -> nation
new_data = gmpr.replace_geocode(self.fips_data_5, "fips", "nation")
new_data = gmpr.replace_geocode(self.fips_data_5, "fips", "nation", new_col="NATION")
assert new_data.equals(
pd.DataFrame().from_dict(
{
"date": {0: pd.Timestamp("2018-01-01 00:00:00")},
"nation": {0: "us"},
"NATION": {0: "us"},
"count": {0: 10024.0},
"total": {0: 100006.0},
}
@@ -411,6 +420,23 @@
)
)

# hrr -> nation
with pytest.raises(ValueError):
new_data = gmpr.replace_geocode(self.zip_data, "zip", "hrr")
Contributor

I'm pretty sure that if the first line raises a ValueError but the second one doesn't, the test still passes, so you may want to split this into two with pytest.raises(...) statements.

Contributor Author

Wouldn't that have been caught in the zip -> hrr test some lines before that though?

Contributor

Maybe I'm understanding this test wrong. I read it as testing that both

            new_data = gmpr.replace_geocode(self.zip_data, "zip", "hrr")

and

            new_data2 = gmpr.replace_geocode(new_data, "hrr", "nation")

raise ValueErrors. Is that right?

Contributor Author

It's testing the second one, since we should not be mapping hrr -> nation (it's an incomplete mapping, so it's unsupported).

Contributor

Ohhh got it, didn't see that the second line calls new_data. Thanks.

new_data2 = gmpr.replace_geocode(new_data, "hrr", "nation")
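The split the reviewer suggests would make each expectation explicit. A sketch with a stub in place of gmpr.replace_geocode (the stub, its message, and the codes used are hypothetical):

```python
import pytest

def replace_geocode_stub(from_code, new_code):
    # Hypothetical stand-in: mapping out of hrr is incomplete, so unsupported.
    if from_code == "hrr":
        raise ValueError(f"unsupported mapping {from_code} -> {new_code}")
    return f"{from_code}->{new_code}"

# When two statements share one pytest.raises block, the block succeeds as
# soon as either raises, so the later statements never even run. One context
# manager per statement checks each expected raise independently.
data = replace_geocode_stub("zip", "state_id")  # expected not to raise
with pytest.raises(ValueError):
    replace_geocode_stub("hrr", "nation")
```

In the test above, only the hrr -> nation call is expected to raise, so keeping the non-raising call outside the block (or in its own block if it should also raise) removes the ambiguity the thread is debating.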

# fips -> hrr (dropna=True/False check)
assert not gmpr.add_geocode(self.fips_data_3, "fips", "hrr").isna().any().any()
assert gmpr.add_geocode(self.fips_data_3, "fips", "hrr", dropna=False).isna().any().any()

# fips -> hrr (date_col=None check)
new_data = gmpr.replace_geocode(self.fips_data_5.drop(columns=["date"]), "fips", "hrr", date_col=None)
assert new_data.equals(
pd.DataFrame().from_dict(
{
'hrr': {0: '1', 1: '183', 2: '184', 3: '382', 4: '7'},
'count': {0: 1.772347174163783, 1: 7157.392403522299, 2: 2863.607596477701, 3: 1.0, 4: 0.22765282583621685},
'total': {0: 3.544694348327566, 1: 71424.64801363471, 2: 28576.35198636529, 3: 1.0, 4: 0.4553056516724337}
}
)
)
1 change: 0 additions & 1 deletion cdc_covidnet/delphi_cdc_covidnet/__init__.py
Original file line number Diff line number Diff line change
@@ -10,7 +10,6 @@

from . import run
from . import api_config
from . import geo_maps
from . import update_sensor
from . import covidnet

45 changes: 0 additions & 45 deletions cdc_covidnet/delphi_cdc_covidnet/geo_maps.py

This file was deleted.

1 change: 0 additions & 1 deletion cdc_covidnet/delphi_cdc_covidnet/run.py
@@ -54,7 +54,6 @@ def run_module():
state_files,
mmwr_info,
params["export_dir"],
params["static_file_dir"],
start_date,
end_date)

24 changes: 15 additions & 9 deletions cdc_covidnet/delphi_cdc_covidnet/update_sensor.py
@@ -12,11 +12,10 @@
import numpy as np
import pandas as pd

from delphi_utils import read_params
from delphi_utils import read_params, GeoMapper
import covidcast
from .api_config import APIConfig
from .covidnet import CovidNet
from .geo_maps import GeoMaps
from .constants import SIGNALS

def write_to_csv(data: pd.DataFrame, out_name: str, output_path: str):
@@ -49,17 +48,18 @@ def write_to_csv(data: pd.DataFrame, out_name: str, output_path: str):


def update_sensor(
state_files: List[str], mmwr_info: pd.DataFrame,
output_path: str, static_path: str,
start_date: datetime, end_date: datetime) -> pd.DataFrame:
state_files: List[str],
mmwr_info: pd.DataFrame,
output_path: str,
start_date: datetime,
end_date: datetime) -> pd.DataFrame:
"""
Generate sensor values and write them to CSV format.

Args:
state_files: List of JSON files representing COVID-NET hospitalization data for each state
mmwr_info: Mappings from MMWR week to actual dates, as a pd.DataFrame
output_path: Path to write the csvs to
static_path: Path for the static geographic files
start_date: First sensor date (datetime.datetime)
end_date: Last sensor date (datetime.datetime)

@@ -85,9 +85,15 @@
]

# Set state id to two-letter abbreviation
geo_map = GeoMaps(static_path)
hosp_df = geo_map.state_name_to_abbr(hosp_df)

gmpr = GeoMapper()
hosp_df = gmpr.add_geocode(hosp_df,
from_col=APIConfig.STATE_COL,
from_code="state_name",
new_code="state_id",
dropna=False)
# To keep the original column name, reassign the original column and drop the new one
hosp_df[APIConfig.STATE_COL] = hosp_df["state_id"].str.upper()
Contributor Author

@chinandrew btw, this may be cdc_covidnet specific, but the state abbreviation in other indicators (like JHU) is assumed to be lower case. Are we sure this is what we want?

Contributor

For some reason cdc covidnet was uppercase, so I kept it consistent. If it's something we can change, I'd definitely recommend we standardize, but I haven't looked into it.

Contributor Author

@krivard thoughts?

Contributor

Slacked Katie on this while discussing a similar issue; apparently downstream ingestion will standardize everything and accepts lower- or uppercase, so this can be removed and we can just move to lowercase for the indicator code.

hosp_df.drop("state_id", axis=1, inplace=True)
assert not hosp_df.duplicated(["date", "geo_id"]).any(), "Non-unique (date, geo_id) pairs"
hosp_df.set_index(["date", "geo_id"], inplace=True)
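The merge-rename-drop pattern above can be sketched on a toy frame (the column names, crosswalk values, and the STATE_COL stand-in for APIConfig.STATE_COL are all hypothetical):

```python
import pandas as pd

STATE_COL = "state"  # assumed stand-in for APIConfig.STATE_COL

hosp_df = pd.DataFrame({STATE_COL: ["California", "New York"],
                        "value": [1, 2]})
crosswalk = pd.DataFrame({STATE_COL: ["California", "New York"],
                          "state_id": ["ca", "ny"]})

# add_geocode effectively merges in a state_id column; cdc covidnet
# historically used uppercase ids, so the original column is overwritten
# and the helper column dropped.
hosp_df = hosp_df.merge(crosswalk, on=STATE_COL, how="left")
hosp_df[STATE_COL] = hosp_df["state_id"].str.upper()
hosp_df = hosp_df.drop(columns="state_id")
print(hosp_df[STATE_COL].tolist())  # ['CA', 'NY']
```

Per the thread above, the .str.upper() step could be dropped entirely once the indicator moves to lowercase ids.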
