Merge pull request #1424 from cmu-delphi/release/indicators_v0.2.15_utils_v0.2.7

krivard · web-flow · commit b9df5a464464 · 2021-12-13T17:22:55.000-05:00
Release covidcast-indicators 0.2.15
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.2.14
+current_version = 0.2.15
 commit = True
 message = chore: bump covidcast-indicators to {new_version}
 tag = False
diff --git a/google_symptoms/delphi_google_symptoms/geo.py b/google_symptoms/delphi_google_symptoms/geo.py
@@ -23,20 +23,29 @@ def generate_transition_matrix(geo_res):
         The first is a data frame for HRR regions and the second are MSA
         regions.
     """
-    map_df = gmpr.get_crosswalk("fips", geo_res)
-    # Add population as weights
-    map_df = gmpr.add_population_column(map_df, "fips")
+    if geo_res in ["hrr", "msa"]:
+        mapping_flag = "fips"
+        map_df = gmpr.get_crosswalk("fips", geo_res)
+        # Add population as weights
+        map_df = gmpr.add_population_column(map_df, "fips")
+    else:
+        mapping_flag = "state_id"
+        map_df = gmpr.get_crosswalk("state", "state")
+        map_df = gmpr.add_geocode(map_df, "state_code", geo_res)
+        map_df = gmpr.add_population_column(map_df, "state_code")
+
     if geo_res == "hrr":
         map_df["population"] = map_df["population"] *  map_df["weight"]
-    msa_pop = map_df.groupby(geo_res).sum().reset_index()
+
+    aggregated_pop = map_df.groupby(geo_res).sum().reset_index()
     map_df = map_df.merge(
-            msa_pop, on=geo_res, how="inner", suffixes=["_raw", "_groupsum"]
+            aggregated_pop, on=geo_res, how="inner", suffixes=["_raw", "_groupsum"]
             )
     map_df["weight"] = map_df["population_raw"] / map_df["population_groupsum"]
 
     map_df = pd.pivot_table(
-                 map_df, values='weight', index=["fips"], columns=[geo_res]
-              ).fillna(0).reset_index().rename({"fips": "geo_id"}, axis = 1)
+                 map_df, values='weight', index=[mapping_flag], columns=[geo_res]
+              ).fillna(0).reset_index().rename({mapping_flag: "geo_id"}, axis = 1)
     return map_df
 
 def geo_map(df, geo_res):
@@ -49,7 +58,7 @@ def geo_map(df, geo_res):
         a data frame with columns "geo_id", "timestamp",
         and columns for signal vals
     geo_res: str
-        "msa" or "hrr"
+        "msa", "hrr", "hhs" or "nation"
 
     Returns
     -------
@@ -58,7 +67,7 @@ def geo_map(df, geo_res):
         and columns for signal vals.
         The geo_id has been converted from fips to HRRs/MSAs
     """
-    if geo_res in {"county", "state"}:
+    if geo_res == "county":
         return df
 
     map_df = generate_transition_matrix(geo_res)
diff --git a/google_symptoms/delphi_google_symptoms/run.py b/google_symptoms/delphi_google_symptoms/run.py
@@ -13,7 +13,6 @@
 from pandas import to_datetime
 from delphi_utils import (
     create_export_csv,
-    geomap,
     get_structured_logger
 )
 from delphi_utils.validator.utils import lag_converter
@@ -84,14 +83,12 @@ def run_module(params):
                        export_start_date,
                        export_end_date,
                        num_export_days)
-    gmpr = geomap.GeoMapper()
 
     for geo_res in GEO_RESOLUTIONS:
         if geo_res == "state":
             df_pull = dfs["state"]
         elif geo_res in ["hhs", "nation"]:
-            df_pull = gmpr.replace_geocode(dfs["county"], "fips", geo_res, from_col="geo_id")
-            df_pull.rename(columns={geo_res: "geo_id"}, inplace=True)
+            df_pull = geo_map(dfs["state"], geo_res)
         else:
             df_pull = geo_map(dfs["county"], geo_res)
 
diff --git a/google_symptoms/tests/test_geo.py b/google_symptoms/tests/test_geo.py
@@ -110,3 +110,88 @@ def test_msa(self):
         assert new_df[METRICS[0]].values == pytest.approx(df_plus[METRICS[0]].tolist())
         assert new_df[METRICS[1]].values == pytest.approx(df_plus[METRICS[1]].tolist())
         assert new_df[COMBINED_METRIC].values == pytest.approx(df_plus[COMBINED_METRIC].tolist())
+        
+    def test_hhs(self):  # geo_map(df, "hhs") on state-level rows must equal a hand-computed population-weighted sum per hhs region
+        gmpr = GeoMapper()
+        df = pd.DataFrame(
+            {
+                "geo_id": ["al", "fl", "tx"],  # lowercase state abbreviations; joined against "state_id" below
+                "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15"],
+                METRICS[0]: [10, 15, 2],
+                METRICS[1]: [100, 20, 45],
+                COMBINED_METRIC: [110, 35, 47],
+            }
+        )
+
+        state2hhs = gmpr.add_population_column(gmpr.get_crosswalk("state", "state"), "state_code")  # state crosswalk plus a "population" column
+        state2hhs = gmpr.add_geocode(state2hhs, "state_code", "hhs")  # attach the "hhs" code for every state_code
+        hhs_pop = state2hhs.groupby("hhs"  # totals use every state in the crosswalk, not only the three in df
+            ).sum(
+            ).reset_index(
+            ).rename(columns={"population": "hhs_pop"})
+        df_plus = df.merge(state2hhs, left_on="geo_id", right_on="state_id", how="left"  # expected values, built independently of geo_map
+            ).merge(hhs_pop, on="hhs", how="left"
+            ).assign(
+                fractional_pop = lambda x: x.population / x.hhs_pop,  # this state's share of its hhs region's population
+                metric_0 = lambda x: x.fractional_pop * x[METRICS[0]],
+                metric_1 = lambda x: x.fractional_pop * x[METRICS[1]],
+                combined_metric = lambda x: x.metric_0 + x.metric_1
+            ).groupby("hhs"  # add up the weighted contributions within each hhs region
+            ).sum(
+            ).drop(
+                labels=[METRICS[0], METRICS[1], COMBINED_METRIC],  # discard the summed raw metrics so the weighted columns can take their names
+                axis="columns"
+            ).rename(
+                columns={"metric_0": METRICS[0], "metric_1": METRICS[1], "combined_metric": COMBINED_METRIC}
+            )
+
+        new_df = geo_map(df, "hhs").dropna()  # dropna removes rows containing NaN; presumably hhs regions with no input states - TODO confirm
+
+        assert set(new_df.keys()) == set(df.keys())  # output keeps exactly the input's column set
+        assert set(new_df["geo_id"]) == set(["4", "6"])  # the three input states are expected to land in hhs regions "4" and "6"
+        assert new_df[METRICS[0]].values == pytest.approx(df_plus[METRICS[0]].tolist())  # NOTE(review): positional comparison; assumes geo_map and groupby("hhs") order regions identically - confirm
+        assert new_df[METRICS[1]].values == pytest.approx(df_plus[METRICS[1]].tolist())
+        assert new_df[COMBINED_METRIC].values == pytest.approx(df_plus[COMBINED_METRIC].tolist())
+    
+    def test_nation(self):  # geo_map(df, "nation") on state-level rows must equal a hand-computed population-weighted sum for the nation
+        gmpr = GeoMapper()
+        df = pd.DataFrame(
+            {
+                "geo_id": ["al", "il", "tx"],  # lowercase state abbreviations; joined against "state_id" below
+                "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15"],
+                METRICS[0]: [10, 15, 2],
+                METRICS[1]: [100, 20, 45],
+                COMBINED_METRIC: [110, 35, 47],
+            }
+        )
+
+        state2nation = gmpr.add_population_column(gmpr.get_crosswalk("state", "state"), "state_code")  # state crosswalk plus a "population" column
+        state2nation = gmpr.add_geocode(state2nation, "state_code", "nation")  # attach the "nation" code for every state_code
+        nation_pop = state2nation.groupby("nation"  # total uses every state in the crosswalk, not only the three in df
+            ).sum(
+            ).reset_index(
+            ).rename(columns={"population": "nation_pop"})
+        df_plus = df.merge(state2nation, left_on="geo_id", right_on="state_id", how="left"  # expected values, built independently of geo_map
+            ).merge(nation_pop, on="nation", how="left"
+            ).assign(
+                fractional_pop = lambda x: x.population / x.nation_pop,  # this state's share of the nation's population
+                metric_0 = lambda x: x.fractional_pop * x[METRICS[0]],
+                metric_1 = lambda x: x.fractional_pop * x[METRICS[1]],
+                combined_metric = lambda x: x.metric_0 + x.metric_1
+            ).groupby("nation"  # add up the weighted contributions of all input states
+            ).sum(
+            ).drop(
+                labels=[METRICS[0], METRICS[1], COMBINED_METRIC],  # discard the summed raw metrics so the weighted columns can take their names
+                axis="columns"
+            ).rename(
+                columns={"metric_0": METRICS[0], "metric_1": METRICS[1], "combined_metric": COMBINED_METRIC}
+            )
+
+        new_df = geo_map(df, "nation").dropna()  # dropna removes any rows containing NaN before comparing
+
+        assert set(new_df.keys()) == set(df.keys())  # output keeps exactly the input's column set
+        assert set(new_df["geo_id"]) == set(["us"])  # all input states are expected to collapse into the single geo_id "us"
+        assert new_df[METRICS[0]].values == pytest.approx(df_plus[METRICS[0]].tolist())
+        assert new_df[METRICS[1]].values == pytest.approx(df_plus[METRICS[1]].tolist())
+        assert new_df[COMBINED_METRIC].values == pytest.approx(df_plus[COMBINED_METRIC].tolist())
+