Commit 5931e21

Merge pull request #68 from cmu-delphi/jingjing/jhu_with_unassigned
update code for unassigned cases/deaths
2 parents 50f9e95 + cc203c6

4 files changed: +109 additions, -25 deletions


jhu/delphi_jhu/geo.py

Lines changed: 28 additions & 3 deletions
```diff
@@ -89,6 +89,10 @@
 
 FIPS_TO_STATE = {v: k.lower() for k, v in STATE_TO_FIPS.items()}
 
+
+# Fake FIPS codes for unassigned cases/deaths, mapped to megacounty FIPS
+JHU_FAKE_FIPS_TO_MEGA_FIPS = {f'900{x}' : f'{x}000' for x in STATE_TO_FIPS.values()}
+
 
 def fips_to_state(fips: str) -> str:
     """Wrapper that handles exceptions to the FIPS scheme in the JHU data.

@@ -148,7 +152,7 @@ def disburse(df: pd.DataFrame, pooled_fips: str, fips_list: list):
     return df
 
 
-def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
+def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str):
     """
     Maps a DataFrame df, which contains data at the county resolution, and
     aggregate it to the geographic resolution geo_res.

@@ -162,22 +166,38 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
         ('county', 'state', 'msa', 'hrr').
     map_df: pd.DataFrame
         Loaded from static file "fips_prop_pop.csv".
+    sensor: str
+        Sensor type. Valid options:
+        ("new_counts", "cumulative_counts",
+         "incidence", "cumulative_prop")
 
     Returns
     -------
     pd.DataFrame
         Columns: geo_id, timestamp, ...
     """
     VALID_GEO_RES = ("county", "state", "msa", "hrr")
+    # It is not clear how to calculate proportions for unassigned cases/deaths
+    PROP_SENSORS = ("incidence", "cumulative_prop")
     if geo_res not in VALID_GEO_RES:
         raise ValueError(f"geo_res must be one of {VALID_GEO_RES}")
-    df = df.copy()
+
+    df_mega = df[df['fips'].astype(int) >= 90001].copy()
+    df_mega['geo_id'] = df_mega['fips'].apply(lambda x: JHU_FAKE_FIPS_TO_MEGA_FIPS[x])
+
+    df = df[df['fips'].astype(int) < 90001].copy()
+
     if geo_res == "county":
         df["geo_id"] = df["fips"]
+        if sensor not in PROP_SENSORS:
+            df = df.append(df_mega)
     elif geo_res == "state":
         # Grab first two digits of fips
         # Map state fips to us postal code
-        df["geo_id"] = df["fips"].apply(fips_to_state)
+        df["geo_id"] = df["fips"]
+        # Add unassigned cases/deaths
+        df = df.append(df_mega)
+        df["geo_id"] = df["geo_id"].apply(fips_to_state)
     elif geo_res in ("msa", "hrr"):
         # Disburse Dukes & Nantucket to individual counties
         df = disburse(df, DN_FIPS, DN_COUNTY_FIPS)

@@ -200,8 +220,13 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
         merged["new_counts"] = merged["new_counts"] * merged["pop_prop"]
         merged["population"] = merged["population"] * merged["pop_prop"]
         df = merged.drop(["zip", "pop_prop", "hrrnum", "cbsa_id"], axis=1)
+        # if sensor not in PROP_SENSORS:
+        #     df_mega["geo_id"] = df_mega["geo_id"].apply(fips_to_state)
+        #     df = df.append(df_mega)
     df = df.drop("fips", axis=1)
     df = df.groupby(["geo_id", "timestamp"]).sum().reset_index()
+
+    # Value would be negative for megacounties, which would not be considered in the main function
     df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE
     df["cumulative_prop"] = df["cumulative_counts"] / df["population"] * INCIDENCE_BASE
     return df
```
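To make the new mapping concrete: JHU files unassigned cases/deaths under fake FIPS codes of the form 900XX, where XX is a state's two-digit FIPS prefix, and the comprehension above routes them to Delphi's XX000 megacounty codes. A minimal self-contained sketch, using a three-state stand-in for the module's full STATE_TO_FIPS table:

```python
# Illustrative stand-in; the real module defines all states and territories.
STATE_TO_FIPS = {"AL": "01", "GA": "13", "RI": "44"}

# Same comprehension as in the diff: fake FIPS 900XX -> megacounty FIPS XX000.
JHU_FAKE_FIPS_TO_MEGA_FIPS = {f'900{x}': f'{x}000' for x in STATE_TO_FIPS.values()}

assert JHU_FAKE_FIPS_TO_MEGA_FIPS == {
    "90001": "01000",  # unassigned Alabama      -> Alabama megacounty
    "90013": "13000",  # unassigned Georgia      -> Georgia megacounty
    "90044": "44000",  # unassigned Rhode Island -> Rhode Island megacounty
}
```

Megacounty codes of the form XX000 are what fips_to_state already understands, which is why the state branch can simply append df_mega before applying fips_to_state.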

jhu/delphi_jhu/pull.py

Lines changed: 6 additions & 2 deletions
```diff
@@ -62,7 +62,7 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFrame:
     MIN_FIPS = 1000
     MAX_FIPS = 57000
     EXTRA_FIPS = (
-        72, # Puerto Rico (provided as the entire state)
+        72,     # Puerto Rico (provided as the entire state)
         70002,  # Kansas City, MO
         70003,  # Dukes and Nantucket Counties, MA
     )

@@ -79,9 +79,13 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFrame:
             & (df["FIPS"] < MAX_FIPS)
         )  # "Uncategorized", etc.
         | df["FIPS"].isin(EXTRA_FIPS)
+        # Get fake FIPS codes for unassigned cases
+        | np.logical_and(df['FIPS'] >= 90001,
+                         df['FIPS'] <= 90056)
     ]
     # Merge in population LOWERCASE, consistent across confirmed and deaths
-    df = pd.merge(df, pop_df, on="FIPS")
+    # Set population to NaN for the fake FIPS codes
+    df = pd.merge(df, pop_df, on="FIPS", how='left')
 
     # Manual correction for PR
     df.loc[df["FIPS"] == 72, "FIPS"] = 72000
```

jhu/delphi_jhu/run.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -35,7 +35,7 @@
 ]
 SMOOTHERS = [
     "unsmoothed",
-    "seven_day_average",
+    #"seven_day_average",
 ]
 SENSOR_NAME_MAP = {
     "new_counts": ("incidence_num", False),

@@ -84,7 +84,7 @@ def run_module():
             print(geo_res, metric, sensor, smoother)
             df = dfs[metric]
             # Aggregate to appropriate geographic resolution
-            df = geo_map(df, geo_res, map_df)
+            df = geo_map(df, geo_res, map_df, sensor)
             df["val"] = SMOOTHERS_MAP[smoother][0](df[sensor].values)
             df["se"] = np.nan
             df["sample_size"] = np.nan
```

jhu/tests/test_geo.py

Lines changed: 73 additions & 18 deletions
```diff
@@ -25,6 +25,13 @@ def test_normal(self):
         assert fips_to_state("12003") == "fl"
         assert fips_to_state("50103") == "vt"
         assert fips_to_state("15003") == "hi"
+
+    def test_mega(self):
+
+        assert fips_to_state("01000") == "al"
+        assert fips_to_state("13000") == "ga"
+        assert fips_to_state("44000") == "ri"
+        assert fips_to_state("12000") == "fl"
 
 
 class TestDisburse:

@@ -74,15 +81,27 @@ def test_county(self):
             }
         )
 
-        new_df = geo_map(df, "county", MAP_DF)
+        df_mega = pd.DataFrame(
+            {
+                "fips": ["90013", "90001"],
+                "timestamp": ["2020-02-15", "2020-02-15"],
+                "new_counts": [8, 2],
+                "cumulative_counts": [80, 12],
+                "population": [np.nan, np.nan],
+            }
+        )
+
+        df = df.append(df_mega)
+
+        new_df = geo_map(df, "county", MAP_DF, 'new_counts')
 
         exp_incidence = df["new_counts"] / df["population"] * 100000
         exp_cprop = df["cumulative_counts"] / df["population"] * 100000
-
-        assert set(new_df["geo_id"].values) == set(df["fips"].values)
+
+        assert set(new_df["geo_id"].values) == set(['01000', '13000', '48027', '50103', '53003'])
         assert set(new_df["timestamp"].values) == set(df["timestamp"].values)
-        assert set(new_df["incidence"].values) == set(exp_incidence.values)
-        assert set(new_df["cumulative_prop"].values) == set(exp_cprop.values)
+        assert set(new_df["incidence"].values) - set(exp_incidence.values) == set([np.Inf])
+        assert set(new_df["cumulative_prop"].values) - set(exp_cprop.values) == set([np.Inf])
 
     def test_state(self):

@@ -95,19 +114,31 @@ def test_state(self):
                 "population": [100, 2100, 300, 25],
             }
         )
+
+        df_mega = pd.DataFrame(
+            {
+                "fips": ["90013", "90001", "04000", "25000"],
+                "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15", "2020-02-15"],
+                "new_counts": [8, 2, 5, 10],
+                "cumulative_counts": [80, 12, 30, 100],
+                "population": [np.nan, np.nan, np.nan, np.nan],
+            }
+        )
+
+        df = df.append(df_mega)
 
-        new_df = geo_map(df, "state", MAP_DF)
+        new_df = geo_map(df, "state", MAP_DF, 'new_counts')
 
-        exp_incidence = np.array([27, 13]) / np.array([2500, 25]) * 100000
-        exp_cprop = np.array([165, 60]) / np.array([2500, 25]) * 100000
+        exp_incidence = np.array([27 + 5, 13 + 10]) / np.array([2500, 25]) * 100000
+        exp_cprop = np.array([165 + 30, 60 + 100]) / np.array([2500, 25]) * 100000
 
-        assert (new_df["geo_id"].values == ["az", "ma"]).all()
-        assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all()
-        assert (new_df["new_counts"].values == [27, 13]).all()
-        assert (new_df["cumulative_counts"].values == [165, 60]).all()
-        assert (new_df["population"].values == [2500, 25]).all()
-        assert (new_df["incidence"].values == exp_incidence).all()
-        assert (new_df["cumulative_prop"].values == exp_cprop).all()
+        assert set(new_df["geo_id"].values) == set(["az", "ma", "al", "ga"])
+        assert set(new_df["timestamp"].values) == set(["2020-02-15"])
+        assert set(new_df["new_counts"].values) == set([32, 23, 2, 8])
+        assert set(new_df["cumulative_counts"].values) == set([195, 160, 12, 80])
+        assert set(new_df["population"].values) == set([2500, 25, 0])
+        assert set(new_df["incidence"].values) - set(exp_incidence) == set([np.Inf])
+        assert set(new_df["cumulative_prop"].values) - set(exp_cprop) == set([np.Inf])
 
     def test_hrr(self):

@@ -121,7 +152,19 @@ def test_hrr(self):
             }
         )
 
-        new_df = geo_map(df, "hrr", MAP_DF)
+        # df_mega = pd.DataFrame(
+        #     {
+        #         "fips": ["90013", "90001"],
+        #         "timestamp": ["2020-02-15", "2020-02-15"],
+        #         "new_counts": [8, 2],
+        #         "cumulative_counts": [80, 12],
+        #         "population": [np.nan, np.nan],
+        #     }
+        # )
+
+        # df = df.append(df_mega)
+
+        new_df = geo_map(df, "hrr", MAP_DF, 'new_counts')
 
         exp_incidence = np.array([13, 27]) / np.array([25, 2500]) * 100000
         exp_cprop = np.array([60, 165]) / np.array([25, 2500]) * 100000

@@ -145,8 +188,20 @@ def test_msa(self):
                 "population": [100, 2100, 300, 25],
             }
         )
-
-        new_df = geo_map(df, "msa", MAP_DF)
+
+        # df_mega = pd.DataFrame(
+        #     {
+        #         "fips": ["90013", "90001"],
+        #         "timestamp": ["2020-02-15", "2020-02-15"],
+        #         "new_counts": [8, 2],
+        #         "cumulative_counts": [80, 12],
+        #         "population": [np.nan, np.nan],
+        #     }
+        # )
+
+        # df = df.append(df_mega)
+
+        new_df = geo_map(df, "msa", MAP_DF, 'new_counts')
 
         exp_incidence = np.array([2, 13]) / np.array([300, 25]) * 100000
         exp_cprop = np.array([45, 60]) / np.array([300, 25]) * 100000
```
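The np.Inf values these tests expect are not JHU-specific; they fall out of two pandas defaults. groupby(...).sum() skips NaN, so the megacounties' NaN populations sum to 0, and dividing a positive count by zero then yields inf. A minimal reproduction with toy data:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "geo_id": ["01000", "01000"],      # megacounty rows with unknown population
    "new_counts": [2, 3],
    "population": [np.nan, np.nan],
})

summed = df.groupby("geo_id").sum()    # NaN is skipped, so population sums to 0.0
incidence = summed["new_counts"] / summed["population"] * 100000

assert summed["population"].iloc[0] == 0.0
assert np.isinf(incidence.iloc[0])     # 5 / 0 -> inf, as the asserts above expect
```

This is also why geo_map excludes the megacounty rows from the proportion-type sensors listed in PROP_SENSORS.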
