
Commit 5ff04c0

Jingjing Tang and krivard committed
Add handling of unassigned cases/deaths to jhu
Co-authored-by: krivard <[email protected]>
1 parent 6cb7310 commit 5ff04c0

File tree: 6 files changed, +230 -26 lines changed

jhu/delphi_jhu/geo.py

Lines changed: 28 additions & 3 deletions

@@ -89,6 +89,10 @@
 
 FIPS_TO_STATE = {v: k.lower() for k, v in STATE_TO_FIPS.items()}
 
+# Fake fips to States
+
+JHU_FAKE_FIPS_TO_MEGA_FIPS = {f'900{x}' : f'{x}000' for x in STATE_TO_FIPS.values()}
+
 
 def fips_to_state(fips: str) -> str:
     """Wrapper that handles exceptions to the FIPS scheme in the JHU data.

@@ -148,7 +152,7 @@ def disburse(df: pd.DataFrame, pooled_fips: str, fips_list: list):
     return df
 
 
-def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
+def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str):
     """
     Maps a DataFrame df, which contains data at the county resolution, and
     aggregate it to the geographic resolution geo_res.

@@ -162,22 +166,38 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
         ('county', 'state', 'msa', 'hrr').
     map_df: pd.DataFrame
         Loaded from static file "fips_prop_pop.csv".
+    sensor: str
+        sensor type. Valid options:
+        ("new_counts", "cumulative_counts",
+         "incidence", "cumulative_prop")
 
     Returns
     -------
     pd.DataFrame
         Columns: geo_id, timestamp, ...
     """
     VALID_GEO_RES = ("county", "state", "msa", "hrr")
+    # It is not clear how to calculate the proportion for unassigned cases/deaths
+    PROP_SENSORS = ("incidence", "cumulative_prop")
     if geo_res not in VALID_GEO_RES:
         raise ValueError(f"geo_res must be one of {VALID_GEO_RES}")
-    df = df.copy()
+
+    df_mega = df[df['fips'].astype(int) >= 90001].copy()
+    df_mega['geo_id'] = df_mega['fips'].apply(lambda x: JHU_FAKE_FIPS_TO_MEGA_FIPS[x])
+
+    df = df[df['fips'].astype(int) < 90001].copy()
+
     if geo_res == "county":
         df["geo_id"] = df["fips"]
+        if sensor not in PROP_SENSORS:
+            df = df.append(df_mega)
     elif geo_res == "state":
         # Grab first two digits of fips
         # Map state fips to us postal code
-        df["geo_id"] = df["fips"].apply(fips_to_state)
+        df["geo_id"] = df["fips"]
+        # Add unassigned cases/deaths
+        df = df.append(df_mega)
+        df["geo_id"] = df["geo_id"].apply(fips_to_state)
     elif geo_res in ("msa", "hrr"):
         # Disburse Dukes & Nantucket to individual counties
         df = disburse(df, DN_FIPS, DN_COUNTY_FIPS)

@@ -200,8 +220,13 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
         merged["new_counts"] = merged["new_counts"] * merged["pop_prop"]
         merged["population"] = merged["population"] * merged["pop_prop"]
         df = merged.drop(["zip", "pop_prop", "hrrnum", "cbsa_id"], axis=1)
+        # if sensor not in PROP_SENSORS:
+        #     df_mega["geo_id"] = df_mega["geo_id"].apply(fips_to_state)
+        #     df = df.append(df_mega)
     df = df.drop("fips", axis=1)
     df = df.groupby(["geo_id", "timestamp"]).sum().reset_index()
+
+    # Value would be negative for megacounties, which would not be considered in the main function
     df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE
     df["cumulative_prop"] = df["cumulative_counts"] / df["population"] * INCIDENCE_BASE
     return df
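The pseudo-FIPS scheme above works as follows: JHU files unassigned cases/deaths under codes of the form 900XX, where XX is the state FIPS, and JHU_FAKE_FIPS_TO_MEGA_FIPS reroutes each such code to the corresponding XX000 megacounty. A minimal sketch of the mapping, using a hypothetical three-state subset in place of the module's full STATE_TO_FIPS table:

# Toy subset standing in for the module's STATE_TO_FIPS (assumption: real table covers all states).
STATE_TO_FIPS = {"AL": "01", "GA": "13", "RI": "44"}

# Same comprehension as in the diff: fake FIPS 900XX -> megacounty FIPS XX000.
JHU_FAKE_FIPS_TO_MEGA_FIPS = {f'900{x}': f'{x}000' for x in STATE_TO_FIPS.values()}

print(JHU_FAKE_FIPS_TO_MEGA_FIPS)
# {'90001': '01000', '90013': '13000', '90044': '44000'}

Per the PROP_SENSORS gate, megacounty rows are withheld for the proportion sensors ("incidence", "cumulative_prop"), since unassigned counts have no population denominator.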

jhu/delphi_jhu/pull.py

Lines changed: 6 additions & 2 deletions

@@ -62,7 +62,7 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFr
     MIN_FIPS = 1000
     MAX_FIPS = 57000
     EXTRA_FIPS = (
-        72, # Puerto Rico (provided as the entire state)
+        72,    # Puerto Rico (provided as the entire state)
         70002, # Kansas City, MO
         70003, # Dukes and Nantucket Counties, MA
     )

@@ -79,9 +79,13 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFr
             & (df["FIPS"] < MAX_FIPS)
         ) # "Uncategorized", etc.
         | df["FIPS"].isin(EXTRA_FIPS)
+        # Get Fake FIPS for unassigned cases
+        | np.logical_and(df['FIPS'] >= 90001,
+                         df['FIPS'] <= 90056)
     ]
     # Merge in population LOWERCASE, consistent across confirmed and deaths
-    df = pd.merge(df, pop_df, on="FIPS")
+    # Set population as NAN for fake fips
+    df = pd.merge(df, pop_df, on="FIPS", how='left')
 
     # Manual correction for PR
     df.loc[df["FIPS"] == 72, "FIPS"] = 72000
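The switch from an inner merge to how='left' is what keeps the fake-FIPS rows alive: they have no entry in pop_df, and an inner merge would silently drop them. A small self-contained sketch (toy frames and numbers, not the indicator's real population data):

import pandas as pd

df = pd.DataFrame({"FIPS": [1001, 90001], "new_counts": [5, 2]})  # 90001 = fake FIPS
pop_df = pd.DataFrame({"FIPS": [1001], "population": [55869]})    # no row for 90001

inner = pd.merge(df, pop_df, on="FIPS")             # old behavior: fake-FIPS row dropped
left = pd.merge(df, pop_df, on="FIPS", how="left")  # new behavior: kept, population NaN
print(len(inner), len(left))          # 1 2
print(left["population"].tolist())    # [55869.0, nan]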

jhu/delphi_jhu/run.py

Lines changed: 1 addition & 1 deletion

@@ -77,7 +77,7 @@ def run_module():
         print(geo_res, metric, sensor, smoother)
         df = dfs[metric]
         # Aggregate to appropriate geographic resolution
-        df = geo_map(df, geo_res, map_df)
+        df = geo_map(df, geo_res, map_df, sensor)
         df["val"] = SMOOTHERS_MAP[smoother][0](df[sensor].values)
         df["se"] = np.nan
         df["sample_size"] = np.nan

jhu/tests/receiving/.gitignore

Lines changed: 120 additions & 0 deletions

@@ -0,0 +1,120 @@
+# You should hard commit a prototype for this file, but we
+# want to avoid accidental adding of API tokens and other
+# private data parameters
+params.json
+
+# Do not commit output files
+receiving/*.csv
+
+# Remove macOS files
+.DS_Store
+
+# virtual environment
+dview/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+coverage.xml
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+.static_storage/
+.media/
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/

jhu/tests/test_geo.py

Lines changed: 74 additions & 19 deletions

@@ -25,6 +25,13 @@ def test_normal(self):
         assert fips_to_state("12003") == "fl"
         assert fips_to_state("50103") == "vt"
         assert fips_to_state("15003") == "hi"
+
+    def test_mega(self):
+
+        assert fips_to_state("01000") == "al"
+        assert fips_to_state("13000") == "ga"
+        assert fips_to_state("44000") == "ri"
+        assert fips_to_state("12000") == "fl"
 
 
 class TestDisburse:

@@ -60,7 +67,7 @@ def test_incorrect_geo(self):
         )
 
         with pytest.raises(ValueError):
-            geo_map(df, "département", MAP_DF)
+            geo_map(df, "département", MAP_DF, 'new_counts')
 
     def test_county(self):
 

@@ -74,15 +81,27 @@ def test_county(self):
             }
         )
 
-        new_df = geo_map(df, "county", MAP_DF)
+        df_mega = pd.DataFrame(
+            {
+                "fips": ["90013", "90001"],
+                "timestamp": ["2020-02-15", "2020-02-15"],
+                "new_counts": [8, 2],
+                "cumulative_counts": [80, 12],
+                "population": [np.nan, np.nan],
+            }
+        )
+
+        df = df.append(df_mega)
+
+        new_df = geo_map(df, "county", MAP_DF, 'new_counts')
 
         exp_incidence = df["new_counts"] / df["population"] * 100000
         exp_cprop = df["cumulative_counts"] / df["population"] * 100000
-
-        assert set(new_df["geo_id"].values) == set(df["fips"].values)
+
+        assert set(new_df["geo_id"].values) == set(['01000', '13000', '48027', '50103', '53003'])
         assert set(new_df["timestamp"].values) == set(df["timestamp"].values)
-        assert set(new_df["incidence"].values) == set(exp_incidence.values)
-        assert set(new_df["cumulative_prop"].values) == set(exp_cprop.values)
+        assert set(new_df["incidence"].values) - set(exp_incidence.values) == set([np.Inf])
+        assert set(new_df["cumulative_prop"].values) - set(exp_cprop.values) == set([np.Inf])
 
     def test_state(self):
 

@@ -95,19 +114,31 @@ def test_state(self):
                 "population": [100, 2100, 300, 25],
             }
         )
+
+        df_mega = pd.DataFrame(
+            {
+                "fips": ["90013", "90001", "04000", "25000"],
+                "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15", "2020-02-15"],
+                "new_counts": [8, 2, 5, 10],
+                "cumulative_counts": [80, 12, 30, 100],
+                "population": [np.nan, np.nan, np.nan, np.nan],
+            }
+        )
+
+        df = df.append(df_mega)
 
-        new_df = geo_map(df, "state", MAP_DF)
+        new_df = geo_map(df, "state", MAP_DF, 'new_counts')
 
-        exp_incidence = np.array([27, 13]) / np.array([2500, 25]) * 100000
-        exp_cprop = np.array([165, 60]) / np.array([2500, 25]) * 100000
+        exp_incidence = np.array([27 + 5, 13 + 10]) / np.array([2500, 25]) * 100000
+        exp_cprop = np.array([165 + 30, 60 + 100]) / np.array([2500, 25]) * 100000
 
-        assert (new_df["geo_id"].values == ["az", "ma"]).all()
-        assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all()
-        assert (new_df["new_counts"].values == [27, 13]).all()
-        assert (new_df["cumulative_counts"].values == [165, 60]).all()
-        assert (new_df["population"].values == [2500, 25]).all()
-        assert (new_df["incidence"].values == exp_incidence).all()
-        assert (new_df["cumulative_prop"].values == exp_cprop).all()
+        assert set(new_df["geo_id"].values) == set(["az", "ma", "al", "ga"])
+        assert set(new_df["timestamp"].values) == set(["2020-02-15"])
+        assert set(new_df["new_counts"].values) == set([32, 23, 2, 8])
+        assert set(new_df["cumulative_counts"].values) == set([195, 160, 12, 80])
+        assert set(new_df["population"].values) == set([2500, 25, 0])
+        assert set(new_df["incidence"].values) - set(exp_incidence) == set([np.Inf])
+        assert set(new_df["cumulative_prop"].values) - set(exp_cprop) == set([np.Inf])
 
     def test_hrr(self):
 

@@ -121,7 +152,19 @@ def test_hrr(self):
             }
         )
 
-        new_df = geo_map(df, "hrr", MAP_DF)
+        # df_mega = pd.DataFrame(
+        #     {
+        #         "fips": ["90013", "90001"],
+        #         "timestamp": ["2020-02-15", "2020-02-15"],
+        #         "new_counts": [8, 2],
+        #         "cumulative_counts": [80, 12],
+        #         "population": [np.nan, np.nan],
+        #     }
+        # )
+
+        # df = df.append(df_mega)
+
+        new_df = geo_map(df, "hrr", MAP_DF, 'new_counts')
 
         exp_incidence = np.array([13, 27]) / np.array([25, 2500]) * 100000
         exp_cprop = np.array([60, 165]) / np.array([25, 2500]) * 100000

@@ -145,8 +188,20 @@ def test_msa(self):
                 "population": [100, 2100, 300, 25],
             }
         )
-
-        new_df = geo_map(df, "msa", MAP_DF)
+
+        # df_mega = pd.DataFrame(
+        #     {
+        #         "fips": ["90013", "90001"],
+        #         "timestamp": ["2020-02-15", "2020-02-15"],
+        #         "new_counts": [8, 2],
+        #         "cumulative_counts": [80, 12],
+        #         "population": [np.nan, np.nan],
+        #     }
+        # )
+
+        # df = df.append(df_mega)
+
+        new_df = geo_map(df, "msa", MAP_DF, 'new_counts')
 
         exp_incidence = np.array([2, 13]) / np.array([300, 25]) * 100000
         exp_cprop = np.array([45, 60]) / np.array([300, 25]) * 100000
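The set-difference assertions against set([np.Inf]) follow from how geo_map aggregates: the appended megacounty rows carry population NaN, pandas' groupby(...).sum() skips NaN and sums to 0.0, and dividing a positive count by that zero yields inf (this is also why the state test expects 0 among the populations). A sketch of the mechanism, with a toy frame in place of the test fixtures:

import numpy as np
import pandas as pd

df = pd.DataFrame({"geo_id": ["01000", "01000"],
                   "new_counts": [8.0, 2.0],
                   "population": [np.nan, np.nan]})

agg = df.groupby("geo_id").sum().reset_index()
print(agg["population"].iloc[0])                        # 0.0 (NaN skipped in the sum)
print((agg["new_counts"] / agg["population"]).iloc[0])  # inf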

jhu/tests/test_smooth.py

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@ def test_output_files_smoothed(self, run_as_module):
 
         smoothed = pd.read_csv(
             join("receiving",
-                 f"{dates[-1]}_state_confirmed_7dav_cumulative_num.csv")
+                 f"{dates[-1]}_state_wip_confirmed_7dav_cumul_num.csv")
         )
 
         raw = pd.concat([
