From 44febee95e8599c817ec88135dbeb57a46ba17a0 Mon Sep 17 00:00:00 2001
From: Jingjing Tang
Date: Mon, 8 Jun 2020 15:33:59 -0400
Subject: [PATCH 01/18] update code for unassigned cases/deaths

---
 jhu/delphi_jhu/geo.py  | 81 ++++++++++++++++++++++++++++++++++++++++--
 jhu/delphi_jhu/pull.py |  8 +++--
 jhu/delphi_jhu/run.py  |  2 +-
 3 files changed, 85 insertions(+), 6 deletions(-)

diff --git a/jhu/delphi_jhu/geo.py b/jhu/delphi_jhu/geo.py
index 4e305917b..84629ec09 100644
--- a/jhu/delphi_jhu/geo.py
+++ b/jhu/delphi_jhu/geo.py
@@ -89,6 +89,62 @@
 FIPS_TO_STATE = {v: k.lower() for k, v in STATE_TO_FIPS.items()}
 
+# Fake fips to States
+
+FAKE_FIPS_TO_STATES = {
+    "90001":"al",
+    "90002":"ak",
+    "90004":"az",
+    "90005":"ar",
+    "90006":"ca",
+    "90008":"co",
+    "90009":"ct",
+    "90010":"de",
+    "90011":"dc",
+    "90012":"fl",
+    "90013":"ga",
+    "90015":"hi",
+    "90016":"id",
+    "90017":"il",
+    "90018":"in",
+    "90019":"ia",
+    "90020":"ks",
+    "90021":"ky",
+    "90022":"la",
+    "90023":"me",
+    "90024":"md",
+    "90025":"ma",
+    "90026":"mi",
+    "90027":"mn",
+    "90028":"ms",
+    "90029":"mo",
+    "90030":"mt",
+    "90031":"ne",
+    "90032":"nv",
+    "90033":"nh",
+    "90034":"nj",
+    "90035":"nm",
+    "90036":"ny",
+    "90037":"nc",
+    "90038":"nd",
+    "90039":"oh",
+    "90040":"ok",
+    "90041":"or",
+    "90042":"pa",
+    "90044":"ri",
+    "90045":"sc",
+    "90046":"sd",
+    "90047":"tn",
+    "90048":"tx",
+    "90049":"ut",
+    "90050":"vt",
+    "90051":"va",
+    "90053":"wa",
+    "90054":"wv",
+    "90055":"wi",
+    "90056":"wy"
+}
+
 def fips_to_state(fips: str) -> str:
     """Wrapper that handles exceptions to the FIPS scheme in the JHU data.
 
@@ -118,7 +174,11 @@ def fips_to_state(fips: str) -> str:
         return FIPS_TO_STATE["25"]  # Dukes & Nantucket -> Massachusetts
     if fips == "70003":
         return FIPS_TO_STATE["29"]  # Kansas City -> Missouri
-    return FIPS_TO_STATE[fips[:2]]
+    # Fake fips -> states
+    if fips[:2] == '90':
+        return FAKE_FIPS_TO_STATES[fips]
+    else:
+        return FIPS_TO_STATE[fips[:2]]
 
 
 def disburse(df: pd.DataFrame, pooled_fips: str, fips_list: list):
@@ -148,7 +208,7 @@ def disburse(df: pd.DataFrame, pooled_fips: str, fips_list: list):
     return df
 
 
-def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
+def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str):
     """
     Maps a DataFrame df, which contains data at the county resolution, and
     aggregate it to the geographic resolution geo_res.
@@ -169,15 +229,26 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
         Columns: geo_id, timestamp, ...
""" VALID_GEO_RES = ("county", "state", "msa", "hrr") + #It is not clear to calculate the proportion for unassigned cases/deaths + PROP_SENSORS = ("incidence", "cumulative_prop") if geo_res not in VALID_GEO_RES: raise ValueError(f"geo_res must be one of {VALID_GEO_RES}") - df = df.copy() + + df_mega = df[df['fips'].astype(int) >= 90001].copy() + df_mega['geo_id'] = df_mega['fips'].apply(fips_to_state) + + df = df[df['fips'].astype(int) < 90001].copy() + if geo_res == "county": df["geo_id"] = df["fips"] + if sensor not in PROP_SENSORS: + df = df.append(df_mega) elif geo_res == "state": # Grab first two digits of fips # Map state fips to us postal code df["geo_id"] = df["fips"].apply(fips_to_state) + # Add unassigned cases/deaths + df = df.append(df_mega) elif geo_res in ("msa", "hrr"): # Disburse Dukes & Nantucket to individual counties df = disburse(df, DN_FIPS, DN_COUNTY_FIPS) @@ -200,8 +271,12 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame): merged["new_counts"] = merged["new_counts"] * merged["pop_prop"] merged["population"] = merged["population"] * merged["pop_prop"] df = merged.drop(["zip", "pop_prop", "hrrnum", "cbsa_id"], axis=1) + if sensor not in PROP_SENSORS: + df = df.append(df_mega) df = df.drop("fips", axis=1) df = df.groupby(["geo_id", "timestamp"]).sum().reset_index() + + # Value would be negative for megacounties , which would not be considered in the main function df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE df["cumulative_prop"] = df["cumulative_counts"] / df["population"] * INCIDENCE_BASE return df diff --git a/jhu/delphi_jhu/pull.py b/jhu/delphi_jhu/pull.py index 1049d5e0a..a60487ae8 100644 --- a/jhu/delphi_jhu/pull.py +++ b/jhu/delphi_jhu/pull.py @@ -62,7 +62,7 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFr MIN_FIPS = 1000 MAX_FIPS = 57000 EXTRA_FIPS = ( - 72, # Puerto Rico (provided as the entire state) + 72, # Puerto Rico (provided as the entire state) 70002, # Kansas City, MO 70003, # Dukes and Nantucket Counties, MA ) @@ -79,9 +79,13 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFr & (df["FIPS"] < MAX_FIPS) ) # "Uncategorized", etc. 
| df["FIPS"].isin(EXTRA_FIPS) + # Get Fake FIPS for unassigned cases + | np.logical_and(df['FIPS'] >= 90001, + df['FIPS'] <= 90056) ] # Merge in population LOWERCASE, consistent across confirmed and deaths - df = pd.merge(df, pop_df, on="FIPS") + # set population as -1 for fake fips + df = pd.merge(df, pop_df, on="FIPS", how = 'left').fillna(-1) # Manual correction for PR df.loc[df["FIPS"] == 72, "FIPS"] = 72000 diff --git a/jhu/delphi_jhu/run.py b/jhu/delphi_jhu/run.py index cfb738f21..a21c5bb54 100644 --- a/jhu/delphi_jhu/run.py +++ b/jhu/delphi_jhu/run.py @@ -77,7 +77,7 @@ def run_module(): print(geo_res, metric, sensor, smoother) df = dfs[metric] # Aggregate to appropriate geographic resolution - df = geo_map(df, geo_res, map_df) + df = geo_map(df, geo_res, map_df, sensor) df["val"] = SMOOTHERS_MAP[smoother][0](df[sensor].values) df["se"] = np.nan df["sample_size"] = np.nan From 849a32ec63ca9eee067104c664d4d5bc7bd92a01 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Mon, 8 Jun 2020 18:41:14 -0400 Subject: [PATCH 02/18] update the dict for fake fips --- jhu/delphi_jhu/geo.py | 110 ++++++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 52 deletions(-) diff --git a/jhu/delphi_jhu/geo.py b/jhu/delphi_jhu/geo.py index 84629ec09..0ad797692 100644 --- a/jhu/delphi_jhu/geo.py +++ b/jhu/delphi_jhu/geo.py @@ -91,60 +91,62 @@ # Fake fips to States -FAKE_FIPS_TO_STATES = { - "90001":"al", - "90002":"ak", - "90004":"az", - "90005":"ar", - "90006":"ca", - "90008":"co", - "90009":"ct", - "90010":"de", - "90011":"dc", - "90012":"fl", - "90013":"ga", - "90015":"hi", - "90016":"id", - "90017":"il", - "90018":"in", - "90019":"ia", - "90020":"ks", - "90021":"ky", - "90022":"la", - "90023":"me", - "90024":"md", - "90025":"ma", - "90026":"mi", - "90027":"mn", - "90028":"ms", - "90029":"mo", - "90030":"mt", - "90031":"ne", - "90032":"nv", - "90033":"nh", - "90034":"nj", - "90035":"nm", - "90036":"ny", - "90037":"nc", - "90038":"nd", - "90039":"oh", - "90040":"ok", - "90041":"or", - "90042":"pa", - "90044":"ri", - "90045":"sc", - "90046":"sd", - "90047":"tn", - "90048":"tx", - "90049":"ut", - "90050":"vt", - "90051":"va", - "90053":"wa", - "90054":"wv", - "90055":"wi", - "90056":"wy" +STATES_TO_JHU_FIPS_FOR_UNASSIGNED = { + "AL":"01", + "AK":"02", + "AZ":"04", + "AR":"05", + "CA":"06", + "CO":"08", + "CT":"09", + "DE":"10", + "DC":"11", + "FL":"12", + "GA":"13", + "HI":"15", + "ID":"16", + "IL":"17", + "IN":"18", + "IA":"19", + "KS":"20", + "KY":"21", + "LA":"22", + "ME":"23", + "MD":"24", + "MA":"25", + "MI":"26", + "MN":"27", + "MS":"28", + "MO":"29", + "MT":"30", + "NE":"31", + "NV":"32", + "NH":"33", + "NJ":"34", + "NM":"35", + "NY":"36", + "NC":"37", + "ND":"38", + "OH":"39", + "OK":"40", + "OR":"41", + "PA":"42", + "RI":"44", + "SC":"45", + "SD":"46", + "TN":"47", + "TX":"48", + "UT":"49", + "VT":"50", + "VA":"51", + "WA":"53", + "WV":"54", + "WI":"55", + "WY":"56" } +FAKE_FIPS_TO_STATES = {f'900{v}' : k.lower() for k, v in STATES_TO_JHU_FIPS_FOR_UNASSIGNED.items()} + def fips_to_state(fips: str) -> str: """Wrapper that handles exceptions to the FIPS scheme in the JHU data. @@ -222,6 +224,10 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str): ('county', 'state', 'msa', 'hrr'). map_df: pd.DataFrame Loaded from static file "fips_prop_pop.csv". + sensor: str + sensor type. 
Valid options: + ("new_counts", "cumulative_counts", + "incidence", "cumulative_prop") Returns ------- From f1f5cddef7bf210a618499b88b14ad8e45a85311 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Mon, 8 Jun 2020 18:47:34 -0400 Subject: [PATCH 03/18] Set population for fake fips as NAN --- jhu/delphi_jhu/pull.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jhu/delphi_jhu/pull.py b/jhu/delphi_jhu/pull.py index a60487ae8..3be512559 100644 --- a/jhu/delphi_jhu/pull.py +++ b/jhu/delphi_jhu/pull.py @@ -84,8 +84,8 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFr df['FIPS'] <= 90056) ] # Merge in population LOWERCASE, consistent across confirmed and deaths - # set population as -1 for fake fips - df = pd.merge(df, pop_df, on="FIPS", how = 'left').fillna(-1) + # Set population as NAN for fake fips + df = pd.merge(df, pop_df, on="FIPS", how = 'left') # Manual correction for PR df.loc[df["FIPS"] == 72, "FIPS"] = 72000 From 84bbfc6ec10be9401f8bb36c8c7318a85c27c7b2 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Mon, 8 Jun 2020 19:24:28 -0400 Subject: [PATCH 04/18] update geo_id for megacounty --- jhu/delphi_jhu/geo.py | 67 ++++--------------------------------------- 1 file changed, 5 insertions(+), 62 deletions(-) diff --git a/jhu/delphi_jhu/geo.py b/jhu/delphi_jhu/geo.py index 0ad797692..dba93c371 100644 --- a/jhu/delphi_jhu/geo.py +++ b/jhu/delphi_jhu/geo.py @@ -91,61 +91,7 @@ # Fake fips to States -STATES_TO_JHU_FIPS_FOR_UNASSIGNED = { - "AL":"01", - "AK":"02", - "AZ":"04", - "AR":"05", - "CA":"06", - "CO":"08", - "CT":"09", - "DE":"10", - "DC":"11", - "FL":"12", - "GA":"13", - "HI":"15", - "ID":"16", - "IL":"17", - "IN":"18", - "IA":"19", - "KS":"20", - "KY":"21", - "LA":"22", - "ME":"23", - "MD":"24", - "MA":"25", - "MI":"26", - "MN":"27", - "MS":"28", - "MO":"29", - "MT":"30", - "NE":"31", - "NV":"32", - "NH":"33", - "NJ":"34", - "NM":"35", - "NY":"36", - "NC":"37", - "ND":"38", - "OH":"39", - "OK":"40", - "OR":"41", - "PA":"42", - "RI":"44", - "SC":"45", - "SD":"46", - "TN":"47", - "TX":"48", - "UT":"49", - "VT":"50", - "VA":"51", - "WA":"53", - "WV":"54", - "WI":"55", - "WY":"56" -} - -FAKE_FIPS_TO_STATES = {f'900{v}' : k.lower() for k, v in STATES_TO_JHU_FIPS_FOR_UNASSIGNED.items()} +JHU_FAKE_FIPS_TO_MEGA_FIPS = {f'900{x}' : f'{x}000' for x in STATE_TO_FIPS.values()} def fips_to_state(fips: str) -> str: @@ -176,11 +122,7 @@ def fips_to_state(fips: str) -> str: return FIPS_TO_STATE["25"] # Dukes & Nantucket -> Massachusetts if fips == "70003": return FIPS_TO_STATE["29"] # Kansas City -> Missouri - # Fake fips -> states - if fips[:2] == '90': - return FAKE_FIPS_TO_STATES[fips] - else: - return FIPS_TO_STATE[fips[:2]] + return FIPS_TO_STATE[fips[:2]] def disburse(df: pd.DataFrame, pooled_fips: str, fips_list: list): @@ -241,7 +183,7 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str): raise ValueError(f"geo_res must be one of {VALID_GEO_RES}") df_mega = df[df['fips'].astype(int) >= 90001].copy() - df_mega['geo_id'] = df_mega['fips'].apply(fips_to_state) + df_mega['geo_id'] = df_mega['fips'].apply(lambda x: JHU_FAKE_FIPS_TO_MEGA_FIPS[x]) df = df[df['fips'].astype(int) < 90001].copy() @@ -252,9 +194,10 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str): elif geo_res == "state": # Grab first two digits of fips # Map state fips to us postal code - df["geo_id"] = df["fips"].apply(fips_to_state) + df["geo_id"] = df["fips"] # Add unassigned cases/deaths df = df.append(df_mega) 
+ df["geo_id"] = df["geo_id"].apply(fips_to_state) elif geo_res in ("msa", "hrr"): # Disburse Dukes & Nantucket to individual counties df = disburse(df, DN_FIPS, DN_COUNTY_FIPS) From 030d38ad172c1a412261c6b153246459313c451c Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Wed, 10 Jun 2020 00:08:42 -0400 Subject: [PATCH 05/18] update naming for megacounty --- jhu/delphi_jhu/geo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/jhu/delphi_jhu/geo.py b/jhu/delphi_jhu/geo.py index dba93c371..22d7a65cb 100644 --- a/jhu/delphi_jhu/geo.py +++ b/jhu/delphi_jhu/geo.py @@ -221,6 +221,7 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str): merged["population"] = merged["population"] * merged["pop_prop"] df = merged.drop(["zip", "pop_prop", "hrrnum", "cbsa_id"], axis=1) if sensor not in PROP_SENSORS: + df_mega["geo_id"] = df_mega["geo_id"].apply(fips_to_state) df = df.append(df_mega) df = df.drop("fips", axis=1) df = df.groupby(["geo_id", "timestamp"]).sum().reset_index() From f66b43e9726f083ee94260d2d019186d0aac056d Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Wed, 10 Jun 2020 00:09:17 -0400 Subject: [PATCH 06/18] modify test cases for megacounty aggregation --- jhu/tests/test_geo.py | 121 +++++++++++++++++++++++++++++++----------- 1 file changed, 90 insertions(+), 31 deletions(-) diff --git a/jhu/tests/test_geo.py b/jhu/tests/test_geo.py index 8d45eb336..7dd74c0c4 100644 --- a/jhu/tests/test_geo.py +++ b/jhu/tests/test_geo.py @@ -25,6 +25,13 @@ def test_normal(self): assert fips_to_state("12003") == "fl" assert fips_to_state("50103") == "vt" assert fips_to_state("15003") == "hi" + + def test_mega(self): + + assert fips_to_state("01000") == "al" + assert fips_to_state("13000") == "ga" + assert fips_to_state("44000") == "ri" + assert fips_to_state("12000") == "fl" class TestDisburse: @@ -74,15 +81,27 @@ def test_county(self): } ) - new_df = geo_map(df, "county", MAP_DF) + df_mega = pd.DataFrame( + { + "fips": ["90013", "90001"], + "timestamp": ["2020-02-15", "2020-02-15"], + "new_counts": [8, 2], + "cumulative_counts": [80, 12], + "population": [np.nan, np.nan], + } + ) + + df = df.append(df_mega) + + new_df = geo_map(df, "county", MAP_DF, 'new_counts') exp_incidence = df["new_counts"] / df["population"] * 100000 exp_cprop = df["cumulative_counts"] / df["population"] * 100000 - - assert set(new_df["geo_id"].values) == set(df["fips"].values) + + assert set(new_df["geo_id"].values) == set(['01000', '13000', '48027', '50103', '53003']) assert set(new_df["timestamp"].values) == set(df["timestamp"].values) - assert set(new_df["incidence"].values) == set(exp_incidence.values) - assert set(new_df["cumulative_prop"].values) == set(exp_cprop.values) + assert set(new_df["incidence"].values) - set(exp_incidence.values) == set([np.Inf]) + assert set(new_df["cumulative_prop"].values) - set(exp_cprop.values) == set([np.Inf]) def test_state(self): @@ -95,19 +114,31 @@ def test_state(self): "population": [100, 2100, 300, 25], } ) + + df_mega = pd.DataFrame( + { + "fips": ["90013", "90001", "04000", "25000"], + "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15", "2020-02-15"], + "new_counts": [8, 2, 5, 10], + "cumulative_counts": [80, 12, 30, 100], + "population": [np.nan, np.nan, np.nan, np.nan], + } + ) + + df = df.append(df_mega) - new_df = geo_map(df, "state", MAP_DF) + new_df = geo_map(df, "state", MAP_DF, 'new_counts') - exp_incidence = np.array([27, 13]) / np.array([2500, 25]) * 100000 - exp_cprop = np.array([165, 60]) / np.array([2500, 25]) * 100000 + exp_incidence = 
np.array([27 + 5, 13 + 10]) / np.array([2500, 25]) * 100000 + exp_cprop = np.array([165 + 30, 60 + 100]) / np.array([2500, 25]) * 100000 - assert (new_df["geo_id"].values == ["az", "ma"]).all() - assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all() - assert (new_df["new_counts"].values == [27, 13]).all() - assert (new_df["cumulative_counts"].values == [165, 60]).all() - assert (new_df["population"].values == [2500, 25]).all() - assert (new_df["incidence"].values == exp_incidence).all() - assert (new_df["cumulative_prop"].values == exp_cprop).all() + assert set(new_df["geo_id"].values) == set(["az", "ma", "al", "ga"]) + assert set(new_df["timestamp"].values) == set(["2020-02-15"]) + assert set(new_df["new_counts"].values) == set([32, 23, 2, 8]) + assert set(new_df["cumulative_counts"].values) == set([195, 160, 80, 12]) + assert set(new_df["population"].values) == set([2500, 25, 0]) + assert set(new_df["incidence"].values) - set(exp_incidence) == set([np.Inf]) + assert set(new_df["cumulative_prop"].values) - set(exp_cprop) == set([np.Inf]) def test_hrr(self): @@ -121,18 +152,32 @@ def test_hrr(self): } ) - new_df = geo_map(df, "hrr", MAP_DF) + df_mega = pd.DataFrame( + { + "fips": ["90013", "90001"], + "timestamp": ["2020-02-15", "2020-02-15"], + "new_counts": [8, 2], + "cumulative_counts": [80, 12], + "population": [np.nan, np.nan], + } + ) + + df = df.append(df_mega) + + new_df = geo_map(df, "hrr", MAP_DF, 'new_counts') exp_incidence = np.array([13, 27]) / np.array([25, 2500]) * 100000 exp_cprop = np.array([60, 165]) / np.array([25, 2500]) * 100000 - assert (new_df["geo_id"].values == [110, 147]).all() - assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all() - assert new_df["new_counts"].values == pytest.approx([13.0, 27.0]) - assert new_df["cumulative_counts"].values == pytest.approx([60, 165]) - assert new_df["population"].values == pytest.approx([25, 2500]) - assert new_df["incidence"].values == pytest.approx(exp_incidence) - assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop) + assert set(new_df["geo_id"].values) == set([110, 147, "al", "ga"]) + assert set(new_df["timestamp"].values) == set(["2020-02-15"]) + assert new_df["new_counts"].values == pytest.approx([13.0, 27.0, 2, 8]) + assert new_df["cumulative_counts"].values == pytest.approx([60, 165, 12, 80]) + assert new_df["population"].values == pytest.approx([25, 2500, 0, 0]) + assert new_df["incidence"].values == pytest.approx(np.append(exp_incidence, + [np.Inf, np.Inf])) + assert new_df["cumulative_prop"].values == pytest.approx(np.append(exp_cprop, + [np.Inf, np.Inf])) def test_msa(self): @@ -145,16 +190,30 @@ def test_msa(self): "population": [100, 2100, 300, 25], } ) + + df_mega = pd.DataFrame( + { + "fips": ["90013", "90001"], + "timestamp": ["2020-02-15", "2020-02-15"], + "new_counts": [8, 2], + "cumulative_counts": [80, 12], + "population": [np.nan, np.nan], + } + ) + + df = df.append(df_mega) - new_df = geo_map(df, "msa", MAP_DF) + new_df = geo_map(df, "msa", MAP_DF, 'new_counts') exp_incidence = np.array([2, 13]) / np.array([300, 25]) * 100000 exp_cprop = np.array([45, 60]) / np.array([300, 25]) * 100000 - assert (new_df["geo_id"].values == [31420, 49340]).all() - assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all() - assert new_df["new_counts"].values == pytest.approx([2.0, 13.0]) - assert new_df["cumulative_counts"].values == pytest.approx([45, 60]) - assert new_df["population"].values == pytest.approx([300, 25]) - assert 
new_df["incidence"].values == pytest.approx(exp_incidence) - assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop) + assert set(new_df["geo_id"].values) == set([31420, 49340, "al", "ga"]) + assert set(new_df["timestamp"]) == set(["2020-02-15"]) + assert new_df["new_counts"].values == pytest.approx([2.0, 13.0, 2.0, 8.0]) + assert new_df["cumulative_counts"].values == pytest.approx([45, 60, 12, 80]) + assert new_df["population"].values == pytest.approx([300, 25, 0, 0]) + assert new_df["incidence"].values == pytest.approx(np.append(exp_incidence, + [np.Inf, np.Inf])) + assert new_df["cumulative_prop"].values == pytest.approx(np.append(exp_cprop, + [np.Inf, np.Inf])) From 49be65daf62d04f2117be678dadff3559ae11e39 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Wed, 10 Jun 2020 00:24:08 -0400 Subject: [PATCH 07/18] delete whitespace --- jhu/delphi_jhu/pull.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jhu/delphi_jhu/pull.py b/jhu/delphi_jhu/pull.py index 3be512559..d4131db82 100644 --- a/jhu/delphi_jhu/pull.py +++ b/jhu/delphi_jhu/pull.py @@ -85,7 +85,7 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFr ] # Merge in population LOWERCASE, consistent across confirmed and deaths # Set population as NAN for fake fips - df = pd.merge(df, pop_df, on="FIPS", how = 'left') + df = pd.merge(df, pop_df, on="FIPS", how='left') # Manual correction for PR df.loc[df["FIPS"] == 72, "FIPS"] = 72000 From 7ee404cf4afd74f5f8f3ec47a5052b3a458459fc Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Wed, 10 Jun 2020 00:28:46 -0400 Subject: [PATCH 08/18] delete whitespace --- jhu/delphi_jhu/geo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/jhu/delphi_jhu/geo.py b/jhu/delphi_jhu/geo.py index 22d7a65cb..4d4ed9698 100644 --- a/jhu/delphi_jhu/geo.py +++ b/jhu/delphi_jhu/geo.py @@ -181,12 +181,12 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str): PROP_SENSORS = ("incidence", "cumulative_prop") if geo_res not in VALID_GEO_RES: raise ValueError(f"geo_res must be one of {VALID_GEO_RES}") - + df_mega = df[df['fips'].astype(int) >= 90001].copy() df_mega['geo_id'] = df_mega['fips'].apply(lambda x: JHU_FAKE_FIPS_TO_MEGA_FIPS[x]) - + df = df[df['fips'].astype(int) < 90001].copy() - + if geo_res == "county": df["geo_id"] = df["fips"] if sensor not in PROP_SENSORS: @@ -225,7 +225,7 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str): df = df.append(df_mega) df = df.drop("fips", axis=1) df = df.groupby(["geo_id", "timestamp"]).sum().reset_index() - + # Value would be negative for megacounties , which would not be considered in the main function df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE df["cumulative_prop"] = df["cumulative_counts"] / df["population"] * INCIDENCE_BASE From 25162d0baa8270d8954b2715c0fd2d6e2256c650 Mon Sep 17 00:00:00 2001 From: Jingjing Tang <31444565+jingjtang@users.noreply.github.com> Date: Wed, 10 Jun 2020 20:56:52 -0400 Subject: [PATCH 09/18] Update jhu/tests/test_geo.py Co-authored-by: krivard --- jhu/tests/test_geo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jhu/tests/test_geo.py b/jhu/tests/test_geo.py index 7dd74c0c4..7796d037b 100644 --- a/jhu/tests/test_geo.py +++ b/jhu/tests/test_geo.py @@ -135,7 +135,7 @@ def test_state(self): assert set(new_df["geo_id"].values) == set(["az", "ma", "al", "ga"]) assert set(new_df["timestamp"].values) == set(["2020-02-15"]) assert 
set(new_df["new_counts"].values) == set([32, 23, 2, 8]) - assert set(new_df["cumulative_counts"].values) == set([195, 160, 80, 12]) + assert set(new_df["cumulative_counts"].values) == set([195, 160, 12, 80]) assert set(new_df["population"].values) == set([2500, 25, 0]) assert set(new_df["incidence"].values) - set(exp_incidence) == set([np.Inf]) assert set(new_df["cumulative_prop"].values) - set(exp_cprop) == set([np.Inf]) From 0dae1693eee0b729d8f628607ea721fd24b6f271 Mon Sep 17 00:00:00 2001 From: Jingjing Tang <31444565+jingjtang@users.noreply.github.com> Date: Wed, 10 Jun 2020 20:57:51 -0400 Subject: [PATCH 10/18] Update jhu/tests/test_geo.py Co-authored-by: krivard --- jhu/tests/test_geo.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/jhu/tests/test_geo.py b/jhu/tests/test_geo.py index 7796d037b..633255642 100644 --- a/jhu/tests/test_geo.py +++ b/jhu/tests/test_geo.py @@ -169,15 +169,13 @@ def test_hrr(self): exp_incidence = np.array([13, 27]) / np.array([25, 2500]) * 100000 exp_cprop = np.array([60, 165]) / np.array([25, 2500]) * 100000 - assert set(new_df["geo_id"].values) == set([110, 147, "al", "ga"]) - assert set(new_df["timestamp"].values) == set(["2020-02-15"]) - assert new_df["new_counts"].values == pytest.approx([13.0, 27.0, 2, 8]) - assert new_df["cumulative_counts"].values == pytest.approx([60, 165, 12, 80]) - assert new_df["population"].values == pytest.approx([25, 2500, 0, 0]) - assert new_df["incidence"].values == pytest.approx(np.append(exp_incidence, - [np.Inf, np.Inf])) - assert new_df["cumulative_prop"].values == pytest.approx(np.append(exp_cprop, - [np.Inf, np.Inf])) + assert (new_df["geo_id"].values == [110, 147]).all() + assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all() + assert new_df["new_counts"].values == pytest.approx([13.0, 27.0]) + assert new_df["cumulative_counts"].values == pytest.approx([60, 165]) + assert new_df["population"].values == pytest.approx([25, 2500]) + assert new_df["incidence"].values == pytest.approx(exp_incidence) + assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop) def test_msa(self): From 64790fba8ac4651a2a4baa1965b598e942af2b5f Mon Sep 17 00:00:00 2001 From: Jingjing Tang <31444565+jingjtang@users.noreply.github.com> Date: Wed, 10 Jun 2020 20:58:12 -0400 Subject: [PATCH 11/18] Update jhu/tests/test_geo.py Co-authored-by: krivard --- jhu/tests/test_geo.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/jhu/tests/test_geo.py b/jhu/tests/test_geo.py index 633255642..5a7274180 100644 --- a/jhu/tests/test_geo.py +++ b/jhu/tests/test_geo.py @@ -206,12 +206,10 @@ def test_msa(self): exp_incidence = np.array([2, 13]) / np.array([300, 25]) * 100000 exp_cprop = np.array([45, 60]) / np.array([300, 25]) * 100000 - assert set(new_df["geo_id"].values) == set([31420, 49340, "al", "ga"]) - assert set(new_df["timestamp"]) == set(["2020-02-15"]) - assert new_df["new_counts"].values == pytest.approx([2.0, 13.0, 2.0, 8.0]) - assert new_df["cumulative_counts"].values == pytest.approx([45, 60, 12, 80]) - assert new_df["population"].values == pytest.approx([300, 25, 0, 0]) - assert new_df["incidence"].values == pytest.approx(np.append(exp_incidence, - [np.Inf, np.Inf])) - assert new_df["cumulative_prop"].values == pytest.approx(np.append(exp_cprop, - [np.Inf, np.Inf])) + assert (new_df["geo_id"].values == [31420, 49340]).all() + assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all() + assert new_df["new_counts"].values == 
pytest.approx([2.0, 13.0]) + assert new_df["cumulative_counts"].values == pytest.approx([45, 60]) + assert new_df["population"].values == pytest.approx([300, 25]) + assert new_df["incidence"].values == pytest.approx(exp_incidence) + assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop) From d187ebc78b99ba839858c613e4141f9ec5e323b0 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Thu, 11 Jun 2020 10:00:38 -0400 Subject: [PATCH 12/18] disable 7dav_ signal --- jhu/delphi_jhu/run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jhu/delphi_jhu/run.py b/jhu/delphi_jhu/run.py index a21c5bb54..4d4ad4f84 100644 --- a/jhu/delphi_jhu/run.py +++ b/jhu/delphi_jhu/run.py @@ -35,7 +35,7 @@ ] SMOOTHERS = [ "unsmoothed", - "seven_day_average", + #"seven_day_average", ] SENSOR_NAME_MAP = { "new_counts": ("incidence_num", False), @@ -45,7 +45,7 @@ } SMOOTHERS_MAP = { "unsmoothed": (identity, ''), - "seven_day_average": (seven_day_moving_average, '7day_avg_'), + #"seven_day_average": (seven_day_moving_average, '7dav_'), } GEO_RESOLUTIONS = [ "county", From 131b88bf90e6381f8b955e752ea5ea1b78858ecc Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Thu, 11 Jun 2020 10:01:23 -0400 Subject: [PATCH 13/18] cut 7day avg signal name --- jhu/tests/test_smooth.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jhu/tests/test_smooth.py b/jhu/tests/test_smooth.py index 1b43841bf..1f77fc76a 100644 --- a/jhu/tests/test_smooth.py +++ b/jhu/tests/test_smooth.py @@ -14,7 +14,7 @@ def test_output_files_smoothed(self, run_as_module): smoothed = pd.read_csv( join("receiving", - f"{dates[-1]}_state_confirmed_7day_avg_cumulative_num.csv") + f"{dates[-1]}_state_confirmed_7dav_cumulative_num.csv") ) raw = pd.concat([ From 003ac0cb011a06173e4ce3b366393b5e464cebd6 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Thu, 11 Jun 2020 10:49:31 -0400 Subject: [PATCH 14/18] delete df_mega for msa and hrr --- jhu/delphi_jhu/geo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jhu/delphi_jhu/geo.py b/jhu/delphi_jhu/geo.py index 4d4ed9698..c471a4ae3 100644 --- a/jhu/delphi_jhu/geo.py +++ b/jhu/delphi_jhu/geo.py @@ -220,9 +220,9 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str): merged["new_counts"] = merged["new_counts"] * merged["pop_prop"] merged["population"] = merged["population"] * merged["pop_prop"] df = merged.drop(["zip", "pop_prop", "hrrnum", "cbsa_id"], axis=1) - if sensor not in PROP_SENSORS: - df_mega["geo_id"] = df_mega["geo_id"].apply(fips_to_state) - df = df.append(df_mega) + # if sensor not in PROP_SENSORS: + # df_mega["geo_id"] = df_mega["geo_id"].apply(fips_to_state) + # df = df.append(df_mega) df = df.drop("fips", axis=1) df = df.groupby(["geo_id", "timestamp"]).sum().reset_index() From b911d8db39f7583e52f5b425c915dfe3fa68bfde Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Thu, 11 Jun 2020 10:51:43 -0400 Subject: [PATCH 15/18] revert test cases for msa and hrr --- jhu/tests/test_geo.py | 68 +++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/jhu/tests/test_geo.py b/jhu/tests/test_geo.py index 5a7274180..a96ad0d31 100644 --- a/jhu/tests/test_geo.py +++ b/jhu/tests/test_geo.py @@ -152,30 +152,30 @@ def test_hrr(self): } ) - df_mega = pd.DataFrame( - { - "fips": ["90013", "90001"], - "timestamp": ["2020-02-15", "2020-02-15"], - "new_counts": [8, 2], - "cumulative_counts": [80, 12], - "population": [np.nan, np.nan], - } - ) + # df_mega = pd.DataFrame( + # { 
+ # "fips": ["90013", "90001"], + # "timestamp": ["2020-02-15", "2020-02-15"], + # "new_counts": [8, 2], + # "cumulative_counts": [80, 12], + # "population": [np.nan, np.nan], + # } + # ) - df = df.append(df_mega) + # df = df.append(df_mega) new_df = geo_map(df, "hrr", MAP_DF, 'new_counts') exp_incidence = np.array([13, 27]) / np.array([25, 2500]) * 100000 exp_cprop = np.array([60, 165]) / np.array([25, 2500]) * 100000 - assert (new_df["geo_id"].values == [110, 147]).all() - assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all() - assert new_df["new_counts"].values == pytest.approx([13.0, 27.0]) - assert new_df["cumulative_counts"].values == pytest.approx([60, 165]) - assert new_df["population"].values == pytest.approx([25, 2500]) - assert new_df["incidence"].values == pytest.approx(exp_incidence) - assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop) + assert (new_df["geo_id"].values == [110, 147]).all() + assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all() + assert new_df["new_counts"].values == pytest.approx([13.0, 27.0]) + assert new_df["cumulative_counts"].values == pytest.approx([60, 165]) + assert new_df["population"].values == pytest.approx([25, 2500]) + assert new_df["incidence"].values == pytest.approx(exp_incidence) + assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop) def test_msa(self): @@ -189,27 +189,27 @@ def test_msa(self): } ) - df_mega = pd.DataFrame( - { - "fips": ["90013", "90001"], - "timestamp": ["2020-02-15", "2020-02-15"], - "new_counts": [8, 2], - "cumulative_counts": [80, 12], - "population": [np.nan, np.nan], - } - ) + # df_mega = pd.DataFrame( + # { + # "fips": ["90013", "90001"], + # "timestamp": ["2020-02-15", "2020-02-15"], + # "new_counts": [8, 2], + # "cumulative_counts": [80, 12], + # "population": [np.nan, np.nan], + # } + # ) - df = df.append(df_mega) + # df = df.append(df_mega) new_df = geo_map(df, "msa", MAP_DF, 'new_counts') exp_incidence = np.array([2, 13]) / np.array([300, 25]) * 100000 exp_cprop = np.array([45, 60]) / np.array([300, 25]) * 100000 - assert (new_df["geo_id"].values == [31420, 49340]).all() - assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all() - assert new_df["new_counts"].values == pytest.approx([2.0, 13.0]) - assert new_df["cumulative_counts"].values == pytest.approx([45, 60]) - assert new_df["population"].values == pytest.approx([300, 25]) - assert new_df["incidence"].values == pytest.approx(exp_incidence) - assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop) + assert (new_df["geo_id"].values == [31420, 49340]).all() + assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all() + assert new_df["new_counts"].values == pytest.approx([2.0, 13.0]) + assert new_df["cumulative_counts"].values == pytest.approx([45, 60]) + assert new_df["population"].values == pytest.approx([300, 25]) + assert new_df["incidence"].values == pytest.approx(exp_incidence) + assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop) From 99c3860e6d8967eea2389c1a447a9e9cd47853c7 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Thu, 11 Jun 2020 10:54:32 -0400 Subject: [PATCH 16/18] resolve conflicts --- jhu/delphi_jhu/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jhu/delphi_jhu/run.py b/jhu/delphi_jhu/run.py index 4d4ad4f84..aeef3fa57 100644 --- a/jhu/delphi_jhu/run.py +++ b/jhu/delphi_jhu/run.py @@ -45,7 +45,7 @@ } SMOOTHERS_MAP = { "unsmoothed": (identity, ''), - #"seven_day_average": 
(seven_day_moving_average, '7dav_'), + #"seven_day_average": (seven_day_moving_average, '7dav_', True), } GEO_RESOLUTIONS = [ "county", From eabe62436ac130e0f6d3acf7767fe235fd3e1b53 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Thu, 11 Jun 2020 10:55:20 -0400 Subject: [PATCH 17/18] resolve conflicts --- jhu/delphi_jhu/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jhu/delphi_jhu/run.py b/jhu/delphi_jhu/run.py index aeef3fa57..485b663ca 100644 --- a/jhu/delphi_jhu/run.py +++ b/jhu/delphi_jhu/run.py @@ -44,7 +44,7 @@ "cumulative_prop": ("cumulative_prop", False), } SMOOTHERS_MAP = { - "unsmoothed": (identity, ''), + "unsmoothed": (identity, '', False), #"seven_day_average": (seven_day_moving_average, '7dav_', True), } GEO_RESOLUTIONS = [ From cc203c6ab59e4df896a9685eb6fb9da24361a63d Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Fri, 19 Jun 2020 10:05:49 -0400 Subject: [PATCH 18/18] resolve conflicts --- jhu/delphi_jhu/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jhu/delphi_jhu/run.py b/jhu/delphi_jhu/run.py index 485b663ca..c845fa317 100644 --- a/jhu/delphi_jhu/run.py +++ b/jhu/delphi_jhu/run.py @@ -45,7 +45,7 @@ } SMOOTHERS_MAP = { "unsmoothed": (identity, '', False), - #"seven_day_average": (seven_day_moving_average, '7dav_', True), + "seven_day_average": (seven_day_moving_average, '7dav_', True), } GEO_RESOLUTIONS = [ "county",
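
For reference, a minimal standalone sketch of the megacounty handling these
patches converge on (the split-and-map step of geo_map from patches 01 and 04).
The helper name split_megacounties and the three-state STATE_TO_FIPS below are
illustrative only, not part of the indicator package, and the sketch avoids the
since-deprecated DataFrame.append that the patches themselves use.

    import pandas as pd

    STATE_TO_FIPS = {"AL": "01", "GA": "13", "MA": "25"}  # abbreviated for illustration

    # JHU codes unassigned cases/deaths as fake FIPS 900XX; patch 04 maps them
    # to the corresponding megacounty FIPS XX000.
    JHU_FAKE_FIPS_TO_MEGA_FIPS = {f"900{x}": f"{x}000" for x in STATE_TO_FIPS.values()}

    def split_megacounties(df: pd.DataFrame):
        """Split county rows from unassigned ("megacounty") rows, as geo_map does."""
        is_mega = df["fips"].astype(int) >= 90001
        df_mega = df[is_mega].copy()
        df_mega["geo_id"] = df_mega["fips"].map(JHU_FAKE_FIPS_TO_MEGA_FIPS)
        return df[~is_mega].copy(), df_mega

    counties = pd.DataFrame({"fips": ["13027", "90001", "90013"],
                             "new_counts": [5, 2, 8]})
    df, df_mega = split_megacounties(counties)
    print(df_mega["geo_id"].tolist())  # ['01000', '13000']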