From 44febee95e8599c817ec88135dbeb57a46ba17a0 Mon Sep 17 00:00:00 2001
From: Jingjing Tang
Date: Mon, 8 Jun 2020 15:33:59 -0400
Subject: [PATCH 01/18] update code for unassigned cases/deaths

---
 jhu/delphi_jhu/geo.py  | 81 ++++++++++++++++++++++++++++++++++++++++--
 jhu/delphi_jhu/pull.py |  8 +++--
 jhu/delphi_jhu/run.py  |  2 +-
 3 files changed, 85 insertions(+), 6 deletions(-)

diff --git a/jhu/delphi_jhu/geo.py b/jhu/delphi_jhu/geo.py
index 4e305917b..84629ec09 100644
--- a/jhu/delphi_jhu/geo.py
+++ b/jhu/delphi_jhu/geo.py
@@ -89,6 +89,62 @@
 FIPS_TO_STATE = {v: k.lower() for k, v in STATE_TO_FIPS.items()}
 
+# Fake fips to States
+
+FAKE_FIPS_TO_STATES = {
+    "90001":"al",
+    "90002":"ak",
+    "90004":"az",
+    "90005":"ar",
+    "90006":"ca",
+    "90008":"co",
+    "90009":"ct",
+    "90010":"de",
+    "90011":"dc",
+    "90012":"fl",
+    "90013":"ga",
+    "90015":"hi",
+    "90016":"id",
+    "90017":"il",
+    "90018":"in",
+    "90019":"ia",
+    "90020":"ks",
+    "90021":"ky",
+    "90022":"la",
+    "90023":"me",
+    "90024":"md",
+    "90025":"ma",
+    "90026":"mi",
+    "90027":"mn",
+    "90028":"ms",
+    "90029":"mo",
+    "90030":"mt",
+    "90031":"ne",
+    "90032":"nv",
+    "90033":"nh",
+    "90034":"nj",
+    "90035":"nm",
+    "90036":"ny",
+    "90037":"nc",
+    "90038":"nd",
+    "90039":"oh",
+    "90040":"ok",
+    "90041":"or",
+    "90042":"pa",
+    "90044":"ri",
+    "90045":"sc",
+    "90046":"sd",
+    "90047":"tn",
+    "90048":"tx",
+    "90049":"ut",
+    "90050":"vt",
+    "90051":"va",
+    "90053":"wa",
+    "90054":"wv",
+    "90055":"wi",
+    "90056":"wy"
+}
+
 def fips_to_state(fips: str) -> str:
     """Wrapper that handles exceptions to the FIPS scheme in the JHU data.
 
@@ -118,7 +174,11 @@ def fips_to_state(fips: str) -> str:
         return FIPS_TO_STATE["25"]  # Dukes & Nantucket -> Massachusetts
     if fips == "70003":
         return FIPS_TO_STATE["29"]  # Kansas City -> Missouri
-    return FIPS_TO_STATE[fips[:2]]
+    # Fake fips -> states
+    if fips[:2] == '90':
+        return FAKE_FIPS_TO_STATES[fips]
+    else:
+        return FIPS_TO_STATE[fips[:2]]
 
 
 def disburse(df: pd.DataFrame, pooled_fips: str, fips_list: list):
@@ -148,7 +208,7 @@ def disburse(df: pd.DataFrame, pooled_fips: str, fips_list: list):
     return df
 
 
-def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
+def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str):
     """
     Maps a DataFrame df, which contains data at the county resolution, and
     aggregate it to the geographic resolution geo_res.
@@ -169,15 +229,26 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
         Columns: geo_id, timestamp, ...
""" VALID_GEO_RES = ("county", "state", "msa", "hrr") + #It is not clear to calculate the proportion for unassigned cases/deaths + PROP_SENSORS = ("incidence", "cumulative_prop") if geo_res not in VALID_GEO_RES: raise ValueError(f"geo_res must be one of {VALID_GEO_RES}") - df = df.copy() + + df_mega = df[df['fips'].astype(int) >= 90001].copy() + df_mega['geo_id'] = df_mega['fips'].apply(fips_to_state) + + df = df[df['fips'].astype(int) < 90001].copy() + if geo_res == "county": df["geo_id"] = df["fips"] + if sensor not in PROP_SENSORS: + df = df.append(df_mega) elif geo_res == "state": # Grab first two digits of fips # Map state fips to us postal code df["geo_id"] = df["fips"].apply(fips_to_state) + # Add unassigned cases/deaths + df = df.append(df_mega) elif geo_res in ("msa", "hrr"): # Disburse Dukes & Nantucket to individual counties df = disburse(df, DN_FIPS, DN_COUNTY_FIPS) @@ -200,8 +271,12 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame): merged["new_counts"] = merged["new_counts"] * merged["pop_prop"] merged["population"] = merged["population"] * merged["pop_prop"] df = merged.drop(["zip", "pop_prop", "hrrnum", "cbsa_id"], axis=1) + if sensor not in PROP_SENSORS: + df = df.append(df_mega) df = df.drop("fips", axis=1) df = df.groupby(["geo_id", "timestamp"]).sum().reset_index() + + # Value would be negative for megacounties , which would not be considered in the main function df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE df["cumulative_prop"] = df["cumulative_counts"] / df["population"] * INCIDENCE_BASE return df diff --git a/jhu/delphi_jhu/pull.py b/jhu/delphi_jhu/pull.py index 1049d5e0a..a60487ae8 100644 --- a/jhu/delphi_jhu/pull.py +++ b/jhu/delphi_jhu/pull.py @@ -62,7 +62,7 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFr MIN_FIPS = 1000 MAX_FIPS = 57000 EXTRA_FIPS = ( - 72, # Puerto Rico (provided as the entire state) + 72, # Puerto Rico (provided as the entire state) 70002, # Kansas City, MO 70003, # Dukes and Nantucket Counties, MA ) @@ -79,9 +79,13 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFr & (df["FIPS"] < MAX_FIPS) ) # "Uncategorized", etc. 
| df["FIPS"].isin(EXTRA_FIPS) + # Get Fake FIPS for unassigned cases + | np.logical_and(df['FIPS'] >= 90001, + df['FIPS'] <= 90056) ] # Merge in population LOWERCASE, consistent across confirmed and deaths - df = pd.merge(df, pop_df, on="FIPS") + # set population as -1 for fake fips + df = pd.merge(df, pop_df, on="FIPS", how = 'left').fillna(-1) # Manual correction for PR df.loc[df["FIPS"] == 72, "FIPS"] = 72000 diff --git a/jhu/delphi_jhu/run.py b/jhu/delphi_jhu/run.py index cfb738f21..a21c5bb54 100644 --- a/jhu/delphi_jhu/run.py +++ b/jhu/delphi_jhu/run.py @@ -77,7 +77,7 @@ def run_module(): print(geo_res, metric, sensor, smoother) df = dfs[metric] # Aggregate to appropriate geographic resolution - df = geo_map(df, geo_res, map_df) + df = geo_map(df, geo_res, map_df, sensor) df["val"] = SMOOTHERS_MAP[smoother][0](df[sensor].values) df["se"] = np.nan df["sample_size"] = np.nan From 849a32ec63ca9eee067104c664d4d5bc7bd92a01 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Mon, 8 Jun 2020 18:41:14 -0400 Subject: [PATCH 02/18] update the dict for fake fips --- jhu/delphi_jhu/geo.py | 110 ++++++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 52 deletions(-) diff --git a/jhu/delphi_jhu/geo.py b/jhu/delphi_jhu/geo.py index 84629ec09..0ad797692 100644 --- a/jhu/delphi_jhu/geo.py +++ b/jhu/delphi_jhu/geo.py @@ -91,60 +91,62 @@ # Fake fips to States -FAKE_FIPS_TO_STATES = { - "90001":"al", - "90002":"ak", - "90004":"az", - "90005":"ar", - "90006":"ca", - "90008":"co", - "90009":"ct", - "90010":"de", - "90011":"dc", - "90012":"fl", - "90013":"ga", - "90015":"hi", - "90016":"id", - "90017":"il", - "90018":"in", - "90019":"ia", - "90020":"ks", - "90021":"ky", - "90022":"la", - "90023":"me", - "90024":"md", - "90025":"ma", - "90026":"mi", - "90027":"mn", - "90028":"ms", - "90029":"mo", - "90030":"mt", - "90031":"ne", - "90032":"nv", - "90033":"nh", - "90034":"nj", - "90035":"nm", - "90036":"ny", - "90037":"nc", - "90038":"nd", - "90039":"oh", - "90040":"ok", - "90041":"or", - "90042":"pa", - "90044":"ri", - "90045":"sc", - "90046":"sd", - "90047":"tn", - "90048":"tx", - "90049":"ut", - "90050":"vt", - "90051":"va", - "90053":"wa", - "90054":"wv", - "90055":"wi", - "90056":"wy" +STATES_TO_JHU_FIPS_FOR_UNASSIGNED = { + "AL":"01", + "AK":"02", + "AZ":"04", + "AR":"05", + "CA":"06", + "CO":"08", + "CT":"09", + "DE":"10", + "DC":"11", + "FL":"12", + "GA":"13", + "HI":"15", + "ID":"16", + "IL":"17", + "IN":"18", + "IA":"19", + "KS":"20", + "KY":"21", + "LA":"22", + "ME":"23", + "MD":"24", + "MA":"25", + "MI":"26", + "MN":"27", + "MS":"28", + "MO":"29", + "MT":"30", + "NE":"31", + "NV":"32", + "NH":"33", + "NJ":"34", + "NM":"35", + "NY":"36", + "NC":"37", + "ND":"38", + "OH":"39", + "OK":"40", + "OR":"41", + "PA":"42", + "RI":"44", + "SC":"45", + "SD":"46", + "TN":"47", + "TX":"48", + "UT":"49", + "VT":"50", + "VA":"51", + "WA":"53", + "WV":"54", + "WI":"55", + "WY":"56" } +FAKE_FIPS_TO_STATES = {f'900{v}' : k.lower() for k, v in STATES_TO_JHU_FIPS_FOR_UNASSIGNED.items()} + def fips_to_state(fips: str) -> str: """Wrapper that handles exceptions to the FIPS scheme in the JHU data. @@ -222,6 +224,10 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str): ('county', 'state', 'msa', 'hrr'). map_df: pd.DataFrame Loaded from static file "fips_prop_pop.csv". + sensor: str + sensor type. 
Valid options: + ("new_counts", "cumulative_counts", + "incidence", "cumulative_prop") Returns ------- From f1f5cddef7bf210a618499b88b14ad8e45a85311 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Mon, 8 Jun 2020 18:47:34 -0400 Subject: [PATCH 03/18] Set population for fake fips as NAN --- jhu/delphi_jhu/pull.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jhu/delphi_jhu/pull.py b/jhu/delphi_jhu/pull.py index a60487ae8..3be512559 100644 --- a/jhu/delphi_jhu/pull.py +++ b/jhu/delphi_jhu/pull.py @@ -84,8 +84,8 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFr df['FIPS'] <= 90056) ] # Merge in population LOWERCASE, consistent across confirmed and deaths - # set population as -1 for fake fips - df = pd.merge(df, pop_df, on="FIPS", how = 'left').fillna(-1) + # Set population as NAN for fake fips + df = pd.merge(df, pop_df, on="FIPS", how = 'left') # Manual correction for PR df.loc[df["FIPS"] == 72, "FIPS"] = 72000 From 84bbfc6ec10be9401f8bb36c8c7318a85c27c7b2 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Mon, 8 Jun 2020 19:24:28 -0400 Subject: [PATCH 04/18] update geo_id for megacounty --- jhu/delphi_jhu/geo.py | 67 ++++--------------------------------------- 1 file changed, 5 insertions(+), 62 deletions(-) diff --git a/jhu/delphi_jhu/geo.py b/jhu/delphi_jhu/geo.py index 0ad797692..dba93c371 100644 --- a/jhu/delphi_jhu/geo.py +++ b/jhu/delphi_jhu/geo.py @@ -91,61 +91,7 @@ # Fake fips to States -STATES_TO_JHU_FIPS_FOR_UNASSIGNED = { - "AL":"01", - "AK":"02", - "AZ":"04", - "AR":"05", - "CA":"06", - "CO":"08", - "CT":"09", - "DE":"10", - "DC":"11", - "FL":"12", - "GA":"13", - "HI":"15", - "ID":"16", - "IL":"17", - "IN":"18", - "IA":"19", - "KS":"20", - "KY":"21", - "LA":"22", - "ME":"23", - "MD":"24", - "MA":"25", - "MI":"26", - "MN":"27", - "MS":"28", - "MO":"29", - "MT":"30", - "NE":"31", - "NV":"32", - "NH":"33", - "NJ":"34", - "NM":"35", - "NY":"36", - "NC":"37", - "ND":"38", - "OH":"39", - "OK":"40", - "OR":"41", - "PA":"42", - "RI":"44", - "SC":"45", - "SD":"46", - "TN":"47", - "TX":"48", - "UT":"49", - "VT":"50", - "VA":"51", - "WA":"53", - "WV":"54", - "WI":"55", - "WY":"56" -} - -FAKE_FIPS_TO_STATES = {f'900{v}' : k.lower() for k, v in STATES_TO_JHU_FIPS_FOR_UNASSIGNED.items()} +JHU_FAKE_FIPS_TO_MEGA_FIPS = {f'900{x}' : f'{x}000' for x in STATE_TO_FIPS.values()} def fips_to_state(fips: str) -> str: @@ -176,11 +122,7 @@ def fips_to_state(fips: str) -> str: return FIPS_TO_STATE["25"] # Dukes & Nantucket -> Massachusetts if fips == "70003": return FIPS_TO_STATE["29"] # Kansas City -> Missouri - # Fake fips -> states - if fips[:2] == '90': - return FAKE_FIPS_TO_STATES[fips] - else: - return FIPS_TO_STATE[fips[:2]] + return FIPS_TO_STATE[fips[:2]] def disburse(df: pd.DataFrame, pooled_fips: str, fips_list: list): @@ -241,7 +183,7 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str): raise ValueError(f"geo_res must be one of {VALID_GEO_RES}") df_mega = df[df['fips'].astype(int) >= 90001].copy() - df_mega['geo_id'] = df_mega['fips'].apply(fips_to_state) + df_mega['geo_id'] = df_mega['fips'].apply(lambda x: JHU_FAKE_FIPS_TO_MEGA_FIPS[x]) df = df[df['fips'].astype(int) < 90001].copy() @@ -252,9 +194,10 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str): elif geo_res == "state": # Grab first two digits of fips # Map state fips to us postal code - df["geo_id"] = df["fips"].apply(fips_to_state) + df["geo_id"] = df["fips"] # Add unassigned cases/deaths df = df.append(df_mega) 
+ df["geo_id"] = df["geo_id"].apply(fips_to_state) elif geo_res in ("msa", "hrr"): # Disburse Dukes & Nantucket to individual counties df = disburse(df, DN_FIPS, DN_COUNTY_FIPS) From 030d38ad172c1a412261c6b153246459313c451c Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Wed, 10 Jun 2020 00:08:42 -0400 Subject: [PATCH 05/18] update naming for megacounty --- jhu/delphi_jhu/geo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/jhu/delphi_jhu/geo.py b/jhu/delphi_jhu/geo.py index dba93c371..22d7a65cb 100644 --- a/jhu/delphi_jhu/geo.py +++ b/jhu/delphi_jhu/geo.py @@ -221,6 +221,7 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str): merged["population"] = merged["population"] * merged["pop_prop"] df = merged.drop(["zip", "pop_prop", "hrrnum", "cbsa_id"], axis=1) if sensor not in PROP_SENSORS: + df_mega["geo_id"] = df_mega["geo_id"].apply(fips_to_state) df = df.append(df_mega) df = df.drop("fips", axis=1) df = df.groupby(["geo_id", "timestamp"]).sum().reset_index() From f66b43e9726f083ee94260d2d019186d0aac056d Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Wed, 10 Jun 2020 00:09:17 -0400 Subject: [PATCH 06/18] modify test cases for megacounty aggregation --- jhu/tests/test_geo.py | 121 +++++++++++++++++++++++++++++++----------- 1 file changed, 90 insertions(+), 31 deletions(-) diff --git a/jhu/tests/test_geo.py b/jhu/tests/test_geo.py index 8d45eb336..7dd74c0c4 100644 --- a/jhu/tests/test_geo.py +++ b/jhu/tests/test_geo.py @@ -25,6 +25,13 @@ def test_normal(self): assert fips_to_state("12003") == "fl" assert fips_to_state("50103") == "vt" assert fips_to_state("15003") == "hi" + + def test_mega(self): + + assert fips_to_state("01000") == "al" + assert fips_to_state("13000") == "ga" + assert fips_to_state("44000") == "ri" + assert fips_to_state("12000") == "fl" class TestDisburse: @@ -74,15 +81,27 @@ def test_county(self): } ) - new_df = geo_map(df, "county", MAP_DF) + df_mega = pd.DataFrame( + { + "fips": ["90013", "90001"], + "timestamp": ["2020-02-15", "2020-02-15"], + "new_counts": [8, 2], + "cumulative_counts": [80, 12], + "population": [np.nan, np.nan], + } + ) + + df = df.append(df_mega) + + new_df = geo_map(df, "county", MAP_DF, 'new_counts') exp_incidence = df["new_counts"] / df["population"] * 100000 exp_cprop = df["cumulative_counts"] / df["population"] * 100000 - - assert set(new_df["geo_id"].values) == set(df["fips"].values) + + assert set(new_df["geo_id"].values) == set(['01000', '13000', '48027', '50103', '53003']) assert set(new_df["timestamp"].values) == set(df["timestamp"].values) - assert set(new_df["incidence"].values) == set(exp_incidence.values) - assert set(new_df["cumulative_prop"].values) == set(exp_cprop.values) + assert set(new_df["incidence"].values) - set(exp_incidence.values) == set([np.Inf]) + assert set(new_df["cumulative_prop"].values) - set(exp_cprop.values) == set([np.Inf]) def test_state(self): @@ -95,19 +114,31 @@ def test_state(self): "population": [100, 2100, 300, 25], } ) + + df_mega = pd.DataFrame( + { + "fips": ["90013", "90001", "04000", "25000"], + "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15", "2020-02-15"], + "new_counts": [8, 2, 5, 10], + "cumulative_counts": [80, 12, 30, 100], + "population": [np.nan, np.nan, np.nan, np.nan], + } + ) + + df = df.append(df_mega) - new_df = geo_map(df, "state", MAP_DF) + new_df = geo_map(df, "state", MAP_DF, 'new_counts') - exp_incidence = np.array([27, 13]) / np.array([2500, 25]) * 100000 - exp_cprop = np.array([165, 60]) / np.array([2500, 25]) * 100000 + exp_incidence = 
np.array([27 + 5, 13 + 10]) / np.array([2500, 25]) * 100000 + exp_cprop = np.array([165 + 30, 60 + 100]) / np.array([2500, 25]) * 100000 - assert (new_df["geo_id"].values == ["az", "ma"]).all() - assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all() - assert (new_df["new_counts"].values == [27, 13]).all() - assert (new_df["cumulative_counts"].values == [165, 60]).all() - assert (new_df["population"].values == [2500, 25]).all() - assert (new_df["incidence"].values == exp_incidence).all() - assert (new_df["cumulative_prop"].values == exp_cprop).all() + assert set(new_df["geo_id"].values) == set(["az", "ma", "al", "ga"]) + assert set(new_df["timestamp"].values) == set(["2020-02-15"]) + assert set(new_df["new_counts"].values) == set([32, 23, 2, 8]) + assert set(new_df["cumulative_counts"].values) == set([195, 160, 80, 12]) + assert set(new_df["population"].values) == set([2500, 25, 0]) + assert set(new_df["incidence"].values) - set(exp_incidence) == set([np.Inf]) + assert set(new_df["cumulative_prop"].values) - set(exp_cprop) == set([np.Inf]) def test_hrr(self): @@ -121,18 +152,32 @@ def test_hrr(self): } ) - new_df = geo_map(df, "hrr", MAP_DF) + df_mega = pd.DataFrame( + { + "fips": ["90013", "90001"], + "timestamp": ["2020-02-15", "2020-02-15"], + "new_counts": [8, 2], + "cumulative_counts": [80, 12], + "population": [np.nan, np.nan], + } + ) + + df = df.append(df_mega) + + new_df = geo_map(df, "hrr", MAP_DF, 'new_counts') exp_incidence = np.array([13, 27]) / np.array([25, 2500]) * 100000 exp_cprop = np.array([60, 165]) / np.array([25, 2500]) * 100000 - assert (new_df["geo_id"].values == [110, 147]).all() - assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all() - assert new_df["new_counts"].values == pytest.approx([13.0, 27.0]) - assert new_df["cumulative_counts"].values == pytest.approx([60, 165]) - assert new_df["population"].values == pytest.approx([25, 2500]) - assert new_df["incidence"].values == pytest.approx(exp_incidence) - assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop) + assert set(new_df["geo_id"].values) == set([110, 147, "al", "ga"]) + assert set(new_df["timestamp"].values) == set(["2020-02-15"]) + assert new_df["new_counts"].values == pytest.approx([13.0, 27.0, 2, 8]) + assert new_df["cumulative_counts"].values == pytest.approx([60, 165, 12, 80]) + assert new_df["population"].values == pytest.approx([25, 2500, 0, 0]) + assert new_df["incidence"].values == pytest.approx(np.append(exp_incidence, + [np.Inf, np.Inf])) + assert new_df["cumulative_prop"].values == pytest.approx(np.append(exp_cprop, + [np.Inf, np.Inf])) def test_msa(self): @@ -145,16 +190,30 @@ def test_msa(self): "population": [100, 2100, 300, 25], } ) + + df_mega = pd.DataFrame( + { + "fips": ["90013", "90001"], + "timestamp": ["2020-02-15", "2020-02-15"], + "new_counts": [8, 2], + "cumulative_counts": [80, 12], + "population": [np.nan, np.nan], + } + ) + + df = df.append(df_mega) - new_df = geo_map(df, "msa", MAP_DF) + new_df = geo_map(df, "msa", MAP_DF, 'new_counts') exp_incidence = np.array([2, 13]) / np.array([300, 25]) * 100000 exp_cprop = np.array([45, 60]) / np.array([300, 25]) * 100000 - assert (new_df["geo_id"].values == [31420, 49340]).all() - assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all() - assert new_df["new_counts"].values == pytest.approx([2.0, 13.0]) - assert new_df["cumulative_counts"].values == pytest.approx([45, 60]) - assert new_df["population"].values == pytest.approx([300, 25]) - assert 
new_df["incidence"].values == pytest.approx(exp_incidence) - assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop) + assert set(new_df["geo_id"].values) == set([31420, 49340, "al", "ga"]) + assert set(new_df["timestamp"]) == set(["2020-02-15"]) + assert new_df["new_counts"].values == pytest.approx([2.0, 13.0, 2.0, 8.0]) + assert new_df["cumulative_counts"].values == pytest.approx([45, 60, 12, 80]) + assert new_df["population"].values == pytest.approx([300, 25, 0, 0]) + assert new_df["incidence"].values == pytest.approx(np.append(exp_incidence, + [np.Inf, np.Inf])) + assert new_df["cumulative_prop"].values == pytest.approx(np.append(exp_cprop, + [np.Inf, np.Inf])) From 49be65daf62d04f2117be678dadff3559ae11e39 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Wed, 10 Jun 2020 00:24:08 -0400 Subject: [PATCH 07/18] delete whitespace --- jhu/delphi_jhu/pull.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jhu/delphi_jhu/pull.py b/jhu/delphi_jhu/pull.py index 3be512559..d4131db82 100644 --- a/jhu/delphi_jhu/pull.py +++ b/jhu/delphi_jhu/pull.py @@ -85,7 +85,7 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFr ] # Merge in population LOWERCASE, consistent across confirmed and deaths # Set population as NAN for fake fips - df = pd.merge(df, pop_df, on="FIPS", how = 'left') + df = pd.merge(df, pop_df, on="FIPS", how='left') # Manual correction for PR df.loc[df["FIPS"] == 72, "FIPS"] = 72000 From 7ee404cf4afd74f5f8f3ec47a5052b3a458459fc Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Wed, 10 Jun 2020 00:28:46 -0400 Subject: [PATCH 08/18] delete whitespace --- jhu/delphi_jhu/geo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/jhu/delphi_jhu/geo.py b/jhu/delphi_jhu/geo.py index 22d7a65cb..4d4ed9698 100644 --- a/jhu/delphi_jhu/geo.py +++ b/jhu/delphi_jhu/geo.py @@ -181,12 +181,12 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str): PROP_SENSORS = ("incidence", "cumulative_prop") if geo_res not in VALID_GEO_RES: raise ValueError(f"geo_res must be one of {VALID_GEO_RES}") - + df_mega = df[df['fips'].astype(int) >= 90001].copy() df_mega['geo_id'] = df_mega['fips'].apply(lambda x: JHU_FAKE_FIPS_TO_MEGA_FIPS[x]) - + df = df[df['fips'].astype(int) < 90001].copy() - + if geo_res == "county": df["geo_id"] = df["fips"] if sensor not in PROP_SENSORS: @@ -225,7 +225,7 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str): df = df.append(df_mega) df = df.drop("fips", axis=1) df = df.groupby(["geo_id", "timestamp"]).sum().reset_index() - + # Value would be negative for megacounties , which would not be considered in the main function df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE df["cumulative_prop"] = df["cumulative_counts"] / df["population"] * INCIDENCE_BASE From 25162d0baa8270d8954b2715c0fd2d6e2256c650 Mon Sep 17 00:00:00 2001 From: Jingjing Tang <31444565+jingjtang@users.noreply.github.com> Date: Wed, 10 Jun 2020 20:56:52 -0400 Subject: [PATCH 09/18] Update jhu/tests/test_geo.py Co-authored-by: krivard --- jhu/tests/test_geo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jhu/tests/test_geo.py b/jhu/tests/test_geo.py index 7dd74c0c4..7796d037b 100644 --- a/jhu/tests/test_geo.py +++ b/jhu/tests/test_geo.py @@ -135,7 +135,7 @@ def test_state(self): assert set(new_df["geo_id"].values) == set(["az", "ma", "al", "ga"]) assert set(new_df["timestamp"].values) == set(["2020-02-15"]) assert 
set(new_df["new_counts"].values) == set([32, 23, 2, 8]) - assert set(new_df["cumulative_counts"].values) == set([195, 160, 80, 12]) + assert set(new_df["cumulative_counts"].values) == set([195, 160, 12, 80]) assert set(new_df["population"].values) == set([2500, 25, 0]) assert set(new_df["incidence"].values) - set(exp_incidence) == set([np.Inf]) assert set(new_df["cumulative_prop"].values) - set(exp_cprop) == set([np.Inf]) From 0dae1693eee0b729d8f628607ea721fd24b6f271 Mon Sep 17 00:00:00 2001 From: Jingjing Tang <31444565+jingjtang@users.noreply.github.com> Date: Wed, 10 Jun 2020 20:57:51 -0400 Subject: [PATCH 10/18] Update jhu/tests/test_geo.py Co-authored-by: krivard --- jhu/tests/test_geo.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/jhu/tests/test_geo.py b/jhu/tests/test_geo.py index 7796d037b..633255642 100644 --- a/jhu/tests/test_geo.py +++ b/jhu/tests/test_geo.py @@ -169,15 +169,13 @@ def test_hrr(self): exp_incidence = np.array([13, 27]) / np.array([25, 2500]) * 100000 exp_cprop = np.array([60, 165]) / np.array([25, 2500]) * 100000 - assert set(new_df["geo_id"].values) == set([110, 147, "al", "ga"]) - assert set(new_df["timestamp"].values) == set(["2020-02-15"]) - assert new_df["new_counts"].values == pytest.approx([13.0, 27.0, 2, 8]) - assert new_df["cumulative_counts"].values == pytest.approx([60, 165, 12, 80]) - assert new_df["population"].values == pytest.approx([25, 2500, 0, 0]) - assert new_df["incidence"].values == pytest.approx(np.append(exp_incidence, - [np.Inf, np.Inf])) - assert new_df["cumulative_prop"].values == pytest.approx(np.append(exp_cprop, - [np.Inf, np.Inf])) + assert (new_df["geo_id"].values == [110, 147]).all() + assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all() + assert new_df["new_counts"].values == pytest.approx([13.0, 27.0]) + assert new_df["cumulative_counts"].values == pytest.approx([60, 165]) + assert new_df["population"].values == pytest.approx([25, 2500]) + assert new_df["incidence"].values == pytest.approx(exp_incidence) + assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop) def test_msa(self): From 64790fba8ac4651a2a4baa1965b598e942af2b5f Mon Sep 17 00:00:00 2001 From: Jingjing Tang <31444565+jingjtang@users.noreply.github.com> Date: Wed, 10 Jun 2020 20:58:12 -0400 Subject: [PATCH 11/18] Update jhu/tests/test_geo.py Co-authored-by: krivard --- jhu/tests/test_geo.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/jhu/tests/test_geo.py b/jhu/tests/test_geo.py index 633255642..5a7274180 100644 --- a/jhu/tests/test_geo.py +++ b/jhu/tests/test_geo.py @@ -206,12 +206,10 @@ def test_msa(self): exp_incidence = np.array([2, 13]) / np.array([300, 25]) * 100000 exp_cprop = np.array([45, 60]) / np.array([300, 25]) * 100000 - assert set(new_df["geo_id"].values) == set([31420, 49340, "al", "ga"]) - assert set(new_df["timestamp"]) == set(["2020-02-15"]) - assert new_df["new_counts"].values == pytest.approx([2.0, 13.0, 2.0, 8.0]) - assert new_df["cumulative_counts"].values == pytest.approx([45, 60, 12, 80]) - assert new_df["population"].values == pytest.approx([300, 25, 0, 0]) - assert new_df["incidence"].values == pytest.approx(np.append(exp_incidence, - [np.Inf, np.Inf])) - assert new_df["cumulative_prop"].values == pytest.approx(np.append(exp_cprop, - [np.Inf, np.Inf])) + assert (new_df["geo_id"].values == [31420, 49340]).all() + assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all() + assert new_df["new_counts"].values == 
pytest.approx([2.0, 13.0]) + assert new_df["cumulative_counts"].values == pytest.approx([45, 60]) + assert new_df["population"].values == pytest.approx([300, 25]) + assert new_df["incidence"].values == pytest.approx(exp_incidence) + assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop) From d187ebc78b99ba839858c613e4141f9ec5e323b0 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Thu, 11 Jun 2020 10:00:38 -0400 Subject: [PATCH 12/18] disable 7dav_ signal --- jhu/delphi_jhu/run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jhu/delphi_jhu/run.py b/jhu/delphi_jhu/run.py index a21c5bb54..4d4ad4f84 100644 --- a/jhu/delphi_jhu/run.py +++ b/jhu/delphi_jhu/run.py @@ -35,7 +35,7 @@ ] SMOOTHERS = [ "unsmoothed", - "seven_day_average", + #"seven_day_average", ] SENSOR_NAME_MAP = { "new_counts": ("incidence_num", False), @@ -45,7 +45,7 @@ } SMOOTHERS_MAP = { "unsmoothed": (identity, ''), - "seven_day_average": (seven_day_moving_average, '7day_avg_'), + #"seven_day_average": (seven_day_moving_average, '7dav_'), } GEO_RESOLUTIONS = [ "county", From 131b88bf90e6381f8b955e752ea5ea1b78858ecc Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Thu, 11 Jun 2020 10:01:23 -0400 Subject: [PATCH 13/18] cut 7day avg signal name --- jhu/tests/test_smooth.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jhu/tests/test_smooth.py b/jhu/tests/test_smooth.py index 1b43841bf..1f77fc76a 100644 --- a/jhu/tests/test_smooth.py +++ b/jhu/tests/test_smooth.py @@ -14,7 +14,7 @@ def test_output_files_smoothed(self, run_as_module): smoothed = pd.read_csv( join("receiving", - f"{dates[-1]}_state_confirmed_7day_avg_cumulative_num.csv") + f"{dates[-1]}_state_confirmed_7dav_cumulative_num.csv") ) raw = pd.concat([ From 003ac0cb011a06173e4ce3b366393b5e464cebd6 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Thu, 11 Jun 2020 10:49:31 -0400 Subject: [PATCH 14/18] delete df_mega for msa and hrr --- jhu/delphi_jhu/geo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jhu/delphi_jhu/geo.py b/jhu/delphi_jhu/geo.py index 4d4ed9698..c471a4ae3 100644 --- a/jhu/delphi_jhu/geo.py +++ b/jhu/delphi_jhu/geo.py @@ -220,9 +220,9 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str): merged["new_counts"] = merged["new_counts"] * merged["pop_prop"] merged["population"] = merged["population"] * merged["pop_prop"] df = merged.drop(["zip", "pop_prop", "hrrnum", "cbsa_id"], axis=1) - if sensor not in PROP_SENSORS: - df_mega["geo_id"] = df_mega["geo_id"].apply(fips_to_state) - df = df.append(df_mega) + # if sensor not in PROP_SENSORS: + # df_mega["geo_id"] = df_mega["geo_id"].apply(fips_to_state) + # df = df.append(df_mega) df = df.drop("fips", axis=1) df = df.groupby(["geo_id", "timestamp"]).sum().reset_index() From b911d8db39f7583e52f5b425c915dfe3fa68bfde Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Thu, 11 Jun 2020 10:51:43 -0400 Subject: [PATCH 15/18] revert test cases for msa and hrr --- jhu/tests/test_geo.py | 68 +++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/jhu/tests/test_geo.py b/jhu/tests/test_geo.py index 5a7274180..a96ad0d31 100644 --- a/jhu/tests/test_geo.py +++ b/jhu/tests/test_geo.py @@ -152,30 +152,30 @@ def test_hrr(self): } ) - df_mega = pd.DataFrame( - { - "fips": ["90013", "90001"], - "timestamp": ["2020-02-15", "2020-02-15"], - "new_counts": [8, 2], - "cumulative_counts": [80, 12], - "population": [np.nan, np.nan], - } - ) + # df_mega = pd.DataFrame( + # { 
+ # "fips": ["90013", "90001"], + # "timestamp": ["2020-02-15", "2020-02-15"], + # "new_counts": [8, 2], + # "cumulative_counts": [80, 12], + # "population": [np.nan, np.nan], + # } + # ) - df = df.append(df_mega) + # df = df.append(df_mega) new_df = geo_map(df, "hrr", MAP_DF, 'new_counts') exp_incidence = np.array([13, 27]) / np.array([25, 2500]) * 100000 exp_cprop = np.array([60, 165]) / np.array([25, 2500]) * 100000 - assert (new_df["geo_id"].values == [110, 147]).all() - assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all() - assert new_df["new_counts"].values == pytest.approx([13.0, 27.0]) - assert new_df["cumulative_counts"].values == pytest.approx([60, 165]) - assert new_df["population"].values == pytest.approx([25, 2500]) - assert new_df["incidence"].values == pytest.approx(exp_incidence) - assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop) + assert (new_df["geo_id"].values == [110, 147]).all() + assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all() + assert new_df["new_counts"].values == pytest.approx([13.0, 27.0]) + assert new_df["cumulative_counts"].values == pytest.approx([60, 165]) + assert new_df["population"].values == pytest.approx([25, 2500]) + assert new_df["incidence"].values == pytest.approx(exp_incidence) + assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop) def test_msa(self): @@ -189,27 +189,27 @@ def test_msa(self): } ) - df_mega = pd.DataFrame( - { - "fips": ["90013", "90001"], - "timestamp": ["2020-02-15", "2020-02-15"], - "new_counts": [8, 2], - "cumulative_counts": [80, 12], - "population": [np.nan, np.nan], - } - ) + # df_mega = pd.DataFrame( + # { + # "fips": ["90013", "90001"], + # "timestamp": ["2020-02-15", "2020-02-15"], + # "new_counts": [8, 2], + # "cumulative_counts": [80, 12], + # "population": [np.nan, np.nan], + # } + # ) - df = df.append(df_mega) + # df = df.append(df_mega) new_df = geo_map(df, "msa", MAP_DF, 'new_counts') exp_incidence = np.array([2, 13]) / np.array([300, 25]) * 100000 exp_cprop = np.array([45, 60]) / np.array([300, 25]) * 100000 - assert (new_df["geo_id"].values == [31420, 49340]).all() - assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all() - assert new_df["new_counts"].values == pytest.approx([2.0, 13.0]) - assert new_df["cumulative_counts"].values == pytest.approx([45, 60]) - assert new_df["population"].values == pytest.approx([300, 25]) - assert new_df["incidence"].values == pytest.approx(exp_incidence) - assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop) + assert (new_df["geo_id"].values == [31420, 49340]).all() + assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all() + assert new_df["new_counts"].values == pytest.approx([2.0, 13.0]) + assert new_df["cumulative_counts"].values == pytest.approx([45, 60]) + assert new_df["population"].values == pytest.approx([300, 25]) + assert new_df["incidence"].values == pytest.approx(exp_incidence) + assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop) From 99c3860e6d8967eea2389c1a447a9e9cd47853c7 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Thu, 11 Jun 2020 10:54:32 -0400 Subject: [PATCH 16/18] resolve conflicts --- jhu/delphi_jhu/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jhu/delphi_jhu/run.py b/jhu/delphi_jhu/run.py index 4d4ad4f84..aeef3fa57 100644 --- a/jhu/delphi_jhu/run.py +++ b/jhu/delphi_jhu/run.py @@ -45,7 +45,7 @@ } SMOOTHERS_MAP = { "unsmoothed": (identity, ''), - #"seven_day_average": 
(seven_day_moving_average, '7dav_'), + #"seven_day_average": (seven_day_moving_average, '7dav_', True), } GEO_RESOLUTIONS = [ "county", From eabe62436ac130e0f6d3acf7767fe235fd3e1b53 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Thu, 11 Jun 2020 10:55:20 -0400 Subject: [PATCH 17/18] resolve conflicts --- jhu/delphi_jhu/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jhu/delphi_jhu/run.py b/jhu/delphi_jhu/run.py index aeef3fa57..485b663ca 100644 --- a/jhu/delphi_jhu/run.py +++ b/jhu/delphi_jhu/run.py @@ -44,7 +44,7 @@ "cumulative_prop": ("cumulative_prop", False), } SMOOTHERS_MAP = { - "unsmoothed": (identity, ''), + "unsmoothed": (identity, '', False), #"seven_day_average": (seven_day_moving_average, '7dav_', True), } GEO_RESOLUTIONS = [ From cc203c6ab59e4df896a9685eb6fb9da24361a63d Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Fri, 19 Jun 2020 10:05:49 -0400 Subject: [PATCH 18/18] resolve conflicts --- jhu/delphi_jhu/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jhu/delphi_jhu/run.py b/jhu/delphi_jhu/run.py index 485b663ca..c845fa317 100644 --- a/jhu/delphi_jhu/run.py +++ b/jhu/delphi_jhu/run.py @@ -45,7 +45,7 @@ } SMOOTHERS_MAP = { "unsmoothed": (identity, '', False), - #"seven_day_average": (seven_day_moving_average, '7dav_', True), + "seven_day_average": (seven_day_moving_average, '7dav_', True), } GEO_RESOLUTIONS = [ "county",
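
For reference, a minimal standalone sketch of the megacounty handling these
patches converge on (the split-and-map step of geo_map from patches 01 and 04).
The helper name split_megacounties and the three-state STATE_TO_FIPS below are
illustrative only, not part of the indicator package, and the sketch avoids the
since-deprecated DataFrame.append that the patches themselves use.

    import pandas as pd

    STATE_TO_FIPS = {"AL": "01", "GA": "13", "MA": "25"}  # abbreviated for illustration

    # JHU codes unassigned cases/deaths as fake FIPS 900XX; patch 04 maps them
    # to the corresponding megacounty FIPS XX000.
    JHU_FAKE_FIPS_TO_MEGA_FIPS = {f"900{x}": f"{x}000" for x in STATE_TO_FIPS.values()}

    def split_megacounties(df: pd.DataFrame):
        """Split county rows from unassigned ("megacounty") rows, as geo_map does."""
        is_mega = df["fips"].astype(int) >= 90001
        df_mega = df[is_mega].copy()
        df_mega["geo_id"] = df_mega["fips"].map(JHU_FAKE_FIPS_TO_MEGA_FIPS)
        return df[~is_mega].copy(), df_mega

    counties = pd.DataFrame({"fips": ["13027", "90001", "90013"],
                             "new_counts": [5, 2, 8]})
    df, df_mega = split_megacounties(counties)
    print(df_mega["geo_id"].tolist())  # ['01000', '13000']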