diff --git a/jhu/delphi_jhu/geo.py b/jhu/delphi_jhu/geo.py
index 4e305917b..c471a4ae3 100644
--- a/jhu/delphi_jhu/geo.py
+++ b/jhu/delphi_jhu/geo.py
@@ -89,6 +89,10 @@
 FIPS_TO_STATE = {v: k.lower() for k, v in STATE_TO_FIPS.items()}
 
+# Map JHU's fake FIPS codes for unassigned cases ('900XX') to megacounty FIPS codes ('XX000')
+
+JHU_FAKE_FIPS_TO_MEGA_FIPS = {f'900{x}': f'{x}000' for x in STATE_TO_FIPS.values()}
+
 def fips_to_state(fips: str) -> str:
     """Wrapper that handles exceptions to the FIPS scheme in the JHU data.
 
@@ -148,7 +152,7 @@ def disburse(df: pd.DataFrame, pooled_fips: str, fips_list: list):
     return df
 
 
-def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
+def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str):
     """
     Maps a DataFrame df, which contains data at the county resolution, and
     aggregate it to the geographic resolution geo_res.
@@ -162,6 +166,10 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
         ('county', 'state', 'msa', 'hrr').
     map_df: pd.DataFrame
         Loaded from static file "fips_prop_pop.csv".
+    sensor: str
+        Sensor type. Valid options:
+        ("new_counts", "cumulative_counts",
+        "incidence", "cumulative_prop")
 
     Returns
     -------
@@ -169,15 +177,27 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
         Columns: geo_id, timestamp, ...
     """
     VALID_GEO_RES = ("county", "state", "msa", "hrr")
+    # It is not clear how to calculate proportions for unassigned cases/deaths
+    PROP_SENSORS = ("incidence", "cumulative_prop")
     if geo_res not in VALID_GEO_RES:
         raise ValueError(f"geo_res must be one of {VALID_GEO_RES}")
-    df = df.copy()
+
+    df_mega = df[df['fips'].astype(int) >= 90001].copy()
+    df_mega['geo_id'] = df_mega['fips'].apply(lambda x: JHU_FAKE_FIPS_TO_MEGA_FIPS[x])
+
+    df = df[df['fips'].astype(int) < 90001].copy()
+
     if geo_res == "county":
         df["geo_id"] = df["fips"]
+        if sensor not in PROP_SENSORS:
+            df = df.append(df_mega)
     elif geo_res == "state":
         # Grab first two digits of fips
         # Map state fips to us postal code
-        df["geo_id"] = df["fips"].apply(fips_to_state)
+        df["geo_id"] = df["fips"]
+        # Add unassigned cases/deaths
+        df = df.append(df_mega)
+        df["geo_id"] = df["geo_id"].apply(fips_to_state)
     elif geo_res in ("msa", "hrr"):
         # Disburse Dukes & Nantucket to individual counties
         df = disburse(df, DN_FIPS, DN_COUNTY_FIPS)
@@ -200,8 +220,13 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
         merged["new_counts"] = merged["new_counts"] * merged["pop_prop"]
         merged["population"] = merged["population"] * merged["pop_prop"]
         df = merged.drop(["zip", "pop_prop", "hrrnum", "cbsa_id"], axis=1)
+    # if sensor not in PROP_SENSORS:
+    #     df_mega["geo_id"] = df_mega["geo_id"].apply(fips_to_state)
+    #     df = df.append(df_mega)
     df = df.drop("fips", axis=1)
     df = df.groupby(["geo_id", "timestamp"]).sum().reset_index()
+
+    # Values would be infinite for megacounties, which would not be considered in the main function
     df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE
     df["cumulative_prop"] = df["cumulative_counts"] / df["population"] * INCIDENCE_BASE
     return df
diff --git a/jhu/delphi_jhu/pull.py b/jhu/delphi_jhu/pull.py
index 1049d5e0a..d4131db82 100644
--- a/jhu/delphi_jhu/pull.py
+++ b/jhu/delphi_jhu/pull.py
@@ -62,7 +62,7 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFr
     MIN_FIPS = 1000
     MAX_FIPS = 57000
     EXTRA_FIPS = (
-        72, # Puerto Rico (provided as the entire state)
+        72,     # Puerto Rico (provided as the entire state)
         70002,  # Kansas City, MO
         70003,  # Dukes and Nantucket Counties, MA
     )
@@ -79,9 +79,13 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFr
             & (df["FIPS"] < MAX_FIPS)
         )  # "Uncategorized", etc.
         | df["FIPS"].isin(EXTRA_FIPS)
+        # Keep the fake FIPS codes used for unassigned cases
+        | np.logical_and(df['FIPS'] >= 90001,
+                         df['FIPS'] <= 90056)
     ]
     # Merge in population LOWERCASE, consistent across confirmed and deaths
-    df = pd.merge(df, pop_df, on="FIPS")
+    # Population is left as NaN for the fake FIPS codes
+    df = pd.merge(df, pop_df, on="FIPS", how='left')
 
     # Manual correction for PR
     df.loc[df["FIPS"] == 72, "FIPS"] = 72000
diff --git a/jhu/delphi_jhu/run.py b/jhu/delphi_jhu/run.py
index cfb738f21..c845fa317 100644
--- a/jhu/delphi_jhu/run.py
+++ b/jhu/delphi_jhu/run.py
@@ -35,7 +35,7 @@
 ]
 SMOOTHERS = [
     "unsmoothed",
-    "seven_day_average",
+    #"seven_day_average",
 ]
 SENSOR_NAME_MAP = {
     "new_counts": ("incidence_num", False),
@@ -44,8 +44,8 @@
     "cumulative_prop": ("cumulative_prop", False),
 }
 SMOOTHERS_MAP = {
-    "unsmoothed": (identity, ''),
-    "seven_day_average": (seven_day_moving_average, '7day_avg_'),
+    "unsmoothed": (identity, '', False),
+    "seven_day_average": (seven_day_moving_average, '7dav_', True),
 }
 GEO_RESOLUTIONS = [
     "county",
@@ -77,7 +77,7 @@ def run_module():
             print(geo_res, metric, sensor, smoother)
             df = dfs[metric]
             # Aggregate to appropriate geographic resolution
-            df = geo_map(df, geo_res, map_df)
+            df = geo_map(df, geo_res, map_df, sensor)
             df["val"] = SMOOTHERS_MAP[smoother][0](df[sensor].values)
             df["se"] = np.nan
             df["sample_size"] = np.nan
diff --git a/jhu/tests/test_geo.py b/jhu/tests/test_geo.py
index 8d45eb336..a96ad0d31 100644
--- a/jhu/tests/test_geo.py
+++ b/jhu/tests/test_geo.py
@@ -25,6 +25,13 @@ def test_normal(self):
         assert fips_to_state("12003") == "fl"
         assert fips_to_state("50103") == "vt"
         assert fips_to_state("15003") == "hi"
+
+    def test_mega(self):
+
+        assert fips_to_state("01000") == "al"
+        assert fips_to_state("13000") == "ga"
+        assert fips_to_state("44000") == "ri"
+        assert fips_to_state("12000") == "fl"
 
 
 class TestDisburse:
@@ -74,15 +81,27 @@ def test_county(self):
             }
         )
 
-        new_df = geo_map(df, "county", MAP_DF)
+        df_mega = pd.DataFrame(
+            {
+                "fips": ["90013", "90001"],
+                "timestamp": ["2020-02-15", "2020-02-15"],
+                "new_counts": [8, 2],
+                "cumulative_counts": [80, 12],
+                "population": [np.nan, np.nan],
+            }
+        )
+
+        df = df.append(df_mega)
+
+        new_df = geo_map(df, "county", MAP_DF, 'new_counts')
 
         exp_incidence = df["new_counts"] / df["population"] * 100000
         exp_cprop = df["cumulative_counts"] / df["population"] * 100000
-
-        assert set(new_df["geo_id"].values) == set(df["fips"].values)
+
+        assert set(new_df["geo_id"].values) == set(['01000', '13000', '48027', '50103', '53003'])
         assert set(new_df["timestamp"].values) == set(df["timestamp"].values)
-        assert set(new_df["incidence"].values) == set(exp_incidence.values)
-        assert set(new_df["cumulative_prop"].values) == set(exp_cprop.values)
+        assert set(new_df["incidence"].values) - set(exp_incidence.values) == set([np.Inf])
+        assert set(new_df["cumulative_prop"].values) - set(exp_cprop.values) == set([np.Inf])
 
 
     def test_state(self):
 
@@ -95,19 +114,31 @@
             "population": [100, 2100, 300, 25],
         }
     )
+
+        df_mega = pd.DataFrame(
+            {
+                "fips": ["90013", "90001", "04000", "25000"],
+                "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15", "2020-02-15"],
+                "new_counts": [8, 2, 5, 10],
+                "cumulative_counts": [80, 12, 30, 100],
+                "population": [np.nan, np.nan, np.nan, np.nan],
+            }
+        )
+
+        df = df.append(df_mega)
 
-        new_df = geo_map(df, "state", MAP_DF)
+        new_df = geo_map(df, "state", MAP_DF, 'new_counts')
 
-        exp_incidence = np.array([27, 13]) / np.array([2500, 25]) * 100000
-        exp_cprop = np.array([165, 60]) / np.array([2500, 25]) * 100000
+        exp_incidence = np.array([27 + 5, 13 + 10]) / np.array([2500, 25]) * 100000
+        exp_cprop = np.array([165 + 30, 60 + 100]) / np.array([2500, 25]) * 100000
 
-        assert (new_df["geo_id"].values == ["az", "ma"]).all()
-        assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all()
-        assert (new_df["new_counts"].values == [27, 13]).all()
-        assert (new_df["cumulative_counts"].values == [165, 60]).all()
-        assert (new_df["population"].values == [2500, 25]).all()
-        assert (new_df["incidence"].values == exp_incidence).all()
-        assert (new_df["cumulative_prop"].values == exp_cprop).all()
+        assert set(new_df["geo_id"].values) == set(["az", "ma", "al", "ga"])
+        assert set(new_df["timestamp"].values) == set(["2020-02-15"])
+        assert set(new_df["new_counts"].values) == set([32, 23, 2, 8])
+        assert set(new_df["cumulative_counts"].values) == set([195, 160, 12, 80])
+        assert set(new_df["population"].values) == set([2500, 25, 0])
+        assert set(new_df["incidence"].values) - set(exp_incidence) == set([np.Inf])
+        assert set(new_df["cumulative_prop"].values) - set(exp_cprop) == set([np.Inf])
 
 
     def test_hrr(self):
 
@@ -121,7 +152,19 @@
             }
         )
 
-        new_df = geo_map(df, "hrr", MAP_DF)
+        # df_mega = pd.DataFrame(
+        #     {
+        #         "fips": ["90013", "90001"],
+        #         "timestamp": ["2020-02-15", "2020-02-15"],
+        #         "new_counts": [8, 2],
+        #         "cumulative_counts": [80, 12],
+        #         "population": [np.nan, np.nan],
+        #     }
+        # )
+
+        # df = df.append(df_mega)
+
+        new_df = geo_map(df, "hrr", MAP_DF, 'new_counts')
 
         exp_incidence = np.array([13, 27]) / np.array([25, 2500]) * 100000
         exp_cprop = np.array([60, 165]) / np.array([25, 2500]) * 100000
@@ -145,8 +188,20 @@ def test_msa(self):
             "population": [100, 2100, 300, 25],
         }
     )
-
-        new_df = geo_map(df, "msa", MAP_DF)
+
+        # df_mega = pd.DataFrame(
+        #     {
+        #         "fips": ["90013", "90001"],
+        #         "timestamp": ["2020-02-15", "2020-02-15"],
+        #         "new_counts": [8, 2],
+        #         "cumulative_counts": [80, 12],
+        #         "population": [np.nan, np.nan],
+        #     }
+        # )
+
+        # df = df.append(df_mega)
+
+        new_df = geo_map(df, "msa", MAP_DF, 'new_counts')
 
         exp_incidence = np.array([2, 13]) / np.array([300, 25]) * 100000
         exp_cprop = np.array([45, 60]) / np.array([300, 25]) * 100000
diff --git a/jhu/tests/test_smooth.py b/jhu/tests/test_smooth.py
index 1b43841bf..1f77fc76a 100644
--- a/jhu/tests/test_smooth.py
+++ b/jhu/tests/test_smooth.py
@@ -14,7 +14,7 @@ def test_output_files_smoothed(self, run_as_module):
         smoothed = pd.read_csv(
             join("receiving",
-                 f"{dates[-1]}_state_confirmed_7day_avg_cumulative_num.csv")
+                 f"{dates[-1]}_state_confirmed_7dav_cumulative_num.csv")
         )
 
         raw = pd.concat([
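
A minimal sketch (not part of the patch) of what the new JHU_FAKE_FIPS_TO_MEGA_FIPS mapping does, assuming STATE_TO_FIPS maps postal codes to zero-padded two-digit state FIPS strings; the three-entry dict below is an abridged stand-in for the real one:

    STATE_TO_FIPS = {"AL": "01", "GA": "13", "RI": "44"}  # abridged stand-in for the real mapping

    # '900XX' (JHU "unassigned" rows for state XX) -> 'XX000' (that state's megacounty FIPS)
    JHU_FAKE_FIPS_TO_MEGA_FIPS = {f"900{x}": f"{x}000" for x in STATE_TO_FIPS.values()}

    assert JHU_FAKE_FIPS_TO_MEGA_FIPS["90001"] == "01000"  # Alabama megacounty
    assert JHU_FAKE_FIPS_TO_MEGA_FIPS["90013"] == "13000"  # Georgia megacounty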