# -*- coding: utf-8 -*-

- import re
import pandas as pd
import numpy as np
from delphi_utils import GeoMapper

- def detect_date_col(col_name: str):
-     """determine if column name is a date"""
-     date_match = re.match(r'\d{1,2}\/\d{1,2}\/\d{1,2}', col_name)
-     if date_match:
-         return True
-     return False

- def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFrame:
+ def download_data(base_url: str, metric: str) -> pd.DataFrame:
+     """
+     Downloads the data from the JHU repo, extracts the UID and the date columns, and
+     enforces the date datatype on the time column.
+     """
+     # Read data
+     df = pd.read_csv(base_url.format(metric=metric))
+     # Keep the UID and the time series columns only
+     # The regex filters for columns with the date format M/D/YY or MM/DD/YY
+     df = df.filter(regex=r"\d{1,2}\/\d{1,2}\/\d{2}|UID").melt(
+         id_vars=["UID"], var_name="timestamp", value_name="cumulative_counts"
+     )
+     df["timestamp"] = pd.to_datetime(df["timestamp"])
+     return df
+
+
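# A minimal sketch (hypothetical toy data, not part of this change) of the wide-to-long
# reshaping that download_data performs: one column per date becomes one row per
# (UID, date) pair.
toy_wide = pd.DataFrame({"UID": [84001001], "3/1/20": [1], "3/2/20": [3]})
toy_long = toy_wide.filter(regex=r"\d{1,2}\/\d{1,2}\/\d{2}|UID").melt(
    id_vars=["UID"], var_name="timestamp", value_name="cumulative_counts"
)
# toy_long rows: (84001001, "3/1/20", 1) and (84001001, "3/2/20", 3); pd.to_datetime
# then converts the timestamp strings into proper dates.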
+ def create_diffs_column(df: pd.DataFrame) -> pd.DataFrame:
+     """
+     Using the cumulative_counts column from the dataframe, partitions the dataframe
+     into separate time-series based on fips, and then computes pairwise differences
+     of the cumulative values to get the incidence values. Boundary cases are handled
+     by zero-filling the day prior.
+     """
+     # Take time-diffs in each geo_code partition
+     df = df.set_index(["fips", "timestamp"])
+     df["new_counts"] = df.groupby(level=0)["cumulative_counts"].diff()
+     # Fill the NA value for the first date of each partition with the cumulative value that day
+     # (i.e. pretend the cumulative count the day before was 0)
+     na_value_mask = df["new_counts"].isna()
+     df.loc[na_value_mask, "new_counts"] = df.loc[na_value_mask, "cumulative_counts"]
+     df = df.reset_index()
+     return df
+
+
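# A small sketch of the diff-and-fill step above, on hypothetical counts for two
# fips codes (values are illustrative only).
toy_cum = pd.DataFrame({
    "fips": ["01001", "01001", "01003"],
    "timestamp": pd.to_datetime(["2020-03-01", "2020-03-02", "2020-03-01"]),
    "cumulative_counts": [2, 5, 1],
}).set_index(["fips", "timestamp"])
toy_cum["new_counts"] = toy_cum.groupby(level=0)["cumulative_counts"].diff()
# The first day of each fips is NaN after diff(); filling it with that day's
# cumulative value treats the (unobserved) prior day as 0, so new_counts = [2, 3, 1].
toy_na = toy_cum["new_counts"].isna()
toy_cum.loc[toy_na, "new_counts"] = toy_cum.loc[toy_na, "cumulative_counts"]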
+ def sanity_check_data(df: pd.DataFrame) -> None:
+     """
+     Perform a final set of sanity checks on the data.
+     """
+     days_by_fips = df.groupby("fips").count()["cumulative_counts"].unique()
+     unique_days = df["timestamp"].unique()
+
+     # each FIPS has same number of rows
+     if (len(days_by_fips) > 1) or (days_by_fips[0] != len(unique_days)):
+         raise ValueError("Differing number of days by fips")
+
+     min_timestamp = min(unique_days)
+     max_timestamp = max(unique_days)
+     n_days = (max_timestamp - min_timestamp) / np.timedelta64(1, "D") + 1
+     if n_days != len(unique_days):
+         raise ValueError(
+             f"Not every day between {min_timestamp} and "
+             f"{max_timestamp} is represented."
+         )
+
+
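# A brief illustration of the completeness check above, with hypothetical timestamps:
# 2020-03-01 and 2020-03-03 span 3 calendar days but only 2 are present, so the
# second ValueError would be raised for such data.
toy_days = pd.to_datetime(["2020-03-01", "2020-03-03"]).values
toy_span = (toy_days.max() - toy_days.min()) / np.timedelta64(1, "D") + 1  # 3.0
assert toy_span != len(toy_days)  # 3 expected days vs. 2 observed -> incomplete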
+ def pull_jhu_data(base_url: str, metric: str, gmpr: GeoMapper) -> pd.DataFrame:
    """Pulls the latest Johns Hopkins CSSE data, and conforms it into a dataset
    The output dataset has:
@@ -28,92 +76,37 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFr
    may be negative. This is wholly dependent on the quality of the raw
    dataset.

-     We filter the data such that we only keep rows with valid FIPS, or "FIPS"
-     codes defined under the exceptions of the README. The current exceptions
-     include:
-
-     - 70002: Dukes County and Nantucket County in Massachusetts, which are
-       reported together
-     - 70003: Kansas City, Missouri, which reports counts separately from the
-       four counties it intersects (Platte, Cass, Clay, Jackson Counties)
+     We filter the data such that we only keep rows with valid FIPS or "FIPS"
+     codes defined under the exceptions of the README.

    Parameters
    ----------
    base_url: str
-         Base URL for pulling the JHU CSSE data
+         Base URL for pulling the JHU CSSE data.
    metric: str
        One of 'confirmed' or 'deaths'.
-     pop_df: pd.DataFrame
-         Read from static file "fips_population.csv".
+     gmpr: GeoMapper
+         An instance of the geomapping utility.

    Returns
    -------
    pd.DataFrame
        Dataframe as described above.
    """
+     df = download_data(base_url, metric)

-     # Read data
-     df = pd.read_csv(base_url.format(metric=metric))
-
-     # FIPS are missing for some nonstandard FIPS
-     date_cols = [col_name for col_name in df.columns if detect_date_col(col_name)]
-     keep_cols = date_cols + ['UID']
-     df = df[keep_cols]
-
-     df = df.melt(
-         id_vars=["UID"],
-         var_name="timestamp",
-         value_name="cumulative_counts",
+     df = gmpr.replace_geocode(
+         df, "jhu_uid", "fips", from_col="UID", date_col="timestamp"
    )
-     df["timestamp"] = pd.to_datetime(df["timestamp"])

-     gmpr = GeoMapper()
-     df = gmpr.replace_geocode(df, "jhu_uid", "fips", from_col="UID", date_col="timestamp")
-
-     # Merge in population LOWERCASE, consistent across confirmed and deaths
-     # Set population as NAN for fake fips
-     pop_df.rename(columns={'FIPS': 'fips'}, inplace=True)
-     pop_df['fips'] = pop_df['fips'].astype(int).\
-         astype(str).str.zfill(5)
-     df = df.merge(pop_df, on="fips", how='left')
-
-     # Add a dummy first row here on day before first day
-     # code below could be cleaned with groupby.diff
-
-     min_ts = min(df["timestamp"])
-     df_dummy = df.loc[df["timestamp"] == min_ts].copy()
-     df_dummy.loc[:, "timestamp"] = min_ts - pd.Timedelta(days=1)
-     df_dummy.loc[:, "cumulative_counts"] = 0
-     df = pd.concat([df_dummy, df])
-     # Obtain new_counts
-     df.sort_values(["fips", "timestamp"], inplace=True)
-     df["new_counts"] = df["cumulative_counts"].diff()  # 1st discrete difference
-     # Handle edge cases where we diffed across fips
-     mask = df["fips"] != df["fips"].shift(1)
-     df.loc[mask, "new_counts"] = np.nan
-     df.reset_index(inplace=True, drop=True)
+     # Merge in population, set population as NAN for fake fips
+     df = gmpr.add_population_column(df, "fips")
+     df = create_diffs_column(df)

    # Final sanity checks
-     days_by_fips = df.groupby("fips").count()["cumulative_counts"].unique()
-     unique_days = df["timestamp"].unique()
-     # each FIPS has same number of rows
-     if (len(days_by_fips) > 1) or (days_by_fips[0] != len(unique_days)):
-         raise ValueError("Differing number of days by fips")
-     min_timestamp = min(unique_days)
-     max_timestamp = max(unique_days)
-     n_days = (max_timestamp - min_timestamp) / np.timedelta64(1, "D") + 1
-     if n_days != len(unique_days):
-         raise ValueError(
-             f"Not every day between {min_timestamp} and "
-             "{max_timestamp} is represented."
-         )
-     return df.loc[
-         df["timestamp"] >= min_ts,
-         [  # Reorder
-             "fips",
-             "timestamp",
-             "population",
-             "new_counts",
-             "cumulative_counts",
-         ],
-     ]
+     sanity_check_data(df)
+
+     # Reorder columns
+     df = df[["fips", "timestamp", "population", "new_counts", "cumulative_counts"]]
+     return df
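# A usage sketch for the refactored entry point; the URL template below is an
# assumption (any CSV in the JHU wide layout with a {metric} placeholder works)
# and is not defined in this change.
if __name__ == "__main__":
    BASE_URL = (
        "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
        "csse_covid_19_data/csse_covid_19_time_series/"
        "time_series_covid19_{metric}_US.csv"
    )
    geo_mapper = GeoMapper()
    confirmed = pull_jhu_data(BASE_URL, "confirmed", geo_mapper)
    # confirmed has columns fips, timestamp, population, new_counts, cumulative_counts.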