cmu-delphi · minhkhul · Jun 24, 2024 · Jun 24, 2024 · Jun 24, 2024 · Jun 24, 2024
diff --git a/doctor_visits/delphi_doctor_visits/config.py b/doctor_visits/delphi_doctor_visits/config.py
@@ -24,13 +24,36 @@ class Config:
     GEO_COL = "PatCountyFIPS"
     AGE_COL = "PatAgeGroup"
     HRR_COLS = ["Pat HRR Name", "Pat HRR ID"]
-    ID_COLS = [DATE_COL] + [GEO_COL] + [AGE_COL] + HRR_COLS
-    FILT_COLS = ID_COLS + COUNT_COLS
-    DTYPES = {"ServiceDate": str, "PatCountyFIPS": str,
-              "Denominator": int, "Flu1": int,
-              "Covid_like": int, "Flu_like": int,
-              "Mixed": int, "PatAgeGroup": str,
-              "Pat HRR Name": str, "Pat HRR ID": float}
+    # as of 2020-05-11, input file expected to have 10 columns
+    # id cols: ServiceDate, PatCountyFIPS, PatAgeGroup, Pat HRR ID/Pat HRR Name
+    # value cols: Denominator, Covid_like, Flu_like, Flu1, Mixed
+    ID_COLS = [DATE_COL] + [GEO_COL] + HRR_COLS + [AGE_COL]
+    # drop HRR columns - unused for now since we assign HRRs by FIPS
+    FILT_COLS = [DATE_COL] + [GEO_COL] + [AGE_COL] + COUNT_COLS
+    DTYPES = {
+        "ServiceDate": str,
+        "PatCountyFIPS": str,
+        "Denominator": int,
+        "Flu1": int,
+        "Covid_like": int,
+        "Flu_like": int,
+        "Mixed": int,
+        "PatAgeGroup": str,
+        "Pat HRR Name": str,
+        "Pat HRR ID": float,
+        "servicedate": str,
+        "patCountyFIPS": str,
+        "patAgeGroup": str,
+        "patHRRname": str,
+        "patHRRid": float,
+    }
+    DEVIANT_COLS_MAP = {
+        "servicedate": "ServiceDate",
+        "patCountyFIPS": "PatCountyFIPS",
+        "patHRRname": "Pat HRR Name",
+        "patAgeGroup": "PatAgeGroup",
+        "patHRRid": "Pat HRR ID",
+    }
 
     SMOOTHER_BANDWIDTH = 100  # bandwidth for the linear left Gaussian filter
     MAX_BACKFILL_WINDOW = 7  # maximum number of days used to average a backfill correction

diff --git a/doctor_visits/delphi_doctor_visits/input/SYNEDI_AGG_OUTPATIENT_18052020_1455CDT.csv.gz b/doctor_visits/delphi_doctor_visits/input/SYNEDI_AGG_OUTPATIENT_18052020_1455CDT.csv.gz
diff --git a/doctor_visits/delphi_doctor_visits/modify_claims_drops.py b/doctor_visits/delphi_doctor_visits/modify_claims_drops.py
diff --git a/doctor_visits/delphi_doctor_visits/process_data.py b/doctor_visits/delphi_doctor_visits/process_data.py
@@ -0,0 +1,101 @@
+import dask.dataframe as dd
+from datetime import datetime
+import numpy as np
+import pandas as pd
+from pathlib import Path
+
+from .config import Config
+
+
+def write_to_csv(output_df: pd.DataFrame, geo_level: str, se:bool, out_name: str, logger, output_path="."):
+    """Write sensor values to csv, one file per distinct date in `output_df`.
+
+    Each file is named "<output_path>/<YYYYmmdd>_<geo_level>_<out_name>.csv", where
+    the date is the row date shifted by Config.DAY_SHIFT. Every file starts with the
+    header "geo_id,val,se,direction,sample_size". `val` and `se` are multiplied by 100
+    before being written; direction and sample_size are always written as "NA".
+
+    Args:
+      output_df: dataframe of sensor values; its "date", "geo_id", "val" and "se" columns are read
+      geo_level: geographic resolution, one of ["county", "state", "msa", "hrr", "nation", "hhs"]
+      se: boolean to write out standard errors, if true, use an obfuscated name
+      out_name: name of the output file
+      logger: logger used for the standard-error warning and the final row-count message
+      output_path: outfile path to write the csv (default is current directory)
+
+    Raises:
+      AssertionError: if a scaled value is nan or not below 90, if a non-nan scaled
+        standard error is not below 5, or, when `se` is true, if the scaled value or
+        scaled standard error is not strictly positive
+    """
+    if se:
+        logger.info(f"========= WARNING: WRITING SEs TO {out_name} =========")
+
+    # running count of data rows written across all files
+    out_n = 0
+    # one output file per distinct date
+    for d in set(output_df["date"]):
+        filename = "%s/%s_%s_%s.csv" % (output_path,
+                                        (d + Config.DAY_SHIFT).strftime("%Y%m%d"),
+                                        geo_level,
+                                        out_name)
+        single_date_df = output_df[output_df["date"] == d]
+        with open(filename, "w") as outfile:
+            outfile.write("geo_id,val,se,direction,sample_size\n")
+
+            for line in single_date_df.itertuples():
+                geo_id = line.geo_id
+                sensor = 100 * line.val  # report percentages
+                se_val = 100 * line.se
+                # sanity checks on the scaled values before anything is written for this row
+                assert not np.isnan(sensor), "sensor value is nan, check pipeline"
+                assert sensor < 90, f"strangely high percentage {geo_id, sensor}"
+                if not np.isnan(se_val):
+                    assert se_val < 5, f"standard error suspiciously high! investigate {geo_id}"
+
+                if se:
+                    # a nan standard error also fails this check, since nan > 0 is False
+                    assert sensor > 0 and se_val > 0, "p=0, std_err=0 invalid"
+                    outfile.write(
+                        "%s,%f,%s,%s,%s\n" % (geo_id, sensor, se_val, "NA", "NA"))
+                else:
+                    # for privacy reasons we will not report the standard error
+                    outfile.write(
+                        "%s,%f,%s,%s,%s\n" % (geo_id, sensor, "NA", "NA", "NA"))
+                out_n += 1
+    logger.debug(f"wrote {out_n} rows for {geo_level}")
+
+
+def csv_to_df(filepath: str, startdate: datetime, enddate: datetime, dropdate: datetime, logger) -> pd.DataFrame:
+    """Read the claims csv with Dask, clean it, and return it as a pandas dataframe.
+
+    Deviant column names are renamed to the Config names, only Config.FILT_COLS are
+    kept, rows with a missing value in any kept column are dropped, rows are
+    restricted to Config.FIRST_DATA_DATE <= Config.DATE_COL < dropdate, and the
+    counts are summed over age groups.
+
+    Parameters
+    ----------
+      filepath: path to the aggregated doctor-visits data (gzip-compressed csv)
+      startdate: first sensor date; only used, after subtracting Config.DAY_SHIFT,
+        to validate the date range
+      enddate: last sensor date; only used to validate the date range
+      dropdate: data drop date; rows dated on or after it are excluded
+      logger: logger used for the start/finish messages
+
+    Returns
+    -------
+      dataframe with one row per (Config.DATE_COL, Config.GEO_COL) pair holding the
+      summed numeric columns
+
+    Raises
+    ------
+      AssertionError: if the shifted start date is not after Config.FIRST_DATA_DATE,
+        if it is not before enddate, if enddate is after dropdate, or if the
+        aggregated data has duplicate rows or negative counts
+    """
+    filepath = Path(filepath)
+    logger.info(f"Processing {filepath}")
+
+    ddata = dd.read_csv(
+        filepath,
+        compression="gzip",
+        dtype=Config.DTYPES,
+        blocksize=None,
+    )
+
+    # rename inconsistent column names to match config column names
+    ddata = ddata.rename(columns=Config.DEVIANT_COLS_MAP)
+
+    # select the used columns BEFORE dropping incomplete rows, so that a missing
+    # value in an unused column (e.g. the HRR columns) does not discard the row
+    ddata = ddata[Config.FILT_COLS]
+    ddata = ddata.dropna()
+    ddata[Config.DATE_COL] = dd.to_datetime(ddata[Config.DATE_COL])
+
+    # restrict to training start and end date
+    # (the shifted start date is local to this function and only feeds the checks below)
+    startdate = startdate - Config.DAY_SHIFT
+
+    assert startdate > Config.FIRST_DATA_DATE, "Start date <= first day of data"
+    assert startdate < enddate, "Start date >= end date"
+    assert enddate <= dropdate, "End date > drop date"
+
+    date_filter = ((ddata[Config.DATE_COL] >= Config.FIRST_DATA_DATE) & (ddata[Config.DATE_COL] < dropdate))
+
+    # materialize the filtered Dask dataframe as a pandas dataframe
+    df = ddata[date_filter].compute()
+
+    # aggregate age groups (so data is unique by service date and FIPS)
+    df = df.groupby([Config.DATE_COL, Config.GEO_COL]).sum(numeric_only=True).reset_index()
+    assert np.sum(df.duplicated()) == 0, "Duplicates after age group aggregation"
+    assert (df[Config.COUNT_COLS] >= 0).all().all(), "Counts must be nonnegative"
+
+    logger.info(f"Done processing {filepath}")
+    return df
diff --git a/doctor_visits/delphi_doctor_visits/run.py b/doctor_visits/delphi_doctor_visits/run.py
@@ -14,9 +14,9 @@
 from delphi_utils import get_structured_logger
 
 # first party
-from .update_sensor import update_sensor, write_to_csv
+from .update_sensor import update_sensor
+from .process_data import csv_to_df, write_to_csv
 from .download_claims_ftp_files import download
-from .modify_claims_drops import modify_and_write
 from .get_latest_claims_name import get_latest_filename
 
 
@@ -55,9 +55,6 @@ def run_module(params):  # pylint: disable=too-many-statements
     # find the latest files (these have timestamps)
     claims_file = get_latest_filename(params["indicator"]["input_dir"], logger)
 
-    # modify data
-    modify_and_write(claims_file, logger)
-
     ## get end date from input file
     # the filename is expected to be in the format:
     # "EDI_AGG_OUTPATIENT_DDMMYYYY_HHMM{timezone}.csv.gz"
@@ -89,6 +86,7 @@ def run_module(params):  # pylint: disable=too-many-statements
     ## geographies
     geos = ["state", "msa", "hrr", "county", "hhs", "nation"]
 
+    claims_df = csv_to_df(claims_file, startdate_dt, enddate_dt, dropdate_dt, logger)
 
     ## print out other vars
     logger.info("outpath:\t\t%s", export_dir)
@@ -107,10 +105,10 @@ def run_module(params):  # pylint: disable=too-many-statements
             else:
                 logger.info("starting %s, no adj", geo)
             sensor = update_sensor(
-                filepath=claims_file,
-                startdate=startdate,
-                enddate=enddate,
-                dropdate=dropdate,
+                data=claims_df,
+                startdate=startdate_dt,
+                enddate=enddate_dt,
+                dropdate=dropdate_dt,
                 geo=geo,
                 parallel=params["indicator"]["parallel"],
                 weekday=weekday,

diff --git a/doctor_visits/delphi_doctor_visits/update_sensor.py b/doctor_visits/delphi_doctor_visits/update_sensor.py
@@ -9,7 +9,7 @@
 """
 
 # standard packages
-from datetime import timedelta
+from datetime import timedelta, datetime
 from multiprocessing import Pool, cpu_count
 
 # third party
@@ -23,57 +23,14 @@
 from .sensor import DoctorVisitsSensor
 
 
-def write_to_csv(output_df: pd.DataFrame, geo_level, se, out_name, logger, output_path="."):
-    """Write sensor values to csv.
-
-    Args:
-      output_dict: dictionary containing sensor rates, se, unique dates, and unique geo_id
-      se: boolean to write out standard errors, if true, use an obfuscated name
-      out_name: name of the output file
-      output_path: outfile path to write the csv (default is current directory)
-    """
-    if se:
-        logger.info(f"========= WARNING: WRITING SEs TO {out_name} =========")
-
-    out_n = 0
-    for d in set(output_df["date"]):
-        filename = "%s/%s_%s_%s.csv" % (output_path,
-                                        (d + Config.DAY_SHIFT).strftime("%Y%m%d"),
-                                        geo_level,
-                                        out_name)
-        single_date_df = output_df[output_df["date"] == d]
-        with open(filename, "w") as outfile:
-            outfile.write("geo_id,val,se,direction,sample_size\n")
-
-            for line in single_date_df.itertuples():
-                geo_id = line.geo_id
-                sensor = 100 * line.val # report percentages
-                se_val = 100 * line.se
-                assert not np.isnan(sensor), "sensor value is nan, check pipeline"
-                assert sensor < 90, f"strangely high percentage {geo_id, sensor}"
-                if not np.isnan(se_val):
-                    assert se_val < 5, f"standard error suspiciously high! investigate {geo_id}"
-
-                if se:
-                    assert sensor > 0 and se_val > 0, "p=0, std_err=0 invalid"
-                    outfile.write(
-                        "%s,%f,%s,%s,%s\n" % (geo_id, sensor, se_val, "NA", "NA"))
-                else:
-                    # for privacy reasons we will not report the standard error
-                    outfile.write(
-                        "%s,%f,%s,%s,%s\n" % (geo_id, sensor, "NA", "NA", "NA"))
-                out_n += 1
-    logger.debug(f"wrote {out_n} rows for {geo_level}")
-
-
 def update_sensor(
-        filepath, startdate, enddate, dropdate, geo, parallel,
-        weekday, se, logger
+        data:pd.DataFrame, startdate:datetime, enddate:datetime, dropdate:datetime, geo:str, parallel: bool,
+        weekday:bool, se:bool, logger
 ):
     """Generate sensor values.
 
     Args:
-      filepath: path to the aggregated doctor-visits data
+      data: dataframe of the cleaned claims file
       startdate: first sensor date (YYYY-mm-dd)
       enddate: last sensor date (YYYY-mm-dd)
       dropdate: data drop date (YYYY-mm-dd)
@@ -83,41 +40,10 @@ def update_sensor(
       se: boolean to write out standard errors, if true, use an obfuscated name
       logger: the structured logger
     """
-    # as of 2020-05-11, input file expected to have 10 columns
-    # id cols: ServiceDate, PatCountyFIPS, PatAgeGroup, Pat HRR ID/Pat HRR Name
-    # value cols: Denominator, Covid_like, Flu_like, Flu1, Mixed
-    data = pd.read_csv(
-        filepath,
-        usecols=Config.FILT_COLS,
-        dtype=Config.DTYPES,
-        parse_dates=[Config.DATE_COL],
-    )
-    assert (
-            np.sum(data.duplicated(subset=Config.ID_COLS)) == 0
-    ), "Duplicated data! Check the input file"
-
-    # drop HRR columns - unused for now since we assign HRRs by FIPS
-    data.drop(columns=Config.HRR_COLS, inplace=True)
-    data.dropna(inplace=True)  # drop rows with any missing entries
-
-    # aggregate age groups (so data is unique by service date and FIPS)
-    data = data.groupby([Config.DATE_COL, Config.GEO_COL]).sum(numeric_only=True).reset_index()
-    assert np.sum(data.duplicated()) == 0, "Duplicates after age group aggregation"
-    assert (data[Config.COUNT_COLS] >= 0).all().all(), "Counts must be nonnegative"
-
-    ## collect dates
-    # restrict to training start and end date
+
     drange = lambda s, e: np.array([s + timedelta(days=x) for x in range((e - s).days)])
-    startdate = pd.to_datetime(startdate) - Config.DAY_SHIFT
-    burnindate = startdate - Config.DAY_SHIFT
-    enddate = pd.to_datetime(enddate)
-    dropdate = pd.to_datetime(dropdate)
-    assert startdate > Config.FIRST_DATA_DATE, "Start date <= first day of data"
-    assert startdate < enddate, "Start date >= end date"
-    assert enddate <= dropdate, "End date > drop date"
-    data = data[(data[Config.DATE_COL] >= Config.FIRST_DATA_DATE) & \
-                (data[Config.DATE_COL] < dropdate)]
     fit_dates = drange(Config.FIRST_DATA_DATE, dropdate)
+    burnindate = startdate - Config.DAY_SHIFT
     burn_in_dates = drange(burnindate, dropdate)
     sensor_dates = drange(startdate, enddate)
     # The ordering of sensor dates corresponds to the order of burn-in dates

diff --git a/doctor_visits/setup.py b/doctor_visits/setup.py
@@ -11,6 +11,7 @@
     "pytest-cov",
     "pytest",
     "scikit-learn",
+    "dask",
 ]
 
 setup(

diff --git a/.../tests/comparison/process_data/main_after_date_SYNEDI_AGG_OUTPATIENT_07022020_1455CDT.pkl b/.../tests/comparison/process_data/main_after_date_SYNEDI_AGG_OUTPATIENT_07022020_1455CDT.pkl
diff --git a/doctor_visits/tests/test_data/SYNEDI_AGG_OUTPATIENT_07022020_1455CDT.csv.gz b/doctor_visits/tests/test_data/SYNEDI_AGG_OUTPATIENT_07022020_1455CDT.csv.gz
diff --git a/doctor_visits/tests/test_data/SYNEDI_AGG_OUTPATIENT_07022020_1455CDT.pkl b/doctor_visits/tests/test_data/SYNEDI_AGG_OUTPATIENT_07022020_1455CDT.pkl
diff --git a/doctor_visits/tests/test_geomap.py b/doctor_visits/tests/test_geomap.py
@@ -3,14 +3,14 @@
 from delphi_doctor_visits.geo_maps import GeoMaps
 from delphi_doctor_visits.config import Config
 
-CONFIG = Config()
 DATA = pd.read_csv(
     "test_data/SYNEDI_AGG_OUTPATIENT_07022020_1455CDT.csv.gz",
-    usecols=CONFIG.FILT_COLS,
-    dtype=CONFIG.DTYPES,
-    parse_dates=[CONFIG.DATE_COL],
+    dtype=Config.DTYPES,
     nrows=9,
 )
+DATA.rename(columns=Config.DEVIANT_COLS_MAP, inplace=True)
+DATA = DATA[Config.FILT_COLS]
+DATA[Config.DATE_COL] = DATA[Config.DATE_COL].apply(pd.to_datetime)
 
 GM = GeoMaps()
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,6 +11,7 @@ @@
         "pytest-cov",
         "pytest",
         "scikit-learn",
+        "dask",
     ]
     setup(
@@ Expand Down @@