Skip to content

Commit 31d882b

Browse files
authored
Merge pull request #168 from cmu-delphi/rename_signals
safegraph: standardizing signal names
2 parents eed8564 + b972968 commit 31d882b

File tree

7 files changed

+179
-96
lines changed

7 files changed

+179
-96
lines changed
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
2+
3+
# Canonical (non-prefixed) signal names for the Safegraph indicator.
HOME_DWELL = 'median_home_dwell_time'
COMPLETELY_HOME = 'completely_home_prop'
FULL_TIME_WORK = 'full_time_work_prop'
PART_TIME_WORK = 'part_time_work_prop'

# Every signal this indicator exports.
SIGNALS = [
    HOME_DWELL,
    COMPLETELY_HOME,
    FULL_TIME_WORK,
    PART_TIME_WORK,
]

# Geographic levels at which signals are aggregated.
GEO_RESOLUTIONS = [
    'county',
    'state',
]

safegraph/delphi_safegraph/process.py

Lines changed: 104 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,60 +1,120 @@
1-
# -*- coding: utf-8 -*-
1+
import covidcast
22
import numpy as np
33
import pandas as pd
44

5+
from .constants import HOME_DWELL, COMPLETELY_HOME, FULL_TIME_WORK, PART_TIME_WORK
56
from .geo import FIPS_TO_STATE
67

78
# Magic number for modular arithmetic; CBG -> FIPS
89
MOD = 10000000
910

10-
def construct_signals(cbg_df, signal_names):
11-
'''Construct Census-block level signals.
11+
def add_prefix(signal_names, wip_signal, prefix: str):
    """Prepend `prefix` to the names of work-in-progress signals.

    Parameters
    ----------
    signal_names: List[str]
        Names of signals to be exported.
    wip_signal: List[str] or bool
        A list of wip signal names -> prefix only the listed signals;
        True -> prefix every signal in the registry;
        False or "" -> prefix only signals never published before.
    prefix: str
        Prefix for new/non-public signals (normally 'wip_').

    Returns
    -------
    List[str]
        Signal names, prefixed where appropriate, ready for computation.

    Raises
    ------
    ValueError
        If `wip_signal` is none of True, False, "", or a list.
    """
    if wip_signal is True:
        # Everything is treated as work-in-progress.
        return [prefix + name for name in signal_names]
    if isinstance(wip_signal, list):
        wip_set = set(wip_signal)
        renamed = []
        for name in signal_names:
            renamed.append(prefix + name if name in wip_set else name)
        return renamed
    if wip_signal in {False, ""}:
        # Prefix only the signals COVIDcast has never published.
        return [name if public_signal(name) else prefix + name
                for name in signal_names]
    raise ValueError("Supply True | False or '' or [] | list()")
44+
45+
# Check if the signal name is public
def public_signal(signal_):
    """Check whether `signal_` has already been published to COVIDcast.

    Parameters
    ----------
    signal_: str
        Name of the signal.

    Returns
    -------
    bool
        True if the signal is present in the COVIDcast metadata,
        False if it is not.
    """
    # NOTE(review): this fetches the full metadata table on every call;
    # callers looping over many signals may want to cache it.
    epidata_df = covidcast.metadata()
    # Vectorized membership test. The previous positional loop
    # (epidata_df['signal'][index] for index in range(len(...))) assumed
    # a default RangeIndex on the metadata frame and iterated in Python;
    # this form is index-agnostic and runs at C speed.
    return bool((epidata_df['signal'] == signal_).any())
1263

64+
65+
def construct_signals(cbg_df, signal_names):
    """Construct Census block group-level signals.

    In addition to the signals already available in raw form from
    Safegraph (e.g. median_home_dwell_time), derives the following:

    - completely_home_prop, defined as:
        completely_home_device_count / device_count
    - full_time_work_prop, defined as:
        full_time_work_behavior_devices / device_count
    - part_time_work_prop, defined as:
        part_time_work_behavior_devices / device_count

    Documentation for the social distancing metrics:
    https://docs.safegraph.com/docs/social-distancing-metrics

    Parameters
    ----------
    cbg_df: pd.DataFrame
        Census block group-level dataframe with raw social distancing
        indicators from Safegraph.
    signal_names: List[str]
        Names of signals to be exported (possibly 'wip_'-prefixed).

    Returns
    -------
    pd.DataFrame
        Dataframe with columns: timestamp, county_fips, and one column
        per entry of `signal_names`.

    Raises
    ------
    ValueError
        If a name in `signal_names` matches no known signal.
    """
    # Preparation
    cbg_df['timestamp'] = cbg_df['date_range_start'].apply(
        lambda x: str(x).split('T')[0])
    # A CBG id is the 5-digit county FIPS followed by 7 more digits;
    # integer division by MOD recovers the county code, zero-padded.
    cbg_df['county_fips'] = (cbg_df['origin_census_block_group'] // MOD).apply(
        lambda x: f'{int(x):05d}')

    # Transformation: create signals not available in raw data.
    # endswith() is used because names may carry a 'wip_' prefix.
    for signal in signal_names:
        if signal.endswith(FULL_TIME_WORK):
            cbg_df[signal] = (cbg_df['full_time_work_behavior_devices']
                              / cbg_df['device_count'])
        elif signal.endswith(COMPLETELY_HOME):
            cbg_df[signal] = (cbg_df['completely_home_device_count']
                              / cbg_df['device_count'])
        elif signal.endswith(PART_TIME_WORK):
            cbg_df[signal] = (cbg_df['part_time_work_behavior_devices']
                              / cbg_df['device_count'])
        elif signal.endswith(HOME_DWELL):
            cbg_df[signal] = cbg_df['median_home_dwell_time']
        else:
            # Previously an unrecognized name surfaced as an opaque
            # KeyError in the subsetting step below; fail fast instead.
            raise ValueError(f'Unknown signal name: {signal}')

    # Subsetting
    return cbg_df[['timestamp', 'county_fips'] + signal_names]
54114

115+
55116
def aggregate(df, signal_names, geo_resolution='county'):
    '''Aggregate signals to appropriate resolution and produce standard errors.

    Parameters
    ----------
    df: pd.DataFrame
        Census block group-level data with prepared signals (output of
        construct_signals()); must carry 'county_fips' and 'timestamp'.
    signal_names: List[str]
        Names of signals to be exported.
    geo_resolution: str
        One of ('county', 'state').

    Returns
    -------
    pd.DataFrame:
        One row per (geo_id, timestamp), with mean, standard error, and
        sample-size columns for each signal.

    Raises
    ------
    ValueError
        If `geo_resolution` is not one of the supported values.
    '''
    # Prepare geo resolution
    if geo_resolution == 'county':
        df['geo_id'] = df['county_fips']
    elif geo_resolution == 'state':
        # Map the 2-digit state FIPS prefix to its state abbreviation.
        df['geo_id'] = df['county_fips'].apply(lambda x:
                                               FIPS_TO_STATE[x[:2]])
    else:
        # Bug fix: the message previously interpolated the undefined name
        # GEO_RESOLUTION, so invalid input raised NameError, not the
        # intended ValueError.
        raise ValueError(
            "`geo_resolution` must be one of ('county', 'state').")

    # Aggregation and signal creation: reuse one groupby for all three
    # statistics instead of re-grouping per aggregation.
    grouped = df.groupby(['geo_id', 'timestamp'])[signal_names]
    df_mean = grouped.mean()
    df_sd = grouped.std()
    df_n = grouped.count()
    agg_df = pd.DataFrame.join(df_mean, df_sd,
                               lsuffix='_mean', rsuffix='_sd')
    agg_df = pd.DataFrame.join(agg_df, df_n.rename({
        signal: signal + '_n' for signal in signal_names
    }, axis=1))
    for signal in signal_names:
        # Standard error of the mean: sd / sqrt(n).
        agg_df[f'{signal}_se'] = (agg_df[f'{signal}_sd']
                                  / np.sqrt(agg_df[f'{signal}_n']))
    return agg_df.reset_index()
103162

104-
def process(fname, signal_names, geo_resolutions, export_dir):
    '''Process an input census block group-level CSV and export it.

    Assumes that the input file has _only_ one date of data.

    Parameters
    ----------
    fname: str
        Input filename.
    signal_names: List[str]
        Signal names to be processed.
    geo_resolutions: List[str]
        List of geo resolutions to export the data.
    export_dir
        Path where the output files are saved.

    Returns
    -------
    None
    '''
    cbg_df = construct_signals(pd.read_csv(fname), signal_names)
    unique_date = cbg_df['timestamp'].unique()
    if len(unique_date) != 1:
        raise ValueError(f'More than one timestamp found in input file {fname}.')
    date = unique_date[0].replace('-', '')
    for geo_res in geo_resolutions:
        df = aggregate(cbg_df, signal_names, geo_res)
        for signal in signal_names:
            # Export mean/se/count for this signal under the standard
            # val/se/sample_size column names.
            stat_cols = [f'{signal}_{stat}' for stat in ('mean', 'se', 'n')]
            df_export = df[['geo_id'] + stat_cols].rename(
                columns={
                    f'{signal}_mean': 'val',
                    f'{signal}_se': 'se',
                    f'{signal}_n': 'sample_size',
                })
            df_export.to_csv(f'{export_dir}/{date}_{geo_res}_{signal}.csv',
                             na_rep='NA',
                             index=False)

safegraph/delphi_safegraph/run.py

Lines changed: 17 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,16 @@
1-
# -*- coding: utf-8 -*-
21
"""Functions to call when running the function.
3-
42
This module should contain a function called `run_module`, that is executed
53
when the module is run with `python -m MODULE_NAME`.
64
"""
75
import glob
86
import multiprocessing as mp
97
import subprocess
10-
from datetime import datetime
118
from functools import partial
129

13-
import numpy as np
14-
import pandas as pd
1510
from delphi_utils import read_params
1611

17-
from .process import process
18-
19-
SIGNALS = [
20-
# signal_name wip
21-
('median_home_dwell_time', False),
22-
('completely_home_prop', False),
23-
('full_time_work_prop', False),
24-
('part_time_work_prop', False),
25-
]
26-
GEO_RESOLUTIONS = [
27-
'county',
28-
'state',
29-
]
30-
12+
from .constants import SIGNALS, GEO_RESOLUTIONS
13+
from .process import process, add_prefix
3114

3215
def run_module():
3316

@@ -39,31 +22,31 @@ def run_module():
3922
aws_secret_access_key = params["aws_secret_access_key"]
4023
aws_default_region = params["aws_default_region"]
4124
aws_endpoint = params["aws_endpoint"]
25+
wip_signal = params["wip_signal"]
4226

4327
process_file = partial(process,
44-
signals=SIGNALS,
45-
geo_resolutions=GEO_RESOLUTIONS,
46-
export_dir=export_dir,
47-
)
28+
signal_names=add_prefix(SIGNALS, wip_signal, prefix='wip_'),
29+
geo_resolutions=GEO_RESOLUTIONS,
30+
export_dir=export_dir,
31+
)
4832

4933
# Update raw data
5034
# Why call subprocess rather than using a native Python client, e.g. boto3?
5135
# Because boto3 does not have a simple rsync-like call that can perform
5236
# the following behavior elegantly.
5337
subprocess.run(
54-
f'aws s3 sync s3://sg-c19-response/social-distancing/v2/ '
55-
f'{raw_data_dir}/social-distancing/ --endpoint {aws_endpoint}',
56-
env={
57-
'AWS_ACCESS_KEY_ID': aws_access_key_id,
58-
'AWS_SECRET_ACCESS_KEY': aws_secret_access_key,
59-
'AWS_DEFAULT_REGION': aws_default_region,
60-
},
61-
shell=True,
62-
)
38+
f'aws s3 sync s3://sg-c19-response/social-distancing/v2/ '
39+
f'{raw_data_dir}/social-distancing/ --endpoint {aws_endpoint}',
40+
env={
41+
'AWS_ACCESS_KEY_ID': aws_access_key_id,
42+
'AWS_SECRET_ACCESS_KEY': aws_secret_access_key,
43+
'AWS_DEFAULT_REGION': aws_default_region,
44+
},
45+
shell=True,
46+
)
6347

6448
files = glob.glob(f'{raw_data_dir}/social-distancing/**/*.csv.gz',
65-
recursive=True)
49+
recursive=True)
6650

6751
with mp.Pool(n_core) as pool:
6852
pool.map(process_file, files)
69-

safegraph/params.json.template

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,6 @@
77
"aws_access_key_id": "",
88
"aws_secret_access_key": "",
99
"aws_default_region": "",
10-
"aws_endpoint": ""
10+
"aws_endpoint": "",
11+
"wip_signal": ""
1112
}

safegraph/setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from setuptools import find_packages
33

44
required = [
5+
"covidcast",
56
"numpy",
67
"pandas",
78
"pytest",

safegraph/tests/params.json.template

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{
2+
"static_file_dir": "./static",
3+
"raw_data_dir": "/mnt/data/safegraph/",
4+
"export_dir": "./receiving",
5+
"cache_dir": "./cache",
6+
"n_core": "12",
7+
"aws_access_key_id": "",
8+
"aws_secret_access_key": "",
9+
"aws_default_region": "",
10+
"aws_endpoint": "",
11+
"wip_signal": ""
12+
}

0 commit comments

Comments
 (0)