
Commit 9e398ae

Update: added new use case in safegraph
1 parent 53aeb39 commit 9e398ae

4 files changed: +113 -82 lines changed

safegraph/delphi_safegraph/process.py

Lines changed: 71 additions & 46 deletions
@@ -1,60 +1,91 @@
-# -*- coding: utf-8 -*-
 import numpy as np
 import pandas as pd
 
 from .geo import FIPS_TO_STATE
 
 # Magic number for modular arithmetic; CBG -> FIPS
 MOD = 10000000
+from delphi_utils import read_params
+from delphi_epidata import Epidata
+
+
+# Add prefix to the signal name, if needed
+def signal_name(signal_names, wip_signal, prefix):
+    if wip_signal is not None:
+        if wip_signal and type(wip_signal) == bool:
+            new_signal_list = []
+            [new_signal_list.append(prefix + signal) if epidata_signal(signal) else new_signal_list.append(signal) for
+             signal in signal_names]
+            return new_signal_list
+        if type(wip_signal) == list:
+            for signal in wip_signal:
+                if epidata_signal(signal):
+                    new_list = [prefix + signal]
+                    signal_names.remove(signal)
+                    signal_names.extend(new_list)
+            return signal_names
+
+
+# Check if the signal name is public
+def epidata_signal(signal_):
+    epidata_df = Epidata.covidcast_meta()
+    for index in range(len(epidata_df['epidata'])):
+        for key in epidata_df['epidata'][index]:
+            if key == 'signal':
+                if epidata_df['epidata'][index][key] == signal_:
+                    return False
+    return True
 
-def construct_signals(cbg_df, signal_names):
-    '''Construct Census-block level signals.
 
+def construct_signals(cbg_df, signal_names):
+    """Construct Census-block level signals.
     In its current form, we prepare the following signals in addition to those
     already available in raw form from Safegraph:
-
     - completely_home_prop, defined as:
         completely_home_device_count / device_count
     - full_time_work_prop, defined as:
         full_time_work_behavior_devices / device_count
     - part_time_work_prop, defined as:
         part_time_work_behavior_devices / device_count
-
     Documentation for the social distancing metrics:
     https://docs.safegraph.com/docs/social-distancing-metrics
-
     Parameters
     ----------
     cbg_df: pd.DataFrame
         Census block group-level dataframe with raw social distancing
         indicators from Safegraph.
     signal_names: List[str]
        Names of signals to be exported.
-
     Returns
     -------
     pd.DataFrame
        Dataframe with columns: timestamp, county_fips, and
        {each signal described above}.
-    '''
+    """
+
+    COMPLETELY_HOME = signal_names[1]
+    FULL_TIME_WORK = signal_names[2]
+    PART_TIME_WORK = signal_names[3]
+
     # Preparation
     cbg_df['timestamp'] = cbg_df['date_range_start'].apply(
-        lambda x: str(x).split('T')[0])
+        lambda x: str(x).split('T')[0])
     cbg_df['county_fips'] = (cbg_df['origin_census_block_group'] // MOD).apply(
-        lambda x: f'{int(x):05d}')
+        lambda x: f'{int(x):05d}')
     # Transformation: create signal not available in raw data
-    cbg_df['completely_home_prop'] = (cbg_df['completely_home_device_count']
-                                      / cbg_df['device_count'])
-    cbg_df['full_time_work_prop'] = (cbg_df['full_time_work_behavior_devices']
-                                     / cbg_df['device_count'])
-    cbg_df['part_time_work_prop'] = (cbg_df['part_time_work_behavior_devices']
-                                     / cbg_df['device_count'])
+    cbg_df[COMPLETELY_HOME] = (cbg_df['completely_home_device_count']
+                               / cbg_df['device_count'])
+    cbg_df[FULL_TIME_WORK] = (cbg_df['full_time_work_behavior_devices']
+                              / cbg_df['device_count'])
+    cbg_df[PART_TIME_WORK] = (cbg_df['part_time_work_behavior_devices']
+                              / cbg_df['device_count'])
+
     # Subsetting
     return cbg_df[['timestamp', 'county_fips'] + signal_names]
 
+
 def aggregate(df, signal_names, geo_resolution='county'):
     '''Aggregate signals to appropriate resolution and produce standard errors.
-
     Parameters
     ----------
     df: pd.DataFrame
@@ -64,7 +95,6 @@ def aggregate(df, signal_names, geo_resolution='county'):
        Names of signals to be exported.
     geo_resolution: str
        One of ('county', 'state')
-
     Returns
     -------
     pd.DataFrame:
@@ -77,34 +107,34 @@ def aggregate(df, signal_names, geo_resolution='county'):
         df['geo_id'] = df['county_fips']
     elif geo_resolution == 'state':
         df['geo_id'] = df['county_fips'].apply(lambda x:
-                                               FIPS_TO_STATE[x[:2]])
+                                               FIPS_TO_STATE[x[:2]])
     else:
         raise ValueError(f'`geo_resolution` must be one of {GEO_RESOLUTION}.')
 
     # Aggregation and signal creation
     df_mean = df.groupby(['geo_id', 'timestamp'])[
-        signal_names
-    ].mean()
+        signal_names
+    ].mean()
     df_sd = df.groupby(['geo_id', 'timestamp'])[
-        signal_names
-    ].std()
+        signal_names
+    ].std()
     df_n = df.groupby(['geo_id', 'timestamp'])[
-        signal_names
-    ].count()
+        signal_names
+    ].count()
     agg_df = pd.DataFrame.join(df_mean, df_sd,
-                               lsuffix='_mean', rsuffix='_sd')
+                               lsuffix='_mean', rsuffix='_sd')
     agg_df = pd.DataFrame.join(agg_df, df_n.rename({
-        signal: signal+'_n' for signal in signal_names
-    }, axis=1))
+        signal: signal + '_n' for signal in signal_names
+    }, axis=1))
     for signal in signal_names:
         agg_df[f'{signal}_se'] = (agg_df[f'{signal}_sd']
-                                  /np.sqrt(agg_df[f'{signal}_n']))
+                                  / np.sqrt(agg_df[f'{signal}_n']))
     return agg_df.reset_index()
 
-def process(fname, signals, geo_resolutions, export_dir):
+
+def process(fname, signal_names, geo_resolutions, export_dir):
     '''Process an input census block group-level CSV and export it. Assumes
     that the input file has _only_ one date of data.
-
     Parameters
     ----------
     fname: str
@@ -113,32 +143,27 @@ def process(fname, signals, geo_resolutions, export_dir):
        List of (signal_name, wip).
     geo_resolutions: List[str]
        List of geo resolutions to export the data.
-
     Returns
     -------
     None
     '''
-    signal_names, wip = (list(x) for x in zip(*signals))
     cbg_df = construct_signals(pd.read_csv(fname), signal_names)
     unique_date = cbg_df['timestamp'].unique()
     if len(unique_date) != 1:
         raise ValueError(f'More than one timestamp found in input file {fname}.')
     date = unique_date[0].replace('-', '')
     for geo_res in geo_resolutions:
         df = aggregate(cbg_df, signal_names, geo_res)
-        for signal, wip in signals:
+        for signal in signal_names:
             df_export = df[
-                ['geo_id']
-                + [f'{signal}_{x}' for x in ('mean', 'se', 'n')]
-            ].rename({
-                f'{signal}_mean': 'val',
-                f'{signal}_se': 'se',
-                f'{signal}_n': 'sample_size',
-            }, axis=1)
-            if wip:
-                signal = 'wip_' + signal
+                ['geo_id']
+                + [f'{signal}_{x}' for x in ('mean', 'se', 'n')]
+            ].rename({
+                f'{signal}_mean': 'val',
+                f'{signal}_se': 'se',
+                f'{signal}_n': 'sample_size',
+            }, axis=1)
             df_export.to_csv(f'{export_dir}/{date}_{geo_res}_{signal}.csv',
-                             na_rep='NA',
-                             index=False,)
+                             na_rep='NA',
+                             index=False, )
     return
-
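For readers following the new use case: `signal_name` decides which exported signals get the work-in-progress prefix, and `epidata_signal` treats a signal as public only if it already appears in `Epidata.covidcast_meta()`. A minimal usage sketch, not part of the commit; the concrete outputs assume none of the four signals are listed as public yet, which depends on the live metadata response:

from delphi_safegraph.process import signal_name

SIGNALS = [
    'median_home_dwell_time',
    'completely_home_prop',
    'full_time_work_prop',
    'part_time_work_prop',
]

# wip_signal=True: every signal not yet in covidcast_meta() gets the prefix.
signal_name(SIGNALS.copy(), wip_signal=True, prefix='wip_')
# ['wip_median_home_dwell_time', 'wip_completely_home_prop',
#  'wip_full_time_work_prop', 'wip_part_time_work_prop']

# wip_signal=['completely_home_prop']: only the listed signal is prefixed,
# and it is moved to the end of the (mutated) list.
signal_name(SIGNALS.copy(), wip_signal=['completely_home_prop'], prefix='wip_')
# ['median_home_dwell_time', 'full_time_work_prop',
#  'part_time_work_prop', 'wip_completely_home_prop']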

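The unchanged `aggregate` step collapses CBG rows to one row per (geo_id, timestamp) and reports the mean, a standard error computed as sd / sqrt(n), and the sample size. A self-contained toy sketch of the same pandas pattern, with illustrative column values and FIPS codes that are not from the commit:

import numpy as np
import pandas as pd

# Toy CBG-level data: two counties, one day, one signal.
df = pd.DataFrame({
    'geo_id':    ['42003', '42003', '42003', '42101'],
    'timestamp': ['2020-06-12'] * 4,
    'completely_home_prop': [0.20, 0.30, 0.25, 0.40],
})

grouped = df.groupby(['geo_id', 'timestamp'])['completely_home_prop']
agg = pd.DataFrame({
    'val': grouped.mean(),
    'sd': grouped.std(),
    'sample_size': grouped.count(),
})
# Standard error of the mean, as in aggregate(): sd / sqrt(n).
# A group with a single CBG yields a NaN sd/se, which is why the tests
# below mask NaNs before checking non-negativity.
agg['se'] = agg['sd'] / np.sqrt(agg['sample_size'])
print(agg.reset_index())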
safegraph/delphi_safegraph/run.py

Lines changed: 22 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
# -*- coding: utf-8 -*-
21
"""Functions to call when running the function.
3-
42
This module should contain a function called `run_module`, that is executed
53
when the module is run with `python -m MODULE_NAME`.
64
"""
@@ -14,15 +12,16 @@
1412
import pandas as pd
1513
from delphi_utils import read_params
1614

17-
from .process import process
15+
from .process import process, signal_name
1816

1917
SIGNALS = [
20-
# signal_name wip
21-
('median_home_dwell_time', False),
22-
('completely_home_prop', False),
23-
('full_time_work_prop', False),
24-
('part_time_work_prop', False),
18+
'median_home_dwell_time',
19+
'completely_home_prop',
20+
'full_time_work_prop',
21+
'part_time_work_prop'
22+
2523
]
24+
2625
GEO_RESOLUTIONS = [
2726
'county',
2827
'state',
@@ -39,31 +38,31 @@ def run_module():
3938
aws_secret_access_key = params["aws_secret_access_key"]
4039
aws_default_region = params["aws_default_region"]
4140
aws_endpoint = params["aws_endpoint"]
41+
wip_signal = params["wip_signal"]
4242

4343
process_file = partial(process,
44-
signals=SIGNALS,
45-
geo_resolutions=GEO_RESOLUTIONS,
46-
export_dir=export_dir,
47-
)
44+
signal_names=signal_name(SIGNALS, wip_signal, prefix='wip_'),
45+
geo_resolutions=GEO_RESOLUTIONS,
46+
export_dir=export_dir,
47+
)
4848

4949
# Update raw data
5050
# Why call subprocess rather than using a native Python client, e.g. boto3?
5151
# Because boto3 does not have a simple rsync-like call that can perform
5252
# the following behavior elegantly.
5353
subprocess.run(
54-
f'aws s3 sync s3://sg-c19-response/social-distancing/v2/ '
55-
f'{raw_data_dir}/social-distancing/ --endpoint {aws_endpoint}',
56-
env={
57-
'AWS_ACCESS_KEY_ID': aws_access_key_id,
58-
'AWS_SECRET_ACCESS_KEY': aws_secret_access_key,
59-
'AWS_DEFAULT_REGION': aws_default_region,
60-
},
61-
shell=True,
62-
)
54+
f'aws s3 sync s3://sg-c19-response/social-distancing/v2/ '
55+
f'{raw_data_dir}/social-distancing/ --endpoint {aws_endpoint}',
56+
env={
57+
'AWS_ACCESS_KEY_ID': aws_access_key_id,
58+
'AWS_SECRET_ACCESS_KEY': aws_secret_access_key,
59+
'AWS_DEFAULT_REGION': aws_default_region,
60+
},
61+
shell=True,
62+
)
6363

6464
files = glob.glob(f'{raw_data_dir}/social-distancing/**/*.csv.gz',
65-
recursive=True)
65+
recursive=True)
6666

6767
with mp.Pool(n_core) as pool:
6868
pool.map(process_file, files)
69-
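`run_module` binds everything except the filename with `functools.partial`, so `mp.Pool.map` can fan the downloaded CSVs out to `process` across `n_core` workers. A minimal, self-contained sketch of that pattern; the stand-in worker, signal list, and file names are hypothetical, not from the commit:

import multiprocessing as mp
from functools import partial

def process(fname, signal_names, geo_resolutions, export_dir):
    # Stand-in for delphi_safegraph.process.process.
    print(f'would export {fname} for {geo_resolutions} into {export_dir}')

if __name__ == '__main__':
    # Fix all keyword arguments; only fname is left for pool.map to supply.
    process_file = partial(process,
                           signal_names=['completely_home_prop'],
                           geo_resolutions=['county', 'state'],
                           export_dir='./receiving')
    files = ['a.csv.gz', 'b.csv.gz']   # in run_module these come from glob.glob
    with mp.Pool(2) as pool:
        pool.map(process_file, files)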

safegraph/params.json.template

Lines changed: 2 additions & 1 deletion
@@ -7,5 +7,6 @@
     "aws_access_key_id": "",
     "aws_secret_access_key": "",
     "aws_default_region": "",
-    "aws_endpoint": ""
+    "aws_endpoint": "",
+    "wip_prefix" : []
 }
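One thing to watch: run.py reads `params["wip_signal"]`, while the template adds a `wip_prefix` key. A params.json that matches what the code and the new test expect would presumably end like the sketch below, with `wip_signal` set either to `true` or to a list of signal names; the value shown is a placeholder:

    "aws_access_key_id": "",
    "aws_secret_access_key": "",
    "aws_default_region": "",
    "aws_endpoint": "",
    "wip_signal": ["completely_home_prop"]
}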

safegraph/tests/test_process.py

Lines changed: 18 additions & 12 deletions
@@ -6,48 +6,54 @@
 import numpy as np
 import pandas as pd
 from delphi_safegraph.process import (
-    construct_signals,
-    aggregate,
-)
+    construct_signals,
+    aggregate,
+    signal_name
+)
 from delphi_safegraph.run import SIGNALS
+from delphi_utils import read_params
+signal_names = SIGNALS
 
-signal_names, _ = (list(x) for x in zip(*SIGNALS))
 
 class TestProcess:
     def test_construct_signals_present(self):
-
         cbg_df = construct_signals(pd.read_csv('raw_data/sample_raw_data.csv'),
-                                   signal_names)
+                                   signal_names)
         assert 'completely_home_prop' in set(cbg_df.columns)
         assert 'full_time_work_prop' in set(cbg_df.columns)
         assert 'part_time_work_prop' in set(cbg_df.columns)
         assert 'median_home_dwell_time' in set(cbg_df.columns)
 
     def test_construct_signals_proportions(self):
-
         cbg_df = construct_signals(pd.read_csv('raw_data/sample_raw_data.csv'),
-                                   signal_names)
+                                   signal_names)
         assert np.all(cbg_df['completely_home_prop'].values <= 1)
         assert np.all(cbg_df['full_time_work_prop'].values <= 1)
         assert np.all(cbg_df['part_time_work_prop'].values <= 1)
 
     def test_aggregate_county(self):
-
         cbg_df = construct_signals(pd.read_csv('raw_data/sample_raw_data.csv'),
-                                   signal_names)
+                                   signal_names)
         df = aggregate(cbg_df, signal_names, 'county')
 
         assert np.all(df[f'{signal_names[0]}_n'].values > 0)
         x = df[f'{signal_names[0]}_se'].values
         assert np.all(x[~np.isnan(x)] >= 0)
 
     def test_aggregate_state(self):
-
         cbg_df = construct_signals(pd.read_csv('raw_data/sample_raw_data.csv'),
-                                   signal_names)
+                                   signal_names)
         df = aggregate(cbg_df, signal_names, 'state')
 
         assert np.all(df[f'{signal_names[0]}_n'].values > 0)
         x = df[f'{signal_names[0]}_se'].values
         assert np.all(x[~np.isnan(x)] >= 0)
 
+    def test_signal_name(self):
+        assert read_params()["wip_signal"] is not None, "supply value in params"
+        assert type(read_params()["wip_signal"]) == list or type(read_params()["wip_signal"]) == bool, "Supply True|False|list()"
+        signals = signal_name(signal_names, wip_signal=read_params()['wip_signal'], prefix='wip_')
+        assert (len(signals) >= len(signal_names))
+
+
+
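Note that `test_signal_name` needs a readable params.json (via `read_params()`) and a live `Epidata.covidcast_meta()` call. A self-contained sketch of the same length-preserving check with the metadata lookup stubbed out; the test name and stub are hypothetical, not part of the commit:

import delphi_safegraph.process as proc

def test_signal_name_offline(monkeypatch):
    # Pretend no signal is public yet, so signal_name never hits the network.
    monkeypatch.setattr(proc, 'epidata_signal', lambda signal_: True)
    names = ['completely_home_prop', 'full_time_work_prop']
    out = proc.signal_name(names.copy(), wip_signal=True, prefix='wip_')
    assert len(out) == len(names)
    assert all(s.startswith('wip_') for s in out)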
