1
- # -*- coding: utf-8 -*-
1
+ import covidcast
2
2
import numpy as np
3
3
import pandas as pd
4
4
5
+ from .constants import HOME_DWELL , COMPLETELY_HOME , FULL_TIME_WORK , PART_TIME_WORK
5
6
from .geo import FIPS_TO_STATE
6
7
7
8
# Divisor for truncating a census block group (CBG) ID to a county FIPS code: floor-dividing by 10**7 drops the trailing seven digits.
8
9
MOD = 10000000
9
10
10
- def construct_signals (cbg_df , signal_names ):
11
- '''Construct Census-block level signals.
11
def add_prefix(signal_names, wip_signal, prefix: str = "wip_"):
    """Add a work-in-progress prefix to the signal names that need one.

    Parameters
    ----------
    signal_names: List[str]
        Names of signals to be exported.
    wip_signal: List[str] or bool or str
        Selects which names receive the prefix:
        - True: every name in `signal_names` is prefixed;
        - a list: only the names that appear in that list are prefixed
          (an empty list therefore prefixes nothing);
        - False or '': a name is prefixed only when `public_signal`
          reports that it is not already published.
    prefix: str
        Prefix for new / non-public signals (defaults to 'wip_').

    Returns
    -------
    List[str]
        Signal names, in the same order as `signal_names`, with the prefix
        applied where selected.

    Raises
    ------
    ValueError
        If `wip_signal` is none of the supported values.
    """
    if wip_signal is True:
        return [prefix + signal for signal in signal_names]
    if isinstance(wip_signal, list):
        make_wip = set(wip_signal)
        return [
            prefix + signal if signal in make_wip else signal
            for signal in signal_names
        ]
    # Tuple membership compares with `==` and never hashes its operand, so an
    # unhashable argument (e.g. a dict) reaches the ValueError below instead
    # of failing here with a TypeError.
    if wip_signal in (False, ""):
        return [
            signal if public_signal(signal) else prefix + signal
            for signal in signal_names
        ]
    raise ValueError("Supply True | False or '' or [] | list()")
44
+
45
+ # Check if the signal name is public
46
def public_signal(signal_):
    """Check whether a signal name is already listed in the COVIDcast metadata.

    Each call fetches the metadata anew via `covidcast.metadata()`.

    Parameters
    ----------
    signal_: str
        Name of the signal.

    Returns
    -------
    bool
        True if any row of the metadata has this name in its 'signal'
        column, False otherwise.
    """
    epidata_df = covidcast.metadata()
    # Nothing can match in a frame with no rows.
    if len(epidata_df) == 0:
        return False
    # Element-wise comparison does not depend on the frame's index labels,
    # unlike looking rows up by position-as-label.
    return bool((epidata_df['signal'] == signal_).any())
12
63
64
+
65
def construct_signals(cbg_df, signal_names):
    """Construct census block group-level signals.

    In its current form, we prepare the following signals in addition to those
    already available in raw form from Safegraph. A requested name is matched
    by suffix, so prefixed variants of a name are built the same way:
    - names ending in FULL_TIME_WORK, defined as:
        full_time_work_behavior_devices / device_count
    - names ending in COMPLETELY_HOME, defined as:
        completely_home_device_count / device_count
    - names ending in PART_TIME_WORK, defined as:
        part_time_work_behavior_devices / device_count
    - names ending in HOME_DWELL, defined as:
        median_home_dwell_time (copied unchanged)
    Names matching none of these suffixes are not computed and must already
    be columns of `cbg_df`.

    Documentation for the social distancing metrics:
    https://docs.safegraph.com/docs/social-distancing-metrics

    Note that `cbg_df` is modified in place: the 'timestamp' and
    'county_fips' columns and every derived signal column are added to it.

    Parameters
    ----------
    cbg_df: pd.DataFrame
        Census block group-level dataframe with raw social distancing
        indicators from Safegraph.
    signal_names: List[str]
        Names of signals to be exported.

    Returns
    -------
    pd.DataFrame
        Dataframe with columns: timestamp, county_fips, and
        {each signal described above}.
    """
    # Accept any sequence of names; the list concatenation in the final
    # column selection would otherwise reject a tuple.
    signal_names = list(signal_names)

    # Preparation: keep the part of the start time before the 'T' separator.
    cbg_df['timestamp'] = cbg_df['date_range_start'].apply(
        lambda x: str(x).split('T')[0])
    # Floor-dividing the block group ID by MOD leaves the county code, which
    # is then zero-padded to five digits.
    cbg_df['county_fips'] = (cbg_df['origin_census_block_group'] // MOD).apply(
        lambda x: f'{int(x):05d}')

    # Transformation: create signals not available in raw data.
    # (suffix, raw column, divide by device_count?) -- checked in this order,
    # first matching suffix wins.
    derived = (
        (FULL_TIME_WORK, 'full_time_work_behavior_devices', True),
        (COMPLETELY_HOME, 'completely_home_device_count', True),
        (PART_TIME_WORK, 'part_time_work_behavior_devices', True),
        (HOME_DWELL, 'median_home_dwell_time', False),
    )
    for signal in signal_names:
        for suffix, raw_column, is_proportion in derived:
            if signal.endswith(suffix):
                if is_proportion:
                    cbg_df[signal] = cbg_df[raw_column] / cbg_df['device_count']
                else:
                    cbg_df[signal] = cbg_df[raw_column]
                break

    # Subsetting
    return cbg_df[['timestamp', 'county_fips'] + signal_names]
54
114
115
+
55
116
def aggregate (df , signal_names , geo_resolution = 'county' ):
56
117
'''Aggregate signals to appropriate resolution and produce standard errors.
57
-
58
118
Parameters
59
119
----------
60
120
df: pd.DataFrame
@@ -64,7 +124,6 @@ def aggregate(df, signal_names, geo_resolution='county'):
64
124
Names of signals to be exported.
65
125
geo_resolution: str
66
126
One of ('county', 'state')
67
-
68
127
Returns
69
128
-------
70
129
pd.DataFrame:
@@ -77,68 +136,64 @@ def aggregate(df, signal_names, geo_resolution='county'):
77
136
df ['geo_id' ] = df ['county_fips' ]
78
137
elif geo_resolution == 'state' :
79
138
df ['geo_id' ] = df ['county_fips' ].apply (lambda x :
80
- FIPS_TO_STATE [x [:2 ]])
139
+ FIPS_TO_STATE [x [:2 ]])
81
140
else :
82
141
raise ValueError (f'`geo_resolution` must be one of { GEO_RESOLUTION } .' )
83
142
84
143
# Aggregation and signal creation
85
144
df_mean = df .groupby (['geo_id' , 'timestamp' ])[
86
- signal_names
87
- ].mean ()
145
+ signal_names
146
+ ].mean ()
88
147
df_sd = df .groupby (['geo_id' , 'timestamp' ])[
89
- signal_names
90
- ].std ()
148
+ signal_names
149
+ ].std ()
91
150
df_n = df .groupby (['geo_id' , 'timestamp' ])[
92
- signal_names
93
- ].count ()
151
+ signal_names
152
+ ].count ()
94
153
agg_df = pd .DataFrame .join (df_mean , df_sd ,
95
- lsuffix = '_mean' , rsuffix = '_sd' )
154
+ lsuffix = '_mean' , rsuffix = '_sd' )
96
155
agg_df = pd .DataFrame .join (agg_df , df_n .rename ({
97
- signal : signal + '_n' for signal in signal_names
98
- }, axis = 1 ))
156
+ signal : signal + '_n' for signal in signal_names
157
+ }, axis = 1 ))
99
158
for signal in signal_names :
100
159
agg_df [f'{ signal } _se' ] = (agg_df [f'{ signal } _sd' ]
101
- / np .sqrt (agg_df [f'{ signal } _n' ]))
160
+ / np .sqrt (agg_df [f'{ signal } _n' ]))
102
161
return agg_df .reset_index ()
103
162
104
- def process (fname , signals , geo_resolutions , export_dir ):
163
+
164
def process(fname, signal_names, geo_resolutions, export_dir):
    """Process an input census block group-level CSV and export it.

    Assumes that the input file has _only_ one date of data. One CSV is
    written per (geo resolution, signal) pair, at
    `{export_dir}/{date}_{geo_res}_{signal}.csv`, where `date` is the file's
    single timestamp with the dashes removed. Each output has the columns
    geo_id, val, se and sample_size; missing values are written as 'NA'.

    Parameters
    ----------
    fname: str
        Input filename.
    signal_names: List[str]
        Names of the signals to process and export.
    geo_resolutions: List[str]
        List of geo resolutions to export the data.
    export_dir: str
        Path where the output files are saved.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the input file does not contain exactly one distinct timestamp.
    """
    cbg_df = construct_signals(pd.read_csv(fname), signal_names)
    unique_date = cbg_df['timestamp'].unique()
    # An input without rows has no timestamp at all; report that separately
    # rather than claiming there were too many.
    if len(unique_date) == 0:
        raise ValueError(f'No timestamp found in input file {fname}.')
    if len(unique_date) > 1:
        raise ValueError(f'More than one timestamp found in input file {fname}.')
    date = unique_date[0].replace('-', '')
    for geo_res in geo_resolutions:
        df = aggregate(cbg_df, signal_names, geo_res)
        for signal in signal_names:
            stat_columns = [f'{signal}_{stat}' for stat in ('mean', 'se', 'n')]
            df_export = df[['geo_id'] + stat_columns].rename({
                f'{signal}_mean': 'val',
                f'{signal}_se': 'se',
                f'{signal}_n': 'sample_size',
            }, axis=1)
            df_export.to_csv(f'{export_dir}/{date}_{geo_res}_{signal}.csv',
                             na_rep='NA',
                             index=False)
0 commit comments