- # -*- coding: utf-8 -*-
import numpy as np
import pandas as pd

from .geo import FIPS_TO_STATE

# Magic number for modular arithmetic; CBG -> FIPS
MOD = 10000000
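# A census block group GEOID has 12 digits: state (2) + county (3) + tract (6)
# + block group (1); integer division by MOD = 10**7 drops the last 7 digits,
# leaving the 5-digit county FIPS code used below.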
+ from delphi_utils import read_params
+ from delphi_epidata import Epidata
+
+
+ # Add the given prefix to signals that are not yet publicly available
+ def signal_name(signal_names, wip_signal, prefix):
+     if wip_signal is not None:
+         if wip_signal and isinstance(wip_signal, bool):
+             return [prefix + signal if epidata_signal(signal) else signal
+                     for signal in signal_names]
+         if isinstance(wip_signal, list):
+             for signal in wip_signal:
+                 if epidata_signal(signal):
+                     signal_names.remove(signal)
+                     signal_names.append(prefix + signal)
+     return signal_names
+
+
+ # Return True if the signal is not yet listed in the public covidcast metadata
+ def epidata_signal(signal_):
+     epidata_df = Epidata.covidcast_meta()
+     for entry in epidata_df['epidata']:
+         if entry.get('signal') == signal_:
+             return False
+     return True
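# Illustrative behaviour, with hypothetical metadata: if 'completely_home_prop'
# is already listed in the public metadata but 'full_time_work_prop' is not,
#     signal_name(['completely_home_prop', 'full_time_work_prop'], True, 'wip_')
# returns ['completely_home_prop', 'wip_full_time_work_prop']; with
# wip_signal=['full_time_work_prop'], only that entry is replaced by its
# 'wip_'-prefixed form (and moved to the end of the list).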

- def construct_signals(cbg_df, signal_names):
-     '''Construct Census-block level signals.
-
+
+ def construct_signals(cbg_df, signal_names):
+     """Construct Census-block level signals.
    In its current form, we prepare the following signals in addition to those
    already available in raw form from Safegraph:
-
    - completely_home_prop, defined as:
        completely_home_device_count / device_count
    - full_time_work_prop, defined as:
        full_time_work_behavior_devices / device_count
    - part_time_work_prop, defined as:
        part_time_work_behavior_devices / device_count
-
    Documentation for the social distancing metrics:
    https://docs.safegraph.com/docs/social-distancing-metrics
-
    Parameters
    ----------
    cbg_df: pd.DataFrame
        Census block group-level dataframe with raw social distancing
        indicators from Safegraph.
    signal_names: List[str]
        Names of signals to be exported.
-
    Returns
    -------
    pd.DataFrame
        Dataframe with columns: timestamp, county_fips, and
        {each signal described above}.
-     '''
+     """
+
+     COMPLETELY_HOME = signal_names[1]
+     FULL_TIME_WORK = signal_names[2]
+     PART_TIME_WORK = signal_names[3]
+
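    # Indices 1-3 assume signal_names keeps the exporter's original ordering,
    # with the three derived proportions at positions 1, 2 and 3; note that the
    # list-valued branch of signal_name() removes and re-appends entries, which
    # can change that ordering.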
    # Preparation
    cbg_df['timestamp'] = cbg_df['date_range_start'].apply(
-         lambda x: str(x).split('T')[0])
+         lambda x: str(x).split('T')[0])
    cbg_df['county_fips'] = (cbg_df['origin_census_block_group'] // MOD).apply(
-         lambda x: f'{int(x):05d}')
+         lambda x: f'{int(x):05d}')
    # Transformation: create signal not available in raw data
-     cbg_df['completely_home_prop'] = (cbg_df['completely_home_device_count']
-                                       / cbg_df['device_count'])
-     cbg_df['full_time_work_prop'] = (cbg_df['full_time_work_behavior_devices']
-                                      / cbg_df['device_count'])
-     cbg_df['part_time_work_prop'] = (cbg_df['part_time_work_behavior_devices']
-                                      / cbg_df['device_count'])
+     cbg_df[COMPLETELY_HOME] = (cbg_df['completely_home_device_count']
+                                / cbg_df['device_count'])
+     cbg_df[FULL_TIME_WORK] = (cbg_df['full_time_work_behavior_devices']
+                               / cbg_df['device_count'])
+     cbg_df[PART_TIME_WORK] = (cbg_df['part_time_work_behavior_devices']
+                               / cbg_df['device_count'])
+
    # Subsetting
    return cbg_df[['timestamp', 'county_fips'] + signal_names]

+
def aggregate(df, signal_names, geo_resolution='county'):
    '''Aggregate signals to appropriate resolution and produce standard errors.
-
    Parameters
    ----------
    df: pd.DataFrame
@@ -64,7 +95,6 @@ def aggregate(df, signal_names, geo_resolution='county'):
        Names of signals to be exported.
    geo_resolution: str
        One of ('county', 'state')
-
    Returns
    -------
    pd.DataFrame:
@@ -77,34 +107,34 @@ def aggregate(df, signal_names, geo_resolution='county'):
        df['geo_id'] = df['county_fips']
    elif geo_resolution == 'state':
        df['geo_id'] = df['county_fips'].apply(lambda x:
-                                               FIPS_TO_STATE[x[:2]])
+                                               FIPS_TO_STATE[x[:2]])
    else:
        raise ValueError(f'`geo_resolution` must be one of {GEO_RESOLUTION}.')

    # Aggregation and signal creation
    df_mean = df.groupby(['geo_id', 'timestamp'])[
-         signal_names
-     ].mean()
+         signal_names
+     ].mean()
    df_sd = df.groupby(['geo_id', 'timestamp'])[
-         signal_names
-     ].std()
+         signal_names
+     ].std()
    df_n = df.groupby(['geo_id', 'timestamp'])[
-         signal_names
-     ].count()
+         signal_names
+     ].count()
    agg_df = pd.DataFrame.join(df_mean, df_sd,
-                                lsuffix='_mean', rsuffix='_sd')
+                               lsuffix='_mean', rsuffix='_sd')
    agg_df = pd.DataFrame.join(agg_df, df_n.rename({
-         signal: signal + '_n' for signal in signal_names
-     }, axis=1))
+         signal: signal + '_n' for signal in signal_names
+     }, axis=1))
    for signal in signal_names:
        agg_df[f'{signal}_se'] = (agg_df[f'{signal}_sd']
-                                   / np.sqrt(agg_df[f'{signal}_n']))
+                                  / np.sqrt(agg_df[f'{signal}_n']))
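    # Each signal now has <signal>_mean, <signal>_sd, <signal>_n and <signal>_se
    # columns; _se is the standard error of the mean (sd / sqrt(n)).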
    return agg_df.reset_index()

- def process(fname, signals, geo_resolutions, export_dir):
+
+ def process(fname, signal_names, geo_resolutions, export_dir):
    '''Process an input census block group-level CSV and export it. Assumes
    that the input file has _only_ one date of data.
-
    Parameters
    ----------
    fname: str
@@ -113,32 +143,27 @@ def process(fname, signals, geo_resolutions, export_dir):
        List of (signal_name, wip).
    geo_resolutions: List[str]
        List of geo resolutions to export the data.
-
    Returns
    -------
    None
    '''
-     signal_names, wip = (list(x) for x in zip(*signals))
    cbg_df = construct_signals(pd.read_csv(fname), signal_names)
    unique_date = cbg_df['timestamp'].unique()
    if len(unique_date) != 1:
        raise ValueError(f'More than one timestamp found in input file {fname}.')
    date = unique_date[0].replace('-', '')
    for geo_res in geo_resolutions:
        df = aggregate(cbg_df, signal_names, geo_res)
-         for signal, wip in signals:
+         for signal in signal_names:
            df_export = df[
-                 ['geo_id']
-                 + [f'{signal}_{x}' for x in ('mean', 'se', 'n')]
-             ].rename({
-                 f'{signal}_mean': 'val',
-                 f'{signal}_se': 'se',
-                 f'{signal}_n': 'sample_size',
-             }, axis=1)
-             if wip:
-                 signal = 'wip_' + signal
+                 ['geo_id']
+                 + [f'{signal}_{x}' for x in ('mean', 'se', 'n')]
+             ].rename({
+                 f'{signal}_mean': 'val',
+                 f'{signal}_se': 'se',
+                 f'{signal}_n': 'sample_size',
+             }, axis=1)
            df_export.to_csv(f'{export_dir}/{date}_{geo_res}_{signal}.csv',
-                              na_rep='NA',
-                              index=False,)
+                             na_rep='NA',
+                             index=False)
    return
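
# Illustrative driver (hypothetical; the run module that calls process() is not
# shown here). Assumes params.json supplies the raw-data directory, export_dir
# and wip_signal, and that SIGNALS lists the column names used above:
#
#     params = read_params()
#     signal_names = signal_name(SIGNALS, params["wip_signal"], prefix="wip_")
#     for fname in glob.glob(f"{params['raw_data_dir']}/*.csv"):
#         process(fname, signal_names, ["county", "state"], params["export_dir"])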
-