- import dask.dataframe as dd
+ """Module providing functions for processing and wrangling data."""
+
from datetime import datetime
+ from pathlib import Path
+
+ import dask.dataframe as dd
import numpy as np
import pandas as pd
- from pathlib import Path

from .config import Config

- def format_outname(prefix: str, se: bool, weekday:bool):
-     '''
+
+ def format_outname(prefix: str, se: bool, weekday: bool):
+     """
+     Format the output file name.

    Parameters
    ----------
-     prefix
-     se
-     weekday
+     prefix:
+     se: boolean to write out standard errors; if true, use an obfuscated name
+     weekday: boolean for weekday adjustments.
+         signals will be generated with weekday adjustments (True) or without
+         adjustments (False)

    Returns
    -------
-
-     '''
-     # write out results
+     out_name: str
+     """
    out_name = "smoothed_adj_cli" if weekday else "smoothed_cli"
    if se:
        assert prefix is not None, "template has no obfuscated prefix"
        out_name = prefix + "_" + out_name
    return out_name

+
def format_df(df: pd.DataFrame, geo_id: str, se: bool, logger):
-     '''
-     format dataframe and checks for anomalies to write results
+     """
+     Format the dataframe and check for anomalies before writing results.
+
    Parameters
    ----------
    df: dataframe from output from update_sensor
@@ -39,9 +47,9 @@ def format_df(df: pd.DataFrame, geo_id: str, se: bool, logger):
    Returns
    -------
    filtered and formatted dataframe
-     '''
+     """
    # report in percentage
-     df['val'] = df['val'] * 100
+     df["val"] = df["val"] * 100
    df["se"] = df["se"] * 100

    val_isnull = df["val"].isnull()
@@ -50,23 +58,23 @@ def format_df(df: pd.DataFrame, geo_id: str, se: bool, logger):
        logger.info("sensor value is nan, check pipeline")
    df = df[~val_isnull]

-     se_too_high = df['se'] >= 5
+     se_too_high = df["se"] >= 5
    df_se_too_high = df[se_too_high]
    if len(df_se_too_high) > 0:
        logger.info(f"standard error suspiciously high! investigate {geo_id}")
    df = df[~se_too_high]

-     sensor_too_high = df['val'] >= 90
+     sensor_too_high = df["val"] >= 90
    df_sensor_too_high = df[sensor_too_high]
    if len(df_sensor_too_high) > 0:
        logger.info(f"standard error suspiciously high! investigate {geo_id}")
    df = df[~sensor_too_high]

    if se:
-         valid_cond = (df['se'] > 0) & (df['val'] > 0)
+         valid_cond = (df["se"] > 0) & (df["val"] > 0)
        invalid_df = df[~valid_cond]
        if len(invalid_df) > 0:
-             logger.info(f"p=0, std_err=0 invalid")
+             logger.info("p=0, std_err=0 invalid")
        df = df[valid_cond]
    else:
        df["se"] = np.NAN
@@ -75,8 +83,10 @@ def format_df(df: pd.DataFrame, geo_id: str, se: bool, logger):
    df["sample_size"] = np.NAN
    return df

- def write_to_csv(output_df: pd.DataFrame, prefix: str, geo_id: str, weekday: bool, se:bool, logger, output_path="."):
-     """Write sensor values to csv.
+
+ def write_to_csv(output_df: pd.DataFrame, prefix: str, geo_id: str, weekday: bool, se: bool, logger, output_path="."):
+     """
+     Write sensor values to csv.

    Args:
        output_dict: dictionary containing sensor rates, se, unique dates, and unique geo_id
@@ -91,24 +101,21 @@ def write_to_csv(output_df: pd.DataFrame, prefix: str, geo_id: str, weekday: boo
    if se:
        logger.info(f"========= WARNING: WRITING SEs TO {out_name} =========")

-     dates = set(list(output_df['date']))
-     grouped = filtered_df.groupby('date')
+     dates = set(list(output_df["date"]))
+     grouped = filtered_df.groupby("date")
    for d in dates:
-         filename = "%s/%s_%s_%s.csv" % (output_path,
-                                         (d + Config.DAY_SHIFT).strftime("%Y%m%d"),
-                                         geo_id,
-                                         out_name)
+         filename = "%s/%s_%s_%s.csv" % (output_path, (d + Config.DAY_SHIFT).strftime("%Y%m%d"), geo_id, out_name)
        single_date_df = grouped.get_group(d)
-         single_date_df = single_date_df.drop(columns=['date'])
+         single_date_df = single_date_df.drop(columns=["date"])
        single_date_df.to_csv(filename, index=False, na_rep="NA")

        logger.debug(f"wrote {len(single_date_df)} rows for {geo_id}")


def csv_to_df(filepath: str, startdate: datetime, enddate: datetime, dropdate: datetime, logger) -> pd.DataFrame:
-     '''
-     Reads csv using Dask and filters out based on date range and currently unused column,
-     then converts back into pandas dataframe.
+     """
+     Read csv using Dask, filter out unneeded data, then convert back into a pandas dataframe.
+
    Parameters
    ----------
    filepath: path to the aggregated doctor-visits data
@@ -117,7 +124,7 @@ def csv_to_df(filepath: str, startdate: datetime, enddate: datetime, dropdate: d
    dropdate: data drop date (YYYY-mm-dd)

    -------
-     '''
+     """
    filepath = Path(filepath)
    logger.info(f"Processing {filepath}")

@@ -142,7 +149,7 @@ def csv_to_df(filepath: str, startdate: datetime, enddate: datetime, dropdate: d
    assert startdate < enddate, "Start date >= end date"
    assert enddate <= dropdate, "End date > drop date"

-     date_filter = ((ddata[Config.DATE_COL] >= Config.FIRST_DATA_DATE) & (ddata[Config.DATE_COL] < dropdate))
+     date_filter = (ddata[Config.DATE_COL] >= Config.FIRST_DATA_DATE) & (ddata[Config.DATE_COL] < dropdate)

    df = ddata[date_filter].compute()
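
For context, a minimal sketch of how the naming and output logic above composes, assuming the functions as defined in this diff; the prefix "wip_XXXXX", the geo_id "state", and the example date are invented placeholders, not values from the real pipeline:

    # Hypothetical usage sketch, not part of the commit.
    out_name = format_outname("wip_XXXXX", se=True, weekday=False)
    # out_name == "wip_XXXXX_smoothed_cli"; with se=False it stays "smoothed_cli",
    # and weekday=True selects "smoothed_adj_cli" instead.
    # write_to_csv() then emits one file per date (shifted by Config.DAY_SHIFT),
    # e.g. "./20200415_state_wip_XXXXX_smoothed_cli.csv" for geo_id="state".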
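
The final hunk tightens the lazy-filter-then-compute pattern that csv_to_df relies on: build a boolean mask on the Dask dataframe, then call .compute() to materialize a plain pandas frame. A self-contained sketch of that pattern, with made-up columns rather than the project's real schema:

    # Standalone illustration of the Dask filter/compute pattern (made-up data).
    import dask.dataframe as dd
    import pandas as pd

    pdf = pd.DataFrame({"date": pd.date_range("2020-04-01", periods=5), "val": range(5)})
    ddata = dd.from_pandas(pdf, npartitions=1)
    mask = (ddata["date"] >= "2020-04-02") & (ddata["date"] < "2020-04-05")
    df = ddata[mask].compute()  # back to a regular pandas DataFrame (3 rows)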