"""Collects and reads covidcast data from a set of local CSV files."""

# standard library
- from dataclasses import dataclass
- from datetime import date
- import glob
import os
import re
+ from dataclasses import dataclass
+ from datetime import date
+ from glob import glob
+ from typing import Iterator, NamedTuple, Optional, Tuple

# third party
- import pandas as pd
import epiweeks as epi
+ import pandas as pd

# first party
from delphi_utils import Nans
from delphi.utils.epiweek import delta_epiweeks
- from .logger import get_structured_logger
+ from delphi.epidata.acquisition.covidcast.database import CovidcastRow
+ from delphi.epidata.acquisition.covidcast.logger import get_structured_logger
+
+ DFRow = NamedTuple('DFRow', [('geo_id', str), ('value', float), ('stderr', float), ('sample_size', float), ('missing_value', int), ('missing_stderr', int), ('missing_sample_size', int)])
+ PathDetails = NamedTuple('PathDetails', [('issue', int), ('lag', int), ('source', str), ('signal', str), ('time_type', str), ('time_value', int), ('geo_type', str)])
+

@dataclass
class CsvRowValue:
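For context, a minimal sketch of how the new `PathDetails` tuple is expected to be populated and read; it is not part of the change itself, and every value below is illustrative only.

details = PathDetails(
  issue=20200408,       # issue date encoded as a YYYYMMDD int (illustrative)
  lag=3,                # days between time_value and issue (illustrative)
  source='src',         # data source parsed from the file path
  signal='sig',         # signal parsed from the file name
  time_type='day',      # 'day' for daily files, 'week' for weekly files
  time_value=20200405,  # YYYYMMDD for daily files, YYYYWW for weekly files
  geo_type='county',
)
assert details.geo_type == 'county'  # downstream code reads fields by name, e.g. details.geo_type in load_csv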
@@ -27,6 +33,7 @@ class CsvRowValue:
  missing_stderr: int
  missing_sample_size: int

+
class CsvImporter:
  """Finds and parses covidcast CSV files."""

@@ -60,6 +67,7 @@ class CsvImporter:
    "missing_sample_size": "Int64"
  }

+
  @staticmethod
  def is_sane_day(value):
    """Return whether `value` is a sane (maybe not valid) YYYYMMDD date.
@@ -76,6 +84,7 @@ def is_sane_day(value):
      return False
    return date(year=year, month=month, day=day)

+
  @staticmethod
  def is_sane_week(value):
    """Return whether `value` is a sane (maybe not valid) YYYYWW epiweek.
@@ -91,22 +100,24 @@ def is_sane_week(value):
      return False
    return value

+
  @staticmethod
-   def find_issue_specific_csv_files(scan_dir, glob=glob):
+   def find_issue_specific_csv_files(scan_dir):
    logger = get_structured_logger('find_issue_specific_csv_files')
-     for path in sorted(glob.glob(os.path.join(scan_dir, '*'))):
+     for path in sorted(glob(os.path.join(scan_dir, '*'))):
      issuedir_match = CsvImporter.PATTERN_ISSUE_DIR.match(path.lower())
      if issuedir_match and os.path.isdir(path):
        issue_date_value = int(issuedir_match.group(2))
        issue_date = CsvImporter.is_sane_day(issue_date_value)
        if issue_date:
          logger.info(event='processing csv files from issue', detail=issue_date, file=path)
-           yield from CsvImporter.find_csv_files(path, issue=(issue_date, epi.Week.fromdate(issue_date)), glob=glob)
+           yield from CsvImporter.find_csv_files(path, issue=(issue_date, epi.Week.fromdate(issue_date)))
        else:
          logger.warning(event='invalid issue directory day', detail=issue_date_value, file=path)

+
  @staticmethod
-   def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today())), glob=glob):
+   def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()))):
    """Recursively search for and yield covidcast-format CSV files.

    scan_dir: the directory to scan (recursively)
@@ -122,11 +133,11 @@ def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()
    issue_value = -1
    lag_value = -1

-     for path in sorted(glob.glob(os.path.join(scan_dir, '*', '*'))):
-
+     for path in sorted(glob(os.path.join(scan_dir, '*', '*'))):
+       # safe to ignore this file
      if not path.lower().endswith('.csv'):
-         # safe to ignore this file
        continue
+
      # match a daily or weekly naming pattern
      daily_match = CsvImporter.PATTERN_DAILY.match(path.lower())
      weekly_match = CsvImporter.PATTERN_WEEKLY.match(path.lower())
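With the injectable `glob=glob` parameter removed above, callers can no longer hand these methods a fake glob module; tests would presumably patch the module-level `from glob import glob` import instead. A minimal sketch, assuming the module path `delphi.epidata.acquisition.covidcast.csv_importer` (inferred from the package imports at the top of the file):

from unittest.mock import patch

with patch('delphi.epidata.acquisition.covidcast.csv_importer.glob', return_value=[]) as mock_glob:
  # with no paths returned by glob, the generator yields nothing
  assert list(CsvImporter.find_csv_files('/any/dir')) == []
  mock_glob.assert_called_once()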
@@ -174,14 +185,16 @@ def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()
        yield (path, None)
        continue

-       yield (path, (source, signal, time_type, geo_type, time_value, issue_value, lag_value))
+       yield (path, PathDetails(issue_value, lag_value, source, signal, time_type, time_value, geo_type))
+

  @staticmethod
  def is_header_valid(columns):
    """Return whether the given pandas columns contain the required fields."""

    return set(columns) >= CsvImporter.REQUIRED_COLUMNS

+
  @staticmethod
  def floaty_int(value: str) -> int:
    """Cast a string to an int, even if it looks like a float.
@@ -195,6 +208,7 @@ def floaty_int(value: str) -> int:
      raise ValueError('not an int: "%s"' % str(value))
    return int(float_value)

+
  @staticmethod
  def maybe_apply(func, quantity):
    """Apply the given function to the given quantity if not null-ish."""
@@ -205,6 +219,7 @@ def maybe_apply(func, quantity):
    else:
      return func(quantity)

+
  @staticmethod
  def validate_quantity(row, attr_quantity):
    """Take a row and validate a given associated quantity (e.g., val, se, stderr).
@@ -218,6 +233,7 @@ def validate_quantity(row, attr_quantity):
      # val was a string or other non-numeric data
      return "Error"

+
  @staticmethod
  def validate_missing_code(row, attr_quantity, attr_name, filepath=None, logger=None):
    """Take a row and validate the missing code associated with
@@ -250,8 +266,9 @@ def validate_missing_code(row, attr_quantity, attr_name, filepath=None, logger=N

    return missing_entry

+
  @staticmethod
-   def extract_and_check_row(row, geo_type, filepath=None):
+   def extract_and_check_row(row: DFRow, geo_type: str, filepath: Optional[str] = None) -> Tuple[Optional[CsvRowValue], Optional[str]]:
    """Extract and return `CsvRowValue` from a CSV row, with sanity checks.

    Also returns the name of the field which failed sanity check, or None.
@@ -331,8 +348,9 @@ def extract_and_check_row(row, geo_type, filepath=None):
    # return extracted and validated row values
    return (CsvRowValue(geo_id, value, stderr, sample_size, missing_value, missing_stderr, missing_sample_size), None)

+
  @staticmethod
-   def load_csv(filepath, geo_type):
+   def load_csv(filepath: str, details: PathDetails) -> Iterator[Optional[CovidcastRow]]:
    """Load, validate, and yield data as `RowValues` from a CSV file.

    filepath: the CSV file to be loaded
@@ -357,9 +375,32 @@ def load_csv(filepath, geo_type):
    table.rename(columns={"val": "value", "se": "stderr", "missing_val": "missing_value", "missing_se": "missing_stderr"}, inplace=True)

    for row in table.itertuples(index=False):
-       row_values, error = CsvImporter.extract_and_check_row(row, geo_type, filepath)
+       csv_row_values, error = CsvImporter.extract_and_check_row(row, details.geo_type, filepath)
+
      if error:
        logger.warning(event='invalid value for row', detail=(str(row), error), file=filepath)
        yield None
        continue
-       yield row_values
+
+       yield CovidcastRow(
+         details.source,
+         details.signal,
+         details.time_type,
+         details.geo_type,
+         details.time_value,
+         csv_row_values.geo_value,
+         csv_row_values.value,
+         csv_row_values.stderr,
+         csv_row_values.sample_size,
+         csv_row_values.missing_value,
+         csv_row_values.missing_stderr,
+         csv_row_values.missing_sample_size,
+         details.issue,
+         details.lag,
+         # These four fields are unused by database acquisition
+         # TODO: These will be used when CovidcastRow is updated.
+         # id=None,
+         # direction=None,
+         # direction_updated_timestamp=0,
+         # value_updated_timestamp=0,
+       )
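Putting the pieces together, a rough sketch of how a caller (assumed here, not shown in this diff) might drive the updated API; the directory path is hypothetical:

receiving_dir = '/path/to/receiving'  # hypothetical scan directory

for path, details in CsvImporter.find_csv_files(receiving_dir):
  if details is None:
    # filename did not match the daily or weekly naming pattern; skip or archive as failed
    continue
  all_rows = list(CsvImporter.load_csv(path, details))
  valid_rows = [row for row in all_rows if row is not None]  # None marks rows that failed validation
  # valid_rows are CovidcastRow objects carrying source/signal/issue/lag copied from PathDetails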