4
4
import covidcast
5
5
import pandas as pd
6
6
from datetime import date , datetime , timedelta
7
- from .errors import *
7
+ from .errors import APIDataFetchError
8
8
import re
9
9
from typing import List
10
10
import json
11
11
12
# Matches daily validation CSV names like "20200624_county_smoothed_cli.csv",
# capturing named groups: date (YYYYMMDD), geo_type, and signal.
filename_regex = re.compile(r'^(?P<date>\d{8})_(?P<geo_type>\w+?)_(?P<signal>\w+)\.csv$')
12
13
13
- def get_filenames_with_geo_signal (path , date_slist : List [str ]):
14
-
15
- if pipeline_version == 'new' :
16
- meta = covidcast .metadata ()
17
- fb_meta = meta [meta ['data_source' ]== DATA_SOURCE ]
18
- unique_signals = fb_meta ['signal' ].unique ().tolist ()
19
- unique_geotypes = fb_meta ['geo_type' ].unique ().tolist ()
20
-
21
-
22
- ##### Currently metadata returns --*community*-- signals that don't get generated
23
- ##### in the new fb-pipeline. Seiving them out for now.
24
- # Todo - Include weighted whh_cmnty_cli and wnohh_cmnty_cli
25
- for sig in unique_signals :
26
- if "community" in sig :
27
- unique_signals .remove (sig )
28
-
29
-
30
- geo_sig_cmbo = list (product (unique_geotypes , unique_signals ))
31
- print (geo_sig_cmbo )
32
- print ("Number of mixed types:" , len (geo_sig_cmbo ))
33
-
34
- for cmb in geo_sig_cmbo :
35
- print (cmb )
36
-
37
-
38
- filenames = read_relevant_date_filenames (data_folder , date_slist [0 ])
39
-
40
- else :
41
- sdate = date_slist [0 ]
42
- filenames = [f for f in listdir (path ) if isfile (join (path , f ))]
43
-
44
- sdate_filenames = [fname for fname in filenames if fname .find (sdate ) != - 1 ]
45
-
46
- # example: 20200624_county_smoothed_nohh_cmnty_cli
47
- filename_regex = re .compile (r'^(\d{8})_([a-z]+)_(raw\S*|smoothed\S*)[_?](w?)([ci]li).csv$' )
48
- geo_sig_cmbo = list ()
49
- for f in sdate_filenames :
50
-
51
- m = filename_regex .match (f )
52
- if (not m .group (0 )):
53
- print ('=nameformat= not recognized as a daily format' )
54
-
55
- geo_type = m .group (2 )
56
-
57
-
58
- if m .group (4 ): # weighted data 'w'
59
- signal = "" .join ([m .group (4 ), m .group (5 )])
60
- signal = "_" .join ([m .group (3 ), signal ])
61
- # max_weighted_date = survey_date
62
- else :
63
- signal = "_" .join ([m .group (3 ), m .group (5 )])
64
- # max_date = survey_date
65
-
66
- geo_sig_cmbo .append ((geo_type , signal ))
67
14
15
def get_filenames_with_geo_signal(path, data_source, date_slist: List[str]):
    """Collect filenames for the first date plus all (geo_type, signal) combinations.

    Queries the covidcast metadata for `data_source`, builds the cross product of
    its geo types and signals, and returns the filenames under `path` that are
    relevant to the first date string in `date_slist`.

    Parameters
    ----------
    path : str
        Folder containing the daily CSV files to scan.
    data_source : str
        covidcast data-source name used to filter the metadata.
    date_slist : List[str]
        Date strings (YYYYMMDD); only the first entry is used for filename lookup.

    Returns
    -------
    (filenames, geo_sig_cmbo)
        Filenames relevant to `date_slist[0]` and the list of
        (geo_type, signal) tuples.
    """
    meta = covidcast.metadata()
    source_meta = meta[meta['data_source'] == data_source]
    unique_signals = source_meta['signal'].unique().tolist()
    unique_geotypes = source_meta['geo_type'].unique().tolist()

    ##### Currently metadata returns --*community*-- signals that don't get generated
    ##### in the new fb-pipeline. Sieving them out for now.
    # Todo - Include weighted whh_cmnty_cli and wnohh_cmnty_cli
    # Bug fix: the original called list.remove() while iterating the same list,
    # which skips the element after each removal, so adjacent "community"
    # signals could survive the filter. A comprehension filters correctly.
    unique_signals = [sig for sig in unique_signals if "community" not in sig]

    geo_sig_cmbo = list(product(unique_geotypes, unique_signals))
    print(geo_sig_cmbo)
    print("Number of mixed types:", len(geo_sig_cmbo))

    for cmb in geo_sig_cmbo:
        print(cmb)

    # Bug fix: the original referenced an undefined `data_folder`; the folder
    # to scan is the `path` parameter supplied by the caller.
    filenames = read_relevant_date_filenames(path, date_slist[0])
    return filenames, geo_sig_cmbo
69
37
70
38
71
39
def read_filenames(path):
    """Return every regular file in *path* paired with its filename_regex match.

    Each element is a tuple ``(name, match)`` where ``match`` is the result of
    ``filename_regex.match(name)`` (``None`` when the name does not conform).
    """
    daily_filenames = []
    for name in listdir(path):
        if not isfile(join(path, name)):
            continue
        daily_filenames.append((name, filename_regex.match(name)))
    return daily_filenames
74
42
75
43
def read_relevant_date_filenames (data_path , date_slist ):
@@ -80,7 +48,7 @@ def read_relevant_date_filenames(data_path, date_slist):
80
48
for dt in date_slist :
81
49
if fl .find (dt ) != - 1 :
82
50
filenames .append (fl )
83
- return filenames
51
+ return filenames
84
52
85
53
def read_geo_sig_cmbo_files (geo_sig_cmbo , data_folder , filenames , date_slist ):
86
54
for geo_sig in geo_sig_cmbo :
@@ -105,6 +73,16 @@ def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist):
105
73
df_list .append (df )
106
74
yield pd .concat (df_list ), geo_sig [0 ], geo_sig [1 ]
107
75
76
def load_csv(path):
    """Load a daily export CSV, pinning the dtypes validation relies on.

    ``geo_id`` is kept as a string (so codes like "01000" keep their leading
    zeros); the numeric columns are read as floats.
    """
    column_dtypes = {
        'geo_id': str,
        'val': float,
        'se': float,
        'sample_size': float,
    }
    return pd.read_csv(path, dtype=column_dtypes)
85
+
108
86
def fetch_daily_data (data_source , survey_date , geo_type , signal ):
109
87
data_to_validate = covidcast .signal (data_source , signal , survey_date , survey_date , geo_type )
110
88
if not isinstance (data_to_validate , pd .DataFrame ):
@@ -114,4 +92,4 @@ def fetch_daily_data(data_source, survey_date, geo_type, signal):
114
92
", geography-type:" + geo_type
115
93
raise APIDataFetchError (custom_msg )
116
94
return data_to_validate
117
-
95
+
0 commit comments