Skip to content

Commit cd8541f

Browse files
committed
Remove old-codebase method; add pandas loader
1 parent 52fa474 commit cd8541f

File tree

2 files changed

+40
-64
lines changed

2 files changed

+40
-64
lines changed

validator/delphi_validator/datafetcher.py

Lines changed: 36 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -4,72 +4,40 @@
44
import covidcast
55
import pandas as pd
66
from datetime import date, datetime, timedelta
7-
from .errors import *
7+
from .errors import APIDataFetchError
88
import re
99
from typing import List
1010
import json
1111

12+
filename_regex = re.compile(r'^(?P<date>\d{8})_(?P<geo_type>\w+?)_(?P<signal>\w+)\.csv$')
1213

13-
def get_filenames_with_geo_signal(path, date_slist: List[str]):
14-
15-
if pipeline_version == 'new':
16-
meta = covidcast.metadata()
17-
fb_meta = meta[meta['data_source']==DATA_SOURCE]
18-
unique_signals = fb_meta['signal'].unique().tolist()
19-
unique_geotypes = fb_meta['geo_type'].unique().tolist()
20-
21-
22-
##### Currently metadata returns --*community*-- signals that don't get generated
23-
##### in the new fb-pipeline. Seiving them out for now.
24-
# Todo - Include weighted whh_cmnty_cli and wnohh_cmnty_cli
25-
for sig in unique_signals:
26-
if "community" in sig:
27-
unique_signals.remove(sig)
28-
29-
30-
geo_sig_cmbo = list(product(unique_geotypes, unique_signals))
31-
print(geo_sig_cmbo)
32-
print("Number of mixed types:", len(geo_sig_cmbo))
33-
34-
for cmb in geo_sig_cmbo:
35-
print(cmb)
36-
37-
38-
filenames = read_relevant_date_filenames(data_folder, date_slist[0])
39-
40-
else:
41-
sdate = date_slist[0]
42-
filenames = [f for f in listdir(path) if isfile(join(path, f))]
43-
44-
sdate_filenames = [fname for fname in filenames if fname.find(sdate) != -1]
45-
46-
# example: 20200624_county_smoothed_nohh_cmnty_cli
47-
filename_regex = re.compile(r'^(\d{8})_([a-z]+)_(raw\S*|smoothed\S*)[_?](w?)([ci]li).csv$')
48-
geo_sig_cmbo = list()
49-
for f in sdate_filenames:
50-
51-
m = filename_regex.match(f)
52-
if (not m.group(0)):
53-
print('=nameformat= not recognized as a daily format')
54-
55-
geo_type = m.group(2)
56-
57-
58-
if m.group(4): # weighted data 'w'
59-
signal = "".join([m.group(4), m.group(5)])
60-
signal = "_".join([m.group(3), signal])
61-
# max_weighted_date = survey_date
62-
else:
63-
signal = "_".join([m.group(3), m.group(5)])
64-
# max_date = survey_date
65-
66-
geo_sig_cmbo.append((geo_type, signal))
6714

15+
def get_filenames_with_geo_signal(path, data_source, date_slist: List[str]):
    """Build the (geo_type, signal) combinations for a data source and collect
    the CSV filenames relevant to the first requested date.

    Parameters
    ----------
    path : str
        Directory containing the exported daily CSV files.
    data_source : str
        Name of the data source to look up in the covidcast metadata.
    date_slist : List[str]
        Date strings (YYYYMMDD); only the first entry is used for filename
        matching.  NOTE(review): `read_relevant_date_filenames` iterates its
        second argument, so passing `date_slist[0]` (a single string) means it
        iterates characters — confirm whether the whole list was intended.

    Returns
    -------
    (filenames, geo_sig_cmbo) : tuple of (list, list of (geo_type, signal))
    """
    meta = covidcast.metadata()
    source_meta = meta[meta['data_source'] == data_source]

    # Currently metadata returns --*community*-- signals that don't get
    # generated in the new fb-pipeline; sieve them out for now.
    # TODO: include weighted whh_cmnty_cli and wnohh_cmnty_cli
    # BUG FIX: the original removed items from `unique_signals` while
    # iterating over it, which skips the element after each removal;
    # build the filtered list with a comprehension instead.
    unique_signals = [sig for sig in source_meta['signal'].unique().tolist()
                      if "community" not in sig]
    unique_geotypes = source_meta['geo_type'].unique().tolist()

    geo_sig_cmbo = list(product(unique_geotypes, unique_signals))
    print(geo_sig_cmbo)
    print("Number of mixed types:", len(geo_sig_cmbo))

    for cmb in geo_sig_cmbo:
        print(cmb)

    # BUG FIX: the original referenced the undefined name `data_folder`
    # (a NameError at runtime); the data directory is the `path` parameter.
    filenames = read_relevant_date_filenames(path, date_slist[0])
    return filenames, geo_sig_cmbo
6937

7038

7139
def read_filenames(path):
    """Pair each regular file in *path* with its filename-regex match.

    Returns a list of ``(filename, match)`` tuples, where ``match`` is the
    result of ``filename_regex.match`` (``None`` when the name does not
    conform); subdirectories are skipped.
    """
    paired = []
    for fname in listdir(path):
        if isfile(join(path, fname)):
            paired.append((fname, filename_regex.match(fname)))
    return paired
7442

7543
def read_relevant_date_filenames(data_path, date_slist):
@@ -80,7 +48,7 @@ def read_relevant_date_filenames(data_path, date_slist):
8048
for dt in date_slist:
8149
if fl.find(dt) != -1:
8250
filenames.append(fl)
83-
return filenames
51+
return filenames
8452

8553
def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist):
8654
for geo_sig in geo_sig_cmbo:
@@ -105,6 +73,16 @@ def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist):
10573
df_list.append(df)
10674
yield pd.concat(df_list), geo_sig[0], geo_sig[1]
10775

76+
def load_csv(path):
    """Read an exported daily CSV with the canonical column dtypes.

    ``geo_id`` is kept as a string (preserving leading zeros in FIPS codes),
    while the numeric columns are parsed as floats.
    """
    column_dtypes = {
        'geo_id': str,
        'val': float,
        'se': float,
        'sample_size': float,
    }
    return pd.read_csv(path, dtype=column_dtypes)
85+
10886
def fetch_daily_data(data_source, survey_date, geo_type, signal):
10987
data_to_validate = covidcast.signal(data_source, signal, survey_date, survey_date, geo_type)
11088
if not isinstance(data_to_validate, pd.DataFrame):
@@ -114,4 +92,4 @@ def fetch_daily_data(data_source, survey_date, geo_type, signal):
11492
", geography-type:" + geo_type
11593
raise APIDataFetchError(custom_msg)
11694
return data_to_validate
117-
95+

validator/delphi_validator/run.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
This module should contain a function called `run_module`, that is executed
55
when the module is run with `python -m delphi_validator`.
66
"""
7+
from datetime import datetime
78
import numpy as np
89
import pandas as pd
910
from delphi_utils import read_params
@@ -12,12 +13,9 @@
1213

1314

1415
def run_module():
    """Entry point for ``python -m delphi_validator``.

    Reads the run parameters, parses the validation date range, and runs the
    validator over the export directory.
    """
    all_params = read_params()
    validation_params = all_params['validation']
    start_dt = datetime.strptime(validation_params['start_date'], '%Y-%m-%d')
    end_dt = datetime.strptime(validation_params['end_date'], '%Y-%m-%d')
    # NOTE(review): parsed but never used below — confirm whether validate()
    # was meant to receive the reference-window size.
    lookbehind = int(validation_params["ref_window_size"])
    validate(all_params["export_dir"], start_dt, end_dt)

0 commit comments

Comments
 (0)