Skip to content

Commit 728a060

Browse files
authored
Merge pull request #426 from JedGrabman/jedgrabman/geo_id_validation
Find Unexpected Values for geo_id compared to historical geo_ids seen
2 parents 8984aac + 42ad65e commit 728a060

File tree

10 files changed

+4285
-12
lines changed

10 files changed

+4285
-12
lines changed

validator/delphi_validator/validate.py

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ def __init__(self, params):
113113
"""
114114
# Get user settings from params or if not provided, set default.
115115
self.data_source = params['data_source']
116+
self.validator_static_file_dir = params.get('validator_static_file_dir', '../validator/static')
116117

117118
# Date/time settings
118119
self.span_length = timedelta(days=params['span_length'])
@@ -244,9 +245,33 @@ def check_df_format(self, df_to_test, nameformat):
244245

245246
self.increment_total_checks()
246247

247-
def check_bad_geo_id(self, df_to_test, nameformat, geo_type):
248+
def check_bad_geo_id_value(self, df_to_test, filename, geo_type):
    """
    Check for bad geo_id values, by comparing to a list of known values (drawn from historical data)

    Two checks are performed, and each one increments the total check count:
    1. every geo_id (compared case-insensitively) must appear in the reference
       file `<validator_static_file_dir>/<geo_type>_geo.csv`; offenders are
       recorded as an error in self.raised_errors.
    2. every geo_id should already be lowercase; offenders are recorded as a
       warning in self.raised_warnings.

    Arguments:
        - df_to_test: pandas dataframe of CSV source data containing the geo_id column to check
        - filename: name of the CSV file being checked; used to label any error or warning raised
        - geo_type: string from CSV name specifying geo type (state, county, msa, etc.) of data
    """
    file_path = join(self.validator_static_file_dir, geo_type + '_geo.csv')
    # Read geo_id as str so zero-padded codes (e.g. county FIPS "01001") are
    # not turned into integers.
    valid_geo_df = pd.read_csv(file_path, dtype={'geo_id': str})
    # A set gives O(1) membership tests; `in` on the underlying numpy array
    # would rescan every known geo_id for each row of the file under test.
    valid_geos = set(valid_geo_df['geo_id'])

    # Materialize the column once; both checks below iterate over it.
    geo_ids = list(df_to_test['geo_id'])

    unexpected_geos = [geo for geo in geo_ids if geo.lower() not in valid_geos]
    if unexpected_geos:
        self.raised_errors.append(ValidationError(
            ("check_bad_geo_id_value", filename),
            unexpected_geos, "Unrecognized geo_ids (not in historical data)"))
    self.increment_total_checks()

    upper_case_geos = [geo for geo in geo_ids if geo.lower() != geo]
    if upper_case_geos:
        self.raised_warnings.append(ValidationError(
            ("check_geo_id_lowercase", filename),
            upper_case_geos, "geo_id contains uppercase characters. Lowercase is preferred."))
    self.increment_total_checks()
271+
272+
def check_bad_geo_id_format(self, df_to_test, nameformat, geo_type):
273+
"""
274+
Check validity of geo_type and format of geo_ids, according to regex pattern.
250275
251276
Arguments:
252277
- df_to_test: pandas dataframe of CSV source data
@@ -720,8 +745,9 @@ def validate(self, export_dir):
720745
data_df = load_csv(join(export_dir, filename))
721746

722747
self.check_df_format(data_df, filename)
723-
self.check_bad_geo_id(
748+
self.check_bad_geo_id_format(
724749
data_df, filename, match.groupdict()['geo_type'])
750+
self.check_bad_geo_id_value(data_df, filename, match.groupdict()['geo_type'])
725751
self.check_bad_val(data_df, filename, match.groupdict()['signal'])
726752
self.check_bad_se(data_df, filename)
727753
self.check_bad_sample_size(data_df, filename)

validator/params.json.template

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"end_date": "2020-09-08",
55
"span_length": 3,
66
"ref_window_size": 7,
7+
"validator_static_file_dir": "../validator/static",
78
"minimum_sample_size": 100,
89
"missing_se_allowed": true,
910
"missing_sample_size_allowed": true,

validator/scripts/unique_geoids.R

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Regenerate the static reference lists of known geo_ids, one CSV per geo type,
# written to ../static/<geo_type>_geo.csv with a single "geo_id" column.
# The relative output path means this script must be run from its own directory.
library(covidcast)

# Single day of data used as the snapshot of known geo_ids.
reference_day <- "2020-10-01"
static_dir <- "../static/"

# Return the distinct geo values reported for one signal on the reference day.
fetch_geo_ids <- function(data_source, signal, geo_type) {
  dtf <- covidcast_signal(data_source, signal,
                          start_day = reference_day, end_day = reference_day,
                          geo_type = geo_type)
  unique(dtf$geo_value)
}

# Write a vector of geo_ids to <static_dir>/<geo_type>_geo.csv.
# FALSE is spelled out because the shorthand `F` is an ordinary, reassignable variable.
write_geo_ids <- function(geo_ids, geo_type) {
  file_name <- paste0(static_dir, geo_type, "_geo.csv")
  write.table(geo_ids, file = file_name, row.names = FALSE, col.names = "geo_id")
}

for (type in c("county", "state", "hrr", "msa")) {
  write_geo_ids(
    fetch_geo_ids("indicator-combination", "confirmed_7dav_incidence_num", type),
    type
  )
}

# DMA ids come from a different source/signal than the other geo types
# (presumably because the combined indicator is not available at DMA level -- TODO confirm).
write_geo_ids(fetch_geo_ids("ght", "raw_search", "dma"), "dma")

# The national level has exactly one id, so it is written directly.
write_geo_ids("us", "national")

0 commit comments

Comments
 (0)