@@ -113,6 +113,7 @@ def __init__(self, params):
113
113
"""
114
114
# Get user settings from params or if not provided, set default.
115
115
self .data_source = params ['data_source' ]
116
+ self .validator_static_file_dir = params .get ('validator_static_file_dir' , '../validator/static' )
116
117
117
118
# Date/time settings
118
119
self .span_length = timedelta (days = params ['span_length' ])
@@ -244,9 +245,33 @@ def check_df_format(self, df_to_test, nameformat):
244
245
245
246
self .increment_total_checks ()
246
247
247
- def check_bad_geo_id (self , df_to_test , nameformat , geo_type ):
248
def check_bad_geo_id_value(self, df_to_test, filename, geo_type):
    """
    Check for bad geo_id values by comparing to a list of known values (drawn from historical data).

    Two checks are recorded, each counted via `increment_total_checks`:
        - an error listing every geo_id whose lowercased form is not in the known-values file
        - a warning listing every geo_id that contains uppercase characters

    Arguments:
        - df_to_test: pandas dataframe of CSV source data containing the geo_id column to check
        - filename: name of the CSV file being checked; used to label any raised error or warning
        - geo_type: string from CSV name specifying geo type (state, county, msa, etc.) of data;
          selects the reference file `<geo_type>_geo.csv` in `self.validator_static_file_dir`
    """
    file_path = join(self.validator_static_file_dir, geo_type + '_geo.csv')
    # Read geo_id as str so values keep their exact text form (e.g. leading zeros).
    valid_geo_df = pd.read_csv(file_path, dtype={'geo_id': str})
    # Build a set once: membership tests are constant-time instead of a linear
    # scan of the reference array for every row being validated.
    valid_geos = set(valid_geo_df['geo_id'])

    # Materialise the column once; both checks below walk the same values.
    geo_ids = list(df_to_test['geo_id'])

    # Compare case-insensitively; casing is reported separately as a warning.
    unexpected_geos = [geo for geo in geo_ids if geo.lower() not in valid_geos]
    if unexpected_geos:
        self.raised_errors.append(ValidationError(
            ("check_bad_geo_id_value", filename),
            unexpected_geos, "Unrecognized geo_ids (not in historical data)"))
    self.increment_total_checks()

    upper_case_geos = [geo for geo in geo_ids if geo.lower() != geo]
    if upper_case_geos:
        self.raised_warnings.append(ValidationError(
            ("check_geo_id_lowercase", filename),
            upper_case_geos, "geo_id contains uppercase characters. Lowercase is preferred."))
    self.increment_total_checks()
271
+
272
+ def check_bad_geo_id_format (self , df_to_test , nameformat , geo_type ):
273
+ """
274
+ Check validity of geo_type and format of geo_ids, according to regex pattern.
250
275
251
276
Arguments:
252
277
- df_to_test: pandas dataframe of CSV source data
@@ -720,8 +745,9 @@ def validate(self, export_dir):
720
745
data_df = load_csv (join (export_dir , filename ))
721
746
722
747
self .check_df_format (data_df , filename )
723
- self .check_bad_geo_id (
748
+ self .check_bad_geo_id_format (
724
749
data_df , filename , match .groupdict ()['geo_type' ])
750
+ self .check_bad_geo_id_value (data_df , filename , match .groupdict ()['geo_type' ])
725
751
self .check_bad_val (data_df , filename , match .groupdict ()['signal' ])
726
752
self .check_bad_se (data_df , filename )
727
753
self .check_bad_sample_size (data_df , filename )
0 commit comments