diff --git a/Jenkinsfile b/Jenkinsfile index 0c9ac09ae..7dc4cb457 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,6 +45,7 @@ pipeline { } parallel deploy_staging } + sh "jenkins/deploy-staging-api-match-list.sh" } } stage('Deploy production') { diff --git a/_delphi_utils_python/Makefile b/_delphi_utils_python/Makefile index 240142c9d..6db4b759d 100644 --- a/_delphi_utils_python/Makefile +++ b/_delphi_utils_python/Makefile @@ -12,7 +12,8 @@ install: venv lint: . env/bin/activate; \ - pylint $(dir) + pylint $(dir); \ + pydocstyle $(dir) test: . env/bin/activate ;\ diff --git a/_delphi_utils_python/delphi_utils/__init__.py b/_delphi_utils_python/delphi_utils/__init__.py index c2b5ef0d1..c796035b8 100644 --- a/_delphi_utils_python/delphi_utils/__init__.py +++ b/_delphi_utils_python/delphi_utils/__init__.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -"""Common Utility Functions to Support DELPHI Indicators -""" +"""Common Utility Functions to Support DELPHI Indicators.""" from __future__ import absolute_import diff --git a/_delphi_utils_python/delphi_utils/archive.py b/_delphi_utils_python/delphi_utils/archive.py index 42f2ab4f3..a707fc4a1 100644 --- a/_delphi_utils_python/delphi_utils/archive.py +++ b/_delphi_utils_python/delphi_utils/archive.py @@ -1,5 +1,6 @@ """ Utilities for diffing and archiving covidcast export CSVs. + Aims to simplify the creation of issues for new and backfilled value for indicators. Also handles archiving of export CSVs to some backend (git, S3 etc.) before replacing them. @@ -52,6 +53,7 @@ def diff_export_csv( ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ Find differences in exported covidcast CSVs, using geo_id as the index. + Treats NA == NA as True. Parameters @@ -68,7 +70,6 @@ def diff_export_csv( changed_df is the pd.DataFrame of common rows from after_csv with changed values. added_df is the pd.DataFrame of added rows from after_csv. """ - export_csv_dtypes = {"geo_id": str, "val": float, "se": float, "sample_size": float} @@ -99,7 +100,7 @@ def run_module(archive_type: str, cache_dir: str, export_dir: str, **kwargs): - """Builds and runs an ArchiveDiffer. + """Build and run an ArchiveDiffer. Parameters ---------- @@ -132,13 +133,11 @@ def run_module(archive_type: str, class ArchiveDiffer: - """ - Base class for performing diffing and archiving of exported covidcast CSVs - """ + """Base class for performing diffing and archiving of exported covidcast CSVs.""" def __init__(self, cache_dir: str, export_dir: str): """ - Initialize an ArchiveDiffer + Initialize an ArchiveDiffer. Parameters ---------- @@ -157,7 +156,8 @@ def __init__(self, cache_dir: str, export_dir: str): def update_cache(self): """ - For making sure cache_dir is updated correctly from a backend. + Make sure cache_dir is updated correctly from a backend. + To be implemented by specific archiving backends. Should set self._cache_updated = True after verifying cache is updated. """ @@ -165,7 +165,8 @@ def update_cache(self): def diff_exports(self) -> Tuple[Files, FileDiffMap, Files]: """ - Finds diffs across and within CSV files, from cache_dir to export_dir. + Find diffs across and within CSV files, from cache_dir to export_dir. + Should be called after update_cache() succeeds. Only works on *.csv files, ignores every other file. @@ -223,7 +224,8 @@ def diff_exports(self) -> Tuple[Files, FileDiffMap, Files]: def archive_exports(self, exported_files: Files) -> Tuple[Files, Files]: """ - Handles actual archiving of files, depending on specific backend. 
+ Handle actual archiving of files, depending on specific backend. + To be implemented by specific archiving backends. Parameters @@ -241,6 +243,8 @@ def archive_exports(self, exported_files: Files) -> Tuple[Files, Files]: def filter_exports(self, common_diffs: FileDiffMap): """ + Filter export directory to only contain relevant files. + Filters down the export_dir to only contain: 1) New files, 2) Changed files, filtered-down to the ADDED and CHANGED rows only. Should be called after archive_exports() so we archive the raw exports before @@ -269,7 +273,7 @@ def filter_exports(self, common_diffs: FileDiffMap): replace(diff_file, exported_file) def run(self): - """Runs the differ and archives the changed and new files.""" + """Run the differ and archive the changed and new files.""" self.update_cache() # Diff exports, and make incremental versions @@ -293,7 +297,8 @@ def run(self): class S3ArchiveDiffer(ArchiveDiffer): """ - AWS S3 backend for archving + AWS S3 backend for archiving. + Archives CSV files into a S3 bucket, with keys "{indicator_prefix}/{csv_file_name}". Ideally, versioning should be enabled in this bucket to track versions of each CSV file. """ @@ -306,6 +311,7 @@ def __init__( ): """ Initialize a S3ArchiveDiffer. + See this link for possible aws_credentials kwargs: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session @@ -330,9 +336,7 @@ def __init__( self.indicator_prefix = indicator_prefix def update_cache(self): - """ - For making sure cache_dir is updated with all latest files from the S3 bucket. - """ + """Make sure cache_dir is updated with all latest files from the S3 bucket.""" # List all indicator-related objects from S3 archive_objects = self.bucket.objects.filter( Prefix=self.indicator_prefix).all() @@ -358,7 +362,7 @@ def archive_exports(self, # pylint: disable=arguments-differ update_s3: bool = True ) -> Tuple[Files, Files]: """ - Handles actual archiving of files to the S3 bucket. + Handle actual archiving of files to the S3 bucket. Parameters ---------- @@ -398,7 +402,8 @@ def archive_exports(self, # pylint: disable=arguments-differ class GitArchiveDiffer(ArchiveDiffer): """ - Local git repo backend for archiving + Local git repo backend for archiving. + Archives CSV files into a local git repo as commits. Assumes that a git repository is already set up. """ @@ -446,7 +451,8 @@ def __init__( def get_branch(self, branch_name: Optional[str] = None) -> Head: """ - Retrieves a Head object representing a branch of specified name. + Retrieve a Head object representing a branch of specified name. + Creates the branch from the current active branch if does not exist yet. Parameters @@ -469,6 +475,8 @@ def get_branch(self, branch_name: Optional[str] = None) -> Head: @contextmanager def archiving_branch(self): """ + Context manager for checking out a branch. + Useful for checking out self.branch within a context, then switching back to original branch when finished. """ @@ -482,8 +490,9 @@ def archiving_branch(self): def update_cache(self): """ + Check if cache_dir is clean: has everything nicely committed if override_dirty=False. + Since we are using a local git repo, assumes there is nothing to update from. 
- Checks if cache_dir is clean: has everything nice committed if override_dirty=False """ # Make sure cache directory is clean: has everything nicely committed if not self.override_dirty: @@ -495,14 +504,16 @@ def update_cache(self): def diff_exports(self) -> Tuple[Files, FileDiffMap, Files]: """ - Same as base class diff_exports, but in context of specified branch + Find diffs across and within CSV files, from cache_dir to export_dir. + + Same as base class diff_exports, but in context of specified branch. """ with self.archiving_branch(): return super().diff_exports() def archive_exports(self, exported_files: Files) -> Tuple[Files, Files]: """ - Handles actual archiving of files to the local git repo. + Handle actual archiving of files to the local git repo. Parameters ---------- diff --git a/_delphi_utils_python/delphi_utils/geomap.py b/_delphi_utils_python/delphi_utils/geomap.py index 55e70630c..d61a2a823 100644 --- a/_delphi_utils_python/delphi_utils/geomap.py +++ b/_delphi_utils_python/delphi_utils/geomap.py @@ -91,8 +91,9 @@ class GeoMapper: # pylint: disable=too-many-public-methods """ def __init__(self): - """Initialize geomapper. Holds loading the crosswalk tables - until a conversion function is first used. + """Initialize geomapper. + + Holds loading the crosswalk tables until a conversion function is first used. Parameters --------- @@ -110,7 +111,7 @@ def __init__(self): # Utility functions def _load_crosswalk(self, from_code, to_code): - """Loads the crosswalk from from_code -> to_code.""" + """Load the crosswalk from from_code -> to_code.""" stream = pkg_resources.resource_stream( __name__, self.crosswalk_filepaths[from_code][to_code] ) @@ -189,7 +190,7 @@ def _load_crosswalk(self, from_code, to_code): @staticmethod def convert_fips_to_mega(data, fips_col="fips", mega_col="megafips"): - """convert fips string to a megafips string""" + """Convert fips string to a megafips string.""" data = data.copy() data[mega_col] = data[fips_col].astype(str).str.zfill(5) data[mega_col] = data[mega_col].str.slice_replace(start=2, stop=5, repl="000") @@ -205,7 +206,7 @@ def megacounty_creation( date_col="date", mega_col="megafips", ): - """create megacounty column + """Create megacounty column. Parameters --------- @@ -412,8 +413,9 @@ def replace_geocode( def add_population_column(self, data, geocode_type, geocode_col=None, dropna=True): """ - Appends a population column to a dataframe, based on the FIPS or ZIP code. If no - dataframe is provided, the full crosswalk from geocode to population is returned. + Append a population column to a dataframe, based on the FIPS or ZIP code. + + If no dataframe is provided, the full crosswalk from geocode to population is returned. Parameters --------- @@ -464,7 +466,7 @@ def fips_to_megacounty( mega_col="megafips", count_cols=None, ): - """Convert and aggregate from FIPS to megaFIPS + """Convert and aggregate from FIPS to megaFIPS. Parameters --------- diff --git a/_delphi_utils_python/delphi_utils/signal.py b/_delphi_utils_python/delphi_utils/signal.py index 51a3fe74c..298c87bc3 100644 --- a/_delphi_utils_python/delphi_utils/signal.py +++ b/_delphi_utils_python/delphi_utils/signal.py @@ -2,7 +2,8 @@ import covidcast def add_prefix(signal_names, wip_signal, prefix="wip_"): - """Adds prefix to signal if there is a WIP signal + """Add prefix to signal if there is a WIP signal. 
+ Parameters ---------- signal_names: List[str] @@ -18,7 +19,6 @@ def add_prefix(signal_names, wip_signal, prefix="wip_"): List of signal names wip/non wip signals for further computation """ - if wip_signal is True: return [prefix + signal for signal in signal_names] if isinstance(wip_signal, list): @@ -37,7 +37,8 @@ def add_prefix(signal_names, wip_signal, prefix="wip_"): def public_signal(signal): - """Checks if the signal name is already public using COVIDcast + """Check if the signal name is already public using COVIDcast. + Parameters ---------- signal : str diff --git a/_delphi_utils_python/delphi_utils/utils.py b/_delphi_utils_python/delphi_utils/utils.py index 8de61aa37..9baa4f85b 100644 --- a/_delphi_utils_python/delphi_utils/utils.py +++ b/_delphi_utils_python/delphi_utils/utils.py @@ -5,7 +5,7 @@ from shutil import copyfile def read_params(): - """Reads a file named 'params.json' in the current working directory. + """Read a file named 'params.json' in the current working directory. If the file does not exist, it copies the file 'params.json.template' to 'param.json' and then reads the file. diff --git a/ansible/ansible-deploy-staging-api-proxy-match-list.yaml b/ansible/ansible-deploy-staging-api-proxy-match-list.yaml new file mode 100644 index 000000000..f9a39dbd8 --- /dev/null +++ b/ansible/ansible-deploy-staging-api-proxy-match-list.yaml @@ -0,0 +1,14 @@ +--- +- hosts: api_proxy_staging + remote_user: deploy + vars_files: + - vars.yaml + - vault.yaml + tasks: + - name: Set staging api proxy openresty signal match list template. + template: + src: "templates/staging-api-match-list.j2" + dest: "/common/staging-api-match-list" + owner: "deploy" + group: "deploy" + mode: "0777" diff --git a/ansible/inventory b/ansible/inventory index 0237ec33b..424d05c3d 100644 --- a/ansible/inventory +++ b/ansible/inventory @@ -3,3 +3,6 @@ delphi-master-prod-01.delphi.cmu.edu [runtime_host_staging] app-mono-dev-01.delphi.cmu.edu + +[api_proxy_staging] +api-staging.delphi.cmu.edu diff --git a/ansible/templates/changehc-params-prod.json.j2 b/ansible/templates/changehc-params-prod.json.j2 index 6b3e09818..799e82a79 100644 --- a/ansible/templates/changehc-params-prod.json.j2 +++ b/ansible/templates/changehc-params-prod.json.j2 @@ -2,8 +2,14 @@ "static_file_dir": "./static", "export_dir": "/common/covidcast/receiving/chng", "cache_dir": "./cache", - "input_denom_file": null, - "input_covid_file": null, + "input_files": { + "denom": null, + "covid": null, + "flu": null, + "mixed": null, + "flu_like": null, + "covid_like": null + }, "start_date": "2020-02-01", "end_date": null, "drop_date": null, @@ -13,6 +19,7 @@ "parallel": false, "geos": ["state", "msa", "hrr", "county"], "weekday": [true, false], + "types": ["covid","cli"], "wip_signal": "", "aws_credentials": { "aws_access_key_id": "", diff --git a/ansible/templates/nchs_mortality-params-prod.json.j2 b/ansible/templates/nchs_mortality-params-prod.json.j2 new file mode 100644 index 000000000..33ed8afa7 --- /dev/null +++ b/ansible/templates/nchs_mortality-params-prod.json.j2 @@ -0,0 +1,15 @@ +{ + "export_start_date": "2020-02-01", + "static_file_dir": "./static", + "export_dir": "/common/covidcast/receiving/nchs-mortality", + "cache_dir": "./cache", + "daily_export_dir": "./daily_receiving", + "daily_cache_dir": "./daily_cache", + "token": "{{ nchs_mortality_token }}", + "mode":"", + "aws_credentials": { + "aws_access_key_id": "{{ delphi_aws_access_key_id }}", + "aws_secret_access_key": "{{ delphi_aws_secret_access_key }}" + }, + "bucket_name": 
"delphi-covidcast-indicator-output" +} diff --git a/ansible/templates/staging-api-match-list.j2 b/ansible/templates/staging-api-match-list.j2 new file mode 100644 index 000000000..5938e59f9 --- /dev/null +++ b/ansible/templates/staging-api-match-list.j2 @@ -0,0 +1,4 @@ +data_source=quidel-staging&signal=covid_ag_ +data_source=chng +data_source=safegraph +data_source=google-symptoms \ No newline at end of file diff --git a/ansible/vars.yaml b/ansible/vars.yaml index 374990c92..67059ccc1 100644 --- a/ansible/vars.yaml +++ b/ansible/vars.yaml @@ -21,3 +21,4 @@ changehc_sftp_host: "{{ vault_changehc_sftp_host }}" changehc_sftp_port: "{{ vault_changehc_sftp_port }}" changehc_sftp_user: "{{ vault_changehc_sftp_user }}" changehc_sftp_password: "{{ vault_changehc_sftp_password }}" +nchs_mortality_token: "{{ vault_nchs_mortality_token }}" diff --git a/ansible/vault.yaml b/ansible/vault.yaml index 74899f4ab..70dba079b 100644 --- a/ansible/vault.yaml +++ b/ansible/vault.yaml @@ -1,42 +1,46 @@ $ANSIBLE_VAULT;1.1;AES256 -66636361333161346433333963373239653963623663663037323339313139303634646361373037 -3031383130623131653561656465333831613164396563310a613735366261303931663033663031 -30643562663566393861616263633338333965383361623236616339663237653733333936366539 -3465666462363634390a356463663863333536313765613065353336626633623264633132356634 -38623763306165306333663139646465313038303134333461666364656231383965393364666266 -39613864386337323032323033623166623265333033396130336633366431333337646263363963 -65666461346164643164313838373766316165643635663238326638626136333462373035306161 -65383662653061633939383039316562646431343436303838656237323465633936373463336335 -39663938303130383665336335303534613135656562326134643730666661356439393838383832 -65336562363865626665616438633238353065383735363136363931353161653036643231623939 -39303635373861343036663265343636333066616336663338376438663130373438636364626335 -32313638633262616664636337323063323166383636356132326339643631613138376266343838 -64653636383663626535653334393730393534616165326436396431636237323535393262646233 -33313266643962626665353738356262393435313065623863633235373561393231313533383662 -65316435386562633763386435643666643930336233383266633033323530633232656337623635 -66616435366532663166303766393939346635666136336635373834613834366532313638616464 -32313765353362346266323938393561333039663063306538316133343765623632313231613733 -31353364316662323332383964326338386164363062353636336636316233636466313838626263 -66323435346434663864396262623636316434626631643463653337653063623733323233623731 -66636564633438633731346162653561326532616165313866646531376436616266303764666237 -31636430623564396432346230666565326137613364363064363963356332346234303865303937 -37343863653939323139333533633230323465653962343332306464376138396137626566373063 -38626231643961383466376561666338663236613638333831663633376464656432333063363164 -34346566636232313734646137303531643536613537373664643663623263653831656136323239 -34353034633038316263386636376234393563393466643133383866656134646463666433336234 -32323665323634323635386430316238656366376561653365613934303262343938303833323464 -32353062363132376131326466393266313662376330663831323136313262333066656261353234 -35353966333733333639646130383235333738643833376166396564313666633135613938323236 -35643261376537663338323834363734343735366131313133623433323138313631653639323438 -61626631656665643939666437303932336437656234626639333138353466613463343032326639 
-63313432666364306137343131656630326263616135343034303836396435383330626463373733 -37303434363735616339353661316461333535656131323631633731353539316264643735633862 -31363638353435303963333864393733396232643931613866636463653638646236356563333938 -61323134326131353264313264663661333261323533333537323361653636326636653137306330 -36643164643438346436613538353835656132366163383865333237383038353839626439346632 -66663132313836353766383031383732383838666261653230613336323631346533613361636331 -61326535646438643732373130646366653738343236396561623666396231316330383266623839 -64666138373032353136666234383434343162323761623061326233363532396132623237306362 -35636535313336636566393532303337333630393165616264653466376566323533323737343534 -64343831653131306138383263663933396338373736663565373739373330336432326666386230 -31616338336265333832613331623339656231643838353261373037636435396531 +37636562333839646464626131323661363864633662636462316134616636623935663965313232 +3062303232396637656534333137346334373139353933640a336538663339646139623838646330 +65643539623265333465316433616333656233356333393033363030633237623638303665316531 +3539333532643530390a363438636262646661633166393334356339396638323066313331373138 +35663566656137653162383564616534333762316332353862653030633138646238353734323537 +31643831313133373865623631626363343434303632333431393030363534343437653636333138 +37313263616535316166643838623966353135623531646632373432326634613830353835616662 +37333966363561623337663839626136616134323763316436373466336532306161663332646365 +30366238633332636336363734306336666431306530383665633465613131623736306161363131 +66663837616637643233633533303630316530376661333337303731333530646236666332646366 +30316134643462323733336137326134386563653766643836326335623530656330306662383534 +31353032373736643337663263643461373261383662366631313235633837383838383237636332 +32343266363666316163306335386236363534616661373662666534386634323964376361396431 +66313364303464363038623531343135393237306466303933306434366137363564393137346361 +35356363633439386364393032306435343166343933376534376436383637333235613236643564 +65383439663834363035393864336630646361383063303632653739623639333665663739623232 +66643562396538653666656664393139323732653734636632633230623762613233656233613537 +63333063626434356436613864613839343633646130323063306137633863366662333935643063 +36343734613535623936376139633261656165353633336163623134663236316563376234656234 +30313936623733366261633337613137633161613834656263343432396330303737313138303565 +31323035306666343937633063663834313835613730373266663339663061353333336164326233 +65636235333935333535366565636563633637333831393238326237663065343731303838336239 +64653664323561323035393735343635336563336139623365326664646135623238383864613437 +37323665376339396437333338636535316336353036343266366462643339373535646166633965 +64306166626266353237306336336335643131373239346637663165303163316331336237623961 +61303333386338303430313564663762323233663036663931313764386430303833643162653766 +64393434643763366662643565376332643166373930643962376234313333336239663863643065 +62363131613530356631653265386239626462313437643235363833633862396136613133313765 +38363864313835633963653437303839656634373333613662323131343363313235656239613562 +37313365653966623364633131663630643432393734653263343836333036396136636431313736 +37633234333033326233643636366338613765656466633461663731393830393838636665323336 +61643039643738393537306366333961623962373633386130373637383133636366653336303738 
+31653466666334393064346162386662663464623730663466313966356437613666363735336630 +31656333346530336231313931363966653531613239643135613061306131343230623934353436 +30353866336534663137393530346232646630373865363636393566663165616564373532656666 +39353838373435386262343039363263336562613163393234613239303034656162636336643937 +33613138373261356562613434343534643937653531313939316565623539386536333338313931 +32643136326338396662336637343934353763646134346532633434653837656232616633646637 +33363464623339316435396562373464636537343763313431303632653638666535653133643364 +35303766346330643731353064363436663536313236613734306234376233393465383535663437 +61356234343332633964363637373133396532306432656263313261313466653733336232333230 +65333062396361363363323763366234303566313335643565653137346661393561323265646130 +36613565356563623361303062396563633135383366353962363334376537653466386665373536 +64613435346237393263356262623262646365613765333266393034353061626461613435633534 +6662 diff --git a/changehc/Makefile b/changehc/Makefile index 56a71a88c..968732f99 100644 --- a/changehc/Makefile +++ b/changehc/Makefile @@ -13,7 +13,8 @@ install: venv lint: . env/bin/activate; \ - pylint $(dir) + pylint $(dir); \ + pydocstyle $(dir) test: . env/bin/activate ;\ diff --git a/changehc/delphi_changehc/config.py b/changehc/delphi_changehc/config.py index 07f3dde8b..17b459b9e 100644 --- a/changehc/delphi_changehc/config.py +++ b/changehc/delphi_changehc/config.py @@ -9,8 +9,7 @@ class Config: - """Static configuration variables. - """ + """Static configuration variables.""" ## dates FIRST_DATA_DATE = datetime(2020, 1, 1) @@ -25,15 +24,29 @@ class Config: ## data columns COVID_COL = "COVID" DENOM_COL = "Denominator" - COUNT_COLS = ["COVID"] + ["Denominator"] + FLU_COL = "Flu" + MIXED_COL = "Mixed" + FLU_LIKE_COL = "Flu-like" + COVID_LIKE_COL = "Covid-like" + COUNT_COLS = [COVID_COL,DENOM_COL,FLU_COL,MIXED_COL,FLU_LIKE_COL,COVID_LIKE_COL] DATE_COL = "date" GEO_COL = "fips" ID_COLS = [DATE_COL] + [GEO_COL] FILT_COLS = ID_COLS + COUNT_COLS + DENOM_COLS = [GEO_COL, DATE_COL, DENOM_COL] COVID_COLS = [GEO_COL, DATE_COL, COVID_COL] - DENOM_DTYPES = {"date": str, "Denominator": str, "fips": str} - COVID_DTYPES = {"date": str, "COVID": str, "fips": str} + FLU_COLS = [GEO_COL, DATE_COL, FLU_COL] + MIXED_COLS = [GEO_COL, DATE_COL, MIXED_COL] + FLU_LIKE_COLS = [GEO_COL, DATE_COL, FLU_LIKE_COL] + COVID_LIKE_COLS = [GEO_COL, DATE_COL, COVID_LIKE_COL] + + DENOM_DTYPES = {DATE_COL: str, DENOM_COL: str, GEO_COL: str} + COVID_DTYPES = {DATE_COL: str, COVID_COL: str, GEO_COL: str} + FLU_DTYPES = {DATE_COL: str, FLU_COL: str, GEO_COL: str} + MIXED_DTYPES = {DATE_COL: str, MIXED_COL: str, GEO_COL: str} + FLU_LIKE_DTYPES = {DATE_COL: str, FLU_LIKE_COL: str, GEO_COL: str} + COVID_LIKE_DTYPES = {DATE_COL: str, COVID_LIKE_COL: str, GEO_COL: str} SMOOTHER_BANDWIDTH = 100 # bandwidth for the linear left Gaussian filter MIN_DEN = 100 # number of total visits needed to produce a sensor @@ -45,9 +58,11 @@ class Config: class Constants: """ - Contains the maximum number of geo units for each geo type + Contains the maximum number of geo units for each geo type. 
+ Used for sanity checks """ + # number of counties in usa, including megacounties NUM_COUNTIES = 3141 + 52 NUM_HRRS = 308 diff --git a/changehc/delphi_changehc/constants.py b/changehc/delphi_changehc/constants.py index e60662522..ab2790a86 100644 --- a/changehc/delphi_changehc/constants.py +++ b/changehc/delphi_changehc/constants.py @@ -1,7 +1,9 @@ -"""Registry for signal names and geo types""" +"""Registry for signal names and geo types.""" SMOOTHED = "smoothed_outpatient_covid" SMOOTHED_ADJ = "smoothed_adj_outpatient_covid" -SIGNALS = [SMOOTHED, SMOOTHED_ADJ] +SMOOTHED_CLI = "smoothed_outpatient_cli" +SMOOTHED_ADJ_CLI = "smoothed_adj_outpatient_cli" +SIGNALS = [SMOOTHED, SMOOTHED_ADJ, SMOOTHED_CLI, SMOOTHED_ADJ_CLI] NA = "NA" HRR = "hrr" FIPS = "fips" diff --git a/changehc/delphi_changehc/download_ftp_files.py b/changehc/delphi_changehc/download_ftp_files.py index 576492615..75627ce81 100644 --- a/changehc/delphi_changehc/download_ftp_files.py +++ b/changehc/delphi_changehc/download_ftp_files.py @@ -1,5 +1,4 @@ -""" -Downloads files modified in the last 24 hours from the specified ftp server.""" +"""Download files modified in the last 24 hours from the specified ftp server.""" # standard import datetime @@ -11,19 +10,19 @@ def print_callback(filename, bytes_so_far, bytes_total): - """Log file transfer progress""" + """Log file transfer progress.""" rough_percent_transferred = int(100 * (bytes_so_far / bytes_total)) if (rough_percent_transferred % 25) == 0: print(f'{filename} transfer: {rough_percent_transferred}%') def get_files_from_dir(sftp, out_path): - """Download files from sftp server that have been uploaded in last day + """Download files from sftp server that have been uploaded in last day. + Args: sftp: SFTP Session from Paramiko client out_path: Path to local directory into which to download the files """ - current_time = datetime.datetime.now() # go through files in recieving dir @@ -44,13 +43,13 @@ def get_files_from_dir(sftp, out_path): sftp.get(infile, outfile, callback=callback_for_filename) -def download(out_path, ftp_conn): - """Downloads files necessary to create CHC signal from ftp server. +def download_covid(out_path, ftp_conn): + """Download files necessary to create chng-covid signal from ftp server. + Args: out_path: Path to local directory into which to download the files ftp_conn: Dict containing login credentials to ftp server """ - # open client try: client = paramiko.SSHClient() @@ -71,3 +70,41 @@ def download(out_path, ftp_conn): finally: if client: client.close() + + +def download_cli(out_path, ftp_conn): + """Download files necessary to create chng-cli signal from ftp server. 
+ + Args: + out_path: Path to local directory into which to download the files + ftp_conn: Dict containing login credentials to ftp server + """ + # open client + try: + client = paramiko.SSHClient() + client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + + client.connect(ftp_conn["host"], username=ftp_conn["user"], + password=ftp_conn["pass"], + port=ftp_conn["port"], + allow_agent=False, look_for_keys=False) + sftp = client.open_sftp() + + sftp.chdir('/dailycounts/All_Outpatients_By_County') + get_files_from_dir(sftp, out_path) + + sftp.chdir('/dailycounts/Flu_Patient_Count_By_County') + get_files_from_dir(sftp, out_path) + + sftp.chdir('/dailycounts/Mixed_Patient_Count_By_County') + get_files_from_dir(sftp, out_path) + + sftp.chdir('/dailycounts/Flu_Like_Patient_Count_By_County') + get_files_from_dir(sftp, out_path) + + sftp.chdir('/dailycounts/Covid_Like_Patient_Count_By_County') + get_files_from_dir(sftp, out_path) + + finally: + if client: + client.close() diff --git a/changehc/delphi_changehc/load_data.py b/changehc/delphi_changehc/load_data.py index bf5509beb..3b6a539db 100644 --- a/changehc/delphi_changehc/load_data.py +++ b/changehc/delphi_changehc/load_data.py @@ -12,116 +12,113 @@ from .config import Config -def load_denom_data(denom_filepath, dropdate, base_geo): - """Load in and set up denominator data. +def load_chng_data(filepath, dropdate, base_geo, + col_names, col_types, counts_col): + """Load in and set up daily count data from Change. Args: - denom_filepath: path to the aggregated denominator data + filepath: path to aggregated data dropdate: data drop date (datetime object) base_geo: base geographic unit before aggregation ('fips') + col_names: column names of data + col_types: column types of data + counts_col: name of column containing counts Returns: - cleaned denominator dataframe + cleaned dataframe """ assert base_geo == "fips", "base unit must be 'fips'" - - denom_suffix = denom_filepath.split("/")[-1].split(".")[0][9:] - assert denom_suffix == "All_Outpatients_By_County" - denom_filetype = denom_filepath.split("/")[-1].split(".")[1] - assert denom_filetype == "dat" - - denom_data = pd.read_csv( - denom_filepath, + count_flag = False + date_flag = False + geo_flag = False + for n in col_names: + if n == counts_col: + count_flag = True + elif n == Config.DATE_COL: + date_flag = True + elif n == "fips": + geo_flag = True + assert count_flag, "counts_col must be present in col_names" + assert date_flag, "'%s' must be present in col_names"%(Config.DATE_COL) + assert geo_flag, "'fips' must be present in col_names" + + data = pd.read_csv( + filepath, sep="|", header=None, - names=Config.DENOM_COLS, - dtype=Config.DENOM_DTYPES, + names=col_names, + dtype=col_types, ) - denom_data[Config.DATE_COL] = \ - pd.to_datetime(denom_data[Config.DATE_COL],errors="coerce") + data[Config.DATE_COL] = \ + pd.to_datetime(data[Config.DATE_COL],errors="coerce") # restrict to start and end date - denom_data = denom_data[ - (denom_data[Config.DATE_COL] >= Config.FIRST_DATA_DATE) & - (denom_data[Config.DATE_COL] < dropdate) + data = data[ + (data[Config.DATE_COL] >= Config.FIRST_DATA_DATE) & + (data[Config.DATE_COL] < dropdate) ] # counts between 1 and 3 are coded as "3 or less", we convert to 1 - denom_data[Config.DENOM_COL][ - denom_data[Config.DENOM_COL] == "3 or less" + data[counts_col][ + data[counts_col] == "3 or less" ] = "1" - denom_data[Config.DENOM_COL] = denom_data[Config.DENOM_COL].astype(int) + data[counts_col] = data[counts_col].astype(int) assert ( - 
(denom_data[Config.DENOM_COL] >= 0).all().all() - ), "Denominator counts must be nonnegative" + (data[counts_col] >= 0).all().all() + ), "Counts must be nonnegative" # aggregate age groups (so data is unique by date and base geography) - denom_data = denom_data.groupby([base_geo, Config.DATE_COL]).sum() - denom_data.dropna(inplace=True) # drop rows with any missing entries + data = data.groupby([base_geo, Config.DATE_COL]).sum() + data.dropna(inplace=True) # drop rows with any missing entries + + return data - return denom_data -def load_covid_data(covid_filepath, dropdate, base_geo): - """Load in and set up denominator data. +def load_combined_data(denom_filepath, covid_filepath, dropdate, base_geo): + """Load in denominator and covid data, and combine them. Args: + denom_filepath: path to the aggregated denominator data covid_filepath: path to the aggregated covid data dropdate: data drop date (datetime object) base_geo: base geographic unit before aggregation ('fips') Returns: - cleaned denominator dataframe + combined multiindexed dataframe, index 0 is geo_base, index 1 is date """ assert base_geo == "fips", "base unit must be 'fips'" - covid_suffix = covid_filepath.split("/")[-1].split(".")[0][9:] - assert covid_suffix == "Covid_Outpatients_By_County" - covid_filetype = covid_filepath.split("/")[-1].split(".")[1] - assert covid_filetype == "dat" - - covid_data = pd.read_csv( - covid_filepath, - sep="|", - header=None, - names=Config.COVID_COLS, - dtype=Config.COVID_DTYPES, - parse_dates=[Config.DATE_COL] - ) - - covid_data[Config.DATE_COL] = \ - pd.to_datetime(covid_data[Config.DATE_COL],errors="coerce") - - # restrict to start and end date - covid_data = covid_data[ - (covid_data[Config.DATE_COL] >= Config.FIRST_DATA_DATE) & - (covid_data[Config.DATE_COL] < dropdate) - ] - - # counts between 1 and 3 are coded as "3 or less", we convert to 1 - covid_data[Config.COVID_COL][ - covid_data[Config.COVID_COL] == "3 or less" - ] = "1" - covid_data[Config.COVID_COL] = covid_data[Config.COVID_COL].astype(int) + # load each data stream + denom_data = load_chng_data(denom_filepath, dropdate, base_geo, + Config.DENOM_COLS, Config.DENOM_DTYPES, Config.DENOM_COL) + covid_data = load_chng_data(covid_filepath, dropdate, base_geo, + Config.COVID_COLS, Config.COVID_DTYPES, Config.COVID_COL) - assert ( - (covid_data[Config.COVID_COL] >= 0).all().all() - ), "COVID counts must be nonnegative" + # merge data + data = denom_data.merge(covid_data, how="outer", left_index=True, right_index=True) + assert data.isna().all(axis=1).sum() == 0, "entire row is NA after merge" - # aggregate age groups (so data is unique by date and base geography) - covid_data = covid_data.groupby([base_geo, Config.DATE_COL]).sum() - covid_data.dropna(inplace=True) # drop rows with any missing entries + # calculate combined numerator and denominator + data.fillna(0, inplace=True) + data["num"] = data[Config.COVID_COL] + data["den"] = data[Config.DENOM_COL] + data = data[["num", "den"]] - return covid_data + return data -def load_combined_data(denom_filepath, covid_filepath, dropdate, base_geo): - """Load in denominator and covid data, and combine them. +def load_cli_data(denom_filepath, flu_filepath, mixed_filepath, flu_like_filepath, + covid_like_filepath, dropdate, base_geo): + """Load in denominator and covid-like data, and combine them. 
Args: denom_filepath: path to the aggregated denominator data - covid_filepath: path to the aggregated covid data + flu_filepath: path to the aggregated flu data + mixed_filepath: path to the aggregated mixed data + flu_like_filepath: path to the aggregated flu-like data + covid_like_filepath: path to the aggregated covid-like data dropdate: data drop date (datetime object) base_geo: base geographic unit before aggregation ('fips') @@ -131,16 +128,29 @@ def load_combined_data(denom_filepath, covid_filepath, dropdate, base_geo): assert base_geo == "fips", "base unit must be 'fips'" # load each data stream - denom_data = load_denom_data(denom_filepath, dropdate, base_geo) - covid_data = load_covid_data(covid_filepath, dropdate, base_geo) + denom_data = load_chng_data(denom_filepath, dropdate, base_geo, + Config.DENOM_COLS, Config.DENOM_DTYPES, Config.DENOM_COL) + flu_data = load_chng_data(flu_filepath, dropdate, base_geo, + Config.FLU_COLS, Config.FLU_DTYPES, Config.FLU_COL) + mixed_data = load_chng_data(mixed_filepath, dropdate, base_geo, + Config.MIXED_COLS, Config.MIXED_DTYPES, Config.MIXED_COL) + flu_like_data = load_chng_data(flu_like_filepath, dropdate, base_geo, + Config.FLU_LIKE_COLS, Config.FLU_LIKE_DTYPES, Config.FLU_LIKE_COL) + covid_like_data = load_chng_data(covid_like_filepath, dropdate, base_geo, + Config.COVID_LIKE_COLS, Config.COVID_LIKE_DTYPES, Config.COVID_LIKE_COL) # merge data - data = denom_data.merge(covid_data, how="outer", left_index=True, right_index=True) + data = denom_data.merge(flu_data, how="outer", left_index=True, right_index=True) + data = data.merge(mixed_data, how="outer", left_index=True, right_index=True) + data = data.merge(flu_like_data, how="outer", left_index=True, right_index=True) + data = data.merge(covid_like_data, how="outer", left_index=True, right_index=True) assert data.isna().all(axis=1).sum() == 0, "entire row is NA after merge" # calculate combined numerator and denominator data.fillna(0, inplace=True) - data["num"] = data[Config.COVID_COL] + data["num"] = -data[Config.FLU_COL] + data[Config.MIXED_COL] + data[Config.FLU_LIKE_COL] + data["num"] = data["num"].clip(lower=0) + data["num"] = data["num"] + data[Config.COVID_LIKE_COL] data["den"] = data[Config.DENOM_COL] data = data[["num", "den"]] diff --git a/changehc/delphi_changehc/run.py b/changehc/delphi_changehc/run.py index 168602f56..9b9de34ea 100644 --- a/changehc/delphi_changehc/run.py +++ b/changehc/delphi_changehc/run.py @@ -14,44 +14,85 @@ from delphi_utils import read_params # first party -from .download_ftp_files import download +from .download_ftp_files import download_covid, download_cli +from .load_data import load_combined_data, load_cli_data from .update_sensor import CHCSensorUpdator +def retrieve_files(params, filedate): + """Return filenames of relevant files, downloading them if necessary.""" + files = params["input_files"] + if files["denom"] is None: + + ## download recent files from FTP server + logging.info("downloading recent files through SFTP") + if "covid" in params["types"]: + download_covid(params["cache_dir"], params["ftp_conn"]) + if "cli" in params["types"]: + download_cli(params["cache_dir"], params["ftp_conn"]) + + denom_file = "%s/%s_All_Outpatients_By_County.dat.gz" % (params["cache_dir"],filedate) + covid_file = "%s/%s_Covid_Outpatients_By_County.dat.gz" % (params["cache_dir"],filedate) + flu_file = "%s/%s_Flu_Patient_Count_By_County.dat.gz" % (params["cache_dir"],filedate) + mixed_file = "%s/%s_Mixed_Patient_Count_By_County.dat.gz" % 
(params["cache_dir"],filedate) + flu_like_file = "%s/%s_Flu_Like_Patient_Count_By_County.dat.gz" % (params["cache_dir"],filedate) + covid_like_file = "%s/%s_Covid_Like_Patient_Count_By_County.dat.gz" % (params["cache_dir"],filedate) + else: + denom_file = files["denom"] + covid_file = files["covid"] + flu_file = files["flu"] + mixed_file = files["mixed"] + flu_like_file = files["flu_like"] + covid_like_file = files["covid_like"] + + file_dict = {"denom": denom_file} + if "covid" in params["types"]: + file_dict["covid"] = covid_file + if "cli" in params["types"]: + file_dict["flu"] = flu_file + file_dict["mixed"] = mixed_file + file_dict["flu_like"] = flu_like_file + file_dict["covid_like"] = covid_like_file + return file_dict + + +def make_asserts(params): + """Assert that for each type, filenames are either all present or all absent.""" + files = params["input_files"] + if "covid" in params["types"]: + assert (files["denom"] is None) == (files["covid"] is None), \ + "exactly one of denom and covid files are provided" + if "cli" in params["types"]: + if files["denom"] is None: + assert files["flu"] is None and \ + files["mixed"] is None and \ + files["flu_like"] is None and \ + files["covid_like"] is None,\ + "files must be all present or all absent" + else: + assert files["flu"] is not None and \ + files["mixed"] is not None and \ + files["flu_like"] is not None and \ + files["covid_like"] is not None,\ + "files must be all present or all absent" -def run_module(): - """Run the delphi_changehc module. - """ +def run_module(): + """Run the delphi_changehc module.""" params = read_params() logging.basicConfig(level=logging.DEBUG) - # the filenames are expected to be in the format: - # Denominator: "YYYYMMDD_All_Outpatients_By_County.dat.gz" - # Numerator: "YYYYMMDD_Covid_Outpatients_By_County.dat.gz" - - assert (params["input_denom_file"] is None) == (params["input_covid_file"] is None), \ - "exactly one of denom and covid files are provided" + make_asserts(params) if params["drop_date"] is None: - # files are dropped about 8pm the day after the issue date - dropdate_dt = (datetime.now() - timedelta(days=1,hours=20)) + # files are dropped about 4pm the day after the issue date + dropdate_dt = (datetime.now() - timedelta(days=1,hours=16)) dropdate_dt = dropdate_dt.replace(hour=0,minute=0,second=0,microsecond=0) else: dropdate_dt = datetime.strptime(params["drop_date"], "%Y-%m-%d") filedate = dropdate_dt.strftime("%Y%m%d") - if params["input_denom_file"] is None: - - ## download recent files from FTP server - logging.info("downloading recent files through SFTP") - download(params["cache_dir"], params["ftp_conn"]) - - input_denom_file = "%s/%s_All_Outpatients_By_County.dat.gz" % (params["cache_dir"],filedate) - input_covid_file = "%s/%s_Covid_Outpatients_By_County.dat.gz" % (params["cache_dir"],filedate) - else: - input_denom_file = params["input_denom_file"] - input_covid_file = params["input_covid_file"] + file_dict = retrieve_files(params, filedate) dropdate = str(dropdate_dt.date()) @@ -80,29 +121,37 @@ def run_module(): logging.info("outpath:\t\t%s", params["export_dir"]) logging.info("parallel:\t\t%s", params["parallel"]) logging.info("weekday:\t\t%s", params["weekday"]) + logging.info("types:\t\t%s", params["types"]) logging.info("se:\t\t\t%s", params["se"]) ## start generating for geo in params["geos"]: - for weekday in params["weekday"]: - if weekday: - logging.info("starting %s, weekday adj", geo) - else: - logging.info("starting %s, no adj", geo) - su_inst = CHCSensorUpdator( - 
startdate, - enddate, - dropdate, - geo, - params["parallel"], - weekday, - params["se"] - ) - su_inst.update_sensor( - input_denom_file, - input_covid_file, - params["export_dir"] - ) - logging.info("finished %s", geo) + for numtype in params["types"]: + for weekday in params["weekday"]: + if weekday: + logging.info("starting %s, %s, weekday adj", geo, numtype) + else: + logging.info("starting %s, %s, no adj", geo, numtype) + su_inst = CHCSensorUpdator( + startdate, + enddate, + dropdate, + geo, + params["parallel"], + weekday, + numtype, + params["se"] + ) + if numtype == "covid": + data = load_combined_data(file_dict["denom"], + file_dict["covid"],dropdate_dt,"fips") + elif numtype == "cli": + data = load_cli_data(file_dict["denom"],file_dict["flu"],file_dict["mixed"], + file_dict["flu_like"],file_dict["covid_like"],dropdate_dt,"fips") + su_inst.update_sensor( + data, + params["export_dir"] + ) + logging.info("finished %s", geo) logging.info("finished all") diff --git a/changehc/delphi_changehc/sensor.py b/changehc/delphi_changehc/sensor.py index 949e12ede..32c23f66b 100644 --- a/changehc/delphi_changehc/sensor.py +++ b/changehc/delphi_changehc/sensor.py @@ -20,19 +20,19 @@ class CHCSensor: - """Sensor class to fit a signal using Covid counts from Change HC outpatient data. - """ + """Sensor class to fit a signal using Covid counts from Change HC outpatient data.""" + smoother = Smoother("savgol", poly_fit_degree=1, gaussian_bandwidth=Config.SMOOTHER_BANDWIDTH) @staticmethod def gauss_smooth(count,total): - """smooth using the left_gauss_linear + """Smooth using the left_gauss_linear. Args: count, total: array - """ + """ count_smooth = CHCSensor.smoother.smooth(count) total_smooth = CHCSensor.smoother.smooth(total) total_clip = np.clip(total_smooth, 0, None) @@ -46,12 +46,12 @@ def backfill( k=Config.MAX_BACKFILL_WINDOW, min_visits_to_fill=Config.MIN_CUM_VISITS): """ - Adjust for backfill (retroactively added observations) by using a - variable length smoother, which starts from the RHS and moves - leftwards (backwards through time). We cumulatively sum the total - visits (denominator), until we have observed some minimum number of - counts, then calculate the sum over that bin. We restrict the - bin size so to avoid inluding long-past values. + Adjust for retroactively added observations (backfill) by using a variable length smoother. + + The smoother starts from the RHS and moves leftwards (backwards through time). + We cumulatively sum the total visits (denominator), until we have observed some minimum number of + counts, then calculate the sum over that bin. We restrict the + bin size so to avoid including long-past values. Args: num: array of covid counts diff --git a/changehc/delphi_changehc/update_sensor.py b/changehc/delphi_changehc/update_sensor.py index 58d3fe2d8..2489b88da 100644 --- a/changehc/delphi_changehc/update_sensor.py +++ b/changehc/delphi_changehc/update_sensor.py @@ -1,5 +1,6 @@ """ Generate CHC sensors. + Author: Aaron Rumack Created: 2020-10-14 """ @@ -14,14 +15,14 @@ # first party from .config import Config, Constants -from .constants import SIGNALS, SMOOTHED, SMOOTHED_ADJ, NA -from .load_data import load_combined_data +from .constants import SMOOTHED, SMOOTHED_ADJ, SMOOTHED_CLI, SMOOTHED_ADJ_CLI, NA from .sensor import CHCSensor from .weekday import Weekday def write_to_csv(output_dict, write_se, out_name, output_path="."): """Write sensor values to csv. 
+ Args: output_dict: dictionary containing sensor rates, se, unique dates, and unique geo_id write_se: boolean to write out standard errors, if true, use an obfuscated name @@ -73,8 +74,7 @@ def write_to_csv(output_dict, write_se, out_name, output_path="."): class CHCSensorUpdator: # pylint: disable=too-many-instance-attributes - """Contains methods to update sensor and write results to csv - """ + """Contains methods to update sensor and write results to csv.""" def __init__(self, startdate, @@ -83,8 +83,10 @@ def __init__(self, geo, parallel, weekday, + numtype, se): - """Init Sensor Updator + """Init Sensor Updator. + Args: startdate: first sensor date (YYYY-mm-dd) enddate: last sensor date (YYYY-mm-dd) @@ -92,6 +94,7 @@ def __init__(self, geo: geographic resolution, one of ["county", "state", "msa", "hrr"] parallel: boolean to run the sensor update in parallel weekday: boolean to adjust for weekday effects + numtype: type of count data used, one of ["covid", "cli"] se: boolean to write out standard errors, if true, use an obfuscated name """ self.startdate, self.enddate, self.dropdate = [ @@ -103,11 +106,13 @@ def __init__(self, assert self.enddate <= self.dropdate, "end date > drop date" assert geo in ['county', 'state', 'msa', 'hrr'],\ f"{geo} is invalid, pick one of 'county', 'state', 'msa', 'hrr'" - self.geo, self.parallel, self.weekday, self.se = geo.lower(), parallel, weekday, se + self.geo, self.parallel, self.weekday, self.numtype, self.se = geo.lower(), parallel, weekday, numtype, se # output file naming - signals = SIGNALS.copy() - signals.remove(SMOOTHED if self.weekday else SMOOTHED_ADJ) + if self.numtype == "covid": + signals = [SMOOTHED_ADJ if self.weekday else SMOOTHED] + elif self.numtype == "cli": + signals = [SMOOTHED_ADJ_CLI if self.weekday else SMOOTHED_CLI] signal_names = add_prefix( signals, wip_signal=read_params()["wip_signal"]) @@ -120,8 +125,7 @@ def __init__(self, self.sensor_dates = None def shift_dates(self): - """shift estimates forward to account for time lag, compute burnindates, sensordates - """ + """Shift estimates forward to account for time lag, compute burnindates, sensordates.""" drange = lambda s, e: pd.date_range(start=s,periods=(e-s).days,freq='D') self.startdate = self.startdate - Config.DAY_SHIFT self.burnindate = self.startdate - Config.BURN_IN_PERIOD @@ -131,7 +135,8 @@ def shift_dates(self): return True def geo_reindex(self, data): - """Reindex based on geography, include all date, geo pairs + """Reindex based on geography, include all date, geo pairs. + Args: data: dataframe, the output of loadcombineddata Returns: @@ -171,15 +176,13 @@ def geo_reindex(self, data): return data_frame - def update_sensor(self, - denom_filepath, - covid_filepath, + data, outpath): """Generate sensor values, and write to csv format. 
+ Args: - denom_filepath: path to the aggregated denominator data - covid_filepath: path to the aggregated covid data + data: pd.DataFrame with columns num and den outpath: output path for the csv results """ self.shift_dates() @@ -187,9 +190,6 @@ def update_sensor(self, (self.burn_in_dates <= self.enddate) # load data - base_geo = "fips" - data = load_combined_data(denom_filepath, covid_filepath, self.dropdate, base_geo) - data.reset_index(inplace=True) data_frame = self.geo_reindex(data) # handle if we need to adjust by weekday diff --git a/changehc/delphi_changehc/weekday.py b/changehc/delphi_changehc/weekday.py index 03b69cba4..e02997482 100644 --- a/changehc/delphi_changehc/weekday.py +++ b/changehc/delphi_changehc/weekday.py @@ -55,7 +55,6 @@ def get_params(data): Return a matrix of parameters: the entire vector of betas, for each time series column in the data. """ - tmp = data.reset_index() denoms = tmp.groupby(Config.DATE_COL).sum()["den"] nums = tmp.groupby(Config.DATE_COL).sum()["num"] @@ -113,7 +112,6 @@ def calc_adjustment(params, sub_data): -- this has the same effect. """ - tmp = sub_data.reset_index() wd_correction = np.zeros((len(tmp["num"]))) diff --git a/changehc/params.json.template b/changehc/params.json.template index 54467564b..569b60849 100644 --- a/changehc/params.json.template +++ b/changehc/params.json.template @@ -2,8 +2,14 @@ "static_file_dir": "./static", "export_dir": "./receiving", "cache_dir": "./cache", - "input_denom_file": null, - "input_covid_file": null, + "input_files": { + "denom": null, + "covid": null, + "flu": null, + "mixed": null, + "flu_like": null, + "covid_like": null + }, "start_date": "2020-02-01", "end_date": null, "drop_date": null, @@ -13,6 +19,7 @@ "parallel": false, "geos": ["state", "msa", "hrr", "county"], "weekday": [true, false], + "types": ["covid","cli"], "wip_signal": "", "aws_credentials": { "aws_access_key_id": "", diff --git a/changehc/tests/test_load_data.py b/changehc/tests/test_load_data.py index 3f09d9ec2..52b119e59 100644 --- a/changehc/tests/test_load_data.py +++ b/changehc/tests/test_load_data.py @@ -18,23 +18,21 @@ class TestLoadData: - denom_data = load_denom_data(DENOM_FILEPATH, DROP_DATE, "fips") - covid_data = load_covid_data(COVID_FILEPATH, DROP_DATE, "fips") + denom_data = load_chng_data(DENOM_FILEPATH, DROP_DATE, "fips", + Config.DENOM_COLS, Config.DENOM_DTYPES, Config.DENOM_COL) + covid_data = load_chng_data(COVID_FILEPATH, DROP_DATE, "fips", + Config.COVID_COLS, Config.COVID_DTYPES, Config.COVID_COL) combined_data = load_combined_data(DENOM_FILEPATH, COVID_FILEPATH, DROP_DATE, "fips") def test_base_unit(self): with pytest.raises(AssertionError): - load_denom_data(DENOM_FILEPATH, DROP_DATE, "foo") + load_chng_data(DENOM_FILEPATH, DROP_DATE, "foo", + Config.DENOM_COLS, Config.DENOM_DTYPES, Config.DENOM_COL) with pytest.raises(AssertionError): - load_denom_data("test_data/20200101_foo.dat", DROP_DATE, "fips") - - with pytest.raises(AssertionError): - load_covid_data(COVID_FILEPATH, DROP_DATE, "foo") - - with pytest.raises(AssertionError): - load_covid_data("test_data/20200101_foo.dat", DROP_DATE, "fips") + load_chng_data(DENOM_FILEPATH, DROP_DATE, "fips", + Config.DENOM_COLS, Config.DENOM_DTYPES, Config.COVID_COL) with pytest.raises(AssertionError): load_combined_data(DENOM_FILEPATH, COVID_FILEPATH, DROP_DATE, "foo") diff --git a/changehc/tests/test_update_sensor.py b/changehc/tests/test_update_sensor.py index bee2789fa..642274b96 100644 --- a/changehc/tests/test_update_sensor.py +++ 
b/changehc/tests/test_update_sensor.py @@ -31,6 +31,7 @@ class TestCHCSensorUpdator: geo = "county" parallel = False weekday = False + numtype = "covid" se = False prefix = "foo" small_test_data = pd.DataFrame({ @@ -48,6 +49,7 @@ def test_shift_dates(self): self.geo, self.parallel, self.weekday, + self.numtype, self.se ) ## Test init @@ -69,6 +71,7 @@ def test_geo_reindex(self): 'county', self.parallel, self.weekday, + self.numtype, self.se ) su_inst.shift_dates() @@ -87,6 +90,7 @@ def test_update_sensor(self): geo, self.parallel, self.weekday, + self.numtype, self.se ) @@ -97,8 +101,7 @@ def test_update_sensor(self): s3_client = Session(**aws_credentials).client("s3") s3_client.create_bucket(Bucket=params["bucket_name"]) su_inst.update_sensor( - DENOM_FILEPATH, - COVID_FILEPATH, + self.small_test_data, td.name) assert len(os.listdir(td.name)) == len(su_inst.sensor_dates),\ diff --git a/jenkins/deploy-staging-api-match-list.sh b/jenkins/deploy-staging-api-match-list.sh new file mode 100755 index 000000000..435b9bdf6 --- /dev/null +++ b/jenkins/deploy-staging-api-match-list.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +# +# Jenkins deploy staging api match list +# + +set -eo pipefail +source ~/.bash_profile + +# +# Deploy +# + +cd "${WORKSPACE}/ansible" || exit + +# Ansible! +ansible-playbook ansible-deploy-staging-api-proxy-match-list.yaml -i inventory diff --git a/nchs_mortality/delphi_nchs_mortality/archive_diffs.py b/nchs_mortality/delphi_nchs_mortality/archive_diffs.py index 725925c8d..1c73d5aa6 100644 --- a/nchs_mortality/delphi_nchs_mortality/archive_diffs.py +++ b/nchs_mortality/delphi_nchs_mortality/archive_diffs.py @@ -79,7 +79,8 @@ def arch_diffs(params, daily_arch_diff): remove(exported_file) for exported_file, diff_file in common_diffs.items(): remove(exported_file) - remove(diff_file) + if diff_file is not None: + remove(diff_file) # Report failures: someone should probably look at them for exported_file in fails: diff --git a/nchs_mortality/tests/conftest.py b/nchs_mortality/tests/conftest.py index 03be344e2..b489e0cfa 100644 --- a/nchs_mortality/tests/conftest.py +++ b/nchs_mortality/tests/conftest.py @@ -7,6 +7,7 @@ from os import listdir, remove from os.path import join +from shutil import copy from delphi_utils import read_params from delphi_nchs_mortality.run import run_module @@ -27,6 +28,9 @@ def run_as_module(date): if ".csv" in fname: remove(join("daily_cache", fname)) + # Simulate the cache already being partially populated + copy("test_data/weekly_202025_state_wip_deaths_covid_incidence_prop.csv", "daily_cache") + for fname in listdir("daily_receiving"): if ".csv" in fname: remove(join("daily_receiving", fname)) diff --git a/nchs_mortality/tests/test_data/weekly_202025_state_wip_deaths_covid_incidence_prop.csv b/nchs_mortality/tests/test_data/weekly_202025_state_wip_deaths_covid_incidence_prop.csv new file mode 100644 index 000000000..8d511176b --- /dev/null +++ b/nchs_mortality/tests/test_data/weekly_202025_state_wip_deaths_covid_incidence_prop.csv @@ -0,0 +1,45 @@ +geo_id,val,se,sample_size +al,1.79475177,, +ak,0.00000000,, +az,2.80269167,, +ar,1.49115052,, +ca,1.14648064,, +co,0.65986703,, +ct,1.23412225,, +dc,3.96741605,, +fl,1.01500452,, +ga,0.88533724,, +hi,0.00000000,, +id,0.00000000,, +il,2.08336278,, +in,1.63393378,, +ia,1.01424057,, +ks,0.68650341,, +ky,0.58195844,, +la,1.82843120,, +md,2.21645868,, +ma,1.88610726,, +mi,0.57075014,, +mn,1.18802079,, +ms,2.40543506,, +mo,0.68432575,, +ne,1.18899426,, +nv,0.64931773,, +nh,0.95608552,, +nj,1.81261604,, 
+nm,1.19227653,, +ny,1.10519612,, +nc,0.95346300,, +oh,0.89827275,, +ok,0.45489340,, +or,0.28451276,, +pa,1.42165408,, +ri,3.11508541,, +sc,1.10707256,, +sd,1.37854660,, +tn,0.86394050,, +tx,1.03462971,, +ut,0.43668694,, +va,1.01712904,, +wa,0.40709699,, +wi,0.63547307,, diff --git a/quidel/Makefile b/quidel/Makefile index 56a71a88c..968732f99 100644 --- a/quidel/Makefile +++ b/quidel/Makefile @@ -13,7 +13,8 @@ install: venv lint: . env/bin/activate; \ - pylint $(dir) + pylint $(dir); \ + pydocstyle $(dir) test: . env/bin/activate ;\ diff --git a/quidel/delphi_quidel/constants.py b/quidel/delphi_quidel/constants.py index e660d4f8e..6a905c945 100644 --- a/quidel/delphi_quidel/constants.py +++ b/quidel/delphi_quidel/constants.py @@ -1,4 +1,4 @@ -"""Registry for constants""" +"""Registry for constants.""" # global constants MIN_OBS = 50 # minimum number of observations in order to compute a proportion. MAX_BORROW_OBS = 20 # maximum number of observations can be borrowed in geographical pooling diff --git a/quidel/delphi_quidel/data_tools.py b/quidel/delphi_quidel/data_tools.py index fbbce4de7..9ada778ef 100644 --- a/quidel/delphi_quidel/data_tools.py +++ b/quidel/delphi_quidel/data_tools.py @@ -1,17 +1,20 @@ -""" -Functions to calculate the quidel sensor statistic. -""" +"""Functions to calculate the quidel sensor statistic.""" import numpy as np import pandas as pd def _prop_var(p, n): - """var(X/n) = 1/(n^2)var(X) = (npq)/(n^2) = pq/n""" + """ + Calculate variance of proportion. + + var(X/n) = 1/(n^2)var(X) = (npq)/(n^2) = pq/n + """ return p * (1 - p) / n def fill_dates(y_data, first_date, last_date): """ Ensure all dates are listed in the data, otherwise, add days with 0 counts. + Args: y_data: dataframe with datetime index first_date: datetime.datetime @@ -36,8 +39,9 @@ def fill_dates(y_data, first_date, last_date): def _slide_window_sum(arr, k): """ - Sliding window sum, with fixed window size k. For indices 0:k, we - DO compute a sum, using whatever points are available. + Sliding window sum, with fixed window size k. + + For indices 0:k, we DO compute a sum, using whatever points are available. Reference: https://stackoverflow.com/a/38507725 @@ -51,7 +55,6 @@ def _slide_window_sum(arr, k): sarr: np.ndarray Array of same length of arr, holding the sliding window sum. """ - if not isinstance(k, int): raise ValueError('k must be int.') temp = np.append(np.zeros(k - 1), arr) @@ -61,12 +64,11 @@ def _slide_window_sum(arr, k): def _geographical_pooling(tpooled_tests, tpooled_ptests, min_obs, max_borrow_obs): """ - Calculates the proportion of parent samples (tests) that must be "borrowed" - in order to properly compute the statistic. If there are no samples - available in the parent, the borrow_prop is 0. If the parent does not + Calculate proportion of parent samples (tests) that must be "borrowed" in order to compute the statistic. + + If there are no samples available in the parent, the borrow_prop is 0. If the parent does not have enough samples, we return a borrow_prop of 1, and the fact that the - pooled samples are insufficient are handled in the statistic fitting - step. + pooled samples are insufficient are handled in the statistic fitting step. Args: tpooled_tests: np.ndarray[float] @@ -117,8 +119,7 @@ def _geographical_pooling(tpooled_tests, tpooled_ptests, min_obs, max_borrow_obs def raw_positive_prop(positives, tests, min_obs): """ - Calculates the proportion of positive tests for a single geographic - location, without any temporal smoothing. 
+ Calculate the proportion of positive tests for a single geographic location, without any temporal smoothing. If on any day t, tests[t] < min_obs, then we report np.nan. @@ -171,8 +172,7 @@ def raw_positive_prop(positives, tests, min_obs): def smoothed_positive_prop(positives, tests, min_obs, max_borrow_obs, pool_days, parent_positives=None, parent_tests=None): """ - Calculates the proportion of negative tests for a single geographic - location, with temporal smoothing. + Calculate the proportion of negative tests for a single geographic location, with temporal smoothing. For a given day t, if sum(tests[(t-pool_days+1):(t+1)]) < min_obs, then we 'borrow' min_obs - sum(tests[(t-pool_days+1):(t+1)]) observations from the @@ -219,7 +219,6 @@ def smoothed_positive_prop(positives, tests, min_obs, max_borrow_obs, pool_days, np.ndarray Effective sample size (after temporal and geographic pooling). """ - positives = positives.astype(float) tests = tests.astype(float) if (parent_positives is None) or (parent_tests is None): @@ -264,9 +263,8 @@ def smoothed_positive_prop(positives, tests, min_obs, max_borrow_obs, pool_days, def raw_tests_per_device(devices, tests, min_obs): - ''' - Calculates the tests per device for a single geographic - location, without any temporal smoothing. + """ + Calculate the tests per device for a single geographic location, without any temporal smoothing. If on any day t, tests[t] < min_obs, then we report np.nan. The second and third returned np.ndarray are the standard errors, @@ -289,7 +287,7 @@ def raw_tests_per_device(devices, tests, min_obs): Placeholder for standard errors np.ndarray Sample size used to compute estimates. - ''' + """ devices = devices.astype(float) tests = tests.astype(float) if (np.any(np.isnan(devices)) or np.any(np.isnan(tests))): @@ -309,8 +307,8 @@ def raw_tests_per_device(devices, tests, min_obs): def smoothed_tests_per_device(devices, tests, min_obs, max_borrow_obs, pool_days, parent_devices=None, parent_tests=None): """ - Calculates the ratio of tests per device for a single geographic - location, with temporal smoothing. + Calculate the ratio of tests per device for a single geographic location, with temporal smoothing. + For a given day t, if sum(tests[(t-pool_days+1):(t+1)]) < min_obs, then we 'borrow' min_obs - sum(tests[(t-pool_days+1):(t+1)]) observations from the parents over the same timespan. Importantly, it will make sure NOT to diff --git a/quidel/delphi_quidel/generate_sensor.py b/quidel/delphi_quidel/generate_sensor.py index 7558f81c6..43778c9b4 100644 --- a/quidel/delphi_quidel/generate_sensor.py +++ b/quidel/delphi_quidel/generate_sensor.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- -""" -Functions to help generate sensor for different geographical levels -""" +"""Functions to help generate sensor for different geographical levels.""" import pandas as pd from .data_tools import (fill_dates, raw_positive_prop, smoothed_positive_prop, @@ -11,7 +9,8 @@ def generate_sensor_for_states(state_groups, smooth, device, first_date, last_date): """ - fit over states + Fit over states. + Args: state_groups: pd.groupby.generic.DataFrameGroupBy state_key: "state_id" @@ -70,7 +69,8 @@ def generate_sensor_for_states(state_groups, smooth, device, first_date, last_da def generate_sensor_for_other_geores(state_groups, data, res_key, smooth, device, first_date, last_date): """ - fit over counties/HRRs/MSAs + Fit over counties/HRRs/MSAs. 
+ Args: data: pd.DataFrame res_key: "fips", "cbsa_id" or "hrrnum" diff --git a/quidel/delphi_quidel/geo_maps.py b/quidel/delphi_quidel/geo_maps.py index f868e2748..03f4f61cf 100644 --- a/quidel/delphi_quidel/geo_maps.py +++ b/quidel/delphi_quidel/geo_maps.py @@ -1,4 +1,5 @@ """Contains geographic mapping tools.""" + def geo_map(geo_res, data, map_df): """Call appropriate mapping function based on desired geo resolution.""" if geo_res == "county": @@ -11,6 +12,7 @@ def geo_map(geo_res, data, map_df): def zip_to_msa(data, map_df): """Map from zipcode to MSA (along with parent state). + Args: data: dataframe at the day-zip resolution. Returns: @@ -35,6 +37,7 @@ def zip_to_msa(data, map_df): def zip_to_hrr(data, map_df): """Map from zipcode to HRR (along with parent state). + Args: data: dataframe at the day-zip resolution. Returns: @@ -59,6 +62,7 @@ def zip_to_hrr(data, map_df): def zip_to_county(data, map_df): """Aggregate zip codes to the county resolution, along with its parent state. + Args: data: dataframe aggregated to the day-zip resolution Returns: @@ -74,6 +78,7 @@ def zip_to_county(data, map_df): def zip_to_state(data, map_df): """Aggregate zip codes to the state resolution. + Args: data: dataframe aggregated to the day-zip resolution Returns: diff --git a/quidel/delphi_quidel/pull.py b/quidel/delphi_quidel/pull.py index 22fc3b6d0..18304d012 100644 --- a/quidel/delphi_quidel/pull.py +++ b/quidel/delphi_quidel/pull.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- """Simply downloads email attachments. + Uses this handy package: https://pypi.org/project/imap-tools/ """ import io @@ -26,6 +27,7 @@ def compare_dates(date1, date2, flag): """ Compare two dates. + If op == "l" return the larger date If op == "s" return the smaller date """ @@ -38,9 +40,7 @@ def compare_dates(date1, date2, flag): return date1 def check_whether_date_in_range(search_date, start_date, end_date): - """ - Check whether the search date is in a valid time range - """ + """Check whether the search date is in a valid time range.""" if search_date > end_date: return False if search_date < start_date: @@ -48,10 +48,7 @@ def check_whether_date_in_range(search_date, start_date, end_date): return True def read_historical_data(): - """ - Read historical flu antigen test data stored in - midas /common/quidel-historical-raw - """ + """Read historical flu antigen test data stored in midas /common/quidel-historical-raw.""" pull_dir = "/common/quidel-historical-raw" columns = ['SofiaSerNum', 'TestDate', 'Facility', 'ZipCode', 'FluA', 'FluB', 'StorageDate'] @@ -65,9 +62,9 @@ def read_historical_data(): def regulate_column_names(df, test_type): """ - Regulate column names for flu_ag test data since Quidel changed their - column names multiple times. We want to finalize the column name list - to be: + Regulate column names for flu_ag test data since Quidel changed their column names multiple times. + + We want to finalize the column name list to be: ['SofiaSerNum', 'TestDate', 'Facility', 'Zip', 'FluA', 'FluB', 'StorageDate'] """ @@ -87,7 +84,7 @@ def regulate_column_names(df, test_type): def get_from_email(column_names, start_dates, end_dates, mail_server, account, sender, password): """ - Get raw data from email account + Get raw data from email account. 
Parameters: start_date: datetime.datetime @@ -145,9 +142,7 @@ def get_from_email(column_names, start_dates, end_dates, mail_server, return dfs, time_flag def fix_zipcode(df): - """ - Fix zipcode that is 9 digit instead of 5 digit - """ + """Fix zipcode that is 9 digit instead of 5 digit.""" zipcode5 = [] fixnum = 0 for zipcode in df['Zip'].values: @@ -163,6 +158,8 @@ def fix_zipcode(df): def fix_date(df): """ + Remove invalid dates and select correct test date to use. + Quidel antigen tests are labeled with Test Date and Storage Date. In principle, the TestDate should reflect when the test was performed and the StorageDate when the test was logged in the MyVirena cloud storage device. We expect @@ -190,6 +187,7 @@ def preprocess_new_data(start_dates, end_dates, mail_server, account, sender, password, test_mode): """ Pull and pre-process Quidel Antigen Test data from datadrop email. + Drop unnecessary columns. Temporarily consider the positive rate sensor only which is related to number of total tests and number of positive tests. @@ -285,7 +283,7 @@ def preprocess_new_data(start_dates, end_dates, mail_server, account, def check_intermediate_file(cache_dir, pull_start_dates): """ - Check whether there is a cache file containing historical data already + Check whether there is a cache file containing historical data already. Parameters: cache_dir: str @@ -313,8 +311,7 @@ def check_intermediate_file(cache_dir, pull_start_dates): def pull_quidel_data(params): """ - Pull the quidel test data. Decide whether to combine the newly - received data with stored historical records in ./cache + Pull the quidel test data and decide whether to combine the new data with stored historical records in ./cache. Parameters: params: dict @@ -371,7 +368,8 @@ def pull_quidel_data(params): def check_export_end_date(input_export_end_dates, _end_date, end_from_today_minus): """ - Update the export_end_date according to the data received + Update the export_end_date according to the data received. + By default, set the export end date to be the last pulling date - 5 days (END_FROM_TODAY_MINUS = 5). Otherwise, use the required date if it is earlier than the default one. @@ -404,8 +402,7 @@ def check_export_end_date(input_export_end_dates, _end_date, def check_export_start_date(export_start_dates, export_end_dates, export_day_range): """ - Update the export_start_date according to the export_end_date so that it - could be export_end_date - EXPORT_DAY_RANGE + Update export_start_date according to the export_end_date so that it could be export_end_date - EXPORT_DAY_RANGE. Parameters: export_start_date: dict @@ -438,7 +435,7 @@ def check_export_start_date(export_start_dates, export_end_dates, def update_cache_file(dfs, _end_date, cache_dir): """ - Update cache file. Remove the old one, export the new one + Update cache file. Remove the old one, export the new one. Parameter: df: pd.DataFrame diff --git a/quidel_covidtest/Makefile b/quidel_covidtest/Makefile index 56a71a88c..968732f99 100644 --- a/quidel_covidtest/Makefile +++ b/quidel_covidtest/Makefile @@ -13,7 +13,8 @@ install: venv lint: . env/bin/activate; \ - pylint $(dir) + pylint $(dir); \ + pydocstyle $(dir) test: . 
env/bin/activate ;\ diff --git a/quidel_covidtest/delphi_quidel_covidtest/constants.py b/quidel_covidtest/delphi_quidel_covidtest/constants.py index 71c403cb5..8e35fff30 100644 --- a/quidel_covidtest/delphi_quidel_covidtest/constants.py +++ b/quidel_covidtest/delphi_quidel_covidtest/constants.py @@ -1,4 +1,4 @@ -"""Registry for constants""" +"""Registry for constants.""" # global constants MIN_OBS = 50 # minimum number of observations in order to compute a proportion. POOL_DAYS = 7 # number of days in the past (including today) to pool over diff --git a/quidel_covidtest/delphi_quidel_covidtest/data_tools.py b/quidel_covidtest/delphi_quidel_covidtest/data_tools.py index d3ee3768f..e0f987c05 100644 --- a/quidel_covidtest/delphi_quidel_covidtest/data_tools.py +++ b/quidel_covidtest/delphi_quidel_covidtest/data_tools.py @@ -1,23 +1,26 @@ -""" -Functions to calculate the quidel sensor statistic. -""" +"""Functions to calculate the quidel sensor statistic.""" import numpy as np import pandas as pd def remove_null_samples(df): - """Removes entries in a data frame whose sample sizes are null.""" + """Remove entries in a data frame whose sample sizes are null.""" return df[df["sample_size"].notnull()] def _prop_var(p, n): - """var(X/n) = 1/(n^2)var(X) = (npq)/(n^2) = pq/n""" + """ + Calculate variance of proportion. + + var(X/n) = 1/(n^2)var(X) = (npq)/(n^2) = pq/n + """ return p * (1 - p) / n def fill_dates(y_data, first_date, last_date): """ Ensure all dates are listed in the data, otherwise, add days with 0 counts. + Args: y_data: dataframe with datetime index first_date: datetime.datetime @@ -42,8 +45,9 @@ def fill_dates(y_data, first_date, last_date): def _slide_window_sum(arr, k): """ - Sliding window sum, with fixed window size k. For indices 0:k, we - DO compute a sum, using whatever points are available. + Sliding window sum, with fixed window size k. + + For indices 0:k, we DO compute a sum, using whatever points are available. Reference: https://stackoverflow.com/a/38507725 @@ -57,7 +61,6 @@ def _slide_window_sum(arr, k): sarr: np.ndarray Array of same length of arr, holding the sliding window sum. """ - if not isinstance(k, int): raise ValueError('k must be int.') temp = np.append(np.zeros(k - 1), arr) @@ -67,12 +70,11 @@ def _slide_window_sum(arr, k): def _geographical_pooling(tpooled_tests, tpooled_ptests, min_obs): """ - Calculates the proportion of parent samples (tests) that must be "borrowed" - in order to properly compute the statistic. If there are no samples - available in the parent, the borrow_prop is 0. If the parent does not + Calculate proportion of parent samples (tests) that must be "borrowed" in order to compute the statistic. + + If there are no samples available in the parent, the borrow_prop is 0. If the parent does not have enough samples, we return a borrow_prop of 1, and the fact that the - pooled samples are insufficient are handled in the statistic fitting - step. + pooled samples are insufficient are handled in the statistic fitting step. Args: tpooled_tests: np.ndarray[float] @@ -115,8 +117,7 @@ def _geographical_pooling(tpooled_tests, tpooled_ptests, min_obs): def raw_positive_prop(positives, tests, min_obs): """ - Calculates the proportion of positive tests for a single geographic - location, without any temporal smoothing. + Calculate the proportion of positive tests for a single geographic location, without any temporal smoothing. If on any day t, tests[t] < min_obs, then we report np.nan. 
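A minimal standalone sketch of the padded cumulative-sum idea behind _slide_window_sum (NumPy only; the function name and exact padding below are illustrative, not the indicator's implementation):

import numpy as np

def sliding_window_sum_sketch(arr, k):
    # Pad with k - 1 leading zeros so the first k - 1 outputs are partial-window sums.
    temp = np.append(np.zeros(k - 1), np.asarray(arr, dtype=float))
    csum = np.cumsum(temp)
    # Each window total is a difference of cumulative sums, computed in O(n).
    return csum[k - 1:] - np.concatenate(([0.0], csum[:-k]))

# Example: sliding_window_sum_sketch([1, 2, 3, 4], 3) -> array([1., 3., 6., 9.])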
@@ -169,8 +170,7 @@ def raw_positive_prop(positives, tests, min_obs): def smoothed_positive_prop(positives, tests, min_obs, pool_days, parent_positives=None, parent_tests=None): """ - Calculates the proportion of negative tests for a single geographic - location, with temporal smoothing. + Calculate the proportion of negative tests for a single geographic location, with temporal smoothing. For a given day t, if sum(tests[(t-pool_days+1):(t+1)]) < min_obs, then we 'borrow' min_obs - sum(tests[(t-pool_days+1):(t+1)]) observations from the @@ -215,7 +215,6 @@ def smoothed_positive_prop(positives, tests, min_obs, pool_days, np.ndarray Effective sample size (after temporal and geographic pooling). """ - positives = positives.astype(float) tests = tests.astype(float) if (parent_positives is None) or (parent_tests is None): @@ -259,9 +258,8 @@ def smoothed_positive_prop(positives, tests, min_obs, pool_days, def raw_tests_per_device(devices, tests, min_obs): - ''' - Calculates the tests per device for a single geographic - location, without any temporal smoothing. + """ + Calculate the tests per device for a single geographic location, without any temporal smoothing. If on any day t, tests[t] < min_obs, then we report np.nan. The second and third returned np.ndarray are the standard errors, @@ -284,7 +282,7 @@ def raw_tests_per_device(devices, tests, min_obs): Placeholder for standard errors np.ndarray Sample size used to compute estimates. - ''' + """ devices = devices.astype(float) tests = tests.astype(float) if (np.any(np.isnan(devices)) or np.any(np.isnan(tests))): @@ -304,8 +302,8 @@ def raw_tests_per_device(devices, tests, min_obs): def smoothed_tests_per_device(devices, tests, min_obs, pool_days, parent_devices=None, parent_tests=None): """ - Calculates the ratio of tests per device for a single geographic - location, with temporal smoothing. + Calculate the ratio of tests per device for a single geographic location, with temporal smoothing. + For a given day t, if sum(tests[(t-pool_days+1):(t+1)]) < min_obs, then we 'borrow' min_obs - sum(tests[(t-pool_days+1):(t+1)]) observations from the parents over the same timespan. Importantly, it will make sure NOT to diff --git a/quidel_covidtest/delphi_quidel_covidtest/generate_sensor.py b/quidel_covidtest/delphi_quidel_covidtest/generate_sensor.py index eb8a7cff7..efd5f95a8 100644 --- a/quidel_covidtest/delphi_quidel_covidtest/generate_sensor.py +++ b/quidel_covidtest/delphi_quidel_covidtest/generate_sensor.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- -""" -Functions to help generate sensor for different geographical levels -""" +"""Functions to help generate sensor for different geographical levels.""" import pandas as pd from .data_tools import (fill_dates, raw_positive_prop, smoothed_positive_prop, @@ -13,7 +11,8 @@ def generate_sensor_for_states(state_groups, smooth, device, first_date, last_date): """ - fit over states + Fit over states. + Args: state_groups: pd.groupby.generic.DataFrameGroupBy state_key: "state_id" @@ -70,7 +69,8 @@ def generate_sensor_for_states(state_groups, smooth, device, first_date, last_da def generate_sensor_for_other_geores(state_groups, data, res_key, smooth, device, first_date, last_date): """ - fit over counties/HRRs/MSAs + Fit over counties/HRRs/MSAs. 
+ Args: data: pd.DataFrame res_key: "fips", "cbsa_id" or "hrrnum" diff --git a/quidel_covidtest/delphi_quidel_covidtest/geo_maps.py b/quidel_covidtest/delphi_quidel_covidtest/geo_maps.py index 6573175f5..939383a8a 100644 --- a/quidel_covidtest/delphi_quidel_covidtest/geo_maps.py +++ b/quidel_covidtest/delphi_quidel_covidtest/geo_maps.py @@ -13,9 +13,7 @@ def geo_map(geo_res, df): - """ - Map a geocode to a new value. - """ + """Map a geocode to a new value.""" data = df.copy() geo_key = GEO_KEY_DICT[geo_res] # Add population for each zipcode @@ -32,6 +30,8 @@ def geo_map(geo_res, df): def add_parent_state(data, geo_res, geo_key): """ + Add parent state column to DataFrame. + - map from msa/hrr to state, going by the state with the largest population (since a msa/hrr may span multiple states) - map from county to the corresponding state diff --git a/quidel_covidtest/delphi_quidel_covidtest/pull.py b/quidel_covidtest/delphi_quidel_covidtest/pull.py index d98b7caa1..f7ff5cad2 100644 --- a/quidel_covidtest/delphi_quidel_covidtest/pull.py +++ b/quidel_covidtest/delphi_quidel_covidtest/pull.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- """Simply downloads email attachments. + Uses this handy package: https://pypi.org/project/imap-tools/ """ import io @@ -15,7 +16,8 @@ def get_from_email(start_date, end_date, mail_server, account, sender, password): """ - Get raw data from email account + Get raw data from email account. + Args: start_date: datetime.datetime pull data from email received from the start date @@ -56,9 +58,7 @@ return df, time_flag def fix_zipcode(df): - """ - Fix zipcode that is 9 digit instead of 5 digit - """ + """Fix zipcode that is 9 digit instead of 5 digit.""" zipcode5 = [] fixnum = 0 for zipcode in df['Zip'].values: @@ -74,6 +74,8 @@ def fix_date(df): """ + Remove invalid dates and select correct test date to use. + Quidel Covid Test are labeled with Test Date and Storage Date. In principle, the TestDate should reflect when the test was performed and the StorageDate when the test was logged in the MyVirena cloud storage device. We expect @@ -101,6 +103,7 @@ def preprocess_new_data(start_date, end_date, mail_server, account, sender, password, test_mode): """ Pull and pre-process Quidel Covid Test data from datadrop email. + Drop unnecessary columns. Temporarily consider the positive rate sensor only which is related to number of total tests and number of positive tests. @@ -173,9 +176,7 @@ return df_merged, time_flag def check_intermediate_file(cache_dir, pull_start_date): - """ - Check whether there is a cache file containing historical data already - """ + """Check whether there is a cache file containing historical data already.""" for filename in os.listdir(cache_dir): if ".csv" in filename: pull_start_date = datetime.strptime(filename.split("_")[2].split(".")[0], @@ -187,8 +188,7 @@ def pull_quidel_covidtest(params): """ - Pull the quidel covid test data. Decide whether to combine the newly - received data with stored historical records in ./cache + Pull the quidel covid test data and decide whether to combine the new data with stored historical records in ./cache.
Parameters: params: dict @@ -240,7 +240,8 @@ def pull_quidel_covidtest(params): def check_export_end_date(input_export_end_date, _end_date, end_from_today_minus): """ - Update the export_end_date according to the data received + Update the export_end_date according to the data received. + By default, set the export end date to be the last pulling date - 5 days (end_from_today_minus = 5). Otherwise, use the required date if it is earlier than the default one. @@ -267,8 +268,7 @@ def check_export_end_date(input_export_end_date, _end_date, def check_export_start_date(export_start_date, export_end_date, export_day_range): """ - Update the export_start_date according to the export_end_date so that it - could be export_end_date - export_day_range + Update export_start_date according to the export_end_date so that it could be export_end_date - export_day_range. Parameters: export_start_date: str @@ -296,7 +296,7 @@ def check_export_start_date(export_start_date, export_end_date, def update_cache_file(df, _end_date, cache_dir): """ - Update cache file. Remove the old one, export the new one + Update cache file. Remove the old one, export the new one. Parameter: df: pd.DataFrame diff --git a/sir_complainsalot/params.json.template b/sir_complainsalot/params.json.template index 7fadcd413..71d7d9082 100644 --- a/sir_complainsalot/params.json.template +++ b/sir_complainsalot/params.json.template @@ -1,40 +1,53 @@ { - "channel": "#covid-19-outages", + "channel": "#sir-complains-a-lot", "slack_token": "", "sources": { "doctor-visits": { "max_age": 5, - "maintainers": ["U010VE2T51N"] + "maintainers": ["U01AP8GSWG3","U01069KCRS7"] }, "hospital-admissions": { "max_age": 5, - "maintainers": ["U010VE2T51N"], - "retired-signals": ["smoothed_covid19", "smoothed_adj_covid19"] + "maintainers": ["U01AP8GSWG3","U01069KCRS7"], + "retired-signals": ["smoothed_covid19","smoothed_adj_covid19"] + }, + "chng": { + "max_age": 6, + "maintainers": ["U01AP8GSWG3","U01069KCRS7"] + }, + "google-symptoms": { + "max_age": 11, + "maintainers": ["U01AP8GSWG3","U01069KCRS7"] }, "ght": { "max_age": 5, - "maintainers": ["U010VE2T51N"] - }, - "jhu-csse": { - "max_age": 2, - "maintainers": ["UUCGWMJ5P"] + "maintainers": ["U01AP8GSWG3","U01069KCRS7"] }, "usa-facts": { + "max_age": 3, + "maintainers": ["U01AP8GSWG3","U01069KCRS7"] + }, + "jhu-csse": { "max_age": 2, - "maintainers": ["UUCGWMJ5P"] + "maintainers": ["U01AP8GSWG3","U01069KCRS7"] }, "safegraph": { - "max_age": 4, - "maintainers": ["U010VE2T51N"] + "max_age": 11, + "maintainers": ["U01AP8GSWG3","U01069KCRS7"] }, "fb-survey": { - "max_age": 2, + "max_age": 3, "maintainers": ["U01069KCRS7"] }, "indicator-combination": { - "max_age": 2, - "maintainers": ["U010VE2T51N"], + "max_age": 3, + "maintainers": ["U01AP8GSWG3","U01069KCRS7"], "retired-signals": ["nmf_day_doc_fbs_ght"] + }, + "quidel": { + "max_age":6, + "maintainers": ["U01AP8GSWG3","U01069KCRS7"], + "retired-signals": ["raw_pct_negative","smoothed_pct_negative","raw_tests_per_device","smoothed_tests_per_device"] } } } diff --git a/validator/.pylintrc b/validator/.pylintrc new file mode 100644 index 000000000..58c6edbba --- /dev/null +++ b/validator/.pylintrc @@ -0,0 +1,22 @@ + +[MESSAGES CONTROL] + +disable=logging-format-interpolation, + too-many-locals, + too-many-arguments, + # Allow pytest functions to be part of a class. + no-self-use, + # Allow pytest classes to have one test. + too-few-public-methods + +[BASIC] + +# Allow arbitrarily short-named variables. 
+variable-rgx=[a-z_][a-z0-9_]* +argument-rgx=[a-z_][a-z0-9_]* +attr-rgx=[a-z_][a-z0-9_]* + +[DESIGN] + +# Don't complain about pytest "unused" arguments. +ignored-argument-names=(_.*|run_as_module) diff --git a/validator/PLANS.md b/validator/PLANS.md index 395494a97..d24adb22f 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -25,6 +25,7 @@ * Outliers in cases and deaths signals using [this method](https://github.com/cmu-delphi/covidcast-forecast/tree/dev/corrections/data_corrections) * Source data for specified date range is empty * API data for specified date range is empty +* Duplicate rows ## Current features @@ -39,7 +40,6 @@ ### Starter/small issues -* Check for duplicate rows * Backfill problems, especially with JHU and USA Facts, where a change to old data results in a datapoint that doesn’t agree with surrounding data ([JHU examples](https://delphi-org.slack.com/archives/CF9G83ZJ9/p1600729151013900)) or is very different from the value it replaced. If date is already in the API, have any values changed significantly within the "backfill" window (use span_length setting). See [this](https://github.com/cmu-delphi/covidcast-indicators/pull/155#discussion_r504195207) for context. * Run check_missing_date_files (or similar) on every geo type-signal type separately in comparative checks loop. diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index a5f80e3bb..f9f6dfeba 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -4,19 +4,79 @@ """ import re +import threading from os import listdir from os.path import isfile, join -from itertools import product import pandas as pd import numpy as np import covidcast -from .errors import APIDataFetchError +from .errors import APIDataFetchError, ValidationError -filename_regex = re.compile( +FILENAME_REGEX = re.compile( r'^(?P\d{8})_(?P\w+?)_(?P\w+)\.csv$') +def make_date_filter(start_date, end_date): + """ + Create a function to return a boolean of whether a filename of appropriate + format contains a date within (inclusive) the specified date range. + + Arguments: + - start_date: datetime date object + - end_date: datetime date object + + Returns: + - Custom function object + """ + # Convert dates from datetime format to int. + start_code = int(start_date.strftime("%Y%m%d")) + end_code = int(end_date.strftime("%Y%m%d")) + + def custom_date_filter(match): + """ + Return a boolean of whether a filename of appropriate format contains a date + within the specified date range. + + Arguments: + - match: regex match object based on FILENAME_REGEX applied to a filename str + + Returns: + - boolean + """ + # If regex match doesn't exist, current filename is not an appropriately + # formatted source data file. + if not match: + return False + + # Convert date found in CSV name to int. + code = int(match.groupdict()['date']) + + # Return boolean True if current file date "code" is within the defined date range. + return start_code <= code <= end_code + + return custom_date_filter + + +def load_all_files(export_dir, start_date, end_date): + """Load all files in a directory. 
+ Parameters + ---------- + export_dir: str + directory from which to load files + + Returns + ------- + loaded_data: List[Tuple(str, re.match, pd.DataFrame)] + triples of filenames, filename matches with the geo regex, and the data from the file + """ + export_files = read_filenames(export_dir) + date_filter = make_date_filter(start_date, end_date) + + # Make list of tuples of CSV names and regex match objects. + return [(f, m, load_csv(join(export_dir, f))) for (f, m) in export_files if date_filter(m)] + + def read_filenames(path): """ Return a list of tuples of every filename and regex match to the CSV filename @@ -28,7 +88,7 @@ def read_filenames(path): Returns: - list of tuples """ - daily_filenames = [(f, filename_regex.match(f)) + daily_filenames = [(f, FILENAME_REGEX.match(f)) for f in listdir(path) if isfile(join(path, f))] return daily_filenames @@ -54,10 +114,10 @@ def get_geo_signal_combos(data_source): """ meta = covidcast.metadata() source_meta = meta[meta['data_source'] == data_source] - unique_signals = source_meta['signal'].unique().tolist() - unique_geotypes = source_meta['geo_type'].unique().tolist() + # Need to convert np.records to tuples so they are hashable and can be used in sets and dicts. + geo_signal_combos = list(map(tuple, + source_meta[["geo_type", "signal"]].to_records(index=False))) - geo_signal_combos = list(product(unique_geotypes, unique_signals)) print("Number of expected geo region-signal combinations:", len(geo_signal_combos)) @@ -75,7 +135,7 @@ def fetch_api_reference(data_source, start_date, end_date, geo_type, signal_type if not isinstance(api_df, pd.DataFrame): custom_msg = "Error fetching data from " + str(start_date) + \ " to " + str(end_date) + \ - "for data source: " + data_source + \ + " for data source: " + data_source + \ ", signal type: " + signal_type + \ ", geo type: " + geo_type @@ -95,3 +155,61 @@ def fetch_api_reference(data_source, start_date, end_date, geo_type, signal_type ).reindex(columns=column_names) return api_df + + +def get_one_api_df(data_source, min_date, max_date, + geo_type, signal_type, + api_semaphore, dict_lock, output_dict): + """ + Pull API data for a single geo type-signal combination. Raises + error if data couldn't be retrieved. Saves data to data dict. + """ + api_semaphore.acquire() + + # Pull reference data from API for all dates. + try: + geo_sig_api_df_or_error = fetch_api_reference( + data_source, min_date, max_date, geo_type, signal_type) + + except APIDataFetchError as e: + geo_sig_api_df_or_error = ValidationError( + ("api_data_fetch_error", geo_type, signal_type), None, e) + + api_semaphore.release() + + # Use a lock so only one thread can access the dictionary. + dict_lock.acquire() + output_dict[(geo_type, signal_type)] = geo_sig_api_df_or_error + dict_lock.release() + + +def threaded_api_calls(data_source, min_date, max_date, geo_signal_combos, n_threads=32): + """ + Get data from API for all geo-signal combinations in a threaded way + to save time. + """ + if n_threads > 32: + n_threads = 32 + print("Warning: Don't run more than 32 threads at once due " + + "to API resource limitations") + + output_dict = dict() + dict_lock = threading.Lock() + api_semaphore = threading.Semaphore(value=n_threads) + + thread_objs = [threading.Thread( + target=get_one_api_df, args=(data_source, min_date, max_date, + geo_type, signal_type, + api_semaphore, + dict_lock, output_dict) + ) for geo_type, signal_type in geo_signal_combos] + + # Start all threads. 
+ for thread in thread_objs: + thread.start() + + # Wait until all threads are finished. + for thread in thread_objs: + thread.join() + + return output_dict diff --git a/validator/delphi_validator/errors.py b/validator/delphi_validator/errors.py index aa688ab54..bbcfd8df0 100644 --- a/validator/delphi_validator/errors.py +++ b/validator/delphi_validator/errors.py @@ -35,3 +35,4 @@ def __init__(self, check_data_id, expression, message): check_data_id, tuple) and not isinstance(check_data_id, list) else tuple(check_data_id) self.expression = expression self.message = message + super().__init__(self.check_data_id, self.expression, self.message) diff --git a/validator/delphi_validator/report.py b/validator/delphi_validator/report.py new file mode 100644 index 000000000..89af69700 --- /dev/null +++ b/validator/delphi_validator/report.py @@ -0,0 +1,98 @@ +"""Validation output reports.""" +import sys +from datetime import date, datetime +from typing import List, Tuple + +class ValidationReport: + """Class for reporting the results of validation.""" + def __init__(self, errors_to_suppress: List[Tuple[str]]): + """Initialize a ValidationReport. + Parameters + ---------- + errors_to_suppress: List[Tuple[str]] + List of error identifications to ignore. + + Attributes + ---------- + errors_to_suppress: List[Tuple[str]] + See above + num_suppressed: int + Number of errors suppressed + total_checks: int + Number of validation checks performed + raised_errors: List[Exception] + Errors raised from validation failures + raised_warnings: List[Exception] + Warnings raised from validation execution + unsuppressed_errors: List[Exception] + Errors raised from validation failures not found in `self.errors_to_suppress` + """ + self.errors_to_suppress = errors_to_suppress.copy() + self.num_suppressed = 0 + self.total_checks = 0 + self.raised_errors = [] + self.raised_warnings = [] + self.unsuppressed_errors = [] + + def add_raised_error(self, error): + """Add an error to the report. + Parameters + ---------- + error: Exception + Error raised in validation + + Returns + ------- + None + """ + self.raised_errors.append(error) + # Convert any dates in check_data_id to strings for the purpose of comparing + # to manually suppressed errors. + raised_check_id = tuple([ + item.strftime("%Y-%m-%d") if isinstance(item, (date, datetime)) + else item for item in error.check_data_id]) + + if raised_check_id in self.errors_to_suppress: + self.errors_to_suppress.remove(raised_check_id) + self.num_suppressed += 1 + else: + self.unsuppressed_errors.append(error) + + def increment_total_checks(self): + """Records a check.""" + self.total_checks += 1 + + def add_raised_warning(self, warning): + """Add a warning to the report. + Parameters + ---------- + warning: Warning + Warning raised in validation + + Returns + ------- + None + """ + self.raised_warnings.append(warning) + + def __str__(self): + """String representation of report.""" + out_str = f"{self.total_checks} checks run\n" + out_str += f"{len(self.unsuppressed_errors)} checks failed\n" + out_str += f"{self.num_suppressed} checks suppressed\n" + out_str += f"{len(self.raised_warnings)} warnings\n" + for message in self.unsuppressed_errors: + out_str += f"{message}\n" + for message in self.raised_warnings: + out_str += f"{message}\n" + return out_str + + def print_and_exit(self): + """ + Print results and, if any not-suppressed exceptions were raised, exit with non-zero status. 
+ """ + print(self) + if len(self.unsuppressed_errors) != 0: + sys.exit(1) + else: + sys.exit(0) diff --git a/validator/delphi_validator/run.py b/validator/delphi_validator/run.py index 74371518b..ed6236b34 100644 --- a/validator/delphi_validator/run.py +++ b/validator/delphi_validator/run.py @@ -9,8 +9,9 @@ def run_module(): + """Run the validator as a module.""" parent_params = read_params() params = parent_params['validation'] validator = Validator(params) - validator.validate(parent_params["export_dir"]) + validator.validate(parent_params["export_dir"]).print_and_exit() diff --git a/validator/delphi_validator/utils.py b/validator/delphi_validator/utils.py new file mode 100644 index 000000000..4f900a5c7 --- /dev/null +++ b/validator/delphi_validator/utils.py @@ -0,0 +1,47 @@ +"""Utility functions for validation.""" +from datetime import datetime +import pandas as pd + +# Recognized geo types. +GEO_REGEX_DICT = { + 'county': r'^\d{5}$', + 'hrr': r'^\d{1,3}$', + 'msa': r'^\d{5}$', + 'dma': r'^\d{3}$', + 'state': r'^[a-zA-Z]{2}$', + 'national': r'^[a-zA-Z]{2}$' +} + + +def relative_difference_by_min(x, y): + """ + Calculate relative difference between two numbers. + """ + return (x - y) / min(x, y) + + +def aggregate_frames(frames_list): + """Aggregates a list of data frames into a single frame. + + Parameters + ---------- + frames_list: List[Tuple(str, re.match, pd.DataFrame)] + triples of filenames, filename matches with the geo regex, and the data from the file + + Returns + ------- + A pd.DataFrame concatenation of all data frames in `frames_list` with additional columns for + geo_type, time_value, and signal derived from the corresponding re.match. + """ + all_frames = [] + for _, match, data_df in frames_list: + df = data_df.copy() + # Get geo_type, date, and signal name as specified by CSV name. + df['geo_type'] = match.groupdict()['geo_type'] + df['time_value'] = datetime.strptime( + match.groupdict()['date'], "%Y%m%d").date() + df['signal'] = match.groupdict()['signal'] + + all_frames.append(df) + + return pd.concat(all_frames).reset_index(drop=True) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 2b05ac9ae..0ffd2d3f2 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -2,76 +2,15 @@ """ Tools to validate CSV source data, including various check methods. """ -import sys import re import math -import threading from os.path import join from datetime import date, datetime, timedelta import pandas as pd from .errors import ValidationError, APIDataFetchError -from .datafetcher import filename_regex, \ - read_filenames, load_csv, get_geo_signal_combos, \ - fetch_api_reference - -# Recognized geo types. -geo_regex_dict = { - 'county': '^\d{5}$', - 'hrr': '^\d{1,3}$', - 'msa': '^\d{5}$', - 'dma': '^\d{3}$', - 'state': '^[a-zA-Z]{2}$', - 'national': '^[a-zA-Z]{2}$' -} - - -def relative_difference_by_min(x, y): - """ - Calculate relative difference between two numbers. - """ - return (x - y) / min(x, y) - - -def make_date_filter(start_date, end_date): - """ - Create a function to return a boolean of whether a filename of appropriate - format contains a date within (inclusive) the specified date range. - - Arguments: - - start_date: datetime date object - - end_date: datetime date object - - Returns: - - Custom function object - """ - # Convert dates from datetime format to int. 
- start_code = int(start_date.strftime("%Y%m%d")) - end_code = int(end_date.strftime("%Y%m%d")) - - def custom_date_filter(match): - """ - Return a boolean of whether a filename of appropriate format contains a date - within the specified date range. - - Arguments: - - match: regex match object based on filename_regex applied to a filename str - - Returns: - - boolean - """ - # If regex match doesn't exist, current filename is not an appropriately - # formatted source data file. - if not match: - return False - - # Convert date found in CSV name to int. - code = int(match.groupdict()['date']) - - # Return boolean True if current file date "code" is within the defined date range. - return start_code <= code <= end_code - - return custom_date_filter - +from .datafetcher import FILENAME_REGEX, get_geo_signal_combos, threaded_api_calls, load_all_files +from .utils import GEO_REGEX_DICT, relative_difference_by_min, aggregate_frames +from .report import ValidationReport class Validator(): """ Class containing validation() function and supporting functions. Stores a list @@ -105,11 +44,12 @@ def __init__(self, params): avg, etc) - expected_lag: dict of signal names: int pairs; how many days behind do we expect each signal to be - - suppressed_errors: set of check_data_ids used to identify error messages to ignore - - raised_errors: list to append data upload-blocking errors to as they are raised - - total_checks: incremental counter to track total number of checks run - - raised_warnings: list to append non-data upload-blocking errors to as they are raised """ + # TODO(https://github.com/cmu-delphi/covidcast-indicators/issues/579) + # Refactor this class to avoid the too-many-instance-attributes error. + # + # pylint: disable=too-many-instance-attributes + # Get user settings from params or if not provided, set default. self.data_source = params['data_source'] self.validator_static_file_dir = params.get( @@ -143,23 +83,19 @@ def __init__(self, params): self.suppressed_errors = {(item,) if not isinstance(item, tuple) and not isinstance( item, list) else tuple(item) for item in params.get('suppressed_errors', [])} - # Output - self.raised_errors = [] - self.total_checks = 0 - - self.raised_warnings = [] + self.active_report = ValidationReport(self.suppressed_errors) + # pylint: enable=too-many-instance-attributes - def increment_total_checks(self): - """ Add 1 to total_checks counter """ - self.total_checks += 1 def check_missing_date_files(self, daily_filenames): """ Check for missing dates between the specified start and end dates. Arguments: - - daily_filenames: list of tuples, each containing CSV source data filename - and the regex match object corresponding to filename_regex. + - daily_filenames: List[Tuple(str, re.match, pd.DataFrame)] + triples of filenames, filename matches with the geo regex, and the data from the + file + - report: ValidationReport; report where results are added Returns: - None @@ -178,45 +114,14 @@ def check_missing_date_files(self, daily_filenames): check_dateholes.sort() if check_dateholes: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( "check_missing_date_files", check_dateholes, "Missing dates are observed; if these dates are" + " already in the API they would not be updated")) - self.increment_total_checks() + self.active_report.increment_total_checks() - def check_settings(self): - """ - Perform some automated format & sanity checks of parameters. 
- - Arguments: - - None - - Returns: - - None - """ - if not isinstance(self.max_check_lookbehind, timedelta): - self.raised_errors.append(ValidationError( - ("check_type_max_check_lookbehind"), - self.max_check_lookbehind, - "max_check_lookbehind must be of type datetime.timedelta")) - - self.increment_total_checks() - - if not isinstance(self.generation_date, date): - self.raised_errors.append(ValidationError( - ("check_type_generation_date"), self.generation_date, - "generation_date must be a datetime.date type")) - - self.increment_total_checks() - - if self.generation_date > date.today(): - self.raised_errors.append(ValidationError( - ("check_future_generation_date"), self.generation_date, - "generation_date must not be in the future")) - - self.increment_total_checks() def check_df_format(self, df_to_test, nameformat): """ @@ -230,20 +135,20 @@ def check_df_format(self, df_to_test, nameformat): Returns: - None """ - pattern_found = filename_regex.match(nameformat) + pattern_found = FILENAME_REGEX.match(nameformat) if not nameformat or not pattern_found: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_filename_format", nameformat), nameformat, 'nameformat not recognized')) - self.increment_total_checks() + self.active_report.increment_total_checks() if not isinstance(df_to_test, pd.DataFrame): - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_file_data_format", nameformat), type(df_to_test), 'df_to_test must be a pandas dataframe.')) - self.increment_total_checks() + self.active_report.increment_total_checks() def check_bad_geo_id_value(self, df_to_test, filename, geo_type): """ @@ -253,24 +158,25 @@ def check_bad_geo_id_value(self, df_to_test, filename, geo_type): Arguments: - df_to_test: pandas dataframe of CSV source data containing the geo_id column to check - geo_type: string from CSV name specifying geo type (state, county, msa, etc.) of data - """ + - report: ValidationReport; report where results are added + """ file_path = join(self.validator_static_file_dir, geo_type + '_geo.csv') valid_geo_df = pd.read_csv(file_path, dtype={'geo_id': str}) valid_geos = valid_geo_df['geo_id'].values unexpected_geos = [geo for geo in df_to_test['geo_id'] if geo.lower() not in valid_geos] if len(unexpected_geos) > 0: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_bad_geo_id_value", filename), unexpected_geos, "Unrecognized geo_ids (not in historical data)")) - self.increment_total_checks() + self.active_report.increment_total_checks() upper_case_geos = [ geo for geo in df_to_test['geo_id'] if geo.lower() != geo] if len(upper_case_geos) > 0: - self.raised_warnings.append(ValidationError( + self.active_report.add_raised_warning(ValidationError( ("check_geo_id_lowercase", filename), upper_case_geos, "geo_id contains uppercase characters. 
Lowercase is preferred.")) - self.increment_total_checks() + self.active_report.increment_total_checks() def check_bad_geo_id_format(self, df_to_test, nameformat, geo_type): """ @@ -279,6 +185,7 @@ def check_bad_geo_id_format(self, df_to_test, nameformat, geo_type): Arguments: - df_to_test: pandas dataframe of CSV source data - geo_type: string from CSV name specifying geo type (state, county, msa, hrr) of data + - report: ValidationReport; report where results are added Returns: - None @@ -302,7 +209,7 @@ def find_all_unexpected_geo_ids(df_to_test, geo_regex, geo_type): df_to_test["geo_id"] = [geo[0] for geo in df_to_test["geo_id"].str.split(".")] - self.raised_warnings.append(ValidationError( + self.active_report.add_raised_warning(ValidationError( ("check_geo_id_type", nameformat), None, "geo_ids saved as floats; strings preferred")) @@ -319,19 +226,19 @@ def find_all_unexpected_geo_ids(df_to_test, geo_regex, geo_type): df_to_test['geo_id']) if geo not in expected_geos} if len(unexpected_geos) > 0: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_geo_id_format", nameformat), unexpected_geos, "Non-conforming geo_ids found")) - if geo_type not in geo_regex_dict: - self.raised_errors.append(ValidationError( + if geo_type not in GEO_REGEX_DICT: + self.active_report.add_raised_error(ValidationError( ("check_geo_type", nameformat), geo_type, "Unrecognized geo type")) else: find_all_unexpected_geo_ids( - df_to_test, geo_regex_dict[geo_type], geo_type) + df_to_test, GEO_REGEX_DICT[geo_type], geo_type) - self.increment_total_checks() + self.active_report.increment_total_checks() def check_bad_val(self, df_to_test, nameformat, signal_type): """ @@ -340,6 +247,7 @@ def check_bad_val(self, df_to_test, nameformat, signal_type): Arguments: - df_to_test: pandas dataframe of a single CSV of source data - signal_type: string from CSV name specifying signal type (smoothed_cli, etc) of data + - report: ValidationReport; report where results are added Returns: - None @@ -350,36 +258,36 @@ def check_bad_val(self, df_to_test, nameformat, signal_type): if percent_option: if not df_to_test[(df_to_test['val'] > 100)].empty: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_val_pct_gt_100", nameformat), df_to_test[(df_to_test['val'] > 100)], "val column can't have any cell greater than 100 for percents")) - self.increment_total_checks() + self.active_report.increment_total_checks() if proportion_option: if not df_to_test[(df_to_test['val'] > 100000)].empty: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_val_prop_gt_100k", nameformat), df_to_test[(df_to_test['val'] > 100000)], "val column can't have any cell greater than 100000 for proportions")) - self.increment_total_checks() + self.active_report.increment_total_checks() if df_to_test['val'].isnull().values.any(): - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_val_missing", nameformat), None, "val column can't have any cell that is NA")) - self.increment_total_checks() + self.active_report.increment_total_checks() if not df_to_test[(df_to_test['val'] < 0)].empty: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_val_lt_0", nameformat), df_to_test[(df_to_test['val'] < 0)], "val column can't have any cell smaller than 0")) - self.increment_total_checks() + 
self.active_report.increment_total_checks() def check_bad_se(self, df_to_test, nameformat): """ @@ -406,35 +314,35 @@ def check_bad_se(self, df_to_test, nameformat): '~((se > 0) & (se < 50) & (se <= se_upper_limit))') if not result.empty: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_se_not_missing_and_in_range", nameformat), result, "se must be in (0, min(50,val*(1+eps))] and not missing")) - self.increment_total_checks() + self.active_report.increment_total_checks() if df_to_test["se"].isnull().mean() > 0.5: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_se_many_missing", nameformat), None, 'Recent se values are >50% NA')) - self.increment_total_checks() + self.active_report.increment_total_checks() elif self.missing_se_allowed: result = df_to_test.query( '~(se.isnull() | ((se > 0) & (se < 50) & (se <= se_upper_limit)))') if not result.empty: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_se_missing_or_in_range", nameformat), result, "se must be NA or in (0, min(50,val*(1+eps))]")) - self.increment_total_checks() + self.active_report.increment_total_checks() result_jeffreys = df_to_test.query('(val == 0) & (se == 0)') result_alt = df_to_test.query('se == 0') if not result_jeffreys.empty: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_se_0_when_val_0", nameformat), None, "when signal value is 0, se must be non-zero. please " @@ -442,11 +350,11 @@ def check_bad_se(self, df_to_test, nameformat): + " (see wikipedia.org/wiki/Binomial_proportion_confidence" + "_interval#Jeffreys_interval for details)")) elif not result_alt.empty: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_se_0", nameformat), result_alt, "se must be non-zero")) - self.increment_total_checks() + self.active_report.increment_total_checks() # Remove se_upper_limit column. 
df_to_test.drop(columns=["se_upper_limit"]) @@ -459,40 +367,41 @@ def check_bad_sample_size(self, df_to_test, nameformat): - df_to_test: pandas dataframe of a single CSV of source data (one day-signal-geo_type combo) - nameformat: str CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli.csv" + - report: ValidationReport; report where results are added Returns: - None """ if not self.missing_sample_size_allowed: if df_to_test['sample_size'].isnull().values.any(): - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_n_missing", nameformat), None, "sample_size must not be NA")) - self.increment_total_checks() + self.active_report.increment_total_checks() # Find rows with sample size less than minimum allowed result = df_to_test.query( '(sample_size < @self.minimum_sample_size)') if not result.empty: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_n_gt_min", nameformat), result, f"sample size must be >= {self.minimum_sample_size}")) - self.increment_total_checks() + self.active_report.increment_total_checks() elif self.missing_sample_size_allowed: result = df_to_test.query( '~(sample_size.isnull() | (sample_size >= @self.minimum_sample_size))') if not result.empty: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_n_missing_or_gt_min", nameformat), result, f"sample size must be NA or >= {self.minimum_sample_size}")) - self.increment_total_checks() + self.active_report.increment_total_checks() def check_min_allowed_max_date(self, max_date, geo_type, signal_type): """ @@ -502,6 +411,7 @@ def check_min_allowed_max_date(self, max_date, geo_type, signal_type): - max_date: date of most recent data to be validated; datetime format. - geo_type: str; geo type name (county, msa, hrr, state) as in the CSV name - signal_type: str; signal name as in the CSV name + - report: ValidationReport; report where results are added Returns: - None @@ -511,12 +421,12 @@ def check_min_allowed_max_date(self, max_date, geo_type, signal_type): else 1) if max_date < self.generation_date - thres: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_min_max_date", geo_type, signal_type), max_date, "date of most recent generated file seems too long ago")) - self.increment_total_checks() + self.active_report.increment_total_checks() def check_max_allowed_max_date(self, max_date, geo_type, signal_type): """ @@ -526,17 +436,18 @@ def check_max_allowed_max_date(self, max_date, geo_type, signal_type): - max_date: date of most recent data to be validated; datetime format. 
- geo_type: str; geo type name (county, msa, hrr, state) as in the CSV name - signal_type: str; signal name as in the CSV name + - report: ValidationReport; report where results are added Returns: - None """ if max_date > self.generation_date: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_max_max_date", geo_type, signal_type), max_date, "date of most recent generated file seems too recent")) - self.increment_total_checks() + self.active_report.increment_total_checks() def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date, geo_type, signal_type): @@ -550,12 +461,13 @@ def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date COVIDcast API or semirecent data - geo_type: str; geo type name (county, msa, hrr, state) as in the CSV name - signal_type: str; signal name as in the CSV name + - report: ValidationReport; report where results are added Returns: - None """ if df_to_test["time_value"].max() < df_to_reference["time_value"].max(): - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_max_date_vs_reference", checking_date.date(), geo_type, signal_type), (df_to_test["time_value"].max(), @@ -566,7 +478,7 @@ def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date 'working files have already been compared against the reference, ' + 'that there is a bug somewhere')) - self.increment_total_checks() + self.active_report.increment_total_checks() def check_rapid_change_num_rows(self, df_to_test, df_to_reference, checking_date, geo_type, signal_type): @@ -580,6 +492,7 @@ def check_rapid_change_num_rows(self, df_to_test, df_to_reference, checking_date - checking_date: datetime date - geo_type: str; geo type name (county, msa, hrr, state) as in the CSV name - signal_type: str; signal name as in the CSV name + - report: ValidationReport; report where results are added Returns: - None @@ -598,14 +511,14 @@ def check_rapid_change_num_rows(self, df_to_test, df_to_reference, checking_date raise e if abs(compare_rows) > 0.35: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_rapid_change_num_rows", checking_date, geo_type, signal_type), (test_rows_per_reporting_day, reference_rows_per_reporting_day), "Number of rows per day (-with-any-rows) seems to have changed " + "rapidly (reference vs test data)")) - self.increment_total_checks() + self.active_report.increment_total_checks() def check_positive_negative_spikes(self, source_df, api_frames, geo, sig): """ @@ -625,17 +538,14 @@ def check_positive_negative_spikes(self, source_df, api_frames, geo, sig): - sig: str; signal name as in the CSV name """ - self.increment_total_checks() + self.active_report.increment_total_checks() # Combine all possible frames so that the rolling window calculations make sense. source_frame_start = source_df["time_value"].min() source_frame_end = source_df["time_value"].max() - api_frames_end = min(api_frames["time_value"].max( - ), source_frame_start-timedelta(days=1)) all_frames = pd.concat([api_frames, source_df]). \ drop_duplicates(subset=["geo_id", "time_value"], keep='last'). \ sort_values(by=['time_value']).reset_index(drop=True) - if "index" in all_frames.columns: - all_frames = all_frames.drop(columns=["index"]) + # Tuned Variables from Dan's Code for flagging outliers. 
Size_cut is a # check on the minimum value reported, sig_cut is a check # on the ftstat or ststat reported (t-statistics) and sig_consec @@ -649,61 +559,56 @@ def check_positive_negative_spikes(self, source_df, api_frames, geo, sig): def outlier_flag(frame): if (abs(frame["val"]) > size_cut) and not (pd.isna(frame["ststat"])) \ and (frame["ststat"] > sig_cut): - return 1 + return True if (abs(frame["val"]) > size_cut) and (pd.isna(frame["ststat"])) and \ not (pd.isna(frame["ftstat"])) and (frame["ftstat"] > sig_cut): - return 1 + return True if (frame["val"] < -size_cut) and not (pd.isna(frame["ststat"])) and \ not pd.isna(frame["ftstat"]): - return 1 - return 0 + return True + return False def outlier_nearby(frame): if (not pd.isna(frame['ststat'])) and (frame['ststat'] > sig_consec): - return 1 + return True if pd.isna(frame['ststat']) and (frame['ftstat'] > sig_consec): - return 1 - return 0 + return True + return False # Calculate ftstat and ststat values for the rolling windows, group fames by geo region region_group = all_frames.groupby("geo_id") window_size = 14 - shift_val = 0 - # Shift the window to match how R calculates rolling windows with even numbers - if window_size % 2 == 0: - shift_val = -1 + shift_val = -1 if window_size % 2 == 0 else 0 # Calculate the t-statistics for the two rolling windows (windows center and windows right) all_full_frames = [] for _, group in region_group: - rolling_windows = group["val"].rolling( - window_size, min_periods=window_size) - center_windows = group["val"].rolling( - window_size, min_periods=window_size, center=True) + rolling_windows = group["val"].rolling(window_size, min_periods=window_size) + center_windows = group["val"].rolling(window_size, min_periods=window_size, center=True) fmedian = rolling_windows.median() smedian = center_windows.median().shift(shift_val) fsd = rolling_windows.std() + 0.00001 # if std is 0 ssd = center_windows.std().shift(shift_val) + 0.00001 # if std is 0 - vals_modified_f = group["val"] - fmedian.fillna(0) - vals_modified_s = group["val"] - smedian.fillna(0) - ftstat = abs(vals_modified_f)/fsd - ststat = abs(vals_modified_s)/ssd - group['ftstat'] = ftstat - group['ststat'] = ststat + group['ftstat'] = abs(group["val"] - fmedian.fillna(0)) / fsd + group['ststat'] = abs(group["val"] - smedian.fillna(0)) / ssd all_full_frames.append(group) all_frames = pd.concat(all_full_frames) # Determine outliers in source frames only, only need the reference # data from just before the start of the source data # because lead and lag outlier calculations are only one day + # + # These variables are interpolated into the call to `api_df_or_error.query()` + # below but pylint doesn't recognize that. + # pylint: disable=unused-variable + api_frames_end = min(api_frames["time_value"].max(), source_frame_start-timedelta(days=1)) + # pylint: enable=unused-variable outlier_df = all_frames.query( 'time_value >= @api_frames_end & time_value <= @source_frame_end') outlier_df = outlier_df.sort_values(by=['geo_id', 'time_value']) \ .reset_index(drop=True).copy() - outlier_df["flag"] = 0 - outlier_df["flag"] = outlier_df.apply(outlier_flag, axis=1) - outliers = outlier_df[outlier_df["flag"] == 1] + outliers = outlier_df[outlier_df.apply(outlier_flag, axis=1)] outliers_reset = outliers.copy().reset_index(drop=True) # Find the lead outliers and the lag outliers. 
Check that the selected row @@ -720,16 +625,13 @@ def outlier_nearby(frame): sel_lower_df = lower_df[lower_compare["geo_id"] == lower_df["geo_id"]].copy() - sel_upper_df["flag"] = 0 - sel_lower_df["flag"] = 0 - - sel_upper_df["flag"] = sel_upper_df.apply(outlier_nearby, axis=1) - sel_lower_df["flag"] = sel_lower_df.apply(outlier_nearby, axis=1) - - upper_outliers = sel_upper_df[sel_upper_df["flag"] == 1] - lower_outliers = sel_lower_df[sel_lower_df["flag"] == 1] + outliers_list = [outliers] + if sel_upper_df.size > 0: + outliers_list.append(sel_upper_df[sel_upper_df.apply(outlier_nearby, axis=1)]) + if sel_lower_df.size > 0: + outliers_list.append(sel_lower_df[sel_lower_df.apply(outlier_nearby, axis=1)]) - all_outliers = pd.concat([outliers, upper_outliers, lower_outliers]). \ + all_outliers = pd.concat(outliers_list). \ sort_values(by=['time_value', 'geo_id']). \ drop_duplicates().reset_index(drop=True) @@ -738,7 +640,7 @@ def outlier_nearby(frame): "time_value >= @source_frame_start & time_value <= @source_frame_end") if source_outliers.shape[0] > 0: - self.raised_errors.append(ValidationError( + self.active_report.raised_errors.append(ValidationError( ("check_positive_negative_spikes", source_frame_start, source_frame_end, geo, sig), (source_outliers), @@ -819,10 +721,8 @@ def check_avg_val_vs_reference(self, df_to_test, df_to_reference, checking_date, # Set thresholds for raw and smoothed variables. classes = ['mean_stddiff', 'val_mean_stddiff', 'mean_stdabsdiff'] - raw_thresholds = pd.DataFrame( - [[1.50, 1.30, 1.80]], columns=classes) - smoothed_thresholds = raw_thresholds.apply( - lambda x: x/(math.sqrt(7) * 1.5)) + raw_thresholds = pd.DataFrame([[1.50, 1.30, 1.80]], columns=classes) + smoothed_thresholds = raw_thresholds.apply(lambda x: x/(math.sqrt(7) * 1.5)) switcher = { 'raw': raw_thresholds, @@ -840,11 +740,10 @@ def check_avg_val_vs_reference(self, df_to_test, df_to_reference, checking_date, (abs(df_all[df_all["variable"] == "val"]["mean_stddiff"]) > float(thres["val_mean_stddiff"])).any() ) - mean_stdabsdiff_high = ( - df_all["mean_stdabsdiff"] > float(thres["mean_stdabsdiff"])).any() + mean_stdabsdiff_high = (df_all["mean_stdabsdiff"] > float(thres["mean_stdabsdiff"])).any() if mean_stddiff_high or mean_stdabsdiff_high: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_test_vs_reference_avg_changed", checking_date, geo_type, signal_type), (mean_stddiff_high, mean_stdabsdiff_high), @@ -854,7 +753,17 @@ def check_avg_val_vs_reference(self, df_to_test, df_to_reference, checking_date, + 'to average values of corresponding variables. 
For the former check, ' + 'tolerances for `val` are more restrictive than those for other columns.')) - self.increment_total_checks() + self.active_report.increment_total_checks() + + def check_duplicate_rows(self, data_df, filename): + is_duplicate = data_df.duplicated() + if (any(is_duplicate)): + duplicate_row_idxs = list(data_df[is_duplicate].index) + self.active_report.add_raised_warning(ValidationError( + ("check_duplicate_rows", filename), + duplicate_row_idxs, + "Some rows are duplicated, which may indicate data integrity issues")) + self.active_report.increment_total_checks() def validate(self, export_dir): """ @@ -864,26 +773,32 @@ def validate(self, export_dir): - export_dir: path to data CSVs Returns: - - None + - ValidationReport collating the validation outcomes """ + self.active_report = ValidationReport(self.suppressed_errors) + frames_list = load_all_files(export_dir, self.start_date, self.end_date) + self._run_single_file_checks(frames_list) + all_frames = aggregate_frames(frames_list) + self._run_combined_file_checks(all_frames) + return self.active_report - # Get relevant data file names and info. - - export_files = read_filenames(export_dir) - date_filter = make_date_filter(self.start_date, self.end_date) + def _run_single_file_checks(self, file_list): + """ + Perform checks over single-file data sets. - # Make list of tuples of CSV names and regex match objects. - validate_files = [(f, m) for (f, m) in export_files if date_filter(m)] - self.check_missing_date_files(validate_files) - self.check_settings() + Parameters + ---------- + loaded_data: List[Tuple(str, re.match, pd.DataFrame)] + triples of filenames, filename matches with the geo regex, and the data from the file + """ - all_frames = [] + self.check_missing_date_files(file_list) # Individual file checks # For every daily file, read in and do some basic format and value checks. - for filename, match in validate_files: - data_df = load_csv(join(export_dir, filename)) + for filename, match, data_df in file_list: self.check_df_format(data_df, filename) + self.check_duplicate_rows(data_df, filename) self.check_bad_geo_id_format( data_df, filename, match.groupdict()['geo_type']) self.check_bad_geo_id_value( @@ -892,17 +807,15 @@ def validate(self, export_dir): self.check_bad_se(data_df, filename) self.check_bad_sample_size(data_df, filename) - # Get geo_type, date, and signal name as specified by CSV name. - data_df['geo_type'] = match.groupdict()['geo_type'] - data_df['time_value'] = datetime.strptime( - match.groupdict()['date'], "%Y%m%d").date() - data_df['signal'] = match.groupdict()['signal'] - - # Add current CSV data to all_frames. - all_frames.append(data_df) - - all_frames = pd.concat(all_frames) + def _run_combined_file_checks(self, all_frames): + """ + Performs all checks over the combined data set from all files. + Parameters + ---------- + all_frames: pd.DataFrame + combined data from all input files + """ # recent_lookbehind: start from the check date and working backward in time, # how many days at a time do we want to check for anomalies? # Choosing 1 day checks just the daily data. @@ -922,13 +835,11 @@ def validate(self, export_dir): # Get all expected combinations of geo_type and signal. 
geo_signal_combos = get_geo_signal_combos(self.data_source) - all_api_df = self.threaded_api_calls( - self.start_date - outlier_lookbehind, - self.end_date, geo_signal_combos) + all_api_df = threaded_api_calls(self.data_source, self.start_date - outlier_lookbehind, + self.end_date, geo_signal_combos) # Keeps script from checking all files in a test run. - if self.test_mode: - kroc = 0 + kroc = 0 # Comparison checks # Run checks for recent dates in each geo-sig combo vs semirecent (previous @@ -939,10 +850,10 @@ def validate(self, export_dir): # Drop unused columns. geo_sig_df.drop(columns=["geo_type", "signal"]) - self.increment_total_checks() + self.active_report.increment_total_checks() if geo_sig_df.empty: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_missing_geo_sig_combo", geo_type, signal_type), None, "file with geo_type-signal combo does not exist")) @@ -953,22 +864,31 @@ def validate(self, export_dir): self.check_max_allowed_max_date(max_date, geo_type, signal_type) # Get relevant reference data from API dictionary. - geo_sig_api_df = all_api_df[(geo_type, signal_type)] + api_df_or_error = all_api_df[(geo_type, signal_type)] - if geo_sig_api_df is None: + self.active_report.increment_total_checks() + if isinstance(api_df_or_error, APIDataFetchError): + self.active_report.raised_errors.append(api_df_or_error) continue # Outlier dataframe if (signal_type in ["confirmed_7dav_cumulative_num", "confirmed_7dav_incidence_num", - "confirmed_cumulative_num", "confirmed_incidence_num", "deaths_7dav_cumulative_num", + "confirmed_cumulative_num", "confirmed_incidence_num", + "deaths_7dav_cumulative_num", "deaths_cumulative_num"]): earliest_available_date = geo_sig_df["time_value"].min() source_df = geo_sig_df.query( 'time_value <= @date_list[-1] & time_value >= @date_list[0]') + + # These variables are interpolated into the call to `api_df_or_error.query()` + # below but pylint doesn't recognize that. + # pylint: disable=unused-variable outlier_start_date = earliest_available_date - outlier_lookbehind outlier_end_date = earliest_available_date - timedelta(days=1) - outlier_api_df = geo_sig_api_df.query( + outlier_api_df = api_df_or_error.query( 'time_value <= @outlier_end_date & time_value >= @outlier_start_date') + # pylint: enable=unused-variable + self.check_positive_negative_spikes( source_df, outlier_api_df, geo_type, signal_type) @@ -980,10 +900,10 @@ def validate(self, export_dir): recent_df = geo_sig_df.query( 'time_value <= @checking_date & time_value >= @recent_cutoff_date') - self.increment_total_checks() + self.active_report.increment_total_checks() if recent_df.empty: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("check_missing_geo_sig_date_combo", checking_date, geo_type, signal_type), None, @@ -993,19 +913,24 @@ def validate(self, export_dir): continue # Reference dataframe runs backwards from the recent_cutoff_date + # + # These variables are interpolated into the call to `api_df_or_error.query()` + # below but pylint doesn't recognize that. + # pylint: disable=unused-variable reference_start_date = recent_cutoff_date - \ min(semirecent_lookbehind, self.max_check_lookbehind) - \ timedelta(days=1) reference_end_date = recent_cutoff_date - timedelta(days=1) + # pylint: enable=unused-variable # Subset API data to relevant range of dates. 
- reference_api_df = geo_sig_api_df.query( + reference_api_df = api_df_or_error.query( "time_value >= @reference_start_date & time_value <= @reference_end_date") - self.increment_total_checks() + self.active_report.increment_total_checks() if reference_api_df.empty: - self.raised_errors.append(ValidationError( + self.active_report.add_raised_error(ValidationError( ("empty_reference_data", checking_date, geo_type, signal_type), None, "reference data is empty; comparative checks could not be performed")) @@ -1023,105 +948,6 @@ def validate(self, export_dir): recent_df, reference_api_df, checking_date, geo_type, signal_type) # Keeps script from checking all files in a test run. - - if self.test_mode: - kroc += 1 - if kroc == 2: - break - - self.exit() - - def get_one_api_df(self, min_date, max_date, - geo_type, signal_type, - api_semaphore, dict_lock, output_dict): - """ - Pull API data for a single geo type-signal combination. Raises - error if data couldn't be retrieved. Saves data to data dict. - """ - api_semaphore.acquire() - - # Pull reference data from API for all dates. - try: - geo_sig_api_df = fetch_api_reference( - self.data_source, min_date, max_date, geo_type, signal_type) - - except APIDataFetchError as e: - self.increment_total_checks() - self.raised_errors.append(ValidationError( - ("api_data_fetch_error", geo_type, signal_type), None, e)) - - geo_sig_api_df = None - - api_semaphore.release() - - # Use a lock so only one thread can access the dictionary. - dict_lock.acquire() - output_dict[(geo_type, signal_type)] = geo_sig_api_df - dict_lock.release() - - def threaded_api_calls(self, min_date, max_date, - geo_signal_combos, n_threads=32): - """ - Get data from API for all geo-signal combinations in a threaded way - to save time. - """ - if n_threads > 32: - n_threads = 32 - print("Warning: Don't run more than 32 threads at once due " - + "to API resource limitations") - - output_dict = dict() - dict_lock = threading.Lock() - api_semaphore = threading.Semaphore(value=n_threads) - - thread_objs = [threading.Thread( - target=self.get_one_api_df, args=(min_date, max_date, - geo_type, signal_type, - api_semaphore, - dict_lock, output_dict) - ) for geo_type, signal_type in geo_signal_combos] - - # Start all threads. - for thread in thread_objs: - thread.start() - - # Wait until all threads are finished. - for thread in thread_objs: - thread.join() - - return output_dict - - def exit(self): - """ - If any not-suppressed exceptions were raised, print and exit with non-zero status. - """ - suppressed_counter = 0 - subset_raised_errors = [] - - for val_error in self.raised_errors: - # Convert any dates in check_data_id to strings for the purpose of comparing - # to manually suppressed errors. 
- raised_check_id = tuple([ - item.strftime("%Y-%m-%d") if isinstance(item, (date, datetime)) - else item for item in val_error.check_data_id]) - - if raised_check_id not in self.suppressed_errors: - subset_raised_errors.append(val_error) - else: - self.suppressed_errors.remove(raised_check_id) - suppressed_counter += 1 - - print(self.total_checks, "checks run") - print(len(subset_raised_errors), "checks failed") - print(suppressed_counter, "checks suppressed") - print(len(self.raised_warnings), "warnings") - - for message in subset_raised_errors: - print(message) - for message in self.raised_warnings: - print(message) - - if len(subset_raised_errors) != 0: - sys.exit(1) - else: - sys.exit(0) + kroc += 1 + if self.test_mode and kroc == 2: + break diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py index a1add4738..7e993dfdb 100644 --- a/validator/tests/test_checks.py +++ b/validator/tests/test_checks.py @@ -3,8 +3,8 @@ import numpy as np import pandas as pd -from delphi_validator.datafetcher import filename_regex -from delphi_validator.validate import Validator, make_date_filter +from delphi_validator.datafetcher import FILENAME_REGEX, make_date_filter +from delphi_validator.validate import Validator class TestDateFilter: @@ -14,7 +14,7 @@ def test_same_day(self): date_filter = make_date_filter( start_date, end_date) - filenames = [(f, filename_regex.match(f)) + filenames = [(f, FILENAME_REGEX.match(f)) for f in ("20200901_county_signal_signal.csv", "20200902_county_signal_signal.csv", "20200903_county_signal_signal.csv")] @@ -30,7 +30,7 @@ def test_inclusive(self): date_filter = make_date_filter( start_date, end_date) - filenames = [(f, filename_regex.match(f)) + filenames = [(f, FILENAME_REGEX.match(f)) for f in ("20200901_county_signal_signal.csv", "20200902_county_signal_signal.csv", "20200903_county_signal_signal.csv", @@ -46,7 +46,7 @@ def test_empty(self): date_filter = make_date_filter( start_date, end_date) - filenames = [(f, filename_regex.match(f)) + filenames = [(f, FILENAME_REGEX.match(f)) for f in ()] subset_filenames = [(f, m) for (f, m) in filenames if date_filter(m)] @@ -69,7 +69,7 @@ def test_default_settings(self): assert validator.sanity_check_value_diffs == True assert len(validator.suppressed_errors) == 0 assert isinstance(validator.suppressed_errors, set) - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 class TestCheckMissingDates: @@ -82,10 +82,10 @@ def test_empty_filelist(self): filenames = list() validator.check_missing_date_files(filenames) - assert len(validator.raised_errors) == 1 + assert len(validator.active_report.raised_errors) == 1 assert "check_missing_date_files" in [ - err.check_data_id[0] for err in validator.raised_errors] - assert len(validator.raised_errors[0].expression) == 9 + err.check_data_id[0] for err in validator.active_report.raised_errors] + assert len(validator.active_report.raised_errors[0].expression) == 9 def test_same_day(self): params = {"data_source": "", "span_length": 0, @@ -95,9 +95,9 @@ def test_same_day(self): filenames = [("20200901_county_signal_signal.csv", "match_obj")] validator.check_missing_date_files(filenames) - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 assert "check_missing_date_files" not in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] def test_duplicate_dates(self): params = {"data_source": "", 
"span_length": 1, @@ -110,34 +110,34 @@ def test_duplicate_dates(self): ("20200903_usa_signal_signal.csv", "match_obj")] validator.check_missing_date_files(filenames) - assert len(validator.raised_errors) == 1 + assert len(validator.active_report.raised_errors) == 1 assert "check_missing_date_files" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] assert len([err.expression[0] for - err in validator.raised_errors if err.check_data_id[0] == + err in validator.active_report.raised_errors if err.check_data_id[0] == "check_missing_date_files"]) == 1 assert [err.expression[0] for - err in validator.raised_errors if err.check_data_id[0] == + err in validator.active_report.raised_errors if err.check_data_id[0] == "check_missing_date_files"][0] == datetime.strptime("20200902", "%Y%m%d").date() class TestNameFormat: def test_match_existence(self): - pattern_found = filename_regex.match("20200903_usa_signal_signal.csv") + pattern_found = FILENAME_REGEX.match("20200903_usa_signal_signal.csv") assert pattern_found - pattern_found = filename_regex.match("2020090_usa_signal_signal.csv") + pattern_found = FILENAME_REGEX.match("2020090_usa_signal_signal.csv") assert not pattern_found - pattern_found = filename_regex.match("20200903_usa_signal_signal.pdf") + pattern_found = FILENAME_REGEX.match("20200903_usa_signal_signal.pdf") assert not pattern_found - pattern_found = filename_regex.match("20200903_usa_.csv") + pattern_found = FILENAME_REGEX.match("20200903_usa_.csv") assert not pattern_found def test_expected_groups(self): - pattern_found = filename_regex.match( + pattern_found = FILENAME_REGEX.match( "20200903_usa_signal_signal.csv").groupdict() assert pattern_found["date"] == "20200903" assert pattern_found["geo_type"] == "usa" @@ -153,18 +153,18 @@ def test_empty_df(self): empty_df = pd.DataFrame(columns=["geo_id"], dtype=str) validator.check_bad_geo_id_format(empty_df, "name", "county") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_invalid_geo_type(self): validator = Validator(self.params) empty_df = pd.DataFrame(columns=["geo_id"], dtype=str) validator.check_bad_geo_id_format(empty_df, "name", "hello") - assert len(validator.raised_errors) == 1 + assert len(validator.active_report.raised_errors) == 1 assert "check_geo_type" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] assert [err.expression for - err in validator.raised_errors if err.check_data_id[0] == + err in validator.active_report.raised_errors if err.check_data_id[0] == "check_geo_type"][0] == "hello" def test_invalid_geo_id_county(self): @@ -173,10 +173,10 @@ def test_invalid_geo_id_county(self): "abc12"], columns=["geo_id"]) validator.check_bad_geo_id_format(df, "name", "county") - assert len(validator.raised_errors) == 1 - assert "check_geo_id_format" in validator.raised_errors[0].check_data_id - assert len(validator.raised_errors[0].expression) == 2 - assert "54321" not in validator.raised_errors[0].expression + assert len(validator.active_report.raised_errors) == 1 + assert "check_geo_id_format" in validator.active_report.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors[0].expression) == 2 + assert "54321" not in validator.active_report.raised_errors[0].expression def test_invalid_geo_id_msa(self): validator = Validator(self.params) @@ -184,10 +184,10 @@ def 
test_invalid_geo_id_msa(self): "abc12"], columns=["geo_id"]) validator.check_bad_geo_id_format(df, "name", "msa") - assert len(validator.raised_errors) == 1 - assert "check_geo_id_format" in validator.raised_errors[0].check_data_id - assert len(validator.raised_errors[0].expression) == 2 - assert "54321" not in validator.raised_errors[0].expression + assert len(validator.active_report.raised_errors) == 1 + assert "check_geo_id_format" in validator.active_report.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors[0].expression) == 2 + assert "54321" not in validator.active_report.raised_errors[0].expression def test_invalid_geo_id_hrr(self): validator = Validator(self.params) @@ -195,12 +195,12 @@ def test_invalid_geo_id_hrr(self): "a", ".", "ab1"], columns=["geo_id"]) validator.check_bad_geo_id_format(df, "name", "hrr") - assert len(validator.raised_errors) == 1 - assert "check_geo_id_format" in validator.raised_errors[0].check_data_id - assert len(validator.raised_errors[0].expression) == 5 - assert "1" not in validator.raised_errors[0].expression - assert "12" not in validator.raised_errors[0].expression - assert "123" not in validator.raised_errors[0].expression + assert len(validator.active_report.raised_errors) == 1 + assert "check_geo_id_format" in validator.active_report.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors[0].expression) == 5 + assert "1" not in validator.active_report.raised_errors[0].expression + assert "12" not in validator.active_report.raised_errors[0].expression + assert "123" not in validator.active_report.raised_errors[0].expression def test_invalid_geo_id_state(self): validator = Validator(self.params) @@ -208,12 +208,12 @@ def test_invalid_geo_id_state(self): "Hawaii", "a", "H.I."], columns=["geo_id"]) validator.check_bad_geo_id_format(df, "name", "state") - assert len(validator.raised_errors) == 1 - assert "check_geo_id_format" in validator.raised_errors[0].check_data_id - assert len(validator.raised_errors[0].expression) == 4 - assert "aa" not in validator.raised_errors[0].expression - assert "hi" not in validator.raised_errors[0].expression - assert "HI" not in validator.raised_errors[0].expression + assert len(validator.active_report.raised_errors) == 1 + assert "check_geo_id_format" in validator.active_report.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors[0].expression) == 4 + assert "aa" not in validator.active_report.raised_errors[0].expression + assert "hi" not in validator.active_report.raised_errors[0].expression + assert "HI" not in validator.active_report.raised_errors[0].expression def test_invalid_geo_id_national(self): validator = Validator(self.params) @@ -221,13 +221,49 @@ def test_invalid_geo_id_national(self): "usausa", "US"], columns=["geo_id"]) validator.check_bad_geo_id_format(df, "name", "national") - assert len(validator.raised_errors) == 1 - assert "check_geo_id_format" in validator.raised_errors[0].check_data_id - assert len(validator.raised_errors[0].expression) == 3 - assert "us" not in validator.raised_errors[0].expression - assert "US" not in validator.raised_errors[0].expression - assert "SP" not in validator.raised_errors[0].expression + assert len(validator.active_report.raised_errors) == 1 + assert "check_geo_id_format" in validator.active_report.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors[0].expression) == 3 + assert "us" not in validator.active_report.raised_errors[0].expression + assert 
"US" not in validator.active_report.raised_errors[0].expression + assert "SP" not in validator.active_report.raised_errors[0].expression +class TestDuplicatedRows: + params = {"data_source": "", "span_length": 1, + "end_date": "2020-09-02", "expected_lag": {}} + def test_no_duplicates(self): + validator = Validator(self.params) + df = pd.DataFrame([["a", "1"], ["b", "2"], ["c", "3"]]) + validator.check_duplicate_rows(df, "file") + assert len(validator.active_report.raised_warnings) == 0 + + def test_single_column_duplicates_but_not_row(self): + validator = Validator(self.params) + df = pd.DataFrame([["a", "1"], ["a", "2"], ["b", "2"]]) + validator.check_duplicate_rows(df, "file") + assert len(validator.active_report.raised_warnings) == 0 + + def test_non_consecutive_duplicates(self): + validator = Validator(self.params) + df = pd.DataFrame([["a", "1"], ["b", "2"], ["a", "1"]]) + validator.check_duplicate_rows(df, "file") + assert len(validator.active_report.raised_warnings) == 1 + assert validator.active_report.raised_warnings[0].expression == [2] + assert validator.active_report.raised_warnings[0].check_data_id[1] == "file" + + def test_multiple_distinct_duplicates(self): + validator = Validator(self.params) + df = pd.DataFrame([["a", "1"], ["b", "2"], ["a", "1"], ["b", "2"]]) + validator.check_duplicate_rows(df, "file") + assert len(validator.active_report.raised_warnings) == 1 + assert validator.active_report.raised_warnings[0].expression == [2, 3] + + def test_more_than_two_copies(self): + validator = Validator(self.params) + df = pd.DataFrame([["a", "1"], ["b", "2"], ["b", "2"], ["b", "2"]]) + validator.check_duplicate_rows(df, "file") + assert len(validator.active_report.raised_warnings) == 1 + assert validator.active_report.raised_warnings[0].expression == [2, 3] class TestCheckBadGeoIdValue: params = {"data_source": "", "span_length": 0, @@ -238,31 +274,31 @@ def test_empty_df(self): validator = Validator(self.params) empty_df = pd.DataFrame(columns=["geo_id"], dtype=str) validator.check_bad_geo_id_value(empty_df, "name", "county") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_invalid_geo_id_county(self): validator = Validator(self.params) df = pd.DataFrame(["01001", "88888", "99999"], columns=["geo_id"]) validator.check_bad_geo_id_value(df, "name", "county") - assert len(validator.raised_errors) == 1 - assert "check_bad_geo_id_value" in validator.raised_errors[0].check_data_id - assert len(validator.raised_errors[0].expression) == 2 - assert "01001" not in validator.raised_errors[0].expression - assert "88888" in validator.raised_errors[0].expression - assert "99999" in validator.raised_errors[0].expression + assert len(validator.active_report.raised_errors) == 1 + assert "check_bad_geo_id_value" in validator.active_report.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors[0].expression) == 2 + assert "01001" not in validator.active_report.raised_errors[0].expression + assert "88888" in validator.active_report.raised_errors[0].expression + assert "99999" in validator.active_report.raised_errors[0].expression def test_invalid_geo_id_msa(self): validator = Validator(self.params) df = pd.DataFrame(["10180", "88888", "99999"], columns=["geo_id"]) validator.check_bad_geo_id_value(df, "name", "msa") - assert len(validator.raised_errors) == 1 - assert "check_bad_geo_id_value" in validator.raised_errors[0].check_data_id - assert len(validator.raised_errors[0].expression) == 2 - assert "10180" not 
in validator.raised_errors[0].expression - assert "88888" in validator.raised_errors[0].expression - assert "99999" in validator.raised_errors[0].expression + assert len(validator.active_report.raised_errors) == 1 + assert "check_bad_geo_id_value" in validator.active_report.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors[0].expression) == 2 + assert "10180" not in validator.active_report.raised_errors[0].expression + assert "88888" in validator.active_report.raised_errors[0].expression + assert "99999" in validator.active_report.raised_errors[0].expression def test_invalid_geo_id_hrr(self): validator = Validator(self.params) @@ -270,47 +306,47 @@ def test_invalid_geo_id_hrr(self): "888"], columns=["geo_id"]) validator.check_bad_geo_id_value(df, "name", "hrr") - assert len(validator.raised_errors) == 1 - assert "check_bad_geo_id_value" in validator.raised_errors[0].check_data_id - assert len(validator.raised_errors[0].expression) == 3 - assert "1" not in validator.raised_errors[0].expression - assert "11" not in validator.raised_errors[0].expression - assert "111" not in validator.raised_errors[0].expression - assert "8" in validator.raised_errors[0].expression - assert "88" in validator.raised_errors[0].expression - assert "888" in validator.raised_errors[0].expression + assert len(validator.active_report.raised_errors) == 1 + assert "check_bad_geo_id_value" in validator.active_report.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors[0].expression) == 3 + assert "1" not in validator.active_report.raised_errors[0].expression + assert "11" not in validator.active_report.raised_errors[0].expression + assert "111" not in validator.active_report.raised_errors[0].expression + assert "8" in validator.active_report.raised_errors[0].expression + assert "88" in validator.active_report.raised_errors[0].expression + assert "888" in validator.active_report.raised_errors[0].expression def test_invalid_geo_id_state(self): validator = Validator(self.params) df = pd.DataFrame(["aa", "ak"], columns=["geo_id"]) validator.check_bad_geo_id_value(df, "name", "state") - assert len(validator.raised_errors) == 1 - assert "check_bad_geo_id_value" in validator.raised_errors[0].check_data_id - assert len(validator.raised_errors[0].expression) == 1 - assert "ak" not in validator.raised_errors[0].expression - assert "aa" in validator.raised_errors[0].expression + assert len(validator.active_report.raised_errors) == 1 + assert "check_bad_geo_id_value" in validator.active_report.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors[0].expression) == 1 + assert "ak" not in validator.active_report.raised_errors[0].expression + assert "aa" in validator.active_report.raised_errors[0].expression def test_uppercase_geo_id(self): validator = Validator(self.params) df = pd.DataFrame(["ak", "AK"], columns=["geo_id"]) validator.check_bad_geo_id_value(df, "name", "state") - assert len(validator.raised_errors) == 0 - assert len(validator.raised_warnings) == 1 - assert "check_geo_id_lowercase" in validator.raised_warnings[0].check_data_id - assert "AK" in validator.raised_warnings[0].expression + assert len(validator.active_report.raised_errors) == 0 + assert len(validator.active_report.raised_warnings) == 1 + assert "check_geo_id_lowercase" in validator.active_report.raised_warnings[0].check_data_id + assert "AK" in validator.active_report.raised_warnings[0].expression def test_invalid_geo_id_national(self): validator = 
Validator(self.params) df = pd.DataFrame(["us", "zz"], columns=["geo_id"]) validator.check_bad_geo_id_value(df, "name", "national") - assert len(validator.raised_errors) == 1 - assert "check_bad_geo_id_value" in validator.raised_errors[0].check_data_id - assert len(validator.raised_errors[0].expression) == 1 - assert "us" not in validator.raised_errors[0].expression - assert "zz" in validator.raised_errors[0].expression + assert len(validator.active_report.raised_errors) == 1 + assert "check_bad_geo_id_value" in validator.active_report.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors[0].expression) == 1 + assert "us" not in validator.active_report.raised_errors[0].expression + assert "zz" in validator.active_report.raised_errors[0].expression class TestCheckBadVal: @@ -324,39 +360,39 @@ def test_empty_df(self): validator.check_bad_val(empty_df, "", "prop") validator.check_bad_val(empty_df, "", "pct") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_missing(self): validator = Validator(self.params) df = pd.DataFrame([np.nan], columns=["val"]) validator.check_bad_val(df, "name", "signal") - assert len(validator.raised_errors) == 1 - assert "check_val_missing" in validator.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors) == 1 + assert "check_val_missing" in validator.active_report.raised_errors[0].check_data_id def test_lt_0(self): validator = Validator(self.params) df = pd.DataFrame([-5], columns=["val"]) validator.check_bad_val(df, "name", "signal") - assert len(validator.raised_errors) == 1 - assert "check_val_lt_0" in validator.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors) == 1 + assert "check_val_lt_0" in validator.active_report.raised_errors[0].check_data_id def test_gt_max_pct(self): validator = Validator(self.params) df = pd.DataFrame([1e7], columns=["val"]) validator.check_bad_val(df, "name", "pct") - assert len(validator.raised_errors) == 1 - assert "check_val_pct_gt_100" in validator.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors) == 1 + assert "check_val_pct_gt_100" in validator.active_report.raised_errors[0].check_data_id def test_gt_max_prop(self): validator = Validator(self.params) df = pd.DataFrame([1e7], columns=["val"]) validator.check_bad_val(df, "name", "prop") - assert len(validator.raised_errors) == 1 - assert "check_val_prop_gt_100k" in validator.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors) == 1 + assert "check_val_prop_gt_100k" in validator.active_report.raised_errors[0].check_data_id class TestCheckBadSe: @@ -369,12 +405,12 @@ def test_empty_df(self): columns=["val", "se", "sample_size"], dtype=float) validator.check_bad_se(empty_df, "") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 validator.missing_se_allowed = True validator.check_bad_se(empty_df, "") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_missing(self): validator = Validator(self.params) @@ -383,16 +419,16 @@ def test_missing(self): "val", "se", "sample_size"]) validator.check_bad_se(df, "name") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 validator.missing_se_allowed = False validator.check_bad_se(df, "name") - assert len(validator.raised_errors) == 2 + assert len(validator.active_report.raised_errors) == 
2 assert "check_se_not_missing_and_in_range" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] assert "check_se_many_missing" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] def test_e_0_missing_allowed(self): validator = Validator(self.params) @@ -401,11 +437,11 @@ def test_e_0_missing_allowed(self): 1, np.nan, np.nan]], columns=["val", "se", "sample_size"]) validator.check_bad_se(df, "name") - assert len(validator.raised_errors) == 2 + assert len(validator.active_report.raised_errors) == 2 assert "check_se_missing_or_in_range" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] assert "check_se_0" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] def test_e_0_missing_not_allowed(self): validator = Validator(self.params) @@ -414,11 +450,11 @@ def test_e_0_missing_not_allowed(self): 1, np.nan, np.nan]], columns=["val", "se", "sample_size"]) validator.check_bad_se(df, "name") - assert len(validator.raised_errors) == 2 + assert len(validator.active_report.raised_errors) == 2 assert "check_se_not_missing_and_in_range" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] assert "check_se_0" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] def test_jeffreys(self): validator = Validator(self.params) @@ -427,11 +463,11 @@ def test_jeffreys(self): 1, np.nan, np.nan]], columns=["val", "se", "sample_size"]) validator.check_bad_se(df, "name") - assert len(validator.raised_errors) == 2 + assert len(validator.active_report.raised_errors) == 2 assert "check_se_not_missing_and_in_range" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] assert "check_se_0_when_val_0" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] class TestCheckBadN: @@ -444,12 +480,12 @@ def test_empty_df(self): columns=["val", "se", "sample_size"], dtype=float) validator.check_bad_sample_size(empty_df, "") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 validator.missing_sample_size_allowed = True validator.check_bad_sample_size(empty_df, "") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_missing(self): validator = Validator(self.params) @@ -458,14 +494,14 @@ def test_missing(self): "val", "se", "sample_size"]) validator.check_bad_sample_size(df, "name") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 validator.missing_sample_size_allowed = False validator.check_bad_sample_size(df, "name") - assert len(validator.raised_errors) == 1 + assert len(validator.active_report.raised_errors) == 1 assert "check_n_missing" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] def test_lt_min_missing_allowed(self): validator = Validator(self.params) @@ -474,9 +510,9 @@ def test_lt_min_missing_allowed(self): 1, np.nan, 
np.nan]], columns=["val", "se", "sample_size"]) validator.check_bad_sample_size(df, "name") - assert len(validator.raised_errors) == 1 + assert len(validator.active_report.raised_errors) == 1 assert "check_n_missing_or_gt_min" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] def test_lt_min_missing_not_allowed(self): validator = Validator(self.params) @@ -485,9 +521,9 @@ def test_lt_min_missing_not_allowed(self): 1, np.nan, 245]], columns=["val", "se", "sample_size"]) validator.check_bad_sample_size(df, "name") - assert len(validator.raised_errors) == 1 + assert len(validator.active_report.raised_errors) == 1 assert "check_n_gt_min" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] class TestCheckRapidChange: @@ -501,7 +537,7 @@ def test_same_df(self): validator.check_rapid_change_num_rows( test_df, ref_df, date.today(), "geo", "signal") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_0_vs_many(self): validator = Validator(self.params) @@ -513,9 +549,9 @@ def test_0_vs_many(self): validator.check_rapid_change_num_rows( test_df, ref_df, time_value, "geo", "signal") - assert len(validator.raised_errors) == 1 + assert len(validator.active_report.raised_errors) == 1 assert "check_rapid_change_num_rows" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] class TestCheckAvgValDiffs: @@ -534,7 +570,7 @@ def test_same_val(self): validator.check_avg_val_vs_reference( test_df, ref_df, date.today(), "geo", "signal") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_same_se(self): validator = Validator(self.params) @@ -548,7 +584,7 @@ def test_same_se(self): validator.check_avg_val_vs_reference( test_df, ref_df, date.today(), "geo", "signal") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_same_n(self): validator = Validator(self.params) @@ -562,7 +598,7 @@ def test_same_n(self): validator.check_avg_val_vs_reference( test_df, ref_df, date.today(), "geo", "signal") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_same_val_se_n(self): validator = Validator(self.params) @@ -576,7 +612,7 @@ def test_same_val_se_n(self): validator.check_avg_val_vs_reference( test_df, ref_df, date.today(), "geo", "signal") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_10x_val(self): validator = Validator(self.params) @@ -591,7 +627,7 @@ def test_10x_val(self): test_df, ref_df, datetime.combine(date.today(), datetime.min.time()), "geo", "signal") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_100x_val(self): validator = Validator(self.params) @@ -606,9 +642,9 @@ def test_100x_val(self): test_df, ref_df, datetime.combine(date.today(), datetime.min.time()), "geo", "signal") - assert len(validator.raised_errors) == 1 + assert len(validator.active_report.raised_errors) == 1 assert "check_test_vs_reference_avg_changed" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] def test_1000x_val(self): validator = Validator(self.params) @@ 
-623,10 +659,9 @@ def test_1000x_val(self): test_df, ref_df, datetime.combine(date.today(), datetime.min.time()), "geo", "signal") - assert len(validator.raised_errors) == 1 + assert len(validator.active_report.raised_errors) == 1 assert "check_test_vs_reference_avg_changed" in [ - err.check_data_id[0] for err in validator.raised_errors] - + err.check_data_id[0] for err in validator.active_report.raised_errors] class TestDataOutlier: params = {"data_source": "", "span_length": 1, @@ -666,19 +701,19 @@ def test_pos_outlier(self): test_df, ref_df, "state", "signal") - assert len(validator.raised_errors) == 1 + assert len(validator.active_report.raised_errors) == 1 assert "check_positive_negative_spikes" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] def test_neg_outlier(self): validator = Validator(self.params) ref_val = [100, 101, 100, 101, 100, - 100, 100, 100, 100, 100, - 100, 102, 100, 100, 100, - 100, 100, 101, 100, 100, - 100, 100, 100, 99, 100, - 100, 98, 100, 100, 100] + 100, 100, 100, 100, 100, + 100, 102, 100, 100, 100, + 100, 100, 101, 100, 100, + 100, 100, 100, 99, 100, + 100, 98, 100, 100, 100] test_val = [10, 10, 10] @@ -706,9 +741,9 @@ def test_neg_outlier(self): test_df, ref_df, "state", "signal") - assert len(validator.raised_errors) == 1 + assert len(validator.active_report.raised_errors) == 1 assert "check_positive_negative_spikes" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] def test_zero_outlier(self): validator = Validator(self.params) @@ -746,9 +781,9 @@ def test_zero_outlier(self): - assert len(validator.raised_errors) == 1 + assert len(validator.active_report.raised_errors) == 1 assert "check_positive_negative_spikes" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] def test_no_outlier(self): validator = Validator(self.params) @@ -786,7 +821,7 @@ def test_no_outlier(self): test_df, ref_df, "state", "signal") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_source_api_overlap(self): validator = Validator(self.params) @@ -824,6 +859,6 @@ def test_source_api_overlap(self): test_df, ref_df, "state", "signal") - assert len(validator.raised_errors) == 1 + assert len(validator.active_report.raised_errors) == 1 assert "check_positive_negative_spikes" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] diff --git a/validator/tests/test_datafetcher.py b/validator/tests/test_datafetcher.py new file mode 100644 index 000000000..4c2efeb9b --- /dev/null +++ b/validator/tests/test_datafetcher.py @@ -0,0 +1,104 @@ +"""Tests for datafetcher.py.""" + +from datetime import date +import mock +import numpy as np +import pandas as pd +from delphi_validator.datafetcher import (FILENAME_REGEX, + make_date_filter, + get_geo_signal_combos, + threaded_api_calls) +from delphi_validator.errors import APIDataFetchError, ValidationError + + +class TestDataFetcher: + """Tests for various data fetching utilities.""" + def test_make_date_filter(self): + date_filter = make_date_filter(date(2020, 4, 4), date(2020, 5, 23)) + + assert date_filter(FILENAME_REGEX.match("20200420_a_b.csv")) + assert not date_filter(FILENAME_REGEX.match("20200403_a_b.csv")) + assert not 
date_filter(FILENAME_REGEX.match("20200620_a_b.csv")) + assert not date_filter(FILENAME_REGEX.match("202006_a_b.csv")) + + @mock.patch("covidcast.metadata") + def test_get_geo_signal_combos(self, mock_metadata): + """Test that the geo signal combos are correctly pulled from the covidcast metadata.""" + mock_metadata.return_value = pd.DataFrame({"data_source": ["a", "a", "a", + "b", "b", "b"], + "signal": ["w", "x", "x", + "y", "y", "z"], + "geo_type": ["state", "state", "county", + "hrr", "msa", "msa"] + }) + + assert set(get_geo_signal_combos("a")) == set([("state", "w"), + ("state", "x"), + ("county", "x")]) + assert set(get_geo_signal_combos("b")) == set([("hrr", "y"), + ("msa", "y"), + ("msa", "z")]) + + @mock.patch("covidcast.signal") + def test_threaded_api_calls(self, mock_signal): + """Test that calls to the covidcast API are made.""" + + signal_data_1 = pd.DataFrame({"geo_value": ["1044"], + "stderr": [None], + "value": [3], + "issue": [10], + "lag": [7], + "sample_size": [None], + "time_value": [10] + }) + signal_data_2 = pd.DataFrame({"geo_value": ["0888"], + "stderr": [2], + "value": [14], + "issue": [10], + "lag": [1], + "sample_size": [100], + "time_value": [8] + }) + + def mock_signal_return_fn(unused_data_source, signal_type, unused_start_date, + unused_end_date, geo_type): + """Function to return data when covidcast.signal() is called.""" + if signal_type == "a": + return signal_data_1 + elif geo_type == "county": + return signal_data_2 + else: + return None + + mock_signal.side_effect = mock_signal_return_fn + + processed_signal_data_1 = pd.DataFrame({"geo_id": ["1044"], + "val": [3], + "se": [np.nan], + "sample_size": [np.nan], + "time_value": [10] + }) + processed_signal_data_2 = pd.DataFrame({"geo_id": ["0888"], + "val": [14], + "se": [2], + "sample_size": [100], + "time_value": [8] + }) + expected = { + ("county", "a"): processed_signal_data_1, + ("county", "b"): processed_signal_data_2, + ("state", "a"): processed_signal_data_1, + ("state", "b"): ValidationError(("api_data_fetch_error", "state", "b"), None, + APIDataFetchError("Error fetching data from 2020-03-10 " + "to 2020-06-10 for data source: " + "source, signal type: b, geo type: " + "state")) + } + actual = threaded_api_calls("source", date(2020, 3, 10), date(2020, 6, 10), expected.keys()) + + assert set(expected.keys()) == set(actual.keys()) + for k, v in actual.items(): + if isinstance(v, pd.DataFrame): + pd.testing.assert_frame_equal(v, expected[k]) + else: + assert str(v) == str(expected[k]) diff --git a/validator/tests/test_report.py b/validator/tests/test_report.py new file mode 100644 index 000000000..5269a6857 --- /dev/null +++ b/validator/tests/test_report.py @@ -0,0 +1,44 @@ +"""Tests for delphi_validator.report.""" +from datetime import date +from delphi_validator.errors import ValidationError +from delphi_validator.report import ValidationReport + +class TestValidationReport: + """Tests for ValidationReport class.""" + + ERROR_1 = ValidationError(("good", date(2020, 10, 5)), "exp 1", "msg 1") + ERROR_2 = ValidationError(("bad", date(2020, 11, 18)), "exp 2", "msg 2") + + def test_add_raised_unsuppressed_error(self): + """Test that an unsupressed error shows up in the unsuppressed error list.""" + report = ValidationReport([("bad", "2020-10-05")]) + report.add_raised_error(self.ERROR_1) + report.add_raised_error(self.ERROR_2) + assert report.unsuppressed_errors == [self.ERROR_1, self.ERROR_2] + + def test_add_raised_suppressed_error(self): + """Test that an supressed error does not show up in the 
unsuppressed error list.""" + report = ValidationReport([("good", "2020-10-05")]) + report.add_raised_error(self.ERROR_1) + + assert len(report.unsuppressed_errors) == 0 + assert report.num_suppressed == 1 + assert len(report.errors_to_suppress) == 0 + + # Each error can only be surpressed once. + report.add_raised_error(self.ERROR_1) + assert report.unsuppressed_errors == [self.ERROR_1] + + def test_str(self): + """Test that the string representation contains all information.""" + report = ValidationReport([("good", "2020-10-05")]) + report.increment_total_checks() + report.increment_total_checks() + report.increment_total_checks() + report.add_raised_warning(ImportWarning("wrong import")) + report.add_raised_warning(ImportWarning("right import")) + report.add_raised_error(self.ERROR_1) + report.add_raised_error(self.ERROR_2) + + assert str(report) == "3 checks run\n1 checks failed\n1 checks suppressed\n2 warnings\n"\ + "(('bad', datetime.date(2020, 11, 18)), 'exp 2', 'msg 2')\nwrong import\nright import\n" diff --git a/validator/tests/test_utils.py b/validator/tests/test_utils.py new file mode 100644 index 000000000..ee100b1d9 --- /dev/null +++ b/validator/tests/test_utils.py @@ -0,0 +1,28 @@ +"""Tests for module utils.""" + +from datetime import date +import pandas as pd +from delphi_validator.datafetcher import FILENAME_REGEX +from delphi_validator.utils import relative_difference_by_min, aggregate_frames + +class TestUtils: + """Tests for module utils.""" + + def test_relative_difference_by_min(self): + """Test basic functionality of relative_difference_by_min.""" + assert relative_difference_by_min(16, 10) == 0.6 + + def test_aggregate_frames(self): + """Test that frames are aggregated and their data is derived from the re.match objects.""" + frame_1 = pd.DataFrame({"data": list(range(10))}) + frame_2 = pd.DataFrame({"data": list(range(10, 20))}) + match_1 = FILENAME_REGEX.match("20200404_state_signal_1.csv") + match_2 = FILENAME_REGEX.match("20200505_county_signal_2.csv") + + actual = aggregate_frames([(None, match_1, frame_1), (None, match_2, frame_2)]) + expected = pd.DataFrame({"data": list(range(20)), + "geo_type": ["state"] * 10 + ["county"] * 10, + "time_value": [date(2020, 4, 4)] * 10 + [date(2020, 5, 5)] * 10, + "signal": ["signal_1"] * 10 + ["signal_2"] * 10 + }) + pd.testing.assert_frame_equal(actual, expected)
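
Note (editorial, not part of the patch): taken together, the validate.py and report changes above replace the old in-place raised_errors bookkeeping and the Validator.exit() call with a ValidationReport that Validator.validate() now returns. The sketch below shows how a caller might consume that report; the actual wiring in the indicator's run module is not included in this patch, so the params values, the export directory path, and the exit handling here are illustrative assumptions based on the removed exit() logic and the behavior exercised in test_report.py.

    # Illustrative sketch only -- not part of this diff.
    import sys
    from delphi_validator.validate import Validator

    # Assumed example values; shaped like the params dicts used in the tests above.
    params = {"data_source": "source", "span_length": 1,
              "end_date": "2020-09-02", "expected_lag": {}}

    validator = Validator(params)
    report = validator.validate("path/to/export_dir")  # hypothetical path; returns a ValidationReport

    # __str__ summarizes checks run, failed, suppressed, and warnings (see test_str above).
    print(report)

    # Mirror the removed Validator.exit() behavior: fail only on unsuppressed errors.
    sys.exit(1 if report.unsuppressed_errors else 0)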