From 8c752d93dc28e617f1b6f0bda32e30a67a6e0816 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 10 Feb 2021 13:32:17 -0800 Subject: [PATCH 01/43] Update utilities for NAN codes: * update export utility to export, validate, and test the missing cols * handle deleted rows: replaced with nan values * handle deleted files: replace with an empty CSV file * handle comparisons between CSVs with/without missing cols --- _delphi_utils_python/delphi_utils/archive.py | 52 ++++-- _delphi_utils_python/delphi_utils/export.py | 42 ++++- _delphi_utils_python/tests/test_archive.py | 172 ++++++++++++++++--- _delphi_utils_python/tests/test_export.py | 76 +++++++- 4 files changed, 298 insertions(+), 44 deletions(-) diff --git a/_delphi_utils_python/delphi_utils/archive.py b/_delphi_utils_python/delphi_utils/archive.py index 5d1036bcd..31b88a1d1 100644 --- a/_delphi_utils_python/delphi_utils/archive.py +++ b/_delphi_utils_python/delphi_utils/archive.py @@ -40,9 +40,11 @@ from git import Repo from git.refs.head import Head import pandas as pd +import numpy as np from .utils import read_params from .logger import get_structured_logger +from .nancodes import Nans Files = List[str] FileDiffMap = Dict[str, Optional[str]] @@ -73,8 +75,10 @@ def diff_export_csv( changed_df is the pd.DataFrame of common rows from after_csv with changed values. added_df is the pd.DataFrame of added rows from after_csv. """ - export_csv_dtypes = {"geo_id": str, "val": float, - "se": float, "sample_size": float} + export_csv_dtypes = { + "geo_id": str, "val": float, "se": float, "sample_size": float, + "missing_val": int, "missing_se": int, "missing_sample_size": int + } before_df = pd.read_csv(before_csv, dtype=export_csv_dtypes) before_df.set_index("geo_id", inplace=True) @@ -89,12 +93,22 @@ def diff_export_csv( before_df_cmn = before_df.reindex(common_idx) after_df_cmn = after_df.reindex(common_idx) - # Exact comparisons, treating NA == NA as True - same_mask = before_df_cmn == after_df_cmn - same_mask |= pd.isna(before_df_cmn) & pd.isna(after_df_cmn) + # If CSVs have different columns (no missingness), mark all values as new + if ("missing_val" in before_df_cmn.columns) ^ ("missing_val" in after_df_cmn.columns): + same_mask = after_df_cmn.copy() + same_mask.loc[:] = False + else: + # Exact comparisons, treating NA == NA as True + same_mask = before_df_cmn == after_df_cmn + same_mask |= pd.isna(before_df_cmn) & pd.isna(after_df_cmn) + + # Code deleted entries as nans with the deleted missing code + deleted_df = before_df.loc[deleted_idx, :].copy() + deleted_df[["val", "se", "sample_size"]] = np.nan + deleted_df[["missing_val", "missing_se", "missing_sample_size"]] = Nans.DELETED return ( - before_df.loc[deleted_idx, :], + deleted_df, after_df_cmn.loc[~(same_mask.all(axis=1)), :], after_df.loc[added_idx, :]) @@ -227,11 +241,11 @@ def diff_exports(self) -> Tuple[Files, FileDiffMap, Files]: deleted_df, changed_df, added_df = diff_export_csv( before_file, after_file) - new_issues_df = pd.concat([changed_df, added_df], axis=0) + new_issues_df = pd.concat([deleted_df, changed_df, added_df], axis=0) if len(deleted_df) > 0: print( - f"Warning, diff has deleted indices in {after_file} that will be ignored") + f"Diff has deleted indices in {after_file} that have been coded as nans.") # Write the diffs to diff_file, if applicable if len(new_issues_df) > 0: @@ -240,6 +254,17 @@ def diff_exports(self) -> Tuple[Files, FileDiffMap, Files]: new_issues_df.to_csv(diff_file, na_rep="NA") common_diffs[after_file] = diff_file + # Replace deleted 
files with empty versions, but only if the cached version is not + # already empty + for deleted_file in deleted_files: + deleted_df = pd.read_csv(deleted_file) + if not deleted_df.empty: + print( + f"Diff has deleted {deleted_file} and replaced it with an empty CSV.") + empty_df = deleted_df[0:0] + new_deleted_filename = join(self.export_dir, basename(deleted_file)) + empty_df.to_csv(new_deleted_filename, index=False) + return deleted_files, common_diffs, new_files def archive_exports(self, exported_files: Files) -> Tuple[Files, Files]: @@ -266,9 +291,10 @@ def filter_exports(self, common_diffs: FileDiffMap): Filter export directory to only contain relevant files. Filters down the export_dir to only contain: - 1) New files, 2) Changed files, filtered-down to the ADDED and CHANGED rows only. - Should be called after archive_exports() so we archive the raw exports before - potentially modifying them. + 1) New files, 2) Changed files, filtered-down to the ADDED and CHANGED rows + only, and 3) Deleted files replaced with empty CSVs with the same name. Should + be called after archive_exports() so we archive the raw exports before potentially + modifying them. Parameters ---------- @@ -297,9 +323,9 @@ def run(self): self.update_cache() # Diff exports, and make incremental versions - _, common_diffs, new_files = self.diff_exports() + deleted_files, common_diffs, new_files = self.diff_exports() - # Archive changed and new files only + # Archive changed, new, and emptied deleted files to_archive = [f for f, diff in common_diffs.items() if diff is not None] to_archive += new_files diff --git a/_delphi_utils_python/delphi_utils/export.py b/_delphi_utils_python/delphi_utils/export.py index 5a3b804b2..afc1a4c8a 100644 --- a/_delphi_utils_python/delphi_utils/export.py +++ b/_delphi_utils_python/delphi_utils/export.py @@ -3,10 +3,32 @@ from datetime import datetime from os.path import join from typing import Optional +import logging import numpy as np import pandas as pd +from .nancodes import Nans + +def filter_contradicting_missing_codes(df, sensor, metric, date, logger=None): + """Find values with contradictory missingness codes, filter them, and log.""" + columns = ["val", "se", "sample_size"] + # Get indicies where the XNOR is true (i.e. both are true or both are false). + masks = [ + ~(df[column].isna() ^ df["missing_" + column].eq(Nans.NOT_MISSING)) + for column in columns + ] + for mask in masks: + if not logger is None and df.loc[mask].size > 0: + logger.info( + "Filtering contradictory missing code in " + + "{0}_{1}_{2}.".format(sensor, metric, date.strftime(format="%Y-%m-%d")) + ) + df = df.loc[~mask] + elif logger is None and df.loc[mask].size > 0: + df = df.loc[~mask] + return df + def create_export_csv( df: pd.DataFrame, export_dir: str, @@ -16,7 +38,8 @@ def create_export_csv( start_date: Optional[datetime] = None, end_date: Optional[datetime] = None, remove_null_samples: Optional[bool] = False, - write_empty_days: Optional[bool] = False + write_empty_days: Optional[bool] = False, + logger: Optional[logging.Logger] = None ): """Export data in the format expected by the Delphi API. @@ -43,6 +66,8 @@ def create_export_csv( write_empty_days: Optional[bool] If true, every day in between start_date and end_date will have a CSV file written even if there is no data for the day. If false, only the days present are written. + logger: Optional[logging.Logger] + Pass a logger object here to log information about contradictory missing codes. 
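+        If no logger is given, rows with contradictory missing codes are still
+        dropped; they are simply not logged.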
Returns --------- @@ -70,7 +95,20 @@ def create_export_csv( else: export_filename = f"{date.strftime('%Y%m%d')}_{geo_res}_{metric}_{sensor}.csv" export_file = join(export_dir, export_filename) - export_df = df[df["timestamp"] == date][["geo_id", "val", "se", "sample_size",]] + expected_columns = [ + "geo_id", + "val", + "se", + "sample_size", + "missing_val", + "missing_se", + "missing_sample_size" + ] + export_df = df[df["timestamp"] == date].filter(items=expected_columns) + if "missing_val" in export_df.columns: + export_df = filter_contradicting_missing_codes( + export_df, sensor, metric, date, logger=logger + ) if remove_null_samples: export_df = export_df[export_df["sample_size"].notnull()] export_df = export_df.round({"val": 7, "se": 7}) diff --git a/_delphi_utils_python/tests/test_archive.py b/_delphi_utils_python/tests/test_archive.py index 1b068f898..4c7d1fc57 100644 --- a/_delphi_utils_python/tests/test_archive.py +++ b/_delphi_utils_python/tests/test_archive.py @@ -14,8 +14,12 @@ from delphi_utils.archive import ArchiveDiffer, GitArchiveDiffer, S3ArchiveDiffer,\ archiver_from_params +from delphi_utils.nancodes import Nans -CSV_DTYPES = {"geo_id": str, "val": float, "se": float, "sample_size": float} +CSV_DTYPES = { + "geo_id": str, "val": float, "se": float, "sample_size": float, + "missing_val": int, "missing_se":int, "missing_sample_size": int + } CSVS_BEFORE = { # Common @@ -23,20 +27,40 @@ "geo_id": ["1", "2", "3"], "val": [1.000000001, 2.00000002, 3.00000003], "se": [0.1, 0.2, 0.3], - "sample_size": [10.0, 20.0, 30.0]}), + "sample_size": [10.0, 20.0, 30.0], + "missing_val": [Nans.NOT_MISSING] * 3, + "missing_se": [Nans.NOT_MISSING] * 3, + "missing_sample_size": [Nans.NOT_MISSING] * 3, + }), "csv1": pd.DataFrame({ "geo_id": ["1", "2", "3"], "val": [1.0, 2.0, 3.0], "se": [np.nan, 0.20000002, 0.30000003], - "sample_size": [10.0, 20.0, 30.0]}), + "sample_size": [10.0, 20.0, 30.0], + "missing_val": [Nans.NOT_MISSING] * 3, + "missing_se": [Nans.NOT_MISSING] * 3, + "missing_sample_size": [Nans.NOT_MISSING] * 3, + }), # Deleted "csv2": pd.DataFrame({ "geo_id": ["1"], "val": [1.0], "se": [0.1], - "sample_size": [10.0]}), + "sample_size": [10.0], + "missing_val": [Nans.NOT_MISSING], + "missing_se": [Nans.NOT_MISSING], + "missing_sample_size": [Nans.NOT_MISSING], + }), + + # Common, but updated with missing columns + "csv4": pd.DataFrame({ + "geo_id": ["1"], + "val": [1.0], + "se": [0.1], + "sample_size": [10.0] + }), } CSVS_AFTER = { @@ -45,23 +69,45 @@ "geo_id": ["1", "2", "3"], "val": [1.0, 2.0, 3.0], "se": [0.10000001, 0.20000002, 0.30000003], - "sample_size": [10.0, 20.0, 30.0]}), + "sample_size": [10.0, 20.0, 30.0], + "missing_val": [Nans.NOT_MISSING] * 3, + "missing_se": [Nans.NOT_MISSING] * 3, + "missing_sample_size": [Nans.NOT_MISSING] * 3, + }), "csv1": pd.DataFrame({ "geo_id": ["1", "2", "4"], "val": [1.0, 2.1, 4.0], "se": [np.nan, 0.21, np.nan], - "sample_size": [10.0, 21.0, 40.0]}), + "sample_size": [10.0, 21.0, 40.0], + "missing_val": [Nans.NOT_MISSING] * 3, + "missing_se": [Nans.NOT_MISSING] * 3, + "missing_sample_size": [Nans.NOT_MISSING] * 3, + }), # Added "csv3": pd.DataFrame({ "geo_id": ["2"], "val": [2.0000002], "se": [0.2], - "sample_size": [20.0]}), + "sample_size": [20.0], + "missing_val": [Nans.NOT_MISSING], + "missing_se": [Nans.NOT_MISSING], + "missing_sample_size": [Nans.NOT_MISSING], + }), + + # Common, but updated with missing columns + "csv4": pd.DataFrame({ + "geo_id": ["1"], + "val": [1.0], + "se": [0.1], + "sample_size": [10.0], + "missing_val": 
[Nans.NOT_MISSING], + "missing_se": [Nans.NOT_MISSING], + "missing_sample_size": [Nans.NOT_MISSING], + }), } - class TestArchiveDiffer: def test_stubs(self): @@ -80,10 +126,27 @@ def test_diff_and_filter_exports(self, tmp_path): mkdir(export_dir) csv1_diff = pd.DataFrame({ - "geo_id": ["2", "4"], - "val": [2.1, 4.0], - "se": [0.21, np.nan], - "sample_size": [21.0, 40.0]}) + "geo_id": ["3", "2", "4"], + "val": [np.nan, 2.1, 4.0], + "se": [np.nan, 0.21, np.nan], + "sample_size": [np.nan, 21.0, 40.0], + "missing_val": [Nans.DELETED] + [Nans.NOT_MISSING] * 2, + "missing_se": [Nans.DELETED] + [Nans.NOT_MISSING] * 2, + "missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2, + }) + + csv2_deleted = pd.DataFrame( + np.empty(0, dtype=[ + ("geo_id", str), + ("val", float), + ("se", float), + ("sample_size", float), + ("missing_val", int), + ("missing_se", int), + ("missing_sample_size", int) + ]), + index=[] + ) arch_diff = ArchiveDiffer(cache_dir, export_dir) @@ -106,7 +169,7 @@ def test_diff_and_filter_exports(self, tmp_path): # Check return values assert set(deleted_files) == {join(cache_dir, "csv2.csv")} assert set(common_diffs.keys()) == { - join(export_dir, f) for f in ["csv0.csv", "csv1.csv"]} + join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv"]} assert set(new_files) == {join(export_dir, "csv3.csv")} assert common_diffs[join(export_dir, "csv0.csv")] is None assert common_diffs[join(export_dir, "csv1.csv")] == join( @@ -114,7 +177,10 @@ def test_diff_and_filter_exports(self, tmp_path): # Check filesystem for actual files assert set(listdir(export_dir)) == { - "csv0.csv", "csv1.csv", "csv1.csv.diff", "csv3.csv"} + "csv0.csv", "csv1.csv", "csv1.csv.diff", + "csv3.csv", "csv4.csv", "csv4.csv.diff", + "csv2.csv" + } assert_frame_equal( pd.read_csv(join(export_dir, "csv1.csv.diff"), dtype=CSV_DTYPES), csv1_diff) @@ -131,8 +197,11 @@ def test_diff_and_filter_exports(self, tmp_path): arch_diff.filter_exports(common_diffs) - # Check exports directory just has incremental changes - assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv"} + # Check exports directory just has incremental and deleted changes + assert set(listdir(export_dir)) == {"csv1.csv", "csv2.csv", "csv3.csv", "csv4.csv"} + assert_frame_equal( + pd.read_csv(join(export_dir, "csv2.csv"), dtype=CSV_DTYPES), + csv2_deleted) assert_frame_equal( pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES), csv1_diff) @@ -259,15 +328,34 @@ def test_run(self, tmp_path, s3_client): assert_frame_equal(pd.read_csv(body, dtype=CSV_DTYPES), df) # Check exports directory just has incremental changes - assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv"} + assert set(listdir(export_dir)) == {"csv1.csv", "csv2.csv", "csv3.csv", "csv4.csv"} csv1_diff = pd.DataFrame({ - "geo_id": ["2", "4"], - "val": [2.1, 4.0], - "se": [0.21, np.nan], - "sample_size": [21.0, 40.0]}) + "geo_id": ["3", "2", "4"], + "val": [np.nan, 2.1, 4.0], + "se": [np.nan, 0.21, np.nan], + "sample_size": [np.nan, 21.0, 40.0], + "missing_val": [Nans.DELETED] + [Nans.NOT_MISSING] * 2, + "missing_se": [Nans.DELETED] + [Nans.NOT_MISSING] * 2, + "missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2, + }) assert_frame_equal( pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES), csv1_diff) + csv2_deleted = pd.DataFrame( + np.empty(0, dtype=[ + ("geo_id", str), + ("val", float), + ("se", float), + ("sample_size", float), + ("missing_val", int), + ("missing_se", int), + ("missing_sample_size", int) + ]), + index=[] + ) + assert_frame_equal( + 
pd.read_csv(join(export_dir, "csv2.csv"), dtype=CSV_DTYPES), + csv2_deleted) class TestGitArchiveDiffer: @@ -346,7 +434,11 @@ def test_diff_exports(self, tmp_path): "geo_id": ["1", "2", "3"], "val": [1.0, 2.0, 3.0], "se": [0.1, 0.2, 0.3], - "sample_size": [10.0, 20.0, 30.0]}) + "sample_size": [10.0, 20.0, 30.0], + "missing_val": [Nans.NOT_MISSING] * 3, + "missing_se": [Nans.NOT_MISSING] * 3, + "missing_sample_size": [Nans.NOT_MISSING] * 3, + }) # Write exact same CSV into cache and export, so no diffs expected csv1.to_csv(join(cache_dir, "csv1.csv"), index=False) @@ -383,7 +475,11 @@ def test_archive_exports(self, tmp_path): "geo_id": ["1", "2", "3"], "val": [1.0, 2.0, 3.0], "se": [0.1, 0.2, 0.3], - "sample_size": [10.0, 20.0, 30.0]}) + "sample_size": [10.0, 20.0, 30.0], + "missing_val": [Nans.NOT_MISSING] * 3, + "missing_se": [Nans.NOT_MISSING] * 3, + "missing_sample_size": [Nans.NOT_MISSING] * 3, + }) # csv1.csv is now a dirty edit in the repo, and to be exported too csv1.to_csv(join(cache_dir, "csv1.csv"), index=False) @@ -460,15 +556,35 @@ def test_run(self, tmp_path): original_branch.checkout() # Check exports directory just has incremental changes - assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv"} + assert set(listdir(export_dir)) == {"csv1.csv", "csv2.csv", "csv3.csv", "csv4.csv"} csv1_diff = pd.DataFrame({ - "geo_id": ["2", "4"], - "val": [2.1, 4.0], - "se": [0.21, np.nan], - "sample_size": [21.0, 40.0]}) + "geo_id": ["3", "2", "4"], + "val": [np.nan, 2.1, 4.0], + "se": [np.nan, 0.21, np.nan], + "sample_size": [np.nan, 21.0, 40.0], + "missing_val": [Nans.DELETED] + [Nans.NOT_MISSING] * 2, + "missing_se": [Nans.DELETED] + [Nans.NOT_MISSING] * 2, + "missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2, + }) assert_frame_equal( pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES), csv1_diff) + csv2_deleted = pd.DataFrame( + np.empty(0, dtype=[ + ("geo_id", str), + ("val", float), + ("se", float), + ("sample_size", float), + ("missing_val", int), + ("missing_se", int), + ("missing_sample_size", int) + ]), + index=[] + ) + assert_frame_equal( + pd.read_csv(join(export_dir, "csv2.csv"), dtype=CSV_DTYPES), + csv2_deleted) + class TestFromParams: diff --git a/_delphi_utils_python/tests/test_export.py b/_delphi_utils_python/tests/test_export.py index 31ec5c113..b22a710cd 100644 --- a/_delphi_utils_python/tests/test_export.py +++ b/_delphi_utils_python/tests/test_export.py @@ -3,8 +3,11 @@ from os import listdir, remove from os.path import join +import mock +import numpy as np import pandas as pd -from delphi_utils import create_export_csv + +from delphi_utils import create_export_csv, Nans def _clean_directory(directory): """Clean files out of a directory.""" @@ -43,6 +46,34 @@ class TestExport: } ) + # A sample data frame with missingness. + DF2 = pd.DataFrame( + { + "geo_id": ["51093", "51175", "51175", "51620"], + "timestamp": TIMES, + "val": [3.12345678910, np.nan, 2.2, 2.6], + "se": [0.15, 0.22, np.nan, 0.34], + "sample_size": [100, 100, 101, None], + "missing_val": [Nans.NOT_MISSING, Nans.OTHER, Nans.NOT_MISSING, Nans.NOT_MISSING], + "missing_se": [Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.OTHER, Nans.NOT_MISSING], + "missing_sample_size": [Nans.NOT_MISSING] * 3 + [Nans.OTHER] + } + ) + + # A sample data frame with contradictory missing codes. 
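+    # The only contradiction is the first row (geo_id 51093): val is np.nan while
+    # missing_val is Nans.NOT_MISSING, so it should be filtered out and logged once.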
+ DF3 = pd.DataFrame( + { + "geo_id": ["51093", "51175", "51175", "51620"], + "timestamp": TIMES, + "val": [np.nan, np.nan, 2.2, 2.6], + "se": [0.15, 0.22, np.nan, 0.34], + "sample_size": [100, 100, 101, None], + "missing_val": [Nans.NOT_MISSING, Nans.OTHER, Nans.NOT_MISSING, Nans.NOT_MISSING], + "missing_se": [Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.OTHER, Nans.NOT_MISSING], + "missing_sample_size": [Nans.NOT_MISSING] * 3 + [Nans.OTHER] + } + ) + # Directory in which to store tests. TEST_DIR = "test_dir" @@ -235,3 +266,46 @@ def test_export_without_null_removal(self): ] ) assert pd.read_csv(join(self.TEST_DIR, "20200606_state_test.csv")).size > 0 + + def test_export_df_with_missingness(self): + _clean_directory(self.TEST_DIR) + + create_export_csv( + df=self.DF2.copy(), + export_dir=self.TEST_DIR, + geo_res="state", + sensor="test", + remove_null_samples=False + ) + assert _non_ignored_files_set(self.TEST_DIR) == set( + [ + "20200215_state_test.csv", + "20200301_state_test.csv", + "20200315_state_test.csv", + ] + ) + assert pd.read_csv(join(self.TEST_DIR, "20200315_state_test.csv")).size > 0 + + @mock.patch("delphi_utils.logger") + def test_export_df_with_contradictory_missingness(self, mock_logger): + _clean_directory(self.TEST_DIR) + + create_export_csv( + df=self.DF3.copy(), + export_dir=self.TEST_DIR, + geo_res="state", + sensor="test", + remove_null_samples=False, + logger=mock_logger + ) + assert _non_ignored_files_set(self.TEST_DIR) == set( + [ + "20200215_state_test.csv", + "20200301_state_test.csv", + "20200315_state_test.csv", + ] + ) + assert pd.read_csv(join(self.TEST_DIR, "20200315_state_test.csv")).size > 0 + mock_logger.info.assert_called_once_with( + "Filtering contradictory missing code in test_None_2020-02-15." + ) From 83cb333f9965ccd06afe954eba5443b84920b9b4 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 12 May 2021 14:29:50 -0700 Subject: [PATCH 02/43] Nans: update archiver deletion handling --- _delphi_utils_python/delphi_utils/archive.py | 6 +++++- _delphi_utils_python/tests/test_archive.py | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/_delphi_utils_python/delphi_utils/archive.py b/_delphi_utils_python/delphi_utils/archive.py index 31b88a1d1..0eb302e7d 100644 --- a/_delphi_utils_python/delphi_utils/archive.py +++ b/_delphi_utils_python/delphi_utils/archive.py @@ -256,7 +256,9 @@ def diff_exports(self) -> Tuple[Files, FileDiffMap, Files]: # Replace deleted files with empty versions, but only if the cached version is not # already empty + new_deleted_files = [] for deleted_file in deleted_files: + breakpoint() deleted_df = pd.read_csv(deleted_file) if not deleted_df.empty: print( @@ -264,8 +266,9 @@ def diff_exports(self) -> Tuple[Files, FileDiffMap, Files]: empty_df = deleted_df[0:0] new_deleted_filename = join(self.export_dir, basename(deleted_file)) empty_df.to_csv(new_deleted_filename, index=False) + new_deleted_files.append(deleted_file) - return deleted_files, common_diffs, new_files + return new_deleted_files, common_diffs, new_files def archive_exports(self, exported_files: Files) -> Tuple[Files, Files]: """ @@ -329,6 +332,7 @@ def run(self): to_archive = [f for f, diff in common_diffs.items() if diff is not None] to_archive += new_files + to_archive += deleted_files _, fails = self.archive_exports(to_archive) # Filter existing exports to exclude those that failed to archive diff --git a/_delphi_utils_python/tests/test_archive.py b/_delphi_utils_python/tests/test_archive.py index 4c7d1fc57..092d4d8a8 100644 --- 
a/_delphi_utils_python/tests/test_archive.py +++ b/_delphi_utils_python/tests/test_archive.py @@ -298,6 +298,7 @@ def test_run(self, tmp_path, s3_client): export_dir = join(str(tmp_path), "export") mkdir(cache_dir) mkdir(export_dir) + breakpoint() # Set up current buckets to be `CSVS_BEFORE`. s3_client.create_bucket(Bucket=self.bucket_name) From cd140b153c92e2deeb39f41d1969c82e783d81bb Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 13 May 2021 15:51:13 -0700 Subject: [PATCH 03/43] Nans: update archiver deletion --- _delphi_utils_python/delphi_utils/archive.py | 32 ++++++---- _delphi_utils_python/tests/test_archive.py | 66 +++++++++----------- 2 files changed, 48 insertions(+), 50 deletions(-) diff --git a/_delphi_utils_python/delphi_utils/archive.py b/_delphi_utils_python/delphi_utils/archive.py index 0eb302e7d..ce6553018 100644 --- a/_delphi_utils_python/delphi_utils/archive.py +++ b/_delphi_utils_python/delphi_utils/archive.py @@ -254,21 +254,26 @@ def diff_exports(self) -> Tuple[Files, FileDiffMap, Files]: new_issues_df.to_csv(diff_file, na_rep="NA") common_diffs[after_file] = diff_file + export_csv_dtypes = { + "geo_id": str, "val": float, "se": float, "sample_size": float, + "missing_val": int, "missing_se": int, "missing_sample_size": int + } + # Replace deleted files with empty versions, but only if the cached version is not # already empty - new_deleted_files = [] + deleted_files_export = [] for deleted_file in deleted_files: - breakpoint() - deleted_df = pd.read_csv(deleted_file) - if not deleted_df.empty: - print( - f"Diff has deleted {deleted_file} and replaced it with an empty CSV.") - empty_df = deleted_df[0:0] - new_deleted_filename = join(self.export_dir, basename(deleted_file)) - empty_df.to_csv(new_deleted_filename, index=False) - new_deleted_files.append(deleted_file) - - return new_deleted_files, common_diffs, new_files + deleted_df = pd.read_csv(deleted_file, dtype=export_csv_dtypes) + print( + f"Diff has deleted {deleted_file}; generating a CSV with deleted rows." 
+ ) + deleted_df[["val", "se", "sample_size"]] = np.nan + deleted_df[["missing_val", "missing_se", "missing_sample_size"]] = Nans.DELETED + filename = join(self.export_dir, basename(deleted_file)) + deleted_df.to_csv(filename, index=False) + deleted_files_export.append(filename) + + return deleted_files_export, common_diffs, new_files def archive_exports(self, exported_files: Files) -> Tuple[Files, Files]: """ @@ -444,6 +449,9 @@ def archive_exports(self, # pylint: disable=arguments-differ archive_success.append(exported_file) except FileNotFoundError: archive_fail.append(exported_file) + except shutil.SameFileError: + # no need to copy if the cached file is the same + archive_success.append(exported_file) self._exports_archived = True diff --git a/_delphi_utils_python/tests/test_archive.py b/_delphi_utils_python/tests/test_archive.py index 092d4d8a8..0a21ecbb9 100644 --- a/_delphi_utils_python/tests/test_archive.py +++ b/_delphi_utils_python/tests/test_archive.py @@ -135,18 +135,15 @@ def test_diff_and_filter_exports(self, tmp_path): "missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2, }) - csv2_deleted = pd.DataFrame( - np.empty(0, dtype=[ - ("geo_id", str), - ("val", float), - ("se", float), - ("sample_size", float), - ("missing_val", int), - ("missing_se", int), - ("missing_sample_size", int) - ]), - index=[] - ) + csv2_deleted = pd.DataFrame({ + "geo_id": ["1"], + "val": [np.nan], + "se": [np.nan], + "sample_size": [np.nan], + "missing_val": [Nans.DELETED], + "missing_se": [Nans.DELETED], + "missing_sample_size": [Nans.DELETED], + }) arch_diff = ArchiveDiffer(cache_dir, export_dir) @@ -167,7 +164,7 @@ def test_diff_and_filter_exports(self, tmp_path): deleted_files, common_diffs, new_files = arch_diff.diff_exports() # Check return values - assert set(deleted_files) == {join(cache_dir, "csv2.csv")} + assert set(deleted_files) == {join(export_dir, "csv2.csv")} assert set(common_diffs.keys()) == { join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv"]} assert set(new_files) == {join(export_dir, "csv3.csv")} @@ -298,7 +295,6 @@ def test_run(self, tmp_path, s3_client): export_dir = join(str(tmp_path), "export") mkdir(cache_dir) mkdir(export_dir) - breakpoint() # Set up current buckets to be `CSVS_BEFORE`. 
s3_client.create_bucket(Bucket=self.bucket_name) @@ -342,18 +338,15 @@ def test_run(self, tmp_path, s3_client): assert_frame_equal( pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES), csv1_diff) - csv2_deleted = pd.DataFrame( - np.empty(0, dtype=[ - ("geo_id", str), - ("val", float), - ("se", float), - ("sample_size", float), - ("missing_val", int), - ("missing_se", int), - ("missing_sample_size", int) - ]), - index=[] - ) + csv2_deleted = pd.DataFrame({ + "geo_id": ["1"], + "val": [np.nan], + "se": [np.nan], + "sample_size": [np.nan], + "missing_val": [Nans.DELETED], + "missing_se": [Nans.DELETED], + "missing_sample_size": [Nans.DELETED], + }) assert_frame_equal( pd.read_csv(join(export_dir, "csv2.csv"), dtype=CSV_DTYPES), csv2_deleted) @@ -570,18 +563,15 @@ def test_run(self, tmp_path): assert_frame_equal( pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES), csv1_diff) - csv2_deleted = pd.DataFrame( - np.empty(0, dtype=[ - ("geo_id", str), - ("val", float), - ("se", float), - ("sample_size", float), - ("missing_val", int), - ("missing_se", int), - ("missing_sample_size", int) - ]), - index=[] - ) + csv2_deleted = pd.DataFrame({ + "geo_id": ["1"], + "val": [np.nan], + "se": [np.nan], + "sample_size": [np.nan], + "missing_val": [Nans.DELETED], + "missing_se": [Nans.DELETED], + "missing_sample_size": [Nans.DELETED], + }) assert_frame_equal( pd.read_csv(join(export_dir, "csv2.csv"), dtype=CSV_DTYPES), csv2_deleted) From 7a9e45cda05151c740759d3bb65e0851e1a0a438 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Tue, 31 Aug 2021 11:50:29 -0700 Subject: [PATCH 04/43] Nancodes archiver: rename variable for clarity --- _delphi_utils_python/delphi_utils/archive.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/_delphi_utils_python/delphi_utils/archive.py b/_delphi_utils_python/delphi_utils/archive.py index ce6553018..994dfd7df 100644 --- a/_delphi_utils_python/delphi_utils/archive.py +++ b/_delphi_utils_python/delphi_utils/archive.py @@ -261,19 +261,19 @@ def diff_exports(self) -> Tuple[Files, FileDiffMap, Files]: # Replace deleted files with empty versions, but only if the cached version is not # already empty - deleted_files_export = [] + deleted_files_nanfilled = [] for deleted_file in deleted_files: deleted_df = pd.read_csv(deleted_file, dtype=export_csv_dtypes) print( - f"Diff has deleted {deleted_file}; generating a CSV with deleted rows." + f"Diff has deleted {deleted_file}; generating a CSV with corresponding deleted rows." 
) deleted_df[["val", "se", "sample_size"]] = np.nan deleted_df[["missing_val", "missing_se", "missing_sample_size"]] = Nans.DELETED filename = join(self.export_dir, basename(deleted_file)) deleted_df.to_csv(filename, index=False) - deleted_files_export.append(filename) + deleted_files_nanfilled.append(filename) - return deleted_files_export, common_diffs, new_files + return deleted_files_nanfilled, common_diffs, new_files def archive_exports(self, exported_files: Files) -> Tuple[Files, Files]: """ From 19e8be40494bf25e4b0e44289da93c87709af4c7 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Tue, 31 Aug 2021 12:03:22 -0700 Subject: [PATCH 05/43] Nancodes archiver: small formatting change --- _delphi_utils_python/tests/test_archive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_delphi_utils_python/tests/test_archive.py b/_delphi_utils_python/tests/test_archive.py index 0a21ecbb9..111acf92f 100644 --- a/_delphi_utils_python/tests/test_archive.py +++ b/_delphi_utils_python/tests/test_archive.py @@ -133,7 +133,7 @@ def test_diff_and_filter_exports(self, tmp_path): "missing_val": [Nans.DELETED] + [Nans.NOT_MISSING] * 2, "missing_se": [Nans.DELETED] + [Nans.NOT_MISSING] * 2, "missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2, - }) + }) csv2_deleted = pd.DataFrame({ "geo_id": ["1"], From 01e7b3daa978f29d41cdac60472cf4b309cce6a1 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 14 Sep 2021 17:43:41 -0400 Subject: [PATCH 06/43] un-retire schooling indicators from sirCAL Revert #1192 --- ansible/templates/sir_complainsalot-params-prod.json.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/templates/sir_complainsalot-params-prod.json.j2 b/ansible/templates/sir_complainsalot-params-prod.json.j2 index 0076bcc82..de4b732e6 100644 --- a/ansible/templates/sir_complainsalot-params-prod.json.j2 +++ b/ansible/templates/sir_complainsalot-params-prod.json.j2 @@ -37,7 +37,7 @@ "fb-survey": { "max_age": 3, "maintainers": ["U01069KCRS7"], - "retired-signals": ["smoothed_anxious_5d", "smoothed_wanxious_5d", "smoothed_depressed_5d", "smoothed_wdepressed_5d", "smoothed_felt_isolated_5d", "smoothed_wfelt_isolated_5d", "smoothed_large_event_1d", "smoothed_wlarge_event_1d", "smoothed_restaurant_1d", "smoothed_wrestaurant_1d", "smoothed_shop_1d", "smoothed_wshop_1d", "smoothed_spent_time_1d", "smoothed_wspent_time_1d", "smoothed_travel_outside_state_5d", "smoothed_wtravel_outside_state_5d", "smoothed_work_outside_home_1d", "smoothed_wwork_outside_home_1d", "smoothed_wearing_mask", "smoothed_wwearing_mask", "smoothed_vaccine_likely_local_health", "smoothed_wvaccine_likely_local_health", "smoothed_others_masked", "smoothed_wothers_masked", "smoothed_wanted_test_14d", "smoothed_wwanted_test_14d", "smoothed_covid_vaccinated_or_accept", "smoothed_wcovid_vaccinated_or_accept", "smoothed_accept_covid_vaccine", "smoothed_waccept_covid_vaccine", "smoothed_hesitancy_reason_allergic", "smoothed_whesitancy_reason_allergic", "smoothed_hesitancy_reason_not_recommended", "smoothed_whesitancy_reason_not_recommended", "smoothed_hesitancy_reason_distrust_vaccines", "smoothed_whesitancy_reason_distrust_vaccines", "smoothed_hesitancy_reason_health_condition", "smoothed_whesitancy_reason_health_condition", "smoothed_hesitancy_reason_pregnant", "smoothed_whesitancy_reason_pregnant", "smoothed_vaccine_likely_friends", "smoothed_wvaccine_likely_friends", "smoothed_vaccine_likely_who", "smoothed_wvaccine_likely_who", 
"smoothed_vaccine_likely_govt_health", "smoothed_wvaccine_likely_govt_health", "smoothed_vaccine_likely_politicians", "smoothed_wvaccine_likely_politicians", "smoothed_vaccine_likely_doctors", "smoothed_wvaccine_likely_doctors", "smoothed_felt_isolated_7d", "smoothed_wfelt_isolated_7d", "smoothed_worried_become_ill", "smoothed_wworried_become_ill", "smoothed_inperson_school_fulltime", "smoothed_winperson_school_fulltime", "smoothed_inperson_school_parttime", "smoothed_winperson_school_parttime", ["smoothed_vaccine_barrier_appointment_time_tried", "msa"], ["smoothed_vaccine_barrier_childcare_tried", "msa"], ["smoothed_vaccine_barrier_document_tried", "msa"], ["smoothed_vaccine_barrier_eligible_tried", "msa"], ["smoothed_vaccine_barrier_language_tried", "msa"], ["smoothed_vaccine_barrier_no_appointments_tried", "msa"], ["smoothed_vaccine_barrier_none_tried", "msa"], ["smoothed_vaccine_barrier_technical_difficulties_tried", "msa"], ["smoothed_vaccine_barrier_technology_access_tried", "msa"], ["smoothed_vaccine_barrier_time_tried", "msa"], ["smoothed_vaccine_barrier_travel_tried", "msa"], ["smoothed_vaccine_barrier_type_tried", "msa"], ["smoothed_wvaccine_barrier_appointment_time_tried", "msa"], ["smoothed_wvaccine_barrier_childcare_tried", "msa"], ["smoothed_wvaccine_barrier_document_tried", "msa"], ["smoothed_wvaccine_barrier_eligible_tried", "msa"], ["smoothed_wvaccine_barrier_language_tried", "msa"], ["smoothed_wvaccine_barrier_no_appointments_tried", "msa"], ["smoothed_wvaccine_barrier_none_tried", "msa"], ["smoothed_wvaccine_barrier_technical_difficulties_tried", "msa"], ["smoothed_wvaccine_barrier_technology_access_tried", "msa"], ["smoothed_wvaccine_barrier_time_tried", "msa"], ["smoothed_wvaccine_barrier_travel_tried", "msa"], ["smoothed_wvaccine_barrier_type_tried", "msa"]] + "retired-signals": ["smoothed_anxious_5d", "smoothed_wanxious_5d", "smoothed_depressed_5d", "smoothed_wdepressed_5d", "smoothed_felt_isolated_5d", "smoothed_wfelt_isolated_5d", "smoothed_large_event_1d", "smoothed_wlarge_event_1d", "smoothed_restaurant_1d", "smoothed_wrestaurant_1d", "smoothed_shop_1d", "smoothed_wshop_1d", "smoothed_spent_time_1d", "smoothed_wspent_time_1d", "smoothed_travel_outside_state_5d", "smoothed_wtravel_outside_state_5d", "smoothed_work_outside_home_1d", "smoothed_wwork_outside_home_1d", "smoothed_wearing_mask", "smoothed_wwearing_mask", "smoothed_vaccine_likely_local_health", "smoothed_wvaccine_likely_local_health", "smoothed_others_masked", "smoothed_wothers_masked", "smoothed_wanted_test_14d", "smoothed_wwanted_test_14d", "smoothed_covid_vaccinated_or_accept", "smoothed_wcovid_vaccinated_or_accept", "smoothed_accept_covid_vaccine", "smoothed_waccept_covid_vaccine", "smoothed_hesitancy_reason_allergic", "smoothed_whesitancy_reason_allergic", "smoothed_hesitancy_reason_not_recommended", "smoothed_whesitancy_reason_not_recommended", "smoothed_hesitancy_reason_distrust_vaccines", "smoothed_whesitancy_reason_distrust_vaccines", "smoothed_hesitancy_reason_health_condition", "smoothed_whesitancy_reason_health_condition", "smoothed_hesitancy_reason_pregnant", "smoothed_whesitancy_reason_pregnant", "smoothed_vaccine_likely_friends", "smoothed_wvaccine_likely_friends", "smoothed_vaccine_likely_who", "smoothed_wvaccine_likely_who", "smoothed_vaccine_likely_govt_health", "smoothed_wvaccine_likely_govt_health", "smoothed_vaccine_likely_politicians", "smoothed_wvaccine_likely_politicians", "smoothed_vaccine_likely_doctors", "smoothed_wvaccine_likely_doctors", "smoothed_felt_isolated_7d", 
"smoothed_wfelt_isolated_7d", "smoothed_worried_become_ill", "smoothed_wworried_become_ill", ["smoothed_vaccine_barrier_appointment_time_tried", "msa"], ["smoothed_vaccine_barrier_childcare_tried", "msa"], ["smoothed_vaccine_barrier_document_tried", "msa"], ["smoothed_vaccine_barrier_eligible_tried", "msa"], ["smoothed_vaccine_barrier_language_tried", "msa"], ["smoothed_vaccine_barrier_no_appointments_tried", "msa"], ["smoothed_vaccine_barrier_none_tried", "msa"], ["smoothed_vaccine_barrier_technical_difficulties_tried", "msa"], ["smoothed_vaccine_barrier_technology_access_tried", "msa"], ["smoothed_vaccine_barrier_time_tried", "msa"], ["smoothed_vaccine_barrier_travel_tried", "msa"], ["smoothed_vaccine_barrier_type_tried", "msa"], ["smoothed_wvaccine_barrier_appointment_time_tried", "msa"], ["smoothed_wvaccine_barrier_childcare_tried", "msa"], ["smoothed_wvaccine_barrier_document_tried", "msa"], ["smoothed_wvaccine_barrier_eligible_tried", "msa"], ["smoothed_wvaccine_barrier_language_tried", "msa"], ["smoothed_wvaccine_barrier_no_appointments_tried", "msa"], ["smoothed_wvaccine_barrier_none_tried", "msa"], ["smoothed_wvaccine_barrier_technical_difficulties_tried", "msa"], ["smoothed_wvaccine_barrier_technology_access_tried", "msa"], ["smoothed_wvaccine_barrier_time_tried", "msa"], ["smoothed_wvaccine_barrier_travel_tried", "msa"], ["smoothed_wvaccine_barrier_type_tried", "msa"]] }, "indicator-combination": { "max_age": 4, From 91bb0ad1fe6d18c406e1e39135fbbb507c213a45 Mon Sep 17 00:00:00 2001 From: Brian Clark Date: Wed, 15 Sep 2021 09:37:56 -0400 Subject: [PATCH 07/43] Build facebook container image --- .github/workflows/build-container-images.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-container-images.yml b/.github/workflows/build-container-images.yml index e32b58093..c01460c87 100644 --- a/.github/workflows/build-container-images.yml +++ b/.github/workflows/build-container-images.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - packages: [ "" ] + packages: [ facebook ] steps: - name: Checkout code uses: actions/checkout@v2 From 616ca89054e95f4372388ead40108a0f99e98d99 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 15 Sep 2021 11:48:48 -0400 Subject: [PATCH 08/43] break line --- .../sir_complainsalot-params-prod.json.j2 | 41 ++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/ansible/templates/sir_complainsalot-params-prod.json.j2 b/ansible/templates/sir_complainsalot-params-prod.json.j2 index de4b732e6..9cc9a8250 100644 --- a/ansible/templates/sir_complainsalot-params-prod.json.j2 +++ b/ansible/templates/sir_complainsalot-params-prod.json.j2 @@ -37,7 +37,46 @@ "fb-survey": { "max_age": 3, "maintainers": ["U01069KCRS7"], - "retired-signals": ["smoothed_anxious_5d", "smoothed_wanxious_5d", "smoothed_depressed_5d", "smoothed_wdepressed_5d", "smoothed_felt_isolated_5d", "smoothed_wfelt_isolated_5d", "smoothed_large_event_1d", "smoothed_wlarge_event_1d", "smoothed_restaurant_1d", "smoothed_wrestaurant_1d", "smoothed_shop_1d", "smoothed_wshop_1d", "smoothed_spent_time_1d", "smoothed_wspent_time_1d", "smoothed_travel_outside_state_5d", "smoothed_wtravel_outside_state_5d", "smoothed_work_outside_home_1d", "smoothed_wwork_outside_home_1d", "smoothed_wearing_mask", "smoothed_wwearing_mask", "smoothed_vaccine_likely_local_health", "smoothed_wvaccine_likely_local_health", "smoothed_others_masked", "smoothed_wothers_masked", 
"smoothed_wanted_test_14d", "smoothed_wwanted_test_14d", "smoothed_covid_vaccinated_or_accept", "smoothed_wcovid_vaccinated_or_accept", "smoothed_accept_covid_vaccine", "smoothed_waccept_covid_vaccine", "smoothed_hesitancy_reason_allergic", "smoothed_whesitancy_reason_allergic", "smoothed_hesitancy_reason_not_recommended", "smoothed_whesitancy_reason_not_recommended", "smoothed_hesitancy_reason_distrust_vaccines", "smoothed_whesitancy_reason_distrust_vaccines", "smoothed_hesitancy_reason_health_condition", "smoothed_whesitancy_reason_health_condition", "smoothed_hesitancy_reason_pregnant", "smoothed_whesitancy_reason_pregnant", "smoothed_vaccine_likely_friends", "smoothed_wvaccine_likely_friends", "smoothed_vaccine_likely_who", "smoothed_wvaccine_likely_who", "smoothed_vaccine_likely_govt_health", "smoothed_wvaccine_likely_govt_health", "smoothed_vaccine_likely_politicians", "smoothed_wvaccine_likely_politicians", "smoothed_vaccine_likely_doctors", "smoothed_wvaccine_likely_doctors", "smoothed_felt_isolated_7d", "smoothed_wfelt_isolated_7d", "smoothed_worried_become_ill", "smoothed_wworried_become_ill", ["smoothed_vaccine_barrier_appointment_time_tried", "msa"], ["smoothed_vaccine_barrier_childcare_tried", "msa"], ["smoothed_vaccine_barrier_document_tried", "msa"], ["smoothed_vaccine_barrier_eligible_tried", "msa"], ["smoothed_vaccine_barrier_language_tried", "msa"], ["smoothed_vaccine_barrier_no_appointments_tried", "msa"], ["smoothed_vaccine_barrier_none_tried", "msa"], ["smoothed_vaccine_barrier_technical_difficulties_tried", "msa"], ["smoothed_vaccine_barrier_technology_access_tried", "msa"], ["smoothed_vaccine_barrier_time_tried", "msa"], ["smoothed_vaccine_barrier_travel_tried", "msa"], ["smoothed_vaccine_barrier_type_tried", "msa"], ["smoothed_wvaccine_barrier_appointment_time_tried", "msa"], ["smoothed_wvaccine_barrier_childcare_tried", "msa"], ["smoothed_wvaccine_barrier_document_tried", "msa"], ["smoothed_wvaccine_barrier_eligible_tried", "msa"], ["smoothed_wvaccine_barrier_language_tried", "msa"], ["smoothed_wvaccine_barrier_no_appointments_tried", "msa"], ["smoothed_wvaccine_barrier_none_tried", "msa"], ["smoothed_wvaccine_barrier_technical_difficulties_tried", "msa"], ["smoothed_wvaccine_barrier_technology_access_tried", "msa"], ["smoothed_wvaccine_barrier_time_tried", "msa"], ["smoothed_wvaccine_barrier_travel_tried", "msa"], ["smoothed_wvaccine_barrier_type_tried", "msa"]] + "retired-signals": [ + "smoothed_anxious_5d", "smoothed_wanxious_5d", + "smoothed_depressed_5d", "smoothed_wdepressed_5d", + "smoothed_felt_isolated_5d", "smoothed_wfelt_isolated_5d", + "smoothed_large_event_1d", "smoothed_wlarge_event_1d", + "smoothed_restaurant_1d", "smoothed_wrestaurant_1d", + "smoothed_shop_1d", "smoothed_wshop_1d", + "smoothed_spent_time_1d", "smoothed_wspent_time_1d", + "smoothed_travel_outside_state_5d", "smoothed_wtravel_outside_state_5d", + "smoothed_work_outside_home_1d", "smoothed_wwork_outside_home_1d", + "smoothed_wearing_mask", "smoothed_wwearing_mask", + "smoothed_vaccine_likely_local_health", "smoothed_wvaccine_likely_local_health", + "smoothed_others_masked", "smoothed_wothers_masked", + "smoothed_wanted_test_14d", "smoothed_wwanted_test_14d", + "smoothed_covid_vaccinated_or_accept", "smoothed_wcovid_vaccinated_or_accept", + "smoothed_accept_covid_vaccine", "smoothed_waccept_covid_vaccine", + "smoothed_hesitancy_reason_allergic", "smoothed_whesitancy_reason_allergic", + "smoothed_hesitancy_reason_not_recommended", "smoothed_whesitancy_reason_not_recommended", + 
"smoothed_hesitancy_reason_distrust_vaccines", "smoothed_whesitancy_reason_distrust_vaccines", + "smoothed_hesitancy_reason_health_condition", "smoothed_whesitancy_reason_health_condition", + "smoothed_hesitancy_reason_pregnant", "smoothed_whesitancy_reason_pregnant", + "smoothed_vaccine_likely_friends", "smoothed_wvaccine_likely_friends", + "smoothed_vaccine_likely_who", "smoothed_wvaccine_likely_who", + "smoothed_vaccine_likely_govt_health", "smoothed_wvaccine_likely_govt_health", + "smoothed_vaccine_likely_politicians", "smoothed_wvaccine_likely_politicians", + "smoothed_vaccine_likely_doctors", "smoothed_wvaccine_likely_doctors", + "smoothed_felt_isolated_7d", "smoothed_wfelt_isolated_7d", + "smoothed_worried_become_ill", "smoothed_wworried_become_ill", + ["smoothed_vaccine_barrier_appointment_time_tried", "msa"], ["smoothed_wvaccine_barrier_appointment_time_tried", "msa"], + ["smoothed_vaccine_barrier_childcare_tried", "msa"], ["smoothed_wvaccine_barrier_childcare_tried", "msa"], + ["smoothed_vaccine_barrier_document_tried", "msa"], ["smoothed_wvaccine_barrier_document_tried", "msa"], + ["smoothed_vaccine_barrier_eligible_tried", "msa"], ["smoothed_wvaccine_barrier_eligible_tried", "msa"], + ["smoothed_vaccine_barrier_language_tried", "msa"], ["smoothed_wvaccine_barrier_language_tried", "msa"], + ["smoothed_vaccine_barrier_no_appointments_tried", "msa"], ["smoothed_wvaccine_barrier_no_appointments_tried", "msa"], + ["smoothed_vaccine_barrier_none_tried", "msa"], ["smoothed_wvaccine_barrier_none_tried", "msa"], + ["smoothed_wvaccine_barrier_technical_difficulties_tried", "msa"], ["smoothed_vaccine_barrier_technical_difficulties_tried", "msa"], + ["smoothed_wvaccine_barrier_technology_access_tried", "msa"], ["smoothed_wvaccine_barrier_technology_access_tried", "msa"], + ["smoothed_vaccine_barrier_time_tried", "msa"], ["smoothed_wvaccine_barrier_time_tried", "msa"], + ["smoothed_vaccine_barrier_travel_tried", "msa"], ["smoothed_wvaccine_barrier_travel_tried", "msa"], + ["smoothed_vaccine_barrier_type_tried", "msa"], ["smoothed_wvaccine_barrier_type_tried", "msa"] }, "indicator-combination": { "max_age": 4, From 1db3077babded5c47ab622c9ebbd8b3f9dd22fbc Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 15 Sep 2021 11:51:14 -0400 Subject: [PATCH 09/43] add closing bracket --- ansible/templates/sir_complainsalot-params-prod.json.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/templates/sir_complainsalot-params-prod.json.j2 b/ansible/templates/sir_complainsalot-params-prod.json.j2 index 9cc9a8250..7a1f5dddd 100644 --- a/ansible/templates/sir_complainsalot-params-prod.json.j2 +++ b/ansible/templates/sir_complainsalot-params-prod.json.j2 @@ -77,6 +77,7 @@ ["smoothed_vaccine_barrier_time_tried", "msa"], ["smoothed_wvaccine_barrier_time_tried", "msa"], ["smoothed_vaccine_barrier_travel_tried", "msa"], ["smoothed_wvaccine_barrier_travel_tried", "msa"], ["smoothed_vaccine_barrier_type_tried", "msa"], ["smoothed_wvaccine_barrier_type_tried", "msa"] + ] }, "indicator-combination": { "max_age": 4, From 7aac1de51715a18645a25889e9cccda4f85207c4 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 15 Sep 2021 13:00:45 -0700 Subject: [PATCH 10/43] Nancodes: make linter happy --- _delphi_utils_python/delphi_utils/archive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_delphi_utils_python/delphi_utils/archive.py b/_delphi_utils_python/delphi_utils/archive.py index 994dfd7df..ea8f27ac4 100644 --- 
a/_delphi_utils_python/delphi_utils/archive.py +++ b/_delphi_utils_python/delphi_utils/archive.py @@ -265,7 +265,7 @@ def diff_exports(self) -> Tuple[Files, FileDiffMap, Files]: for deleted_file in deleted_files: deleted_df = pd.read_csv(deleted_file, dtype=export_csv_dtypes) print( - f"Diff has deleted {deleted_file}; generating a CSV with corresponding deleted rows." + f"Diff deleted {deleted_file}; generating corresponding CSV with deleted rows." ) deleted_df[["val", "se", "sample_size"]] = np.nan deleted_df[["missing_val", "missing_se", "missing_sample_size"]] = Nans.DELETED From b4e66ce160e3069b114e8e0528ca312374a0be38 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 15 Sep 2021 14:42:04 -0700 Subject: [PATCH 11/43] Correctly ignore all receiving/*.csv files --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 94dfe3178..e664af2da 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,7 @@ params.json # Do not commit output files -receiving/*.csv +**/receiving/*.csv # Do not commit hidden macOS files .DS_Store From 84280040011ae2bf67212e004000bf5f9aacd0d1 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 15 Sep 2021 14:48:54 -0700 Subject: [PATCH 12/43] Update setup.py files to "Python :: 3.8" annotation --- _delphi_utils_python/setup.py | 2 +- changehc/setup.py | 2 +- claims_hosp/setup.py | 2 +- combo_cases_and_deaths/setup.py | 2 +- doctor_visits/setup.py | 2 +- google_symptoms/setup.py | 2 +- jhu/setup.py | 2 +- nchs_mortality/setup.py | 2 +- nowcast/setup.py | 2 +- quidel/setup.py | 2 +- quidel_covidtest/setup.py | 2 +- safegraph_patterns/setup.py | 2 +- sir_complainsalot/setup.py | 2 +- usafacts/setup.py | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) diff --git a/_delphi_utils_python/setup.py b/_delphi_utils_python/setup.py index 906bd0a4e..731d7e957 100644 --- a/_delphi_utils_python/setup.py +++ b/_delphi_utils_python/setup.py @@ -35,7 +35,7 @@ classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", - "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", ], packages=find_packages(), package_data={'': ['data/*.csv']} diff --git a/changehc/setup.py b/changehc/setup.py index 6edef8528..d702874b3 100644 --- a/changehc/setup.py +++ b/changehc/setup.py @@ -27,7 +27,7 @@ classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", - "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", ], packages=find_packages(), ) diff --git a/claims_hosp/setup.py b/claims_hosp/setup.py index c7527193b..6c88e4383 100644 --- a/claims_hosp/setup.py +++ b/claims_hosp/setup.py @@ -24,7 +24,7 @@ classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", - "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", ], packages=find_packages(), ) diff --git a/combo_cases_and_deaths/setup.py b/combo_cases_and_deaths/setup.py index 8ea4b187b..db97840a7 100644 --- a/combo_cases_and_deaths/setup.py +++ b/combo_cases_and_deaths/setup.py @@ -22,7 +22,7 @@ classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", - "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", ], packages=find_packages(), ) diff --git a/doctor_visits/setup.py b/doctor_visits/setup.py index 3c74af4b6..d7c0fe0a9 100644 --- a/doctor_visits/setup.py +++ b/doctor_visits/setup.py @@ -23,7 +23,7 @@ 
classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", - "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", ], packages=find_packages(), ) diff --git a/google_symptoms/setup.py b/google_symptoms/setup.py index 16a5aaecc..8cd7590ca 100644 --- a/google_symptoms/setup.py +++ b/google_symptoms/setup.py @@ -25,7 +25,7 @@ classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", - "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", ], packages=find_packages(), ) diff --git a/jhu/setup.py b/jhu/setup.py index 9f17f34f4..4c015a7c7 100644 --- a/jhu/setup.py +++ b/jhu/setup.py @@ -22,7 +22,7 @@ classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", - "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", ], packages=find_packages(), ) diff --git a/nchs_mortality/setup.py b/nchs_mortality/setup.py index 6ae6bfbf7..76915936b 100644 --- a/nchs_mortality/setup.py +++ b/nchs_mortality/setup.py @@ -25,7 +25,7 @@ classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", - "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", ], packages=find_packages(), ) diff --git a/nowcast/setup.py b/nowcast/setup.py index 6a2ebc88f..54e88ee80 100644 --- a/nowcast/setup.py +++ b/nowcast/setup.py @@ -25,7 +25,7 @@ classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", - "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", ], packages=find_packages(), ) diff --git a/quidel/setup.py b/quidel/setup.py index f912aa0d0..2fddc0cdd 100644 --- a/quidel/setup.py +++ b/quidel/setup.py @@ -26,7 +26,7 @@ classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", - "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", ], packages=find_packages(), ) diff --git a/quidel_covidtest/setup.py b/quidel_covidtest/setup.py index 9537a175b..4c01e8593 100644 --- a/quidel_covidtest/setup.py +++ b/quidel_covidtest/setup.py @@ -26,7 +26,7 @@ classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", - "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", ], packages=find_packages(), ) diff --git a/safegraph_patterns/setup.py b/safegraph_patterns/setup.py index 4053b622e..5ead94b33 100644 --- a/safegraph_patterns/setup.py +++ b/safegraph_patterns/setup.py @@ -22,7 +22,7 @@ classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", - "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", ], packages=find_packages(), ) diff --git a/sir_complainsalot/setup.py b/sir_complainsalot/setup.py index 3b18b5f19..c51253104 100644 --- a/sir_complainsalot/setup.py +++ b/sir_complainsalot/setup.py @@ -22,7 +22,7 @@ classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", - "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", ], packages=find_packages(), ) diff --git a/usafacts/setup.py b/usafacts/setup.py index b11951c7f..e15cae933 100644 --- a/usafacts/setup.py +++ b/usafacts/setup.py @@ -22,7 +22,7 @@ classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", - "Programming Language :: Python :: 3.7", + 
"Programming Language :: Python :: 3.8", ], packages=find_packages(), ) From d1c063bc66be4591b4c10b4ebd1df0409dba03ed Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 16 Sep 2021 10:27:08 -0400 Subject: [PATCH 13/43] check if readr is installed --- .github/workflows/r-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/r-ci.yml b/.github/workflows/r-ci.yml index 116537194..6a196a3b2 100644 --- a/.github/workflows/r-ci.yml +++ b/.github/workflows/r-ci.yml @@ -48,7 +48,7 @@ jobs: ${{ runner.os }}-r-facebook-survey- - name: Install R dependencies run: | - if ( packageVersion("readr") != "1.4.0" ) { + if ( !require("readr") || packageVersion("readr") != "1.4.0" ) { install.packages("devtools") devtools::install_version("readr", version = "1.4.0") } From efd9423e9e550e8bda38c0219c4670b4a43d11a5 Mon Sep 17 00:00:00 2001 From: alexcoda Date: Sun, 19 Sep 2021 11:35:05 -0700 Subject: [PATCH 14/43] Fix value check in quidel data_tools --- quidel/delphi_quidel/data_tools.py | 9 ++++----- quidel_covidtest/delphi_quidel_covidtest/data_tools.py | 9 ++++----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/quidel/delphi_quidel/data_tools.py b/quidel/delphi_quidel/data_tools.py index 5d67dd812..6f9cb45c9 100644 --- a/quidel/delphi_quidel/data_tools.py +++ b/quidel/delphi_quidel/data_tools.py @@ -290,11 +290,10 @@ def raw_tests_per_device(devices, tests, min_obs): """ devices = devices.astype(float) tests = tests.astype(float) - if (np.any(np.isnan(devices)) or np.any(np.isnan(tests))): - print(devices) - print(tests) - raise ValueError('devices and tests should be non-negative ' - 'with no np.nan') + if np.any(np.isnan(devices)) or np.any(d < 0 for d in devices): + raise ValueError("devices should be non-negative with no np.nan") + if np.any(np.isnan(tests)) or np.any(d < 0 for d in tests): + raise ValueError("tests should be non-negative with no np.nan") if min_obs <= 0: raise ValueError('min_obs should be positive') tests[tests < min_obs] = np.nan diff --git a/quidel_covidtest/delphi_quidel_covidtest/data_tools.py b/quidel_covidtest/delphi_quidel_covidtest/data_tools.py index 18898ec8e..54995dc90 100644 --- a/quidel_covidtest/delphi_quidel_covidtest/data_tools.py +++ b/quidel_covidtest/delphi_quidel_covidtest/data_tools.py @@ -296,11 +296,10 @@ def raw_tests_per_device(devices, tests, min_obs): """ devices = devices.astype(float) tests = tests.astype(float) - if (np.any(np.isnan(devices)) or np.any(np.isnan(tests))): - print(devices) - print(tests) - raise ValueError('devices and tests should be non-negative ' - 'with no np.nan') + if np.any(np.isnan(devices)) or np.any(d < 0 for d in devices): + raise ValueError("devices should be non-negative with no np.nan") + if np.any(np.isnan(tests)) or np.any(d < 0 for d in tests): + raise ValueError("tests should be non-negative with no np.nan") if min_obs <= 0: raise ValueError('min_obs should be positive') tests[tests < min_obs] = np.nan From 77f26f7fefb05e4ed4fb536e2016a15639c2f97c Mon Sep 17 00:00:00 2001 From: alexcoda Date: Sun, 19 Sep 2021 11:38:24 -0700 Subject: [PATCH 15/43] Replace print statements with logging And add log statements for data export variants --- .../delphi_combo_cases_and_deaths/run.py | 19 +++++------ covid_act_now/delphi_covid_act_now/run.py | 7 ++-- hhs_facilities/delphi_hhs_facilities/run.py | 3 ++ hhs_hosp/delphi_hhs/run.py | 4 +++ .../delphi_nchs_mortality/archive_diffs.py | 8 +++-- nchs_mortality/delphi_nchs_mortality/run.py 
| 7 ++-- quidel/delphi_quidel/data_tools.py | 3 -- quidel/delphi_quidel/pull.py | 34 +++++++++---------- quidel/delphi_quidel/run.py | 9 ++--- .../delphi_quidel_covidtest/data_tools.py | 3 -- .../delphi_quidel_covidtest/pull.py | 31 +++++++++-------- .../delphi_quidel_covidtest/run.py | 12 ++++--- .../delphi_safegraph_patterns/process.py | 16 ++++++--- .../delphi_safegraph_patterns/run.py | 3 +- usafacts/delphi_usafacts/run.py | 2 +- 15 files changed, 90 insertions(+), 71 deletions(-) diff --git a/combo_cases_and_deaths/delphi_combo_cases_and_deaths/run.py b/combo_cases_and_deaths/delphi_combo_cases_and_deaths/run.py index c54f3f3be..bddd1833f 100755 --- a/combo_cases_and_deaths/delphi_combo_cases_and_deaths/run.py +++ b/combo_cases_and_deaths/delphi_combo_cases_and_deaths/run.py @@ -33,14 +33,6 @@ covidcast.covidcast._ASYNC_CALL = True # pylint: disable=protected-access -def check_none_data_frame(data_frame, label, date_range): - """Log and return True when a data frame is None.""" - if data_frame is None: - print(f"{label} completely unavailable in range {date_range}") - return True - return False - - def maybe_append(usa_facts, jhu): """ Append dataframes if available, otherwise return USAFacts. @@ -133,7 +125,7 @@ def get_updated_dates(signal, geo, date_range, issue_range=None, fetcher=covidca issues=issue_range ) - if check_none_data_frame(usafacts_df, "USA-FACTS", date_range): + if usafacts_df is None: return None merged_df = merge_dfs_by_geos(usafacts_df, jhu_df, geo) @@ -142,7 +134,7 @@ def get_updated_dates(signal, geo, date_range, issue_range=None, fetcher=covidca return unique_dates -def combine_usafacts_and_jhu(signal, geo, date_range, issue_range=None, fetcher=covidcast.signal): +def combine_usafacts_and_jhu(signal, geo, date_range, logger, issue_range=None, fetcher=covidcast.signal): """Add rows for PR from JHU signals to USA-FACTS signals. 
For hhs and nation, fetch the county `num` data so we can compute the proportions correctly @@ -158,6 +150,7 @@ def combine_usafacts_and_jhu(signal, geo, date_range, issue_range=None, fetcher= # This occurs if the usafacts ~and the jhu query were empty if unique_dates is None: + logger.info("USA-FACTS completely unavailable for dates", date_range=date_range) return EMPTY_FRAME # Query only the represented window so that every geo is represented; a single window call is @@ -329,9 +322,15 @@ def run_module(params): log_exceptions=params["common"].get("log_exceptions", True)) for metric, geo_res, sensor_name, signal in variants: + logger.info("Generating signal and exporting to CSV", + geo_res = geo_res, + metric = metric, + sensor = sensor_name, + signal = signal) df = combine_usafacts_and_jhu(signal, geo_res, extend_raw_date_range(params, sensor_name), + logger, params['indicator']['issue_range']) df["timestamp"] = pd.to_datetime(df["timestamp"]) start_date = pd.to_datetime(params['indicator']['export_start_date']) diff --git a/covid_act_now/delphi_covid_act_now/run.py b/covid_act_now/delphi_covid_act_now/run.py index d9d983f0d..7cc96f6e4 100644 --- a/covid_act_now/delphi_covid_act_now/run.py +++ b/covid_act_now/delphi_covid_act_now/run.py @@ -45,7 +45,7 @@ def run_module(params): parquet_url = params["indicator"]["parquet_url"] # Load CAN county-level testing data - print("Pulling CAN data") + logger.info("Pulling CAN data") df_pq = load_data(parquet_url) df_county_testing = extract_testing_metrics(df_pq) @@ -54,7 +54,8 @@ def run_module(params): max_dates_exported = [] # Perform geo aggregations and export to receiving for geo_res in GEO_RESOLUTIONS: - print(f"Processing {geo_res}") + logger.info("Generating signal and exporting to CSV", + geo_res = geo_res) df = geo_map(df_county_testing, geo_res) # Export 'pcr_specimen_positivity_rate' @@ -79,7 +80,7 @@ def run_module(params): max_dates_exported.append(latest) # x2 to count both positivity and tests signals num_exported_files += exported_csv_dates.size * 2 - print(f"Exported dates: {earliest} to {latest}") + logger.info("Exported for dates between", earliest=earliest, latest=latest) elapsed_time_in_seconds = round(time.time() - start_time, 2) max_lag_in_days = (datetime.now() - min(max_dates_exported)).days diff --git a/hhs_facilities/delphi_hhs_facilities/run.py b/hhs_facilities/delphi_hhs_facilities/run.py index b41df5bcc..43d3a9bdd 100644 --- a/hhs_facilities/delphi_hhs_facilities/run.py +++ b/hhs_facilities/delphi_hhs_facilities/run.py @@ -36,6 +36,9 @@ def run_module(params) -> None: filled_fips_df = fill_missing_fips(raw_df, gmpr) stats = [] for geo, (sig_name, sig_cols, sig_func, sig_offset) in product(GEO_RESOLUTIONS, SIGNALS): + logger.info("Generating signal and exporting to CSV", + geo_res = geo, + signal_name = sig_name) mapped_df = convert_geo(filled_fips_df, geo, gmpr) output_df = generate_signal(mapped_df, sig_cols, sig_func, sig_offset) dates = create_export_csv(output_df, params["common"]["export_dir"], geo, sig_name) diff --git a/hhs_hosp/delphi_hhs/run.py b/hhs_hosp/delphi_hhs/run.py index 6af654845..45c2f5bc1 100644 --- a/hhs_hosp/delphi_hhs/run.py +++ b/hhs_hosp/delphi_hhs/run.py @@ -105,6 +105,10 @@ def run_module(params): geo_mapper = GeoMapper() stats = [] for sensor, smoother, geo in product(SIGNALS, SMOOTHERS, GEOS): + logger.info("Generating signal and exporting to CSV", + geo_res = geo, + sensor = sensor, + smoother = smoother) df = geo_mapper.add_geocode(make_signal(all_columns, sensor), "state_id", 
"state_code", diff --git a/nchs_mortality/delphi_nchs_mortality/archive_diffs.py b/nchs_mortality/delphi_nchs_mortality/archive_diffs.py index 6524203b3..e8b790cee 100644 --- a/nchs_mortality/delphi_nchs_mortality/archive_diffs.py +++ b/nchs_mortality/delphi_nchs_mortality/archive_diffs.py @@ -8,7 +8,7 @@ from delphi_utils import S3ArchiveDiffer -def arch_diffs(params, daily_arch_diff): +def arch_diffs(params, daily_arch_diff, logger): """ Archive differences between new updates and existing data. @@ -23,6 +23,8 @@ def arch_diffs(params, daily_arch_diff): Read from params.json daily_arch_diff: S3ArchiveDiffer Used to store and update cache + logger: logging.Logger + The structured logger. """ weekly_export_dir = params["common"]["weekly_export_dir"] daily_export_dir = params["common"]["daily_export_dir"] @@ -59,7 +61,7 @@ def arch_diffs(params, daily_arch_diff): # Report failures: someone should probably look at them for exported_file in fails: - print(f"Failed to archive (weekly) '{exported_file}'") + logger.info("Failed to archive (weekly)", filename={exported_file}) # Daily run of archiving utility # - Uploads changed files to S3 @@ -83,4 +85,4 @@ def arch_diffs(params, daily_arch_diff): # Report failures: someone should probably look at them for exported_file in fails: - print(f"Failed to archive (daily) '{exported_file}'") + logger.info("Failed to archive (daily)", filename={exported_file}) diff --git a/nchs_mortality/delphi_nchs_mortality/run.py b/nchs_mortality/delphi_nchs_mortality/run.py index fa0226fcb..1673e79c1 100644 --- a/nchs_mortality/delphi_nchs_mortality/run.py +++ b/nchs_mortality/delphi_nchs_mortality/run.py @@ -62,7 +62,8 @@ def run_module(params: Dict[str, Any]): df_pull = pull_nchs_mortality_data(token, test_file) for metric in METRICS: if metric == 'percent_of_expected_deaths': - print(metric) + logger.info("Generating signal and exporting to CSV", + metric = metric) df = df_pull.copy() df["val"] = df[metric] df["se"] = np.nan @@ -80,7 +81,9 @@ def run_module(params: Dict[str, Any]): stats.append((max(dates), len(dates))) else: for sensor in SENSORS: - print(metric, sensor) + logger.info("Generating signal and exporting to CSV", + metric = metric, + sensor = sensor) df = df_pull.copy() if sensor == "num": df["val"] = df[metric] diff --git a/quidel/delphi_quidel/data_tools.py b/quidel/delphi_quidel/data_tools.py index 6f9cb45c9..c0ebeb750 100644 --- a/quidel/delphi_quidel/data_tools.py +++ b/quidel/delphi_quidel/data_tools.py @@ -86,8 +86,6 @@ def _geographical_pooling(tpooled_tests, tpooled_ptests, min_obs, max_borrow_obs Same length as tests; proportion of parent observations to borrow. 
""" if (np.any(np.isnan(tpooled_tests)) or np.any(np.isnan(tpooled_ptests))): - print(tpooled_tests) - print(tpooled_ptests) raise ValueError('[parent] tests should be non-negative ' 'with no np.nan') if max_borrow_obs > min_obs: @@ -153,7 +151,6 @@ def raw_positive_prop(positives, tests, min_obs): positives = positives.astype(float) tests = tests.astype(float) if np.any(np.isnan(positives)) or np.any(np.isnan(tests)): - print(positives, tests) raise ValueError('positives and tests should be non-negative ' 'with no np.nan') if np.any(positives > tests): diff --git a/quidel/delphi_quidel/pull.py b/quidel/delphi_quidel/pull.py index 6132ffe40..3a0c6f25f 100644 --- a/quidel/delphi_quidel/pull.py +++ b/quidel/delphi_quidel/pull.py @@ -82,7 +82,7 @@ def regulate_column_names(df, test_type): return df def get_from_email(column_names, start_dates, end_dates, mail_server, - account, sender, password): + account, sender, password, logger): """ Get raw data from email account. @@ -98,6 +98,8 @@ def get_from_email(column_names, start_dates, end_dates, mail_server, email account of the sender password: str password of the datadrop email + logger: logging.Logger + The structured logger. Returns: df: pd.DataFrame @@ -131,7 +133,7 @@ def get_from_email(column_names, start_dates, end_dates, mail_server, if not whether_in_range: continue - print(f"Pulling {test} data received on %s"%search_date.date()) + logger.info(f"Pulling data", test=test, date=search_date.date()) toread = io.BytesIO() toread.write(att.payload) toread.seek(0) # reset the pointer @@ -153,10 +155,9 @@ def fix_zipcode(df): zipcode = int(float(zipcode)) zipcode5.append(zipcode) df['zip'] = zipcode5 - # print('Fixing %.2f %% of the data' % (fixnum * 100 / len(zipcode5))) return df -def fix_date(df): +def fix_date(df, logger): """ Remove invalid dates and select correct test date to use. @@ -175,16 +176,16 @@ def fix_date(df): df.insert(2, "timestamp", df["TestDate"]) mask = df["TestDate"] <= df["StorageDate"] - print("Removing %.2f%% of unusual data" % ((len(df) - np.sum(mask)) * 100 / len(df))) + logger.info(f"Removing {((len(df) - np.sum(mask)) * 100 / len(df)):.2f} of unusual data") df = df[mask] mask = df["StorageDate"] - df["TestDate"] > pd.Timedelta(days=90) - print("Fixing %.2f%% of outdated data" % (np.sum(mask) * 100 / len(df))) + logger.info(f"Fixing {(np.sum(mask) * 100 / len(df)):.2f} of outdated data") df["timestamp"].values[mask] = df["StorageDate"].values[mask] return df def preprocess_new_data(start_dates, end_dates, mail_server, account, - sender, password, test_mode): + sender, password, test_mode, logger): """ Pull and pre-process Quidel Antigen Test data from datadrop email. @@ -206,6 +207,8 @@ def preprocess_new_data(start_dates, end_dates, mail_server, account, password of the datadrop email test_mode: bool pull raw data from email or not + logger: logging.Logger + The structured logger. 
Returns: df: pd.DataFrame time_flag: datetime.date: @@ -220,7 +223,7 @@ def preprocess_new_data(start_dates, end_dates, mail_server, account, else: # Get new data from email dfs, time_flag = get_from_email(COLUMN_NAMES, start_dates, end_dates, - mail_server, account, sender, password) + mail_server, account, sender, password, logger) # No new data can be pulled if time_flag is None: @@ -228,13 +231,12 @@ def preprocess_new_data(start_dates, end_dates, mail_server, account, df_finals = {} for test_type in TEST_TYPES: - print(f"For {test_type}:") + logger.info(f"For {test_type}:") df = dfs[test_type] # Fix some of the fipcodes that are 9 digit instead of 5 digit df = fix_zipcode(df) # Create a column CanonicalDate according to StarageDate and TestDate - df = fix_date(df) - + df = fix_date(df, logger) # Compute numUniqueDevices numUniqueDevices = df.groupby( by=["timestamp", "zip"], @@ -309,17 +311,15 @@ def check_intermediate_file(cache_dir, pull_start_dates): sep=",", parse_dates=["timestamp"]) return previous_dfs, pull_start_dates -def pull_quidel_data(params): +def pull_quidel_data(params, logger): """ Pull new quidel test data and decide whether to combine it with historical records in ./cache. Parameters: params: dict including all the information read from params.json - END_FROM_TODAY_MINUS: int - report data until - X days - EXPORT_DAY_RANGE: int - number of dates to report + logger: logging.Logger + The structured logger. Returns: DataFrame: @@ -355,7 +355,7 @@ def pull_quidel_data(params): # Use _end_date to check the most recent date that we received data dfs, _end_date = preprocess_new_data( pull_start_dates, pull_end_dates, mail_server, - account, sender, password, test_mode) + account, sender, password, test_mode, logger) # Utilize previously stored data for test_type in TEST_TYPES: diff --git a/quidel/delphi_quidel/run.py b/quidel/delphi_quidel/run.py index 49f6ec66b..cd83d746a 100644 --- a/quidel/delphi_quidel/run.py +++ b/quidel/delphi_quidel/run.py @@ -63,9 +63,9 @@ def run_module(params: Dict[str, Any]): ) # Pull data and update export date - dfs, _end_date = pull_quidel_data(params["indicator"]) + dfs, _end_date = pull_quidel_data(params["indicator"], logger) if _end_date is None: - print("The data is up-to-date. Currently, no new data to be ingested.") + logger.info("The data is up-to-date. 
Currently, no new data to be ingested.") return export_end_dates = check_export_end_date(export_end_dates, _end_date, END_FROM_TODAY_MINUS) @@ -81,7 +81,6 @@ def run_module(params: Dict[str, Any]): for sensor in sensors: # Check either covid_ag or flu_ag test_type = "covid_ag" if "covid_ag" in sensor else "flu_ag" - print("state", sensor) data = dfs[test_type].copy() state_groups = geo_map("state", data, map_df).groupby("state_id") first_date, last_date = data["timestamp"].min(), data["timestamp"].max() @@ -97,7 +96,9 @@ def run_module(params: Dict[str, Any]): # County/HRR/MSA level for geo_res in GEO_RESOLUTIONS: - print(geo_res, sensor) + logger.info("Generating signal and exporting to CSV", + geo_res = geo_res, + sensor = sensor) data = dfs[test_type].copy() data, res_key = geo_map(geo_res, data, map_df) res_df = generate_sensor_for_other_geores( diff --git a/quidel_covidtest/delphi_quidel_covidtest/data_tools.py b/quidel_covidtest/delphi_quidel_covidtest/data_tools.py index 54995dc90..fac0b58b2 100644 --- a/quidel_covidtest/delphi_quidel_covidtest/data_tools.py +++ b/quidel_covidtest/delphi_quidel_covidtest/data_tools.py @@ -92,8 +92,6 @@ def _geographical_pooling(tpooled_tests, tpooled_ptests, min_obs): """ if (np.any(np.isnan(tpooled_tests)) or np.any(np.isnan(tpooled_ptests))): - print(tpooled_tests) - print(tpooled_ptests) raise ValueError('[parent] tests should be non-negative ' 'with no np.nan') # STEP 1: "TOP UP" USING PARENT LOCATION @@ -156,7 +154,6 @@ def raw_positive_prop(positives, tests, min_obs): positives = positives.astype(float) tests = tests.astype(float) if np.any(np.isnan(positives)) or np.any(np.isnan(tests)): - print(positives, tests) raise ValueError('positives and tests should be non-negative ' 'with no np.nan') if np.any(positives > tests): diff --git a/quidel_covidtest/delphi_quidel_covidtest/pull.py b/quidel_covidtest/delphi_quidel_covidtest/pull.py index fe042ed38..9ce036e10 100644 --- a/quidel_covidtest/delphi_quidel_covidtest/pull.py +++ b/quidel_covidtest/delphi_quidel_covidtest/pull.py @@ -8,7 +8,7 @@ import pandas as pd import numpy as np -def get_from_s3(start_date, end_date, bucket): +def get_from_s3(start_date, end_date, bucket, logger): """ Get raw data from aws s3 bucket. @@ -19,6 +19,8 @@ def get_from_s3(start_date, end_date, bucket): pull data from file tagged with date on/before the end date bucket: s3.Bucket the aws s3 bucket that stores quidel data + logger: logging.Logger + The structured logger. output: df: pd.DataFrame time_flag: datetime.datetime @@ -49,7 +51,7 @@ def get_from_s3(start_date, end_date, bucket): for search_date in [start_date + timedelta(days=x) for x in range(n_days)]: if search_date in s3_files.keys(): # Avoid appending duplicate datasets - print("Pulling data received on %s"%search_date.date()) + logger.info(f"Pulling data received on {search_date.date()}") # Fetch data received on the same day for fn in s3_files[search_date]: @@ -76,10 +78,9 @@ def fix_zipcode(df): zipcode = int(float(zipcode)) zipcode5.append(zipcode) df['zip'] = zipcode5 - # print('Fixing %.2f %% of the data' % (fixnum * 100 / len(zipcode5))) return df -def fix_date(df): +def fix_date(df, logger): """ Remove invalid dates and select correct test date to use. 
@@ -98,15 +99,15 @@ def fix_date(df): df.insert(2, "timestamp", df["TestDate"]) mask = df["TestDate"] <= df["StorageDate"] - print("Removing %.2f%% of unusual data" % ((len(df) - np.sum(mask)) * 100 / len(df))) + logger.info(f"Removing {((len(df) - np.sum(mask)) * 100 / len(df)):.2f} of unusual data") df = df[mask] mask = df["StorageDate"] - df["TestDate"] > pd.Timedelta(days=90) - print("Fixing %.2f%% of outdated data" % (np.sum(mask) * 100 / len(df))) + logger.info(f"Fixing {(np.sum(mask) * 100 / len(df)):.2f} of outdated data") df["timestamp"].values[mask] = df["StorageDate"].values[mask] return df -def preprocess_new_data(start_date, end_date, params, test_mode): +def preprocess_new_data(start_date, end_date, params, test_mode, logger): """ Pull and pre-process Quidel Covid Test data. @@ -123,6 +124,8 @@ def preprocess_new_data(start_date, end_date, params, test_mode): read from params.json test_mode: bool pull raw data from s3 or not + logger: logging.Logger + The structured logger. output: df: pd.DataFrame time_flag: datetime.date: @@ -144,7 +147,7 @@ def preprocess_new_data(start_date, end_date, params, test_mode): aws_secret_access_key=aws_secret_access_key) bucket = s3.Bucket(bucket_name) # Get new data from s3 - df, time_flag = get_from_s3(start_date, end_date, bucket) + df, time_flag = get_from_s3(start_date, end_date, bucket, logger) # No new data can be pulled if time_flag is None: @@ -154,7 +157,7 @@ def preprocess_new_data(start_date, end_date, params, test_mode): df = fix_zipcode(df) # Create a column CanonicalDate according to StarageDate and TestDate - df = fix_date(df) + df = fix_date(df, logger) # Compute overallPositive overall_pos = df[df["OverallResult"] == "positive"].groupby( @@ -197,7 +200,7 @@ def check_intermediate_file(cache_dir, pull_start_date): return previous_df, pull_start_date return None, pull_start_date -def pull_quidel_covidtest(params): +def pull_quidel_covidtest(params, logger): """Pull the quidel covid test data. Conditionally merge new data with historical data from ./cache. @@ -205,10 +208,8 @@ def pull_quidel_covidtest(params): Parameters: params: dict including all the information read from params.json - end_from_today_minus: int - report data until - X days - export_day_range: int - number of dates to report + logger: logging.Logger + The structured logger. Returns: DataFrame: @@ -237,7 +238,7 @@ def pull_quidel_covidtest(params): # Pull data from the file at 5 digit zipcode level # Use _end_date to check the most recent date that we received data df, _end_date = preprocess_new_data( - pull_start_date, pull_end_date, params, test_mode) + pull_start_date, pull_end_date, params, test_mode, logger) # Utilize previously stored data if previous_df is not None: diff --git a/quidel_covidtest/delphi_quidel_covidtest/run.py b/quidel_covidtest/delphi_quidel_covidtest/run.py index d82f80135..5f084440c 100644 --- a/quidel_covidtest/delphi_quidel_covidtest/run.py +++ b/quidel_covidtest/delphi_quidel_covidtest/run.py @@ -76,9 +76,9 @@ def run_module(params: Dict[str, Any]): export_day_range = params["indicator"]["export_day_range"] # Pull data and update export date - df, _end_date = pull_quidel_covidtest(params["indicator"]) + df, _end_date = pull_quidel_covidtest(params["indicator"], logger) if _end_date is None: - print("The data is up-to-date. Currently, no new data to be ingested.") + logger.info("The data is up-to-date. 
Currently, no new data to be ingested.") return export_end_date = check_export_end_date(export_end_date, _end_date, END_FROM_TODAY_MINUS) @@ -98,7 +98,9 @@ def run_module(params: Dict[str, Any]): geo_data, res_key = geo_map(geo_res, data) geo_groups = geo_data.groupby(res_key) for sensor in sensors: - print(geo_res, sensor) + logger.info("Generating signal and exporting to CSV", + geo_res=geo_res, + sensor=sensor) if sensor.endswith(SMOOTHED_POSITIVE): smoothers[sensor] = smoothers.pop(SMOOTHED_POSITIVE) elif sensor.endswith(RAW_POSITIVE): @@ -125,7 +127,9 @@ def run_module(params: Dict[str, Any]): for geo_res in PARENT_GEO_RESOLUTIONS: geo_data, res_key = geo_map(geo_res, data) for sensor in sensors: - print(geo_res, sensor) + logger.info("Generating signal and exporting to CSV", + geo_res=geo_res, + sensor=sensor) res_df = generate_sensor_for_parent_geo( geo_groups, geo_data, res_key, smooth=smoothers[sensor][1], device=smoothers[sensor][0], first_date=first_date, diff --git a/safegraph_patterns/delphi_safegraph_patterns/process.py b/safegraph_patterns/delphi_safegraph_patterns/process.py index 1445ce028..330cf6762 100644 --- a/safegraph_patterns/delphi_safegraph_patterns/process.py +++ b/safegraph_patterns/delphi_safegraph_patterns/process.py @@ -125,7 +125,7 @@ def aggregate(df, metric, geo_res): return df.rename({geo_key: "geo_id"}, axis=1) def process(fname, sensors, metrics, geo_resolutions, - export_dir, brand_df, stats): + export_dir, brand_df, stats, logger): """ Process an input census block group-level CSV and export it. @@ -135,16 +135,20 @@ def process(fname, sensors, metrics, geo_resolutions, ---------- fname: str Input filename. - metrics: List[Tuple[str, bool]] - List of (metric_name, wip). sensors: List[str] List of (sensor) + metrics: List[Tuple[str, bool]] + List of (metric_name, wip). geo_resolutions: List[str] List of geo resolutions to export the data. + export_dir: str + The directory to export files to. brand_df: pd.DataFrame mapping info from naics_code to safegraph_brand_id stats: List[Tuple[datetime, int]] List to which we will add (max export date, number of export dates) + logger: logging.Logger + The structured logger. 
Returns ------- @@ -164,7 +168,7 @@ def process(fname, sensors, metrics, geo_resolutions, usecols=used_cols, parse_dates=["date_range_start", "date_range_end"]) dfs = construct_signals(df, metric_names, naics_codes, brand_df) - print("Finished pulling data from " + fname) + logger.info("Finished pulling data.", filename=fname) else: files = glob.glob(f'{fname}/**/*.csv.gz', recursive=True) dfs_dict = {"bars_visit": [], "restaurants_visit": []} @@ -180,9 +184,11 @@ def process(fname, sensors, metrics, geo_resolutions, ).groupby(["timestamp", "zip"]).sum().reset_index() dfs["restaurants_visit"] = pd.concat(dfs_dict["restaurants_visit"] ).groupby(["timestamp", "zip"]).sum().reset_index() - print("Finished pulling data from " + fname) + logger.info("Finished pulling data.", filename=fname) for geo_res, sensor in product(geo_resolutions, sensors): for metric, wip in zip(metric_names, wips): + logger.info("Generating signal and exporting to CSV", + geo_res=geo_res, metric=metric, sensor=sensor) df_export = aggregate(dfs[metric], metric, geo_res) df_export["val"] = df_export["_".join([metric, sensor])] df_export["se"] = np.nan diff --git a/safegraph_patterns/delphi_safegraph_patterns/run.py b/safegraph_patterns/delphi_safegraph_patterns/run.py index ffb0e4eb7..6eb474b9b 100644 --- a/safegraph_patterns/delphi_safegraph_patterns/run.py +++ b/safegraph_patterns/delphi_safegraph_patterns/run.py @@ -101,7 +101,8 @@ def run_module(params): sensors=SENSORS, geo_resolutions=GEO_RESOLUTIONS, export_dir=export_dir, - stats=stats + stats=stats, + logger=logger, ) with mp.Pool(n_core) as pool: diff --git a/usafacts/delphi_usafacts/run.py b/usafacts/delphi_usafacts/run.py index 90c11e28a..4c659679a 100644 --- a/usafacts/delphi_usafacts/run.py +++ b/usafacts/delphi_usafacts/run.py @@ -98,7 +98,7 @@ def run_module(params: Dict[str, Dict[str, Any]]): METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTHERS): if "cumulative" in sensor and "seven_day_average" in smoother: continue - logger.info("generating signal and exporting to CSV", + logger.info("Generating signal and exporting to CSV", geo_res = geo_res, metric = metric, sensor = sensor, From f2f55d3b216257f150fea5e9113a38db433d51f4 Mon Sep 17 00:00:00 2001 From: alexcoda Date: Sun, 19 Sep 2021 12:00:16 -0700 Subject: [PATCH 16/43] lint --- nchs_mortality/delphi_nchs_mortality/run.py | 2 +- quidel/delphi_quidel/pull.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nchs_mortality/delphi_nchs_mortality/run.py b/nchs_mortality/delphi_nchs_mortality/run.py index 1673e79c1..1cf3d36d5 100644 --- a/nchs_mortality/delphi_nchs_mortality/run.py +++ b/nchs_mortality/delphi_nchs_mortality/run.py @@ -110,7 +110,7 @@ def run_module(params: Dict[str, Any]): # - Uploads changed files to S3 # - Does not export any issues into receiving if "archive" in params: - arch_diffs(params, daily_arch_diff) + arch_diffs(params, daily_arch_diff, logger) elapsed_time_in_seconds = round(time.time() - start_time, 2) min_max_date = stats and min(s[0] for s in stats) diff --git a/quidel/delphi_quidel/pull.py b/quidel/delphi_quidel/pull.py index 3a0c6f25f..1643f9304 100644 --- a/quidel/delphi_quidel/pull.py +++ b/quidel/delphi_quidel/pull.py @@ -133,7 +133,7 @@ def get_from_email(column_names, start_dates, end_dates, mail_server, if not whether_in_range: continue - logger.info(f"Pulling data", test=test, date=search_date.date()) + logger.info("Pulling data", test=test, date=search_date.date()) toread = io.BytesIO() toread.write(att.payload) toread.seek(0) # reset the pointer From 
23c134ad77b3f0dcd44d5078a0d5521410f3fad9 Mon Sep 17 00:00:00 2001 From: alexcoda Date: Sun, 19 Sep 2021 12:04:01 -0700 Subject: [PATCH 17/43] lint --- combo_cases_and_deaths/delphi_combo_cases_and_deaths/run.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/combo_cases_and_deaths/delphi_combo_cases_and_deaths/run.py b/combo_cases_and_deaths/delphi_combo_cases_and_deaths/run.py index bddd1833f..7fbaa2898 100755 --- a/combo_cases_and_deaths/delphi_combo_cases_and_deaths/run.py +++ b/combo_cases_and_deaths/delphi_combo_cases_and_deaths/run.py @@ -134,7 +134,8 @@ def get_updated_dates(signal, geo, date_range, issue_range=None, fetcher=covidca return unique_dates -def combine_usafacts_and_jhu(signal, geo, date_range, logger, issue_range=None, fetcher=covidcast.signal): +def combine_usafacts_and_jhu(signal, geo, date_range, logger, + issue_range=None, fetcher=covidcast.signal): """Add rows for PR from JHU signals to USA-FACTS signals. For hhs and nation, fetch the county `num` data so we can compute the proportions correctly From f5825512f8b98fb9b50eee2d7cffde1d94fbaf83 Mon Sep 17 00:00:00 2001 From: alexcoda Date: Sun, 19 Sep 2021 12:10:44 -0700 Subject: [PATCH 18/43] Fix missing logger in tests --- quidel/tests/test_pull.py | 5 ++++- quidel_covidtest/tests/test_pull.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/quidel/tests/test_pull.py b/quidel/tests/test_pull.py index 435624f7e..bf27b0bd6 100644 --- a/quidel/tests/test_pull.py +++ b/quidel/tests/test_pull.py @@ -1,3 +1,4 @@ +import logging from datetime import datetime import pandas as pd @@ -36,6 +37,8 @@ def test_fix_date(self): class TestingPullData: def test_pull_quidel_data(self): + logger = logging.Logger("test_logger") + dfs, _ = pull_quidel_data({ "static_file_dir": "../static", "input_cache_dir": "./cache", @@ -49,7 +52,7 @@ def test_pull_quidel_data(self): "sender": "", "wip_signal": [""], "test_mode": True - }) + }, logger) # For covid_ag df = dfs["covid_ag"] diff --git a/quidel_covidtest/tests/test_pull.py b/quidel_covidtest/tests/test_pull.py index 48bb48d14..acdae32fe 100644 --- a/quidel_covidtest/tests/test_pull.py +++ b/quidel_covidtest/tests/test_pull.py @@ -1,3 +1,4 @@ +import logging from datetime import datetime import pandas as pd @@ -36,6 +37,8 @@ def test_fix_date(self): class TestingPullData: def test_pull_quidel_covidtest(self): + logger = logging.Logger("test_logger") + df, _ = pull_quidel_covidtest({ "static_file_dir": "../static", "input_cache_dir": "./cache", @@ -50,7 +53,7 @@ def test_pull_quidel_covidtest(self): "bucket_name": "", "wip_signal": "", "test_mode": True - }) + }, logger) first_date = df["timestamp"].min().date() last_date = df["timestamp"].max().date() From bb3121d36ba9af18eac334d174ba03a3729db236 Mon Sep 17 00:00:00 2001 From: alexcoda Date: Sun, 19 Sep 2021 12:22:02 -0700 Subject: [PATCH 19/43] Fix missing logger in tests --- combo_cases_and_deaths/tests/test_run.py | 14 ++++++++------ quidel/tests/test_pull.py | 7 +++---- quidel_covidtest/tests/test_pull.py | 8 ++++---- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/combo_cases_and_deaths/tests/test_run.py b/combo_cases_and_deaths/tests/test_run.py index 8d03627d4..3432d22d1 100644 --- a/combo_cases_and_deaths/tests/test_run.py +++ b/combo_cases_and_deaths/tests/test_run.py @@ -1,4 +1,5 @@ """Tests for running combo cases and deaths indicator.""" +import logging from datetime import date from itertools import product import os @@ -17,6 +18,7 @@ COLUMN_MAPPING) from 
delphi_combo_cases_and_deaths.constants import METRICS, SMOOTH_TYPES, SENSORS +LOGGER = logging.Logger("test_logger") def test_issue_dates(): """The smoothed value for a particular date is computed from the raw @@ -98,7 +100,7 @@ def make_mock(geo): ("1 1", 4, 1 if geo in ["nation", "hhs"] else 2), ("0 0", 2, 0) ]: - df = combine_usafacts_and_jhu("", geo, date_range, fetcher=mock_covidcast_signal) + df = combine_usafacts_and_jhu("", geo, date_range, LOGGER, fetcher=mock_covidcast_signal) assert df.size == expected_size * len(COLUMN_MAPPING), f""" Wrong number of rows in combined data frame for the number of available signals. @@ -126,7 +128,7 @@ def test_multiple_issues(mock_covidcast_signal): }), None ] * 2 - result = combine_usafacts_and_jhu("confirmed_incidence_num", "county", date_range=(0, 1), fetcher=mock_covidcast_signal) + result = combine_usafacts_and_jhu("confirmed_incidence_num", "county", date_range=(0, 1), logger=LOGGER, fetcher=mock_covidcast_signal) pd.testing.assert_frame_equal( result, pd.DataFrame( @@ -186,7 +188,7 @@ def test_combine_usafacts_and_jhu_special_geos(mock_covidcast_signal): ] * 6 # each call to combine_usafacts_and_jhu makes (2 + 2 * len(unique_timestamps)) = 12 calls to the fetcher pd.testing.assert_frame_equal( - combine_usafacts_and_jhu("confirmed_incidence_num", "nation", date_range=(0, 1), fetcher=mock_covidcast_signal), + combine_usafacts_and_jhu("confirmed_incidence_num", "nation", date_range=(0, 1), logger=LOGGER, fetcher=mock_covidcast_signal), pd.DataFrame({"timestamp": [20200101], "geo_id": ["us"], "val": [50 + 100 + 200], @@ -194,7 +196,7 @@ def test_combine_usafacts_and_jhu_special_geos(mock_covidcast_signal): "sample_size": [None]}) ) pd.testing.assert_frame_equal( - combine_usafacts_and_jhu("confirmed_incidence_prop", "nation", date_range=(0, 1), fetcher=mock_covidcast_signal), + combine_usafacts_and_jhu("confirmed_incidence_prop", "nation", date_range=(0, 1), logger=LOGGER, fetcher=mock_covidcast_signal), pd.DataFrame({"timestamp": [20200101], "geo_id": ["us"], "val": [(50 + 100 + 200) / (4903185 + 3723066) * 100000], @@ -202,7 +204,7 @@ def test_combine_usafacts_and_jhu_special_geos(mock_covidcast_signal): "sample_size": [None]}) ) pd.testing.assert_frame_equal( - combine_usafacts_and_jhu("confirmed_incidence_num", "county", date_range=(0, 1), fetcher=mock_covidcast_signal), + combine_usafacts_and_jhu("confirmed_incidence_num", "county", date_range=(0, 1), logger=LOGGER, fetcher=mock_covidcast_signal), pd.DataFrame({"geo_id": ["01000", "01001", "72001"], "val": [50, 100, 200], "timestamp": [20200101, 20200101, 20200101]}, @@ -229,7 +231,7 @@ def test_no_nation_jhu(mock_covidcast_signal): "value": [1], "timestamp": [20200101]}) ] - result = combine_usafacts_and_jhu("_num", "nation", date_range=(0, 1), fetcher=mock_covidcast_signal) + result = combine_usafacts_and_jhu("_num", "nation", date_range=(0, 1), logger=LOGGER, fetcher=mock_covidcast_signal) assert mock_covidcast_signal.call_args_list[-1] == call( "jhu-csse", diff --git a/quidel/tests/test_pull.py b/quidel/tests/test_pull.py index bf27b0bd6..1f36fb85e 100644 --- a/quidel/tests/test_pull.py +++ b/quidel/tests/test_pull.py @@ -14,6 +14,7 @@ END_FROM_TODAY_MINUS = 5 EXPORT_DAY_RANGE = 40 +LOGGER = logging.Logger("test_logger") class TestFixData: def test_fix_zipcode(self): @@ -29,7 +30,7 @@ def test_fix_date(self): datetime(2020, 6, 14), datetime(2020, 7, 10)], "TestDate":[datetime(2020, 1, 19), datetime(2020, 6, 10), datetime(2020, 6, 11), datetime(2020, 7, 2)]}) - df = fix_date(df) + df 
= fix_date(df, LOGGER) assert set(df["timestamp"]) == set([datetime(2020, 5, 19), datetime(2020, 6, 11), datetime(2020, 7, 2)]) @@ -37,8 +38,6 @@ def test_fix_date(self): class TestingPullData: def test_pull_quidel_data(self): - logger = logging.Logger("test_logger") - dfs, _ = pull_quidel_data({ "static_file_dir": "../static", "input_cache_dir": "./cache", @@ -52,7 +51,7 @@ def test_pull_quidel_data(self): "sender": "", "wip_signal": [""], "test_mode": True - }, logger) + }, LOGGER) # For covid_ag df = dfs["covid_ag"] diff --git a/quidel_covidtest/tests/test_pull.py b/quidel_covidtest/tests/test_pull.py index acdae32fe..01df4c7c6 100644 --- a/quidel_covidtest/tests/test_pull.py +++ b/quidel_covidtest/tests/test_pull.py @@ -15,6 +15,8 @@ END_FROM_TODAY_MINUS = 5 EXPORT_DAY_RANGE = 40 +LOGGER = logging.Logger("test_logger") + class TestFixData: def test_fix_zipcode(self): @@ -29,7 +31,7 @@ def test_fix_date(self): datetime(2020, 6, 14), datetime(2020, 7, 10)], "TestDate":[datetime(2020, 1, 19), datetime(2020, 6, 10), datetime(2020, 6, 11), datetime(2020, 7, 2)]}) - df = fix_date(df) + df = fix_date(df, LOGGER) assert set(df["timestamp"]) == set([datetime(2020, 5, 19), datetime(2020, 6, 11), datetime(2020, 7, 2)]) @@ -37,8 +39,6 @@ def test_fix_date(self): class TestingPullData: def test_pull_quidel_covidtest(self): - logger = logging.Logger("test_logger") - df, _ = pull_quidel_covidtest({ "static_file_dir": "../static", "input_cache_dir": "./cache", @@ -53,7 +53,7 @@ def test_pull_quidel_covidtest(self): "bucket_name": "", "wip_signal": "", "test_mode": True - }, logger) + }, LOGGER) first_date = df["timestamp"].min().date() last_date = df["timestamp"].max().date() From 39e7dff42c94d6c4cc9eb14413e93933f3a6d9ae Mon Sep 17 00:00:00 2001 From: alexcoda Date: Sun, 19 Sep 2021 12:33:18 -0700 Subject: [PATCH 20/43] Instantiate logger correctly in tests --- combo_cases_and_deaths/tests/test_run.py | 14 +++++++------- quidel/tests/test_pull.py | 6 +++--- quidel_covidtest/tests/test_pull.py | 6 +++--- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/combo_cases_and_deaths/tests/test_run.py b/combo_cases_and_deaths/tests/test_run.py index 3432d22d1..e83af9abb 100644 --- a/combo_cases_and_deaths/tests/test_run.py +++ b/combo_cases_and_deaths/tests/test_run.py @@ -18,7 +18,7 @@ COLUMN_MAPPING) from delphi_combo_cases_and_deaths.constants import METRICS, SMOOTH_TYPES, SENSORS -LOGGER = logging.Logger("test_logger") +TEST_LOGGER = logging.getLogger() def test_issue_dates(): """The smoothed value for a particular date is computed from the raw @@ -100,7 +100,7 @@ def make_mock(geo): ("1 1", 4, 1 if geo in ["nation", "hhs"] else 2), ("0 0", 2, 0) ]: - df = combine_usafacts_and_jhu("", geo, date_range, LOGGER, fetcher=mock_covidcast_signal) + df = combine_usafacts_and_jhu("", geo, date_range, TEST_LOGGER, fetcher=mock_covidcast_signal) assert df.size == expected_size * len(COLUMN_MAPPING), f""" Wrong number of rows in combined data frame for the number of available signals. 
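The switch from logging.Logger("test_logger") to logging.getLogger() matters because constructing a Logger directly creates a standalone object that the logging manager never registers, so it is not the logger other code retrieves by name and it does not pick up the root configuration. A small standard-library-only illustration of the difference:

    # Illustration of why the tests switch constructors; standard library only.
    import logging

    direct = logging.Logger("example")       # standalone instance, unknown to the manager
    managed = logging.getLogger("example")   # registered with the logging manager

    # Repeated getLogger calls return the same registered object...
    assert logging.getLogger("example") is managed
    # ...but never the directly constructed one.
    assert logging.getLogger("example") is not direct
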
@@ -128,7 +128,7 @@ def test_multiple_issues(mock_covidcast_signal): }), None ] * 2 - result = combine_usafacts_and_jhu("confirmed_incidence_num", "county", date_range=(0, 1), logger=LOGGER, fetcher=mock_covidcast_signal) + result = combine_usafacts_and_jhu("confirmed_incidence_num", "county", date_range=(0, 1), logger=TEST_LOGGER, fetcher=mock_covidcast_signal) pd.testing.assert_frame_equal( result, pd.DataFrame( @@ -188,7 +188,7 @@ def test_combine_usafacts_and_jhu_special_geos(mock_covidcast_signal): ] * 6 # each call to combine_usafacts_and_jhu makes (2 + 2 * len(unique_timestamps)) = 12 calls to the fetcher pd.testing.assert_frame_equal( - combine_usafacts_and_jhu("confirmed_incidence_num", "nation", date_range=(0, 1), logger=LOGGER, fetcher=mock_covidcast_signal), + combine_usafacts_and_jhu("confirmed_incidence_num", "nation", date_range=(0, 1), logger=TEST_LOGGER, fetcher=mock_covidcast_signal), pd.DataFrame({"timestamp": [20200101], "geo_id": ["us"], "val": [50 + 100 + 200], @@ -196,7 +196,7 @@ def test_combine_usafacts_and_jhu_special_geos(mock_covidcast_signal): "sample_size": [None]}) ) pd.testing.assert_frame_equal( - combine_usafacts_and_jhu("confirmed_incidence_prop", "nation", date_range=(0, 1), logger=LOGGER, fetcher=mock_covidcast_signal), + combine_usafacts_and_jhu("confirmed_incidence_prop", "nation", date_range=(0, 1), logger=TEST_LOGGER, fetcher=mock_covidcast_signal), pd.DataFrame({"timestamp": [20200101], "geo_id": ["us"], "val": [(50 + 100 + 200) / (4903185 + 3723066) * 100000], @@ -204,7 +204,7 @@ def test_combine_usafacts_and_jhu_special_geos(mock_covidcast_signal): "sample_size": [None]}) ) pd.testing.assert_frame_equal( - combine_usafacts_and_jhu("confirmed_incidence_num", "county", date_range=(0, 1), logger=LOGGER, fetcher=mock_covidcast_signal), + combine_usafacts_and_jhu("confirmed_incidence_num", "county", date_range=(0, 1), logger=TEST_LOGGER, fetcher=mock_covidcast_signal), pd.DataFrame({"geo_id": ["01000", "01001", "72001"], "val": [50, 100, 200], "timestamp": [20200101, 20200101, 20200101]}, @@ -231,7 +231,7 @@ def test_no_nation_jhu(mock_covidcast_signal): "value": [1], "timestamp": [20200101]}) ] - result = combine_usafacts_and_jhu("_num", "nation", date_range=(0, 1), logger=LOGGER, fetcher=mock_covidcast_signal) + result = combine_usafacts_and_jhu("_num", "nation", date_range=(0, 1), logger=TEST_LOGGER, fetcher=mock_covidcast_signal) assert mock_covidcast_signal.call_args_list[-1] == call( "jhu-csse", diff --git a/quidel/tests/test_pull.py b/quidel/tests/test_pull.py index 1f36fb85e..17f596ce9 100644 --- a/quidel/tests/test_pull.py +++ b/quidel/tests/test_pull.py @@ -14,7 +14,7 @@ END_FROM_TODAY_MINUS = 5 EXPORT_DAY_RANGE = 40 -LOGGER = logging.Logger("test_logger") +TEST_LOGGER = logging.getLogger() class TestFixData: def test_fix_zipcode(self): @@ -30,7 +30,7 @@ def test_fix_date(self): datetime(2020, 6, 14), datetime(2020, 7, 10)], "TestDate":[datetime(2020, 1, 19), datetime(2020, 6, 10), datetime(2020, 6, 11), datetime(2020, 7, 2)]}) - df = fix_date(df, LOGGER) + df = fix_date(df, TEST_LOGGER) assert set(df["timestamp"]) == set([datetime(2020, 5, 19), datetime(2020, 6, 11), datetime(2020, 7, 2)]) @@ -51,7 +51,7 @@ def test_pull_quidel_data(self): "sender": "", "wip_signal": [""], "test_mode": True - }, LOGGER) + }, TEST_LOGGER) # For covid_ag df = dfs["covid_ag"] diff --git a/quidel_covidtest/tests/test_pull.py b/quidel_covidtest/tests/test_pull.py index 01df4c7c6..17ddbb6fd 100644 --- a/quidel_covidtest/tests/test_pull.py +++ 
b/quidel_covidtest/tests/test_pull.py @@ -15,7 +15,7 @@ END_FROM_TODAY_MINUS = 5 EXPORT_DAY_RANGE = 40 -LOGGER = logging.Logger("test_logger") +TEST_LOGGER = logging.getLogger() class TestFixData: def test_fix_zipcode(self): @@ -31,7 +31,7 @@ def test_fix_date(self): datetime(2020, 6, 14), datetime(2020, 7, 10)], "TestDate":[datetime(2020, 1, 19), datetime(2020, 6, 10), datetime(2020, 6, 11), datetime(2020, 7, 2)]}) - df = fix_date(df, LOGGER) + df = fix_date(df, TEST_LOGGER) assert set(df["timestamp"]) == set([datetime(2020, 5, 19), datetime(2020, 6, 11), datetime(2020, 7, 2)]) @@ -53,7 +53,7 @@ def test_pull_quidel_covidtest(self): "bucket_name": "", "wip_signal": "", "test_mode": True - }, LOGGER) + }, TEST_LOGGER) first_date = df["timestamp"].min().date() last_date = df["timestamp"].max().date() From af7a90ebed462e9a3653325d4ffea7d85396e8cd Mon Sep 17 00:00:00 2001 From: alexcoda Date: Sun, 19 Sep 2021 12:38:08 -0700 Subject: [PATCH 21/43] Fix error check --- quidel/delphi_quidel/data_tools.py | 4 ++-- quidel_covidtest/delphi_quidel_covidtest/data_tools.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/quidel/delphi_quidel/data_tools.py b/quidel/delphi_quidel/data_tools.py index c0ebeb750..92afb2159 100644 --- a/quidel/delphi_quidel/data_tools.py +++ b/quidel/delphi_quidel/data_tools.py @@ -287,9 +287,9 @@ def raw_tests_per_device(devices, tests, min_obs): """ devices = devices.astype(float) tests = tests.astype(float) - if np.any(np.isnan(devices)) or np.any(d < 0 for d in devices): + if np.any(np.isnan(devices)) or np.any(devices < 0): raise ValueError("devices should be non-negative with no np.nan") - if np.any(np.isnan(tests)) or np.any(d < 0 for d in tests): + if np.any(np.isnan(tests)) or np.any(tests < 0): raise ValueError("tests should be non-negative with no np.nan") if min_obs <= 0: raise ValueError('min_obs should be positive') diff --git a/quidel_covidtest/delphi_quidel_covidtest/data_tools.py b/quidel_covidtest/delphi_quidel_covidtest/data_tools.py index fac0b58b2..f89a353ed 100644 --- a/quidel_covidtest/delphi_quidel_covidtest/data_tools.py +++ b/quidel_covidtest/delphi_quidel_covidtest/data_tools.py @@ -293,9 +293,9 @@ def raw_tests_per_device(devices, tests, min_obs): """ devices = devices.astype(float) tests = tests.astype(float) - if np.any(np.isnan(devices)) or np.any(d < 0 for d in devices): + if np.any(np.isnan(devices)) or np.any(devices < 0): raise ValueError("devices should be non-negative with no np.nan") - if np.any(np.isnan(tests)) or np.any(d < 0 for d in tests): + if np.any(np.isnan(tests)) or np.any(tests < 0): raise ValueError("tests should be non-negative with no np.nan") if min_obs <= 0: raise ValueError('min_obs should be positive') From 985e98b84eca3bd311c4c08bd6d8cdc4d4ec218b Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 20 Sep 2021 12:07:35 -0400 Subject: [PATCH 22/43] only install remotes if not avail; upgrade as needed --- .github/workflows/r-ci.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/r-ci.yml b/.github/workflows/r-ci.yml index 116537194..68f175983 100644 --- a/.github/workflows/r-ci.yml +++ b/.github/workflows/r-ci.yml @@ -52,8 +52,11 @@ jobs: install.packages("devtools") devtools::install_version("readr", version = "1.4.0") } - install.packages("remotes") - remotes::update_packages(c("rcmdcheck", "mockr"), upgrade="always") + + if ( !require("remotes") ) { + install.packages("remotes") + } + 
remotes::update_packages(c("rcmdcheck", "mockr", "remotes"), upgrade="always") dependency_list <- remotes::dev_package_deps(dependencies=TRUE) remotes::update_packages(dependency_list$package[dependency_list$package != "readr"], upgrade="always") shell: Rscript {0} From b32ed7f343c75835a6f47cd12120ca2f2588947d Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 20 Sep 2021 12:08:18 -0400 Subject: [PATCH 23/43] test run with no cache --- .github/workflows/r-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/r-ci.yml b/.github/workflows/r-ci.yml index 68f175983..40d76986f 100644 --- a/.github/workflows/r-ci.yml +++ b/.github/workflows/r-ci.yml @@ -43,12 +43,12 @@ jobs: uses: actions/cache@v2 with: path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-facebook-survey-${{ steps.get-date.outputs.date }} + key: ${{ runner.os }}-r-facebook-survey-${{ steps.get-date.outputs.date }}-testcache restore-keys: | - ${{ runner.os }}-r-facebook-survey- + ${{ runner.os }}-r-facebook-survey-testcache - name: Install R dependencies run: | - if ( packageVersion("readr") != "1.4.0" ) { + if ( !require("readr") || packageVersion("readr") != "1.4.0" ) { install.packages("devtools") devtools::install_version("readr", version = "1.4.0") } From 3669748759ddd8fb70bef6477a1f17d862bdaae1 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 20 Sep 2021 12:40:32 -0400 Subject: [PATCH 24/43] Revert "test run with no cache" This reverts commit b32ed7f343c75835a6f47cd12120ca2f2588947d. --- .github/workflows/r-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/r-ci.yml b/.github/workflows/r-ci.yml index 40d76986f..68f175983 100644 --- a/.github/workflows/r-ci.yml +++ b/.github/workflows/r-ci.yml @@ -43,12 +43,12 @@ jobs: uses: actions/cache@v2 with: path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-facebook-survey-${{ steps.get-date.outputs.date }}-testcache + key: ${{ runner.os }}-r-facebook-survey-${{ steps.get-date.outputs.date }} restore-keys: | - ${{ runner.os }}-r-facebook-survey-testcache + ${{ runner.os }}-r-facebook-survey- - name: Install R dependencies run: | - if ( !require("readr") || packageVersion("readr") != "1.4.0" ) { + if ( packageVersion("readr") != "1.4.0" ) { install.packages("devtools") devtools::install_version("readr", version = "1.4.0") } From 25f1b4ea6f3904627fc23e9a136619500bca09a5 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 20 Sep 2021 16:16:39 -0400 Subject: [PATCH 25/43] set E2 to integer on read --- facebook/delphiFacebook/R/responses.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/facebook/delphiFacebook/R/responses.R b/facebook/delphiFacebook/R/responses.R index a6a92a7a2..bca1d72a8 100644 --- a/facebook/delphiFacebook/R/responses.R +++ b/facebook/delphiFacebook/R/responses.R @@ -129,7 +129,10 @@ load_response_one <- function(input_filename, params, contingency_run) { Q79 = col_integer(), Q80 = col_integer(), I5 = col_character(), - I7 = col_character()), + I7 = col_character(), + E2_1 = col_integer(), + E2_2 = col_integer() + ), locale = locale(grouping_mark = "")) if (nrow(input_data) == 0) { return(tibble()) From 056ad8153527c38bf25e16ceba65e71ff8a9f7d8 Mon Sep 17 00:00:00 2001 From: Brian Clark Date: Tue, 21 Sep 2021 15:54:08 -0400 Subject: [PATCH 26/43] Add new host to inventory --- ansible/inventory | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/inventory b/ansible/inventory index 424d05c3d..acabbb490 100644 --- a/ansible/inventory +++ b/ansible/inventory @@ -1,5 +1,5 @@ [runtime_host] -delphi-master-prod-01.delphi.cmu.edu +bigchunk-dev-02.delphi.cmu.edu [runtime_host_staging] app-mono-dev-01.delphi.cmu.edu From 9ff405f50275d5aaec811e19d2a4edd54517f0b5 Mon Sep 17 00:00:00 2001 From: Brian Clark Date: Tue, 21 Sep 2021 17:09:06 -0400 Subject: [PATCH 27/43] Re-add primary back to inventory --- ansible/inventory | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/inventory b/ansible/inventory index acabbb490..d67b775c6 100644 --- a/ansible/inventory +++ b/ansible/inventory @@ -1,4 +1,5 @@ [runtime_host] +delphi-master-prod-01.delphi.cmu.edu bigchunk-dev-02.delphi.cmu.edu [runtime_host_staging] From 5f5b292e2c661198fb2a77f2504e194cf884801f Mon Sep 17 00:00:00 2001 From: Alex Coda Date: Tue, 21 Sep 2021 17:34:46 -0700 Subject: [PATCH 28/43] Update quidel/delphi_quidel/pull.py Co-authored-by: Katie Mazaitis --- quidel/delphi_quidel/pull.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quidel/delphi_quidel/pull.py b/quidel/delphi_quidel/pull.py index 1643f9304..ffbba3283 100644 --- a/quidel/delphi_quidel/pull.py +++ b/quidel/delphi_quidel/pull.py @@ -176,7 +176,7 @@ def fix_date(df, logger): df.insert(2, "timestamp", df["TestDate"]) mask = df["TestDate"] <= df["StorageDate"] - logger.info(f"Removing {((len(df) - np.sum(mask)) * 100 / len(df)):.2f} of unusual data") + logger.info(f"Removing {((len(df) - np.sum(mask)) * 100 / len(df)):.2f}% of unusual data") df = df[mask] mask = df["StorageDate"] - df["TestDate"] > pd.Timedelta(days=90) From 18ae0729b9450e9159fef5faaeea9152ff28d98e Mon Sep 17 00:00:00 2001 From: Alex Coda Date: Tue, 21 Sep 2021 17:34:50 -0700 Subject: [PATCH 29/43] Update quidel/delphi_quidel/pull.py Co-authored-by: Katie Mazaitis --- quidel/delphi_quidel/pull.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quidel/delphi_quidel/pull.py b/quidel/delphi_quidel/pull.py index ffbba3283..f168b3355 100644 --- a/quidel/delphi_quidel/pull.py +++ b/quidel/delphi_quidel/pull.py @@ -180,7 +180,7 @@ def fix_date(df, logger): df = df[mask] mask = df["StorageDate"] - df["TestDate"] > pd.Timedelta(days=90) - logger.info(f"Fixing {(np.sum(mask) * 100 / len(df)):.2f} of outdated data") + logger.info(f"Fixing {(np.sum(mask) * 100 / len(df)):.2f}% of outdated data") df["timestamp"].values[mask] = df["StorageDate"].values[mask] return df From 1aa181c3151baf5d431f1c3a772634f7a37311f4 Mon Sep 17 00:00:00 2001 From: Alex Coda Date: Tue, 21 Sep 2021 17:34:57 -0700 Subject: [PATCH 30/43] Update quidel_covidtest/delphi_quidel_covidtest/pull.py Co-authored-by: Katie Mazaitis --- quidel_covidtest/delphi_quidel_covidtest/pull.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quidel_covidtest/delphi_quidel_covidtest/pull.py b/quidel_covidtest/delphi_quidel_covidtest/pull.py index 9ce036e10..b5f9eb9d0 100644 --- a/quidel_covidtest/delphi_quidel_covidtest/pull.py +++ b/quidel_covidtest/delphi_quidel_covidtest/pull.py @@ -99,7 +99,7 @@ def fix_date(df, logger): df.insert(2, "timestamp", df["TestDate"]) mask = df["TestDate"] <= df["StorageDate"] - logger.info(f"Removing {((len(df) - np.sum(mask)) * 100 / len(df)):.2f} of unusual data") + logger.info(f"Removing {((len(df) - np.sum(mask)) * 100 / len(df)):.2f}% of unusual data") df = df[mask] mask = df["StorageDate"] - df["TestDate"] > pd.Timedelta(days=90) From 
b6361dec6ff234a170974ffe59543b3ed71e181e Mon Sep 17 00:00:00 2001 From: Alex Coda Date: Tue, 21 Sep 2021 17:35:03 -0700 Subject: [PATCH 31/43] Update quidel_covidtest/delphi_quidel_covidtest/pull.py Co-authored-by: Katie Mazaitis --- quidel_covidtest/delphi_quidel_covidtest/pull.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quidel_covidtest/delphi_quidel_covidtest/pull.py b/quidel_covidtest/delphi_quidel_covidtest/pull.py index b5f9eb9d0..3efa9ed23 100644 --- a/quidel_covidtest/delphi_quidel_covidtest/pull.py +++ b/quidel_covidtest/delphi_quidel_covidtest/pull.py @@ -103,7 +103,7 @@ def fix_date(df, logger): df = df[mask] mask = df["StorageDate"] - df["TestDate"] > pd.Timedelta(days=90) - logger.info(f"Fixing {(np.sum(mask) * 100 / len(df)):.2f} of outdated data") + logger.info(f"Fixing {(np.sum(mask) * 100 / len(df)):.2f}% of outdated data") df["timestamp"].values[mask] = df["StorageDate"].values[mask] return df From 8b22079b2c5c6c1c19a44b0eabddcb1b74c58014 Mon Sep 17 00:00:00 2001 From: Andrew Chin Date: Tue, 21 Sep 2021 21:42:20 -0400 Subject: [PATCH 32/43] Switch CDC Covidnet to use structed logger --- cdc_covidnet/delphi_cdc_covidnet/covidnet.py | 6 +++--- cdc_covidnet/delphi_cdc_covidnet/run.py | 22 ++++++++++++-------- cdc_covidnet/tests/test_covidnet.py | 6 ++++-- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/cdc_covidnet/delphi_cdc_covidnet/covidnet.py b/cdc_covidnet/delphi_cdc_covidnet/covidnet.py index 03c2b7775..b202e2fa3 100644 --- a/cdc_covidnet/delphi_cdc_covidnet/covidnet.py +++ b/cdc_covidnet/delphi_cdc_covidnet/covidnet.py @@ -6,7 +6,7 @@ """ import json -import logging +from logging import Logger import os from typing import Tuple, List from multiprocessing import cpu_count, Pool @@ -100,7 +100,7 @@ def download_hosp_data( @staticmethod def download_all_hosp_data( - mappings_file: str, cache_path: str, parallel: bool = False + mappings_file: str, cache_path: str, logger: Logger, parallel: bool = False ) -> List[str]: """ Download hospitalization data for all states listed in the mappings JSON file to disk. @@ -146,7 +146,7 @@ def download_all_hosp_data( else: for args in state_args: CovidNet.download_hosp_data(*args) - logging.debug("Downloading for nid=%s, cid=%s", args[0], args[1]) + logger.debug("Downloading for nid=%s, cid=%s", args[0], args[1]) return state_files diff --git a/cdc_covidnet/delphi_cdc_covidnet/run.py b/cdc_covidnet/delphi_cdc_covidnet/run.py index 87e1419ce..0214d52ae 100644 --- a/cdc_covidnet/delphi_cdc_covidnet/run.py +++ b/cdc_covidnet/delphi_cdc_covidnet/run.py @@ -4,12 +4,13 @@ This module should contain a function called `run_module`, that is executed when the module is run with `python -m delphi_cdc_covidnet`. """ -import logging from datetime import datetime from os import remove from os.path import join from typing import Dict, Any +from delphi_utils import get_structured_logger + from .covidnet import CovidNet from .update_sensor import update_sensor @@ -32,7 +33,9 @@ def run_module(params: Dict[str, Dict[str, Any]]): - "wip_signal": list of str or bool, to be passed to delphi_utils.add_prefix. - "input_cache_dir": str, directory to download source files. 
""" - logging.basicConfig(level=logging.DEBUG) + logger = get_structured_logger( + __name__, filename=params["common"].get("log_filename"), + log_exceptions=params["common"].get("log_exceptions", True)) start_date = datetime.strptime(params["indicator"]["start_date"], "%Y-%m-%d") @@ -42,15 +45,15 @@ def run_module(params: Dict[str, Dict[str, Any]]): else: end_date = datetime.strptime(params["indicator"]["end_date"], "%Y-%m-%d") - logging.info("start date:\t%s", start_date.date()) - logging.info("end date:\t%s", end_date.date()) + logger.info("start date:\t%s", start_date.date()) + logger.info("end date:\t%s", end_date.date()) - logging.info("outpath:\t%s", params["common"]["export_dir"]) - logging.info("parallel:\t%s", params["indicator"]["parallel"]) + logger.info("outpath:\t%s", params["common"]["export_dir"]) + logger.info("parallel:\t%s", params["indicator"]["parallel"]) # Only geo is state, and no weekday adjustment for now # COVID-NET data is by weeks anyway, not daily - logging.info("starting state, no adj") + logger.info("starting state, no adj") # Download latest COVID-NET files into the cache directory first mappings_file = join(params["indicator"]["input_cache_dir"], "init.json") @@ -58,7 +61,8 @@ def run_module(params: Dict[str, Dict[str, Any]]): _, mmwr_info, _ = CovidNet.read_mappings(mappings_file) state_files = CovidNet.download_all_hosp_data( mappings_file, params["indicator"]["input_cache_dir"], - parallel=params["indicator"]["parallel"]) + parallel=params["indicator"]["parallel"], + logger=logger) update_sensor( state_files, @@ -73,4 +77,4 @@ def run_module(params: Dict[str, Dict[str, Any]]): for state_file in state_files: remove(state_file) - logging.info("finished all") + logger.info("finished all") diff --git a/cdc_covidnet/tests/test_covidnet.py b/cdc_covidnet/tests/test_covidnet.py index 6846b9f5e..efe03fe29 100644 --- a/cdc_covidnet/tests/test_covidnet.py +++ b/cdc_covidnet/tests/test_covidnet.py @@ -1,4 +1,5 @@ import json +import logging from os.path import join, exists from tempfile import TemporaryDirectory @@ -7,6 +8,7 @@ from delphi_cdc_covidnet.api_config import APIConfig from delphi_cdc_covidnet.covidnet import CovidNet +TEST_LOGGER = logging.getLogger() class TestCovidNet: @@ -65,14 +67,14 @@ def test_hosp_data(self): # Non-parallel state_files = CovidNet.download_all_hosp_data( - init_file, temp_dir, parallel=False) + init_file, temp_dir, TEST_LOGGER, parallel=False) assert len(state_files) == num_states for state_file in state_files: assert exists(state_file) # Parallel state_files_par = CovidNet.download_all_hosp_data( - init_file, temp_dir, parallel=True) + init_file, temp_dir, TEST_LOGGER, parallel=True) assert set(state_files) == set(state_files_par) assert len(state_files_par) == num_states for state_file in state_files_par: From c7272fc2b9ce135e428a5c10d5e21b5bc830b704 Mon Sep 17 00:00:00 2001 From: Andrew Chin Date: Tue, 21 Sep 2021 22:04:21 -0400 Subject: [PATCH 33/43] Switch to structed logger for ChangeHC --- changehc/delphi_changehc/run.py | 3 ++- changehc/delphi_changehc/sensor.py | 4 ++-- changehc/delphi_changehc/update_sensor.py | 26 +++++++++++++---------- changehc/tests/test_sensor.py | 4 +++- changehc/tests/test_update_sensor.py | 26 ++++++++++++++++------- 5 files changed, 40 insertions(+), 23 deletions(-) diff --git a/changehc/delphi_changehc/run.py b/changehc/delphi_changehc/run.py index c9b340403..9580b5728 100644 --- a/changehc/delphi_changehc/run.py +++ b/changehc/delphi_changehc/run.py @@ -173,7 +173,8 @@ def run_module(params: 
Dict[str, Dict[str, Any]]): weekday, numtype, params["indicator"]["se"], - params["indicator"]["wip_signal"] + params["indicator"]["wip_signal"], + logger ) if numtype == "covid": data = load_combined_data(file_dict["denom"], diff --git a/changehc/delphi_changehc/sensor.py b/changehc/delphi_changehc/sensor.py index f4a8934ab..d1422567b 100644 --- a/changehc/delphi_changehc/sensor.py +++ b/changehc/delphi_changehc/sensor.py @@ -87,7 +87,7 @@ def backfill( return new_num, new_den @staticmethod - def fit(y_data, first_sensor_date, geo_id, num_col="num", den_col="den"): + def fit(y_data, first_sensor_date, geo_id, logger, num_col="num", den_col="den"): """Fitting routine. Args: @@ -121,7 +121,7 @@ def fit(y_data, first_sensor_date, geo_id, num_col="num", den_col="den"): se_valid = valid_rates.eval('sqrt(rate * (1 - rate) / den)') rate_data['se'] = se_valid - logging.debug("{0}: {1:.3f},[{2:.3f}]".format( + logger.debug("{0}: {1:.3f},[{2:.3f}]".format( geo_id, rate_data['rate'][-1], rate_data['se'][-1] )) return {"geo_id": geo_id, diff --git a/changehc/delphi_changehc/update_sensor.py b/changehc/delphi_changehc/update_sensor.py index a87ea853f..95de8fe21 100644 --- a/changehc/delphi_changehc/update_sensor.py +++ b/changehc/delphi_changehc/update_sensor.py @@ -20,7 +20,7 @@ from .weekday import Weekday -def write_to_csv(df, geo_level, write_se, day_shift, out_name, output_path=".", start_date=None, end_date=None): +def write_to_csv(df, geo_level, write_se, day_shift, out_name, logger, output_path=".", start_date=None, end_date=None): """Write sensor values to csv. Args: @@ -47,7 +47,7 @@ def write_to_csv(df, geo_level, write_se, day_shift, out_name, output_path=".", assert df[suspicious_se_mask].empty, " se contains suspiciously large values" assert not df["se"].isna().any(), " se contains nan values" if write_se: - logging.info("========= WARNING: WRITING SEs TO {0} =========".format(out_name)) + logger.info("========= WARNING: WRITING SEs TO {0} =========".format(out_name)) else: df.loc[:, "se"] = np.nan @@ -55,7 +55,7 @@ def write_to_csv(df, geo_level, write_se, day_shift, out_name, output_path=".", suspicious_val_mask = df["val"].gt(90) if not df[suspicious_val_mask].empty: for geo in df.loc[suspicious_val_mask, "geo_id"]: - logging.warning("value suspiciously high, {0}: {1}".format( + logger.warning("value suspiciously high, {0}: {1}".format( geo, out_name )) @@ -68,10 +68,10 @@ def write_to_csv(df, geo_level, write_se, day_shift, out_name, output_path=".", sensor=out_name, write_empty_days=True ) - logging.debug("wrote {0} rows for {1} {2}".format( + logger.debug("wrote {0} rows for {1} {2}".format( df.size, df["geo_id"].unique().size, geo_level )) - logging.debug("wrote files to {0}".format(output_path)) + logger.debug("wrote files to {0}".format(output_path)) return dates @@ -87,7 +87,8 @@ def __init__(self, weekday, numtype, se, - wip_signal): + wip_signal, + logger): """Init Sensor Updator. 
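The ChangeHC changes apply the same injection pattern end to end: the logger is handed to the updator's constructor and reused by its helpers instead of each module calling the logging functions directly. A simplified sketch of that shape, using a made-up class rather than the indicator's real updator:

    # Simplified sketch of constructor injection as used here; standard library only,
    # and the class and fields below are illustrative, not the real CHC updator.
    import logging

    class SensorUpdator:
        def __init__(self, geo: str, logger: logging.Logger):
            self.geo = geo
            self.logger = logger

        def update(self, values):
            if not values:
                self.logger.error("%s has no values to update", self.geo)
                return None
            self.logger.debug("updating %d values for %s", len(values), self.geo)
            return sum(values) / len(values)

    updator = SensorUpdator("county", logging.getLogger("changehc"))
    updator.update([1.0, 2.0, 3.0])
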
Args: @@ -100,7 +101,9 @@ def __init__(self, numtype: type of count data used, one of ["covid", "cli"] se: boolean to write out standard errors, if true, use an obfuscated name wip_signal: Prefix for WIP signals + logger: the structured logger """ + self.logger = logger self.startdate, self.enddate, self.dropdate = [ pd.to_datetime(t) for t in (startdate, enddate, dropdate)] # handle dates @@ -149,7 +152,7 @@ def geo_reindex(self, data): geo = self.geo gmpr = GeoMapper() if geo not in {"county", "state", "msa", "hrr", "nation", "hhs"}: - logging.error("{0} is invalid, pick one of 'county', " + self.logger.error("{0} is invalid, pick one of 'county', " "'state', 'msa', 'hrr', 'hss','nation'".format(geo)) return False if geo == "county": @@ -201,12 +204,12 @@ def update_sensor(self, sub_data.reset_index(level=0,inplace=True) if self.weekday: sub_data = Weekday.calc_adjustment(wd_params, sub_data) - res = CHCSensor.fit(sub_data, self.burnindate, geo_id) + res = CHCSensor.fit(sub_data, self.burnindate, geo_id, self.logger) res = pd.DataFrame(res).loc[final_sensor_idxs] dfs.append(res) else: n_cpu = min(10, cpu_count()) - logging.debug("starting pool with {0} workers".format(n_cpu)) + self.logger.debug("starting pool with {0} workers".format(n_cpu)) with Pool(n_cpu) as pool: pool_results = [] for geo_id, sub_data in data_frame.groupby(level=0,as_index=False): @@ -215,7 +218,7 @@ def update_sensor(self, sub_data = Weekday.calc_adjustment(wd_params, sub_data) pool_results.append( pool.apply_async( - CHCSensor.fit, args=(sub_data, self.burnindate, geo_id,), + CHCSensor.fit, args=(sub_data, self.burnindate, geo_id, self.logger), ) ) pool_results = [proc.get() for proc in pool_results] @@ -244,7 +247,8 @@ def update_sensor(self, write_se=self.se, day_shift=Config.DAY_SHIFT, out_name=signal, - output_path=output_path + output_path=output_path, + logger=self.logger ) if len(dates) > 0: stats.append((max(dates), len(dates))) diff --git a/changehc/tests/test_sensor.py b/changehc/tests/test_sensor.py index 7c4aef01d..32afc3081 100644 --- a/changehc/tests/test_sensor.py +++ b/changehc/tests/test_sensor.py @@ -1,4 +1,5 @@ # standard +import logging import numpy as np import numpy.random as nr @@ -19,6 +20,7 @@ COVID_FILEPATH = PARAMS["indicator"]["input_covid_file"] DENOM_FILEPATH = PARAMS["indicator"]["input_denom_file"] DROP_DATE = pd.to_datetime(PARAMS["indicator"]["drop_date"]) +TEST_LOGGER = logging.getLogger() class TestLoadData: combined_data = load_combined_data(DENOM_FILEPATH, COVID_FILEPATH, DROP_DATE, @@ -56,7 +58,7 @@ def test_fit_fips(self): for fips in all_fips: sub_data = self.combined_data.loc[fips] sub_data = sub_data.reindex(date_range, fill_value=0) - res0 = CHCSensor.fit(sub_data, date_range[0], fips) + res0 = CHCSensor.fit(sub_data, date_range[0], fips, TEST_LOGGER) if np.isnan(res0["rate"]).all(): assert res0["incl"].sum() == 0 diff --git a/changehc/tests/test_update_sensor.py b/changehc/tests/test_update_sensor.py index 779960bf7..c202c4325 100644 --- a/changehc/tests/test_update_sensor.py +++ b/changehc/tests/test_update_sensor.py @@ -1,4 +1,5 @@ # standard +import logging from copy import deepcopy import os from os.path import join, exists @@ -27,6 +28,7 @@ DENOM_FILEPATH = PARAMS["indicator"]["input_denom_file"] DROP_DATE = pd.to_datetime(PARAMS["indicator"]["drop_date"]) OUTPATH="test_data/" +TEST_LOGGER = logging.getLogger() class TestCHCSensorUpdator: """Tests for updating the sensors.""" @@ -53,7 +55,8 @@ def test_shift_dates(self): self.weekday, self.numtype, self.se, - "" + 
"", + TEST_LOGGER ) ## Test init assert su_inst.startdate.month == 2 @@ -77,7 +80,8 @@ def test_geo_reindex(self): self.weekday, self.numtype, self.se, - "" + "", + TEST_LOGGER ) su_inst.shift_dates() test_data = pd.DataFrame({ @@ -103,7 +107,8 @@ def test_update_sensor(self): self.weekday, self.numtype, self.se, - "" + "", + TEST_LOGGER ) # As of 3/3/21 (40c258a), this set of data has county outputting data, state and hhs not # outputting data, and nation outputting data, which is undesirable. Ideal behaviour @@ -149,7 +154,8 @@ def test_write_to_csv_results(self): write_se=False, day_shift=CONFIG.DAY_SHIFT, out_name="name_of_signal", - output_path=td.name + output_path=td.name, + logger=TEST_LOGGER ) # check outputs @@ -203,7 +209,8 @@ def test_write_to_csv_with_se_results(self): write_se=True, day_shift=CONFIG.DAY_SHIFT, out_name="name_of_signal", - output_path=td.name + output_path=td.name, + logger=TEST_LOGGER ) # check outputs @@ -243,7 +250,8 @@ def test_write_to_csv_wrong_results(self): write_se=False, day_shift=CONFIG.DAY_SHIFT, out_name="name_of_signal", - output_path=td.name + output_path=td.name, + logger=TEST_LOGGER ) # nan se for included loc-date @@ -258,7 +266,8 @@ def test_write_to_csv_wrong_results(self): write_se=True, day_shift=CONFIG.DAY_SHIFT, out_name="name_of_signal", - output_path=td.name + output_path=td.name, + logger=TEST_LOGGER ) # large se value @@ -273,7 +282,8 @@ def test_write_to_csv_wrong_results(self): write_se=True, day_shift=CONFIG.DAY_SHIFT, out_name="name_of_signal", - output_path=td.name + output_path=td.name, + logger=TEST_LOGGER ) td.cleanup() From 5b56357ce5cb7f3b495414203bfee84aa2fc25fd Mon Sep 17 00:00:00 2001 From: Andrew Chin Date: Tue, 21 Sep 2021 22:22:17 -0400 Subject: [PATCH 34/43] switch doctor visits to structured logger --- doctor_visits/delphi_doctor_visits/run.py | 44 ++++++++++--------- doctor_visits/delphi_doctor_visits/sensor.py | 10 ++--- .../delphi_doctor_visits/update_sensor.py | 18 ++++---- doctor_visits/delphi_doctor_visits/weekday.py | 7 ++- doctor_visits/tests/test_update_sensor.py | 7 ++- 5 files changed, 46 insertions(+), 40 deletions(-) diff --git a/doctor_visits/delphi_doctor_visits/run.py b/doctor_visits/delphi_doctor_visits/run.py index 65f30ed69..84bc1af0e 100644 --- a/doctor_visits/delphi_doctor_visits/run.py +++ b/doctor_visits/delphi_doctor_visits/run.py @@ -6,10 +6,11 @@ """ # standard packages -import logging from datetime import datetime, timedelta from pathlib import Path +from delphi_utils import get_structured_logger + # first party from .update_sensor import update_sensor, write_to_csv @@ -37,7 +38,9 @@ def run_module(params): - "obfuscated_prefix": str, prefix for signal name if write_se is True. - "parallel": bool, whether to update sensor in parallel. 
""" - logging.basicConfig(level=logging.DEBUG) + logger = get_structured_logger( + __name__, filename=params["common"].get("log_filename"), + log_exceptions=params["common"].get("log_exceptions", True)) ## get end date from input file # the filename is expected to be in the format: @@ -61,30 +64,30 @@ def run_module(params): startdate_dt = enddate_dt - timedelta(days=n_backfill_days) enddate = str(enddate_dt.date()) startdate = str(startdate_dt.date()) - logging.info("drop date:\t\t%s", dropdate) - logging.info("first sensor date:\t%s", startdate) - logging.info("last sensor date:\t%s", enddate) - logging.info("n_backfill_days:\t%s", n_backfill_days) - logging.info("n_waiting_days:\t%s", n_waiting_days) + logger.info("drop date:\t\t%s", dropdate) + logger.info("first sensor date:\t%s", startdate) + logger.info("last sensor date:\t%s", enddate) + logger.info("n_backfill_days:\t%s", n_backfill_days) + logger.info("n_waiting_days:\t%s", n_waiting_days) ## geographies geos = ["state", "msa", "hrr", "county", "hhs", "nation"] ## print out other vars - logging.info("outpath:\t\t%s", export_dir) - logging.info("parallel:\t\t%s", params["indicator"]["parallel"]) - logging.info("weekday:\t\t%s", params["indicator"]["weekday"]) - logging.info("write se:\t\t%s", se) - logging.info("obfuscated prefix:\t%s", prefix) + logger.info("outpath:\t\t%s", export_dir) + logger.info("parallel:\t\t%s", params["indicator"]["parallel"]) + logger.info("weekday:\t\t%s", params["indicator"]["weekday"]) + logger.info("write se:\t\t%s", se) + logger.info("obfuscated prefix:\t%s", prefix) ## start generating for geo in geos: for weekday in params["indicator"]["weekday"]: if weekday: - logging.info("starting %s, weekday adj", geo) + logger.info("starting %s, weekday adj", geo) else: - logging.info("starting %s, no adj", geo) + logger.info("starting %s, no adj", geo) sensor = update_sensor( filepath=params["indicator"]["input_file"], startdate=startdate, @@ -93,10 +96,11 @@ def run_module(params): geo=geo, parallel=params["indicator"]["parallel"], weekday=weekday, - se=params["indicator"]["se"] + se=params["indicator"]["se"], + logger=logger, ) if sensor is None: - logging.error("No sensors calculated, no output will be produced") + logger.error("No sensors calculated, no output will be produced") continue # write out results out_name = "smoothed_adj_cli" if weekday else "smoothed_cli" @@ -104,8 +108,8 @@ def run_module(params): assert prefix is not None, "template has no obfuscated prefix" out_name = prefix + "_" + out_name - write_to_csv(sensor, geo, se, out_name, export_dir) - logging.debug(f"wrote files to {export_dir}") - logging.info("finished %s", geo) + write_to_csv(sensor, geo, se, out_name, logger, export_dir) + logger.debug(f"wrote files to {export_dir}") + logger.info("finished %s", geo) - logging.info("finished all") + logger.info("finished all") diff --git a/doctor_visits/delphi_doctor_visits/sensor.py b/doctor_visits/delphi_doctor_visits/sensor.py index 22690f916..e96c8bfe0 100644 --- a/doctor_visits/delphi_doctor_visits/sensor.py +++ b/doctor_visits/delphi_doctor_visits/sensor.py @@ -6,9 +6,6 @@ """ -# standard packages -import logging - # third party import numpy as np import pandas as pd @@ -162,7 +159,8 @@ def fit(y_data, geo_id, recent_min_visits, min_recent_obs, - jeffreys): + jeffreys, + logger): """Fitting routine. 
Args: @@ -217,7 +215,7 @@ def fit(y_data, # if all rates are zero, don't bother if code_vals.sum() == 0: if jeffreys: - logging.error("p is 0 even though we used Jefferys estimate") + logger.error("p is 0 even though we used Jefferys estimate") new_rates.append(np.zeros((n_dates,))) continue @@ -240,7 +238,7 @@ def fit(y_data, se[include] = np.sqrt( np.divide((new_rates[include] * (1 - new_rates[include])), den[include])) - logging.debug(f"{geo_id}: {new_rates[-1]:.3f},[{se[-1]:.3f}]") + logger.debug(f"{geo_id}: {new_rates[-1]:.3f},[{se[-1]:.3f}]") included_indices = [x for x in final_sensor_idxs if include[x]] diff --git a/doctor_visits/delphi_doctor_visits/update_sensor.py b/doctor_visits/delphi_doctor_visits/update_sensor.py index 931ec3afa..068d2a058 100644 --- a/doctor_visits/delphi_doctor_visits/update_sensor.py +++ b/doctor_visits/delphi_doctor_visits/update_sensor.py @@ -9,7 +9,6 @@ """ # standard packages -import logging from datetime import timedelta from multiprocessing import Pool, cpu_count @@ -24,7 +23,7 @@ from .weekday import Weekday -def write_to_csv(output_df: pd.DataFrame, geo_level, se, out_name, output_path="."): +def write_to_csv(output_df: pd.DataFrame, geo_level, se, out_name, logger, output_path="."): """Write sensor values to csv. Args: @@ -34,7 +33,7 @@ def write_to_csv(output_df: pd.DataFrame, geo_level, se, out_name, output_path=" output_path: outfile path to write the csv (default is current directory) """ if se: - logging.info(f"========= WARNING: WRITING SEs TO {out_name} =========") + logger.info(f"========= WARNING: WRITING SEs TO {out_name} =========") out_n = 0 for d in set(output_df["date"]): @@ -64,12 +63,12 @@ def write_to_csv(output_df: pd.DataFrame, geo_level, se, out_name, output_path=" outfile.write( "%s,%f,%s,%s,%s\n" % (geo_id, sensor, "NA", "NA", "NA")) out_n += 1 - logging.debug(f"wrote {out_n} rows for {geo_level}") + logger.debug(f"wrote {out_n} rows for {geo_level}") def update_sensor( filepath, startdate, enddate, dropdate, geo, parallel, - weekday, se + weekday, se, logger ): """Generate sensor values. 
@@ -82,6 +81,7 @@ def update_sensor( parallel: boolean to run the sensor update in parallel weekday: boolean to adjust for weekday effects se: boolean to write out standard errors, if true, use an obfuscated name + logger: the structured logger """ # as of 2020-05-11, input file expected to have 10 columns # id cols: ServiceDate, PatCountyFIPS, PatAgeGroup, Pat HRR ID/Pat HRR Name @@ -125,7 +125,7 @@ def update_sensor( (burn_in_dates >= startdate) & (burn_in_dates <= enddate))[0][:len(sensor_dates)] # handle if we need to adjust by weekday - params = Weekday.get_params(data) if weekday else None + params = Weekday.get_params(data, logger) if weekday else None if weekday and np.any(np.all(params == 0,axis=1)): # Weekday correction failed for at least one count type return None @@ -155,13 +155,14 @@ def update_sensor( geo_id, Config.MIN_RECENT_VISITS, Config.MIN_RECENT_OBS, - jeffreys + jeffreys, + logger ) out.append(res) else: n_cpu = min(10, cpu_count()) - logging.debug(f"starting pool with {n_cpu} workers") + logger.debug(f"starting pool with {n_cpu} workers") with Pool(n_cpu) as pool: pool_results = [] @@ -182,6 +183,7 @@ def update_sensor( Config.MIN_RECENT_VISITS, Config.MIN_RECENT_OBS, jeffreys, + logger ), ) ) diff --git a/doctor_visits/delphi_doctor_visits/weekday.py b/doctor_visits/delphi_doctor_visits/weekday.py index 86e5278b2..b05318d39 100644 --- a/doctor_visits/delphi_doctor_visits/weekday.py +++ b/doctor_visits/delphi_doctor_visits/weekday.py @@ -4,8 +4,7 @@ Created: 2020-05-06 """ -# standard packages -import logging + # third party import cvxpy as cp @@ -19,7 +18,7 @@ class Weekday: """Class to handle weekday effects.""" @staticmethod - def get_params(data): + def get_params(data, logger): r"""Correct a signal estimated as numerator/denominator for weekday effects. 
The ordinary estimate would be numerator_t/denominator_t for each time point @@ -98,7 +97,7 @@ def get_params(data): pass else: # Leaving params[i,:] = 0 is equivalent to not performing weekday correction - logging.error("Unable to calculate weekday correction") + logger.error("Unable to calculate weekday correction") return params diff --git a/doctor_visits/tests/test_update_sensor.py b/doctor_visits/tests/test_update_sensor.py index 4e504e19c..ab74c1c90 100644 --- a/doctor_visits/tests/test_update_sensor.py +++ b/doctor_visits/tests/test_update_sensor.py @@ -1,9 +1,11 @@ """Tests for update_sensor.py.""" - +import logging import pandas as pd from delphi_doctor_visits.update_sensor import update_sensor +TEST_LOGGER = logging.getLogger() + class TestUpdateSensor: def test_update_sensor(self): actual = update_sensor( @@ -14,7 +16,8 @@ def test_update_sensor(self): geo="state", parallel=False, weekday=False, - se=False + se=False, + logger=TEST_LOGGER, ) comparison = pd.read_csv("./comparison/update_sensor/all.csv", parse_dates=["date"]) From c63a4544ef6d326e06b3d5ff55a7b0c48967c281 Mon Sep 17 00:00:00 2001 From: Andrew Chin Date: Tue, 21 Sep 2021 23:53:14 -0400 Subject: [PATCH 35/43] Remove bare except in DV --- doctor_visits/delphi_doctor_visits/weekday.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doctor_visits/delphi_doctor_visits/weekday.py b/doctor_visits/delphi_doctor_visits/weekday.py index 86e5278b2..bb1c4bf5e 100644 --- a/doctor_visits/delphi_doctor_visits/weekday.py +++ b/doctor_visits/delphi_doctor_visits/weekday.py @@ -9,6 +9,7 @@ # third party import cvxpy as cp +from cvxpy.error import SolverError import numpy as np # first party @@ -92,7 +93,7 @@ def get_params(data): _ = prob.solve() params[i,:] = b.value break - except: + except SolverError: # If the magnitude of the objective function is too large, an error is # thrown; Rescale the objective function by going through loop pass From ec697ed2320c2e82a394daa7dabb78f1737a0bd5 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 22 Sep 2021 15:44:13 -0700 Subject: [PATCH 36/43] Nancodes archiver: remove deleted file nan replacements --- _delphi_utils_python/delphi_utils/archive.py | 24 +-------- _delphi_utils_python/tests/test_archive.py | 51 +++----------------- 2 files changed, 8 insertions(+), 67 deletions(-) diff --git a/_delphi_utils_python/delphi_utils/archive.py b/_delphi_utils_python/delphi_utils/archive.py index ea8f27ac4..e2e826396 100644 --- a/_delphi_utils_python/delphi_utils/archive.py +++ b/_delphi_utils_python/delphi_utils/archive.py @@ -254,26 +254,7 @@ def diff_exports(self) -> Tuple[Files, FileDiffMap, Files]: new_issues_df.to_csv(diff_file, na_rep="NA") common_diffs[after_file] = diff_file - export_csv_dtypes = { - "geo_id": str, "val": float, "se": float, "sample_size": float, - "missing_val": int, "missing_se": int, "missing_sample_size": int - } - - # Replace deleted files with empty versions, but only if the cached version is not - # already empty - deleted_files_nanfilled = [] - for deleted_file in deleted_files: - deleted_df = pd.read_csv(deleted_file, dtype=export_csv_dtypes) - print( - f"Diff deleted {deleted_file}; generating corresponding CSV with deleted rows." 
- ) - deleted_df[["val", "se", "sample_size"]] = np.nan - deleted_df[["missing_val", "missing_se", "missing_sample_size"]] = Nans.DELETED - filename = join(self.export_dir, basename(deleted_file)) - deleted_df.to_csv(filename, index=False) - deleted_files_nanfilled.append(filename) - - return deleted_files_nanfilled, common_diffs, new_files + return deleted_files, common_diffs, new_files def archive_exports(self, exported_files: Files) -> Tuple[Files, Files]: """ @@ -331,13 +312,12 @@ def run(self): self.update_cache() # Diff exports, and make incremental versions - deleted_files, common_diffs, new_files = self.diff_exports() + _, common_diffs, new_files = self.diff_exports() # Archive changed, new, and emptied deleted files to_archive = [f for f, diff in common_diffs.items() if diff is not None] to_archive += new_files - to_archive += deleted_files _, fails = self.archive_exports(to_archive) # Filter existing exports to exclude those that failed to archive diff --git a/_delphi_utils_python/tests/test_archive.py b/_delphi_utils_python/tests/test_archive.py index 111acf92f..ece76f556 100644 --- a/_delphi_utils_python/tests/test_archive.py +++ b/_delphi_utils_python/tests/test_archive.py @@ -135,16 +135,6 @@ def test_diff_and_filter_exports(self, tmp_path): "missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2, }) - csv2_deleted = pd.DataFrame({ - "geo_id": ["1"], - "val": [np.nan], - "se": [np.nan], - "sample_size": [np.nan], - "missing_val": [Nans.DELETED], - "missing_se": [Nans.DELETED], - "missing_sample_size": [Nans.DELETED], - }) - arch_diff = ArchiveDiffer(cache_dir, export_dir) # Test diff_exports @@ -164,7 +154,7 @@ def test_diff_and_filter_exports(self, tmp_path): deleted_files, common_diffs, new_files = arch_diff.diff_exports() # Check return values - assert set(deleted_files) == {join(export_dir, "csv2.csv")} + assert set(deleted_files) == {join(cache_dir, "csv2.csv")} assert set(common_diffs.keys()) == { join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv"]} assert set(new_files) == {join(export_dir, "csv3.csv")} @@ -175,8 +165,7 @@ def test_diff_and_filter_exports(self, tmp_path): # Check filesystem for actual files assert set(listdir(export_dir)) == { "csv0.csv", "csv1.csv", "csv1.csv.diff", - "csv3.csv", "csv4.csv", "csv4.csv.diff", - "csv2.csv" + "csv3.csv", "csv4.csv", "csv4.csv.diff" } assert_frame_equal( pd.read_csv(join(export_dir, "csv1.csv.diff"), dtype=CSV_DTYPES), @@ -194,11 +183,8 @@ def test_diff_and_filter_exports(self, tmp_path): arch_diff.filter_exports(common_diffs) - # Check exports directory just has incremental and deleted changes - assert set(listdir(export_dir)) == {"csv1.csv", "csv2.csv", "csv3.csv", "csv4.csv"} - assert_frame_equal( - pd.read_csv(join(export_dir, "csv2.csv"), dtype=CSV_DTYPES), - csv2_deleted) + # Check exports directory just has incremental changes + assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"} assert_frame_equal( pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES), csv1_diff) @@ -325,7 +311,7 @@ def test_run(self, tmp_path, s3_client): assert_frame_equal(pd.read_csv(body, dtype=CSV_DTYPES), df) # Check exports directory just has incremental changes - assert set(listdir(export_dir)) == {"csv1.csv", "csv2.csv", "csv3.csv", "csv4.csv"} + assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"} csv1_diff = pd.DataFrame({ "geo_id": ["3", "2", "4"], "val": [np.nan, 2.1, 4.0], @@ -338,18 +324,6 @@ def test_run(self, tmp_path, s3_client): assert_frame_equal( 
pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES), csv1_diff) - csv2_deleted = pd.DataFrame({ - "geo_id": ["1"], - "val": [np.nan], - "se": [np.nan], - "sample_size": [np.nan], - "missing_val": [Nans.DELETED], - "missing_se": [Nans.DELETED], - "missing_sample_size": [Nans.DELETED], - }) - assert_frame_equal( - pd.read_csv(join(export_dir, "csv2.csv"), dtype=CSV_DTYPES), - csv2_deleted) class TestGitArchiveDiffer: @@ -550,7 +524,7 @@ def test_run(self, tmp_path): original_branch.checkout() # Check exports directory just has incremental changes - assert set(listdir(export_dir)) == {"csv1.csv", "csv2.csv", "csv3.csv", "csv4.csv"} + assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"} csv1_diff = pd.DataFrame({ "geo_id": ["3", "2", "4"], "val": [np.nan, 2.1, 4.0], @@ -563,19 +537,6 @@ def test_run(self, tmp_path): assert_frame_equal( pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES), csv1_diff) - csv2_deleted = pd.DataFrame({ - "geo_id": ["1"], - "val": [np.nan], - "se": [np.nan], - "sample_size": [np.nan], - "missing_val": [Nans.DELETED], - "missing_se": [Nans.DELETED], - "missing_sample_size": [Nans.DELETED], - }) - assert_frame_equal( - pd.read_csv(join(export_dir, "csv2.csv"), dtype=CSV_DTYPES), - csv2_deleted) - class TestFromParams: From 50bf96b1937cd28dc31914183dad38343ce149de Mon Sep 17 00:00:00 2001 From: Andrew Chin Date: Fri, 24 Sep 2021 23:21:11 -0400 Subject: [PATCH 37/43] Refactor NCHS mortality to use delphi export util --- _delphi_utils_python/delphi_utils/export.py | 13 +++++-- _delphi_utils_python/setup.py | 1 + .../delphi_nchs_mortality/export.py | 36 ------------------- nchs_mortality/delphi_nchs_mortality/run.py | 13 +++---- 4 files changed, 18 insertions(+), 45 deletions(-) delete mode 100644 nchs_mortality/delphi_nchs_mortality/export.py diff --git a/_delphi_utils_python/delphi_utils/export.py b/_delphi_utils_python/delphi_utils/export.py index 5a3b804b2..cb96cfa33 100644 --- a/_delphi_utils_python/delphi_utils/export.py +++ b/_delphi_utils_python/delphi_utils/export.py @@ -4,6 +4,7 @@ from os.path import join from typing import Optional +from epiweeks import Week import numpy as np import pandas as pd @@ -16,7 +17,8 @@ def create_export_csv( start_date: Optional[datetime] = None, end_date: Optional[datetime] = None, remove_null_samples: Optional[bool] = False, - write_empty_days: Optional[bool] = False + write_empty_days: Optional[bool] = False, + weekly_dates = False, ): """Export data in the format expected by the Delphi API. 
@@ -65,10 +67,15 @@ def create_export_csv( dates = pd.date_range(start_date, end_date) for date in dates: + if weekly_dates: + t = Week.fromdate(pd.to_datetime(str(date))) + date_str = "weekly_" + str(t.year) + str(t.week).zfill(2) + else: + date_str = date.strftime('%Y%m%d') if metric is None: - export_filename = f"{date.strftime('%Y%m%d')}_{geo_res}_{sensor}.csv" + export_filename = f"{date_str}_{geo_res}_{sensor}.csv" else: - export_filename = f"{date.strftime('%Y%m%d')}_{geo_res}_{metric}_{sensor}.csv" + export_filename = f"{date_str}_{geo_res}_{metric}_{sensor}.csv" export_file = join(export_dir, export_filename) export_df = df[df["timestamp"] == date][["geo_id", "val", "se", "sample_size",]] if remove_null_samples: diff --git a/_delphi_utils_python/setup.py b/_delphi_utils_python/setup.py index 731d7e957..016263c32 100644 --- a/_delphi_utils_python/setup.py +++ b/_delphi_utils_python/setup.py @@ -7,6 +7,7 @@ required = [ "boto3", "covidcast", + "epiweeks", "freezegun", "gitpython", "mock", diff --git a/nchs_mortality/delphi_nchs_mortality/export.py b/nchs_mortality/delphi_nchs_mortality/export.py deleted file mode 100644 index 47033a50a..000000000 --- a/nchs_mortality/delphi_nchs_mortality/export.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- -"""Function to export the dataset in the format expected of the API.""" -import pandas as pd -from epiweeks import Week - -def export_csv(df, geo_name, sensor, export_dir, start_date): - """Export data set in format expected for ingestion by the API. - - Parameters - ---------- - df: pd.DataFrame - data frame with columns "geo_id", "timestamp", and "val" - geo_name: str - name of the geographic region, such as "state" or "hrr" - sensor: str - name of the sensor; only used for naming the output file - export_dir: str - path to location where the output CSV files to be uploaded should be stored - start_date: datetime.datetime - The first date to report - end_date: datetime.datetime - The last date to report - """ - df = df.copy() - df = df[df["timestamp"] >= start_date] - - dates = df["timestamp"].unique() - for date in dates: - t = Week.fromdate(pd.to_datetime(str(date))) - date_short = "weekly_" + str(t.year) + str(t.week).zfill(2) - export_fn = f"{date_short}_{geo_name}_{sensor}.csv" - result_df = df[df["timestamp"] == date][["geo_id", "val", "se", "sample_size"]] - result_df.to_csv(f"{export_dir}/{export_fn}", - index=False, - float_format="%.8f") - return pd.to_datetime(dates) diff --git a/nchs_mortality/delphi_nchs_mortality/run.py b/nchs_mortality/delphi_nchs_mortality/run.py index 1cf3d36d5..ec5416bb2 100644 --- a/nchs_mortality/delphi_nchs_mortality/run.py +++ b/nchs_mortality/delphi_nchs_mortality/run.py @@ -9,12 +9,11 @@ from typing import Dict, Any import numpy as np -from delphi_utils import S3ArchiveDiffer, get_structured_logger +from delphi_utils import S3ArchiveDiffer, get_structured_logger, create_export_csv from .archive_diffs import arch_diffs from .constants import (METRICS, SENSOR_NAME_MAP, SENSORS, INCIDENCE_BASE, GEO_RES) -from .export import export_csv from .pull import pull_nchs_mortality_data @@ -70,12 +69,13 @@ def run_module(params: Dict[str, Any]): df["sample_size"] = np.nan df = df[~df["val"].isnull()] sensor_name = "_".join([SENSOR_NAME_MAP[metric]]) - dates = export_csv( + dates = create_export_csv( df, - geo_name=GEO_RES, + geo_res=GEO_RES, export_dir=daily_export_dir, start_date=datetime.strptime(export_start_date, "%Y-%m-%d"), sensor=sensor_name, + weekly_dates=True ) if len(dates) > 0: 
stats.append((max(dates), len(dates))) @@ -93,12 +93,13 @@ def run_module(params: Dict[str, Any]): df["sample_size"] = np.nan df = df[~df["val"].isnull()] sensor_name = "_".join([SENSOR_NAME_MAP[metric], sensor]) - dates = export_csv( + dates = create_export_csv( df, - geo_name=GEO_RES, + geo_res=GEO_RES, export_dir=daily_export_dir, start_date=datetime.strptime(export_start_date, "%Y-%m-%d"), sensor=sensor_name, + weekly_dates=True ) if len(dates) > 0: stats.append((max(dates), len(dates))) From 5fa63149d58f09a4df253e956d78258fafb1e65f Mon Sep 17 00:00:00 2001 From: Andrew Chin Date: Fri, 24 Sep 2021 23:28:40 -0400 Subject: [PATCH 38/43] Remove test for old export func --- nchs_mortality/tests/test_export.py | 51 ----------------------------- 1 file changed, 51 deletions(-) delete mode 100644 nchs_mortality/tests/test_export.py diff --git a/nchs_mortality/tests/test_export.py b/nchs_mortality/tests/test_export.py deleted file mode 100644 index c05f287ca..000000000 --- a/nchs_mortality/tests/test_export.py +++ /dev/null @@ -1,51 +0,0 @@ -from datetime import datetime -from os.path import join, exists - -import pandas as pd - -from delphi_nchs_mortality.export import export_csv - - -class TestExport: - def test_export(self): - - # create fake dataset and save in a temporary directory - input_data = pd.DataFrame( - { - "geo_id": ["a", "a", "b", "b", "c", "c"], - "val": [0, 2, 3, 5, 10, 12], - "timestamp": [datetime(2020, 6, 2), datetime(2020, 6, 9)] * 3, - "se": [0.01, 0.02, 0.01, 0.01, 0.005, 0.01], - "sample_size": [100, 200, 500, 50, 80, 10] - } - ) - - export_csv( - input_data, - geo_name = "state", - sensor="region_thing", - export_dir="./receiving", - start_date = datetime(2020, 6, 2), - ) - - # check data for 2020-06-02 - expected_name = "weekly_202023_state_region_thing.csv" - assert exists(join("./receiving", expected_name)) - - output_data = pd.read_csv(join("./receiving", expected_name)) - - assert (output_data.columns == ["geo_id", "val", "se", "sample_size"]).all() - assert (output_data.geo_id == ["a", "b", "c"]).all() - assert (output_data.se.values == [0.01, 0.01, 0.005]).all() - assert (output_data.sample_size.values == [100, 500, 80]).all() - - # check data for 2020-06-03 - expected_name = "weekly_202024_state_region_thing.csv" - assert exists(join("./receiving", expected_name)) - - output_data = pd.read_csv(join("./receiving", expected_name)) - - assert (output_data.columns == ["geo_id", "val", "se", "sample_size"]).all() - assert (output_data.geo_id == ["a", "b", "c"]).all() - assert (output_data.se.values == [0.02, 0.01, 0.01]).all() - assert (output_data.sample_size.values == [200, 50, 10]).all() From 28ac486a41025bdcf322ccbf5c9be0ad96e19dac Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 27 Sep 2021 13:42:06 -0700 Subject: [PATCH 39/43] Update archiver docstrings Co-authored-by: Katie Mazaitis --- _delphi_utils_python/delphi_utils/archive.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/_delphi_utils_python/delphi_utils/archive.py b/_delphi_utils_python/delphi_utils/archive.py index e2e826396..7f75c8cab 100644 --- a/_delphi_utils_python/delphi_utils/archive.py +++ b/_delphi_utils_python/delphi_utils/archive.py @@ -280,10 +280,9 @@ def filter_exports(self, common_diffs: FileDiffMap): Filter export directory to only contain relevant files. Filters down the export_dir to only contain: - 1) New files, 2) Changed files, filtered-down to the ADDED and CHANGED rows - only, and 3) Deleted files replaced with empty CSVs with the same name. 
Should - be called after archive_exports() so we archive the raw exports before potentially - modifying them. + 1) New files, 2) Changed files, filtered-down to the ADDED and CHANGED rows only. + Should be called after archive_exports() so we archive the raw exports before + potentially modifying them. Parameters ---------- From a46f866bc4267dbfd6ff9287c33049fcc57fc4ee Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 27 Sep 2021 13:42:12 -0700 Subject: [PATCH 40/43] Update archiver docstrings Co-authored-by: Katie Mazaitis --- _delphi_utils_python/delphi_utils/archive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_delphi_utils_python/delphi_utils/archive.py b/_delphi_utils_python/delphi_utils/archive.py index 7f75c8cab..eb8aac8d2 100644 --- a/_delphi_utils_python/delphi_utils/archive.py +++ b/_delphi_utils_python/delphi_utils/archive.py @@ -313,7 +313,7 @@ def run(self): # Diff exports, and make incremental versions _, common_diffs, new_files = self.diff_exports() - # Archive changed, new, and emptied deleted files + # Archive changed and new files only to_archive = [f for f, diff in common_diffs.items() if diff is not None] to_archive += new_files From 7cb9b8cdc2b0c1cd87ec33d6d8eaaa7b5d74b4d1 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 27 Sep 2021 14:22:00 -0700 Subject: [PATCH 41/43] Nancodes archiver/export: explicit tests --- _delphi_utils_python/tests/test_archive.py | 30 ++++- _delphi_utils_python/tests/test_export.py | 125 ++++++++++++++++----- 2 files changed, 119 insertions(+), 36 deletions(-) diff --git a/_delphi_utils_python/tests/test_archive.py b/_delphi_utils_python/tests/test_archive.py index ece76f556..3050908f2 100644 --- a/_delphi_utils_python/tests/test_archive.py +++ b/_delphi_utils_python/tests/test_archive.py @@ -61,6 +61,17 @@ "se": [0.1], "sample_size": [10.0] }), + + # Common, but missing columns removed + "csv5": pd.DataFrame({ + "geo_id": ["1"], + "val": [1.0], + "se": [0.1], + "sample_size": [10.0], + "missing_val": [Nans.NOT_MISSING], + "missing_se": [Nans.NOT_MISSING], + "missing_sample_size": [Nans.NOT_MISSING], + }), } CSVS_AFTER = { @@ -106,6 +117,14 @@ "missing_se": [Nans.NOT_MISSING], "missing_sample_size": [Nans.NOT_MISSING], }), + + # Common, but missing columns removed + "csv5": pd.DataFrame({ + "geo_id": ["1"], + "val": [1.0], + "se": [0.1], + "sample_size": [10.0] + }), } class TestArchiveDiffer: @@ -156,7 +175,7 @@ def test_diff_and_filter_exports(self, tmp_path): # Check return values assert set(deleted_files) == {join(cache_dir, "csv2.csv")} assert set(common_diffs.keys()) == { - join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv"]} + join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv", "csv5.csv"]} assert set(new_files) == {join(export_dir, "csv3.csv")} assert common_diffs[join(export_dir, "csv0.csv")] is None assert common_diffs[join(export_dir, "csv1.csv")] == join( @@ -165,7 +184,8 @@ def test_diff_and_filter_exports(self, tmp_path): # Check filesystem for actual files assert set(listdir(export_dir)) == { "csv0.csv", "csv1.csv", "csv1.csv.diff", - "csv3.csv", "csv4.csv", "csv4.csv.diff" + "csv3.csv", "csv4.csv", "csv4.csv.diff", + "csv5.csv", "csv5.csv.diff" } assert_frame_equal( pd.read_csv(join(export_dir, "csv1.csv.diff"), dtype=CSV_DTYPES), @@ -184,7 +204,7 @@ def test_diff_and_filter_exports(self, tmp_path): arch_diff.filter_exports(common_diffs) # Check exports directory just has incremental changes - assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", 
"csv4.csv"} + assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"} assert_frame_equal( pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES), csv1_diff) @@ -311,7 +331,7 @@ def test_run(self, tmp_path, s3_client): assert_frame_equal(pd.read_csv(body, dtype=CSV_DTYPES), df) # Check exports directory just has incremental changes - assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"} + assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"} csv1_diff = pd.DataFrame({ "geo_id": ["3", "2", "4"], "val": [np.nan, 2.1, 4.0], @@ -524,7 +544,7 @@ def test_run(self, tmp_path): original_branch.checkout() # Check exports directory just has incremental changes - assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"} + assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"} csv1_diff = pd.DataFrame({ "geo_id": ["3", "2", "4"], "val": [np.nan, 2.1, 4.0], diff --git a/_delphi_utils_python/tests/test_export.py b/_delphi_utils_python/tests/test_export.py index b22a710cd..d9906300d 100644 --- a/_delphi_utils_python/tests/test_export.py +++ b/_delphi_utils_python/tests/test_export.py @@ -9,6 +9,7 @@ from delphi_utils import create_export_csv, Nans + def _clean_directory(directory): """Clean files out of a directory.""" for fname in listdir(directory): @@ -29,6 +30,7 @@ def _non_ignored_files_set(directory): class TestExport: """Tests for exporting CSVs.""" + # List of times for data points. TIMES = [ datetime.strptime(x, "%Y-%m-%d") @@ -54,9 +56,19 @@ class TestExport: "val": [3.12345678910, np.nan, 2.2, 2.6], "se": [0.15, 0.22, np.nan, 0.34], "sample_size": [100, 100, 101, None], - "missing_val": [Nans.NOT_MISSING, Nans.OTHER, Nans.NOT_MISSING, Nans.NOT_MISSING], - "missing_se": [Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.OTHER, Nans.NOT_MISSING], - "missing_sample_size": [Nans.NOT_MISSING] * 3 + [Nans.OTHER] + "missing_val": [ + Nans.NOT_MISSING, + Nans.OTHER, + Nans.NOT_MISSING, + Nans.NOT_MISSING, + ], + "missing_se": [ + Nans.NOT_MISSING, + Nans.NOT_MISSING, + Nans.OTHER, + Nans.NOT_MISSING, + ], + "missing_sample_size": [Nans.NOT_MISSING] * 3 + [Nans.OTHER], } ) @@ -68,9 +80,19 @@ class TestExport: "val": [np.nan, np.nan, 2.2, 2.6], "se": [0.15, 0.22, np.nan, 0.34], "sample_size": [100, 100, 101, None], - "missing_val": [Nans.NOT_MISSING, Nans.OTHER, Nans.NOT_MISSING, Nans.NOT_MISSING], - "missing_se": [Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.OTHER, Nans.NOT_MISSING], - "missing_sample_size": [Nans.NOT_MISSING] * 3 + [Nans.OTHER] + "missing_val": [ + Nans.NOT_MISSING, + Nans.OTHER, + Nans.NOT_MISSING, + Nans.NOT_MISSING, + ], + "missing_se": [ + Nans.NOT_MISSING, + Nans.NOT_MISSING, + Nans.OTHER, + Nans.NOT_MISSING, + ], + "missing_sample_size": [Nans.NOT_MISSING] * 3 + [Nans.OTHER], } ) @@ -116,10 +138,14 @@ def test_export_rounding(self): ) pd.testing.assert_frame_equal( pd.read_csv(join(self.TEST_DIR, "20200215_county_deaths_test.csv")), - pd.DataFrame({"geo_id": [51093, 51175], - "val": [round(3.12345678910, 7), 2.1], - "se": [0.15, 0.22], - "sample_size": [100, 100]}) + pd.DataFrame( + { + "geo_id": [51093, 51175], + "val": [round(3.12345678910, 7), 2.1], + "se": [0.15, 0.22], + "sample_size": [100, 100], + } + ), ) def test_export_without_metric(self): @@ -211,13 +237,16 @@ def test_export_with_null_removal(self): """Test that `remove_null_samples = True` removes entries with null samples.""" _clean_directory(self.TEST_DIR) - df_with_nulls = self.DF.copy().append({ - 
"geo_id": "66666", - "timestamp": datetime(2020, 6, 6), - "val": 10, - "se": 0.2, - "sample_size": pd.NA}, - ignore_index=True) + df_with_nulls = self.DF.copy().append( + { + "geo_id": "66666", + "timestamp": datetime(2020, 6, 6), + "val": 10, + "se": 0.2, + "sample_size": pd.NA, + }, + ignore_index=True, + ) create_export_csv( df=df_with_nulls, @@ -241,13 +270,16 @@ def test_export_without_null_removal(self): """Test that `remove_null_samples = False` does not remove entries with null samples.""" _clean_directory(self.TEST_DIR) - df_with_nulls = self.DF.copy().append({ - "geo_id": "66666", - "timestamp": datetime(2020, 6, 6), - "val": 10, - "se": 0.2, - "sample_size": pd.NA}, - ignore_index=True) + df_with_nulls = self.DF.copy().append( + { + "geo_id": "66666", + "timestamp": datetime(2020, 6, 6), + "val": 10, + "se": 0.2, + "sample_size": pd.NA, + }, + ignore_index=True, + ) create_export_csv( df=df_with_nulls, @@ -267,24 +299,56 @@ def test_export_without_null_removal(self): ) assert pd.read_csv(join(self.TEST_DIR, "20200606_state_test.csv")).size > 0 + def test_export_df_without_missingness(self): + _clean_directory(self.TEST_DIR) + + create_export_csv( + df=self.DF.copy(), export_dir=self.TEST_DIR, geo_res="county", sensor="test" + ) + df = pd.read_csv(join(self.TEST_DIR, "20200215_county_test.csv")).astype( + {"geo_id": str, "sample_size": int} + ) + expected_df = pd.DataFrame( + { + "geo_id": ["51093", "51175"], + "val": [3.12345678910, 2.1], + "se": [0.15, 0.22], + "sample_size": [100, 100], + } + ).astype({"geo_id": str, "sample_size": int}) + pd.testing.assert_frame_equal(df, expected_df) + def test_export_df_with_missingness(self): _clean_directory(self.TEST_DIR) create_export_csv( df=self.DF2.copy(), export_dir=self.TEST_DIR, - geo_res="state", + geo_res="county", sensor="test", - remove_null_samples=False ) assert _non_ignored_files_set(self.TEST_DIR) == set( [ - "20200215_state_test.csv", - "20200301_state_test.csv", - "20200315_state_test.csv", + "20200215_county_test.csv", + "20200301_county_test.csv", + "20200315_county_test.csv", ] ) - assert pd.read_csv(join(self.TEST_DIR, "20200315_state_test.csv")).size > 0 + df = pd.read_csv(join(self.TEST_DIR, "20200215_county_test.csv")).astype( + {"geo_id": str, "sample_size": int} + ) + expected_df = pd.DataFrame( + { + "geo_id": ["51093", "51175"], + "val": [3.12345678910, np.nan], + "se": [0.15, 0.22], + "sample_size": [100, 100], + "missing_val": [Nans.NOT_MISSING, Nans.OTHER], + "missing_se": [Nans.NOT_MISSING] * 2, + "missing_sample_size": [Nans.NOT_MISSING] * 2, + } + ).astype({"geo_id": str, "sample_size": int}) + pd.testing.assert_frame_equal(df, expected_df) @mock.patch("delphi_utils.logger") def test_export_df_with_contradictory_missingness(self, mock_logger): @@ -295,7 +359,6 @@ def test_export_df_with_contradictory_missingness(self, mock_logger): export_dir=self.TEST_DIR, geo_res="state", sensor="test", - remove_null_samples=False, logger=mock_logger ) assert _non_ignored_files_set(self.TEST_DIR) == set( From baffc14a216300dc1c01f64f2b77bb3c37554c0b Mon Sep 17 00:00:00 2001 From: Delphi Deploy Bot Date: Wed, 29 Sep 2021 14:16:21 +0000 Subject: [PATCH 42/43] chore: bump delphi_utils to 0.1.13 --- _delphi_utils_python/.bumpversion.cfg | 2 +- _delphi_utils_python/delphi_utils/__init__.py | 2 +- _delphi_utils_python/setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/_delphi_utils_python/.bumpversion.cfg b/_delphi_utils_python/.bumpversion.cfg index 2d7554882..67f34ca69 100644 --- 
a/_delphi_utils_python/.bumpversion.cfg +++ b/_delphi_utils_python/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.12 +current_version = 0.1.13 commit = True message = chore: bump delphi_utils to {new_version} tag = False diff --git a/_delphi_utils_python/delphi_utils/__init__.py b/_delphi_utils_python/delphi_utils/__init__.py index 6d0b523fd..99f35fc0a 100644 --- a/_delphi_utils_python/delphi_utils/__init__.py +++ b/_delphi_utils_python/delphi_utils/__init__.py @@ -14,4 +14,4 @@ from .signal import add_prefix from .nancodes import Nans -__version__ = "0.1.12" +__version__ = "0.1.13" diff --git a/_delphi_utils_python/setup.py b/_delphi_utils_python/setup.py index 016263c32..35fe44fac 100644 --- a/_delphi_utils_python/setup.py +++ b/_delphi_utils_python/setup.py @@ -25,7 +25,7 @@ setup( name="delphi_utils", - version="0.1.12", + version="0.1.13", description="Shared Utility Functions for Indicators", long_description=long_description, long_description_content_type="text/markdown", From 39249812d1bf0cfc30115e3cc57bd2f7752c57cf Mon Sep 17 00:00:00 2001 From: Delphi Deploy Bot Date: Wed, 29 Sep 2021 14:16:21 +0000 Subject: [PATCH 43/43] chore: bump covidcast-indicators to 0.1.17 --- .bumpversion.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 97256893a..8c461bf8b 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.16 +current_version = 0.1.17 commit = True message = chore: bump covidcast-indicators to {new_version} tag = False
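
Usage sketch of the combined changes above: patches 33-34 have run_module() build a delphi_utils structured logger from the "common" section of params and thread it through the sensor and CSV-writing code, and patch 37 adds a weekly_dates flag to create_export_csv so NCHS mortality can emit weekly_YYYYWW_<geo>_<sensor>.csv files. The following is a minimal, self-contained sketch of the resulting call pattern, assuming delphi_utils >= 0.1.13; the params dict, sensor name, and toy data frame are placeholders, not values taken from any indicator.

    import os
    from datetime import datetime

    import pandas as pd
    from delphi_utils import create_export_csv, get_structured_logger

    # Placeholder params shaped like the indicators' params["common"] section.
    params = {"common": {"export_dir": "./receiving", "log_exceptions": True}}
    os.makedirs(params["common"]["export_dir"], exist_ok=True)

    # Same logger construction the patched run_module() functions use;
    # filename is None here, so logs go to stdout.
    logger = get_structured_logger(
        __name__, filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    # Toy frame with the columns create_export_csv expects.
    df = pd.DataFrame({
        "geo_id": ["ca", "ny"],
        "timestamp": [datetime(2020, 6, 2)] * 2,
        "val": [1.5, 2.5],
        "se": [0.1, 0.2],
        "sample_size": [100, 200],
    })

    # weekly_dates=True names output files weekly_YYYYWW_<geo>_<sensor>.csv
    # (via epiweeks) instead of YYYYMMDD_<geo>_<sensor>.csv; the return value
    # is the set of dates written, which the NCHS run.py above relies on.
    dates = create_export_csv(
        df,
        geo_res="state",
        export_dir=params["common"]["export_dir"],
        sensor="example_signal",
        weekly_dates=True,
    )
    logger.info("wrote weekly files for %d dates", len(dates))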