From 911c0e09031cbaf30791b3acdf82e8e98810cbdc Mon Sep 17 00:00:00 2001
From: Chua Eu Jing
Date: Wed, 12 Aug 2020 16:39:27 -0400
Subject: [PATCH 1/6] Added archiving diffing utility

---
 usafacts/delphi_usafacts/run.py | 29 ++++++++++++++++++++++++++++-
 usafacts/params.json.template   |  7 ++++++-
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/usafacts/delphi_usafacts/run.py b/usafacts/delphi_usafacts/run.py
index 03a9e09ee..a75a8b2b7 100644
--- a/usafacts/delphi_usafacts/run.py
+++ b/usafacts/delphi_usafacts/run.py
@@ -11,7 +11,11 @@
 import numpy as np
 import pandas as pd
 
-from delphi_utils import read_params, create_export_csv
+from delphi_utils import (
+    read_params,
+    create_export_csv,
+    S3ArchiveDiffer,
+)
 
 from .geo import geo_map
 from .pull import pull_usafacts_data
@@ -73,6 +77,13 @@ def run_module():
     export_dir = params["export_dir"]
     base_url = params["base_url"]
     static_file_dir = params["static_file_dir"]
+    cache_dir = params["cache_dir"]
+
+    arch_diff = S3ArchiveDiffer(
+        cache_dir, export_dir,
+        params["bucket_name"], "usafacts",
+        params["aws_credentials"])
+    arch_diff.update_cache()
     map_df = pd.read_csv(
         join(static_file_dir, "fips_prop_pop.csv"), dtype={"fips": int}
     )
@@ -107,3 +118,19 @@ def run_module():
             geo_res=geo_res,
             sensor=sensor_name,
         )
+
+    # Diff exports, and make incremental versions
+    _, common_diffs, new_files = arch_diff.diff_exports()
+
+    # Archive changed and new files only
+    to_archive = [f for f, diff in common_diffs.items() if diff is not None]
+    to_archive += new_files
+    _, fails = arch_diff.archive_exports(to_archive)
+
+    # Filter existing exports to exclude those that failed to archive
+    succ_common_diffs = {f: diff for f, diff in common_diffs.items() if f not in fails}
+    arch_diff.filter_exports(succ_common_diffs)
+
+    # Report failures: someone should probably look at them
+    for exported_file in fails:
+        print(f"Failed to archive '{exported_file}'")
diff --git a/usafacts/params.json.template b/usafacts/params.json.template
index 12f81b651..ee806ba8f 100644
--- a/usafacts/params.json.template
+++ b/usafacts/params.json.template
@@ -3,5 +3,10 @@
   "static_file_dir": "./static",
   "export_dir": "./receiving",
   "cache_dir": "./cache",
-  "base_url": "https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_{metric}_usafacts.csv"
+  "base_url": "https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_{metric}_usafacts.csv",
+  "aws_credentials": {
+    "aws_access_key_id": "",
+    "aws_secret_access_key": ""
+  },
+  "bucket_name": ""
 }

From 5d3b8ec11493d5c594234e57cc5690ff4af6b1a3 Mon Sep 17 00:00:00 2001
From: Chua Eu Jing
Date: Wed, 12 Aug 2020 16:39:54 -0400
Subject: [PATCH 2/6] Updated unit tests

---
 usafacts/tests/conftest.py          | 12 +++++++++++-
 usafacts/tests/params.json.template |  7 ++++++-
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/usafacts/tests/conftest.py b/usafacts/tests/conftest.py
index 20d4017f1..7f2d9fb18 100644
--- a/usafacts/tests/conftest.py
+++ b/usafacts/tests/conftest.py
@@ -1,10 +1,13 @@
 # -*- coding: utf-8 -*-
 
+from boto3 import Session
+from moto import mock_s3
 import pytest
 
 from os import listdir, remove
 from os.path import join
 
+from delphi_utils import read_params
 from delphi_usafacts.run import run_module
 
 
@@ -14,4 +17,11 @@ def run_as_module():
     for fname in listdir("receiving"):
         remove(join("receiving", fname))
 
-    run_module()
+    with mock_s3():
+        # Create the fake bucket we will be using
+        params = read_params()
+        aws_credentials = params["aws_credentials"]
+        s3_client = Session(**aws_credentials).client("s3")
+        s3_client.create_bucket(Bucket=params["bucket_name"])
+
+        run_module()
diff --git a/usafacts/tests/params.json.template b/usafacts/tests/params.json.template
index a21cba192..69c83ce37 100644
--- a/usafacts/tests/params.json.template
+++ b/usafacts/tests/params.json.template
@@ -3,5 +3,10 @@
   "static_file_dir": "../static",
   "export_dir": "./receiving",
   "cache_dir": "./cache",
-  "base_url": "./test_data/small_{metric}.csv"
+  "base_url": "./test_data/small_{metric}.csv",
+  "aws_credentials": {
+    "aws_access_key_id": "FAKE_TEST_ACCESS_KEY_ID",
+    "aws_secret_access_key": "FAKE_TEST_SECRET_ACCESS_KEY"
+  },
+  "bucket_name": "test-bucket"
 }

From 2d3fb8051399467318fd9822f43999dc0b491c22 Mon Sep 17 00:00:00 2001
From: Chua Eu Jing
Date: Wed, 12 Aug 2020 16:40:28 -0400
Subject: [PATCH 3/6] Updated ansible template

---
 ansible/files/usafacts-params-prod.json        |  7 -------
 ansible/templates/usafacts-params-prod.json.j2 | 12 ++++++++++++
 2 files changed, 12 insertions(+), 7 deletions(-)
 delete mode 100644 ansible/files/usafacts-params-prod.json
 create mode 100644 ansible/templates/usafacts-params-prod.json.j2

diff --git a/ansible/files/usafacts-params-prod.json b/ansible/files/usafacts-params-prod.json
deleted file mode 100644
index e1c18fe2d..000000000
--- a/ansible/files/usafacts-params-prod.json
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-  "export_start_date": "latest",
-  "static_file_dir": "./static",
-  "export_dir": "/common/covidcast/receiving/usa-facts",
-  "cache_dir": "./cache",
-  "base_url": "https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_{metric}_usafacts.csv"
-}
diff --git a/ansible/templates/usafacts-params-prod.json.j2 b/ansible/templates/usafacts-params-prod.json.j2
new file mode 100644
index 000000000..7b2ef16c7
--- /dev/null
+++ b/ansible/templates/usafacts-params-prod.json.j2
@@ -0,0 +1,12 @@
+{
+  "export_start_date": "latest",
+  "static_file_dir": "./static",
+  "export_dir": "/common/covidcast/receiving/usa-facts",
+  "cache_dir": "./cache",
+  "base_url": "https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_{metric}_usafacts.csv",
+  "aws_credentials": {
+    "aws_access_key_id": "{{ delphi_aws_access_key_id }}",
+    "aws_secret_access_key": "{{ delphi_aws_secret_access_key }}"
+  },
+  "bucket_name": "delphi-covidcast-indicator-output"
+}

From 4debc31dc72b61f138b8bef5c09330992c505cee Mon Sep 17 00:00:00 2001
From: Mike O'Brien
Date: Thu, 15 Oct 2020 11:47:41 -0400
Subject: [PATCH 4/6] fix broken usafacts tests to read from the proper directories

---
 usafacts/tests/conftest.py    | 6 ++++--
 usafacts/tests/test_run.py    | 4 ++--
 usafacts/tests/test_smooth.py | 4 ++--
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/usafacts/tests/conftest.py b/usafacts/tests/conftest.py
index 20d4017f1..f360e2f1c 100644
--- a/usafacts/tests/conftest.py
+++ b/usafacts/tests/conftest.py
@@ -11,7 +11,9 @@
 @pytest.fixture(scope="session")
 def run_as_module():
     # Clean receiving directory
-    for fname in listdir("receiving"):
-        remove(join("receiving", fname))
+    for fname in listdir("../receiving"):
+        if fname[0] == ".":
+            continue
+        remove(join("../receiving", fname))
 
     run_module()
diff --git a/usafacts/tests/test_run.py b/usafacts/tests/test_run.py
index 82c47a9d4..489c3bb01 100644
--- a/usafacts/tests/test_run.py
+++ b/usafacts/tests/test_run.py
@@ -10,7 +10,7 @@ class TestRun:
     def test_output_files_exist(self, run_as_module):
 
-        csv_files = listdir("receiving")
+        csv_files = listdir("../receiving")
 
         dates = [
             "20200229",
@@ -48,6 +48,6 @@ def test_output_files_exist(self, run_as_module):
 
     def test_output_file_format(self, run_as_module):
         df = pd.read_csv(
-            join("receiving", "20200310_state_confirmed_cumulative_num.csv")
+            join("../receiving", "20200310_state_confirmed_cumulative_num.csv")
         )
         assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()
diff --git a/usafacts/tests/test_smooth.py b/usafacts/tests/test_smooth.py
index b50089530..8abfb21a9 100644
--- a/usafacts/tests/test_smooth.py
+++ b/usafacts/tests/test_smooth.py
@@ -13,13 +13,13 @@ def test_output_files_smoothed(self, run_as_module):
         dates = [str(x) for x in range(20200304, 20200311)]
 
         smoothed = pd.read_csv(
-            join("receiving",
+            join("../receiving",
                  f"{dates[-1]}_state_confirmed_7dav_cumulative_num.csv")
         )
 
         raw = pd.concat([
             pd.read_csv(
-                join("receiving",
+                join("../receiving",
                      f"{date}_state_confirmed_cumulative_num.csv")
             ) for date in dates
         ])

From 51c0c03b3fe14c60f2052812b1de23aa83c6ade1 Mon Sep 17 00:00:00 2001
From: Mike O'Brien
Date: Mon, 19 Oct 2020 10:36:45 -0400
Subject: [PATCH 5/6] make new receiving directory in test directory

---
 usafacts/tests/conftest.py          | 4 ++--
 usafacts/tests/receiving/.gitignore | 0
 usafacts/tests/test_run.py          | 4 ++--
 usafacts/tests/test_smooth.py       | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)
 create mode 100644 usafacts/tests/receiving/.gitignore

diff --git a/usafacts/tests/conftest.py b/usafacts/tests/conftest.py
index f360e2f1c..5b199f458 100644
--- a/usafacts/tests/conftest.py
+++ b/usafacts/tests/conftest.py
@@ -11,9 +11,9 @@ def run_as_module():
     # Clean receiving directory
-    for fname in listdir("../receiving"):
+    for fname in listdir("receiving"):
         if fname[0] == ".":
             continue
-        remove(join("../receiving", fname))
+        remove(join("receiving", fname))
 
     run_module()
diff --git a/usafacts/tests/receiving/.gitignore b/usafacts/tests/receiving/.gitignore
new file mode 100644
index 000000000..e69de29bb
diff --git a/usafacts/tests/test_run.py b/usafacts/tests/test_run.py
index 489c3bb01..82c47a9d4 100644
--- a/usafacts/tests/test_run.py
+++ b/usafacts/tests/test_run.py
@@ -10,7 +10,7 @@ class TestRun:
     def test_output_files_exist(self, run_as_module):
 
-        csv_files = listdir("../receiving")
+        csv_files = listdir("receiving")
 
         dates = [
             "20200229",
@@ -48,6 +48,6 @@ def test_output_files_exist(self, run_as_module):
 
     def test_output_file_format(self, run_as_module):
         df = pd.read_csv(
-            join("../receiving", "20200310_state_confirmed_cumulative_num.csv")
+            join("receiving", "20200310_state_confirmed_cumulative_num.csv")
         )
         assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()
diff --git a/usafacts/tests/test_smooth.py b/usafacts/tests/test_smooth.py
index 8abfb21a9..b50089530 100644
--- a/usafacts/tests/test_smooth.py
+++ b/usafacts/tests/test_smooth.py
@@ -13,13 +13,13 @@ def test_output_files_smoothed(self, run_as_module):
         dates = [str(x) for x in range(20200304, 20200311)]
 
         smoothed = pd.read_csv(
-            join("../receiving",
+            join("receiving",
                  f"{dates[-1]}_state_confirmed_7dav_cumulative_num.csv")
        )
 
         raw = pd.concat([
             pd.read_csv(
-                join("../receiving",
+                join("receiving",
                      f"{date}_state_confirmed_cumulative_num.csv")
             ) for date in dates
         ])

From a1a1b509b41c19b8f513b3ec403ced896127ad53 Mon Sep 17 00:00:00 2001
From: Brian Clark
Date: Tue, 20 Oct 2020 12:47:57 -0400
Subject: [PATCH 6/6] Temporarily skip linting in Jenkins

---
 jenkins/usafacts-jenkins-test.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/jenkins/usafacts-jenkins-test.sh b/jenkins/usafacts-jenkins-test.sh
index 729ef19da..4a11c5c71 100755
--- a/jenkins/usafacts-jenkins-test.sh
+++ b/jenkins/usafacts-jenkins-test.sh
@@ -15,7 +15,9 @@ local_indicator="usafacts"
 cd "${WORKSPACE}/${local_indicator}" || exit
 
 # Linter
-env/bin/pylint delphi_"${local_indicator}"
+#env/bin/pylint delphi_"${local_indicator}"
+echo "Skip linting because we have weird breakage :( \
+  TODO: https://github.com/cmu-delphi/covidcast-indicators/issues/333"
 
 # Unit tests and code coverage
 cd tests || exit && \
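
The following is a minimal, self-contained sketch, not part of the patch series, of the archive-diffing flow that PATCH 1/6 wires into run_module(), exercised against a mocked S3 bucket the same way the updated conftest.py does. The bucket name and credentials are the placeholder values from the test params template; the sample CSV filename follows the pattern used in test_run.py, but its single row, the local ./cache and ./receiving paths, and the overall standalone framing are illustrative assumptions. It assumes delphi_utils, boto3, moto, and pandas are installed and that S3ArchiveDiffer behaves exactly as it is used in the patch.

# Sketch only: mirrors the S3ArchiveDiffer calls from PATCH 1/6 and the moto
# setup from the updated conftest.py. Values below are placeholders.
from os import makedirs
from os.path import join

import pandas as pd
from boto3 import Session
from moto import mock_s3

from delphi_utils import S3ArchiveDiffer

aws_credentials = {
    "aws_access_key_id": "FAKE_TEST_ACCESS_KEY_ID",
    "aws_secret_access_key": "FAKE_TEST_SECRET_ACCESS_KEY",
}
cache_dir, export_dir, bucket_name = "./cache", "./receiving", "test-bucket"

with mock_s3():
    # Create the fake bucket, as the test fixture does before run_module()
    # (a default AWS region may need to be configured in your environment).
    Session(**aws_credentials).client("s3").create_bucket(Bucket=bucket_name)

    makedirs(cache_dir, exist_ok=True)
    makedirs(export_dir, exist_ok=True)

    # Pretend the indicator just exported one CSV; the row itself is made up.
    pd.DataFrame(
        {"geo_id": ["al"], "val": [1.0], "se": [0.1], "sample_size": [10]}
    ).to_csv(join(export_dir, "20200310_state_confirmed_cumulative_num.csv"),
             index=False)

    arch_diff = S3ArchiveDiffer(
        cache_dir, export_dir,
        bucket_name, "usafacts",
        aws_credentials)
    arch_diff.update_cache()

    # Same sequence as the tail of run_module() in PATCH 1/6.
    _, common_diffs, new_files = arch_diff.diff_exports()
    to_archive = [f for f, diff in common_diffs.items() if diff is not None]
    to_archive += new_files
    _, fails = arch_diff.archive_exports(to_archive)
    succ_common_diffs = {f: diff for f, diff in common_diffs.items() if f not in fails}
    arch_diff.filter_exports(succ_common_diffs)

    for exported_file in fails:
        print(f"Failed to archive '{exported_file}'")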