CSV_DTYPES = {
    "geo_id": str, "val": float, "se": float, "sample_size": float,
-    "missing_val": int, "missing_se":int, "missing_sample_size": int
+    "missing_val": int, "missing_se": int, "missing_sample_size": int
}

CSVS_BEFORE = {
-    # Common
+    # All rows unchanged
    "csv0": pd.DataFrame({
        "geo_id": ["1", "2", "3"],
        "val": [1.000000001, 2.00000002, 3.00000003],
        "missing_sample_size": [Nans.NOT_MISSING] * 3,
    }),

+    # One row deleted and one row added
    "csv1": pd.DataFrame({
        "geo_id": ["1", "2", "3"],
        "val": [1.0, 2.0, 3.0],
        "missing_sample_size": [Nans.NOT_MISSING] * 3,
    }),

-    # Deleted
+    # File deleted
    "csv2": pd.DataFrame({
        "geo_id": ["1"],
        "val": [1.0],
        "missing_sample_size": [Nans.NOT_MISSING],
    }),

-    # Common, but updated with missing columns
+    # All rows common, but missing columns added
    "csv4": pd.DataFrame({
        "geo_id": ["1"],
        "val": [1.0],
        "se": [0.1],
        "sample_size": [10.0]
    }),

-    # Common, but missing columns removed
+    # All rows common, but missing columns removed
    "csv5": pd.DataFrame({
        "geo_id": ["1"],
        "val": [1.0],
        "missing_se": [Nans.NOT_MISSING],
        "missing_sample_size": [Nans.NOT_MISSING],
    }),
+
+    # All rows common, but no missing columns
+    "csv6": pd.DataFrame({
+        "geo_id": ["1"],
+        "val": [1.0],
+        "se": [0.1],
+        "sample_size": [10.0]
+    }),
+
+    # Row deleted, but no missing columns (will not be uploaded)
+    "csv7": pd.DataFrame({
+        "geo_id": ["1", "2"],
+        "val": [1.0, 2.0],
+        "se": [0.1, 0.2],
+        "sample_size": [10.0, 20.0]
+    }),
+
+    # Row deleted and row added, but no missing columns
+    "csv8": pd.DataFrame({
+        "geo_id": ["1", "2"],
+        "val": [1.0, 2.0],
+        "se": [0.1, 0.2],
+        "sample_size": [10.0, 20.0]
+    }),
}

CSVS_AFTER = {
-    # Common
+    # All rows unchanged
    "csv0": pd.DataFrame({
        "geo_id": ["1", "2", "3"],
        "val": [1.0, 2.0, 3.0],
        "missing_sample_size": [Nans.NOT_MISSING] * 3,
    }),

+    # One row deleted and one row added
    "csv1": pd.DataFrame({
        "geo_id": ["1", "2", "4"],
        "val": [1.0, 2.1, 4.0],
        "missing_sample_size": [Nans.NOT_MISSING] * 3,
    }),

-    # Added
+    # File added
    "csv3": pd.DataFrame({
        "geo_id": ["2"],
        "val": [2.0000002],
        "missing_sample_size": [Nans.NOT_MISSING],
    }),

-    # Common, but updated with missing columns
+    # All rows common, but missing columns added
    "csv4": pd.DataFrame({
        "geo_id": ["1"],
        "val": [1.0],
        "missing_sample_size": [Nans.NOT_MISSING],
    }),

-    # Common, but missing columns removed
+    # All rows common, but missing columns removed
    "csv5": pd.DataFrame({
        "geo_id": ["1"],
        "val": [1.0],
        "se": [0.1],
        "sample_size": [10.0]
    }),
+
+    # All rows common, but no missing columns
+    "csv6": pd.DataFrame({
+        "geo_id": ["1"],
+        "val": [1.0],
+        "se": [0.1],
+        "sample_size": [10.0]
+    }),
+
+    # Row deleted, but no missing columns (will not be uploaded)
+    "csv7": pd.DataFrame({
+        "geo_id": ["1"],
+        "val": [1.0],
+        "se": [0.1],
+        "sample_size": [10.0]
+    }),
+
+    # Row deleted and row added, but no missing columns
+    "csv8": pd.DataFrame({
+        "geo_id": ["1", "3"],
+        "val": [1.0, 3.0],
+        "se": [0.1, 0.3],
+        "sample_size": [10.0, 30.0]
+    }),
}

class TestArchiveDiffer:
@@ -175,17 +225,22 @@ def test_diff_and_filter_exports(self, tmp_path):
        # Check return values
        assert set(deleted_files) == {join(cache_dir, "csv2.csv")}
        assert set(common_diffs.keys()) == {
-            join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv", "csv5.csv"]}
+            join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv", "csv5.csv", "csv6.csv", "csv7.csv", "csv8.csv"]}
        assert set(new_files) == {join(export_dir, "csv3.csv")}
        assert common_diffs[join(export_dir, "csv0.csv")] is None
        assert common_diffs[join(export_dir, "csv1.csv")] == join(
            export_dir, "csv1.csv.diff")

        # Check filesystem for actual files
        assert set(listdir(export_dir)) == {
-            "csv0.csv", "csv1.csv", "csv1.csv.diff",
-            "csv3.csv", "csv4.csv", "csv4.csv.diff",
-            "csv5.csv", "csv5.csv.diff"
+            "csv0.csv",
+            "csv1.csv", "csv1.csv.diff",
+            "csv3.csv",
+            "csv4.csv", "csv4.csv.diff",
+            "csv5.csv", "csv5.csv.diff",
+            "csv6.csv",
+            "csv7.csv", "csv7.csv.diff",
+            "csv8.csv", "csv8.csv.diff"
        }
        assert_frame_equal(
            pd.read_csv(join(export_dir, "csv1.csv.diff"), dtype=CSV_DTYPES),
@@ -204,7 +259,7 @@ def test_diff_and_filter_exports(self, tmp_path):
        arch_diff.filter_exports(common_diffs)

        # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"}
+        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv", "csv7.csv", "csv8.csv"}
        assert_frame_equal(
            pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
            csv1_diff)
@@ -325,13 +380,11 @@ def test_run(self, tmp_path, s3_client):
        # Check that the buckets now contain the exported files.
        for csv_name, df in CSVS_AFTER.items():
-            body = s3_client.get_object(
-                Bucket=self.bucket_name,
-                Key=f"{self.indicator_prefix}/{csv_name}.csv")["Body"]
+            body = s3_client.get_object(Bucket=self.bucket_name, Key=f"{self.indicator_prefix}/{csv_name}.csv")["Body"]
            assert_frame_equal(pd.read_csv(body, dtype=CSV_DTYPES), df)

        # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"}
+        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv", "csv7.csv", "csv8.csv"}
        csv1_diff = pd.DataFrame({
            "geo_id": ["3", "2", "4"],
            "val": [np.nan, 2.1, 4.0],
@@ -539,12 +592,11 @@ def test_run(self, tmp_path):
        arch_diff.get_branch(branch_name).checkout()
        for csv_name, df in CSVS_AFTER.items():
            assert_frame_equal(
-                pd.read_csv(
-                    join(cache_dir, f"{csv_name}.csv"), dtype=CSV_DTYPES), df)
+                pd.read_csv(join(cache_dir, f"{csv_name}.csv"), dtype=CSV_DTYPES), df)
        original_branch.checkout()

        # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"}
+        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv", "csv7.csv", "csv8.csv"}
        csv1_diff = pd.DataFrame({
            "geo_id": ["3", "2", "4"],
            "val": [np.nan, 2.1, 4.0],