Nancodes archiver/export: explicit tests

dshemetov · dshemetov · commit 7cb9b8cdc2b0 · 2021-09-27T14:22:00.000-07:00
diff --git a/_delphi_utils_python/tests/test_archive.py b/_delphi_utils_python/tests/test_archive.py
@@ -61,6 +61,17 @@
         "se": [0.1],
         "sample_size": [10.0]
         }),
+
+    # Common to both; the missing_* columns present here are removed in CSVS_AFTER
+    "csv5": pd.DataFrame({
+        "geo_id": ["1"],
+        "val": [1.0],
+        "se": [0.1],
+        "sample_size": [10.0],
+        "missing_val": [Nans.NOT_MISSING],
+        "missing_se": [Nans.NOT_MISSING],
+        "missing_sample_size": [Nans.NOT_MISSING],
+        }),
 }
 
 CSVS_AFTER = {
@@ -106,6 +117,14 @@
         "missing_se": [Nans.NOT_MISSING],
         "missing_sample_size": [Nans.NOT_MISSING],
         }),
+
+    # Common to both; the missing_* columns from the earlier version are removed here
+    "csv5": pd.DataFrame({
+        "geo_id": ["1"],
+        "val": [1.0],
+        "se": [0.1],
+        "sample_size": [10.0]
+        }),
 }
 
 class TestArchiveDiffer:
@@ -156,7 +175,7 @@ def test_diff_and_filter_exports(self, tmp_path):
         # Check return values
         assert set(deleted_files) == {join(cache_dir, "csv2.csv")}
         assert set(common_diffs.keys()) == {
-            join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv"]}
+            join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv", "csv5.csv"]}
         assert set(new_files) == {join(export_dir, "csv3.csv")}
         assert common_diffs[join(export_dir, "csv0.csv")] is None
         assert common_diffs[join(export_dir, "csv1.csv")] == join(
@@ -165,7 +184,8 @@ def test_diff_and_filter_exports(self, tmp_path):
         # Check filesystem for actual files
         assert set(listdir(export_dir)) == {
             "csv0.csv", "csv1.csv", "csv1.csv.diff",
-            "csv3.csv", "csv4.csv", "csv4.csv.diff"
+            "csv3.csv", "csv4.csv", "csv4.csv.diff",
+            "csv5.csv", "csv5.csv.diff"
         }
         assert_frame_equal(
             pd.read_csv(join(export_dir, "csv1.csv.diff"), dtype=CSV_DTYPES),
@@ -184,7 +204,7 @@ def test_diff_and_filter_exports(self, tmp_path):
         arch_diff.filter_exports(common_diffs)
 
         # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"}
+        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"}
         assert_frame_equal(
             pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
             csv1_diff)
@@ -311,7 +331,7 @@ def test_run(self, tmp_path, s3_client):
             assert_frame_equal(pd.read_csv(body, dtype=CSV_DTYPES), df)
 
         # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"}
+        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"}
         csv1_diff = pd.DataFrame({
             "geo_id": ["3", "2", "4"],
             "val": [np.nan, 2.1, 4.0],
@@ -524,7 +544,7 @@ def test_run(self, tmp_path):
         original_branch.checkout()
 
         # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"}
+        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"}
         csv1_diff = pd.DataFrame({
             "geo_id": ["3", "2", "4"],
             "val": [np.nan, 2.1, 4.0],
diff --git a/_delphi_utils_python/tests/test_export.py b/_delphi_utils_python/tests/test_export.py
@@ -9,6 +9,7 @@
 
 from delphi_utils import create_export_csv, Nans
 
+
 def _clean_directory(directory):
     """Clean files out of a directory."""
     for fname in listdir(directory):
@@ -29,6 +30,7 @@ def _non_ignored_files_set(directory):
 
 class TestExport:
     """Tests for exporting CSVs."""
+
     # List of times for data points.
     TIMES = [
         datetime.strptime(x, "%Y-%m-%d")
@@ -54,9 +56,19 @@ class TestExport:
             "val": [3.12345678910, np.nan, 2.2, 2.6],
             "se": [0.15, 0.22, np.nan, 0.34],
             "sample_size": [100, 100, 101, None],
-            "missing_val": [Nans.NOT_MISSING, Nans.OTHER, Nans.NOT_MISSING, Nans.NOT_MISSING],
-            "missing_se": [Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.OTHER, Nans.NOT_MISSING],
-            "missing_sample_size": [Nans.NOT_MISSING] * 3 + [Nans.OTHER]
+            "missing_val": [
+                Nans.NOT_MISSING,
+                Nans.OTHER,
+                Nans.NOT_MISSING,
+                Nans.NOT_MISSING,
+            ],
+            "missing_se": [
+                Nans.NOT_MISSING,
+                Nans.NOT_MISSING,
+                Nans.OTHER,
+                Nans.NOT_MISSING,
+            ],
+            "missing_sample_size": [Nans.NOT_MISSING] * 3 + [Nans.OTHER],
         }
     )
 
@@ -68,9 +80,19 @@ class TestExport:
             "val": [np.nan, np.nan, 2.2, 2.6],
             "se": [0.15, 0.22, np.nan, 0.34],
             "sample_size": [100, 100, 101, None],
-            "missing_val": [Nans.NOT_MISSING, Nans.OTHER, Nans.NOT_MISSING, Nans.NOT_MISSING],
-            "missing_se": [Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.OTHER, Nans.NOT_MISSING],
-            "missing_sample_size": [Nans.NOT_MISSING] * 3 + [Nans.OTHER]
+            "missing_val": [
+                Nans.NOT_MISSING,
+                Nans.OTHER,
+                Nans.NOT_MISSING,
+                Nans.NOT_MISSING,
+            ],
+            "missing_se": [
+                Nans.NOT_MISSING,
+                Nans.NOT_MISSING,
+                Nans.OTHER,
+                Nans.NOT_MISSING,
+            ],
+            "missing_sample_size": [Nans.NOT_MISSING] * 3 + [Nans.OTHER],
         }
     )
 
@@ -116,10 +138,14 @@ def test_export_rounding(self):
         )
         pd.testing.assert_frame_equal(
             pd.read_csv(join(self.TEST_DIR, "20200215_county_deaths_test.csv")),
-            pd.DataFrame({"geo_id": [51093, 51175],
-                          "val": [round(3.12345678910, 7), 2.1],
-                          "se": [0.15, 0.22],
-                          "sample_size": [100, 100]})
+            pd.DataFrame(
+                {
+                    "geo_id": [51093, 51175],
+                    "val": [round(3.12345678910, 7), 2.1],
+                    "se": [0.15, 0.22],
+                    "sample_size": [100, 100],
+                }
+            ),
         )
 
     def test_export_without_metric(self):
@@ -211,13 +237,16 @@ def test_export_with_null_removal(self):
         """Test that `remove_null_samples = True` removes entries with null samples."""
         _clean_directory(self.TEST_DIR)
 
-        df_with_nulls = self.DF.copy().append({
-                                "geo_id": "66666",
-                                "timestamp": datetime(2020, 6, 6),
-                                "val": 10,
-                                "se": 0.2,
-                                "sample_size": pd.NA},
-                            ignore_index=True)
+        df_with_nulls = self.DF.copy().append(
+            {
+                "geo_id": "66666",
+                "timestamp": datetime(2020, 6, 6),
+                "val": 10,
+                "se": 0.2,
+                "sample_size": pd.NA,
+            },
+            ignore_index=True,
+        )
 
         create_export_csv(
             df=df_with_nulls,
@@ -241,13 +270,16 @@ def test_export_without_null_removal(self):
         """Test that `remove_null_samples = False` does not remove entries with null samples."""
         _clean_directory(self.TEST_DIR)
 
-        df_with_nulls = self.DF.copy().append({
-                                "geo_id": "66666",
-                                "timestamp": datetime(2020, 6, 6),
-                                "val": 10,
-                                "se": 0.2,
-                                "sample_size": pd.NA},
-                            ignore_index=True)
+        df_with_nulls = self.DF.copy().append(
+            {
+                "geo_id": "66666",
+                "timestamp": datetime(2020, 6, 6),
+                "val": 10,
+                "se": 0.2,
+                "sample_size": pd.NA,
+            },
+            ignore_index=True,
+        )
 
         create_export_csv(
             df=df_with_nulls,
@@ -267,24 +299,56 @@ def test_export_without_null_removal(self):
         )
         assert pd.read_csv(join(self.TEST_DIR, "20200606_state_test.csv")).size > 0
 
+    def test_export_df_without_missingness(self):
+        _clean_directory(self.TEST_DIR)
+
+        create_export_csv(
+            df=self.DF.copy(), export_dir=self.TEST_DIR, geo_res="county", sensor="test"
+        )
+        df = pd.read_csv(join(self.TEST_DIR, "20200215_county_test.csv")).astype(
+            {"geo_id": str, "sample_size": int}
+        )
+        expected_df = pd.DataFrame(
+            {
+                "geo_id": ["51093", "51175"],
+                "val": [3.12345678910, 2.1],
+                "se": [0.15, 0.22],
+                "sample_size": [100, 100],
+            }
+        ).astype({"geo_id": str, "sample_size": int})
+        pd.testing.assert_frame_equal(df, expected_df)
+
     def test_export_df_with_missingness(self):
         _clean_directory(self.TEST_DIR)
 
         create_export_csv(
             df=self.DF2.copy(),
             export_dir=self.TEST_DIR,
-            geo_res="state",
+            geo_res="county",
             sensor="test",
-            remove_null_samples=False
         )
         assert _non_ignored_files_set(self.TEST_DIR) == set(
             [
-                "20200215_state_test.csv",
-                "20200301_state_test.csv",
-                "20200315_state_test.csv",
+                "20200215_county_test.csv",
+                "20200301_county_test.csv",
+                "20200315_county_test.csv",
             ]
         )
-        assert pd.read_csv(join(self.TEST_DIR, "20200315_state_test.csv")).size > 0
+        df = pd.read_csv(join(self.TEST_DIR, "20200215_county_test.csv")).astype(
+            {"geo_id": str, "sample_size": int}
+        )
+        expected_df = pd.DataFrame(
+            {
+                "geo_id": ["51093", "51175"],
+                "val": [3.12345678910, np.nan],
+                "se": [0.15, 0.22],
+                "sample_size": [100, 100],
+                "missing_val": [Nans.NOT_MISSING, Nans.OTHER],
+                "missing_se": [Nans.NOT_MISSING] * 2,
+                "missing_sample_size": [Nans.NOT_MISSING] * 2,
+            }
+        ).astype({"geo_id": str, "sample_size": int})
+        pd.testing.assert_frame_equal(df, expected_df)
 
     @mock.patch("delphi_utils.logger")
     def test_export_df_with_contradictory_missingness(self, mock_logger):
@@ -295,7 +359,6 @@ def test_export_df_with_contradictory_missingness(self, mock_logger):
             export_dir=self.TEST_DIR,
             geo_res="state",
             sensor="test",
-            remove_null_samples=False,
             logger=mock_logger
         )
         assert _non_ignored_files_set(self.TEST_DIR) == set(