Merge pull request #488 from cmu-delphi/robustify-usafacts

krivard · web-flow · commit a4712623f29e · 2020-11-10T16:03:36.000-05:00
Add string-to-int cleaning of comma-formatted counts for USAFacts
diff --git a/usafacts/delphi_usafacts/pull.py b/usafacts/delphi_usafacts/pull.py
@@ -55,6 +55,9 @@ def pull_usafacts_data(base_url: str, metric: str, geo_mapper: GeoMapper) -> pd.
     """
     # Read data
     df = pd.read_csv(base_url.format(metric=metric)).rename({"countyFIPS":"FIPS"}, axis=1)
+    # Clean commas in count fields in case the input file included them
+    df[df.columns[4:]] = df[df.columns[4:]].applymap(
+        lambda x: int(x.replace(",", "")) if isinstance(x, str) else x)
     # Check missing FIPS
     null_mask = pd.isnull(df["FIPS"])
     assert null_mask.sum() == 0
diff --git a/usafacts/tests/test_data/small_deaths_pull.csv b/usafacts/tests/test_data/small_deaths_pull.csv
@@ -0,0 +1,5 @@
+countyFIPS,County Name,State,stateFIPS,2/29/20,3/1/20,3/2/20
+1,New York City Unallocated/Probable,NY,36,0,0,1
+6000,Somewhere,NY,36,11,12,13
+2270,Place,NY,36,101,101,"1,0,2"
+36009,City,NY,36,2,4,6
diff --git a/usafacts/tests/test_pull.py b/usafacts/tests/test_pull.py
@@ -3,10 +3,11 @@
 from os.path import join
 
 import pandas as pd
+import numpy as np
 from delphi_utils import GeoMapper
 from delphi_usafacts.pull import pull_usafacts_data
 
-base_url_good = "test_data/small_{metric}.csv"
+base_url_good = "test_data/small_{metric}_pull.csv"
 
 base_url_bad = {
     "missing_days": "test_data/bad_{metric}_missing_days.csv",
@@ -21,11 +22,17 @@ class TestPullUSAFacts:
     def test_good_file(self):
         metric = "deaths"
         df = pull_usafacts_data(base_url_good, metric, geo_mapper)
-
-        assert (
-            df.columns.values
-            == ["fips", "timestamp", "population", "new_counts", "cumulative_counts"]
-        ).all()
+        expected_df = pd.DataFrame({
+            "fips": ["00001", "00001", "00001", "36009", "36009", "36009"],
+            "timestamp": [pd.Timestamp("2020-02-29"), pd.Timestamp("2020-03-01"),
+                          pd.Timestamp("2020-03-02"), pd.Timestamp("2020-02-29"),
+                          pd.Timestamp("2020-03-01"), pd.Timestamp("2020-03-02")],
+            "population": [np.nan, np.nan, np.nan, 76117., 76117., 76117.],
+            "new_counts": [0., 0., 1., 2., 2., 2.],
+            "cumulative_counts": [0, 0, 1, 2, 4, 6]},
+            index=[1, 2, 3, 5, 6, 7])
+        # sort since row order doesn't matter
+        pd.testing.assert_frame_equal(df.sort_index(), expected_df.sort_index())
 
     def test_missing_days(self):