Skip to content

Commit a471262

Browse files
authored
Merge pull request #488 from cmu-delphi/robustify-usafacts
Add string to int cleaning for USAFacts
2 parents 6d7386d + 78b367b commit a471262

File tree

3 files changed

+21
-6
lines changed

3 files changed

+21
-6
lines changed

usafacts/delphi_usafacts/pull.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@ def pull_usafacts_data(base_url: str, metric: str, geo_mapper: GeoMapper) -> pd.
5555
"""
5656
# Read data
5757
df = pd.read_csv(base_url.format(metric=metric)).rename({"countyFIPS":"FIPS"}, axis=1)
58+
# Clean commas in count fields (every column after the first four) in case the input file includes them
59+
df[df.columns[4:]] = df[df.columns[4:]].applymap(
60+
lambda x: int(x.replace(",", "")) if isinstance(x, str) else x)
5861
# Check missing FIPS
5962
null_mask = pd.isnull(df["FIPS"])
6063
assert null_mask.sum() == 0
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
countyFIPS,County Name,State,stateFIPS,2/29/20,3/1/20,3/2/20
2+
1,New York City Unallocated/Probable,NY,36,0,0,1
3+
6000,Somewhere,NY,36,11,12,13
4+
2270,Place,NY,36,101,101,"1,0,2"
5+
36009,City,NY,36,2,4,6

usafacts/tests/test_pull.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
from os.path import join
44

55
import pandas as pd
6+
import numpy as np
67
from delphi_utils import GeoMapper
78
from delphi_usafacts.pull import pull_usafacts_data
89

9-
base_url_good = "test_data/small_{metric}.csv"
10+
base_url_good = "test_data/small_{metric}_pull.csv"
1011

1112
base_url_bad = {
1213
"missing_days": "test_data/bad_{metric}_missing_days.csv",
@@ -21,11 +22,17 @@ class TestPullUSAFacts:
2122
def test_good_file(self):
2223
metric = "deaths"
2324
df = pull_usafacts_data(base_url_good, metric, geo_mapper)
24-
25-
assert (
26-
df.columns.values
27-
== ["fips", "timestamp", "population", "new_counts", "cumulative_counts"]
28-
).all()
25+
expected_df = pd.DataFrame({
26+
"fips": ["00001", "00001", "00001", "36009", "36009", "36009"],
27+
"timestamp": [pd.Timestamp("2020-02-29"), pd.Timestamp("2020-03-01"),
28+
pd.Timestamp("2020-03-02"), pd.Timestamp("2020-02-29"),
29+
pd.Timestamp("2020-03-01"), pd.Timestamp("2020-03-02")],
30+
"population": [np.nan, np.nan, np.nan, 76117., 76117., 76117.],
31+
"new_counts": [0., 0., 1., 2., 2., 2.],
32+
"cumulative_counts": [0, 0, 1, 2, 4, 6]},
33+
index=[1, 2, 3, 5, 6, 7])
34+
# sort since row order doesn't matter
35+
pd.testing.assert_frame_equal(df.sort_index(), expected_df.sort_index())
2936

3037
def test_missing_days(self):
3138

0 commit comments

Comments
 (0)