# -*- coding: utf-8 -*-
"""Functions to pull data from JHU website."""
- import re
import pandas as pd
import numpy as np
from delphi_utils import GeoMapper

- def detect_date_col(col_name: str):
-     """determine if column name is a date"""
-     date_match = re.match(r'\d{1,2}\/\d{1,2}\/\d{1,2}', col_name)
-     if date_match:
-         return True
-     return False

- def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFrame:
+ def download_data(base_url: str, metric: str) -> pd.DataFrame:
+     """
+     Downloads the data from the JHU repo, extracts the UID and the date columns, and
+     enforces the date datatype on the time column.
+     """
+     # Read data
+     df = pd.read_csv(base_url.format(metric=metric))
+     # Keep the UID and the time series columns only
+     # The regex filters for columns with the date format MM/DD/YY or M/D/YY
+     df = df.filter(regex=r"\d{1,2}\/\d{1,2}\/\d{2}|UID").melt(
+         id_vars=["UID"], var_name="timestamp", value_name="cumulative_counts"
+     )
+     df["timestamp"] = pd.to_datetime(df["timestamp"])
+     return df
+
+
+ def create_diffs_column(df: pd.DataFrame) -> pd.DataFrame:
+     """
+     Using the cumulative_counts column from the dataframe, partitions the dataframe
+     into separate time-series based on fips, and then computes pairwise differences
+     of the cumulative values to get the incidence values. Boundary cases are handled
+     by zero-filling the day prior.
+     """
+     # Take time-diffs in each geo_code partition
+     df = df.set_index(["fips", "timestamp"])
+     df["new_counts"] = df.groupby(level=0)["cumulative_counts"].diff()
+     # Fill the NA value for the first date of each partition with the cumulative value that day
+     # (i.e. pretend the cumulative count the day before was 0)
+     na_value_mask = df["new_counts"].isna()
+     df.loc[na_value_mask, "new_counts"] = df.loc[na_value_mask, "cumulative_counts"]
+     df = df.reset_index()
+     return df
+
+
+ def sanity_check_data(df: pd.DataFrame) -> pd.DataFrame:
+     """
+     Perform a final set of sanity checks on the data.
+     """
+     days_by_fips = df.groupby("fips").count()["cumulative_counts"].unique()
+     unique_days = df["timestamp"].unique()
+
+     # each FIPS has same number of rows
+     if (len(days_by_fips) > 1) or (days_by_fips[0] != len(unique_days)):
+         raise ValueError("Differing number of days by fips")
+
+     min_timestamp = min(unique_days)
+     max_timestamp = max(unique_days)
+     n_days = (max_timestamp - min_timestamp) / np.timedelta64(1, "D") + 1
+     if n_days != len(unique_days):
+         raise ValueError(
+             f"Not every day between {min_timestamp} and "
+             f"{max_timestamp} is represented."
+         )
+
+
+ def pull_jhu_data(base_url: str, metric: str, gmpr: GeoMapper) -> pd.DataFrame:
    """Pulls the latest Johns Hopkins CSSE data, and conforms it into a dataset

    The output dataset has:
@@ -29,92 +77,37 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFrame:
    may be negative. This is wholly dependent on the quality of the raw
    dataset.

-     We filter the data such that we only keep rows with valid FIPS, or "FIPS"
-     codes defined under the exceptions of the README. The current exceptions
-     include:
-
-     - 70002: Dukes County and Nantucket County in Massachusetts, which are
-       reported together
-     - 70003: Kansas City, Missouri, which reports counts separately from the
-       four counties it intersects (Platte, Cass, Clay, Jackson Counties)
+     We filter the data such that we only keep rows with valid FIPS or "FIPS"
+     codes defined under the exceptions of the README.

    Parameters
    ----------
    base_url: str
-         Base URL for pulling the JHU CSSE data
+         Base URL for pulling the JHU CSSE data.
    metric: str
        One of 'confirmed' or 'deaths'.
-     pop_df: pd.DataFrame
-         Read from static file "fips_population.csv".
+     gmpr: GeoMapper
+         An instance of the geomapping utility.

    Returns
    -------
    pd.DataFrame
        Dataframe as described above.
    """
+     df = download_data(base_url, metric)

-     # Read data
-     df = pd.read_csv(base_url.format(metric=metric))
-
-     # FIPS are missing for some nonstandard FIPS
-     date_cols = [col_name for col_name in df.columns if detect_date_col(col_name)]
-     keep_cols = date_cols + ['UID']
-     df = df[keep_cols]
-
-     df = df.melt(
-         id_vars=["UID"],
-         var_name="timestamp",
-         value_name="cumulative_counts",
+     df = gmpr.replace_geocode(
+         df, "jhu_uid", "fips", from_col="UID", date_col="timestamp"
    )
-     df["timestamp"] = pd.to_datetime(df["timestamp"])

-     gmpr = GeoMapper()
-     df = gmpr.replace_geocode(df, "jhu_uid", "fips", from_col="UID", date_col="timestamp")
-
-     # Merge in population LOWERCASE, consistent across confirmed and deaths
-     # Set population as NAN for fake fips
-     pop_df.rename(columns={'FIPS': 'fips'}, inplace=True)
-     pop_df['fips'] = pop_df['fips'].astype(int).\
-                         astype(str).str.zfill(5)
-     df = df.merge(pop_df, on="fips", how='left')
-
-     # Add a dummy first row here on day before first day
-     # code below could be cleaned with groupby.diff
-
-     min_ts = min(df["timestamp"])
-     df_dummy = df.loc[df["timestamp"] == min_ts].copy()
-     df_dummy.loc[:, "timestamp"] = min_ts - pd.Timedelta(days=1)
-     df_dummy.loc[:, "cumulative_counts"] = 0
-     df = pd.concat([df_dummy, df])
-     # Obtain new_counts
-     df.sort_values(["fips", "timestamp"], inplace=True)
-     df["new_counts"] = df["cumulative_counts"].diff()  # 1st discrete difference
-     # Handle edge cases where we diffed across fips
-     mask = df["fips"] != df["fips"].shift(1)
-     df.loc[mask, "new_counts"] = np.nan
-     df.reset_index(inplace=True, drop=True)
+     # Merge in population, set population as NAN for fake fips
+     df = gmpr.add_population_column(df, "fips")
+     df = create_diffs_column(df)

    # Final sanity checks
-     days_by_fips = df.groupby("fips").count()["cumulative_counts"].unique()
-     unique_days = df["timestamp"].unique()
-     # each FIPS has same number of rows
-     if (len(days_by_fips) > 1) or (days_by_fips[0] != len(unique_days)):
-         raise ValueError("Differing number of days by fips")
-     min_timestamp = min(unique_days)
-     max_timestamp = max(unique_days)
-     n_days = (max_timestamp - min_timestamp) / np.timedelta64(1, "D") + 1
-     if n_days != len(unique_days):
-         raise ValueError(
-             f"Not every day between {min_timestamp} and "
-             "{max_timestamp} is represented."
-         )
-     return df.loc[
-         df["timestamp"] >= min_ts,
-         [  # Reorder
-             "fips",
-             "timestamp",
-             "population",
-             "new_counts",
-             "cumulative_counts",
-         ],
-     ]
+     sanity_check_data(df)
+
+     # Reorder columns
+     df = df[["fips", "timestamp", "population", "new_counts", "cumulative_counts"]]
+     return df
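
To make the new diffing step concrete, here is a small standalone sketch of how create_diffs_column behaves. The FIPS codes, dates, and counts are made-up toy values, and the import path delphi_jhu.pull is an assumption about this package's layout, not something taken from the diff.

```python
import pandas as pd

from delphi_jhu.pull import create_diffs_column  # assumed module path for this file

# Toy cumulative series for two hypothetical counties over three days.
toy = pd.DataFrame({
    "fips": ["01001", "01001", "01001", "01003", "01003", "01003"],
    "timestamp": pd.to_datetime(["2020-05-01", "2020-05-02", "2020-05-03"] * 2),
    "cumulative_counts": [2, 5, 5, 0, 1, 4],
})

out = create_diffs_column(toy)
# new_counts comes out as [2, 3, 0] for fips 01001 and [0, 1, 3] for fips 01003:
# within each fips the cumulative values are differenced, and the first day keeps
# its cumulative value (an implicit zero the day before).
```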
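A minimal usage sketch of the refactored entry point follows. The URL template and the import path are illustrative assumptions (the real base URL comes from the indicator's parameters); only the 'confirmed'/'deaths' metric names and the GeoMapper argument are taken from the code above.

```python
from delphi_utils import GeoMapper

from delphi_jhu.pull import pull_jhu_data  # assumed module path for this file

# Illustrative URL template; "{metric}" is filled in with "confirmed" or "deaths".
BASE_URL = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/"
    "time_series_covid19_{metric}_US.csv"
)

gmpr = GeoMapper()
df = pull_jhu_data(BASE_URL, "deaths", gmpr)
# Expected columns: fips, timestamp, population, new_counts, cumulative_counts
```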