# -*- coding: utf-8 -*-

- import re
import pandas as pd
import numpy as np
from delphi_utils import GeoMapper

- def detect_date_col(col_name: str):
-     """determine if column name is a date"""
-     date_match = re.match(r'\d{1,2}\/\d{1,2}\/\d{1,2}', col_name)
-     if date_match:
-         return True
-     return False

- def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFrame:
+ def download_data(base_url: str, metric: str) -> pd.DataFrame:
+     """
+     Downloads the data from the JHU repo, extracts the UID and the date columns, and
+     enforces the date datatype on the time column.
+     """
+     # Read data
+     df = pd.read_csv(base_url.format(metric=metric))
+     # Keep the UID and the time series columns only
+     # The regex filters for columns with the date format M/D/YY or MM/DD/YY
+     df = df.filter(regex=r"\d{1,2}\/\d{1,2}\/\d{2}|UID").melt(
+         id_vars=["UID"], var_name="timestamp", value_name="cumulative_counts"
+     )
+     df["timestamp"] = pd.to_datetime(df["timestamp"])
+     return df
+
+
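# A minimal sketch (hypothetical toy data, not part of this change) of the wide-to-long
# reshaping that download_data performs: one column per date becomes one row per
# (UID, date) pair.
toy_wide = pd.DataFrame({"UID": [84001001], "3/1/20": [1], "3/2/20": [3]})
toy_long = toy_wide.filter(regex=r"\d{1,2}\/\d{1,2}\/\d{2}|UID").melt(
    id_vars=["UID"], var_name="timestamp", value_name="cumulative_counts"
)
# toy_long rows: (84001001, "3/1/20", 1) and (84001001, "3/2/20", 3); pd.to_datetime
# then converts the timestamp strings into proper dates.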
+ def create_diffs_column(df: pd.DataFrame) -> pd.DataFrame:
+     """
+     Using the cumulative_counts column from the dataframe, partitions the dataframe
+     into separate time-series based on fips, and then computes pairwise differences
+     of the cumulative values to get the incidence values. Boundary cases are handled
+     by zero-filling the day prior.
+     """
+     # Take time-diffs in each geo_code partition
+     df = df.set_index(["fips", "timestamp"])
+     df["new_counts"] = df.groupby(level=0)["cumulative_counts"].diff()
+     # Fill the NA value for the first date of each partition with the cumulative value that day
+     # (i.e. pretend the cumulative count the day before was 0)
+     na_value_mask = df["new_counts"].isna()
+     df.loc[na_value_mask, "new_counts"] = df.loc[na_value_mask, "cumulative_counts"]
+     df = df.reset_index()
+     return df
+
+
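# A small sketch of the diff-and-fill step above, on hypothetical counts for two
# fips codes (values are illustrative only).
toy_cum = pd.DataFrame({
    "fips": ["01001", "01001", "01003"],
    "timestamp": pd.to_datetime(["2020-03-01", "2020-03-02", "2020-03-01"]),
    "cumulative_counts": [2, 5, 1],
}).set_index(["fips", "timestamp"])
toy_cum["new_counts"] = toy_cum.groupby(level=0)["cumulative_counts"].diff()
# The first day of each fips is NaN after diff(); filling it with that day's
# cumulative value treats the (unobserved) prior day as 0, so new_counts = [2, 3, 1].
toy_na = toy_cum["new_counts"].isna()
toy_cum.loc[toy_na, "new_counts"] = toy_cum.loc[toy_na, "cumulative_counts"]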
+ def sanity_check_data(df: pd.DataFrame) -> None:
+     """
+     Perform a final set of sanity checks on the data.
+     """
+     days_by_fips = df.groupby("fips").count()["cumulative_counts"].unique()
+     unique_days = df["timestamp"].unique()
+
+     # each FIPS has same number of rows
+     if (len(days_by_fips) > 1) or (days_by_fips[0] != len(unique_days)):
+         raise ValueError("Differing number of days by fips")
+
+     min_timestamp = min(unique_days)
+     max_timestamp = max(unique_days)
+     n_days = (max_timestamp - min_timestamp) / np.timedelta64(1, "D") + 1
+     if n_days != len(unique_days):
+         raise ValueError(
+             f"Not every day between {min_timestamp} and "
+             f"{max_timestamp} is represented."
+         )
+
+
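# A brief illustration of the completeness check above, with hypothetical timestamps:
# 2020-03-01 and 2020-03-03 span 3 calendar days but only 2 are present, so the
# second ValueError would be raised for such data.
toy_days = pd.to_datetime(["2020-03-01", "2020-03-03"]).values
toy_span = (toy_days.max() - toy_days.min()) / np.timedelta64(1, "D") + 1  # 3.0
assert toy_span != len(toy_days)  # 3 expected days vs. 2 observed -> incomplete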
+ def pull_jhu_data(base_url: str, metric: str, gmpr: GeoMapper) -> pd.DataFrame:
    """Pulls the latest Johns Hopkins CSSE data, and conforms it into a dataset
    The output dataset has:
@@ -28,92 +76,37 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFr
    may be negative. This is wholly dependent on the quality of the raw
    dataset.

-     We filter the data such that we only keep rows with valid FIPS, or "FIPS"
-     codes defined under the exceptions of the README. The current exceptions
-     include:
-
-     - 70002: Dukes County and Nantucket County in Massachusetts, which are
-       reported together
-     - 70003: Kansas City, Missouri, which reports counts separately from the
-       four counties it intersects (Platte, Cass, Clay, Jackson Counties)
+     We filter the data such that we only keep rows with valid FIPS or "FIPS"
+     codes defined under the exceptions of the README.

    Parameters
    ----------
    base_url: str
-         Base URL for pulling the JHU CSSE data
+         Base URL for pulling the JHU CSSE data.
    metric: str
        One of 'confirmed' or 'deaths'.
-     pop_df: pd.DataFrame
-         Read from static file "fips_population.csv".
+     gmpr: GeoMapper
+         An instance of the geomapping utility.

    Returns
    -------
    pd.DataFrame
        Dataframe as described above.
    """
+     df = download_data(base_url, metric)

-     # Read data
-     df = pd.read_csv(base_url.format(metric=metric))
-
-     # FIPS are missing for some nonstandard FIPS
-     date_cols = [col_name for col_name in df.columns if detect_date_col(col_name)]
-     keep_cols = date_cols + ['UID']
-     df = df[keep_cols]
-
-     df = df.melt(
-         id_vars=["UID"],
-         var_name="timestamp",
-         value_name="cumulative_counts",
+     df = gmpr.replace_geocode(
+         df, "jhu_uid", "fips", from_col="UID", date_col="timestamp"
    )
-     df["timestamp"] = pd.to_datetime(df["timestamp"])

-     gmpr = GeoMapper()
-     df = gmpr.replace_geocode(df, "jhu_uid", "fips", from_col="UID", date_col="timestamp")
-
-     # Merge in population LOWERCASE, consistent across confirmed and deaths
-     # Set population as NAN for fake fips
-     pop_df.rename(columns={'FIPS': 'fips'}, inplace=True)
-     pop_df['fips'] = pop_df['fips'].astype(int).\
-         astype(str).str.zfill(5)
-     df = df.merge(pop_df, on="fips", how='left')
-
-     # Add a dummy first row here on day before first day
-     # code below could be cleaned with groupby.diff
-
-     min_ts = min(df["timestamp"])
-     df_dummy = df.loc[df["timestamp"] == min_ts].copy()
-     df_dummy.loc[:, "timestamp"] = min_ts - pd.Timedelta(days=1)
-     df_dummy.loc[:, "cumulative_counts"] = 0
-     df = pd.concat([df_dummy, df])
-     # Obtain new_counts
-     df.sort_values(["fips", "timestamp"], inplace=True)
-     df["new_counts"] = df["cumulative_counts"].diff()  # 1st discrete difference
-     # Handle edge cases where we diffed across fips
-     mask = df["fips"] != df["fips"].shift(1)
-     df.loc[mask, "new_counts"] = np.nan
-     df.reset_index(inplace=True, drop=True)
+     # Merge in population, set population as NAN for fake fips
+     df = gmpr.add_population_column(df, "fips")
+     df = create_diffs_column(df)

    # Final sanity checks
-     days_by_fips = df.groupby("fips").count()["cumulative_counts"].unique()
-     unique_days = df["timestamp"].unique()
-     # each FIPS has same number of rows
-     if (len(days_by_fips) > 1) or (days_by_fips[0] != len(unique_days)):
-         raise ValueError("Differing number of days by fips")
-     min_timestamp = min(unique_days)
-     max_timestamp = max(unique_days)
-     n_days = (max_timestamp - min_timestamp) / np.timedelta64(1, "D") + 1
-     if n_days != len(unique_days):
-         raise ValueError(
-             f"Not every day between {min_timestamp} and "
-             "{max_timestamp} is represented."
-         )
-     return df.loc[
-         df["timestamp"] >= min_ts,
-         [  # Reorder
-             "fips",
-             "timestamp",
-             "population",
-             "new_counts",
-             "cumulative_counts",
-         ],
-     ]
+     sanity_check_data(df)
+
+     # Reorder columns
+     df = df[["fips", "timestamp", "population", "new_counts", "cumulative_counts"]]
+     return df
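# A usage sketch for the refactored entry point; the URL template below is an
# assumption (any CSV in the JHU wide layout with a {metric} placeholder works)
# and is not defined in this change.
if __name__ == "__main__":
    BASE_URL = (
        "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
        "csse_covid_19_data/csse_covid_19_time_series/"
        "time_series_covid19_{metric}_US.csv"
    )
    geo_mapper = GeoMapper()
    confirmed = pull_jhu_data(BASE_URL, "confirmed", geo_mapper)
    # confirmed has columns fips, timestamp, population, new_counts, cumulative_counts.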