Skip to content

Get quidel covidtest to pass pydocstyle #571

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Dec 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion quidel_covidtest/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ install: venv

lint:
. env/bin/activate; \
pylint $(dir)
pylint $(dir); \
pydocstyle $(dir)

test:
. env/bin/activate ;\
Expand Down
2 changes: 1 addition & 1 deletion quidel_covidtest/delphi_quidel_covidtest/constants.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Registry for constants"""
"""Registry for constants."""
# global constants
MIN_OBS = 50 # minimum number of observations in order to compute a proportion.
POOL_DAYS = 7 # number of days in the past (including today) to pool over
Expand Down
46 changes: 22 additions & 24 deletions quidel_covidtest/delphi_quidel_covidtest/data_tools.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,26 @@
"""
Functions to calculate the quidel sensor statistic.
"""
"""Functions to calculate the quidel sensor statistic."""

import numpy as np
import pandas as pd


def remove_null_samples(df):
"""Removes entries in a data frame whose sample sizes are null."""
"""Remove entries in a data frame whose sample sizes are null."""
return df[df["sample_size"].notnull()]


def _prop_var(p, n):
"""var(X/n) = 1/(n^2)var(X) = (npq)/(n^2) = pq/n"""
"""
Calculate variance of proportion.

var(X/n) = 1/(n^2)var(X) = (npq)/(n^2) = pq/n
"""
return p * (1 - p) / n

def fill_dates(y_data, first_date, last_date):
"""
Ensure all dates are listed in the data, otherwise, add days with 0 counts.

Args:
y_data: dataframe with datetime index
first_date: datetime.datetime
Expand All @@ -42,8 +45,9 @@ def fill_dates(y_data, first_date, last_date):

def _slide_window_sum(arr, k):
"""
Sliding window sum, with fixed window size k. For indices 0:k, we
DO compute a sum, using whatever points are available.
Sliding window sum, with fixed window size k.

For indices 0:k, we DO compute a sum, using whatever points are available.

Reference: https://stackoverflow.com/a/38507725

Expand All @@ -57,7 +61,6 @@ def _slide_window_sum(arr, k):
sarr: np.ndarray
Array of same length of arr, holding the sliding window sum.
"""

if not isinstance(k, int):
raise ValueError('k must be int.')
temp = np.append(np.zeros(k - 1), arr)
Expand All @@ -67,12 +70,11 @@ def _slide_window_sum(arr, k):

def _geographical_pooling(tpooled_tests, tpooled_ptests, min_obs):
"""
Calculates the proportion of parent samples (tests) that must be "borrowed"
in order to properly compute the statistic. If there are no samples
available in the parent, the borrow_prop is 0. If the parent does not
Calculate proportion of parent samples (tests) that must be "borrowed" in order to compute the statistic.

If there are no samples available in the parent, the borrow_prop is 0. If the parent does not
have enough samples, we return a borrow_prop of 1, and the fact that the
pooled samples are insufficient is handled in the statistic fitting step.

Args:
tpooled_tests: np.ndarray[float]
Expand Down Expand Up @@ -115,8 +117,7 @@ def _geographical_pooling(tpooled_tests, tpooled_ptests, min_obs):

def raw_positive_prop(positives, tests, min_obs):
"""
Calculates the proportion of positive tests for a single geographic
location, without any temporal smoothing.
Calculate the proportion of positive tests for a single geographic location, without any temporal smoothing.

If on any day t, tests[t] < min_obs, then we report np.nan.

Expand Down Expand Up @@ -169,8 +170,7 @@ def raw_positive_prop(positives, tests, min_obs):
def smoothed_positive_prop(positives, tests, min_obs, pool_days,
parent_positives=None, parent_tests=None):
"""
Calculates the proportion of negative tests for a single geographic
location, with temporal smoothing.
Calculate the proportion of negative tests for a single geographic location, with temporal smoothing.

For a given day t, if sum(tests[(t-pool_days+1):(t+1)]) < min_obs, then we
'borrow' min_obs - sum(tests[(t-pool_days+1):(t+1)]) observations from the
Expand Down Expand Up @@ -215,7 +215,6 @@ def smoothed_positive_prop(positives, tests, min_obs, pool_days,
np.ndarray
Effective sample size (after temporal and geographic pooling).
"""

positives = positives.astype(float)
tests = tests.astype(float)
if (parent_positives is None) or (parent_tests is None):
Expand Down Expand Up @@ -259,9 +258,8 @@ def smoothed_positive_prop(positives, tests, min_obs, pool_days,


def raw_tests_per_device(devices, tests, min_obs):
'''
Calculates the tests per device for a single geographic
location, without any temporal smoothing.
"""
Calculate the tests per device for a single geographic location, without any temporal smoothing.

If on any day t, tests[t] < min_obs, then we report np.nan.
The second and third returned np.ndarray are the standard errors,
Expand All @@ -284,7 +282,7 @@ def raw_tests_per_device(devices, tests, min_obs):
Placeholder for standard errors
np.ndarray
Sample size used to compute estimates.
'''
"""
devices = devices.astype(float)
tests = tests.astype(float)
if (np.any(np.isnan(devices)) or np.any(np.isnan(tests))):
Expand All @@ -304,8 +302,8 @@ def raw_tests_per_device(devices, tests, min_obs):
def smoothed_tests_per_device(devices, tests, min_obs, pool_days,
parent_devices=None, parent_tests=None):
"""
Calculates the ratio of tests per device for a single geographic
location, with temporal smoothing.
Calculate the ratio of tests per device for a single geographic location, with temporal smoothing.

For a given day t, if sum(tests[(t-pool_days+1):(t+1)]) < min_obs, then we
'borrow' min_obs - sum(tests[(t-pool_days+1):(t+1)]) observations from the
parents over the same timespan. Importantly, it will make sure NOT to
Expand Down
10 changes: 5 additions & 5 deletions quidel_covidtest/delphi_quidel_covidtest/generate_sensor.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# -*- coding: utf-8 -*-
"""
Functions to help generate sensor for different geographical levels
"""
"""Functions to help generate sensor for different geographical levels."""
import pandas as pd
from .data_tools import (fill_dates, raw_positive_prop,
smoothed_positive_prop,
Expand All @@ -13,7 +11,8 @@

def generate_sensor_for_states(state_groups, smooth, device, first_date, last_date):
"""
fit over states
Fit over states.

Args:
state_groups: pd.groupby.generic.DataFrameGroupBy
state_key: "state_id"
Expand Down Expand Up @@ -70,7 +69,8 @@ def generate_sensor_for_states(state_groups, smooth, device, first_date, last_da
def generate_sensor_for_other_geores(state_groups, data, res_key, smooth,
device, first_date, last_date):
"""
fit over counties/HRRs/MSAs
Fit over counties/HRRs/MSAs.

Args:
data: pd.DataFrame
res_key: "fips", "cbsa_id" or "hrrnum"
Expand Down
6 changes: 3 additions & 3 deletions quidel_covidtest/delphi_quidel_covidtest/geo_maps.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,7 @@


def geo_map(geo_res, df):
"""
Map a geocode to a new value.
"""
"""Map a geocode to a new value."""
data = df.copy()
geo_key = GEO_KEY_DICT[geo_res]
# Add population for each zipcode
Expand All @@ -32,6 +30,8 @@ def geo_map(geo_res, df):

def add_parent_state(data, geo_res, geo_key):
"""
Add parent state column to DataFrame.

- map from msa/hrr to state, going by the state with the largest
population (since a msa/hrr may span multiple states)
- map from county to the corresponding state
Expand Down
26 changes: 13 additions & 13 deletions quidel_covidtest/delphi_quidel_covidtest/pull.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
"""Simply downloads email attachments.

Uses this handy package: https://pypi.org/project/imap-tools/
"""
import io
Expand All @@ -15,7 +16,8 @@
def get_from_email(start_date, end_date, mail_server,
account, sender, password):
"""
Get raw data from email account
Get raw data from email account.

Args:
start_date: datetime.datetime
pull data from email received from the start date
Expand Down Expand Up @@ -56,9 +58,7 @@ def get_from_email(start_date, end_date, mail_server,
return df, time_flag

def fix_zipcode(df):
"""
Fix zipcode that is 9 digit instead of 5 digit
"""
"""Fix zipcodes that are 9 digits instead of 5 digits."""
zipcode5 = []
fixnum = 0
for zipcode in df['Zip'].values:
Expand All @@ -74,6 +74,8 @@ def fix_zipcode(df):

def fix_date(df):
"""
Remove invalid dates and select correct test date to use.

Quidel Covid Test are labeled with Test Date and Storage Date. In principle,
the TestDate should reflect when the test was performed and the StorageDate
when the test was logged in the MyVirena cloud storage device. We expect
Expand Down Expand Up @@ -101,6 +103,7 @@ def preprocess_new_data(start_date, end_date, mail_server, account,
sender, password, test_mode):
"""
Pull and pre-process Quidel Covid Test data from datadrop email.

Drop unnecessary columns. Temporarily consider the positive rate
sensor only which is related to number of total tests and number
of positive tests.
Expand Down Expand Up @@ -173,9 +176,7 @@ def preprocess_new_data(start_date, end_date, mail_server, account,
return df_merged, time_flag

def check_intermediate_file(cache_dir, pull_start_date):
"""
Check whether there is a cache file containing historical data already
"""
"""Check whether there is a cache file containing historical data already."""
for filename in os.listdir(cache_dir):
if ".csv" in filename:
pull_start_date = datetime.strptime(filename.split("_")[2].split(".")[0],
Expand All @@ -187,8 +188,7 @@ def check_intermediate_file(cache_dir, pull_start_date):

def pull_quidel_covidtest(params):
"""
Pull the quidel covid test data. Decide whether to combine the newly
received data with stored historical records in ./cache
Pull the quidel covid test data and decide whether to combine the new data with stored historical records in ./cache.

Parameters:
params: dict
Expand Down Expand Up @@ -240,7 +240,8 @@ def pull_quidel_covidtest(params):
def check_export_end_date(input_export_end_date, _end_date,
end_from_today_minus):
"""
Update the export_end_date according to the data received
Update the export_end_date according to the data received.

By default, set the export end date to be the last pulling date - 5 days
(end_from_today_minus = 5).
Otherwise, use the required date if it is earlier than the default one.
Expand All @@ -267,8 +268,7 @@ def check_export_end_date(input_export_end_date, _end_date,
def check_export_start_date(export_start_date, export_end_date,
export_day_range):
"""
Update the export_start_date according to the export_end_date so that it
could be export_end_date - export_day_range
Update export_start_date according to the export_end_date so that it could be export_end_date - export_day_range.

Parameters:
export_start_date: str
Expand Down Expand Up @@ -296,7 +296,7 @@ def check_export_start_date(export_start_date, export_end_date,

def update_cache_file(df, _end_date, cache_dir):
"""
Update cache file. Remove the old one, export the new one
Update cache file. Remove the old one, export the new one.

Parameter:
df: pd.DataFrame
Expand Down