Skip to content

Get quidel to pass pydocstyle #572

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion quidel/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ install: venv

lint:
. env/bin/activate; \
pylint $(dir)
pylint $(dir); \
pydocstyle $(dir)

test:
. env/bin/activate ;\
Expand Down
2 changes: 1 addition & 1 deletion quidel/delphi_quidel/constants.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Registry for constants"""
"""Registry for constants."""
# global constants
MIN_OBS = 50 # minimum number of observations in order to compute a proportion.
MAX_BORROW_OBS = 20 # maximum number of observations can be borrowed in geographical pooling
Expand Down
44 changes: 21 additions & 23 deletions quidel/delphi_quidel/data_tools.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
"""
Functions to calculate the quidel sensor statistic.
"""
"""Functions to calculate the quidel sensor statistic."""

import numpy as np
import pandas as pd

def _prop_var(p, n):
"""var(X/n) = 1/(n^2)var(X) = (npq)/(n^2) = pq/n"""
"""
Calculate variance of proportion.

var(X/n) = 1/(n^2)var(X) = (npq)/(n^2) = pq/n
"""
return p * (1 - p) / n

def fill_dates(y_data, first_date, last_date):
"""
Ensure all dates are listed in the data, otherwise, add days with 0 counts.

Args:
y_data: dataframe with datetime index
first_date: datetime.datetime
Expand All @@ -36,8 +39,9 @@ def fill_dates(y_data, first_date, last_date):

def _slide_window_sum(arr, k):
"""
Sliding window sum, with fixed window size k. For indices 0:k, we
DO compute a sum, using whatever points are available.
Sliding window sum, with fixed window size k.

For indices 0:k, we DO compute a sum, using whatever points are available.

Reference: https://stackoverflow.com/a/38507725

Expand All @@ -51,7 +55,6 @@ def _slide_window_sum(arr, k):
sarr: np.ndarray
Array of same length of arr, holding the sliding window sum.
"""

if not isinstance(k, int):
raise ValueError('k must be int.')
temp = np.append(np.zeros(k - 1), arr)
Expand All @@ -61,12 +64,11 @@ def _slide_window_sum(arr, k):

def _geographical_pooling(tpooled_tests, tpooled_ptests, min_obs, max_borrow_obs):
"""
Calculates the proportion of parent samples (tests) that must be "borrowed"
in order to properly compute the statistic. If there are no samples
available in the parent, the borrow_prop is 0. If the parent does not
Calculate proportion of parent samples (tests) that must be "borrowed" in order to compute the statistic.

If there are no samples available in the parent, the borrow_prop is 0. If the parent does not
have enough samples, we return a borrow_prop of 1, and the fact that the
pooled samples are insufficient are handled in the statistic fitting
step.
pooled samples are insufficient are handled in the statistic fitting step.

Args:
tpooled_tests: np.ndarray[float]
Expand Down Expand Up @@ -117,8 +119,7 @@ def _geographical_pooling(tpooled_tests, tpooled_ptests, min_obs, max_borrow_obs

def raw_positive_prop(positives, tests, min_obs):
"""
Calculates the proportion of positive tests for a single geographic
location, without any temporal smoothing.
Calculate the proportion of positive tests for a single geographic location, without any temporal smoothing.

If on any day t, tests[t] < min_obs, then we report np.nan.

Expand Down Expand Up @@ -171,8 +172,7 @@ def raw_positive_prop(positives, tests, min_obs):
def smoothed_positive_prop(positives, tests, min_obs, max_borrow_obs, pool_days,
parent_positives=None, parent_tests=None):
"""
Calculates the proportion of negative tests for a single geographic
location, with temporal smoothing.
Calculate the proportion of negative tests for a single geographic location, with temporal smoothing.

For a given day t, if sum(tests[(t-pool_days+1):(t+1)]) < min_obs, then we
'borrow' min_obs - sum(tests[(t-pool_days+1):(t+1)]) observations from the
Expand Down Expand Up @@ -219,7 +219,6 @@ def smoothed_positive_prop(positives, tests, min_obs, max_borrow_obs, pool_days,
np.ndarray
Effective sample size (after temporal and geographic pooling).
"""

positives = positives.astype(float)
tests = tests.astype(float)
if (parent_positives is None) or (parent_tests is None):
Expand Down Expand Up @@ -264,9 +263,8 @@ def smoothed_positive_prop(positives, tests, min_obs, max_borrow_obs, pool_days,


def raw_tests_per_device(devices, tests, min_obs):
'''
Calculates the tests per device for a single geographic
location, without any temporal smoothing.
"""
Calculate the tests per device for a single geographic location, without any temporal smoothing.

If on any day t, tests[t] < min_obs, then we report np.nan.
The second and third returned np.ndarray are the standard errors,
Expand All @@ -289,7 +287,7 @@ def raw_tests_per_device(devices, tests, min_obs):
Placeholder for standard errors
np.ndarray
Sample size used to compute estimates.
'''
"""
devices = devices.astype(float)
tests = tests.astype(float)
if (np.any(np.isnan(devices)) or np.any(np.isnan(tests))):
Expand All @@ -309,8 +307,8 @@ def raw_tests_per_device(devices, tests, min_obs):
def smoothed_tests_per_device(devices, tests, min_obs, max_borrow_obs, pool_days,
parent_devices=None, parent_tests=None):
"""
Calculates the ratio of tests per device for a single geographic
location, with temporal smoothing.
Calculate the ratio of tests per device for a single geographic location, with temporal smoothing.

For a given day t, if sum(tests[(t-pool_days+1):(t+1)]) < min_obs, then we
'borrow' min_obs - sum(tests[(t-pool_days+1):(t+1)]) observations from the
parents over the same timespan. Importantly, it will make sure NOT to
Expand Down
10 changes: 5 additions & 5 deletions quidel/delphi_quidel/generate_sensor.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# -*- coding: utf-8 -*-
"""
Functions to help generate sensor for different geographical levels
"""
"""Functions to help generate sensor for different geographical levels."""
import pandas as pd
from .data_tools import (fill_dates, raw_positive_prop,
smoothed_positive_prop,
Expand All @@ -11,7 +9,8 @@

def generate_sensor_for_states(state_groups, smooth, device, first_date, last_date):
"""
fit over states
Fit over states.

Args:
state_groups: pd.groupby.generic.DataFrameGroupBy
state_key: "state_id"
Expand Down Expand Up @@ -70,7 +69,8 @@ def generate_sensor_for_states(state_groups, smooth, device, first_date, last_da
def generate_sensor_for_other_geores(state_groups, data, res_key, smooth,
device, first_date, last_date):
"""
fit over counties/HRRs/MSAs
Fit over counties/HRRs/MSAs.

Args:
data: pd.DataFrame
res_key: "fips", "cbsa_id" or "hrrnum"
Expand Down
5 changes: 5 additions & 0 deletions quidel/delphi_quidel/geo_maps.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains geographic mapping tools."""

def geo_map(geo_res, data, map_df):
"""Call appropriate mapping function based on desired geo resolution."""
if geo_res == "county":
Expand All @@ -11,6 +12,7 @@ def geo_map(geo_res, data, map_df):

def zip_to_msa(data, map_df):
"""Map from zipcode to MSA (along with parent state).

Args:
data: dataframe at the day-zip resolution.
Returns:
Expand All @@ -35,6 +37,7 @@ def zip_to_msa(data, map_df):

def zip_to_hrr(data, map_df):
"""Map from zipcode to HRR (along with parent state).

Args:
data: dataframe at the day-zip resolution.
Returns:
Expand All @@ -59,6 +62,7 @@ def zip_to_hrr(data, map_df):

def zip_to_county(data, map_df):
"""Aggregate zip codes to the county resolution, along with its parent state.

Args:
data: dataframe aggregated to the day-zip resolution
Returns:
Expand All @@ -74,6 +78,7 @@ def zip_to_county(data, map_df):

def zip_to_state(data, map_df):
"""Aggregate zip codes to the state resolution.

Args:
data: dataframe aggregated to the day-zip resolution
Returns:
Expand Down
39 changes: 18 additions & 21 deletions quidel/delphi_quidel/pull.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
"""Simply downloads email attachments.

Uses this handy package: https://pypi.org/project/imap-tools/
"""
import io
Expand All @@ -26,6 +27,7 @@
def compare_dates(date1, date2, flag):
"""
Compare two dates.

If op == "l" return the larger date
If op == "s" return the smaller date
"""
Expand All @@ -38,20 +40,15 @@ def compare_dates(date1, date2, flag):
return date1

def check_whether_date_in_range(search_date, start_date, end_date):
    """
    Check whether the search date is in a valid time range.

    Args:
        search_date: date to test
        start_date: earliest acceptable date (inclusive)
        end_date: latest acceptable date (inclusive)

    Returns:
        True if start_date <= search_date <= end_date, False otherwise.
    """
    # Chained comparison is equivalent to the two separate boundary checks.
    return start_date <= search_date <= end_date

def read_historical_data():
"""
Read historical flu antigen test data stored in
midas /common/quidel-historical-raw
"""
"""Read historical flu antigen test data stored in midas /common/quidel-historical-raw."""
pull_dir = "/common/quidel-historical-raw"
columns = ['SofiaSerNum', 'TestDate', 'Facility', 'ZipCode',
'FluA', 'FluB', 'StorageDate']
Expand All @@ -65,9 +62,9 @@ def read_historical_data():

def regulate_column_names(df, test_type):
"""
Regulate column names for flu_ag test data since Quidel changed their
column names multiple times. We want to finalize the column name list
to be:
Regulate column names for flu_ag test data since Quidel changed their column names multiple times.

We want to finalize the column name list to be:
['SofiaSerNum', 'TestDate', 'Facility',
'Zip', 'FluA', 'FluB', 'StorageDate']
"""
Expand All @@ -87,7 +84,7 @@ def regulate_column_names(df, test_type):
def get_from_email(column_names, start_dates, end_dates, mail_server,
account, sender, password):
"""
Get raw data from email account
Get raw data from email account.

Parameters:
start_date: datetime.datetime
Expand Down Expand Up @@ -145,9 +142,7 @@ def get_from_email(column_names, start_dates, end_dates, mail_server,
return dfs, time_flag

def fix_zipcode(df):
"""
Fix zipcode that is 9 digit instead of 5 digit
"""
"""Fix zipcode that is 9 digit instead of 5 digit."""
zipcode5 = []
fixnum = 0
for zipcode in df['Zip'].values:
Expand All @@ -163,6 +158,8 @@ def fix_zipcode(df):

def fix_date(df):
"""
Remove invalid dates and select correct test date to use.

Quidel antigen tests are labeled with Test Date and Storage Date. In principle,
the TestDate should reflect when the test was performed and the StorageDate
when the test was logged in the MyVirena cloud storage device. We expect
Expand Down Expand Up @@ -190,6 +187,7 @@ def preprocess_new_data(start_dates, end_dates, mail_server, account,
sender, password, test_mode):
"""
Pull and pre-process Quidel Antigen Test data from datadrop email.

Drop unnecessary columns. Temporarily consider the positive rate
sensor only which is related to number of total tests and number
of positive tests.
Expand Down Expand Up @@ -285,7 +283,7 @@ def preprocess_new_data(start_dates, end_dates, mail_server, account,

def check_intermediate_file(cache_dir, pull_start_dates):
"""
Check whether there is a cache file containing historical data already
Check whether there is a cache file containing historical data already.

Parameters:
cache_dir: str
Expand Down Expand Up @@ -313,8 +311,7 @@ def check_intermediate_file(cache_dir, pull_start_dates):

def pull_quidel_data(params):
"""
Pull the quidel test data. Decide whether to combine the newly
received data with stored historical records in ./cache
Pull the quidel test data and decide whether to combine the new data with stored historical records in ./cache.

Parameters:
params: dict
Expand Down Expand Up @@ -371,7 +368,8 @@ def pull_quidel_data(params):
def check_export_end_date(input_export_end_dates, _end_date,
end_from_today_minus):
"""
Update the export_end_date according to the data received
Update the export_end_date according to the data received.

By default, set the export end date to be the last pulling date - 5 days
(END_FROM_TODAY_MINUS = 5).
Otherwise, use the required date if it is earlier than the default one.
Expand Down Expand Up @@ -404,8 +402,7 @@ def check_export_end_date(input_export_end_dates, _end_date,
def check_export_start_date(export_start_dates, export_end_dates,
export_day_range):
"""
Update the export_start_date according to the export_end_date so that it
could be export_end_date - EXPORT_DAY_RANGE
Update export_start_date according to the export_end_date so that it could be export_end_date - EXPORT_DAY_RANGE.

Parameters:
export_start_date: dict
Expand Down Expand Up @@ -438,7 +435,7 @@ def check_export_start_date(export_start_dates, export_end_dates,

def update_cache_file(dfs, _end_date, cache_dir):
"""
Update cache file. Remove the old one, export the new one
Update cache file. Remove the old one, export the new one.

Parameter:
df: pd.DataFrame
Expand Down