diff --git a/quidel/Makefile b/quidel/Makefile
index 56a71a88c..968732f99 100644
--- a/quidel/Makefile
+++ b/quidel/Makefile
@@ -13,7 +13,8 @@ install: venv
 
 lint:
 	. env/bin/activate; \
-	pylint $(dir)
+	pylint $(dir); \
+	pydocstyle $(dir)
 
 test:
 	. env/bin/activate ;\
diff --git a/quidel/delphi_quidel/constants.py b/quidel/delphi_quidel/constants.py
index e660d4f8e..6a905c945 100644
--- a/quidel/delphi_quidel/constants.py
+++ b/quidel/delphi_quidel/constants.py
@@ -1,4 +1,4 @@
-"""Registry for constants"""
+"""Registry for constants."""
 # global constants
 MIN_OBS = 50  # minimum number of observations in order to compute a proportion.
 MAX_BORROW_OBS = 20  # maximum number of observations can be borrowed in geographical pooling
diff --git a/quidel/delphi_quidel/data_tools.py b/quidel/delphi_quidel/data_tools.py
index fbbce4de7..9ada778ef 100644
--- a/quidel/delphi_quidel/data_tools.py
+++ b/quidel/delphi_quidel/data_tools.py
@@ -1,17 +1,20 @@
-"""
-Functions to calculate the quidel sensor statistic.
-"""
+"""Functions to calculate the quidel sensor statistic."""
 import numpy as np
 import pandas as pd
 
 
 def _prop_var(p, n):
-    """var(X/n) = 1/(n^2)var(X) = (npq)/(n^2) = pq/n"""
+    """
+    Calculate variance of proportion.
+
+    var(X/n) = 1/(n^2)var(X) = (npq)/(n^2) = pq/n
+    """
     return p * (1 - p) / n
 
 def fill_dates(y_data, first_date, last_date):
     """
     Ensure all dates are listed in the data, otherwise, add days with 0 counts.
+
     Args:
         y_data: dataframe with datetime index
         first_date: datetime.datetime
@@ -36,8 +39,9 @@ def fill_dates(y_data, first_date, last_date):
 
 def _slide_window_sum(arr, k):
     """
-    Sliding window sum, with fixed window size k. For indices 0:k, we
-    DO compute a sum, using whatever points are available.
+    Sliding window sum, with fixed window size k.
+
+    For indices 0:k, we DO compute a sum, using whatever points are available.
 
     Reference: https://stackoverflow.com/a/38507725
 
@@ -51,7 +55,6 @@ def _slide_window_sum(arr, k):
     sarr: np.ndarray
         Array of same length of arr, holding the sliding window sum.
     """
-
     if not isinstance(k, int):
         raise ValueError('k must be int.')
     temp = np.append(np.zeros(k - 1), arr)
@@ -61,12 +64,11 @@ def _slide_window_sum(arr, k):
 
 def _geographical_pooling(tpooled_tests, tpooled_ptests, min_obs, max_borrow_obs):
     """
-    Calculates the proportion of parent samples (tests) that must be "borrowed"
-    in order to properly compute the statistic. If there are no samples
-    available in the parent, the borrow_prop is 0. If the parent does not
+    Calculate proportion of parent samples (tests) that must be "borrowed" in order to compute the statistic.
+
+    If there are no samples available in the parent, the borrow_prop is 0. If the parent does not
     have enough samples, we return a borrow_prop of 1, and the fact that the
-    pooled samples are insufficient are handled in the statistic fitting
-    step.
+    pooled samples are insufficient are handled in the statistic fitting step.
 
     Args:
         tpooled_tests: np.ndarray[float]
@@ -117,8 +119,7 @@ def _geographical_pooling(tpooled_tests, tpooled_ptests, min_obs, max_borrow_obs
 
 
 def raw_positive_prop(positives, tests, min_obs):
     """
-    Calculates the proportion of positive tests for a single geographic
-    location, without any temporal smoothing.
+    Calculate the proportion of positive tests for a single geographic location, without any temporal smoothing.
 
     If on any day t, tests[t] < min_obs, then we report np.nan.
@@ -171,8 +172,7 @@ def raw_positive_prop(positives, tests, min_obs):
 def smoothed_positive_prop(positives, tests, min_obs, max_borrow_obs, pool_days,
                            parent_positives=None, parent_tests=None):
     """
-    Calculates the proportion of negative tests for a single geographic
-    location, with temporal smoothing.
+    Calculate the proportion of negative tests for a single geographic location, with temporal smoothing.
 
     For a given day t, if sum(tests[(t-pool_days+1):(t+1)]) < min_obs, then we
     'borrow' min_obs - sum(tests[(t-pool_days+1):(t+1)]) observations from the
@@ -219,7 +219,6 @@ def smoothed_positive_prop(positives, tests, min_obs, max_borrow_obs, pool_days,
     np.ndarray
         Effective sample size (after temporal and geographic pooling).
     """
-
     positives = positives.astype(float)
     tests = tests.astype(float)
     if (parent_positives is None) or (parent_tests is None):
@@ -264,9 +263,8 @@ def smoothed_positive_prop(positives, tests, min_obs, max_borrow_obs, pool_days,
 
 
 def raw_tests_per_device(devices, tests, min_obs):
-    '''
-    Calculates the tests per device for a single geographic
-    location, without any temporal smoothing.
+    """
+    Calculate the tests per device for a single geographic location, without any temporal smoothing.
 
     If on any day t, tests[t] < min_obs, then we report np.nan.
     The second and third returned np.ndarray are the standard errors,
@@ -289,7 +287,7 @@ def raw_tests_per_device(devices, tests, min_obs):
         Placeholder for standard errors
     np.ndarray
         Sample size used to compute estimates.
-    '''
+    """
     devices = devices.astype(float)
     tests = tests.astype(float)
     if (np.any(np.isnan(devices)) or np.any(np.isnan(tests))):
@@ -309,8 +307,8 @@ def raw_tests_per_device(devices, tests, min_obs):
 def smoothed_tests_per_device(devices, tests, min_obs, max_borrow_obs, pool_days,
                               parent_devices=None, parent_tests=None):
     """
-    Calculates the ratio of tests per device for a single geographic
-    location, with temporal smoothing.
+    Calculate the ratio of tests per device for a single geographic location, with temporal smoothing.
+
     For a given day t, if sum(tests[(t-pool_days+1):(t+1)]) < min_obs, then we
     'borrow' min_obs - sum(tests[(t-pool_days+1):(t+1)]) observations from the
     parents over the same timespan. Importantly, it will make sure NOT to
diff --git a/quidel/delphi_quidel/generate_sensor.py b/quidel/delphi_quidel/generate_sensor.py
index 7558f81c6..43778c9b4 100644
--- a/quidel/delphi_quidel/generate_sensor.py
+++ b/quidel/delphi_quidel/generate_sensor.py
@@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-
-"""
-Functions to help generate sensor for different geographical levels
-"""
+"""Functions to help generate sensor for different geographical levels."""
 import pandas as pd
 from .data_tools import (fill_dates, raw_positive_prop,
                          smoothed_positive_prop,
@@ -11,7 +9,8 @@ def generate_sensor_for_states(state_groups, smooth, device, first_date, last_date):
     """
-    fit over states
+    Fit over states.
+
     Args:
         state_groups: pd.groupby.generic.DataFrameGroupBy
         state_key: "state_id"
@@ -70,7 +69,8 @@ def generate_sensor_for_states(state_groups, smooth, device, first_date, last_da
 def generate_sensor_for_other_geores(state_groups, data, res_key, smooth,
                                      device, first_date, last_date):
     """
-    fit over counties/HRRs/MSAs
+    Fit over counties/HRRs/MSAs.
+
     Args:
         data: pd.DataFrame
         res_key: "fips", "cbsa_id" or "hrrnum"
diff --git a/quidel/delphi_quidel/geo_maps.py b/quidel/delphi_quidel/geo_maps.py
index f868e2748..03f4f61cf 100644
--- a/quidel/delphi_quidel/geo_maps.py
+++ b/quidel/delphi_quidel/geo_maps.py
@@ -1,4 +1,5 @@
 """Contains geographic mapping tools."""
+
 def geo_map(geo_res, data, map_df):
     """Call appropriate mapping function based on desired geo resolution."""
     if geo_res == "county":
@@ -11,6 +12,7 @@ def geo_map(geo_res, data, map_df):
 
 def zip_to_msa(data, map_df):
     """Map from zipcode to MSA (along with parent state).
+
     Args:
         data: dataframe at the day-zip resolution.
     Returns:
@@ -35,6 +37,7 @@ def zip_to_msa(data, map_df):
 
 def zip_to_hrr(data, map_df):
     """Map from zipcode to HRR (along with parent state).
+
     Args:
         data: dataframe at the day-zip resolution.
     Returns:
@@ -59,6 +62,7 @@ def zip_to_hrr(data, map_df):
 
 def zip_to_county(data, map_df):
     """Aggregate zip codes to the county resolution, along with its parent state.
+
     Args:
         data: dataframe aggregated to the day-zip resolution
     Returns:
@@ -74,6 +78,7 @@ def zip_to_county(data, map_df):
 
 def zip_to_state(data, map_df):
     """Aggregate zip codes to the state resolution.
+
     Args:
         data: dataframe aggregated to the day-zip resolution
     Returns:
diff --git a/quidel/delphi_quidel/pull.py b/quidel/delphi_quidel/pull.py
index 22fc3b6d0..18304d012 100644
--- a/quidel/delphi_quidel/pull.py
+++ b/quidel/delphi_quidel/pull.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 """Simply downloads email attachments.
+
 Uses this handy package: https://pypi.org/project/imap-tools/
 """
 import io
@@ -26,6 +27,7 @@ def compare_dates(date1, date2, flag):
     """
     Compare two dates.
+
     If op == "l" return the larger date
     If op == "s" return the smaller date
     """
@@ -38,9 +40,7 @@ def compare_dates(date1, date2, flag):
     return date1
 
 def check_whether_date_in_range(search_date, start_date, end_date):
-    """
-    Check whether the search date is in a valid time range
-    """
+    """Check whether the search date is in a valid time range."""
     if search_date > end_date:
         return False
@@ -48,10 +48,7 @@ def check_whether_date_in_range(search_date, start_date, end_date):
     return True
 
 def read_historical_data():
-    """
-    Read historical flu antigen test data stored in
-    midas /common/quidel-historical-raw
-    """
+    """Read historical flu antigen test data stored in midas /common/quidel-historical-raw."""
     pull_dir = "/common/quidel-historical-raw"
     columns = ['SofiaSerNum', 'TestDate', 'Facility', 'ZipCode',
                'FluA', 'FluB', 'StorageDate']
@@ -65,9 +62,9 @@ def read_historical_data():
 
 def regulate_column_names(df, test_type):
     """
-    Regulate column names for flu_ag test data since Quidel changed their
-    column names multiple times. We want to finalize the column name list
-    to be:
+    Regulate column names for flu_ag test data since Quidel changed their column names multiple times.
+
+    We want to finalize the column name list to be:
     ['SofiaSerNum', 'TestDate', 'Facility', 'Zip', 'FluA', 'FluB',
      'StorageDate']
     """
@@ -87,7 +84,7 @@ def regulate_column_names(df, test_type):
 def get_from_email(column_names, start_dates, end_dates, mail_server,
                    account, sender, password):
     """
-    Get raw data from email account
+    Get raw data from email account.
 
     Parameters:
         start_date: datetime.datetime
@@ -145,9 +142,7 @@ def get_from_email(column_names, start_dates, end_dates, mail_server,
     return dfs, time_flag
 
 def fix_zipcode(df):
-    """
-    Fix zipcode that is 9 digit instead of 5 digit
-    """
+    """Fix zipcode that is 9 digit instead of 5 digit."""
    zipcode5 = []
    fixnum = 0
    for zipcode in df['Zip'].values:
@@ -163,6 +158,8 @@ def fix_zipcode(df):
 
 def fix_date(df):
     """
+    Remove invalid dates and select correct test date to use.
+
     Quidel antigen tests are labeled with Test Date and Storage Date. In principle,
     the TestDate should reflect when the test was performed and the StorageDate
     when the test was logged in the MyVirena cloud storage device. We expect
@@ -190,6 +187,7 @@ def preprocess_new_data(start_dates, end_dates, mail_server, account,
                         sender, password, test_mode):
     """
     Pull and pre-process Quidel Antigen Test data from datadrop email.
+
     Drop unnecessary columns. Temporarily consider the positive rate
     sensor only which is related to number of total tests and number
     of positive tests.
@@ -285,7 +283,7 @@ def check_intermediate_file(cache_dir, pull_start_dates):
     """
-    Check whether there is a cache file containing historical data already
+    Check whether there is a cache file containing historical data already.
 
     Parameters:
         cache_dir: str
@@ -313,8 +311,7 @@ def pull_quidel_data(params):
     """
-    Pull the quidel test data. Decide whether to combine the newly
-    received data with stored historical records in ./cache
+    Pull the quidel test data and decide whether to combine the new data with stored historical records in ./cache.
 
     Parameters:
         params: dict
@@ -371,7 +368,8 @@ def check_export_end_date(input_export_end_dates, _end_date,
                           end_from_today_minus):
     """
-    Update the export_end_date according to the data received
+    Update the export_end_date according to the data received.
+
     By default, set the export end date to be the last pulling date - 5 days
     (END_FROM_TODAY_MINUS = 5).
     Otherwise, use the required date if it is earlier than the default one.
@@ -404,8 +402,7 @@ def check_export_start_date(export_start_dates, export_end_dates,
                             export_day_range):
     """
-    Update the export_start_date according to the export_end_date so that it
-    could be export_end_date - EXPORT_DAY_RANGE
+    Update export_start_date according to the export_end_date so that it could be export_end_date - EXPORT_DAY_RANGE.
 
     Parameters:
         export_start_date: dict
@@ -438,7 +435,7 @@ def check_export_start_date(export_start_dates, export_end_dates,
 
 def update_cache_file(dfs, _end_date, cache_dir):
     """
-    Update cache file. Remove the old one, export the new one
+    Update cache file. Remove the old one, export the new one.
 
     Parameter:
         df: pd.DataFrame
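
Notes on the changes above (illustrative sketches, not part of the change set):

The docstring edits all serve the pydocstyle step newly chained into the Makefile's lint target. A minimal sketch of the PEP 257 conventions being enforced, using _prop_var from this diff; the rule codes are pydocstyle's standard ones, and the before/after pairing is for illustration only:

    # Before: trips pydocstyle D400 (first line should end with a period)
    # and D403 (first word of the first line should be capitalized).
    def _prop_var(p, n):
        """var(X/n) = 1/(n^2)var(X) = (npq)/(n^2) = pq/n"""
        return p * (1 - p) / n

    # After: capitalized, imperative, period-terminated summary (D400, D401,
    # D403), with a blank line before the extended description (D205).
    def _prop_var(p, n):
        """
        Calculate variance of proportion.

        var(X/n) = 1/(n^2)var(X) = (npq)/(n^2) = pq/n
        """
        return p * (1 - p) / n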
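The reflowed _slide_window_sum docstring describes sums over trailing windows of fixed size k, with partial sums for the first k-1 indices, and cites https://stackoverflow.com/a/38507725. A self-contained sketch of that cumulative-sum trick, written as a standalone re-implementation for illustration rather than the module's exact code:

    import numpy as np

    def slide_window_sum(arr, k):
        """Trailing window sums of size k; partial sums before index k-1."""
        # Left-pad with k-1 zeros so the early windows sum over however
        # many real points are available.
        padded = np.append(np.zeros(k - 1), np.asarray(arr, dtype=float))
        csum = np.cumsum(padded)
        # The difference of cumulative sums k apart is each window's total.
        csum[k:] = csum[k:] - csum[:-k]
        return csum[k - 1:]

    print(slide_window_sum([1, 2, 3, 4, 5], 3))  # -> [1. 3. 6. 9. 12.]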
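The new _geographical_pooling summary states the borrowing rule: borrow_prop is 0 when the parent has no samples, 1 when the parent cannot cover the shortfall, and otherwise the fraction of parent samples needed to reach min_obs. A hypothetical scalar reduction of that rule; the real function operates on arrays and its handling of max_borrow_obs may differ:

    MIN_OBS = 50         # from constants.py in this diff
    MAX_BORROW_OBS = 20  # from constants.py in this diff

    def borrow_prop(pooled_tests, parent_pooled_tests):
        """Scalar sketch of the borrowing rule described in the docstring."""
        if pooled_tests >= MIN_OBS:
            return 0.0   # enough observations already; borrow nothing
        if parent_pooled_tests == 0:
            return 0.0   # no parent samples available to borrow
        needed = min(MIN_OBS - pooled_tests, MAX_BORROW_OBS)
        if parent_pooled_tests < needed:
            return 1.0   # parent insufficient; handled in the fitting step
        return needed / parent_pooled_tests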