 """Simply downloads email attachments.
 Uses this handy package: https://pypi.org/project/imap-tools/
 """
-from datetime import datetime, timedelta, date
 import io
+from os.path import join
 import os
+from datetime import datetime, timedelta

 import pandas as pd
 import numpy as np

 from imap_tools import MailBox, A, AND

-def get_from_email(start_date: datetime.date, end_date: datetime.date,
-                   mail_server: str, account: str, sender: str, password: str):
+def get_from_email(start_date, end_date, mail_server,
+                   account, sender, password):
     """
     Get raw data from email account
     Args:
-        start_date: datetime.date
+        start_date: datetime.datetime
             pull data from email received from the start date
-        end_date: datetime.date
+        end_date: datetime.datetime
             pull data from email received on/before the end date
         mail_server: str
         account: str
@@ -39,13 +40,13 @@ def get_from_email(start_date: datetime.date, end_date: datetime.date,
     with MailBox(mail_server).login(account, password, 'INBOX') as mailbox:
         for search_date in [start_date + timedelta(days=x)
                             for x in range((end_date - start_date).days + 1)]:
-            for message in mailbox.fetch(A(AND(date=search_date, from_=sender))):
+            for message in mailbox.fetch(A(AND(date=search_date.date(), from_=sender))):
                 for att in message.attachments:
                     name = att.filename
                     # Only consider covid tests
                     if "Sars" not in name:
                         continue
-                    print("Pulling data received on %s" % search_date)
+                    print("Pulling data received on %s" % search_date.date())
                     toread = io.BytesIO()
                     toread.write(att.payload)
                     toread.seek(0)  # reset the pointer
@@ -90,18 +91,18 @@ def fix_date(df):
     df["timestamp"].values[mask] = df["StorageDate"].values[mask]
     return df

-def pull_quidel_covidtest(start_date, end_date, mail_server, account,
-                          sender, password, test_mode):
+def preprocess_new_data(start_date, end_date, mail_server, account,
+                        sender, password, test_mode):
     """
     Pull and pre-process Quidel Covid Test data from datadrop email.
     Drop unnecessary columns. Temporarily consider the positive rate
     sensor only which is related to number of total tests and number
     of positive tests.

     Args:
-        start_date: datetime.date
+        start_date: datetime.datetime
             pull data from email received from the start date
-        end_date: datetime.date
+        end_date: datetime.datetime
             pull data from email received on/before the end date
         mail_server: str
         account: str
@@ -119,7 +120,7 @@ def pull_quidel_covidtest(start_date, end_date, mail_server, account,
     """
     if test_mode:
         test_data_dir = "./test_data/test_data.xlsx"
-        df, time_flag = pd.read_excel(test_data_dir), date(2020, 8, 17)
+        df, time_flag = pd.read_excel(test_data_dir), datetime(2020, 8, 17)
     else:
         # Get new data from email
         df, time_flag = get_from_email(start_date, end_date, mail_server,
@@ -166,12 +167,140 @@ def pull_quidel_covidtest(start_date, end_date, mail_server, account,
     return df_merged, time_flag

 def check_intermediate_file(cache_dir, pull_start_date):
+    """
+    Check whether there is a cache file containing historical data already
+    """
     for filename in os.listdir(cache_dir):
         if ".csv" in filename:
             pull_start_date = datetime.strptime(filename.split("_")[2].split(".")[0],
-                                                '%Y%m%d').date() + timedelta(days=1)
+                                                '%Y%m%d') + timedelta(days=1)
             previous_df = pd.read_csv(os.path.join(cache_dir, filename),
                                       sep=",", parse_dates=["timestamp"])
-            os.remove(os.path.join(cache_dir, filename))
             return previous_df, pull_start_date
     return None, pull_start_date
+
+def pull_quidel_covidtest(params):
+    """
+    Pull the quidel covid test data. Decide whether to combine the newly
+    received data with stored historical records in ./cache
+
+    Parameters:
+        params: dict
+            including all the information read from params.json
+        END_FROM_TODAY_MINUS: int
+            report data until - X days
+        EXPORT_DAY_RANGE: int
+            number of dates to report
+
+    Returns:
+        DataFrame:
+            A data frame containing the pre-processed data with columns:
+            timestamp, numUniqueDevices, positiveTest, totalTest
+        datetime.datetime
+            the first date of the report
+        datetime.datetime
+            the last date of the report
+    """
+    cache_dir = params["cache_dir"]
+
+    mail_server = params["mail_server"]
+    account = params["account"]
+    password = params["password"]
+    sender = params["sender"]
+
+    test_mode = (params["mode"] == "test")
+
+    # pull new data only that has not been ingested
+    previous_df, pull_start_date = check_intermediate_file(
+        cache_dir,
+        datetime.strptime(params["pull_start_date"], '%Y-%m-%d'))
+
+    if params["pull_end_date"] == "":
+        pull_end_date = datetime.today()
+    else:
+        pull_end_date = datetime.strptime(params["pull_end_date"], '%Y-%m-%d')
+
+    # Pull data from the email at 5 digit zipcode level
+    # Use _end_date to check the most recent date that we received data
+    df, _end_date = preprocess_new_data(
+        pull_start_date, pull_end_date, mail_server,
+        account, sender, password, test_mode)
+
+    # Utilize previously stored data
+    if previous_df is not None:
+        df = previous_df.append(df).groupby(["timestamp", "zip"]).sum().reset_index()
+    return df, _end_date
+
+def check_export_end_date(input_export_end_date, _end_date,
+                          END_FROM_TODAY_MINUS):
+    """
+    Update the export_end_date according to the data received
+    By default, set the export end date to be the last pulling date - 5 days
+    (END_FROM_TODAY_MINUS = 5).
+    Otherwise, use the required date if it is earlier than the default one.
+
+    Parameter:
+        input_export_end_date: str
+            read from params
+        _end_date: datetime.datetime
+            updated according to the data received
+        END_FROM_TODAY_MINUS: int
+            report data until - X days
+
+    Returns:
+        datetime.datetime
+            export data from which date
+    """
+    export_end_date = _end_date - timedelta(days=END_FROM_TODAY_MINUS)
+    if input_export_end_date != "":
+        input_export_end_date = datetime.strptime(input_export_end_date, '%Y-%m-%d')
+        if input_export_end_date < export_end_date:
+            return input_export_end_date
+    return export_end_date
+
+def check_export_start_date(export_start_date, export_end_date,
+                            EXPORT_DAY_RANGE):
+    """
+    Update the export_start_date according to the export_end_date so that it
+    could be export_end_date - EXPORT_DAY_RANGE
+
+    Parameters:
+        export_start_date: str
+            Read from params
+        export_end_date: datetime.datetime
+            Calculated according to the data received
+        EXPORT_DAY_RANGE: int
+            Number of days to report
+
+    Returns:
+        datetime.datetime
+            export data until which date
+    """
+    if export_start_date == "":
+        export_start_date = datetime(2020, 5, 26)
+    else:
+        export_start_date = datetime.strptime(export_start_date, '%Y-%m-%d')
+    # Only export data from -45 days to -5 days
+    if (export_end_date - export_start_date).days > EXPORT_DAY_RANGE:
+        export_start_date = export_end_date - timedelta(days=EXPORT_DAY_RANGE)

+    if export_start_date < datetime(2020, 5, 26):
+        return datetime(2020, 5, 26)
+    return export_start_date
+
+def update_cache_file(df, _end_date, cache_dir):
+    """
+    Update cache file. Remove the old one, export the new one
+
+    Parameter:
+        df: pd.DataFrame
+            Pre-processed file at ZipCode level
+        _end_date:
+            The most recent date when the raw data is received
+        cache_dir:
+            ./cache where the cache file is stored
+    """
+    for fn in os.listdir(cache_dir):
+        if ".csv" in fn:
+            os.remove(join(cache_dir, fn))
+    df.to_csv(join(cache_dir, "pulled_until_%s.csv") % _end_date.strftime("%Y%m%d"), index=False)
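For orientation, a minimal driver sketch follows, showing one way the helpers introduced in this diff could be chained together. The driver is not part of the commit; the import path, the params keys for the export window, the placeholder server/account values, and the constants END_FROM_TODAY_MINUS = 5 and EXPORT_DAY_RANGE = 40 are illustrative assumptions taken from the docstrings above.

# Hypothetical driver (not part of this diff): wires the new helpers together.
# Import path is assumed; the patched module above is referred to here as pull.py.
from pull import (pull_quidel_covidtest, check_export_end_date,
                  check_export_start_date, update_cache_file)

params = {
    "cache_dir": "./cache",
    "mail_server": "imap.example.com",   # placeholder
    "account": "user@example.com",       # placeholder
    "password": "********",              # placeholder
    "sender": "datadrop@example.com",    # placeholder
    "mode": "test",                      # "test" reads ./test_data/test_data.xlsx instead of email
    "pull_start_date": "2020-05-26",
    "pull_end_date": "",                 # empty string means "up to today"
    "export_start_date": "",             # assumed key; empty string falls back to 2020-05-26
    "export_end_date": "",               # assumed key; empty string uses _end_date - 5 days
}

# Pull new data (merged with any cached history) and note the last date data was received.
df, _end_date = pull_quidel_covidtest(params)

# Clamp the export window to the data actually received, per the docstring defaults.
export_end_date = check_export_end_date(params["export_end_date"], _end_date,
                                        END_FROM_TODAY_MINUS=5)
export_start_date = check_export_start_date(params["export_start_date"],
                                            export_end_date, EXPORT_DAY_RANGE=40)

# Replace the old cache file with the freshly pulled data.
update_cache_file(df, _end_date, params["cache_dir"])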