diff --git a/src/acquisition/covidcast/csv_importer.py b/src/acquisition/covidcast/csv_importer.py
index 7b88ba00f..4f89c2b1c 100644
--- a/src/acquisition/covidcast/csv_importer.py
+++ b/src/acquisition/covidcast/csv_importer.py
@@ -1,19 +1,31 @@
 """Collects and reads covidcast data from a set of local CSV files."""
 
 # standard library
+from dataclasses import dataclass
 from datetime import date
 import glob
 import os
 import re
 
 # third party
-import pandas
+import pandas as pd
 import epiweeks as epi
 
 # first party
 from delphi_utils import Nans
 from delphi.utils.epiweek import delta_epiweeks
-from delphi.epidata.acquisition.covidcast.logger import get_structured_logger
+from .logger import get_structured_logger
+
+@dataclass
+class CsvRowValue:
+  """A container for the values of a single validated covidcast CSV row."""
+  geo_value: str
+  value: float
+  stderr: float
+  sample_size: float
+  missing_value: int
+  missing_stderr: int
+  missing_sample_size: int
 
 class CsvImporter:
   """Finds and parses covidcast CSV files."""
@@ -37,6 +49,7 @@ class CsvImporter:
   MIN_YEAR = 2019
   MAX_YEAR = 2030
 
+  # The datatypes expected by pandas.read_csv. Int64 is like float in that it can handle both numbers and nans.
   DTYPES = {
     "geo_id": str,
     "val": float,
@@ -47,20 +60,6 @@ class CsvImporter:
     "missing_sample_size": "Int64"
   }
 
-  # NOTE: this should be a Python 3.7+ `dataclass`, but the server is on 3.4
-  # See https://docs.python.org/3/library/dataclasses.html
-  class RowValues:
-    """A container for the values of a single covidcast row."""
-
-    def __init__(self, geo_value, value, stderr, sample_size, missing_value, missing_stderr, missing_sample_size):
-      self.geo_value = geo_value
-      self.value = value
-      self.stderr = stderr
-      self.sample_size = sample_size
-      self.missing_value = missing_value
-      self.missing_stderr = missing_stderr
-      self.missing_sample_size = missing_sample_size
-
   @staticmethod
   def is_sane_day(value):
     """Return whether `value` is a sane (maybe not valid) YYYYMMDD date.
@@ -184,7 +183,7 @@ def is_header_valid(columns):
     return set(columns) >= CsvImporter.REQUIRED_COLUMNS
 
   @staticmethod
-  def floaty_int(value):
+  def floaty_int(value: str) -> int:
     """Cast a string to an int, even if it looks like a float.
 
     For example, "-1" and "-1.0" should both result in -1. Non-integer floats
@@ -253,7 +252,7 @@ def validate_missing_code(row, attr_quantity, attr_name, filepath=None, logger=N
 
   @staticmethod
   def extract_and_check_row(row, geo_type, filepath=None):
-    """Extract and return `RowValues` from a CSV row, with sanity checks.
+    """Extract and return `CsvRowValue` from a CSV row, with sanity checks.
 
     Also returns the name of the field which failed sanity check, or None.
 
@@ -330,14 +329,10 @@ def extract_and_check_row(row, geo_type, filepath=None):
       missing_sample_size = CsvImporter.validate_missing_code(row, sample_size, "sample_size", filepath)
 
     # return extracted and validated row values
-    row_values = CsvImporter.RowValues(
-      geo_id, value, stderr, sample_size,
-      missing_value, missing_stderr, missing_sample_size
-    )
-    return (row_values, None)
+    return (CsvRowValue(geo_id, value, stderr, sample_size, missing_value, missing_stderr, missing_sample_size), None)
 
   @staticmethod
-  def load_csv(filepath, geo_type, pandas=pandas):
+  def load_csv(filepath, geo_type):
     """Load, validate, and yield data as `RowValues` from a CSV file.
 
     filepath: the CSV file to be loaded
@@ -349,10 +344,10 @@ def load_csv(filepath, geo_type, pandas=pandas):
 
     logger = get_structured_logger('load_csv')
     try:
-      table = pandas.read_csv(filepath, dtype=CsvImporter.DTYPES)
+      table = pd.read_csv(filepath, dtype=CsvImporter.DTYPES)
     except ValueError as e:
       logger.warning(event='Failed to open CSV with specified dtypes, switching to str', detail=str(e), file=filepath)
-      table = pandas.read_csv(filepath, dtype='str')
+      table = pd.read_csv(filepath, dtype='str')
 
     if not CsvImporter.is_header_valid(table.columns):
       logger.warning(event='invalid header', detail=table.columns, file=filepath)
diff --git a/tests/acquisition/covidcast/test_csv_importer.py b/tests/acquisition/covidcast/test_csv_importer.py
index a99eb3a1a..3d3c22ecc 100644
--- a/tests/acquisition/covidcast/test_csv_importer.py
+++ b/tests/acquisition/covidcast/test_csv_importer.py
@@ -5,17 +5,15 @@
 from unittest.mock import MagicMock
 from unittest.mock import patch
 from datetime import date
-import math
 
 import numpy as np
-import os
 
 # third party
-import pandas
+import pandas as pd
 import epiweeks as epi
 from delphi_utils import Nans
 
-from delphi.epidata.acquisition.covidcast.csv_importer import CsvImporter
 from delphi.utils.epiweek import delta_epiweeks
+from delphi.epidata.acquisition.covidcast.csv_importer import CsvImporter, CsvRowValue
 
 # py3tester coverage target
 __test_target__ = 'delphi.epidata.acquisition.covidcast.csv_importer'
@@ -208,37 +206,38 @@ def make_row(
       self.assertEqual(error, field)
 
     success_cases = [
-      (make_row(), CsvImporter.RowValues('vi', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)),
-      (make_row(value=None, stderr=np.nan, sample_size='', missing_value=str(float(Nans.DELETED)), missing_stderr=str(float(Nans.DELETED)), missing_sample_size=str(float(Nans.DELETED))), CsvImporter.RowValues('vi', None, None, None, Nans.DELETED, Nans.DELETED, Nans.DELETED)),
-      (make_row(stderr='', sample_size='NA', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.OTHER))), CsvImporter.RowValues('vi', 1.23, None, None, Nans.NOT_MISSING, Nans.OTHER, Nans.OTHER)),
-      (make_row(sample_size=None, missing_value='missing_value', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.NOT_MISSING))), CsvImporter.RowValues('vi', 1.23, 4.56, None, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.OTHER)),
+      (make_row(), CsvRowValue('vi', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)),
+      (make_row(value=None, stderr=np.nan, sample_size='', missing_value=str(float(Nans.DELETED)), missing_stderr=str(float(Nans.DELETED)), missing_sample_size=str(float(Nans.DELETED))), CsvRowValue('vi', None, None, None, Nans.DELETED, Nans.DELETED, Nans.DELETED)),
+      (make_row(stderr='', sample_size='NA', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.OTHER))), CsvRowValue('vi', 1.23, None, None, Nans.NOT_MISSING, Nans.OTHER, Nans.OTHER)),
+      (make_row(sample_size=None, missing_value='missing_value', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.NOT_MISSING))), CsvRowValue('vi', 1.23, 4.56, None, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.OTHER)),
     ]
 
     for ((geo_type, row), field) in success_cases:
       values, error = CsvImporter.extract_and_check_row(row, geo_type)
       self.assertIsNone(error)
-      self.assertIsInstance(values, CsvImporter.RowValues)
+      self.assertIsInstance(values, CsvRowValue)
       self.assertEqual(values.geo_value, field.geo_value)
       self.assertEqual(values.value, field.value)
       self.assertEqual(values.stderr, field.stderr)
       self.assertEqual(values.sample_size, field.sample_size)
 
-  def test_load_csv_with_invalid_header(self):
+  @patch("pandas.read_csv")
+  def test_load_csv_with_invalid_header(self, mock_read_csv):
     """Bail loading a CSV when the header is invalid."""
 
     data = {'foo': [1, 2, 3]}
-    mock_pandas = MagicMock()
-    mock_pandas.read_csv.return_value = pandas.DataFrame(data=data)
     filepath = 'path/name.csv'
     geo_type = 'state'
 
-    rows = list(CsvImporter.load_csv(filepath, geo_type, pandas=mock_pandas))
+    mock_read_csv.return_value = pd.DataFrame(data)
+    rows = list(CsvImporter.load_csv(filepath, geo_type))
 
-    self.assertTrue(mock_pandas.read_csv.called)
-    self.assertTrue(mock_pandas.read_csv.call_args[0][0], filepath)
+    self.assertTrue(mock_read_csv.called)
+    self.assertTrue(mock_read_csv.call_args[0][0], filepath)
     self.assertEqual(rows, [None])
 
-  def test_load_csv_with_valid_header(self):
+  @patch("pandas.read_csv")
+  def test_load_csv_with_valid_header(self, mock_read_csv):
     """Yield sanity checked `RowValues` from a valid CSV file."""
 
     # one invalid geo_id, but otherwise valid
@@ -248,15 +247,14 @@ def test_load_csv_with_valid_header(self):
       'se': ['2.1', '2.2', '2.3', '2.4'],
       'sample_size': ['301', '302', '303', '304'],
     }
-    mock_pandas = MagicMock()
-    mock_pandas.read_csv.return_value = pandas.DataFrame(data=data)
     filepath = 'path/name.csv'
     geo_type = 'state'
 
-    rows = list(CsvImporter.load_csv(filepath, geo_type, pandas=mock_pandas))
+    mock_read_csv.return_value = pd.DataFrame(data=data)
+    rows = list(CsvImporter.load_csv(filepath, geo_type))
 
-    self.assertTrue(mock_pandas.read_csv.called)
-    self.assertTrue(mock_pandas.read_csv.call_args[0][0], filepath)
+    self.assertTrue(mock_read_csv.called)
+    self.assertTrue(mock_read_csv.call_args[0][0], filepath)
     self.assertEqual(len(rows), 4)
 
     self.assertEqual(rows[0].geo_value, 'ca')
@@ -286,15 +284,14 @@ def test_load_csv_with_valid_header(self):
       'missing_stderr': [Nans.NOT_MISSING, Nans.REGION_EXCEPTION, Nans.NOT_MISSING, Nans.NOT_MISSING] + [None],
       'missing_sample_size': [Nans.NOT_MISSING] * 2 + [Nans.REGION_EXCEPTION] * 2 + [None]
     }
-    mock_pandas = MagicMock()
-    mock_pandas.read_csv.return_value = pandas.DataFrame(data=data)
     filepath = 'path/name.csv'
     geo_type = 'state'
 
-    rows = list(CsvImporter.load_csv(filepath, geo_type, pandas=mock_pandas))
+    mock_read_csv.return_value = pd.DataFrame(data)
+    rows = list(CsvImporter.load_csv(filepath, geo_type))
 
-    self.assertTrue(mock_pandas.read_csv.called)
-    self.assertTrue(mock_pandas.read_csv.call_args[0][0], filepath)
+    self.assertTrue(mock_read_csv.called)
+    self.assertTrue(mock_read_csv.call_args[0][0], filepath)
     self.assertEqual(len(rows), 5)
 
     self.assertEqual(rows[0].geo_value, 'ca')
diff --git a/tests/acquisition/covidcast/test_csv_to_database.py b/tests/acquisition/covidcast/test_csv_to_database.py
index 0b91815fb..3f037f6b1 100644
--- a/tests/acquisition/covidcast/test_csv_to_database.py
+++ b/tests/acquisition/covidcast/test_csv_to_database.py
@@ -6,8 +6,7 @@ import unittest
 from unittest.mock import MagicMock
 
-from delphi.epidata.acquisition.covidcast.csv_to_database import get_argument_parser, main, \
-  collect_files, upload_archive, make_handlers
+from delphi.epidata.acquisition.covidcast.csv_to_database import get_argument_parser, main, collect_files, upload_archive, make_handlers
 
 # py3tester coverage target
 __test_target__ = 'delphi.epidata.acquisition.covidcast.csv_to_database'
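For reviewers, a standalone sketch of the dataclass pattern this PR adopts. The field names and types are copied from the diff; the generated __init__ accepts the same positional arguments the hand-written RowValues.__init__ did, which is why extract_and_check_row and the tests can construct the container unchanged. The demo values below are illustrative only (plain ints stand in for Nans codes) and this is not an import of the real module.

# Sketch only: mirrors the CsvRowValue dataclass from the diff to show what
# @dataclass generates for free (an __init__ taking the fields in order,
# plus __repr__ and __eq__). Not the project's actual module.
from dataclasses import dataclass

@dataclass
class CsvRowValue:
  """A container for the values of a single validated covidcast CSV row."""
  geo_value: str
  value: float
  stderr: float
  sample_size: float
  missing_value: int
  missing_stderr: int
  missing_sample_size: int

# Positional construction matches the old RowValues(...) call sites.
# The trailing zeros stand in for Nans missing codes in this sketch.
row = CsvRowValue('ca', 1.5, 2.1, 301.0, 0, 0, 0)
assert row.geo_value == 'ca'
# The generated __eq__ compares field by field, unlike the old plain class,
# whose instances compared by identity only.
assert row == CsvRowValue('ca', 1.5, 2.1, 301.0, 0, 0, 0)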
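The tests also drop the pandas=pandas injection parameter in favor of unittest.mock.patch. A minimal sketch of why that works, assuming only that the importer reaches read_csv through the pandas module (as pd.read_csv does after import pandas as pd); load_frame here is a hypothetical stand-in for CsvImporter.load_csv, not the real method.

# Sketch of the @patch("pandas.read_csv") pattern used by the updated tests.
# load_frame is a hypothetical stand-in for CsvImporter.load_csv.
import unittest
from unittest.mock import patch

import pandas as pd


def load_frame(filepath):
  # Looks up read_csv on the pandas module at call time, so patching
  # "pandas.read_csv" intercepts this call without an injection parameter.
  return pd.read_csv(filepath, dtype='str')


class LoadFrameTest(unittest.TestCase):

  @patch("pandas.read_csv")
  def test_load_frame(self, mock_read_csv):
    # The mock replaces pandas.read_csv for the duration of the test.
    mock_read_csv.return_value = pd.DataFrame({'geo_id': ['ca'], 'val': ['1.5']})

    table = load_frame('path/name.csv')

    self.assertTrue(mock_read_csv.called)
    self.assertEqual(mock_read_csv.call_args[0][0], 'path/name.csv')
    self.assertEqual(list(table.columns), ['geo_id', 'val'])


if __name__ == '__main__':
  unittest.main()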