Skip to content

[Refactor] Clarify a few csv_importer types and names #949

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 21 additions & 26 deletions src/acquisition/covidcast/csv_importer.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,31 @@
"""Collects and reads covidcast data from a set of local CSV files."""

# standard library
from dataclasses import dataclass
from datetime import date
import glob
import os
import re

# third party
import pandas
import pandas as pd
import epiweeks as epi

# first party
from delphi_utils import Nans
from delphi.utils.epiweek import delta_epiweeks
from delphi.epidata.acquisition.covidcast.logger import get_structured_logger
from .logger import get_structured_logger

@dataclass
class CsvRowValue:
"""A container for the values of a single validated covidcast CSV row."""
geo_value: str
value: float
stderr: float
sample_size: float
missing_value: int
missing_stderr: int
missing_sample_size: int

class CsvImporter:
"""Finds and parses covidcast CSV files."""
Expand All @@ -37,6 +49,7 @@ class CsvImporter:
MIN_YEAR = 2019
MAX_YEAR = 2030

# The datatypes expected by pandas.read_csv. Int64 is like float in that it can handle both numbers and nans.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🏆

DTYPES = {
"geo_id": str,
"val": float,
Expand All @@ -47,20 +60,6 @@ class CsvImporter:
"missing_sample_size": "Int64"
}

# NOTE: this should be a Python 3.7+ `dataclass`, but the server is on 3.4
# See https://docs.python.org/3/library/dataclasses.html
class RowValues:
"""A container for the values of a single covidcast row."""

def __init__(self, geo_value, value, stderr, sample_size, missing_value, missing_stderr, missing_sample_size):
self.geo_value = geo_value
self.value = value
self.stderr = stderr
self.sample_size = sample_size
self.missing_value = missing_value
self.missing_stderr = missing_stderr
self.missing_sample_size = missing_sample_size

@staticmethod
def is_sane_day(value):
"""Return whether `value` is a sane (maybe not valid) YYYYMMDD date.
Expand Down Expand Up @@ -184,7 +183,7 @@ def is_header_valid(columns):
return set(columns) >= CsvImporter.REQUIRED_COLUMNS

@staticmethod
def floaty_int(value):
def floaty_int(value: str) -> int:
"""Cast a string to an int, even if it looks like a float.

For example, "-1" and "-1.0" should both result in -1. Non-integer floats
Expand Down Expand Up @@ -253,7 +252,7 @@ def validate_missing_code(row, attr_quantity, attr_name, filepath=None, logger=N

@staticmethod
def extract_and_check_row(row, geo_type, filepath=None):
"""Extract and return `RowValues` from a CSV row, with sanity checks.
"""Extract and return `CsvRowValue` from a CSV row, with sanity checks.

Also returns the name of the field which failed sanity check, or None.

Expand Down Expand Up @@ -330,14 +329,10 @@ def extract_and_check_row(row, geo_type, filepath=None):
missing_sample_size = CsvImporter.validate_missing_code(row, sample_size, "sample_size", filepath)

# return extracted and validated row values
row_values = CsvImporter.RowValues(
geo_id, value, stderr, sample_size,
missing_value, missing_stderr, missing_sample_size
)
return (row_values, None)
return (CsvRowValue(geo_id, value, stderr, sample_size, missing_value, missing_stderr, missing_sample_size), None)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find this harder to read but I will hold my nose and accept it as the way of the future


@staticmethod
def load_csv(filepath, geo_type, pandas=pandas):
def load_csv(filepath, geo_type):
"""Load, validate, and yield data as `RowValues` from a CSV file.

filepath: the CSV file to be loaded
Expand All @@ -349,10 +344,10 @@ def load_csv(filepath, geo_type, pandas=pandas):
logger = get_structured_logger('load_csv')

try:
table = pandas.read_csv(filepath, dtype=CsvImporter.DTYPES)
table = pd.read_csv(filepath, dtype=CsvImporter.DTYPES)
except ValueError as e:
logger.warning(event='Failed to open CSV with specified dtypes, switching to str', detail=str(e), file=filepath)
table = pandas.read_csv(filepath, dtype='str')
table = pd.read_csv(filepath, dtype='str')

if not CsvImporter.is_header_valid(table.columns):
logger.warning(event='invalid header', detail=table.columns, file=filepath)
Expand Down
49 changes: 23 additions & 26 deletions tests/acquisition/covidcast/test_csv_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,15 @@
from unittest.mock import MagicMock
from unittest.mock import patch
from datetime import date
import math
import numpy as np
import os

# third party
import pandas
import pandas as pd
import epiweeks as epi

from delphi_utils import Nans
from delphi.epidata.acquisition.covidcast.csv_importer import CsvImporter
from delphi.utils.epiweek import delta_epiweeks
from delphi.epidata.acquisition.covidcast.csv_importer import CsvImporter, CsvRowValue

# py3tester coverage target
__test_target__ = 'delphi.epidata.acquisition.covidcast.csv_importer'
Expand Down Expand Up @@ -208,37 +206,38 @@ def make_row(
self.assertEqual(error, field)

success_cases = [
(make_row(), CsvImporter.RowValues('vi', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)),
(make_row(value=None, stderr=np.nan, sample_size='', missing_value=str(float(Nans.DELETED)), missing_stderr=str(float(Nans.DELETED)), missing_sample_size=str(float(Nans.DELETED))), CsvImporter.RowValues('vi', None, None, None, Nans.DELETED, Nans.DELETED, Nans.DELETED)),
(make_row(stderr='', sample_size='NA', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.OTHER))), CsvImporter.RowValues('vi', 1.23, None, None, Nans.NOT_MISSING, Nans.OTHER, Nans.OTHER)),
(make_row(sample_size=None, missing_value='missing_value', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.NOT_MISSING))), CsvImporter.RowValues('vi', 1.23, 4.56, None, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.OTHER)),
(make_row(), CsvRowValue('vi', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)),
(make_row(value=None, stderr=np.nan, sample_size='', missing_value=str(float(Nans.DELETED)), missing_stderr=str(float(Nans.DELETED)), missing_sample_size=str(float(Nans.DELETED))), CsvRowValue('vi', None, None, None, Nans.DELETED, Nans.DELETED, Nans.DELETED)),
(make_row(stderr='', sample_size='NA', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.OTHER))), CsvRowValue('vi', 1.23, None, None, Nans.NOT_MISSING, Nans.OTHER, Nans.OTHER)),
(make_row(sample_size=None, missing_value='missing_value', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.NOT_MISSING))), CsvRowValue('vi', 1.23, 4.56, None, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.OTHER)),
]

for ((geo_type, row), field) in success_cases:
values, error = CsvImporter.extract_and_check_row(row, geo_type)
self.assertIsNone(error)
self.assertIsInstance(values, CsvImporter.RowValues)
self.assertIsInstance(values, CsvRowValue)
self.assertEqual(values.geo_value, field.geo_value)
self.assertEqual(values.value, field.value)
self.assertEqual(values.stderr, field.stderr)
self.assertEqual(values.sample_size, field.sample_size)

def test_load_csv_with_invalid_header(self):
@patch("pandas.read_csv")
def test_load_csv_with_invalid_header(self, mock_read_csv):
"""Bail loading a CSV when the header is invalid."""

data = {'foo': [1, 2, 3]}
mock_pandas = MagicMock()
mock_pandas.read_csv.return_value = pandas.DataFrame(data=data)
filepath = 'path/name.csv'
geo_type = 'state'

rows = list(CsvImporter.load_csv(filepath, geo_type, pandas=mock_pandas))
mock_read_csv.return_value = pd.DataFrame(data)
rows = list(CsvImporter.load_csv(filepath, geo_type))

self.assertTrue(mock_pandas.read_csv.called)
self.assertTrue(mock_pandas.read_csv.call_args[0][0], filepath)
self.assertTrue(mock_read_csv.called)
self.assertTrue(mock_read_csv.call_args[0][0], filepath)
self.assertEqual(rows, [None])

def test_load_csv_with_valid_header(self):
@patch("pandas.read_csv")
def test_load_csv_with_valid_header(self, mock_read_csv):
"""Yield sanity checked `RowValues` from a valid CSV file."""

# one invalid geo_id, but otherwise valid
Expand All @@ -248,15 +247,14 @@ def test_load_csv_with_valid_header(self):
'se': ['2.1', '2.2', '2.3', '2.4'],
'sample_size': ['301', '302', '303', '304'],
}
mock_pandas = MagicMock()
mock_pandas.read_csv.return_value = pandas.DataFrame(data=data)
filepath = 'path/name.csv'
geo_type = 'state'

rows = list(CsvImporter.load_csv(filepath, geo_type, pandas=mock_pandas))
mock_read_csv.return_value = pd.DataFrame(data=data)
rows = list(CsvImporter.load_csv(filepath, geo_type))

self.assertTrue(mock_pandas.read_csv.called)
self.assertTrue(mock_pandas.read_csv.call_args[0][0], filepath)
self.assertTrue(mock_read_csv.called)
self.assertTrue(mock_read_csv.call_args[0][0], filepath)
self.assertEqual(len(rows), 4)

self.assertEqual(rows[0].geo_value, 'ca')
Expand Down Expand Up @@ -286,15 +284,14 @@ def test_load_csv_with_valid_header(self):
'missing_stderr': [Nans.NOT_MISSING, Nans.REGION_EXCEPTION, Nans.NOT_MISSING, Nans.NOT_MISSING] + [None],
'missing_sample_size': [Nans.NOT_MISSING] * 2 + [Nans.REGION_EXCEPTION] * 2 + [None]
}
mock_pandas = MagicMock()
mock_pandas.read_csv.return_value = pandas.DataFrame(data=data)
filepath = 'path/name.csv'
geo_type = 'state'

rows = list(CsvImporter.load_csv(filepath, geo_type, pandas=mock_pandas))
mock_read_csv.return_value = pd.DataFrame(data)
rows = list(CsvImporter.load_csv(filepath, geo_type))

self.assertTrue(mock_pandas.read_csv.called)
self.assertTrue(mock_pandas.read_csv.call_args[0][0], filepath)
self.assertTrue(mock_read_csv.called)
self.assertTrue(mock_read_csv.call_args[0][0], filepath)
self.assertEqual(len(rows), 5)

self.assertEqual(rows[0].geo_value, 'ca')
Expand Down
3 changes: 1 addition & 2 deletions tests/acquisition/covidcast/test_csv_to_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
import unittest
from unittest.mock import MagicMock

from delphi.epidata.acquisition.covidcast.csv_to_database import get_argument_parser, main, \
collect_files, upload_archive, make_handlers
from delphi.epidata.acquisition.covidcast.csv_to_database import get_argument_parser, main, collect_files, upload_archive, make_handlers
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ugh same, I don't like it but it's fine


# py3tester coverage target
__test_target__ = 'delphi.epidata.acquisition.covidcast.csv_to_database'
Expand Down