Skip to content

BUG: parse_dates may have columns not in dataframe #32320

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Mar 17, 2020
Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,7 @@ I/O
``coerce_timestamps``; following pyarrow's default allows writing nanosecond
timestamps with ``version="2.0"`` (:issue:`31652`).
- Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`)
- :func:`read_csv` will raise a ``ValueError`` when columns passed in ``parse_dates`` are missing from the dataframe. (:issue:`31251`)

Plotting
^^^^^^^^
Expand Down
55 changes: 54 additions & 1 deletion pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@
import csv
import datetime
from io import BufferedIOBase, RawIOBase, StringIO, TextIOWrapper
from itertools import chain
import re
import sys
from textwrap import fill
from typing import Any, Dict, Set
from typing import Any, Dict, List, Set
import warnings

import numpy as np
Expand Down Expand Up @@ -1419,6 +1420,56 @@ def __init__(self, kwds):
# keep references to file handles opened by the parser itself
self.handles = []

def _validate_parse_dates_presence(self, columns: List[str]) -> None:
    """
    Check that every column named in ``parse_dates`` exists in ``columns``.

    Only references given by name (``str``) are validated; references given
    by position (``int``) are ignored here. Nothing is checked when
    ``self.parse_dates`` is neither a list nor a dict (e.g. a boolean).

    Parameters
    ----------
    columns : list
        Names of the columns available in the parsed data.

    Raises
    ------
    ValueError
        If a column name referenced by ``parse_dates`` is not in ``columns``.
        The message lists the missing names sorted and comma-separated.
    """
    # a column in parse_dates could be represented
    # ColReference = Union[int, str]
    # DateGroups = List[ColReference]
    # ParseDates = Union[DateGroups, List[DateGroups],
    #     Dict[ColReference, DateGroups]]
    if isinstance(self.parse_dates, list):
        cols_needed = []
        for col in self.parse_dates:
            if isinstance(col, list):
                cols_needed.extend(col)
            else:
                cols_needed.append(col)
    elif isinstance(self.parse_dates, dict):
        # A dict value is normally a group of references, but guard against
        # a bare reference: unpacking a str would split it into characters
        # (reporting bogus one-letter "missing" columns) and unpacking an
        # int would raise an unrelated TypeError.
        cols_needed = list(
            chain.from_iterable(
                [group]
                if isinstance(group, str) or not hasattr(group, "__iter__")
                else group
                for group in self.parse_dates.values()
            )
        )
    else:
        cols_needed = []

    # get only columns that are references using names (str), not by index
    missing_cols = ", ".join(
        sorted(
            {
                col
                for col in cols_needed
                if isinstance(col, str) and col not in columns
            }
        )
    )
    if missing_cols:
        raise ValueError(
            f"Missing column provided to 'parse_dates': '{missing_cols}'"
        )

def close(self):
for f in self.handles:
f.close()
Expand Down Expand Up @@ -1938,6 +1989,7 @@ def __init__(self, src, **kwds):
if len(self.names) < len(usecols):
_validate_usecols_names(usecols, self.names)

self._validate_parse_dates_presence(self.names)
self._set_noconvert_columns()

self.orig_names = self.names
Expand Down Expand Up @@ -2308,6 +2360,7 @@ def __init__(self, f, **kwds):
if self.index_names is None:
self.index_names = index_names

self._validate_parse_dates_presence(self.columns)
if self.parse_dates:
self._no_thousands_columns = self._set_no_thousands_columns()
else:
Expand Down
30 changes: 30 additions & 0 deletions pandas/tests/io/parser/test_parse_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -1516,3 +1516,33 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti

assert except_out_dateutil == except_in_dateutil
assert result == expected


@pytest.mark.parametrize(
"names, usecols, parse_dates, missing_cols",
[
(None, ["val"], ["date", "time"], "date, time"),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a tuple or other list-like in this test

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I use tuple, _validate_parse_dates_arg throws the following error.

TypeError: Only booleans, lists, and dictionaries are accepted for the 'parse_dates' parameter

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I guess this is currently only documented as supporting scalars, lists and dicts (not tuples)

(None, ["val"], [0, "time"], "time"),
(None, ["val"], [["date", "time"]], "date, time"),
(None, ["val"], [[0, "time"]], "time"),
(None, ["val"], {"date": [0, "time"]}, "time"),
(None, ["val"], {"date": ["date", "time"]}, "date, time"),
(None, ["val"], [["date", "time"], "date"], "date, time"),
(["date1", "time1", "temperature"], None, ["date", "time"], "date, time"),
(
["date1", "time1", "temperature"],
["date1", "temperature"],
["date1", "time"],
"time",
),
],
)
def test_missing_column(all_parsers, names, usecols, parse_dates, missing_cols):
    """
    GH31251: a name in ``parse_dates`` that is not an available column
    must make ``read_csv`` raise a ``ValueError`` listing the missing names.
    """
    expected_error = f"Missing column provided to 'parse_dates': '{missing_cols}'"
    csv_buffer = StringIO("date,time,val\n2020-01-31,04:20:32,32\n")
    read_options = {
        "sep": ",",
        "names": names,
        "usecols": usecols,
        "parse_dates": parse_dates,
    }
    # Every parser engine supplied by the fixture must reject the request.
    with pytest.raises(ValueError, match=expected_error):
        all_parsers.read_csv(csv_buffer, **read_options)