Skip to content

Commit 9c7494a

Browse files
authored
BUG: parse_dates may have columns not in dataframe (#32320)
1 parent ad81de1 commit 9c7494a

File tree

3 files changed

+86
-1
lines changed

3 files changed

+86
-1
lines changed

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,7 @@ I/O
340340
timestamps with ``version="2.0"`` (:issue:`31652`).
341341
- Bug in :meth:`read_csv` was raising `TypeError` when `sep=None` was used in combination with `comment` keyword (:issue:`31396`)
342342
- Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`)
343+
- :func:`read_csv` will raise a ``ValueError`` when the column names passed in ``parse_dates`` are missing from the :class:`DataFrame` (:issue:`31251`)
343344
- Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`)
344345
- Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`)
345346
- Bug in :meth:`read_csv` was causing a segfault when there were blank lines between the header and data rows (:issue:`28071`)

pandas/io/parsers.py

+53-1
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,11 @@
66
import csv
77
import datetime
88
from io import StringIO, TextIOWrapper
9+
import itertools
910
import re
1011
import sys
1112
from textwrap import fill
12-
from typing import Any, Dict, Set
13+
from typing import Any, Dict, Iterable, List, Set
1314
import warnings
1415

1516
import numpy as np
@@ -34,6 +35,7 @@
3435
ensure_str,
3536
is_bool_dtype,
3637
is_categorical_dtype,
38+
is_dict_like,
3739
is_dtype_equal,
3840
is_extension_array_dtype,
3941
is_file_like,
@@ -1421,6 +1423,54 @@ def __init__(self, kwds):
14211423
# keep references to file handles opened by the parser itself
14221424
self.handles = []
14231425

1426+
def _validate_parse_dates_presence(self, columns: List[str]) -> None:
    """
    Check that the columns named in ``parse_dates`` are present in ``columns``.

    Only column references given by name (``str``) are validated; any
    other reference (e.g. an integer position) is ignored by this check.

    Parameters
    ----------
    columns : list
        List of names of the dataframe.

    Raises
    ------
    ValueError
        If a column name given in ``parse_dates`` is not in ``columns``.
        The message lists the missing names sorted and comma-separated.
    """
    cols_needed: Iterable
    if is_dict_like(self.parse_dates):
        # dict form: each value is a group of column references
        cols_needed = itertools.chain(*self.parse_dates.values())
    elif is_list_like(self.parse_dates):
        # a column in parse_dates could be represented
        # ColReference = Union[int, str]
        # DateGroups = List[ColReference]
        # ParseDates = Union[DateGroups, List[DateGroups],
        #     Dict[ColReference, DateGroups]]
        # Flatten one level so that nested groups and bare references
        # are handled uniformly.
        cols_needed = itertools.chain.from_iterable(
            col if is_list_like(col) else [col] for col in self.parse_dates
        )
    else:
        # neither dict-like nor list-like: there are no column
        # references to validate
        cols_needed = []

    # get only columns that are references using names (str), not by index
    missing_cols = ", ".join(
        sorted(
            {
                col
                for col in cols_needed
                if isinstance(col, str) and col not in columns
            }
        )
    )
    if missing_cols:
        raise ValueError(
            f"Missing column provided to 'parse_dates': '{missing_cols}'"
        )
1473+
14241474
def close(self):
14251475
for f in self.handles:
14261476
f.close()
@@ -1940,6 +1990,7 @@ def __init__(self, src, **kwds):
19401990
if len(self.names) < len(usecols):
19411991
_validate_usecols_names(usecols, self.names)
19421992

1993+
self._validate_parse_dates_presence(self.names)
19431994
self._set_noconvert_columns()
19441995

19451996
self.orig_names = self.names
@@ -2314,6 +2365,7 @@ def __init__(self, f, **kwds):
23142365
if self.index_names is None:
23152366
self.index_names = index_names
23162367

2368+
self._validate_parse_dates_presence(self.columns)
23172369
if self.parse_dates:
23182370
self._no_thousands_columns = self._set_no_thousands_columns()
23192371
else:

pandas/tests/io/parser/test_parse_dates.py

+32
Original file line numberDiff line numberDiff line change
@@ -1516,3 +1516,35 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti
15161516

15171517
assert except_out_dateutil == except_in_dateutil
15181518
assert result == expected
1519+
1520+
1521+
@pytest.mark.parametrize(
    "names, usecols, parse_dates, missing_cols",
    [
        (None, ["val"], ["date", "time"], "date, time"),
        (None, ["val"], [0, "time"], "time"),
        (None, ["val"], [["date", "time"]], "date, time"),
        (None, ["val"], [[0, "time"]], "time"),
        (None, ["val"], {"date": [0, "time"]}, "time"),
        (None, ["val"], {"date": ["date", "time"]}, "date, time"),
        (None, ["val"], [["date", "time"], "date"], "date, time"),
        (["date1", "time1", "temperature"], None, ["date", "time"], "date, time"),
        (
            ["date1", "time1", "temperature"],
            ["date1", "temperature"],
            ["date1", "time"],
            "time",
        ),
    ],
)
def test_missing_parse_dates_column_raises(
    all_parsers, names, usecols, parse_dates, missing_cols
):
    # gh-31251 column names provided in parse_dates could be missing.
    # Each case supplies a parse_dates spec (flat list, nested list or dict)
    # together with the names expected in the error message, sorted and
    # comma-separated.
    parser = all_parsers
    content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n")
    msg = f"Missing column provided to 'parse_dates': '{missing_cols}'"
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(
            content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates,
        )

0 commit comments

Comments
 (0)