Skip to content

Commit 7a3f81a

Browse files
AaronCritchley authored and jreback committed
ENH: Better error message if usecols doesn't match columns (#17310)
1 parent 0e16818 commit 7a3f81a

File tree

3 files changed

+53
-11
lines changed

3 files changed

+53
-11
lines changed

doc/source/whatsnew/v0.22.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ Other Enhancements
7676
- Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`)
7777
- :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`)
7878
- :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`)
79+
- Improved wording of ``ValueError`` raised in :func:`read_csv` when the ``usecols`` argument cannot match all columns. (:issue:`17301`)
7980

8081
.. _whatsnew_0220.api_breaking:
8182

pandas/io/parsers.py

+39-3
Original file line numberDiff line numberDiff line change
@@ -1141,6 +1141,38 @@ def _evaluate_usecols(usecols, names):
11411141
return usecols
11421142

11431143

1144+
def _validate_usecols_names(usecols, names):
1145+
"""
1146+
Validates that all usecols are present in a given
1147+
list of names. If not, raise a ValueError that
1148+
shows what usecols are missing.
1149+
1150+
Parameters
1151+
----------
1152+
usecols : iterable of usecols
1153+
The columns to validate are present in names.
1154+
names : iterable of names
1155+
The column names to check against.
1156+
1157+
Returns
1158+
-------
1159+
usecols : iterable of usecols
1160+
The `usecols` parameter if the validation succeeds.
1161+
1162+
Raises
1163+
------
1164+
ValueError : Columns were missing. Error message will list them.
1165+
"""
1166+
missing = [c for c in usecols if c not in names]
1167+
if len(missing) > 0:
1168+
raise ValueError(
1169+
"Usecols do not match columns, "
1170+
"columns expected but not found: {missing}".format(missing=missing)
1171+
)
1172+
1173+
return usecols
1174+
1175+
11441176
def _validate_skipfooter_arg(skipfooter):
11451177
"""
11461178
Validate the 'skipfooter' parameter.
@@ -1753,14 +1785,14 @@ def __init__(self, src, **kwds):
17531785
# GH 14671
17541786
if (self.usecols_dtype == 'string' and
17551787
not set(usecols).issubset(self.orig_names)):
1756-
raise ValueError("Usecols do not match names.")
1788+
_validate_usecols_names(usecols, self.orig_names)
17571789

17581790
if len(self.names) > len(usecols):
17591791
self.names = [n for i, n in enumerate(self.names)
17601792
if (i in usecols or n in usecols)]
17611793

17621794
if len(self.names) < len(usecols):
1763-
raise ValueError("Usecols do not match names.")
1795+
_validate_usecols_names(usecols, self.names)
17641796

17651797
self._set_noconvert_columns()
17661798

@@ -2532,9 +2564,13 @@ def _handle_usecols(self, columns, usecols_key):
25322564
raise ValueError("If using multiple headers, usecols must "
25332565
"be integers.")
25342566
col_indices = []
2567+
25352568
for col in self.usecols:
25362569
if isinstance(col, string_types):
2537-
col_indices.append(usecols_key.index(col))
2570+
try:
2571+
col_indices.append(usecols_key.index(col))
2572+
except ValueError:
2573+
_validate_usecols_names(self.usecols, usecols_key)
25382574
else:
25392575
col_indices.append(col)
25402576
else:

pandas/tests/io/parser/usecols.py

+13-8
Original file line numberDiff line numberDiff line change
@@ -480,10 +480,10 @@ def test_raise_on_usecols_names_mismatch(self):
480480
# GH 14671
481481
data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'
482482

483-
if self.engine == 'c':
484-
msg = 'Usecols do not match names'
485-
else:
486-
msg = 'is not in list'
483+
msg = (
484+
"Usecols do not match columns, "
485+
"columns expected but not found: {missing}"
486+
)
487487

488488
usecols = ['a', 'b', 'c', 'd']
489489
df = self.read_csv(StringIO(data), usecols=usecols)
@@ -492,11 +492,16 @@ def test_raise_on_usecols_names_mismatch(self):
492492
tm.assert_frame_equal(df, expected)
493493

494494
usecols = ['a', 'b', 'c', 'f']
495-
with tm.assert_raises_regex(ValueError, msg):
495+
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
496496
self.read_csv(StringIO(data), usecols=usecols)
497497

498498
usecols = ['a', 'b', 'f']
499-
with tm.assert_raises_regex(ValueError, msg):
499+
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
500+
self.read_csv(StringIO(data), usecols=usecols)
501+
502+
usecols = ['a', 'b', 'f', 'g']
503+
with tm.assert_raises_regex(
504+
ValueError, msg.format(missing="\[('f', 'g'|'g', 'f')\]")):
500505
self.read_csv(StringIO(data), usecols=usecols)
501506

502507
names = ['A', 'B', 'C', 'D']
@@ -520,9 +525,9 @@ def test_raise_on_usecols_names_mismatch(self):
520525
# tm.assert_frame_equal(df, expected)
521526

522527
usecols = ['A', 'B', 'C', 'f']
523-
with tm.assert_raises_regex(ValueError, msg):
528+
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
524529
self.read_csv(StringIO(data), header=0, names=names,
525530
usecols=usecols)
526531
usecols = ['A', 'B', 'f']
527-
with tm.assert_raises_regex(ValueError, msg):
532+
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
528533
self.read_csv(StringIO(data), names=names, usecols=usecols)

0 commit comments

Comments
 (0)