Skip to content

Commit 28b1c8b

Browse files
bpraggastis authored and TomAugspurger committed
ERR: Raise error in usecols when column doesn't exist but length matches (pandas-dev#16460)
* GH 14671: Check whether a string-typed usecols is a subset of names; if it is not, raise an error
* Tests added for GH 14671; the expected behavior of using usecols and names simultaneously is unclear, so those tests are commented out
* Review comments

(cherry picked from commit 50a62c1)
1 parent e01241a commit 28b1c8b

File tree

3 files changed

+58
-0
lines changed

3 files changed

+58
-0
lines changed

doc/source/whatsnew/v0.20.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ I/O
7171
^^^
7272

7373
- Bug in :func:`read_csv` when ``comment`` is passed in a space delimited text file (:issue:`16472`)
74+
- Bug in :func:`read_csv` not raising an exception with nonexistent columns in ``usecols`` when it had the correct length (:issue:`14671`)
7475
- Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`)
7576
- Bug that raised ``IndexError`` when HTML-rendering an empty ``DataFrame`` (:issue:`15953`)
7677
- Bug in :func:`read_csv` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`)

pandas/io/parsers.py

+6
Original file line numberDiff line numberDiff line change
@@ -1643,6 +1643,12 @@ def __init__(self, src, **kwds):
16431643

16441644
if self.usecols:
16451645
usecols = _evaluate_usecols(self.usecols, self.orig_names)
1646+
1647+
# GH 14671
1648+
if (self.usecols_dtype == 'string' and
1649+
not set(usecols).issubset(self.orig_names)):
1650+
raise ValueError("Usecols do not match names.")
1651+
16461652
if len(self.names) > len(usecols):
16471653
self.names = [n for i, n in enumerate(self.names)
16481654
if (i in usecols or n in usecols)]

pandas/tests/io/parser/usecols.py

+51
Original file line numberDiff line numberDiff line change
@@ -475,3 +475,54 @@ def test_uneven_length_cols(self):
475475
'C': [3, 5, 4, 3, 3, 7]})
476476
df = self.read_csv(StringIO(data), usecols=usecols)
477477
tm.assert_frame_equal(df, expected)
478+
479+
def test_raise_on_usecols_names_mismatch(self):
480+
# GH 14671
481+
data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'
482+
483+
if self.engine == 'c':
484+
msg = 'Usecols do not match names'
485+
else:
486+
msg = 'is not in list'
487+
488+
usecols = ['a', 'b', 'c', 'd']
489+
df = self.read_csv(StringIO(data), usecols=usecols)
490+
expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7],
491+
'd': [4, 8]})
492+
tm.assert_frame_equal(df, expected)
493+
494+
usecols = ['a', 'b', 'c', 'f']
495+
with tm.assert_raises_regex(ValueError, msg):
496+
self.read_csv(StringIO(data), usecols=usecols)
497+
498+
usecols = ['a', 'b', 'f']
499+
with tm.assert_raises_regex(ValueError, msg):
500+
self.read_csv(StringIO(data), usecols=usecols)
501+
502+
names = ['A', 'B', 'C', 'D']
503+
504+
df = self.read_csv(StringIO(data), header=0, names=names)
505+
expected = DataFrame({'A': [1, 5], 'B': [2, 6], 'C': [3, 7],
506+
'D': [4, 8]})
507+
tm.assert_frame_equal(df, expected)
508+
509+
# TODO: https://github.com/pandas-dev/pandas/issues/16469
510+
# usecols = ['A','C']
511+
# df = self.read_csv(StringIO(data), header=0, names=names,
512+
# usecols=usecols)
513+
# expected = DataFrame({'A': [1,5], 'C': [3,7]})
514+
# tm.assert_frame_equal(df, expected)
515+
#
516+
# usecols = [0,2]
517+
# df = self.read_csv(StringIO(data), header=0, names=names,
518+
# usecols=usecols)
519+
# expected = DataFrame({'A': [1,5], 'C': [3,7]})
520+
# tm.assert_frame_equal(df, expected)
521+
522+
usecols = ['A', 'B', 'C', 'f']
523+
with tm.assert_raises_regex(ValueError, msg):
524+
self.read_csv(StringIO(data), header=0, names=names,
525+
usecols=usecols)
526+
usecols = ['A', 'B', 'f']
527+
with tm.assert_raises_regex(ValueError, msg):
528+
self.read_csv(StringIO(data), names=names, usecols=usecols)

0 commit comments

Comments
 (0)