Skip to content

Commit 6a22cf7

Browse files
minggli authored and jreback committed
BUG: usecols kwarg accepts string when it should only allow list-like or callable. (#20558)
1 parent 5d1f5ab commit 6a22cf7

File tree

5 files changed

+60
-45
lines changed

5 files changed

+60
-45
lines changed

doc/source/io.rst

+3-3
Original file line numberDiff line numberDiff line change
@@ -130,11 +130,11 @@ index_col : int or sequence or ``False``, default ``None``
130130
MultiIndex is used. If you have a malformed file with delimiters at the end of
131131
each line, you might consider ``index_col=False`` to force pandas to *not* use
132132
the first column as the index (row names).
133-
usecols : array-like or callable, default ``None``
134-
Return a subset of the columns. If array-like, all elements must either
133+
usecols : list-like or callable, default ``None``
134+
Return a subset of the columns. If list-like, all elements must either
135135
be positional (i.e. integer indices into the document columns) or strings
136136
that correspond to column names provided either by the user in `names` or
137-
inferred from the document header row(s). For example, a valid array-like
137+
inferred from the document header row(s). For example, a valid list-like
138138
`usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
139139

140140
Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. To

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1095,6 +1095,7 @@ I/O
10951095
- Bug in :meth:`pandas.io.stata.StataReader.value_labels` raising an ``AttributeError`` when called on very old files. Now returns an empty dict (:issue:`19417`)
10961096
- Bug in :func:`read_pickle` when unpickling objects with :class:`TimedeltaIndex` or :class:`Float64Index` created with pandas prior to version 0.20 (:issue:`19939`)
10971097
- Bug in :meth:`pandas.io.json.json_normalize` where subrecords are not properly normalized if any subrecords values are NoneType (:issue:`20030`)
1098+
- Bug in ``usecols`` parameter in :func:`read_csv` and :func:`read_table` where an error is not raised correctly when passing a string (:issue:`20529`)
10981099

10991100
Plotting
11001101
^^^^^^^^

pandas/_libs/parsers.pyx

+3-4
Original file line numberDiff line numberDiff line change
@@ -445,10 +445,9 @@ cdef class TextReader:
445445
# suboptimal
446446
if usecols is not None:
447447
self.has_usecols = 1
448-
if callable(usecols):
449-
self.usecols = usecols
450-
else:
451-
self.usecols = set(usecols)
448+
# GH-20558, validate usecols at higher level and only pass clean
449+
# usecols into TextReader.
450+
self.usecols = usecols
452451

453452
# XXX
454453
if skipfooter > 0:

pandas/io/parsers.py

+18-15
Original file line numberDiff line numberDiff line change
@@ -97,11 +97,11 @@
9797
MultiIndex is used. If you have a malformed file with delimiters at the end
9898
of each line, you might consider index_col=False to force pandas to _not_
9999
use the first column as the index (row names)
100-
usecols : array-like or callable, default None
101-
Return a subset of the columns. If array-like, all elements must either
100+
usecols : list-like or callable, default None
101+
Return a subset of the columns. If list-like, all elements must either
102102
be positional (i.e. integer indices into the document columns) or strings
103103
that correspond to column names provided either by the user in `names` or
104-
inferred from the document header row(s). For example, a valid array-like
104+
inferred from the document header row(s). For example, a valid list-like
105105
`usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Element
106106
order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
107107
To instantiate a DataFrame from ``data`` with element order preserved use
@@ -1177,7 +1177,7 @@ def _validate_usecols_arg(usecols):
11771177
11781178
Parameters
11791179
----------
1180-
usecols : array-like, callable, or None
1180+
usecols : list-like, callable, or None
11811181
List of columns to use when parsing or a callable that can be used
11821182
to filter a list of table columns.
11831183
@@ -1192,17 +1192,19 @@ def _validate_usecols_arg(usecols):
11921192
'usecols_dtype' is the inferred dtype of 'usecols' if an array-like
11931193
is passed in or None if a callable or None is passed in.
11941194
"""
1195-
msg = ("'usecols' must either be all strings, all unicode, "
1196-
"all integers or a callable")
1197-
1195+
msg = ("'usecols' must either be list-like of all strings, all unicode, "
1196+
"all integers or a callable.")
11981197
if usecols is not None:
11991198
if callable(usecols):
12001199
return usecols, None
1201-
usecols_dtype = lib.infer_dtype(usecols)
1202-
if usecols_dtype not in ('empty', 'integer',
1203-
'string', 'unicode'):
1200+
# GH20529, ensure usecols is an iterable container but not a string.
1201+
elif not is_list_like(usecols):
12041202
raise ValueError(msg)
1205-
1203+
else:
1204+
usecols_dtype = lib.infer_dtype(usecols)
1205+
if usecols_dtype not in ('empty', 'integer',
1206+
'string', 'unicode'):
1207+
raise ValueError(msg)
12061208
return set(usecols), usecols_dtype
12071209
return usecols, None
12081210

@@ -1697,11 +1699,12 @@ def __init__(self, src, **kwds):
16971699
# #2442
16981700
kwds['allow_leading_cols'] = self.index_col is not False
16991701

1700-
self._reader = parsers.TextReader(src, **kwds)
1701-
1702-
# XXX
1702+
# GH20529, validate the usecols arg before passing it to TextReader
17031703
self.usecols, self.usecols_dtype = _validate_usecols_arg(
1704-
self._reader.usecols)
1704+
kwds['usecols'])
1705+
kwds['usecols'] = self.usecols
1706+
1707+
self._reader = parsers.TextReader(src, **kwds)
17051708

17061709
passed_names = self.names is None
17071710

pandas/tests/io/parser/usecols.py

+35-23
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@
1616

1717

1818
class UsecolsTests(object):
19+
msg_validate_usecols_arg = ("'usecols' must either be list-like of all "
20+
"strings, all unicode, all integers or a "
21+
"callable.")
22+
msg_validate_usecols_names = ("Usecols do not match columns, columns "
23+
"expected but not found: {0}")
1924

2025
def test_raise_on_mixed_dtype_usecols(self):
2126
# See gh-12678
@@ -24,11 +29,9 @@ def test_raise_on_mixed_dtype_usecols(self):
2429
4000,5000,6000
2530
"""
2631

27-
msg = ("'usecols' must either be all strings, all unicode, "
28-
"all integers or a callable")
2932
usecols = [0, 'b', 2]
3033

31-
with tm.assert_raises_regex(ValueError, msg):
34+
with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
3235
self.read_csv(StringIO(data), usecols=usecols)
3336

3437
def test_usecols(self):
@@ -85,6 +88,18 @@ def test_usecols(self):
8588
pytest.raises(ValueError, self.read_csv, StringIO(data),
8689
names=['a', 'b'], usecols=[1], header=None)
8790

91+
def test_usecols_single_string(self):
92+
# GH 20558
93+
data = """foo, bar, baz
94+
1000, 2000, 3000
95+
4000, 5000, 6000
96+
"""
97+
98+
usecols = 'foo'
99+
100+
with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
101+
self.read_csv(StringIO(data), usecols=usecols)
102+
88103
def test_usecols_index_col_False(self):
89104
# see gh-9082
90105
s = "a,b,c,d\n1,2,3,4\n5,6,7,8"
@@ -348,13 +363,10 @@ def test_usecols_with_mixed_encoding_strings(self):
348363
3.568935038,7,False,a
349364
'''
350365

351-
msg = ("'usecols' must either be all strings, all unicode, "
352-
"all integers or a callable")
353-
354-
with tm.assert_raises_regex(ValueError, msg):
366+
with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
355367
self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB'])
356368

357-
with tm.assert_raises_regex(ValueError, msg):
369+
with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
358370
self.read_csv(StringIO(s), usecols=[b'AAA', u'BBB'])
359371

360372
def test_usecols_with_multibyte_characters(self):
@@ -480,30 +492,28 @@ def test_raise_on_usecols_names_mismatch(self):
480492
# GH 14671
481493
data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'
482494

483-
msg = (
484-
"Usecols do not match columns, "
485-
"columns expected but not found: {missing}"
486-
)
487-
488495
usecols = ['a', 'b', 'c', 'd']
489496
df = self.read_csv(StringIO(data), usecols=usecols)
490497
expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7],
491498
'd': [4, 8]})
492499
tm.assert_frame_equal(df, expected)
493500

494501
usecols = ['a', 'b', 'c', 'f']
495-
with tm.assert_raises_regex(
496-
ValueError, msg.format(missing=r"\['f'\]")):
502+
with tm.assert_raises_regex(ValueError,
503+
self.msg_validate_usecols_names.format(
504+
r"\['f'\]")):
497505
self.read_csv(StringIO(data), usecols=usecols)
498506

499507
usecols = ['a', 'b', 'f']
500-
with tm.assert_raises_regex(
501-
ValueError, msg.format(missing=r"\['f'\]")):
508+
with tm.assert_raises_regex(ValueError,
509+
self.msg_validate_usecols_names.format(
510+
r"\['f'\]")):
502511
self.read_csv(StringIO(data), usecols=usecols)
503512

504513
usecols = ['a', 'b', 'f', 'g']
505-
with tm.assert_raises_regex(
506-
ValueError, msg.format(missing=r"\[('f', 'g'|'g', 'f')\]")):
514+
with tm.assert_raises_regex(ValueError,
515+
self.msg_validate_usecols_names.format(
516+
r"\[('f', 'g'|'g', 'f')\]")):
507517
self.read_csv(StringIO(data), usecols=usecols)
508518

509519
names = ['A', 'B', 'C', 'D']
@@ -527,11 +537,13 @@ def test_raise_on_usecols_names_mismatch(self):
527537
# tm.assert_frame_equal(df, expected)
528538

529539
usecols = ['A', 'B', 'C', 'f']
530-
with tm.assert_raises_regex(
531-
ValueError, msg.format(missing=r"\['f'\]")):
540+
with tm.assert_raises_regex(ValueError,
541+
self.msg_validate_usecols_names.format(
542+
r"\['f'\]")):
532543
self.read_csv(StringIO(data), header=0, names=names,
533544
usecols=usecols)
534545
usecols = ['A', 'B', 'f']
535-
with tm.assert_raises_regex(
536-
ValueError, msg.format(missing=r"\['f'\]")):
546+
with tm.assert_raises_regex(ValueError,
547+
self.msg_validate_usecols_names.format(
548+
r"\['f'\]")):
537549
self.read_csv(StringIO(data), names=names, usecols=usecols)

0 commit comments

Comments
 (0)