Skip to content

BUG: usecols kwarg accepts string when it should only allow list-like or callable. #20558

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Apr 1, 2018
6 changes: 2 additions & 4 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -445,10 +445,8 @@ cdef class TextReader:
# suboptimal
if usecols is not None:
self.has_usecols = 1
if callable(usecols):
self.usecols = usecols
else:
self.usecols = set(usecols)
# GH20529, validate usecols at higher level.
Copy link
Member

@gfyoung gfyoung Mar 30, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's expand the comment to this:

# gh-20558: we will validate and clean "usecols" before getting here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done.

self.usecols = usecols

# XXX
if skipfooter > 0:
Expand Down
33 changes: 18 additions & 15 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,11 @@
MultiIndex is used. If you have a malformed file with delimiters at the end
of each line, you might consider index_col=False to force pandas to _not_
use the first column as the index (row names)
usecols : array-like or callable, default None
Return a subset of the columns. If array-like, all elements must either
usecols : list-like or callable, default None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Leave this alone actually. array-like will suffice.

Copy link
Contributor Author

@minggli minggli Mar 30, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure. but below is_array_like will evaluate ['foo' ,'bar'] to False. so it's really a list-like IMHO.

def is_array_like(obj):

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm...that's a fair point. Actually, leave that alone then.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But if you're going to change it here, make sure to do it in io.rst as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure. it's done.

Return a subset of the columns. If list-like, all elements must either
be positional (i.e. integer indices into the document columns) or strings
that correspond to column names provided either by the user in `names` or
inferred from the document header row(s). For example, a valid array-like
inferred from the document header row(s). For example, a valid list-like
`usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Element
order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
To instantiate a DataFrame from ``data`` with element order preserved use
Expand Down Expand Up @@ -1177,7 +1177,7 @@ def _validate_usecols_arg(usecols):
Parameters
----------
usecols : array-like, callable, or None
usecols : list-like, callable, or None
List of columns to use when parsing or a callable that can be used
to filter a list of table columns.
Expand All @@ -1192,17 +1192,19 @@ def _validate_usecols_arg(usecols):
'usecols_dtype` is the inferred dtype of 'usecols' if an array-like
is passed in or None if a callable or None is passed in.
"""
msg = ("'usecols' must either be all strings, all unicode, "
"all integers or a callable")

msg = ("'usecols' must either be list-like of all strings, all unicode, "
"all integers or a callable.")
if usecols is not None:
if callable(usecols):
return usecols, None
usecols_dtype = lib.infer_dtype(usecols)
if usecols_dtype not in ('empty', 'integer',
'string', 'unicode'):
# GH20529, ensure is iterable container but not string.
elif not is_list_like(usecols):
raise ValueError(msg)

else:
usecols_dtype = lib.infer_dtype(usecols)
if usecols_dtype not in ('empty', 'integer',
'string', 'unicode'):
raise ValueError(msg)
return set(usecols), usecols_dtype
return usecols, None

Expand Down Expand Up @@ -1697,11 +1699,12 @@ def __init__(self, src, **kwds):
# #2442
kwds['allow_leading_cols'] = self.index_col is not False

self._reader = parsers.TextReader(src, **kwds)

# XXX
# GH20529, validate usecol arg before TextReader
self.usecols, self.usecols_dtype = _validate_usecols_arg(
self._reader.usecols)
kwds['usecols'])
kwds['usecols'] = self.usecols

self._reader = parsers.TextReader(src, **kwds)

passed_names = self.names is None

Expand Down
46 changes: 23 additions & 23 deletions pandas/tests/io/parser/usecols.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@


class UsecolsTests(object):
msg_validate_usecols_arg = ("'usecols' must either be list-like of all "
"strings, all unicode, all integers or a "
"callable.")
msg_validate_usecols_names = ("Usecols do not match columns, columns "
"expected but not found: {0}")

def test_raise_on_mixed_dtype_usecols(self):
# See gh-12678
Expand All @@ -24,11 +29,9 @@ def test_raise_on_mixed_dtype_usecols(self):
4000,5000,6000
"""

msg = ("'usecols' must either be all strings, all unicode, "
"all integers or a callable")
usecols = [0, 'b', 2]

with tm.assert_raises_regex(ValueError, msg):
with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
self.read_csv(StringIO(data), usecols=usecols)

def test_usecols(self):
Expand Down Expand Up @@ -348,13 +351,10 @@ def test_usecols_with_mixed_encoding_strings(self):
3.568935038,7,False,a
'''

msg = ("'usecols' must either be all strings, all unicode, "
"all integers or a callable")

with tm.assert_raises_regex(ValueError, msg):
with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB'])

with tm.assert_raises_regex(ValueError, msg):
with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
self.read_csv(StringIO(s), usecols=[b'AAA', u'BBB'])

def test_usecols_with_multibyte_characters(self):
Expand Down Expand Up @@ -480,30 +480,28 @@ def test_raise_on_usecols_names_mismatch(self):
# GH 14671
data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'

msg = (
"Usecols do not match columns, "
"columns expected but not found: {missing}"
)

usecols = ['a', 'b', 'c', 'd']
df = self.read_csv(StringIO(data), usecols=usecols)
expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7],
'd': [4, 8]})
tm.assert_frame_equal(df, expected)

usecols = ['a', 'b', 'c', 'f']
with tm.assert_raises_regex(
ValueError, msg.format(missing=r"\['f'\]")):
with tm.assert_raises_regex(ValueError,
self.msg_validate_usecols_names.format(
r"\['f'\]")):
self.read_csv(StringIO(data), usecols=usecols)

usecols = ['a', 'b', 'f']
with tm.assert_raises_regex(
ValueError, msg.format(missing=r"\['f'\]")):
with tm.assert_raises_regex(ValueError,
self.msg_validate_usecols_names.format(
r"\['f'\]")):
self.read_csv(StringIO(data), usecols=usecols)

usecols = ['a', 'b', 'f', 'g']
with tm.assert_raises_regex(
ValueError, msg.format(missing=r"\[('f', 'g'|'g', 'f')\]")):
with tm.assert_raises_regex(ValueError,
self.msg_validate_usecols_names.format(
r"\[('f', 'g'|'g', 'f')\]")):
self.read_csv(StringIO(data), usecols=usecols)

names = ['A', 'B', 'C', 'D']
Expand All @@ -527,11 +525,13 @@ def test_raise_on_usecols_names_mismatch(self):
# tm.assert_frame_equal(df, expected)

usecols = ['A', 'B', 'C', 'f']
with tm.assert_raises_regex(
ValueError, msg.format(missing=r"\['f'\]")):
with tm.assert_raises_regex(ValueError,
self.msg_validate_usecols_names.format(
r"\['f'\]")):
self.read_csv(StringIO(data), header=0, names=names,
usecols=usecols)
usecols = ['A', 'B', 'f']
with tm.assert_raises_regex(
ValueError, msg.format(missing=r"\['f'\]")):
with tm.assert_raises_regex(ValueError,
self.msg_validate_usecols_names.format(
r"\['f'\]")):
self.read_csv(StringIO(data), names=names, usecols=usecols)