Skip to content

Commit f0543a4

Browse files
committed
BUG: Prevent mixed-typed usecols
Enforces that 'usecols' must be either all integers (positional indices) or all strings (column names), as mixtures of the two are ambiguous. Closes gh-12678.
1 parent 83caa3b commit f0543a4

File tree

4 files changed

+86
-29
lines changed

4 files changed

+86
-29
lines changed

doc/source/io.rst

+6-2
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,12 @@ index_col : int or sequence or ``False``, default ``None``
120120
each line, you might consider ``index_col=False`` to force pandas to *not* use
121121
the first column as the index (row names).
122122
usecols : array-like, default ``None``
123-
Return a subset of the columns. Results in much faster parsing time and lower
124-
memory usage
123+
Return a subset of the columns. All elements in this array must either
124+
be positional (i.e. integer indices into the document columns) or strings
125+
that correspond to column names provided either by the user in `names` or
126+
inferred from the document header row(s). For example, a valid `usecols`
127+
parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
128+
results in much faster parsing time and lower memory usage.
125129
squeeze : boolean, default ``False``
126130
If the parsed data only contains one column then return a Series.
127131
prefix : str, default ``None``

doc/source/whatsnew/v0.18.1.txt

+2-7
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ API changes
101101

102102

103103
- ``CParserError`` is now a ``ValueError`` instead of just an ``Exception`` (:issue:`12551`)
104-
104+
- ``read_csv`` no longer allows a combination of strings and integers for the ``usecols`` parameter (:issue:`12678`)
105105
- ``pd.show_versions()`` now includes ``pandas_datareader`` version (:issue:`12740`)
106106
- Provide a proper ``__name__`` and ``__qualname__`` attributes for generic functions (:issue:`12021`)
107107

@@ -211,6 +211,7 @@ Bug Fixes
211211

212212
- Bug in ``value_counts`` when ``normalize=True`` and ``dropna=True`` where nulls still contributed to the normalized count (:issue:`12558`)
213213
- Bug in ``Panel.fillna()`` ignoring ``inplace=True`` (:issue:`12633`)
214+
- Bug in ``read_csv`` when specifying ``names``, ``usecols``, and ``parse_dates`` simultaneously with the C engine (:issue:`9755`)
214215
- Bug in ``Series.rename``, ``DataFrame.rename`` and ``DataFrame.rename_axis`` not treating ``Series`` as mappings to relabel (:issue:`12623`).
215216
- Clean in ``.rolling.min`` and ``.rolling.max`` to enhance dtype handling (:issue:`12373`)
216217

@@ -236,9 +237,3 @@ Bug Fixes
236237
- Bug in ``.describe()`` resets categorical columns information (:issue:`11558`)
237238
- Bug where ``loffset`` argument was not applied when calling ``resample().count()`` on a timeseries (:issue:`12725`)
238239
- ``pd.read_excel()`` now accepts path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path, in line with other ``read_*`` functions (:issue:`12655`)
239-
240-
241-
242-
243-
244-
- Bug in ``read_csv`` when specifying ``usecols`` and ``parse_dates`` simultaneously with the C engine (:issue:`9755`)

pandas/io/parsers.py

+28-4
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,12 @@ class ParserWarning(Warning):
7575
of each line, you might consider index_col=False to force pandas to _not_
7676
use the first column as the index (row names)
7777
usecols : array-like, default None
78-
Return a subset of the columns.
79-
Results in much faster parsing time and lower memory usage.
78+
Return a subset of the columns. All elements in this array must either
79+
be positional (i.e. integer indices into the document columns) or strings
80+
that correspond to column names provided either by the user in `names` or
81+
inferred from the document header row(s). For example, a valid `usecols`
82+
parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
83+
results in much faster parsing time and lower memory usage.
8084
squeeze : boolean, default False
8185
If the parsed data only contains one column then return a Series
8286
prefix : str, default None
@@ -801,6 +805,26 @@ def _is_index_col(col):
801805
return col is not None and col is not False
802806

803807

808+
def _validate_usecols_arg(usecols):
809+
"""
810+
Check whether or not the 'usecols' parameter
811+
contains all integers (column selection by index)
812+
or strings (column by name). Raises a ValueError
813+
if that is not the case.
814+
"""
815+
# gh-12678
816+
if usecols is not None:
817+
usecols_dtype = lib.infer_dtype(usecols)
818+
if usecols_dtype not in ('integer', 'string'):
819+
raise ValueError(("The elements of 'usecols' "
820+
"must either be all strings "
821+
"or all integers"))
822+
823+
# validation has succeeded, so
824+
# return the argument for assignment
825+
return usecols
826+
827+
804828
class ParserBase(object):
805829

806830
def __init__(self, kwds):
@@ -1132,7 +1156,7 @@ def __init__(self, src, **kwds):
11321156
self._reader = _parser.TextReader(src, **kwds)
11331157

11341158
# XXX
1135-
self.usecols = self._reader.usecols
1159+
self.usecols = _validate_usecols_arg(self._reader.usecols)
11361160

11371161
passed_names = self.names is None
11381162

@@ -1479,7 +1503,7 @@ def __init__(self, f, **kwds):
14791503
self.lineterminator = kwds['lineterminator']
14801504
self.quoting = kwds['quoting']
14811505
self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
1482-
self.usecols = kwds['usecols']
1506+
self.usecols = _validate_usecols_arg(kwds['usecols'])
14831507
self.skip_blank_lines = kwds['skip_blank_lines']
14841508

14851509
self.names_passed = kwds['names'] or None

pandas/io/tests/test_parsers.py

+50-16
Original file line numberDiff line numberDiff line change
@@ -2702,12 +2702,12 @@ def test_usecols_with_parse_dates(self):
27022702
}
27032703
expected = DataFrame(cols, columns=['c_d', 'a'])
27042704

2705-
df = read_csv(StringIO(s), usecols=[0, 2, 3],
2706-
parse_dates=parse_dates)
2705+
df = self.read_csv(StringIO(s), usecols=[0, 2, 3],
2706+
parse_dates=parse_dates)
27072707
tm.assert_frame_equal(df, expected)
27082708

2709-
df = read_csv(StringIO(s), usecols=[3, 0, 2],
2710-
parse_dates=parse_dates)
2709+
df = self.read_csv(StringIO(s), usecols=[3, 0, 2],
2710+
parse_dates=parse_dates)
27112711
tm.assert_frame_equal(df, expected)
27122712

27132713
def test_usecols_with_parse_dates_and_full_names(self):
@@ -2726,14 +2726,14 @@ def test_usecols_with_parse_dates_and_full_names(self):
27262726
}
27272727
expected = DataFrame(cols, columns=['c_d', 'a'])
27282728

2729-
df = read_csv(StringIO(s), names=names,
2730-
usecols=[0, 2, 3],
2731-
parse_dates=parse_dates)
2729+
df = self.read_csv(StringIO(s), names=names,
2730+
usecols=[0, 2, 3],
2731+
parse_dates=parse_dates)
27322732
tm.assert_frame_equal(df, expected)
27332733

2734-
df = read_csv(StringIO(s), names=names,
2735-
usecols=[3, 0, 2],
2736-
parse_dates=parse_dates)
2734+
df = self.read_csv(StringIO(s), names=names,
2735+
usecols=[3, 0, 2],
2736+
parse_dates=parse_dates)
27372737
tm.assert_frame_equal(df, expected)
27382738

27392739
def test_usecols_with_parse_dates_and_usecol_names(self):
@@ -2752,14 +2752,48 @@ def test_usecols_with_parse_dates_and_usecol_names(self):
27522752
}
27532753
expected = DataFrame(cols, columns=['c_d', 'a'])
27542754

2755-
df = read_csv(StringIO(s), names=names,
2756-
usecols=[0, 2, 3],
2757-
parse_dates=parse_dates)
2755+
df = self.read_csv(StringIO(s), names=names,
2756+
usecols=[0, 2, 3],
2757+
parse_dates=parse_dates)
27582758
tm.assert_frame_equal(df, expected)
27592759

2760-
df = read_csv(StringIO(s), names=names,
2761-
usecols=[3, 0, 2],
2762-
parse_dates=parse_dates)
2760+
df = self.read_csv(StringIO(s), names=names,
2761+
usecols=[3, 0, 2],
2762+
parse_dates=parse_dates)
2763+
tm.assert_frame_equal(df, expected)
2764+
2765+
def test_mixed_dtype_usecols(self):
2766+
# See gh-12678
2767+
data = """a,b,c
2768+
1000,2000,3000
2769+
4000,5000,6000
2770+
"""
2771+
msg = ("The elements of \'usecols\' "
2772+
"must either be all strings "
2773+
"or all integers")
2774+
usecols = [0, 'b', 2]
2775+
2776+
with tm.assertRaisesRegexp(ValueError, msg):
2777+
df = self.read_csv(StringIO(data), usecols=usecols)
2778+
2779+
def test_usecols_with_integer_like_header(self):
2780+
data = """2,0,1
2781+
1000,2000,3000
2782+
4000,5000,6000
2783+
"""
2784+
2785+
usecols = [0, 1] # column selection by index
2786+
expected = DataFrame(data=[[1000, 2000],
2787+
[4000, 5000]],
2788+
columns=['2', '0'])
2789+
df = self.read_csv(StringIO(data), usecols=usecols)
2790+
tm.assert_frame_equal(df, expected)
2791+
2792+
usecols = ['0', '1'] # column selection by name
2793+
expected = DataFrame(data=[[2000, 3000],
2794+
[5000, 6000]],
2795+
columns=['0', '1'])
2796+
df = self.read_csv(StringIO(data), usecols=usecols)
27632797
tm.assert_frame_equal(df, expected)
27642798

27652799

0 commit comments

Comments
 (0)