Skip to content

Commit c6c201e

Browse files
gfyoung authored and jreback committed
BUG: Fix parse_dates processing with usecols and C engine
Closes #9755. Closes #12678. Both are `read_csv` bugs. This PR fixes the bug brought up in #9755 in the processing of `parse_dates` with the C engine, in which the wrong indices (those of the filtered column names) were being used to determine which date columns should not be dtype-parsed by the C engine. The correct indices are those of the original column names, as they are used later on in the actual data processing. Author: gfyoung <[email protected]> Closes #12512 from gfyoung/parse_dates_usecols and squashes the following commits: f0543a4 [gfyoung] BUG: Prevent mixed-typed usecols 83caa3b [gfyoung] BUG: Fix parse_dates processing with usecols and C engine
1 parent e04f343 commit c6c201e

File tree

4 files changed

+166
-27
lines changed

4 files changed

+166
-27
lines changed

doc/source/io.rst

+6-2
Original file line number | Diff line number | Diff line change
@@ -120,8 +120,12 @@ index_col : int or sequence or ``False``, default ``None``
120120
each line, you might consider ``index_col=False`` to force pandas to *not* use
121121
the first column as the index (row names).
122122
usecols : array-like, default ``None``
123-
Return a subset of the columns. Results in much faster parsing time and lower
124-
memory usage
123+
Return a subset of the columns. All elements in this array must either
124+
be positional (i.e. integer indices into the document columns) or strings
125+
that correspond to column names provided either by the user in `names` or
126+
inferred from the document header row(s). For example, a valid `usecols`
127+
parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
128+
results in much faster parsing time and lower memory usage.
125129
squeeze : boolean, default ``False``
126130
If the parsed data only contains one column then return a Series.
127131
prefix : str, default ``None``

doc/source/whatsnew/v0.18.1.txt

+2-1
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,7 @@ API changes
101101

102102

103103
- ``CParserError`` is now a ``ValueError`` instead of just an ``Exception`` (:issue:`12551`)
104-
104+
- ``read_csv`` no longer allows a combination of strings and integers for the ``usecols`` parameter (:issue:`12678`)
105105
- ``pd.show_versions()`` now includes ``pandas_datareader`` version (:issue:`12740`)
106106
- Provide a proper ``__name__`` and ``__qualname__`` attributes for generic functions (:issue:`12021`)
107107

@@ -211,6 +211,7 @@ Bug Fixes
211211

212212
- Bug in ``value_counts`` when ``normalize=True`` and ``dropna=True`` where nulls still contributed to the normalized count (:issue:`12558`)
213213
- Bug in ``Panel.fillna()`` ignoring ``inplace=True`` (:issue:`12633`)
214+
- Bug in ``read_csv`` when specifying ``names``, ``usecols``, and ``parse_dates`` simultaneously with the C engine (:issue:`9755`)
214215
- Bug in ``Series.rename``, ``DataFrame.rename`` and ``DataFrame.rename_axis`` not treating ``Series`` as mappings to relabel (:issue:`12623`).
215216
- Clean in ``.rolling.min`` and ``.rolling.max`` to enhance dtype handling (:issue:`12373`)
216217

pandas/io/parsers.py

+49-21
Original file line number | Diff line number | Diff line change
@@ -75,8 +75,12 @@ class ParserWarning(Warning):
7575
of each line, you might consider index_col=False to force pandas to _not_
7676
use the first column as the index (row names)
7777
usecols : array-like, default None
78-
Return a subset of the columns.
79-
Results in much faster parsing time and lower memory usage.
78+
Return a subset of the columns. All elements in this array must either
79+
be positional (i.e. integer indices into the document columns) or strings
80+
that correspond to column names provided either by the user in `names` or
81+
inferred from the document header row(s). For example, a valid `usecols`
82+
parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
83+
results in much faster parsing time and lower memory usage.
8084
squeeze : boolean, default False
8185
If the parsed data only contains one column then return a Series
8286
prefix : str, default None
@@ -801,6 +805,23 @@ def _is_index_col(col):
801805
return col is not None and col is not False
802806

803807

808+
def _validate_usecols_arg(usecols):
809+
"""
810+
Check whether or not the 'usecols' parameter
811+
contains all integers (column selection by index)
812+
or strings (column by name). Raises a ValueError
813+
if that is not the case.
814+
"""
815+
if usecols is not None:
816+
usecols_dtype = lib.infer_dtype(usecols)
817+
if usecols_dtype not in ('integer', 'string'):
818+
raise ValueError(("The elements of 'usecols' "
819+
"must either be all strings "
820+
"or all integers"))
821+
822+
return usecols
823+
824+
804825
class ParserBase(object):
805826

806827
def __init__(self, kwds):
@@ -1132,7 +1153,7 @@ def __init__(self, src, **kwds):
11321153
self._reader = _parser.TextReader(src, **kwds)
11331154

11341155
# XXX
1135-
self.usecols = self._reader.usecols
1156+
self.usecols = _validate_usecols_arg(self._reader.usecols)
11361157

11371158
passed_names = self.names is None
11381159

@@ -1157,18 +1178,21 @@ def __init__(self, src, **kwds):
11571178
else:
11581179
self.names = lrange(self._reader.table_width)
11591180

1160-
# If the names were inferred (not passed by user) and usedcols is
1161-
# defined, then ensure names refers to the used columns, not the
1162-
# document's columns.
1163-
if self.usecols and passed_names:
1164-
col_indices = []
1165-
for u in self.usecols:
1166-
if isinstance(u, string_types):
1167-
col_indices.append(self.names.index(u))
1168-
else:
1169-
col_indices.append(u)
1170-
self.names = [n for i, n in enumerate(self.names)
1171-
if i in col_indices]
1181+
# gh-9755
1182+
#
1183+
# need to set orig_names here first
1184+
# so that proper indexing can be done
1185+
# with _set_noconvert_columns
1186+
#
1187+
# once names has been filtered, we will
1188+
# then set orig_names again to names
1189+
self.orig_names = self.names[:]
1190+
1191+
if self.usecols:
1192+
if len(self.names) > len(self.usecols):
1193+
self.names = [n for i, n in enumerate(self.names)
1194+
if (i in self.usecols or n in self.usecols)]
1195+
11721196
if len(self.names) < len(self.usecols):
11731197
raise ValueError("Usecols do not match names.")
11741198

@@ -1194,13 +1218,17 @@ def __init__(self, src, **kwds):
11941218
self._implicit_index = self._reader.leading_cols > 0
11951219

11961220
def _set_noconvert_columns(self):
1197-
names = self.names
1221+
names = self.orig_names
1222+
usecols = self.usecols
11981223

11991224
def _set(x):
1200-
if com.is_integer(x):
1201-
self._reader.set_noconvert(x)
1202-
else:
1203-
self._reader.set_noconvert(names.index(x))
1225+
if usecols and com.is_integer(x):
1226+
x = list(usecols)[x]
1227+
1228+
if not com.is_integer(x):
1229+
x = names.index(x)
1230+
1231+
self._reader.set_noconvert(x)
12041232

12051233
if isinstance(self.parse_dates, list):
12061234
for val in self.parse_dates:
@@ -1472,7 +1500,7 @@ def __init__(self, f, **kwds):
14721500
self.lineterminator = kwds['lineterminator']
14731501
self.quoting = kwds['quoting']
14741502
self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
1475-
self.usecols = kwds['usecols']
1503+
self.usecols = _validate_usecols_arg(kwds['usecols'])
14761504
self.skip_blank_lines = kwds['skip_blank_lines']
14771505

14781506
self.names_passed = kwds['names'] or None

pandas/io/tests/test_parsers.py

+109-3
Original file line number | Diff line number | Diff line change
@@ -2682,12 +2682,118 @@ def test_uneven_lines_with_usecols(self):
26822682
df = self.read_csv(StringIO(csv), usecols=usecols)
26832683
tm.assert_frame_equal(df, expected)
26842684

2685-
usecols = ['a', 1]
2685+
usecols = ['a', 'b']
26862686
df = self.read_csv(StringIO(csv), usecols=usecols)
26872687
tm.assert_frame_equal(df, expected)
26882688

2689-
usecols = ['a', 'b']
2690-
df = self.read_csv(StringIO(csv), usecols=usecols)
2689+
def test_usecols_with_parse_dates(self):
2690+
# See gh-9755
2691+
s = """a,b,c,d,e
2692+
0,1,20140101,0900,4
2693+
0,1,20140102,1000,4"""
2694+
parse_dates = [[1, 2]]
2695+
2696+
cols = {
2697+
'a' : [0, 0],
2698+
'c_d': [
2699+
Timestamp('2014-01-01 09:00:00'),
2700+
Timestamp('2014-01-02 10:00:00')
2701+
]
2702+
}
2703+
expected = DataFrame(cols, columns=['c_d', 'a'])
2704+
2705+
df = self.read_csv(StringIO(s), usecols=[0, 2, 3],
2706+
parse_dates=parse_dates)
2707+
tm.assert_frame_equal(df, expected)
2708+
2709+
df = self.read_csv(StringIO(s), usecols=[3, 0, 2],
2710+
parse_dates=parse_dates)
2711+
tm.assert_frame_equal(df, expected)
2712+
2713+
def test_usecols_with_parse_dates_and_full_names(self):
2714+
# See gh-9755
2715+
s = """0,1,20140101,0900,4
2716+
0,1,20140102,1000,4"""
2717+
parse_dates = [[1, 2]]
2718+
names = list('abcde')
2719+
2720+
cols = {
2721+
'a' : [0, 0],
2722+
'c_d': [
2723+
Timestamp('2014-01-01 09:00:00'),
2724+
Timestamp('2014-01-02 10:00:00')
2725+
]
2726+
}
2727+
expected = DataFrame(cols, columns=['c_d', 'a'])
2728+
2729+
df = self.read_csv(StringIO(s), names=names,
2730+
usecols=[0, 2, 3],
2731+
parse_dates=parse_dates)
2732+
tm.assert_frame_equal(df, expected)
2733+
2734+
df = self.read_csv(StringIO(s), names=names,
2735+
usecols=[3, 0, 2],
2736+
parse_dates=parse_dates)
2737+
tm.assert_frame_equal(df, expected)
2738+
2739+
def test_usecols_with_parse_dates_and_usecol_names(self):
2740+
# See gh-9755
2741+
s = """0,1,20140101,0900,4
2742+
0,1,20140102,1000,4"""
2743+
parse_dates = [[1, 2]]
2744+
names = list('acd')
2745+
2746+
cols = {
2747+
'a' : [0, 0],
2748+
'c_d': [
2749+
Timestamp('2014-01-01 09:00:00'),
2750+
Timestamp('2014-01-02 10:00:00')
2751+
]
2752+
}
2753+
expected = DataFrame(cols, columns=['c_d', 'a'])
2754+
2755+
df = self.read_csv(StringIO(s), names=names,
2756+
usecols=[0, 2, 3],
2757+
parse_dates=parse_dates)
2758+
tm.assert_frame_equal(df, expected)
2759+
2760+
df = self.read_csv(StringIO(s), names=names,
2761+
usecols=[3, 0, 2],
2762+
parse_dates=parse_dates)
2763+
tm.assert_frame_equal(df, expected)
2764+
2765+
def test_mixed_dtype_usecols(self):
2766+
# See gh-12678
2767+
data = """a,b,c
2768+
1000,2000,3000
2769+
4000,5000,6000
2770+
"""
2771+
msg = ("The elements of \'usecols\' "
2772+
"must either be all strings "
2773+
"or all integers")
2774+
usecols = [0, 'b', 2]
2775+
2776+
with tm.assertRaisesRegexp(ValueError, msg):
2777+
df = self.read_csv(StringIO(data), usecols=usecols)
2778+
2779+
def test_usecols_with_integer_like_header(self):
2780+
data = """2,0,1
2781+
1000,2000,3000
2782+
4000,5000,6000
2783+
"""
2784+
2785+
usecols = [0, 1] # column selection by index
2786+
expected = DataFrame(data=[[1000, 2000],
2787+
[4000, 5000]],
2788+
columns=['2', '0'])
2789+
df = self.read_csv(StringIO(data), usecols=usecols)
2790+
tm.assert_frame_equal(df, expected)
2791+
2792+
usecols = ['0', '1'] # column selection by name
2793+
expected = DataFrame(data=[[2000, 3000],
2794+
[5000, 6000]],
2795+
columns=['0', '1'])
2796+
df = self.read_csv(StringIO(data), usecols=usecols)
26912797
tm.assert_frame_equal(df, expected)
26922798

26932799

0 commit comments

Comments
 (0)