Skip to content

Commit 83caa3b

Browse files
committed
BUG: Fix parse_dates processing with usecols and C engine
Fixes a bug in processing 'parse_dates' with the C engine in which the wrong indices (those of the filtered column names) were used to determine which date columns the C engine should skip during dtype conversion. The correct indices are those of the original (unfiltered) column names, because those are the indices used later in the actual data processing. Closes gh-9755.
1 parent e04f343 commit 83caa3b

File tree

3 files changed

+105
-20
lines changed

3 files changed

+105
-20
lines changed

doc/source/whatsnew/v0.18.1.txt

+6
Original file line numberDiff line numberDiff line change
@@ -236,3 +236,9 @@ Bug Fixes
236236
- Bug where ``.describe()`` resets categorical column information (:issue:`11558`)
237237
- Bug where ``loffset`` argument was not applied when calling ``resample().count()`` on a timeseries (:issue:`12725`)
238238
- ``pd.read_excel()`` now accepts path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path, in line with other ``read_*`` functions (:issue:`12655`)
239+
240+
241+
242+
243+
244+
- Bug in ``read_csv`` when specifying ``usecols`` and ``parse_dates`` simultaneously with the C engine (:issue:`9755`)

pandas/io/parsers.py

+24-17
Original file line numberDiff line numberDiff line change
@@ -1157,18 +1157,21 @@ def __init__(self, src, **kwds):
11571157
else:
11581158
self.names = lrange(self._reader.table_width)
11591159

1160-
# If the names were inferred (not passed by user) and usecols is
1161-
# defined, then ensure names refers to the used columns, not the
1162-
# document's columns.
1163-
if self.usecols and passed_names:
1164-
col_indices = []
1165-
for u in self.usecols:
1166-
if isinstance(u, string_types):
1167-
col_indices.append(self.names.index(u))
1168-
else:
1169-
col_indices.append(u)
1170-
self.names = [n for i, n in enumerate(self.names)
1171-
if i in col_indices]
1160+
# gh-9755
1161+
#
1162+
# need to set orig_names here first
1163+
# so that proper indexing can be done
1164+
# with _set_noconvert_columns
1165+
#
1166+
# once names has been filtered, we will
1167+
# then set orig_names again to names
1168+
self.orig_names = self.names[:]
1169+
1170+
if self.usecols:
1171+
if len(self.names) > len(self.usecols):
1172+
self.names = [n for i, n in enumerate(self.names)
1173+
if (i in self.usecols or n in self.usecols)]
1174+
11721175
if len(self.names) < len(self.usecols):
11731176
raise ValueError("Usecols do not match names.")
11741177

@@ -1194,13 +1197,17 @@ def __init__(self, src, **kwds):
11941197
self._implicit_index = self._reader.leading_cols > 0
11951198

11961199
def _set_noconvert_columns(self):
1197-
names = self.names
1200+
names = self.orig_names
1201+
usecols = self.usecols
11981202

11991203
def _set(x):
1200-
if com.is_integer(x):
1201-
self._reader.set_noconvert(x)
1202-
else:
1203-
self._reader.set_noconvert(names.index(x))
1204+
if usecols and com.is_integer(x):
1205+
x = list(usecols)[x]
1206+
1207+
if not com.is_integer(x):
1208+
x = names.index(x)
1209+
1210+
self._reader.set_noconvert(x)
12041211

12051212
if isinstance(self.parse_dates, list):
12061213
for val in self.parse_dates:

pandas/io/tests/test_parsers.py

+75-3
Original file line numberDiff line numberDiff line change
@@ -2682,12 +2682,84 @@ def test_uneven_lines_with_usecols(self):
26822682
df = self.read_csv(StringIO(csv), usecols=usecols)
26832683
tm.assert_frame_equal(df, expected)
26842684

2685-
usecols = ['a', 1]
2685+
usecols = ['a', 'b']
26862686
df = self.read_csv(StringIO(csv), usecols=usecols)
26872687
tm.assert_frame_equal(df, expected)
26882688

2689-
usecols = ['a', 'b']
2690-
df = self.read_csv(StringIO(csv), usecols=usecols)
2689+
def test_usecols_with_parse_dates(self):
2690+
# See gh-9755
2691+
s = """a,b,c,d,e
2692+
0,1,20140101,0900,4
2693+
0,1,20140102,1000,4"""
2694+
parse_dates = [[1, 2]]
2695+
2696+
cols = {
2697+
'a' : [0, 0],
2698+
'c_d': [
2699+
Timestamp('2014-01-01 09:00:00'),
2700+
Timestamp('2014-01-02 10:00:00')
2701+
]
2702+
}
2703+
expected = DataFrame(cols, columns=['c_d', 'a'])
2704+
2705+
df = read_csv(StringIO(s), usecols=[0, 2, 3],
2706+
parse_dates=parse_dates)
2707+
tm.assert_frame_equal(df, expected)
2708+
2709+
df = read_csv(StringIO(s), usecols=[3, 0, 2],
2710+
parse_dates=parse_dates)
2711+
tm.assert_frame_equal(df, expected)
2712+
2713+
def test_usecols_with_parse_dates_and_full_names(self):
2714+
# See gh-9755
2715+
s = """0,1,20140101,0900,4
2716+
0,1,20140102,1000,4"""
2717+
parse_dates = [[1, 2]]
2718+
names = list('abcde')
2719+
2720+
cols = {
2721+
'a' : [0, 0],
2722+
'c_d': [
2723+
Timestamp('2014-01-01 09:00:00'),
2724+
Timestamp('2014-01-02 10:00:00')
2725+
]
2726+
}
2727+
expected = DataFrame(cols, columns=['c_d', 'a'])
2728+
2729+
df = read_csv(StringIO(s), names=names,
2730+
usecols=[0, 2, 3],
2731+
parse_dates=parse_dates)
2732+
tm.assert_frame_equal(df, expected)
2733+
2734+
df = read_csv(StringIO(s), names=names,
2735+
usecols=[3, 0, 2],
2736+
parse_dates=parse_dates)
2737+
tm.assert_frame_equal(df, expected)
2738+
2739+
def test_usecols_with_parse_dates_and_usecol_names(self):
2740+
# See gh-9755
2741+
s = """0,1,20140101,0900,4
2742+
0,1,20140102,1000,4"""
2743+
parse_dates = [[1, 2]]
2744+
names = list('acd')
2745+
2746+
cols = {
2747+
'a' : [0, 0],
2748+
'c_d': [
2749+
Timestamp('2014-01-01 09:00:00'),
2750+
Timestamp('2014-01-02 10:00:00')
2751+
]
2752+
}
2753+
expected = DataFrame(cols, columns=['c_d', 'a'])
2754+
2755+
df = read_csv(StringIO(s), names=names,
2756+
usecols=[0, 2, 3],
2757+
parse_dates=parse_dates)
2758+
tm.assert_frame_equal(df, expected)
2759+
2760+
df = read_csv(StringIO(s), names=names,
2761+
usecols=[3, 0, 2],
2762+
parse_dates=parse_dates)
26912763
tm.assert_frame_equal(df, expected)
26922764

26932765

0 commit comments

Comments
 (0)