Skip to content

Commit 30bae02

Browse files
committed
BUG: Avoid flaky usecols set in C engine
Closes gh-14792.
1 parent aba7d25 commit 30bae02

File tree

3 files changed

+46
-8
lines changed

3 files changed

+46
-8
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -318,3 +318,4 @@ Bug Fixes
318318
- Require at least version 0.23 of Cython to avoid problems with character encodings (:issue:`14699`)
319319
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
320320
- Bug in ``pd.pivot_table()`` where no error was raised when the ``values`` argument was not in the columns (:issue:`14938`)
321+
- Bug in ``pd.read_csv()`` for the C engine where ``usecols`` was being indexed incorrectly when combined with ``parse_dates`` (:issue:`14792`)

pandas/io/parsers.py

+20-8
Original file line numberDiff line numberDiff line change
@@ -997,14 +997,14 @@ def _validate_usecols_arg(usecols):
997997

998998
if usecols is not None:
999999
if callable(usecols):
1000-
return usecols
1000+
return usecols, None
10011001
usecols_dtype = lib.infer_dtype(usecols)
10021002
if usecols_dtype not in ('empty', 'integer',
10031003
'string', 'unicode'):
10041004
raise ValueError(msg)
10051005

1006-
return set(usecols)
1007-
return usecols
1006+
return set(usecols), usecols_dtype
1007+
return usecols, None
10081008

10091009

10101010
def _validate_parse_dates_arg(parse_dates):
@@ -1473,7 +1473,8 @@ def __init__(self, src, **kwds):
14731473
self._reader = _parser.TextReader(src, **kwds)
14741474

14751475
# XXX
1476-
self.usecols = _validate_usecols_arg(self._reader.usecols)
1476+
self.usecols, self.usecols_dtype = _validate_usecols_arg(
1477+
self._reader.usecols)
14771478

14781479
passed_names = self.names is None
14791480

@@ -1550,11 +1551,22 @@ def close(self):
15501551

15511552
def _set_noconvert_columns(self):
15521553
names = self.orig_names
1553-
usecols = self.usecols
1554+
if self.usecols_dtype == 'integer':
1555+
# A set of integers will be converted to a list in
1556+
# the correct order every single time.
1557+
usecols = list(self.usecols)
1558+
elif (callable(self.usecols) or
1559+
self.usecols_dtype not in ('empty', None)):
1560+
# The names attribute should have the correct columns
1561+
# in the proper order for indexing with parse_dates.
1562+
usecols = self.names[:]
1563+
else:
1564+
# Usecols is empty.
1565+
usecols = None
15541566

15551567
def _set(x):
1556-
if usecols and is_integer(x):
1557-
x = list(usecols)[x]
1568+
if usecols is not None and is_integer(x):
1569+
x = usecols[x]
15581570

15591571
if not is_integer(x):
15601572
x = names.index(x)
@@ -1792,7 +1804,7 @@ def __init__(self, f, **kwds):
17921804
self.skipinitialspace = kwds['skipinitialspace']
17931805
self.lineterminator = kwds['lineterminator']
17941806
self.quoting = kwds['quoting']
1795-
self.usecols = _validate_usecols_arg(kwds['usecols'])
1807+
self.usecols, _ = _validate_usecols_arg(kwds['usecols'])
17961808
self.skip_blank_lines = kwds['skip_blank_lines']
17971809

17981810
self.names_passed = kwds['names'] or None

pandas/io/tests/parser/usecols.py

+25
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,31 @@ def test_usecols_with_parse_dates(self):
200200
parse_dates=parse_dates)
201201
tm.assert_frame_equal(df, expected)
202202

203+
# See gh-14792
204+
s = """a,b,c,d,e,f,g,h,i,j
205+
2016/09/21,1,1,2,3,4,5,6,7,8"""
206+
parse_dates = [0]
207+
usecols = list('abcdefghij')
208+
cols = {'a': Timestamp('2016-09-21'),
209+
'b': [1], 'c': [1], 'd': [2],
210+
'e': [3], 'f': [4], 'g': [5],
211+
'h': [6], 'i': [7], 'j': [8]}
212+
expected = DataFrame(cols, columns=usecols)
213+
df = self.read_csv(StringIO(s), usecols=usecols,
214+
parse_dates=parse_dates)
215+
tm.assert_frame_equal(df, expected)
216+
217+
s = """a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"""
218+
parse_dates = [[0, 1]]
219+
usecols = list('abcdefghij')
220+
cols = {'a_b': '2016/09/21 1',
221+
'c': [1], 'd': [2], 'e': [3], 'f': [4],
222+
'g': [5], 'h': [6], 'i': [7], 'j': [8]}
223+
expected = DataFrame(cols, columns=['a_b'] + list('cdefghij'))
224+
df = self.read_csv(StringIO(s), usecols=usecols,
225+
parse_dates=parse_dates)
226+
tm.assert_frame_equal(df, expected)
227+
203228
def test_usecols_with_parse_dates_and_full_names(self):
204229
# See gh-9755
205230
s = """0,1,20140101,0900,4

0 commit comments

Comments
 (0)