Skip to content

Commit 6ac5814

Browse files
committed
BUG: Avoid flaky usecols set in C engine
Closes gh-14792.
1 parent 72786cc commit 6ac5814

File tree

3 files changed

+68
-12
lines changed

3 files changed

+68
-12
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -318,3 +318,4 @@ Bug Fixes
318318
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
319319
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
320320
- Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`)
321+
- Bug in ``pd.read_csv()`` for the C engine where integer ``parse_dates`` indices could select the wrong columns because ``usecols`` was being indexed in arbitrary set order (:issue:`14792`)

pandas/io/parsers.py

+42-12
Original file line numberDiff line numberDiff line change
@@ -987,24 +987,42 @@ def _evaluate_usecols(usecols, names):
987987

988988
def _validate_usecols_arg(usecols):
989989
"""
990-
Check whether or not the 'usecols' parameter
991-
contains all integers (column selection by index),
992-
strings (column by name) or is a callable. Raises
993-
a ValueError if that is not the case.
990+
Validate the 'usecols' parameter.
991+
992+
Checks whether or not the 'usecols' parameter contains all integers
993+
(column selection by index), strings (column by name) or is a callable.
994+
Raises a ValueError if that is not the case.
995+
996+
Parameters
997+
----------
998+
usecols : array-like, callable, or None
999+
List of columns to use when parsing or a callable that can be used
1000+
to filter a list of table columns.
1001+
1002+
Returns
1003+
-------
1004+
usecols_tuple : tuple
1005+
A tuple of (verified_usecols, usecols_dtype).
1006+
1007+
'verified_usecols' is either a set if an array-like is passed in or
1008+
'usecols' if a callable or None is passed in.
1009+
1010+
'usecols_dtype' is the inferred dtype of 'usecols' if an array-like
1011+
is passed in or None if a callable or None is passed in.
9941012
"""
9951013
msg = ("'usecols' must either be all strings, all unicode, "
9961014
"all integers or a callable")
9971015

9981016
if usecols is not None:
9991017
if callable(usecols):
1000-
return usecols
1018+
return usecols, None
10011019
usecols_dtype = lib.infer_dtype(usecols)
10021020
if usecols_dtype not in ('empty', 'integer',
10031021
'string', 'unicode'):
10041022
raise ValueError(msg)
10051023

1006-
return set(usecols)
1007-
return usecols
1024+
return set(usecols), usecols_dtype
1025+
return usecols, None
10081026

10091027

10101028
def _validate_parse_dates_arg(parse_dates):
@@ -1473,7 +1491,8 @@ def __init__(self, src, **kwds):
14731491
self._reader = _parser.TextReader(src, **kwds)
14741492

14751493
# XXX
1476-
self.usecols = _validate_usecols_arg(self._reader.usecols)
1494+
self.usecols, self.usecols_dtype = _validate_usecols_arg(
1495+
self._reader.usecols)
14771496

14781497
passed_names = self.names is None
14791498

@@ -1550,11 +1569,22 @@ def close(self):
15501569

15511570
def _set_noconvert_columns(self):
15521571
names = self.orig_names
1553-
usecols = self.usecols
1572+
if self.usecols_dtype == 'integer':
1573+
# A set of integers will be converted to a list in
1574+
# the correct order every single time.
1575+
usecols = list(self.usecols)
1576+
elif (callable(self.usecols) or
1577+
self.usecols_dtype not in ('empty', None)):
1578+
# The names attribute should have the correct columns
1579+
# in the proper order for indexing with parse_dates.
1580+
usecols = self.names[:]
1581+
else:
1582+
# Usecols is empty.
1583+
usecols = None
15541584

15551585
def _set(x):
1556-
if usecols and is_integer(x):
1557-
x = list(usecols)[x]
1586+
if usecols is not None and is_integer(x):
1587+
x = usecols[x]
15581588

15591589
if not is_integer(x):
15601590
x = names.index(x)
@@ -1792,7 +1822,7 @@ def __init__(self, f, **kwds):
17921822
self.skipinitialspace = kwds['skipinitialspace']
17931823
self.lineterminator = kwds['lineterminator']
17941824
self.quoting = kwds['quoting']
1795-
self.usecols = _validate_usecols_arg(kwds['usecols'])
1825+
self.usecols, _ = _validate_usecols_arg(kwds['usecols'])
17961826
self.skip_blank_lines = kwds['skip_blank_lines']
17971827

17981828
self.names_passed = kwds['names'] or None

pandas/io/tests/parser/usecols.py

+25
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,31 @@ def test_usecols_with_parse_dates(self):
200200
parse_dates=parse_dates)
201201
tm.assert_frame_equal(df, expected)
202202

203+
# See gh-14792
204+
s = """a,b,c,d,e,f,g,h,i,j
205+
2016/09/21,1,1,2,3,4,5,6,7,8"""
206+
parse_dates = [0]
207+
usecols = list('abcdefghij')
208+
cols = {'a': Timestamp('2016-09-21'),
209+
'b': [1], 'c': [1], 'd': [2],
210+
'e': [3], 'f': [4], 'g': [5],
211+
'h': [6], 'i': [7], 'j': [8]}
212+
expected = DataFrame(cols, columns=usecols)
213+
df = self.read_csv(StringIO(s), usecols=usecols,
214+
parse_dates=parse_dates)
215+
tm.assert_frame_equal(df, expected)
216+
217+
s = """a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"""
218+
parse_dates = [[0, 1]]
219+
usecols = list('abcdefghij')
220+
cols = {'a_b': '2016/09/21 1',
221+
'c': [1], 'd': [2], 'e': [3], 'f': [4],
222+
'g': [5], 'h': [6], 'i': [7], 'j': [8]}
223+
expected = DataFrame(cols, columns=['a_b'] + list('cdefghij'))
224+
df = self.read_csv(StringIO(s), usecols=usecols,
225+
parse_dates=parse_dates)
226+
tm.assert_frame_equal(df, expected)
227+
203228
def test_usecols_with_parse_dates_and_full_names(self):
204229
# See gh-9755
205230
s = """0,1,20140101,0900,4

0 commit comments

Comments
 (0)