Skip to content

Commit a42a015

Browse files
gfyoung authored and jreback committed
BUG: Avoid flaky usecols set in C engine
An explanation of the bug can be found in #14792 (comment). Closes #14792. Author: gfyoung <[email protected]>. Closes #14984 from gfyoung/usecols-parse-dates-list and squashes the following commits: 82cf55b [gfyoung] BUG: Avoid flaky usecols set in C engine
1 parent 298b241 commit a42a015

File tree

3 files changed

+74
-12
lines changed

3 files changed

+74
-12
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -323,3 +323,4 @@ Bug Fixes
323323
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
324324
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`, :issue:`14982`)
325325
- Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`)
326+
- Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`)

pandas/io/parsers.py

+48-12
Original file line numberDiff line numberDiff line change
@@ -987,24 +987,42 @@ def _evaluate_usecols(usecols, names):
987987

988988
def _validate_usecols_arg(usecols):
989989
"""
990-
Check whether or not the 'usecols' parameter
991-
contains all integers (column selection by index),
992-
strings (column by name) or is a callable. Raises
993-
a ValueError if that is not the case.
990+
Validate the 'usecols' parameter.
991+
992+
Checks whether or not the 'usecols' parameter contains all integers
993+
(column selection by index), strings (column by name) or is a callable.
994+
Raises a ValueError if that is not the case.
995+
996+
Parameters
997+
----------
998+
usecols : array-like, callable, or None
999+
List of columns to use when parsing or a callable that can be used
1000+
to filter a list of table columns.
1001+
1002+
Returns
1003+
-------
1004+
usecols_tuple : tuple
1005+
A tuple of (verified_usecols, usecols_dtype).
1006+
1007+
'verified_usecols' is either a set if an array-like is passed in or
1008+
'usecols' if a callable or None is passed in.
1009+
1010+
'usecols_dtype' is the inferred dtype of 'usecols' if an array-like
1011+
is passed in or None if a callable or None is passed in.
9941012
"""
9951013
msg = ("'usecols' must either be all strings, all unicode, "
9961014
"all integers or a callable")
9971015

9981016
if usecols is not None:
9991017
if callable(usecols):
1000-
return usecols
1018+
return usecols, None
10011019
usecols_dtype = lib.infer_dtype(usecols)
10021020
if usecols_dtype not in ('empty', 'integer',
10031021
'string', 'unicode'):
10041022
raise ValueError(msg)
10051023

1006-
return set(usecols)
1007-
return usecols
1024+
return set(usecols), usecols_dtype
1025+
return usecols, None
10081026

10091027

10101028
def _validate_parse_dates_arg(parse_dates):
@@ -1473,7 +1491,8 @@ def __init__(self, src, **kwds):
14731491
self._reader = _parser.TextReader(src, **kwds)
14741492

14751493
# XXX
1476-
self.usecols = _validate_usecols_arg(self._reader.usecols)
1494+
self.usecols, self.usecols_dtype = _validate_usecols_arg(
1495+
self._reader.usecols)
14771496

14781497
passed_names = self.names is None
14791498

@@ -1549,12 +1568,29 @@ def close(self):
15491568
pass
15501569

15511570
def _set_noconvert_columns(self):
1571+
"""
1572+
Set the columns that should not undergo dtype conversions.
1573+
1574+
Currently, any column that is involved with date parsing will not
1575+
undergo such conversions.
1576+
"""
15521577
names = self.orig_names
1553-
usecols = self.usecols
1578+
if self.usecols_dtype == 'integer':
1579+
# A set of integers is converted to a list here. NOTE: set iteration
1580+
# order is not guaranteed to be ascending; sorting would make it explicit.
1581+
usecols = list(self.usecols)
1582+
elif (callable(self.usecols) or
1583+
self.usecols_dtype not in ('empty', None)):
1584+
# The names attribute should have the correct columns
1585+
# in the proper order for indexing with parse_dates.
1586+
usecols = self.names[:]
1587+
else:
1588+
# Usecols is empty.
1589+
usecols = None
15541590

15551591
def _set(x):
1556-
if usecols and is_integer(x):
1557-
x = list(usecols)[x]
1592+
if usecols is not None and is_integer(x):
1593+
x = usecols[x]
15581594

15591595
if not is_integer(x):
15601596
x = names.index(x)
@@ -1792,7 +1828,7 @@ def __init__(self, f, **kwds):
17921828
self.skipinitialspace = kwds['skipinitialspace']
17931829
self.lineterminator = kwds['lineterminator']
17941830
self.quoting = kwds['quoting']
1795-
self.usecols = _validate_usecols_arg(kwds['usecols'])
1831+
self.usecols, _ = _validate_usecols_arg(kwds['usecols'])
17961832
self.skip_blank_lines = kwds['skip_blank_lines']
17971833

17981834
self.names_passed = kwds['names'] or None

pandas/io/tests/parser/usecols.py

+25
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,31 @@ def test_usecols_with_parse_dates(self):
200200
parse_dates=parse_dates)
201201
tm.assert_frame_equal(df, expected)
202202

203+
# See gh-14792
204+
s = """a,b,c,d,e,f,g,h,i,j
205+
2016/09/21,1,1,2,3,4,5,6,7,8"""
206+
parse_dates = [0]
207+
usecols = list('abcdefghij')
208+
cols = {'a': Timestamp('2016-09-21'),
209+
'b': [1], 'c': [1], 'd': [2],
210+
'e': [3], 'f': [4], 'g': [5],
211+
'h': [6], 'i': [7], 'j': [8]}
212+
expected = DataFrame(cols, columns=usecols)
213+
df = self.read_csv(StringIO(s), usecols=usecols,
214+
parse_dates=parse_dates)
215+
tm.assert_frame_equal(df, expected)
216+
217+
s = """a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"""
218+
parse_dates = [[0, 1]]
219+
usecols = list('abcdefghij')
220+
cols = {'a_b': '2016/09/21 1',
221+
'c': [1], 'd': [2], 'e': [3], 'f': [4],
222+
'g': [5], 'h': [6], 'i': [7], 'j': [8]}
223+
expected = DataFrame(cols, columns=['a_b'] + list('cdefghij'))
224+
df = self.read_csv(StringIO(s), usecols=usecols,
225+
parse_dates=parse_dates)
226+
tm.assert_frame_equal(df, expected)
227+
203228
def test_usecols_with_parse_dates_and_full_names(self):
204229
# See gh-9755
205230
s = """0,1,20140101,0900,4

0 commit comments

Comments
 (0)