From 82cf55b1bd1ba8ec4f29a36458bd75aa361810e3 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 25 Dec 2016 11:41:22 -0800 Subject: [PATCH] BUG: Avoid flaky usecols set in C engine Closes gh-14792. --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/io/parsers.py | 60 ++++++++++++++++++++++++------- pandas/io/tests/parser/usecols.py | 25 +++++++++++++ 3 files changed, 74 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 0873e4b34b0b1..1341ce2710f57 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -321,3 +321,4 @@ Bug Fixes - Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`) - Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`) - Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`) +- Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index fdd753d1870b9..2332a9ade93ff 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -987,24 +987,42 @@ def _evaluate_usecols(usecols, names): def _validate_usecols_arg(usecols): """ - Check whether or not the 'usecols' parameter - contains all integers (column selection by index), - strings (column by name) or is a callable. Raises - a ValueError if that is not the case. + Validate the 'usecols' parameter. + + Checks whether or not the 'usecols' parameter contains all integers + (column selection by index), strings (column by name) or is a callable. + Raises a ValueError if that is not the case. + + Parameters + ---------- + usecols : array-like, callable, or None + List of columns to use when parsing or a callable that can be used + to filter a list of table columns. 
+ Returns + ------- + usecols_tuple : tuple + A tuple of (verified_usecols, usecols_dtype). + + 'verified_usecols' is either a set if an array-like is passed in or + 'usecols' if a callable or None is passed in. + + 'usecols_dtype' is the inferred dtype of 'usecols' if an array-like + is passed in or None if a callable or None is passed in. """ msg = ("'usecols' must either be all strings, all unicode, " "all integers or a callable") if usecols is not None: if callable(usecols): - return usecols + return usecols, None usecols_dtype = lib.infer_dtype(usecols) if usecols_dtype not in ('empty', 'integer', 'string', 'unicode'): raise ValueError(msg) - return set(usecols) - return usecols + return set(usecols), usecols_dtype + return usecols, None def _validate_parse_dates_arg(parse_dates): @@ -1473,7 +1491,8 @@ def __init__(self, src, **kwds): self._reader = _parser.TextReader(src, **kwds) # XXX - self.usecols = _validate_usecols_arg(self._reader.usecols) + self.usecols, self.usecols_dtype = _validate_usecols_arg( + self._reader.usecols) passed_names = self.names is None @@ -1549,12 +1568,29 @@ def close(self): pass def _set_noconvert_columns(self): + """ + Set the columns that should not undergo dtype conversions. + + Currently, any column that is involved with date parsing will not + undergo such conversions. + """ names = self.orig_names - usecols = self.usecols + if self.usecols_dtype == 'integer': + # A set of integers will be converted to a list in + # the correct order every single time. + usecols = list(self.usecols) + elif (callable(self.usecols) or + self.usecols_dtype not in ('empty', None)): + # The names attribute should have the correct columns + # in the proper order for indexing with parse_dates. + usecols = self.names[:] + else: + # Usecols is empty. 
+ usecols = None def _set(x): - if usecols and is_integer(x): - x = list(usecols)[x] + if usecols is not None and is_integer(x): + x = usecols[x] if not is_integer(x): x = names.index(x) @@ -1792,7 +1828,7 @@ def __init__(self, f, **kwds): self.skipinitialspace = kwds['skipinitialspace'] self.lineterminator = kwds['lineterminator'] self.quoting = kwds['quoting'] - self.usecols = _validate_usecols_arg(kwds['usecols']) + self.usecols, _ = _validate_usecols_arg(kwds['usecols']) self.skip_blank_lines = kwds['skip_blank_lines'] self.names_passed = kwds['names'] or None diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py index 26b4b5b8ec7d1..4fb6ff00e2d7b 100644 --- a/pandas/io/tests/parser/usecols.py +++ b/pandas/io/tests/parser/usecols.py @@ -200,6 +200,31 @@ def test_usecols_with_parse_dates(self): parse_dates=parse_dates) tm.assert_frame_equal(df, expected) + # See gh-14792 + s = """a,b,c,d,e,f,g,h,i,j + 2016/09/21,1,1,2,3,4,5,6,7,8""" + parse_dates = [0] + usecols = list('abcdefghij') + cols = {'a': Timestamp('2016-09-21'), + 'b': [1], 'c': [1], 'd': [2], + 'e': [3], 'f': [4], 'g': [5], + 'h': [6], 'i': [7], 'j': [8]} + expected = DataFrame(cols, columns=usecols) + df = self.read_csv(StringIO(s), usecols=usecols, + parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) + + s = """a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8""" + parse_dates = [[0, 1]] + usecols = list('abcdefghij') + cols = {'a_b': '2016/09/21 1', + 'c': [1], 'd': [2], 'e': [3], 'f': [4], + 'g': [5], 'h': [6], 'i': [7], 'j': [8]} + expected = DataFrame(cols, columns=['a_b'] + list('cdefghij')) + df = self.read_csv(StringIO(s), usecols=usecols, + parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) + def test_usecols_with_parse_dates_and_full_names(self): # See gh-9755 s = """0,1,20140101,0900,4