BUG: Fix parse_dates processing with usecols and C engine

gfyoung · jreback · commit c6c201e27c7e · 2016-04-06T15:14:47.000-04:00
Closes #9755. Closes #12678. This PR fixes two `read_csv` bugs. First (#9755): when processing `parse_dates` with the C engine, the wrong indices (those of the filtered column names) were being used to determine which date columns should not be dtype-parsed by the C engine. The correct indices are those of the original column names, as they are used later on in the actual data processing. Second (#12678): a `usecols` argument that mixes strings and integers is now rejected with a `ValueError`. Author: gfyoung <gfyoung17@gmail.com> Closes #12512 from gfyoung/parse_dates_usecols and squashes the following commits: f0543a4 [gfyoung] BUG: Prevent mixed-typed usecols 83caa3b [gfyoung] BUG: Fix parse_dates processing with usecols and C engine
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -120,8 +120,12 @@ index_col :  int or sequence or ``False``, default ``None``
   each line, you might consider ``index_col=False`` to force pandas to *not* use
   the first column as the index (row names).
 usecols : array-like, default ``None``
-  Return a subset of the columns. Results in much faster parsing time and lower
-  memory usage
+  Return a subset of the columns. All elements in this array must either
+  be positional (i.e. integer indices into the document columns) or strings
+  that correspond to column names provided either by the user in ``names`` or
+  inferred from the document header row(s). For example, a valid ``usecols``
+  parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. Using this
+  parameter results in much faster parsing time and lower memory usage.
 squeeze : boolean, default ``False``
   If the parsed data only contains one column then return a Series.
 prefix : str, default ``None``
diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
@@ -101,7 +101,7 @@ API changes
 
 
 - ``CParserError`` is now a ``ValueError`` instead of just an ``Exception`` (:issue:`12551`)
-
+- ``read_csv`` no longer allows a combination of strings and integers for the ``usecols`` parameter (:issue:`12678`)
 - ``pd.show_versions()`` now includes ``pandas_datareader`` version (:issue:`12740`)
 - Provide a proper ``__name__`` and ``__qualname__`` attributes for generic functions (:issue:`12021`)
 
@@ -211,6 +211,7 @@ Bug Fixes
 
 - Bug in ``value_counts`` when ``normalize=True`` and ``dropna=True`` where nulls still contributed to the normalized count (:issue:`12558`)
 - Bug in ``Panel.fillna()`` ignoring ``inplace=True`` (:issue:`12633`)
+- Bug in ``read_csv`` when specifying ``names``, ``usecols``, and ``parse_dates`` simultaneously with the C engine (:issue:`9755`)
 - Bug in ``Series.rename``, ``DataFrame.rename`` and ``DataFrame.rename_axis`` not treating ``Series`` as mappings to relabel (:issue:`12623`).
 - Clean in ``.rolling.min`` and ``.rolling.max`` to enhance dtype handling (:issue:`12373`)
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -75,8 +75,12 @@ class ParserWarning(Warning):
     of each line, you might consider index_col=False to force pandas to _not_
     use the first column as the index (row names)
 usecols : array-like, default None
-    Return a subset of the columns.
-    Results in much faster parsing time and lower memory usage.
+    Return a subset of the columns. All elements in this array must either
+    be positional (i.e. integer indices into the document columns) or strings
+    that correspond to column names provided either by the user in `names` or
+    inferred from the document header row(s). For example, a valid `usecols`
+    parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
+    results in much faster parsing time and lower memory usage.
 squeeze : boolean, default False
     If the parsed data only contains one column then return a Series
 prefix : str, default None
@@ -801,6 +805,23 @@ def _is_index_col(col):
     return col is not None and col is not False
 
 
+def _validate_usecols_arg(usecols):
+    """
+    Check whether or not the 'usecols' parameter
+    contains all integers (column selection by index)
+    or strings (column by name). Raises a ValueError
+    if that is not the case.
+    """
+    if usecols is not None:
+        usecols_dtype = lib.infer_dtype(usecols)
+        if usecols_dtype not in ('integer', 'string'):
+            raise ValueError(("The elements of 'usecols' "
+                              "must either be all strings "
+                              "or all integers"))
+
+    return usecols
+
+
 class ParserBase(object):
 
     def __init__(self, kwds):
@@ -1132,7 +1153,7 @@ def __init__(self, src, **kwds):
         self._reader = _parser.TextReader(src, **kwds)
 
         # XXX
-        self.usecols = self._reader.usecols
+        self.usecols = _validate_usecols_arg(self._reader.usecols)
 
         passed_names = self.names is None
 
@@ -1157,18 +1178,21 @@ def __init__(self, src, **kwds):
             else:
                 self.names = lrange(self._reader.table_width)
 
-        # If the names were inferred (not passed by user) and usedcols is
-        # defined, then ensure names refers to the used columns, not the
-        # document's columns.
-        if self.usecols and passed_names:
-            col_indices = []
-            for u in self.usecols:
-                if isinstance(u, string_types):
-                    col_indices.append(self.names.index(u))
-                else:
-                    col_indices.append(u)
-            self.names = [n for i, n in enumerate(self.names)
-                          if i in col_indices]
+        # gh-9755
+        #
+        # need to set orig_names here first
+        # so that proper indexing can be done
+        # with _set_noconvert_columns
+        #
+        # once names has been filtered, we will
+        # then set orig_names again to names
+        self.orig_names = self.names[:]
+
+        if self.usecols:
+            if len(self.names) > len(self.usecols):
+                self.names = [n for i, n in enumerate(self.names)
+                              if (i in self.usecols or n in self.usecols)]
+
             if len(self.names) < len(self.usecols):
                 raise ValueError("Usecols do not match names.")
 
@@ -1194,13 +1218,17 @@ def __init__(self, src, **kwds):
         self._implicit_index = self._reader.leading_cols > 0
 
     def _set_noconvert_columns(self):
-        names = self.names
+        names = self.orig_names
+        usecols = self.usecols
 
         def _set(x):
-            if com.is_integer(x):
-                self._reader.set_noconvert(x)
-            else:
-                self._reader.set_noconvert(names.index(x))
+            if usecols and com.is_integer(x):
+                x = list(usecols)[x]
+
+            if not com.is_integer(x):
+                x = names.index(x)
+
+            self._reader.set_noconvert(x)
 
         if isinstance(self.parse_dates, list):
             for val in self.parse_dates:
@@ -1472,7 +1500,7 @@ def __init__(self, f, **kwds):
         self.lineterminator = kwds['lineterminator']
         self.quoting = kwds['quoting']
         self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
-        self.usecols = kwds['usecols']
+        self.usecols = _validate_usecols_arg(kwds['usecols'])
         self.skip_blank_lines = kwds['skip_blank_lines']
 
         self.names_passed = kwds['names'] or None
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -2682,12 +2682,118 @@ def test_uneven_lines_with_usecols(self):
         df = self.read_csv(StringIO(csv), usecols=usecols)
         tm.assert_frame_equal(df, expected)
 
-        usecols = ['a', 1]
+        usecols = ['a', 'b']
         df = self.read_csv(StringIO(csv), usecols=usecols)
         tm.assert_frame_equal(df, expected)
 
-        usecols = ['a', 'b']
-        df = self.read_csv(StringIO(csv), usecols=usecols)
+    def test_usecols_with_parse_dates(self):
+        # See gh-9755
+        s = """a,b,c,d,e
+        0,1,20140101,0900,4
+        0,1,20140102,1000,4"""
+        parse_dates = [[1, 2]]
+
+        cols = {
+            'a'  : [0, 0],
+            'c_d': [
+                Timestamp('2014-01-01 09:00:00'),
+                Timestamp('2014-01-02 10:00:00')
+            ]
+        }
+        expected = DataFrame(cols, columns=['c_d', 'a'])
+
+        df = self.read_csv(StringIO(s), usecols=[0, 2, 3],
+                           parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+        df = self.read_csv(StringIO(s), usecols=[3, 0, 2],
+                           parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+    def test_usecols_with_parse_dates_and_full_names(self):
+        # See gh-9755
+        s = """0,1,20140101,0900,4
+        0,1,20140102,1000,4"""
+        parse_dates = [[1, 2]]
+        names = list('abcde')
+
+        cols = {
+            'a'  : [0, 0],
+            'c_d': [
+                Timestamp('2014-01-01 09:00:00'),
+                Timestamp('2014-01-02 10:00:00')
+            ]
+        }
+        expected = DataFrame(cols, columns=['c_d', 'a'])
+
+        df = self.read_csv(StringIO(s), names=names,
+                           usecols=[0, 2, 3],
+                           parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+        df = self.read_csv(StringIO(s), names=names,
+                           usecols=[3, 0, 2],
+                           parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+    def test_usecols_with_parse_dates_and_usecol_names(self):
+        # See gh-9755
+        s = """0,1,20140101,0900,4
+        0,1,20140102,1000,4"""
+        parse_dates = [[1, 2]]
+        names = list('acd')
+
+        cols = {
+            'a'  : [0, 0],
+            'c_d': [
+                Timestamp('2014-01-01 09:00:00'),
+                Timestamp('2014-01-02 10:00:00')
+            ]
+        }
+        expected = DataFrame(cols, columns=['c_d', 'a'])
+
+        df = self.read_csv(StringIO(s), names=names,
+                           usecols=[0, 2, 3],
+                           parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+        df = self.read_csv(StringIO(s), names=names,
+                           usecols=[3, 0, 2],
+                           parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+    def test_mixed_dtype_usecols(self):
+        # See gh-12678
+        data = """a,b,c
+        1000,2000,3000
+        4000,5000,6000
+        """
+        msg = ("The elements of \'usecols\' "
+               "must either be all strings "
+               "or all integers")
+        usecols = [0, 'b', 2]
+
+        with tm.assertRaisesRegexp(ValueError, msg):
+            df = self.read_csv(StringIO(data), usecols=usecols)
+
+    def test_usecols_with_integer_like_header(self):
+        data = """2,0,1
+        1000,2000,3000
+        4000,5000,6000
+        """
+
+        usecols = [0, 1]  # column selection by index
+        expected = DataFrame(data=[[1000, 2000],
+                                   [4000, 5000]],
+                             columns=['2', '0'])
+        df = self.read_csv(StringIO(data), usecols=usecols)
+        tm.assert_frame_equal(df, expected)
+
+        usecols = ['0', '1']  # column selection by name
+        expected = DataFrame(data=[[2000, 3000],
+                                   [5000, 6000]],
+                             columns=['0', '1'])
+        df = self.read_csv(StringIO(data), usecols=usecols)
         tm.assert_frame_equal(df, expected)