BUG: Prevent mixed-typed usecols

gfyoung · gfyoung · commit f0543a4f3785 · 2016-04-06T20:10:31.000+01:00
Enforces that 'usecols' must contain either all integers (positional indices) or all strings (column names), since a mixture of the two is ambiguous. Closes gh-12678.
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -120,8 +120,12 @@ index_col :  int or sequence or ``False``, default ``None``
   each line, you might consider ``index_col=False`` to force pandas to *not* use
   the first column as the index (row names).
 usecols : array-like, default ``None``
-  Return a subset of the columns. Results in much faster parsing time and lower
-  memory usage
+  Return a subset of the columns. All elements in this array must either
+  be positional (i.e. integer indices into the document columns) or strings
+  that correspond to column names provided either by the user in ``names`` or
+  inferred from the document header row(s). For example, a valid ``usecols``
+  parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. Using this
+  parameter results in much faster parsing time and lower memory usage.
 squeeze : boolean, default ``False``
   If the parsed data only contains one column then return a Series.
 prefix : str, default ``None``
diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
@@ -101,7 +101,7 @@ API changes
 
 
 - ``CParserError`` is now a ``ValueError`` instead of just an ``Exception`` (:issue:`12551`)
-
+- ``read_csv`` no longer allows a combination of strings and integers for the ``usecols`` parameter (:issue:`12678`)
 - ``pd.show_versions()`` now includes ``pandas_datareader`` version (:issue:`12740`)
 - Provide a proper ``__name__`` and ``__qualname__`` attributes for generic functions (:issue:`12021`)
 
@@ -211,6 +211,7 @@ Bug Fixes
 
 - Bug in ``value_counts`` when ``normalize=True`` and ``dropna=True`` where nulls still contributed to the normalized count (:issue:`12558`)
 - Bug in ``Panel.fillna()`` ignoring ``inplace=True`` (:issue:`12633`)
+- Bug in ``read_csv`` when specifying ``names``, ``usecols``, and ``parse_dates`` simultaneously with the C engine (:issue:`9755`)
 - Bug in ``Series.rename``, ``DataFrame.rename`` and ``DataFrame.rename_axis`` not treating ``Series`` as mappings to relabel (:issue:`12623`).
 - Clean in ``.rolling.min`` and ``.rolling.max`` to enhance dtype handling (:issue:`12373`)
 
@@ -236,9 +237,3 @@ Bug Fixes
 - Bug in ``.describe()`` resets categorical columns information (:issue:`11558`)
 - Bug where ``loffset`` argument was not applied when calling ``resample().count()`` on a timeseries (:issue:`12725`)
 - ``pd.read_excel()`` now accepts path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path, in line with other ``read_*`` functions (:issue:`12655`)
-
-
-
-
-
-- Bug in ``read_csv`` when specifying ``usecols`` and ``parse_dates`` simultaneously with the C engine (:issue:`9755`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -75,8 +75,12 @@ class ParserWarning(Warning):
     of each line, you might consider index_col=False to force pandas to _not_
     use the first column as the index (row names)
 usecols : array-like, default None
-    Return a subset of the columns.
-    Results in much faster parsing time and lower memory usage.
+    Return a subset of the columns. All elements in this array must either
+    be positional (i.e. integer indices into the document columns) or strings
+    that correspond to column names provided either by the user in `names` or
+    inferred from the document header row(s). For example, a valid `usecols`
+    parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
+    results in much faster parsing time and lower memory usage.
 squeeze : boolean, default False
     If the parsed data only contains one column then return a Series
 prefix : str, default None
@@ -801,6 +805,26 @@ def _is_index_col(col):
     return col is not None and col is not False
 
 
+def _validate_usecols_arg(usecols):
+    """
+    Check whether or not the 'usecols' parameter
+    contains all integers (column selection by index)
+    or strings (column by name). Raises a ValueError
+    if that is not the case.
+    """
+    # gh-12678
+    if usecols is not None:
+        usecols_dtype = lib.infer_dtype(usecols)
+        if usecols_dtype not in ('integer', 'string'):
+            raise ValueError(("The elements of 'usecols' "
+                              "must either be all strings "
+                              "or all integers"))
+
+    # validation has succeeded, so
+    # return the argument for assignment
+    return usecols
+
+
 class ParserBase(object):
 
     def __init__(self, kwds):
@@ -1132,7 +1156,7 @@ def __init__(self, src, **kwds):
         self._reader = _parser.TextReader(src, **kwds)
 
         # XXX
-        self.usecols = self._reader.usecols
+        self.usecols = _validate_usecols_arg(self._reader.usecols)
 
         passed_names = self.names is None
 
@@ -1479,7 +1503,7 @@ def __init__(self, f, **kwds):
         self.lineterminator = kwds['lineterminator']
         self.quoting = kwds['quoting']
         self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
-        self.usecols = kwds['usecols']
+        self.usecols = _validate_usecols_arg(kwds['usecols'])
         self.skip_blank_lines = kwds['skip_blank_lines']
 
         self.names_passed = kwds['names'] or None
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -2702,12 +2702,12 @@ def test_usecols_with_parse_dates(self):
         }
         expected = DataFrame(cols, columns=['c_d', 'a'])
 
-        df = read_csv(StringIO(s), usecols=[0, 2, 3],
-                      parse_dates=parse_dates)
+        df = self.read_csv(StringIO(s), usecols=[0, 2, 3],
+                           parse_dates=parse_dates)
         tm.assert_frame_equal(df, expected)
 
-        df = read_csv(StringIO(s), usecols=[3, 0, 2],
-                      parse_dates=parse_dates)
+        df = self.read_csv(StringIO(s), usecols=[3, 0, 2],
+                           parse_dates=parse_dates)
         tm.assert_frame_equal(df, expected)
 
     def test_usecols_with_parse_dates_and_full_names(self):
@@ -2726,14 +2726,14 @@ def test_usecols_with_parse_dates_and_full_names(self):
         }
         expected = DataFrame(cols, columns=['c_d', 'a'])
 
-        df = read_csv(StringIO(s), names=names,
-                      usecols=[0, 2, 3],
-                      parse_dates=parse_dates)
+        df = self.read_csv(StringIO(s), names=names,
+                           usecols=[0, 2, 3],
+                           parse_dates=parse_dates)
         tm.assert_frame_equal(df, expected)
 
-        df = read_csv(StringIO(s), names=names,
-                      usecols=[3, 0, 2],
-                      parse_dates=parse_dates)
+        df = self.read_csv(StringIO(s), names=names,
+                           usecols=[3, 0, 2],
+                           parse_dates=parse_dates)
         tm.assert_frame_equal(df, expected)
 
     def test_usecols_with_parse_dates_and_usecol_names(self):
@@ -2752,14 +2752,48 @@ def test_usecols_with_parse_dates_and_usecol_names(self):
         }
         expected = DataFrame(cols, columns=['c_d', 'a'])
 
-        df = read_csv(StringIO(s), names=names,
-                      usecols=[0, 2, 3],
-                      parse_dates=parse_dates)
+        df = self.read_csv(StringIO(s), names=names,
+                           usecols=[0, 2, 3],
+                           parse_dates=parse_dates)
         tm.assert_frame_equal(df, expected)
 
-        df = read_csv(StringIO(s), names=names,
-                      usecols=[3, 0, 2],
-                      parse_dates=parse_dates)
+        df = self.read_csv(StringIO(s), names=names,
+                           usecols=[3, 0, 2],
+                           parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+    def test_mixed_dtype_usecols(self):
+        # See gh-12678
+        data = """a,b,c
+        1000,2000,3000
+        4000,5000,6000
+        """
+        msg = ("The elements of \'usecols\' "
+               "must either be all strings "
+               "or all integers")
+        usecols = [0, 'b', 2]
+
+        with tm.assertRaisesRegexp(ValueError, msg):
+            df = self.read_csv(StringIO(data), usecols=usecols)
+
+    def test_usecols_with_integer_like_header(self):
+        data = """2,0,1
+        1000,2000,3000
+        4000,5000,6000
+        """
+
+        usecols = [0, 1]  # column selection by index
+        expected = DataFrame(data=[[1000, 2000],
+                                   [4000, 5000]],
+                             columns=['2', '0'])
+        df = self.read_csv(StringIO(data), usecols=usecols)
+        tm.assert_frame_equal(df, expected)
+
+        usecols = ['0', '1']  # column selection by name
+        expected = DataFrame(data=[[2000, 3000],
+                                   [5000, 6000]],
+                             columns=['0', '1'])
+        df = self.read_csv(StringIO(data), usecols=usecols)
         tm.assert_frame_equal(df, expected)