BUG: usecols kwarg accepts string when it should only allow list-like or callable. (#20558)

minggli · jreback · commit 6a22cf786e9e · 2018-04-01T09:42:25.000-04:00
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -130,11 +130,11 @@ index_col :  int or sequence or ``False``, default ``None``
   MultiIndex is used. If you have a malformed file with delimiters at the end of
   each line, you might consider ``index_col=False`` to force pandas to *not* use
   the first column as the index (row names).
-usecols : array-like or callable, default ``None``
-  Return a subset of the columns. If array-like, all elements must either
+usecols : list-like or callable, default ``None``
+  Return a subset of the columns. If list-like, all elements must either
   be positional (i.e. integer indices into the document columns) or strings
   that correspond to column names provided either by the user in `names` or
-  inferred from the document header row(s). For example, a valid array-like
+  inferred from the document header row(s). For example, a valid list-like
   `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
 
   Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. To
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -1095,6 +1095,7 @@ I/O
 - Bug in :meth:`pandas.io.stata.StataReader.value_labels` raising an ``AttributeError`` when called on very old files. Now returns an empty dict (:issue:`19417`)
 - Bug in :func:`read_pickle` when unpickling objects with :class:`TimedeltaIndex` or :class:`Float64Index` created with pandas prior to version 0.20 (:issue:`19939`)
 - Bug in :meth:`pandas.io.json.json_normalize` where subrecords are not properly normalized if any subrecords values are NoneType (:issue:`20030`)
+- Bug in ``usecols`` parameter in :func:`pandas.io.read_csv` and :func:`pandas.io.read_table` where error is not raised correctly when passing a string. (:issue:`20529`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -445,10 +445,9 @@ cdef class TextReader:
         # suboptimal
         if usecols is not None:
             self.has_usecols = 1
-            if callable(usecols):
-                self.usecols = usecols
-            else:
-                self.usecols = set(usecols)
+            # GH-20558, validate usecols at higher level and only pass clean
+            # usecols into TextReader.
+            self.usecols = usecols
 
         # XXX
         if skipfooter > 0:
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -97,11 +97,11 @@
     MultiIndex is used. If you have a malformed file with delimiters at the end
     of each line, you might consider index_col=False to force pandas to _not_
     use the first column as the index (row names)
-usecols : array-like or callable, default None
-    Return a subset of the columns. If array-like, all elements must either
+usecols : list-like or callable, default None
+    Return a subset of the columns. If list-like, all elements must either
     be positional (i.e. integer indices into the document columns) or strings
     that correspond to column names provided either by the user in `names` or
-    inferred from the document header row(s). For example, a valid array-like
+    inferred from the document header row(s). For example, a valid list-like
     `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Element
     order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
     To instantiate a DataFrame from ``data`` with element order preserved use
@@ -1177,7 +1177,7 @@ def _validate_usecols_arg(usecols):
 
     Parameters
     ----------
-    usecols : array-like, callable, or None
+    usecols : list-like, callable, or None
         List of columns to use when parsing or a callable that can be used
         to filter a list of table columns.
 
@@ -1192,17 +1192,19 @@ def _validate_usecols_arg(usecols):
         'usecols_dtype` is the inferred dtype of 'usecols' if an array-like
         is passed in or None if a callable or None is passed in.
     """
-    msg = ("'usecols' must either be all strings, all unicode, "
-           "all integers or a callable")
-
+    msg = ("'usecols' must either be list-like of all strings, all unicode, "
+           "all integers or a callable.")
     if usecols is not None:
         if callable(usecols):
             return usecols, None
-        usecols_dtype = lib.infer_dtype(usecols)
-        if usecols_dtype not in ('empty', 'integer',
-                                 'string', 'unicode'):
+        # GH20529, ensure is iterable container but not string.
+        elif not is_list_like(usecols):
             raise ValueError(msg)
-
+        else:
+            usecols_dtype = lib.infer_dtype(usecols)
+            if usecols_dtype not in ('empty', 'integer',
+                                     'string', 'unicode'):
+                raise ValueError(msg)
         return set(usecols), usecols_dtype
     return usecols, None
 
@@ -1697,11 +1699,12 @@ def __init__(self, src, **kwds):
         # #2442
         kwds['allow_leading_cols'] = self.index_col is not False
 
-        self._reader = parsers.TextReader(src, **kwds)
-
-        # XXX
+        # GH20529, validate usecol arg before TextReader
         self.usecols, self.usecols_dtype = _validate_usecols_arg(
-            self._reader.usecols)
+            kwds['usecols'])
+        kwds['usecols'] = self.usecols
+
+        self._reader = parsers.TextReader(src, **kwds)
 
         passed_names = self.names is None
 
diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py
@@ -16,6 +16,11 @@
 
 
 class UsecolsTests(object):
+    msg_validate_usecols_arg = ("'usecols' must either be list-like of all "
+                                "strings, all unicode, all integers or a "
+                                "callable.")
+    msg_validate_usecols_names = ("Usecols do not match columns, columns "
+                                  "expected but not found: {0}")
 
     def test_raise_on_mixed_dtype_usecols(self):
         # See gh-12678
@@ -24,11 +29,9 @@ def test_raise_on_mixed_dtype_usecols(self):
         4000,5000,6000
         """
 
-        msg = ("'usecols' must either be all strings, all unicode, "
-               "all integers or a callable")
         usecols = [0, 'b', 2]
 
-        with tm.assert_raises_regex(ValueError, msg):
+        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
             self.read_csv(StringIO(data), usecols=usecols)
 
     def test_usecols(self):
@@ -85,6 +88,18 @@ def test_usecols(self):
         pytest.raises(ValueError, self.read_csv, StringIO(data),
                       names=['a', 'b'], usecols=[1], header=None)
 
+    def test_usecols_single_string(self):
+        # GH 20558
+        data = """foo, bar, baz
+        1000, 2000, 3000
+        4000, 5000, 6000
+        """
+
+        usecols = 'foo'
+
+        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
+            self.read_csv(StringIO(data), usecols=usecols)
+
     def test_usecols_index_col_False(self):
         # see gh-9082
         s = "a,b,c,d\n1,2,3,4\n5,6,7,8"
@@ -348,13 +363,10 @@ def test_usecols_with_mixed_encoding_strings(self):
         3.568935038,7,False,a
         '''
 
-        msg = ("'usecols' must either be all strings, all unicode, "
-               "all integers or a callable")
-
-        with tm.assert_raises_regex(ValueError, msg):
+        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
             self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB'])
 
-        with tm.assert_raises_regex(ValueError, msg):
+        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
             self.read_csv(StringIO(s), usecols=[b'AAA', u'BBB'])
 
     def test_usecols_with_multibyte_characters(self):
@@ -480,30 +492,28 @@ def test_raise_on_usecols_names_mismatch(self):
         # GH 14671
         data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'
 
-        msg = (
-            "Usecols do not match columns, "
-            "columns expected but not found: {missing}"
-        )
-
         usecols = ['a', 'b', 'c', 'd']
         df = self.read_csv(StringIO(data), usecols=usecols)
         expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7],
                               'd': [4, 8]})
         tm.assert_frame_equal(df, expected)
 
         usecols = ['a', 'b', 'c', 'f']
-        with tm.assert_raises_regex(
-                ValueError, msg.format(missing=r"\['f'\]")):
+        with tm.assert_raises_regex(ValueError,
+                                    self.msg_validate_usecols_names.format(
+                                        r"\['f'\]")):
             self.read_csv(StringIO(data), usecols=usecols)
 
         usecols = ['a', 'b', 'f']
-        with tm.assert_raises_regex(
-                ValueError, msg.format(missing=r"\['f'\]")):
+        with tm.assert_raises_regex(ValueError,
+                                    self.msg_validate_usecols_names.format(
+                                        r"\['f'\]")):
             self.read_csv(StringIO(data), usecols=usecols)
 
         usecols = ['a', 'b', 'f', 'g']
-        with tm.assert_raises_regex(
-                ValueError, msg.format(missing=r"\[('f', 'g'|'g', 'f')\]")):
+        with tm.assert_raises_regex(ValueError,
+                                    self.msg_validate_usecols_names.format(
+                                        r"\[('f', 'g'|'g', 'f')\]")):
             self.read_csv(StringIO(data), usecols=usecols)
 
         names = ['A', 'B', 'C', 'D']
@@ -527,11 +537,13 @@ def test_raise_on_usecols_names_mismatch(self):
         # tm.assert_frame_equal(df, expected)
 
         usecols = ['A', 'B', 'C', 'f']
-        with tm.assert_raises_regex(
-                ValueError, msg.format(missing=r"\['f'\]")):
+        with tm.assert_raises_regex(ValueError,
+                                    self.msg_validate_usecols_names.format(
+                                        r"\['f'\]")):
             self.read_csv(StringIO(data), header=0, names=names,
                           usecols=usecols)
         usecols = ['A', 'B', 'f']
-        with tm.assert_raises_regex(
-                ValueError, msg.format(missing=r"\['f'\]")):
+        with tm.assert_raises_regex(ValueError,
+                                    self.msg_validate_usecols_names.format(
+                                        r"\['f'\]")):
             self.read_csv(StringIO(data), names=names, usecols=usecols)