pandas-dev · jreback · Apr 1, 2018 · Mar 30, 2018 · Mar 30, 2018 · Mar 30, 2018
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -445,10 +445,8 @@ cdef class TextReader:
         # suboptimal
         if usecols is not None:
             self.has_usecols = 1
-            if callable(usecols):
-                self.usecols = usecols
-            else:
-                self.usecols = set(usecols)
+            # GH20529, validate usecols at higher level.
+            self.usecols = usecols
 
         # XXX
         if skipfooter > 0:

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -97,11 +97,11 @@
     MultiIndex is used. If you have a malformed file with delimiters at the end
     of each line, you might consider index_col=False to force pandas to _not_
     use the first column as the index (row names)
-usecols : array-like or callable, default None
-    Return a subset of the columns. If array-like, all elements must either
+usecols : list-like or callable, default None
 def is_array_like(obj): 
 def is_array_like(obj): 
+    Return a subset of the columns. If list-like, all elements must either
     be positional (i.e. integer indices into the document columns) or strings
     that correspond to column names provided either by the user in `names` or
-    inferred from the document header row(s). For example, a valid array-like
+    inferred from the document header row(s). For example, a valid list-like
     `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Element
     order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
     To instantiate a DataFrame from ``data`` with element order preserved use
@@ -1177,7 +1177,7 @@ def _validate_usecols_arg(usecols):
 
     Parameters
     ----------
-    usecols : array-like, callable, or None
+    usecols : list-like, callable, or None
         List of columns to use when parsing or a callable that can be used
         to filter a list of table columns.
 
@@ -1192,17 +1192,19 @@ def _validate_usecols_arg(usecols):
         'usecols_dtype` is the inferred dtype of 'usecols' if an array-like
         is passed in or None if a callable or None is passed in.
     """
-    msg = ("'usecols' must either be all strings, all unicode, "
-           "all integers or a callable")
-
+    msg = ("'usecols' must either be list-like of all strings, all unicode, "
+           "all integers or a callable.")
     if usecols is not None:
         if callable(usecols):
             return usecols, None
-        usecols_dtype = lib.infer_dtype(usecols)
-        if usecols_dtype not in ('empty', 'integer',
-                                 'string', 'unicode'):
+        # GH20529, ensure is iterable container but not string.
+        elif not is_list_like(usecols):
             raise ValueError(msg)
-
+        else:
+            usecols_dtype = lib.infer_dtype(usecols)
+            if usecols_dtype not in ('empty', 'integer',
+                                     'string', 'unicode'):
+                raise ValueError(msg)
         return set(usecols), usecols_dtype
     return usecols, None
 
@@ -1697,11 +1699,12 @@ def __init__(self, src, **kwds):
         # #2442
         kwds['allow_leading_cols'] = self.index_col is not False
 
-        self._reader = parsers.TextReader(src, **kwds)
-
-        # XXX
+        # GH20529, validate usecol arg before TextReader
         self.usecols, self.usecols_dtype = _validate_usecols_arg(
-            self._reader.usecols)
+            kwds['usecols'])
+        kwds['usecols'] = self.usecols
+
+        self._reader = parsers.TextReader(src, **kwds)
 
         passed_names = self.names is None
 

diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py
@@ -16,6 +16,11 @@
 
 
 class UsecolsTests(object):
+    msg_validate_usecols_arg = ("'usecols' must either be list-like of all "
+                                "strings, all unicode, all integers or a "
+                                "callable.")
+    msg_validate_usecols_names = ("Usecols do not match columns, columns "
+                                  "expected but not found: {0}")
 
     def test_raise_on_mixed_dtype_usecols(self):
         # See gh-12678
@@ -24,11 +29,9 @@ def test_raise_on_mixed_dtype_usecols(self):
         4000,5000,6000
         """
 
-        msg = ("'usecols' must either be all strings, all unicode, "
-               "all integers or a callable")
         usecols = [0, 'b', 2]
 
-        with tm.assert_raises_regex(ValueError, msg):
+        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
             self.read_csv(StringIO(data), usecols=usecols)
 
     def test_usecols(self):
@@ -348,13 +351,10 @@ def test_usecols_with_mixed_encoding_strings(self):
         3.568935038,7,False,a
         '''
 
-        msg = ("'usecols' must either be all strings, all unicode, "
-               "all integers or a callable")
-
-        with tm.assert_raises_regex(ValueError, msg):
+        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
             self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB'])
 
-        with tm.assert_raises_regex(ValueError, msg):
+        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
             self.read_csv(StringIO(s), usecols=[b'AAA', u'BBB'])
 
     def test_usecols_with_multibyte_characters(self):
@@ -480,30 +480,28 @@ def test_raise_on_usecols_names_mismatch(self):
         # GH 14671
         data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'
 
-        msg = (
-            "Usecols do not match columns, "
-            "columns expected but not found: {missing}"
-        )
-
         usecols = ['a', 'b', 'c', 'd']
         df = self.read_csv(StringIO(data), usecols=usecols)
         expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7],
                               'd': [4, 8]})
         tm.assert_frame_equal(df, expected)
 
         usecols = ['a', 'b', 'c', 'f']
-        with tm.assert_raises_regex(
-                ValueError, msg.format(missing=r"\['f'\]")):
+        with tm.assert_raises_regex(ValueError,
+                                    self.msg_validate_usecols_names.format(
+                                        r"\['f'\]")):
             self.read_csv(StringIO(data), usecols=usecols)
 
         usecols = ['a', 'b', 'f']
-        with tm.assert_raises_regex(
-                ValueError, msg.format(missing=r"\['f'\]")):
+        with tm.assert_raises_regex(ValueError,
+                                    self.msg_validate_usecols_names.format(
+                                        r"\['f'\]")):
             self.read_csv(StringIO(data), usecols=usecols)
 
         usecols = ['a', 'b', 'f', 'g']
-        with tm.assert_raises_regex(
-                ValueError, msg.format(missing=r"\[('f', 'g'|'g', 'f')\]")):
+        with tm.assert_raises_regex(ValueError,
+                                    self.msg_validate_usecols_names.format(
+                                        r"\[('f', 'g'|'g', 'f')\]")):
             self.read_csv(StringIO(data), usecols=usecols)
 
         names = ['A', 'B', 'C', 'D']
@@ -527,11 +525,13 @@ def test_raise_on_usecols_names_mismatch(self):
         # tm.assert_frame_equal(df, expected)
 
         usecols = ['A', 'B', 'C', 'f']
-        with tm.assert_raises_regex(
-                ValueError, msg.format(missing=r"\['f'\]")):
+        with tm.assert_raises_regex(ValueError,
+                                    self.msg_validate_usecols_names.format(
+                                        r"\['f'\]")):
             self.read_csv(StringIO(data), header=0, names=names,
                           usecols=usecols)
         usecols = ['A', 'B', 'f']
-        with tm.assert_raises_regex(
-                ValueError, msg.format(missing=r"\['f'\]")):
+        with tm.assert_raises_regex(ValueError,
+                                    self.msg_validate_usecols_names.format(
+                                        r"\['f'\]")):
             self.read_csv(StringIO(data), names=names, usecols=usecols)