From b232fa708372c00c8522a7c29905eb1352901918 Mon Sep 17 00:00:00 2001
From: Ming Li <ming.george.li@gmail.com>
Date: Fri, 30 Mar 2018 16:58:00 +0100
Subject: [PATCH 01/10] pass usecols without converting to set at parser.pyx

---
 pandas/_libs/parsers.pyx | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 52ca3d1226f79..0dfc59586eaef 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -378,9 +378,7 @@ cdef class TextReader:
 
         self.compression = compression
         self.memory_map = memory_map
-
         self.parser.usecols = (usecols is not None)
-
         self._setup_parser_source(source)
         parser_set_default_options(self.parser)
 
@@ -445,10 +443,8 @@ cdef class TextReader:
         # suboptimal
         if usecols is not None:
             self.has_usecols = 1
-            if callable(usecols):
-                self.usecols = usecols
-            else:
-                self.usecols = set(usecols)
+            # GH20529, validate usecols at higher level.
+            self.usecols = usecols
 
         # XXX
         if skipfooter > 0:

From 1e23cb534e0f84a38bbe4c979fc238930eecd69a Mon Sep 17 00:00:00 2001
From: Ming Li <ming.george.li@gmail.com>
Date: Fri, 30 Mar 2018 18:21:06 +0100
Subject: [PATCH 02/10] validate if it's a list-like container and not string

---
 pandas/io/parsers.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 469cd6d82e4b4..df915637541cd 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -410,6 +410,7 @@ def _validate_names(names):
     return names
 
 
+
 def _read(filepath_or_buffer, kwds):
     """Generic reader of line files."""
     encoding = kwds.get('encoding', None)
@@ -1192,17 +1193,19 @@ def _validate_usecols_arg(usecols):
         'usecols_dtype` is the inferred dtype of 'usecols' if an array-like
         is passed in or None if a callable or None is passed in.
     """
-    msg = ("'usecols' must either be all strings, all unicode, "
+    msg = ("'usecols' must either be list-like of all strings, all unicode, "
            "all integers or a callable")
-
     if usecols is not None:
         if callable(usecols):
             return usecols, None
-        usecols_dtype = lib.infer_dtype(usecols)
-        if usecols_dtype not in ('empty', 'integer',
-                                 'string', 'unicode'):
+        # GH20529, ensure is iterable container but not string.
+        elif not is_list_like(usecols):
             raise ValueError(msg)
-
+        else:
+            usecols_dtype = lib.infer_dtype(usecols)
+            if usecols_dtype not in ('empty', 'integer',
+                                     'string', 'unicode'):
+                raise ValueError(msg)
         return set(usecols), usecols_dtype
     return usecols, None
 

From b74422bddc3764e30dd1a7d0725ff162f9216391 Mon Sep 17 00:00:00 2001
From: Ming Li <ming.george.li@gmail.com>
Date: Fri, 30 Mar 2018 21:21:51 +0100
Subject: [PATCH 03/10] refactor msg assert

---
 pandas/tests/io/parser/usecols.py | 46 +++++++++++++++----------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py
index 195fb4cba2aed..56a9b53cc9cf5 100644
--- a/pandas/tests/io/parser/usecols.py
+++ b/pandas/tests/io/parser/usecols.py
@@ -16,6 +16,11 @@
 
 
 class UsecolsTests(object):
+    msg_validate_usecols_arg = ("'usecols' must either be list-like of all "
+                                "strings, all unicode, all integers or a "
+                                "callable.")
+    msg_validate_usecols_names = ("Usecols do not match columns, columns "
+                                  "expected but not found: {0}")
 
     def test_raise_on_mixed_dtype_usecols(self):
         # See gh-12678
@@ -24,11 +29,9 @@ def test_raise_on_mixed_dtype_usecols(self):
         4000,5000,6000
         """
 
-        msg = ("'usecols' must either be all strings, all unicode, "
-               "all integers or a callable")
         usecols = [0, 'b', 2]
 
-        with tm.assert_raises_regex(ValueError, msg):
+        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
             self.read_csv(StringIO(data), usecols=usecols)
 
     def test_usecols(self):
@@ -348,13 +351,10 @@ def test_usecols_with_mixed_encoding_strings(self):
         3.568935038,7,False,a
         '''
 
-        msg = ("'usecols' must either be all strings, all unicode, "
-               "all integers or a callable")
-
-        with tm.assert_raises_regex(ValueError, msg):
+        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
             self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB'])
 
-        with tm.assert_raises_regex(ValueError, msg):
+        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
             self.read_csv(StringIO(s), usecols=[b'AAA', u'BBB'])
 
     def test_usecols_with_multibyte_characters(self):
@@ -480,11 +480,6 @@ def test_raise_on_usecols_names_mismatch(self):
         # GH 14671
         data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'
 
-        msg = (
-            "Usecols do not match columns, "
-            "columns expected but not found: {missing}"
-        )
-
         usecols = ['a', 'b', 'c', 'd']
         df = self.read_csv(StringIO(data), usecols=usecols)
         expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7],
@@ -492,18 +487,21 @@ def test_raise_on_usecols_names_mismatch(self):
         tm.assert_frame_equal(df, expected)
 
         usecols = ['a', 'b', 'c', 'f']
-        with tm.assert_raises_regex(
-                ValueError, msg.format(missing=r"\['f'\]")):
+        with tm.assert_raises_regex(ValueError,
+                                    self.msg_validate_usecols_names.format(
+                                        r"\['f'\]")):
             self.read_csv(StringIO(data), usecols=usecols)
 
         usecols = ['a', 'b', 'f']
-        with tm.assert_raises_regex(
-                ValueError, msg.format(missing=r"\['f'\]")):
+        with tm.assert_raises_regex(ValueError,
+                                    self.msg_validate_usecols_names.format(
+                                        r"\['f'\]")):
             self.read_csv(StringIO(data), usecols=usecols)
 
         usecols = ['a', 'b', 'f', 'g']
-        with tm.assert_raises_regex(
-                ValueError, msg.format(missing=r"\[('f', 'g'|'g', 'f')\]")):
+        with tm.assert_raises_regex(ValueError,
+                                    self.msg_validate_usecols_names.format(
+                                        r"\[('f', 'g'|'g', 'f')\]")):
             self.read_csv(StringIO(data), usecols=usecols)
 
         names = ['A', 'B', 'C', 'D']
@@ -527,11 +525,13 @@ def test_raise_on_usecols_names_mismatch(self):
         # tm.assert_frame_equal(df, expected)
 
         usecols = ['A', 'B', 'C', 'f']
-        with tm.assert_raises_regex(
-                ValueError, msg.format(missing=r"\['f'\]")):
+        with tm.assert_raises_regex(ValueError,
+                                    self.msg_validate_usecols_names.format(
+                                        r"\['f'\]")):
             self.read_csv(StringIO(data), header=0, names=names,
                           usecols=usecols)
         usecols = ['A', 'B', 'f']
-        with tm.assert_raises_regex(
-                ValueError, msg.format(missing=r"\['f'\]")):
+        with tm.assert_raises_regex(ValueError,
+                                    self.msg_validate_usecols_names.format(
+                                        r"\['f'\]")):
             self.read_csv(StringIO(data), names=names, usecols=usecols)

From 14b0ac0c67874eb664434f53512f33fcd79a0492 Mon Sep 17 00:00:00 2001
From: Ming Li <ming.george.li@gmail.com>
Date: Fri, 30 Mar 2018 21:39:01 +0100
Subject: [PATCH 04/10] validate usecol

---
 pandas/io/parsers.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index df915637541cd..14d85f2cbd767 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -410,7 +410,6 @@ def _validate_names(names):
     return names
 
 
-
 def _read(filepath_or_buffer, kwds):
     """Generic reader of line files."""
     encoding = kwds.get('encoding', None)
@@ -1194,7 +1193,7 @@ def _validate_usecols_arg(usecols):
         is passed in or None if a callable or None is passed in.
     """
     msg = ("'usecols' must either be list-like of all strings, all unicode, "
-           "all integers or a callable")
+           "all integers or a callable.")
     if usecols is not None:
         if callable(usecols):
             return usecols, None
@@ -1685,7 +1684,6 @@ class CParserWrapper(ParserBase):
     def __init__(self, src, **kwds):
         self.kwds = kwds
         kwds = kwds.copy()
-
         ParserBase.__init__(self, kwds)
 
         if (kwds.get('compression') is None
@@ -1700,11 +1698,12 @@ def __init__(self, src, **kwds):
         # #2442
         kwds['allow_leading_cols'] = self.index_col is not False
 
-        self._reader = parsers.TextReader(src, **kwds)
-
-        # XXX
+        # GH20529, validate usecol arg before TextReader
         self.usecols, self.usecols_dtype = _validate_usecols_arg(
-            self._reader.usecols)
+            kwds['usecols'])
+        kwds['usecols'] = self.usecols
+
+        self._reader = parsers.TextReader(src, **kwds)
 
         passed_names = self.names is None
 

From 23f3e8e124ffe948d5c5054b631e00267ae203dd Mon Sep 17 00:00:00 2001
From: Ming Li <ming.george.li@gmail.com>
Date: Fri, 30 Mar 2018 21:44:02 +0100
Subject: [PATCH 05/10] add back space

---
 pandas/_libs/parsers.pyx | 2 ++
 pandas/io/parsers.py     | 1 +
 2 files changed, 3 insertions(+)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 0dfc59586eaef..d4b95e4c7bccd 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -378,7 +378,9 @@ cdef class TextReader:
 
         self.compression = compression
         self.memory_map = memory_map
+
         self.parser.usecols = (usecols is not None)
+
         self._setup_parser_source(source)
         parser_set_default_options(self.parser)
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 14d85f2cbd767..73e14ce4ae4c7 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1684,6 +1684,7 @@ class CParserWrapper(ParserBase):
     def __init__(self, src, **kwds):
         self.kwds = kwds
         kwds = kwds.copy()
+
         ParserBase.__init__(self, kwds)
 
         if (kwds.get('compression') is None

From 2de72b17f408729f5c8e548efcf83d3db0c057ed Mon Sep 17 00:00:00 2001
From: Ming Li <ming.george.li@gmail.com>
Date: Fri, 30 Mar 2018 21:47:51 +0100
Subject: [PATCH 06/10] update parameter description to list-like

---
 pandas/io/parsers.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 73e14ce4ae4c7..780aa5d02f598 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -97,11 +97,11 @@
     MultiIndex is used. If you have a malformed file with delimiters at the end
     of each line, you might consider index_col=False to force pandas to _not_
     use the first column as the index (row names)
-usecols : array-like or callable, default None
-    Return a subset of the columns. If array-like, all elements must either
+usecols : list-like or callable, default None
+    Return a subset of the columns. If list-like, all elements must either
     be positional (i.e. integer indices into the document columns) or strings
     that correspond to column names provided either by the user in `names` or
-    inferred from the document header row(s). For example, a valid array-like
+    inferred from the document header row(s). For example, a valid list-like
     `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Element
     order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
     To instantiate a DataFrame from ``data`` with element order preserved use
@@ -1177,7 +1177,7 @@ def _validate_usecols_arg(usecols):
 
     Parameters
     ----------
-    usecols : array-like, callable, or None
+    usecols : list-like, callable, or None
         List of columns to use when parsing or a callable that can be used
         to filter a list of table columns.
 

From 8362885b5e2c4999035352064008484c47b117e4 Mon Sep 17 00:00:00 2001
From: Ming Li <ming.george.li@gmail.com>
Date: Fri, 30 Mar 2018 22:11:10 +0100
Subject: [PATCH 07/10] expand comment in parser.pyx

---
 pandas/_libs/parsers.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index d4b95e4c7bccd..a24e2cdd99f6f 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -445,7 +445,8 @@ cdef class TextReader:
         # suboptimal
         if usecols is not None:
             self.has_usecols = 1
-            # GH20529, validate usecols at higher level.
+            # GH-20558, validate usecols at higher level and only pass clean
+            # usecols into TextReader.
             self.usecols = usecols
 
         # XXX

From b177e93de0db981ba568b6fb32c1bc5c103cd6c8 Mon Sep 17 00:00:00 2001
From: Ming Li <ming.george.li@gmail.com>
Date: Fri, 30 Mar 2018 22:36:39 +0100
Subject: [PATCH 08/10] add whatsnew entry

---
 doc/source/whatsnew/v0.23.0.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index e83f149db1f18..6524012d27fc9 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -1077,6 +1077,7 @@ I/O
 - Bug in :meth:`pandas.io.stata.StataReader.value_labels` raising an ``AttributeError`` when called on very old files. Now returns an empty dict (:issue:`19417`)
 - Bug in :func:`read_pickle` when unpickling objects with :class:`TimedeltaIndex` or :class:`Float64Index` created with pandas prior to version 0.20 (:issue:`19939`)
 - Bug in :meth:`pandas.io.json.json_normalize` where subrecords are not properly normalized if any subrecords values are NoneType (:issue:`20030`)
+- Bug in ``usecols`` parameter in :func:`pandas.io.read_csv` and :func:`pandas.io.read_table` where error is not raised correctly when passing a string. (:issue:`20529`)
 
 Plotting
 ^^^^^^^^

From 86cb16bf06feb5d57376518d002e16cd433d8df5 Mon Sep 17 00:00:00 2001
From: Ming Li <ming.george.li@gmail.com>
Date: Fri, 30 Mar 2018 22:43:02 +0100
Subject: [PATCH 09/10] update parameter description

---
 doc/source/io.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index ff505f525fc22..fd998d32cfbfb 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -130,11 +130,11 @@ index_col :  int or sequence or ``False``, default ``None``
   MultiIndex is used. If you have a malformed file with delimiters at the end of
   each line, you might consider ``index_col=False`` to force pandas to *not* use
   the first column as the index (row names).
-usecols : array-like or callable, default ``None``
-  Return a subset of the columns. If array-like, all elements must either
+usecols : list-like or callable, default ``None``
+  Return a subset of the columns. If list-like, all elements must either
   be positional (i.e. integer indices into the document columns) or strings
   that correspond to column names provided either by the user in `names` or
-  inferred from the document header row(s). For example, a valid array-like
+  inferred from the document header row(s). For example, a valid list-like
   `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
 
   Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. To

From dafc44290f8f8d7a06fa28cdca77fc0a6dea2792 Mon Sep 17 00:00:00 2001
From: Ming Li <ming.george.li@gmail.com>
Date: Sat, 31 Mar 2018 20:34:04 +0100
Subject: [PATCH 10/10] add test single string on usecols

---
 pandas/tests/io/parser/usecols.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py
index 56a9b53cc9cf5..584711528e9cb 100644
--- a/pandas/tests/io/parser/usecols.py
+++ b/pandas/tests/io/parser/usecols.py
@@ -88,6 +88,18 @@ def test_usecols(self):
         pytest.raises(ValueError, self.read_csv, StringIO(data),
                       names=['a', 'b'], usecols=[1], header=None)
 
+    def test_usecols_single_string(self):
+        # GH 20558
+        data = """foo, bar, baz
+        1000, 2000, 3000
+        4000, 5000, 6000
+        """
+
+        usecols = 'foo'
+
+        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
+            self.read_csv(StringIO(data), usecols=usecols)
+
     def test_usecols_index_col_False(self):
         # see gh-9082
         s = "a,b,c,d\n1,2,3,4\n5,6,7,8"