From b0e102a5e1406e023329c31190771e40c6a13156 Mon Sep 17 00:00:00 2001
From: Aaron Critchley <aaron.critchley@gmail.com>
Date: Tue, 22 Aug 2017 14:06:18 +0100
Subject: [PATCH 01/10] ENH: better error message if usecols doesn't match

GH17301: Improving the error message given when usecols
doesn't match with the columns provided.

Signed-off-by: Aaron Critchley <aaron.critchley@gmail.com>
---
 pandas/io/parsers.py              | 12 ++++++++++--
 pandas/tests/io/parser/usecols.py |  2 +-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index a9821be3fa5e2..4187842a7c03b 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1662,14 +1662,22 @@ def __init__(self, src, **kwds):
             # GH 14671
             if (self.usecols_dtype == 'string' and
                     not set(usecols).issubset(self.orig_names)):
-                raise ValueError("Usecols do not match names.")
+                missing = [c for c in usecols if c not in self.orig_names]
+                raise ValueError(
+                    "Usecols do not match columns, "
+                    "columns expected but not found: %s" % missing
+                )
 
             if len(self.names) > len(usecols):
                 self.names = [n for i, n in enumerate(self.names)
                               if (i in usecols or n in usecols)]
 
             if len(self.names) < len(usecols):
-                raise ValueError("Usecols do not match names.")
+                missing = [c for c in usecols if c not in self.orig_names]
+                raise ValueError(
+                    "Usecols do not match columns, "
+                    "columns expected but not found: %s" % missing
+                )
 
         self._set_noconvert_columns()
 
diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py
index f582e5037ca07..4fe74aa700385 100644
--- a/pandas/tests/io/parser/usecols.py
+++ b/pandas/tests/io/parser/usecols.py
@@ -481,7 +481,7 @@ def test_raise_on_usecols_names_mismatch(self):
         data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'
 
         if self.engine == 'c':
-            msg = 'Usecols do not match names'
+            msg = "do not match columns, columns expected but not found"
         else:
             msg = 'is not in list'
 

From 15d47863503ff91e581c6948091a5b861375ad94 Mon Sep 17 00:00:00 2001
From: Aaron Critchley <aaron.critchley@gmail.com>
Date: Wed, 4 Oct 2017 20:07:43 +0100
Subject: [PATCH 02/10] WIP: Changing Python parser behaviour, better tests

---
 pandas/io/parsers.py              | 14 +++++++++++---
 pandas/tests/io/parser/usecols.py | 16 ++++++++--------
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 4187842a7c03b..e9b0aad81083b 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1665,7 +1665,7 @@ def __init__(self, src, **kwds):
                 missing = [c for c in usecols if c not in self.orig_names]
                 raise ValueError(
                     "Usecols do not match columns, "
-                    "columns expected but not found: %s" % missing
+                    "columns expected but not found: {}".format(missing)
                 )
 
             if len(self.names) > len(usecols):
@@ -1673,10 +1673,10 @@ def __init__(self, src, **kwds):
                               if (i in usecols or n in usecols)]
 
             if len(self.names) < len(usecols):
-                missing = [c for c in usecols if c not in self.orig_names]
+                missing = [c for c in usecols if c not in self.names]
                 raise ValueError(
                     "Usecols do not match columns, "
-                    "columns expected but not found: %s" % missing
+                    "columns expected but not found: {}".format(missing)
                 )
 
         self._set_noconvert_columns()
@@ -2450,6 +2450,14 @@ def _handle_usecols(self, columns, usecols_key):
                     raise ValueError("If using multiple headers, usecols must "
                                      "be integers.")
                 col_indices = []
+
+                missing = [c for c in self.usecols if c not in usecols_key]
+                if len(missing) > 0:
+                    raise ValueError(
+                        "Usecols do not match columns, "
+                        "columns expected but not found: {}".format(missing)
+                    )
+
                 for col in self.usecols:
                     if isinstance(col, string_types):
                         col_indices.append(usecols_key.index(col))
diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py
index 4fe74aa700385..c64e1c4aa8821 100644
--- a/pandas/tests/io/parser/usecols.py
+++ b/pandas/tests/io/parser/usecols.py
@@ -480,10 +480,10 @@ def test_raise_on_usecols_names_mismatch(self):
         # GH 14671
         data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'
 
-        if self.engine == 'c':
-            msg = "do not match columns, columns expected but not found"
-        else:
-            msg = 'is not in list'
+        msg = (
+            "Usecols do not match columns, "
+            "columns expected but not found: {}"
+        )
 
         usecols = ['a', 'b', 'c', 'd']
         df = self.read_csv(StringIO(data), usecols=usecols)
@@ -492,11 +492,11 @@ def test_raise_on_usecols_names_mismatch(self):
         tm.assert_frame_equal(df, expected)
 
         usecols = ['a', 'b', 'c', 'f']
-        with tm.assert_raises_regex(ValueError, msg):
+        with tm.assert_raises_regex(ValueError, msg.format(['f'])):
             self.read_csv(StringIO(data), usecols=usecols)
 
         usecols = ['a', 'b', 'f']
-        with tm.assert_raises_regex(ValueError, msg):
+        with tm.assert_raises_regex(ValueError, msg.format(['f'])):
             self.read_csv(StringIO(data), usecols=usecols)
 
         names = ['A', 'B', 'C', 'D']
@@ -520,9 +520,9 @@ def test_raise_on_usecols_names_mismatch(self):
         # tm.assert_frame_equal(df, expected)
 
         usecols = ['A', 'B', 'C', 'f']
-        with tm.assert_raises_regex(ValueError, msg):
+        with tm.assert_ra(ValueError, msg.format(['f'])):
             self.read_csv(StringIO(data), header=0, names=names,
                           usecols=usecols)
         usecols = ['A', 'B', 'f']
-        with tm.assert_raises_regex(ValueError, msg):
+        with tm.assert_raises_regex(ValueError, msg.format(['f'])):
             self.read_csv(StringIO(data), names=names, usecols=usecols)

From 841a6cc5d37300387c43e9649cc7669fbe990ec1 Mon Sep 17 00:00:00 2001
From: Aaron Critchley <aaron.critchley@gmail.com>
Date: Wed, 4 Oct 2017 20:38:41 +0100
Subject: [PATCH 03/10] WIP: Implementing suggested changes

---
 pandas/io/parsers.py              | 38 ++++++++++++++++---------------
 pandas/tests/io/parser/usecols.py | 10 ++++----
 2 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index e9b0aad81083b..90a04e18cd65a 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1065,6 +1065,20 @@ def _evaluate_usecols(usecols, names):
     return usecols
 
 
+def _validate_usecols(usecols, names):
+    """
+    Validates that all usecols are present in a given
+    list of names, if not, raise a ValueError that
+    shows what usecols are missing.
+    """
+    missing = [c for c in usecols if c not in names]
+    if len(missing) > 0:
+        raise ValueError(
+            "Usecols do not match columns, "
+            "columns expected but not found: {missing}".format(missing=missing)
+        )
+
+
 def _validate_skipfooter_arg(skipfooter):
     """
     Validate the 'skipfooter' parameter.
@@ -1662,22 +1676,14 @@ def __init__(self, src, **kwds):
             # GH 14671
             if (self.usecols_dtype == 'string' and
                     not set(usecols).issubset(self.orig_names)):
-                missing = [c for c in usecols if c not in self.orig_names]
-                raise ValueError(
-                    "Usecols do not match columns, "
-                    "columns expected but not found: {}".format(missing)
-                )
+                _validate_usecols(usecols, self.orig_names)
 
             if len(self.names) > len(usecols):
                 self.names = [n for i, n in enumerate(self.names)
                               if (i in usecols or n in usecols)]
 
             if len(self.names) < len(usecols):
-                missing = [c for c in usecols if c not in self.names]
-                raise ValueError(
-                    "Usecols do not match columns, "
-                    "columns expected but not found: {}".format(missing)
-                )
+                _validate_usecols(usecols, self.names)
 
         self._set_noconvert_columns()
 
@@ -2451,16 +2457,12 @@ def _handle_usecols(self, columns, usecols_key):
                                      "be integers.")
                 col_indices = []
 
-                missing = [c for c in self.usecols if c not in usecols_key]
-                if len(missing) > 0:
-                    raise ValueError(
-                        "Usecols do not match columns, "
-                        "columns expected but not found: {}".format(missing)
-                    )
-
                 for col in self.usecols:
                     if isinstance(col, string_types):
-                        col_indices.append(usecols_key.index(col))
+                        try:
+                            col_indices.append(usecols_key.index(col))
+                        except ValueError:
+                            _validate_usecols(self.usecols, usecols_key)
                     else:
                         col_indices.append(col)
             else:
diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py
index c64e1c4aa8821..88f268b11b6fd 100644
--- a/pandas/tests/io/parser/usecols.py
+++ b/pandas/tests/io/parser/usecols.py
@@ -482,7 +482,7 @@ def test_raise_on_usecols_names_mismatch(self):
 
         msg = (
             "Usecols do not match columns, "
-            "columns expected but not found: {}"
+            "columns expected but not found: {missing}"
         )
 
         usecols = ['a', 'b', 'c', 'd']
@@ -492,11 +492,11 @@ def test_raise_on_usecols_names_mismatch(self):
         tm.assert_frame_equal(df, expected)
 
         usecols = ['a', 'b', 'c', 'f']
-        with tm.assert_raises_regex(ValueError, msg.format(['f'])):
+        with tm.assert_raises_regex(ValueError, msg.format(missing=['f'])):
             self.read_csv(StringIO(data), usecols=usecols)
 
         usecols = ['a', 'b', 'f']
-        with tm.assert_raises_regex(ValueError, msg.format(['f'])):
+        with tm.assert_raises_regex(ValueError, msg.format(missing=['f'])):
             self.read_csv(StringIO(data), usecols=usecols)
 
         names = ['A', 'B', 'C', 'D']
@@ -520,9 +520,9 @@ def test_raise_on_usecols_names_mismatch(self):
         # tm.assert_frame_equal(df, expected)
 
         usecols = ['A', 'B', 'C', 'f']
-        with tm.assert_ra(ValueError, msg.format(['f'])):
+        with tm.assert_ra(ValueError, msg.format(missing=['f'])):
             self.read_csv(StringIO(data), header=0, names=names,
                           usecols=usecols)
         usecols = ['A', 'B', 'f']
-        with tm.assert_raises_regex(ValueError, msg.format(['f'])):
+        with tm.assert_raises_regex(ValueError, msg.format(missing=['f'])):
             self.read_csv(StringIO(data), names=names, usecols=usecols)

From dced1b7a19a24c53aae3ff51c0a80d0003e6dee9 Mon Sep 17 00:00:00 2001
From: Aaron Critchley <aaron.critchley@gmail.com>
Date: Wed, 4 Oct 2017 20:57:12 +0100
Subject: [PATCH 04/10] Fixing silly error in tests

---
 pandas/tests/io/parser/usecols.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py
index 88f268b11b6fd..210d40fd5c805 100644
--- a/pandas/tests/io/parser/usecols.py
+++ b/pandas/tests/io/parser/usecols.py
@@ -492,11 +492,11 @@ def test_raise_on_usecols_names_mismatch(self):
         tm.assert_frame_equal(df, expected)
 
         usecols = ['a', 'b', 'c', 'f']
-        with tm.assert_raises_regex(ValueError, msg.format(missing=['f'])):
+        with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
             self.read_csv(StringIO(data), usecols=usecols)
 
         usecols = ['a', 'b', 'f']
-        with tm.assert_raises_regex(ValueError, msg.format(missing=['f'])):
+        with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
             self.read_csv(StringIO(data), usecols=usecols)
 
         names = ['A', 'B', 'C', 'D']
@@ -520,9 +520,9 @@ def test_raise_on_usecols_names_mismatch(self):
         # tm.assert_frame_equal(df, expected)
 
         usecols = ['A', 'B', 'C', 'f']
-        with tm.assert_ra(ValueError, msg.format(missing=['f'])):
+        with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
             self.read_csv(StringIO(data), header=0, names=names,
                           usecols=usecols)
         usecols = ['A', 'B', 'f']
-        with tm.assert_raises_regex(ValueError, msg.format(missing=['f'])):
+        with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
             self.read_csv(StringIO(data), names=names, usecols=usecols)

From 5bf89a88d0166ad0ec59e93ac180e23bb24b92c7 Mon Sep 17 00:00:00 2001
From: Aaron Critchley <aaron.critchley@gmail.com>
Date: Thu, 5 Oct 2017 00:42:16 +0100
Subject: [PATCH 05/10] Better documentation as per @gfyoung

---
 pandas/io/parsers.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 90a04e18cd65a..491d5568b3412 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1068,8 +1068,24 @@ def _evaluate_usecols(usecols, names):
 def _validate_usecols(usecols, names):
     """
     Validates that all usecols are present in a given
-    list of names, if not, raise a ValueError that
+    list of names. If not, raise a ValueError that
     shows what usecols are missing.
+
+    Parameters
+    ----------
+    usecols : iterable of usecols
+        The columns to validate are present in names.
+    names : iterable of names
+        The column names to check against.
+
+    Returns
+    -------
+    usecols : iterable of usecols
+        The `usecols` parameter if the validation succeeds.
+
+    Raises
+    ------
+    ValueError : Detailing which usecols are missing, if any.
     """
     missing = [c for c in usecols if c not in names]
     if len(missing) > 0:
@@ -1078,6 +1094,8 @@ def _validate_usecols(usecols, names):
             "columns expected but not found: {missing}".format(missing=missing)
         )
 
+    return usecols
+
 
 def _validate_skipfooter_arg(skipfooter):
     """

From ba93833774fb24a049176d7e7ec93cfafb647b4a Mon Sep 17 00:00:00 2001
From: Aaron Critchley <aaron.critchley@gmail.com>
Date: Thu, 5 Oct 2017 01:05:01 +0100
Subject: [PATCH 06/10] Changing ValueError documentation

---
 pandas/io/parsers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 491d5568b3412..45385b70d8e1b 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1085,7 +1085,7 @@ def _validate_usecols(usecols, names):
 
     Raises
     ------
-    ValueError : Detailing which usecols are missing, if any.
+    ValueError : Columns were missing. Error message will list them.
     """
     missing = [c for c in usecols if c not in names]
     if len(missing) > 0:

From 8a06cee2401223b286ddcae5697f772a550b848f Mon Sep 17 00:00:00 2001
From: Aaron Critchley <aaron.critchley@gmail.com>
Date: Wed, 8 Nov 2017 23:14:00 +0000
Subject: [PATCH 07/10] Reverting, changing name of function

---
 pandas/io/parsers.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 45385b70d8e1b..fafacd77947e3 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1065,7 +1065,7 @@ def _evaluate_usecols(usecols, names):
     return usecols
 
 
-def _validate_usecols(usecols, names):
+def _validate_usecols_names(usecols, names):
     """
     Validates that all usecols are present in a given
     list of names. If not, raise a ValueError that
@@ -1694,14 +1694,14 @@ def __init__(self, src, **kwds):
             # GH 14671
             if (self.usecols_dtype == 'string' and
                     not set(usecols).issubset(self.orig_names)):
-                _validate_usecols(usecols, self.orig_names)
+                _validate_usecols_names(usecols, self.orig_names)
 
             if len(self.names) > len(usecols):
                 self.names = [n for i, n in enumerate(self.names)
                               if (i in usecols or n in usecols)]
 
             if len(self.names) < len(usecols):
-                _validate_usecols(usecols, self.names)
+                _validate_usecols_names(usecols, self.names)
 
         self._set_noconvert_columns()
 
@@ -2480,7 +2480,7 @@ def _handle_usecols(self, columns, usecols_key):
                         try:
                             col_indices.append(usecols_key.index(col))
                         except ValueError:
-                            _validate_usecols(self.usecols, usecols_key)
+                            _validate_usecols_names(self.usecols, usecols_key)
                     else:
                         col_indices.append(col)
             else:

From 2209eae98824a354ec81a900c278b05d120a685b Mon Sep 17 00:00:00 2001
From: Aaron Critchley <aaron.critchley@gmail.com>
Date: Sat, 2 Dec 2017 15:54:20 +0000
Subject: [PATCH 08/10] Adding whatsnews entry, and test for multiple missing
 columns

---
 doc/source/whatsnew/v0.22.0.txt   | 1 +
 pandas/tests/io/parser/usecols.py | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
index 1a08a1353a605..64f2701896a94 100644
--- a/doc/source/whatsnew/v0.22.0.txt
+++ b/doc/source/whatsnew/v0.22.0.txt
@@ -76,6 +76,7 @@ Other Enhancements
 - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`)
 - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`)
 - :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`)
+- Improved wording of ``ValueError`` raised when creating a DataFrame and the ``usecols`` argument cannot match all columns. (:issue:`17301`)
 
 .. _whatsnew_0220.api_breaking:
 
diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py
index 210d40fd5c805..4126e2eb93f28 100644
--- a/pandas/tests/io/parser/usecols.py
+++ b/pandas/tests/io/parser/usecols.py
@@ -495,8 +495,9 @@ def test_raise_on_usecols_names_mismatch(self):
         with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
             self.read_csv(StringIO(data), usecols=usecols)
 
-        usecols = ['a', 'b', 'f']
-        with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
+        usecols = ['a', 'b', 'f', 'g']
+        with tm.assert_raises_regex(
+                ValueError, msg.format(missing="\[('f', 'g'|'g', 'f')\]")):
             self.read_csv(StringIO(data), usecols=usecols)
 
         names = ['A', 'B', 'C', 'D']

From 93185f5f80842c4287b74e37d0e4f41e3f093901 Mon Sep 17 00:00:00 2001
From: Aaron Critchley <aaron.critchley@gmail.com>
Date: Sat, 2 Dec 2017 15:58:07 +0000
Subject: [PATCH 09/10] Adding accidentally removed test back

---
 pandas/tests/io/parser/usecols.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py
index 4126e2eb93f28..0fa53e6288bda 100644
--- a/pandas/tests/io/parser/usecols.py
+++ b/pandas/tests/io/parser/usecols.py
@@ -495,6 +495,10 @@ def test_raise_on_usecols_names_mismatch(self):
         with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
             self.read_csv(StringIO(data), usecols=usecols)
 
+        usecols = ['a', 'b', 'f']
+        with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
+            self.read_csv(StringIO(data), usecols=usecols)
+
         usecols = ['a', 'b', 'f', 'g']
         with tm.assert_raises_regex(
                 ValueError, msg.format(missing="\[('f', 'g'|'g', 'f')\]")):

From 5dfccdb6e617ec6596a1a9c34ff4e5d5b4e091cf Mon Sep 17 00:00:00 2001
From: Aaron Critchley <aaron.critchley@gmail.com>
Date: Sat, 2 Dec 2017 16:00:35 +0000
Subject: [PATCH 10/10] Suggested whatsnew change

---
 doc/source/whatsnew/v0.22.0.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
index 64f2701896a94..1cad0e68d0f25 100644
--- a/doc/source/whatsnew/v0.22.0.txt
+++ b/doc/source/whatsnew/v0.22.0.txt
@@ -76,7 +76,7 @@ Other Enhancements
 - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`)
 - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`)
 - :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`)
-- Improved wording of ``ValueError`` raised when creating a DataFrame and the ``usecols`` argument cannot match all columns. (:issue:`17301`)
+- Improved wording of ``ValueError`` raised in :func:`read_csv` when the ``usecols`` argument cannot match all columns. (:issue:`17301`)
 
 .. _whatsnew_0220.api_breaking: