ENH: Better error message if usecols doesn't match columns (#17310)

AaronCritchley · jreback · commit 7a3f81a34507 · 2017-12-03T10:26:50.000-05:00
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
@@ -76,6 +76,7 @@ Other Enhancements
 - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`)
 - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`)
 - :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`)
+- Improved wording of ``ValueError`` raised in :func:`read_csv` when the ``usecols`` argument cannot match all columns. (:issue:`17301`)
 
 .. _whatsnew_0220.api_breaking:
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1141,6 +1141,38 @@ def _evaluate_usecols(usecols, names):
     return usecols
 
 
+def _validate_usecols_names(usecols, names):
+    """
+    Validates that all usecols are present in a given
+    list of names. If not, raise a ValueError that
+    shows what usecols are missing.
+
+    Parameters
+    ----------
+    usecols : iterable of usecols
+        The columns to validate are present in names.
+    names : iterable of names
+        The column names to check against.
+
+    Returns
+    -------
+    usecols : iterable of usecols
+        The `usecols` parameter if the validation succeeds.
+
+    Raises
+    ------
+    ValueError : Columns were missing. Error message will list them.
+    """
+    missing = [c for c in usecols if c not in names]
+    if len(missing) > 0:
+        raise ValueError(
+            "Usecols do not match columns, "
+            "columns expected but not found: {missing}".format(missing=missing)
+        )
+
+    return usecols
+
+
 def _validate_skipfooter_arg(skipfooter):
     """
     Validate the 'skipfooter' parameter.
@@ -1753,14 +1785,14 @@ def __init__(self, src, **kwds):
             # GH 14671
             if (self.usecols_dtype == 'string' and
                     not set(usecols).issubset(self.orig_names)):
-                raise ValueError("Usecols do not match names.")
+                _validate_usecols_names(usecols, self.orig_names)
 
             if len(self.names) > len(usecols):
                 self.names = [n for i, n in enumerate(self.names)
                               if (i in usecols or n in usecols)]
 
             if len(self.names) < len(usecols):
-                raise ValueError("Usecols do not match names.")
+                _validate_usecols_names(usecols, self.names)
 
         self._set_noconvert_columns()
 
@@ -2532,9 +2564,13 @@ def _handle_usecols(self, columns, usecols_key):
                     raise ValueError("If using multiple headers, usecols must "
                                      "be integers.")
                 col_indices = []
+
                 for col in self.usecols:
                     if isinstance(col, string_types):
-                        col_indices.append(usecols_key.index(col))
+                        try:
+                            col_indices.append(usecols_key.index(col))
+                        except ValueError:
+                            _validate_usecols_names(self.usecols, usecols_key)
                     else:
                         col_indices.append(col)
             else:
diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py
@@ -480,10 +480,10 @@ def test_raise_on_usecols_names_mismatch(self):
         # GH 14671
         data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'
 
-        if self.engine == 'c':
-            msg = 'Usecols do not match names'
-        else:
-            msg = 'is not in list'
+        msg = (
+            "Usecols do not match columns, "
+            "columns expected but not found: {missing}"
+        )
 
         usecols = ['a', 'b', 'c', 'd']
         df = self.read_csv(StringIO(data), usecols=usecols)
@@ -492,11 +492,16 @@ def test_raise_on_usecols_names_mismatch(self):
         tm.assert_frame_equal(df, expected)
 
         usecols = ['a', 'b', 'c', 'f']
-        with tm.assert_raises_regex(ValueError, msg):
+        with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
             self.read_csv(StringIO(data), usecols=usecols)
 
         usecols = ['a', 'b', 'f']
-        with tm.assert_raises_regex(ValueError, msg):
+        with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
+            self.read_csv(StringIO(data), usecols=usecols)
+
+        usecols = ['a', 'b', 'f', 'g']
+        with tm.assert_raises_regex(
+                ValueError, msg.format(missing="\[('f', 'g'|'g', 'f')\]")):
             self.read_csv(StringIO(data), usecols=usecols)
 
         names = ['A', 'B', 'C', 'D']
@@ -520,9 +525,9 @@ def test_raise_on_usecols_names_mismatch(self):
         # tm.assert_frame_equal(df, expected)
 
         usecols = ['A', 'B', 'C', 'f']
-        with tm.assert_raises_regex(ValueError, msg):
+        with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
             self.read_csv(StringIO(data), header=0, names=names,
                           usecols=usecols)
         usecols = ['A', 'B', 'f']
-        with tm.assert_raises_regex(ValueError, msg):
+        with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
             self.read_csv(StringIO(data), names=names, usecols=usecols)