ERR: Raise error in usecols when column doesn't exist but length matches (pandas-dev#16460)

bpraggastis · TomAugspurger · commit 28b1c8bb15de · 2017-06-04T05:57:21.000-05:00
* pandas-dev#14671 Check that string-typed usecols is a subset of names, and raise an error if it is not * Add tests for pandas-dev#14671; the expected behavior when usecols and names are used together is unclear, so those tests are commented out * Address review comments (cherry picked from commit 50a62c1)
diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
@@ -71,6 +71,7 @@ I/O
 ^^^
 
 - Bug in :func:`read_csv` when ``comment`` is passed in a space delimited text file (:issue:`16472`)
+- Bug in :func:`read_csv` not raising an exception with nonexistent columns in ``usecols`` when it had the correct length (:issue:`14671`)
 - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`)
 - Bug that raised ``IndexError`` when HTML-rendering an empty ``DataFrame`` (:issue:`15953`)
 - Bug in :func:`read_csv` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1643,6 +1643,12 @@ def __init__(self, src, **kwds):
 
         if self.usecols:
             usecols = _evaluate_usecols(self.usecols, self.orig_names)
+
+            # GH 14671
+            if (self.usecols_dtype == 'string' and
+                    not set(usecols).issubset(self.orig_names)):
+                raise ValueError("Usecols do not match names.")
+
             if len(self.names) > len(usecols):
                 self.names = [n for i, n in enumerate(self.names)
                               if (i in usecols or n in usecols)]
diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py
@@ -475,3 +475,54 @@ def test_uneven_length_cols(self):
                               'C': [3, 5, 4, 3, 3, 7]})
         df = self.read_csv(StringIO(data), usecols=usecols)
         tm.assert_frame_equal(df, expected)
+
+    def test_raise_on_usecols_names_mismatch(self):
+        # GH 14671
+        data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'
+
+        if self.engine == 'c':
+            msg = 'Usecols do not match names'
+        else:
+            msg = 'is not in list'
+
+        usecols = ['a', 'b', 'c', 'd']
+        df = self.read_csv(StringIO(data), usecols=usecols)
+        expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7],
+                              'd': [4, 8]})
+        tm.assert_frame_equal(df, expected)
+
+        usecols = ['a', 'b', 'c', 'f']
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(data), usecols=usecols)
+
+        usecols = ['a', 'b', 'f']
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(data), usecols=usecols)
+
+        names = ['A', 'B', 'C', 'D']
+
+        df = self.read_csv(StringIO(data), header=0, names=names)
+        expected = DataFrame({'A': [1, 5], 'B': [2, 6], 'C': [3, 7],
+                              'D': [4, 8]})
+        tm.assert_frame_equal(df, expected)
+
+        # TODO: https://github.com/pandas-dev/pandas/issues/16469
+        # usecols = ['A','C']
+        # df = self.read_csv(StringIO(data), header=0, names=names,
+        #                    usecols=usecols)
+        # expected = DataFrame({'A': [1,5], 'C': [3,7]})
+        # tm.assert_frame_equal(df, expected)
+        #
+        # usecols = [0,2]
+        # df = self.read_csv(StringIO(data), header=0, names=names,
+        #                    usecols=usecols)
+        # expected = DataFrame({'A': [1,5], 'C': [3,7]})
+        # tm.assert_frame_equal(df, expected)
+
+        usecols = ['A', 'B', 'C', 'f']
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(data), header=0, names=names,
+                          usecols=usecols)
+        usecols = ['A', 'B', 'f']
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(data), names=names, usecols=usecols)