gh-14671: Check that usecols given as strings is a subset of the column names; raise an error if it is not

brendapraggastis · brendapraggastis · commit 5e4966c1620a · 2017-05-22T22:29:49.000-07:00
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1620,6 +1620,12 @@ def __init__(self, src, **kwds):
 
         if self.usecols:
             usecols = _evaluate_usecols(self.usecols, self.orig_names)
+
+            #gh-14671
+            if  (self.usecols_dtype == 'string') and \
+                (not set(usecols).issubset(self.orig_names)):
+               raise ValueError("Usecols do not match names.")
+
             if len(self.names) > len(usecols):
                 self.names = [n for i, n in enumerate(self.names)
                               if (i in usecols or n in usecols)]
diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py
@@ -9,6 +9,7 @@
 
 import numpy as np
 import pandas.util.testing as tm
+import re
 
 from pandas import DataFrame, Index
 from pandas._libs.lib import Timestamp
@@ -475,3 +476,20 @@ def test_uneven_length_cols(self):
                               'C': [3, 5, 4, 3, 3, 7]})
         df = self.read_csv(StringIO(data), usecols=usecols)
         tm.assert_frame_equal(df, expected)
+
+    def test_raise_on_usecols_names_mismatch(self):
+        # see gh-14671: usecols naming a column absent from the data must raise
+        data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'
+        usecols = ['a', 'b', 'c', 'd']
+        df = self.read_csv(StringIO(data), usecols=usecols)
+        expected = DataFrame({'a': [1, 5], 'b': [2, 6],
+                              'c': [3, 7], 'd': [4, 8]})
+        tm.assert_frame_equal(df, expected)
+
+        # First alternative comes from parsers.py CParserWrapper(), the second
+        # from _handle_usecols(); a single '|' avoids an always-matching regex.
+        msg = re.compile('Usecols do not match names|is not in list')
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(data), usecols=['a', 'b', 'c', 'f'])
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(data), usecols=['a', 'b', 'f'])