BUG: drop_duplicates not raising KeyError on missing key (pandas-dev#19730)

NoahTheDuke · harisbal · commit 440fc8d4a9fe · 2018-02-28T00:04:27.000Z
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -795,6 +795,8 @@ Indexing
 - Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`)
 - Bug in :func:`IntervalIndex.symmetric_difference` where the symmetric difference with a non-``IntervalIndex`` did not raise (:issue:`18475`)
 - Bug in :class:`IntervalIndex` where set operations that returned an empty ``IntervalIndex`` had the wrong dtype (:issue:`19101`)
+- Bug in :meth:`DataFrame.drop_duplicates` where no ``KeyError`` is raised when passing in columns that don't exist on the ``DataFrame`` (issue:`19726`)
+
 
 MultiIndex
 ^^^^^^^^^^
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3655,6 +3655,13 @@ def f(vals):
               isinstance(subset, tuple) and subset in self.columns):
             subset = subset,
 
+        # Verify all columns in subset exist in the queried dataframe
+        # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
+        # key that doesn't exist.
+        diff = Index(subset).difference(self.columns)
+        if not diff.empty:
+            raise KeyError(diff)
+
         vals = (col.values for name, col in self.iteritems()
                 if name in subset)
         labels, shape = map(list, zip(*map(f, vals)))
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -1492,6 +1492,19 @@ def test_drop_duplicates(self):
         for keep in ['first', 'last', False]:
             assert df.duplicated(keep=keep).sum() == 0
 
+    @pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
+    def test_duplicated_with_misspelled_column_name(self, subset):
+        # GH 19730
+        df = pd.DataFrame({'A': [0, 0, 1],
+                           'B': [0, 0, 1],
+                           'C': [0, 0, 1]})
+
+        with pytest.raises(KeyError):
+            df.duplicated(subset)
+
+        with pytest.raises(KeyError):
+            df.drop_duplicates(subset)
+
     def test_drop_duplicates_with_duplicate_column_names(self):
         # GH17836
         df = DataFrame([