Merge pull request #11403 from evanpw/drop_dup_integers

jreback · jreback · commit 8a04d63ebd15 · 2015-10-23T20:18:23.000-04:00
BUG: drop_duplicates drops non-duplicate rows in the presence of integer columns
diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt
@@ -92,7 +92,7 @@ Bug Fixes
 - Bug in ``pivot_table`` with ``margins=True`` when indexes are of ``Categorical`` dtype (:issue:`10993`)
 - Bug in ``DataFrame.plot`` cannot use hex strings colors (:issue:`10299`)
 
-
+- Bug in ``DataFrame.drop_duplicates`` (regression from 0.16.2) causing some non-duplicate rows containing integer values to be dropped (:issue:`11376`)
 
 
 - Bug in ``pd.eval`` where unary ops in a list error (:issue:`11235`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2994,13 +2994,7 @@ def duplicated(self, subset=None, keep='first'):
         from pandas.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
 
         def f(vals):
-
-            # if we have integers we can directly index with these
-            if com.is_integer_dtype(vals):
-                from pandas.core.nanops import unique1d
-                labels, shape = vals, unique1d(vals)
-            else:
-                labels, shape = factorize(vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
+            labels, shape = factorize(vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
             return labels.astype('i8',copy=False), len(shape)
 
         if subset is None:
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -8380,6 +8380,25 @@ def test_drop_duplicates(self):
         expected = df.iloc[[-2,-1]]
         assert_frame_equal(result, expected)
 
+        # GH 11376
+        df = pd.DataFrame({'x': [7, 6, 3, 3, 4, 8, 0],
+                           'y': [0, 6, 5, 5, 9, 1, 2]})
+        expected = df.loc[df.index != 3]
+        assert_frame_equal(df.drop_duplicates(), expected)
+
+        df = pd.DataFrame([[1 , 0], [0, 2]])
+        assert_frame_equal(df.drop_duplicates(), df)
+
+        df = pd.DataFrame([[-2, 0], [0, -4]])
+        assert_frame_equal(df.drop_duplicates(), df)
+
+        x = np.iinfo(np.int64).max / 3 * 2
+        df = pd.DataFrame([[-x, x], [0, x + 4]])
+        assert_frame_equal(df.drop_duplicates(), df)
+
+        df = pd.DataFrame([[-x, x], [x, x + 4]])
+        assert_frame_equal(df.drop_duplicates(), df)
+
     def test_drop_duplicates_for_take_all(self):
         df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar',
                                 'foo', 'bar', 'qux', 'foo'],