From d6ae52a523902271ecedb8d141208b06f0fb52b9 Mon Sep 17 00:00:00 2001 From: Evan Wright Date: Fri, 23 Oct 2015 16:11:27 -0400 Subject: [PATCH 1/2] BUG: drop_duplicates drops non-duplicate rows in the presence of integer columns (GH 11376) --- doc/source/whatsnew/v0.17.1.txt | 2 +- pandas/tests/test_frame.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt index 2bb9920b6f177..70226ca302a60 100755 --- a/doc/source/whatsnew/v0.17.1.txt +++ b/doc/source/whatsnew/v0.17.1.txt @@ -92,7 +92,7 @@ Bug Fixes - Bug in ``pivot_table`` with ``margins=True`` when indexes are of ``Categorical`` dtype (:issue:`10993`) - Bug in ``DataFrame.plot`` cannot use hex strings colors (:issue:`10299`) - +- Bug in ``DataFrame.drop_duplicates`` (regression from 0.16.2) causing some non-duplicate rows containing integer values to be dropped (:issue:`11376`) - Bug in ``pd.eval`` where unary ops in a list error (:issue:`11235`) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 5c7f1ec9e0037..dfbd21997568d 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -8380,6 +8380,25 @@ def test_drop_duplicates(self): expected = df.iloc[[-2,-1]] assert_frame_equal(result, expected) + # GH 11376 + df = pd.DataFrame({'x': [7, 6, 3, 3, 4, 8, 0], + 'y': [0, 6, 5, 5, 9, 1, 2]}) + expected = df.loc[df.index != 3] + assert_frame_equal(df.drop_duplicates(), expected) + + df = pd.DataFrame([[1 , 0], [0, 2]]) + assert_frame_equal(df.drop_duplicates(), df) + + df = pd.DataFrame([[-2, 0], [0, -4]]) + assert_frame_equal(df.drop_duplicates(), df) + + x = np.iinfo(np.int64).max / 3 * 2 + df = pd.DataFrame([[-x, x], [0, x + 4]]) + assert_frame_equal(df.drop_duplicates(), df) + + df = pd.DataFrame([[-x, x], [x, x + 4]]) + assert_frame_equal(df.drop_duplicates(), df) + def test_drop_duplicates_for_take_all(self): df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar', 'foo', 'bar', 'qux', 'foo'], From b7107283df30a7c45dbc30347d06c3bbda7f05f3 Mon Sep 17 00:00:00 2001 From: Evan Wright Date: Fri, 23 Oct 2015 16:42:28 -0400 Subject: [PATCH 2/2] Revert "PERF: perf improvements in drop_duplicates for integer dtyped arrays" This reverts commit a00c7ea1e2b6be5754a0461915cc48b007771b34, but leaves new tests and benchmark --- pandas/core/frame.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 31b7aacefcb60..4774fc4f17a91 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2994,13 +2994,7 @@ def duplicated(self, subset=None, keep='first'): from pandas.hashtable import duplicated_int64, _SIZE_HINT_LIMIT def f(vals): - - # if we have integers we can directly index with these - if com.is_integer_dtype(vals): - from pandas.core.nanops import unique1d - labels, shape = vals, unique1d(vals) - else: - labels, shape = factorize(vals, size_hint=min(len(self), _SIZE_HINT_LIMIT)) + labels, shape = factorize(vals, size_hint=min(len(self), _SIZE_HINT_LIMIT)) return labels.astype('i8',copy=False), len(shape) if subset is None: