Skip to content

Commit 8a04d63

Browse files
committed
Merge pull request #11403 from evanpw/drop_dup_integers
BUG: drop_duplicates drops non-duplicate rows in the presence of integer columns
2 parents 22d75a3 + b710728 commit 8a04d63

File tree

3 files changed

+21
-8
lines changed

3 files changed

+21
-8
lines changed

doc/source/whatsnew/v0.17.1.txt

+1 -1
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ Bug Fixes
9292
- Bug in ``pivot_table`` with ``margins=True`` when indexes are of ``Categorical`` dtype (:issue:`10993`)
9393
- Bug in ``DataFrame.plot`` where hex string colors cannot be used (:issue:`10299`)
9494

95-
95+
- Bug in ``DataFrame.drop_duplicates`` (regression from 0.16.2) causing some non-duplicate rows containing integer values to be dropped (:issue:`11376`)
9696

9797

9898
- Bug in ``pd.eval`` where unary ops in a list raise an error (:issue:`11235`)

pandas/core/frame.py

+1 -7
Original file line numberDiff line numberDiff line change
@@ -2994,13 +2994,7 @@ def duplicated(self, subset=None, keep='first'):
29942994
from pandas.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
29952995

29962996
def f(vals):
2997-
2998-
# if we have integers we can directly index with these
2999-
if com.is_integer_dtype(vals):
3000-
from pandas.core.nanops import unique1d
3001-
labels, shape = vals, unique1d(vals)
3002-
else:
3003-
labels, shape = factorize(vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
2997+
labels, shape = factorize(vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
30042998
return labels.astype('i8',copy=False), len(shape)
30052999

30063000
if subset is None:

pandas/tests/test_frame.py

+19
Original file line numberDiff line numberDiff line change
@@ -8380,6 +8380,25 @@ def test_drop_duplicates(self):
83808380
expected = df.iloc[[-2,-1]]
83818381
assert_frame_equal(result, expected)
83828382

8383+
# GH 11376
8384+
df = pd.DataFrame({'x': [7, 6, 3, 3, 4, 8, 0],
8385+
'y': [0, 6, 5, 5, 9, 1, 2]})
8386+
expected = df.loc[df.index != 3]
8387+
assert_frame_equal(df.drop_duplicates(), expected)
8388+
8389+
df = pd.DataFrame([[1 , 0], [0, 2]])
8390+
assert_frame_equal(df.drop_duplicates(), df)
8391+
8392+
df = pd.DataFrame([[-2, 0], [0, -4]])
8393+
assert_frame_equal(df.drop_duplicates(), df)
8394+
8395+
x = np.iinfo(np.int64).max / 3 * 2
8396+
df = pd.DataFrame([[-x, x], [0, x + 4]])
8397+
assert_frame_equal(df.drop_duplicates(), df)
8398+
8399+
df = pd.DataFrame([[-x, x], [x, x + 4]])
8400+
assert_frame_equal(df.drop_duplicates(), df)
8401+
83838402
def test_drop_duplicates_for_take_all(self):
83848403
df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar',
83858404
'foo', 'bar', 'qux', 'foo'],

0 commit comments

Comments
 (0)