Skip to content

Commit 6d324db

Browse files
Chang She authored and wesm committed
BUG: DataFrame.drop_duplicates with NA values
1 parent 9746f16 commit 6d324db

File tree

2 files changed

+66
-0
lines changed

2 files changed

+66
-0
lines changed

pandas/src/groupby.pyx

+13
Original file line number | Diff line number | Diff line change
@@ -1306,6 +1306,7 @@ def duplicated(list values, take_last=False):
1306 1306       cdef:
1307 1307           Py_ssize_t i, n
1308 1308           dict seen = {}
     1309 +         int has_nan = 0
1309 1310           object row
1310 1311
1311 1312       n = len(values)
@@ -1316,6 +1317,12 @@ def duplicated(list values, take_last=False):
1316 1317   row = values[i]
1317 1318   if row in seen:
1318 1319       result[i] = 1
     1320 + elif row != row:
     1321 +     if has_nan == 1:
     1322 +         result[i] = 1
     1323 +     else:
     1324 +         has_nan = 1
     1325 +         result[i] = 0
1319 1326   else:
1320 1327       seen[row] = None
1321 1328       result[i] = 0
@@ -1324,6 +1331,12 @@ def duplicated(list values, take_last=False):
1324 1331   row = values[i]
1325 1332   if row in seen:
1326 1333       result[i] = 1
     1334 + elif row != row:
     1335 +     if has_nan == 1:
     1336 +         result[i] = 1
     1337 +     else:
     1338 +         has_nan = 1
     1339 +         result[i] = 0
1327 1340   else:
1328 1341       seen[row] = None
1329 1342       result[i] = 0

pandas/tests/test_frame.py

+53
Original file line number | Diff line number | Diff line change
@@ -3286,6 +3286,59 @@ def test_drop_duplicates(self):
3286 3286           expected = df2.drop_duplicates(['A', 'B'], take_last=True)
3287 3287           assert_frame_equal(result, expected)
3288 3288
     3289 +     def test_drop_duplicates_NA(self):
     3290 +         # none
     3291 +         df = DataFrame({'A' : [None, None, 'foo', 'bar',
     3292 +                                'foo', 'bar', 'bar', 'foo'],
     3293 +                         'B' : ['one', 'one', 'two', 'two',
     3294 +                                'two', 'two', 'one', 'two'],
     3295 +                         'C' : [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
     3296 +                         'D' : range(8)})
     3297 +
     3298 +         # single column
     3299 +         result = df.drop_duplicates('A')
     3300 +         expected = df.ix[[0, 2, 3]]
     3301 +         assert_frame_equal(result, expected)
     3302 +
     3303 +         result = df.drop_duplicates('A', take_last=True)
     3304 +         expected = df.ix[[1, 6, 7]]
     3305 +         assert_frame_equal(result, expected)
     3306 +
     3307 +         # multi column
     3308 +         result = df.drop_duplicates(['A', 'B'])
     3309 +         expected = df.ix[[0, 2, 3, 6]]
     3310 +         assert_frame_equal(result, expected)
     3311 +
     3312 +         result = df.drop_duplicates(['A', 'B'], take_last=True)
     3313 +         expected = df.ix[[1, 5, 6, 7]]
     3314 +         assert_frame_equal(result, expected)
     3315 +
     3316 +         # nan
     3317 +         df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
     3318 +                                'foo', 'bar', 'bar', 'foo'],
     3319 +                         'B' : ['one', 'one', 'two', 'two',
     3320 +                                'two', 'two', 'one', 'two'],
     3321 +                         'C' : [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
     3322 +                         'D' : range(8)})
     3323 +
     3324 +         # single column
     3325 +         result = df.drop_duplicates('C')
     3326 +         expected = df[:2]
     3327 +         assert_frame_equal(result, expected)
     3328 +
     3329 +         result = df.drop_duplicates('C', take_last=True)
     3330 +         expected = df.ix[[3, 7]]
     3331 +         assert_frame_equal(result, expected)
     3332 +
     3333 +         # multi column
     3334 +         result = df.drop_duplicates(['C', 'B'])
     3335 +         expected = df.ix[[0, 1, 2, 4]]
     3336 +         assert_frame_equal(result, expected)
     3337 +
     3338 +         result = df.drop_duplicates(['C', 'B'], take_last=True)
     3339 +         expected = df.ix[[1, 3, 6, 7]]
     3340 +         assert_frame_equal(result, expected)
     3341 +
3289 3342       def test_drop_col_still_multiindex(self):
3290 3343           arrays = [[ 'a', 'b', 'c', 'top'],
3291 3344                     [ '', '', '', 'OD' ],

0 commit comments

Comments (0)