Skip to content

Commit 5c1937b

Browse files
Chang She authored and wesm committed
ENH: inplace option to DataFrame.drop_duplicates #805 with vbench
1 parent 030ab4d commit 5c1937b

File tree

3 files changed

+72
-2
lines changed

3 files changed

+72
-2
lines changed

pandas/core/frame.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -2344,7 +2344,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None):
23442344
new_labels = labels[mask]
23452345
return self.reindex(**{axis_name: new_labels})
23462346

2347-
def drop_duplicates(self, cols=None, take_last=False):
2347+
def drop_duplicates(self, cols=None, take_last=False, inplace=False):
23482348
"""
23492349
Return DataFrame with duplicate rows removed, optionally only
23502350
considering certain columns
@@ -2358,13 +2358,23 @@ def drop_duplicates(self, cols=None, take_last=False):
23582358
Take the last observed row in a row. Defaults to the first row
23592359
skipna : boolean, default True
23602360
If True then keep NaN
2361+
inplace : boolean, default False
2362+
Whether to drop duplicates in place or to return a copy
23612363
23622364
Returns
23632365
-------
23642366
deduplicated : DataFrame
23652367
"""
2368+
23662369
duplicated = self.duplicated(cols, take_last=take_last)
2367-
return self[-duplicated]
2370+
2371+
if inplace:
2372+
inds, = (-duplicated).nonzero()
2373+
self._data = self._data.take(inds)
2374+
self._clear_item_cache()
2375+
return self
2376+
else:
2377+
return self[-duplicated]
23682378

23692379
def duplicated(self, cols=None, take_last=False):
23702380
"""

pandas/tests/test_frame.py

+50
Original file line numberDiff line numberDiff line change
@@ -3340,6 +3340,56 @@ def test_drop_duplicates_NA(self):
33403340
expected = df.ix[[1, 3, 6, 7]]
33413341
assert_frame_equal(result, expected)
33423342

3343+
def test_drop_duplicates_inplace(self):
    """
    drop_duplicates(..., inplace=True) must leave the calling frame
    holding exactly the rows that the equivalent selection on the
    untouched original frame yields.
    """
    data = {'A': ['foo', 'bar', 'foo', 'bar',
                  'foo', 'bar', 'bar', 'foo'],
            'B': ['one', 'one', 'two', 'two',
                  'two', 'two', 'one', 'two'],
            'C': [1, 1, 2, 2, 2, 2, 1, 2],
            'D': range(8)}
    orig = DataFrame(data)

    # single column, first occurrence kept (take_last left at its default)
    frame = orig.copy()
    frame.drop_duplicates('A', inplace=True)
    assert_frame_equal(frame, orig[:2])

    # single column, last occurrence kept
    frame = orig.copy()
    frame.drop_duplicates('A', take_last=True, inplace=True)
    assert_frame_equal(frame, orig.ix[[6, 7]])

    # multi column, first occurrence kept
    frame = orig.copy()
    frame.drop_duplicates(['A', 'B'], inplace=True)
    assert_frame_equal(frame, orig.ix[[0, 1, 2, 3]])

    # multi column, last occurrence kept
    frame = orig.copy()
    frame.drop_duplicates(['A', 'B'], take_last=True, inplace=True)
    assert_frame_equal(frame, orig.ix[[0, 5, 6, 7]])

    # consider everything: no column subset is passed
    abc_only = orig.ix[:, ['A', 'B', 'C']].copy()

    # In this fixture C is fully determined by B (1 <-> 'one', 2 <-> 'two'),
    # so de-duplicating on all of A, B, C matches de-duplicating on
    # ['A', 'B']; this shortcut holds for this data only.
    frame = abc_only.copy()
    frame.drop_duplicates(inplace=True)
    assert_frame_equal(frame, abc_only.drop_duplicates(['A', 'B']))

    frame = abc_only.copy()
    frame.drop_duplicates(take_last=True, inplace=True)
    assert_frame_equal(frame,
                       abc_only.drop_duplicates(['A', 'B'], take_last=True))
3392+
33433393
def test_drop_col_still_multiindex(self):
33443394
arrays = [[ 'a', 'b', 'c', 'top'],
33453395
[ '', '', '', 'OD' ],

vb_suite/reindex.py

+10
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,11 @@ def backfill():
136136
name='frame_drop_duplicates',
137137
start_date=datetime(2011, 11, 15))
138138

139+
statement = "df.drop_duplicates(['key1', 'key2'], inplace=True)"
140+
frame_drop_dup_inplace = Benchmark(statement, setup,
141+
name='frame_drop_dup_inplace',
142+
start_date=datetime(2012, 5, 16))
143+
139144
lib_fast_zip = Benchmark('lib.fast_zip(df.values.T)', setup,
140145
name='lib_fast_zip',
141146
start_date=datetime(2012, 1, 1))
@@ -152,6 +157,11 @@ def backfill():
152157
name='lib_fast_zip_fillna',
153158
start_date=datetime(2012, 5, 15))
154159

160+
statement2 = "df.drop_duplicates(['key1', 'key2'], inplace=True)"
161+
frame_drop_dup_na_inplace = Benchmark(statement2, setup,
162+
name='frame_drop_dup_na_inplace',
163+
start_date=datetime(2012, 5, 16))
164+
155165
#----------------------------------------------------------------------
156166
# fillna, many columns
157167

0 commit comments

Comments
 (0)