DataFrame.drop_duplicates #1244

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed · wants to merge 5 commits
26 changes: 19 additions & 7 deletions pandas/core/frame.py
@@ -2337,7 +2337,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None):
new_labels = labels[mask]
return self.reindex(**{axis_name: new_labels})

def drop_duplicates(self, cols=None, take_last=False):
def drop_duplicates(self, cols=None, take_last=False, inplace=False):
"""
Return DataFrame with duplicate rows removed, optionally only
considering certain columns
@@ -2349,13 +2349,25 @@ def drop_duplicates(self, cols=None, take_last=False):
default use all of the columns
take_last : boolean, default False
Take the last observed row in each set of duplicated rows. Defaults to the first row
inplace : boolean, default False
Whether to drop duplicates in place or to return a copy

Returns
-------
deduplicated : DataFrame
"""

duplicated = self.duplicated(cols, take_last=take_last)
return self[-duplicated]

if inplace:
inds, = (-duplicated).nonzero()
self._data = self._data.take(inds)
self._clear_item_cache()
return self
else:
return self[-duplicated]

def duplicated(self, cols=None, take_last=False):
"""
@@ -2376,11 +2388,13 @@ def duplicated(self, cols=None, take_last=False):
"""
if cols is not None:
if isinstance(cols, list):
keys = zip(*[self[x] for x in cols])
values = [self[x].values for x in cols]
keys = lib.fast_zip_fillna(values)
else:
keys = list(self[cols])
keys = lib.fast_zip_fillna([self[cols]])
else:
keys = zip(*self.values.T)
values = list(self.values.T)
keys = lib.fast_zip_fillna(values)

duplicated = lib.duplicated(keys, take_last=take_last)
return Series(duplicated, index=self.index)
@@ -4374,7 +4388,6 @@ def _homogenize(data, index, columns, dtype=None):
def _put_str(s, space):
return ('%s' % s)[:space].ljust(space)


def _is_sequence(x):
try:
iter(x)
Expand All @@ -4383,7 +4396,6 @@ def _is_sequence(x):
except Exception:
return False


def install_ipython_completers(): # pragma: no cover
"""Register the DataFrame type with IPython's tab completion machinery, so
that it knows about accessing column names as attributes."""
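For orientation, a minimal usage sketch of the API these frame.py changes expose. The frame contents are invented for illustration; only the cols, take_last, and inplace arguments and the NaN-as-duplicate behaviour come from the diff and its tests.

import numpy as np
from pandas import DataFrame

df = DataFrame({'A': ['foo', 'foo', 'bar', np.nan, np.nan],
                'B': [1, 1, 2, 3, 3]})

# Default: return a de-duplicated copy and leave df untouched. With the
# NaN-aware key building, the two NaN rows in 'A' count as duplicates of
# each other, so rows 1 and 4 are dropped here.
deduped = df.drop_duplicates(['A', 'B'])

# New in this patch: drop duplicates in place by re-taking the frame's
# internal blocks instead of constructing a new frame.
df.drop_duplicates(['A', 'B'], take_last=True, inplace=True)

# duplicated() still returns the underlying boolean mask, aligned on the index.
mask = df.duplicated(['A', 'B'])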
65 changes: 63 additions & 2 deletions pandas/src/groupby.pyx
@@ -1301,8 +1301,69 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,

return counts

cdef class _PandasNull:

def __richcmp__(_PandasNull self, object other, int op):
if op == 2: # ==
return isinstance(other, _PandasNull)
elif op == 3: # !=
return not isinstance(other, _PandasNull)
else:
return False

def __hash__(self):
return 0

pandas_null = _PandasNull()

def fast_zip_fillna(list ndarrays, fill_value=pandas_null):
'''
For zipping multiple ndarrays into an ndarray of tuples, replacing any NaN encountered with fill_value
'''
cdef:
Py_ssize_t i, j, k, n
ndarray[object] result
flatiter it
object val, tup

k = len(ndarrays)
n = len(ndarrays[0])

result = np.empty(n, dtype=object)

# initialize tuples on first pass
arr = ndarrays[0]
it = <flatiter> PyArray_IterNew(arr)
for i in range(n):
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
tup = PyTuple_New(k)

if val != val:
val = fill_value

PyTuple_SET_ITEM(tup, 0, val)
Py_INCREF(val)
result[i] = tup
PyArray_ITER_NEXT(it)

for j in range(1, k):
arr = ndarrays[j]
it = <flatiter> PyArray_IterNew(arr)
if len(arr) != n:
raise ValueError('all arrays must be same length')

for i in range(n):
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
if val != val:
val = fill_value

PyTuple_SET_ITEM(result[i], j, val)
Py_INCREF(val)
PyArray_ITER_NEXT(it)

return result

def duplicated(list values, take_last=False):
def duplicated(ndarray[object] values, take_last=False):
cdef:
Py_ssize_t i, n
dict seen = {}
@@ -1314,6 +1375,7 @@ def duplicated(list values, take_last=False):
if take_last:
for i from n > i >= 0:
row = values[i]

if row in seen:
result[i] = 1
else:
@@ -1330,7 +1392,6 @@

return result.view(np.bool_)


def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
cdef:
Py_ssize_t i, group_size, n, lab, start
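To make the new Cython helpers concrete, here is a rough pure-Python model. It is a sketch only: _NullSentinel and fast_zip_fillna_py are invented names standing in for the _PandasNull class and fast_zip_fillna function above. The point is that NaN fails val == val, so every NaN key would otherwise hash as distinct; swapping it for a shared sentinel that compares and hashes equal lets duplicated() treat rows with missing values as equal keys.

import numpy as np

class _NullSentinel(object):
    # Stand-in for the Cython _PandasNull: every instance compares equal
    # to every other instance and they all share a single hash value.
    def __eq__(self, other):
        return isinstance(other, _NullSentinel)

    def __ne__(self, other):
        return not isinstance(other, _NullSentinel)

    def __hash__(self):
        return 0

_null = _NullSentinel()

def fast_zip_fillna_py(ndarrays, fill_value=_null):
    # Zip the column arrays into a 1-D object ndarray of row tuples,
    # replacing any value that fails val == val (i.e. NaN) with the sentinel.
    n = len(ndarrays[0])
    if any(len(arr) != n for arr in ndarrays):
        raise ValueError('all arrays must be same length')
    result = np.empty(n, dtype=object)
    for i in range(n):
        result[i] = tuple(fill_value if val != val else val
                          for val in (arr[i] for arr in ndarrays))
    return result

DataFrame.duplicated hands the resulting object array to lib.duplicated, which scans it once with a dict of seen keys exactly as before; the only difference is that NaN-bearing rows now collide as intended.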
1 change: 0 additions & 1 deletion pandas/src/tseries.pyx
@@ -379,7 +379,6 @@ def fast_zip(list ndarrays):

return result


def get_reverse_indexer(ndarray[int64_t] indexer, Py_ssize_t length):
cdef:
Py_ssize_t i, n = len(indexer)
104 changes: 104 additions & 0 deletions pandas/tests/test_frame.py
@@ -1,3 +1,4 @@

# pylint: disable-msg=W0612,E1101
from copy import deepcopy
from datetime import datetime, timedelta
@@ -3214,6 +3215,109 @@ def test_drop_duplicates(self):
expected = df2.drop_duplicates(['A', 'B'], take_last=True)
assert_frame_equal(result, expected)

def test_drop_duplicates_NA(self):
# none
df = DataFrame({'A' : [None, None, 'foo', 'bar',
'foo', 'bar', 'bar', 'foo'],
'B' : ['one', 'one', 'two', 'two',
'two', 'two', 'one', 'two'],
'C' : [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
'D' : range(8)})

# single column
result = df.drop_duplicates('A')
expected = df.ix[[0, 2, 3]]
assert_frame_equal(result, expected)

result = df.drop_duplicates('A', take_last=True)
expected = df.ix[[1, 6, 7]]
assert_frame_equal(result, expected)

# multi column
result = df.drop_duplicates(['A', 'B'])
expected = df.ix[[0, 2, 3, 6]]
assert_frame_equal(result, expected)

result = df.drop_duplicates(['A', 'B'], take_last=True)
expected = df.ix[[1, 5, 6, 7]]
assert_frame_equal(result, expected)

# nan
df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'bar', 'foo'],
'B' : ['one', 'one', 'two', 'two',
'two', 'two', 'one', 'two'],
'C' : [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
'D' : range(8)})

# single column
result = df.drop_duplicates('C')
expected = df[:2]
assert_frame_equal(result, expected)

result = df.drop_duplicates('C', take_last=True)
expected = df.ix[[3, 7]]
assert_frame_equal(result, expected)

# multi column
result = df.drop_duplicates(['C', 'B'])
expected = df.ix[[0, 1, 2, 4]]
assert_frame_equal(result, expected)

result = df.drop_duplicates(['C', 'B'], take_last=True)
expected = df.ix[[1, 3, 6, 7]]
assert_frame_equal(result, expected)

def test_drop_duplicates_inplace(self):
orig = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'bar', 'foo'],
'B' : ['one', 'one', 'two', 'two',
'two', 'two', 'one', 'two'],
'C' : [1, 1, 2, 2, 2, 2, 1, 2],
'D' : range(8)})

# single column
df = orig.copy()
df.drop_duplicates('A', inplace=True)
expected = orig[:2]
result = df
assert_frame_equal(result, expected)

df = orig.copy()
df.drop_duplicates('A', take_last=True, inplace=True)
expected = orig.ix[[6, 7]]
result = df
assert_frame_equal(result, expected)

# multi column
df = orig.copy()
df.drop_duplicates(['A', 'B'], inplace=True)
expected = orig.ix[[0, 1, 2, 3]]
result = df
assert_frame_equal(result, expected)

df = orig.copy()
df.drop_duplicates(['A', 'B'], take_last=True, inplace=True)
expected = orig.ix[[0, 5, 6, 7]]
result = df
assert_frame_equal(result, expected)

# consider everything
orig2 = orig.ix[:, ['A', 'B', 'C']].copy()

df2 = orig2.copy()
df2.drop_duplicates(inplace=True)
# in this case only
expected = orig2.drop_duplicates(['A', 'B'])
result = df2
assert_frame_equal(result, expected)

df2 = orig2.copy()
df2.drop_duplicates(take_last=True, inplace=True)
expected = orig2.drop_duplicates(['A', 'B'], take_last=True)
result = df2
assert_frame_equal(result, expected)

def test_drop_col_still_multiindex(self):
arrays = [[ 'a', 'b', 'c', 'top'],
[ '', '', '', 'OD' ],
6 changes: 4 additions & 2 deletions pandas/tests/test_tseries.py
@@ -165,7 +165,7 @@ def test_groupsort_indexer():


def test_duplicated_with_nas():
keys = [0, 1, nan, 0, 2, nan]
keys = np.array([0, 1, nan, 0, 2, nan], dtype=object)

result = lib.duplicated(keys)
expected = [False, False, False, True, False, True]
@@ -175,7 +175,9 @@ def test_duplicated_with_nas():
expected = [True, False, True, False, False, False]
assert(np.array_equal(result, expected))

keys = [(0, 0), (0, nan), (nan, 0), (nan, nan)] * 2
keys = np.empty(8, dtype=object)
for i, t in enumerate(zip([0, 0, nan, nan]*2, [0, nan, 0, nan]*2)):
keys[i] = t

result = lib.duplicated(keys)
falses = [False] * 4
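The test updates follow from the tightened signature: lib.duplicated now declares its argument as ndarray[object], so a plain Python list no longer passes, and the tuple keys are written into an empty object array one element at a time because np.array on a list of equal-length tuples would build a 2-D array rather than a 1-D array of tuple keys. A small sketch of that conversion (the values are illustrative):

import numpy as np
from numpy import nan

pairs = [(0, 0), (0, nan), (nan, 0), (nan, nan)] * 2

# np.array(pairs) would have shape (8, 2); filling an object array by
# hand keeps each tuple intact as a single hashable key.
keys = np.empty(len(pairs), dtype=object)
for i, t in enumerate(pairs):
    keys[i] = t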
27 changes: 27 additions & 0 deletions vb_suite/reindex.py
@@ -114,6 +114,7 @@ def backfill():

# pathological, but realistic
setup = common_setup + """
import pandas._tseries as lib
N = 10000
K = 10

@@ -135,6 +136,32 @@ def backfill():
name='frame_drop_duplicates',
start_date=datetime(2011, 11, 15))

statement = "df.drop_duplicates(['key1', 'key2'], inplace=True)"
frame_drop_dup_inplace = Benchmark(statement, setup,
name='frame_drop_dup_inplace',
start_date=datetime(2012, 5, 16))

lib_fast_zip = Benchmark('lib.fast_zip(df.values.T)', setup,
name='lib_fast_zip',
start_date=datetime(2012, 1, 1))

setup = setup + """
df.ix[:10000, :] = np.nan
"""
statement2 = "df.drop_duplicates(['key1', 'key2'])"
frame_drop_duplicates_na = Benchmark(statement2, setup,
name='frame_drop_duplicates_na',
start_date=datetime(2012, 5, 15))

lib_fast_zip_fillna = Benchmark('lib.fast_zip_fillna(df.values.T)', setup,
name='lib_fast_zip_fillna',
start_date=datetime(2012, 5, 15))

statement2 = "df.drop_duplicates(['key1', 'key2'], inplace=True)"
frame_drop_dup_na_inplace = Benchmark(statement2, setup,
name='frame_drop_dup_na_inplace',
start_date=datetime(2012, 5, 16))

#----------------------------------------------------------------------
# fillna, many columns
