Commit b2d5a33

Merge pull request #7979 from sinhrks/dup_idx
ENH: Add duplicated/drop_duplicates to Index
2 parents 76e6588 + 54d3e4d

7 files changed, +183 -41 lines changed

doc/source/api.rst (+4)

@@ -374,6 +374,8 @@ Reindexing / Selection / Label manipulation

   Series.align
   Series.drop
+  Series.drop_duplicates
+  Series.duplicated
   Series.equals
   Series.first
   Series.head

@@ -1165,6 +1167,8 @@ Modifying and Computations
   Index.diff
   Index.sym_diff
   Index.drop
+  Index.drop_duplicates
+  Index.duplicated
   Index.equals
   Index.factorize
   Index.identical

doc/source/v0.15.0.txt (+9)

@@ -223,6 +223,15 @@ API changes
   s
   s.loc[['D']]

+- ``Index`` now supports ``duplicated`` and ``drop_duplicates``. (:issue:`4060`)
+
+  .. ipython:: python
+
+     idx = Index([1, 2, 3, 4, 1, 2])
+     idx
+     idx.duplicated()
+     idx.drop_duplicates()
+
.. _whatsnew_0150.dt:

.dt accessor
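
Note: for orientation, a minimal sketch of what the documented example is expected to produce; the values below are inferred from the semantics added in this commit, not copied from the diff.

# Sketch only: expected behaviour of the new Index methods (API as of this commit).
from pandas import Index

idx = Index([1, 2, 3, 4, 1, 2])

# duplicated() flags later occurrences of a value; the trailing 1 and 2 are True.
assert list(idx.duplicated()) == [False, False, False, False, True, True]

# drop_duplicates() keeps the first occurrence of each value.
assert list(idx.drop_duplicates()) == [1, 2, 3, 4]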

pandas/core/base.py (+60 -1)

@@ -8,8 +8,14 @@
from pandas.core import common as com
import pandas.core.nanops as nanops
import pandas.tslib as tslib
+import pandas.lib as lib
from pandas.util.decorators import Appender, cache_readonly

+
+_shared_docs = dict()
+_indexops_doc_kwargs = dict(klass='IndexOpsMixin', inplace='')
+
+
class StringMixin(object):

    """implements string methods so long as object defines a `__unicode__`

@@ -474,12 +480,66 @@ def searchsorted(self, key, side='left'):
        #### needs tests/doc-string
        return self.values.searchsorted(key, side=side)

+    _shared_docs['drop_duplicates'] = (
+        """Return %(klass)s with duplicate values removed
+
+        Parameters
+        ----------
+        take_last : boolean, default False
+            Take the last observed index in a group. Default first
+        %(inplace)s
+
+        Returns
+        -------
+        deduplicated : %(klass)s
+        """)
+
+    @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs)
+    def drop_duplicates(self, take_last=False, inplace=False):
+        duplicated = self.duplicated(take_last=take_last)
+        result = self[~duplicated.values]
+        if inplace:
+            return self._update_inplace(result)
+        else:
+            return result
+
+    _shared_docs['duplicated'] = (
+        """Return boolean %(klass)s denoting duplicate values
+
+        Parameters
+        ----------
+        take_last : boolean, default False
+            Take the last observed index in a group. Default first
+
+        Returns
+        -------
+        duplicated : %(klass)s
+        """)
+
+    @Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs)
+    def duplicated(self, take_last=False):
+        keys = com._ensure_object(self.values)
+        duplicated = lib.duplicated(keys, take_last=take_last)
+        try:
+            return self._constructor(duplicated,
+                                     index=self.index).__finalize__(self)
+        except AttributeError:
+            from pandas.core.index import Index
+            return Index(duplicated)
+
    #----------------------------------------------------------------------
    # unbox reductions

    all = _unbox(np.ndarray.all)
    any = _unbox(np.ndarray.any)

+    #----------------------------------------------------------------------
+    # abstracts
+
+    def _update_inplace(self, result):
+        raise NotImplementedError
+
+
class DatetimeIndexOpsMixin(object):
    """ common ops mixin to support a unified inteface datetimelike Index """

@@ -497,7 +557,6 @@ def _box_values(self, values):
        """
        apply box func to passed values
        """
-        import pandas.lib as lib
        return lib.map_infer(values, self._box_func)

    @cache_readonly
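
Note: the mixin is shared between Series and Index, so the return type follows the caller. A minimal sketch (sample data assumed, API as of this commit):

# Sketch: IndexOpsMixin.duplicated returns an object of the caller's own type.
import pandas as pd

s = pd.Series([1, 1, 2], index=['a', 'b', 'c'])
idx = pd.Index([1, 1, 2])

s.duplicated()    # boolean Series aligned to s.index (the _constructor path)
idx.duplicated()  # boolean Index (the AttributeError fallback, since Index has no .index)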

pandas/core/index.py (+16 -1)

@@ -12,7 +12,7 @@
import pandas.algos as _algos
import pandas.index as _index
from pandas.lib import Timestamp, is_datetime_array
-from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin
+from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs
from pandas.util.decorators import Appender, cache_readonly, deprecate
from pandas.core.common import isnull, array_equivalent
import pandas.core.common as com

@@ -30,6 +30,8 @@

_unsortable_types = frozenset(('mixed', 'mixed-integer'))

+_index_doc_kwargs = dict(klass='Index', inplace='')
+

def _try_get_item(x):
    try:

@@ -209,6 +211,10 @@ def _simple_new(cls, values, name=None, **kwargs):
        result._reset_identity()
        return result

+    def _update_inplace(self, result):
+        # guard when called from IndexOpsMixin
+        raise TypeError("Index can't be updated inplace")
+
    def is_(self, other):
        """
        More flexible, faster check like ``is`` but that works through views

@@ -2022,6 +2028,15 @@ def drop(self, labels):
            raise ValueError('labels %s not contained in axis' % labels[mask])
        return self.delete(indexer)

+    @Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs)
+    def drop_duplicates(self, take_last=False):
+        result = super(Index, self).drop_duplicates(take_last=take_last)
+        return self._constructor(result)
+
+    @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
+    def duplicated(self, take_last=False):
+        return super(Index, self).duplicated(take_last=take_last)
+
    @classmethod
    def _add_numeric_methods_disabled(cls):
        """ add in numeric methods to disable """

pandas/core/series.py (+12 -37)

@@ -52,10 +52,13 @@

__all__ = ['Series']

+
_shared_doc_kwargs = dict(
    axes='index',
    klass='Series',
-    axes_single_arg="{0,'index'}"
+    axes_single_arg="{0,'index'}",
+    inplace="""inplace : boolean, default False
+        If True, performs operation inplace and returns None."""
)

@@ -265,6 +268,9 @@ def _set_subtyp(self, is_all_dates):
        else:
            object.__setattr__(self, '_subtyp', 'series')

+    def _update_inplace(self, result):
+        return generic.NDFrame._update_inplace(self, result)
+
    # ndarray compatibility
    @property
    def dtype(self):

@@ -1114,45 +1120,14 @@ def mode(self):
        from pandas.core.algorithms import mode
        return mode(self)

+    @Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs)
    def drop_duplicates(self, take_last=False, inplace=False):
-        """
-        Return Series with duplicate values removed
-
-        Parameters
-        ----------
-        take_last : boolean, default False
-            Take the last observed index in a group. Default first
-        inplace : boolean, default False
-            If True, performs operation inplace and returns None.
-
-        Returns
-        -------
-        deduplicated : Series
-        """
-        duplicated = self.duplicated(take_last=take_last)
-        result = self[-duplicated]
-        if inplace:
-            return self._update_inplace(result)
-        else:
-            return result
+        return super(Series, self).drop_duplicates(take_last=take_last,
+                                                   inplace=inplace)

+    @Appender(base._shared_docs['duplicated'] % _shared_doc_kwargs)
    def duplicated(self, take_last=False):
-        """
-        Return boolean Series denoting duplicate values
-
-        Parameters
-        ----------
-        take_last : boolean, default False
-            Take the last observed index in a group. Default first
-
-        Returns
-        -------
-        duplicated : Series
-        """
-        keys = _ensure_object(self.values)
-        duplicated = lib.duplicated(keys, take_last=take_last)
-        return self._constructor(duplicated,
-                                 index=self.index).__finalize__(self)
+        return super(Series, self).duplicated(take_last=take_last)

    def idxmin(self, axis=None, out=None, skipna=True):
        """

pandas/tests/test_base.py (+68 -2)

@@ -339,9 +339,13 @@ def test_value_counts_unique_nunique(self):
                # freq must be specified because repeat makes freq ambiguous
                expected_index = o[::-1]
                o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq)
-            else:
+            elif isinstance(o, Index):
                expected_index = values[::-1]
                o = klass(np.repeat(values, range(1, len(o) + 1)))
+            else:
+                expected_index = values[::-1]
+                idx = np.repeat(o.index.values, range(1, len(o) + 1))
+                o = klass(np.repeat(values, range(1, len(o) + 1)), index=idx)

            expected_s = Series(range(10, 0, -1), index=expected_index, dtype='int64')
            tm.assert_series_equal(o.value_counts(), expected_s)

@@ -374,11 +378,16 @@ def test_value_counts_unique_nunique(self):

            # create repeated values, 'n'th element is repeated by n+1 times
            if isinstance(o, PeriodIndex):
+                # freq must be specified because repeat makes freq ambiguous
                expected_index = o
                o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq)
-            else:
+            elif isinstance(o, Index):
                expected_index = values
                o = klass(np.repeat(values, range(1, len(o) + 1)))
+            else:
+                expected_index = values
+                idx = np.repeat(o.index.values, range(1, len(o) + 1))
+                o = klass(np.repeat(values, range(1, len(o) + 1)), index=idx)

            expected_s_na = Series(list(range(10, 2, -1)) +[3], index=expected_index[9:0:-1], dtype='int64')
            expected_s = Series(list(range(10, 2, -1)), index=expected_index[9:1:-1], dtype='int64')

@@ -571,6 +580,63 @@ def test_factorize(self):
        expected = o[5:].append(o[:5])
        self.assertTrue(uniques.equals(expected))

+    def test_duplicated_drop_duplicates(self):
+        # GH 4060
+        for original in self.objs:
+
+            if isinstance(original, Index):
+                # original doesn't have duplicates
+                expected = Index([False] * len(original))
+                tm.assert_index_equal(original.duplicated(), expected)
+                result = original.drop_duplicates()
+                tm.assert_index_equal(result, original)
+                self.assertFalse(result is original)
+
+                # create repeated values, 3rd and 5th values are duplicated
+                idx = original[list(range(len(original))) + [5, 3]]
+                expected = Index([False] * len(original) + [True, True])
+                tm.assert_index_equal(idx.duplicated(), expected)
+                tm.assert_index_equal(idx.drop_duplicates(), original)
+
+                last_base = [False] * len(idx)
+                last_base[3] = True
+                last_base[5] = True
+                expected = Index(last_base)
+                tm.assert_index_equal(idx.duplicated(take_last=True), expected)
+                tm.assert_index_equal(idx.drop_duplicates(take_last=True),
+                                      idx[~np.array(last_base)])
+
+                with tm.assertRaisesRegexp(TypeError,
+                                           "drop_duplicates\(\) got an unexpected keyword argument"):
+                    idx.drop_duplicates(inplace=True)
+
+            else:
+                expected = Series([False] * len(original), index=original.index)
+                tm.assert_series_equal(original.duplicated(), expected)
+                result = original.drop_duplicates()
+                tm.assert_series_equal(result, original)
+                self.assertFalse(result is original)
+
+                idx = original.index[list(range(len(original))) + [5, 3]]
+                values = original.values[list(range(len(original))) + [5, 3]]
+                s = Series(values, index=idx)
+
+                expected = Series([False] * len(original) + [True, True], index=idx)
+                tm.assert_series_equal(s.duplicated(), expected)
+                tm.assert_series_equal(s.drop_duplicates(), original)
+
+                last_base = [False] * len(idx)
+                last_base[3] = True
+                last_base[5] = True
+                expected = Series(last_base, index=idx)
+                expected
+                tm.assert_series_equal(s.duplicated(take_last=True), expected)
+                tm.assert_series_equal(s.drop_duplicates(take_last=True),
+                                       s[~np.array(last_base)])
+
+                s.drop_duplicates(inplace=True)
+                tm.assert_series_equal(s, original)
+

class TestDatetimeIndexOps(Ops):
    tz = [None, 'UTC', 'Asia/Tokyo', 'US/Eastern',
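
Note: take_last is the keyword used at this point in pandas' history (later versions replaced it with a keep argument). A minimal sketch of the semantics the tests exercise (sample data assumed):

# Sketch: default keeps first occurrences; take_last=True keeps the last ones.
import pandas as pd

idx = pd.Index(['a', 'b', 'a'])

list(idx.duplicated())                 # [False, False, True]
list(idx.duplicated(take_last=True))   # [True, False, False]

idx.drop_duplicates()                  # Index(['a', 'b'])
idx.drop_duplicates(take_last=True)    # Index(['b', 'a'])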

pandas/tests/test_multilevel.py (+14)

@@ -2031,6 +2031,20 @@ def test_duplicate_mi(self):
        result = df.loc[('foo','bar')]
        assert_frame_equal(result,expected)

+    def test_duplicated_drop_duplicates(self):
+        # GH 4060
+        idx = MultiIndex.from_arrays(([1, 2, 3, 1, 2 ,3], [1, 1, 1, 1, 2, 2]))
+
+        expected = Index([False, False, False, True, False, False])
+        tm.assert_index_equal(idx.duplicated(), expected)
+        expected = MultiIndex.from_arrays(([1, 2, 3, 2 ,3], [1, 1, 1, 2, 2]))
+        tm.assert_index_equal(idx.drop_duplicates(), expected)
+
+        expected = Index([True, False, False, False, False, False])
+        tm.assert_index_equal(idx.duplicated(take_last=True), expected)
+        expected = MultiIndex.from_arrays(([2, 3, 1, 2 ,3], [1, 1, 1, 2, 2]))
+        tm.assert_index_equal(idx.drop_duplicates(take_last=True), expected)
+
    def test_multiindex_set_index(self):
        # segfault in #3308
        d = {'t1': [2, 2.5, 3], 't2': [4, 5, 6]}
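
Note: as the test shows, duplicates on a MultiIndex are decided on whole label tuples, not on individual levels. A minimal sketch (data assumed):

# Sketch: MultiIndex duplicate detection compares complete label tuples.
import pandas as pd

mi = pd.MultiIndex.from_arrays([[1, 2, 3, 1, 2, 3],
                                [1, 1, 1, 1, 2, 2]])

# Only the tuple (1, 1) appears twice, so only its second occurrence is flagged.
list(mi.duplicated())   # [False, False, False, True, False, False]
mi.drop_duplicates()    # keeps (1, 1), (2, 1), (3, 1), (2, 2), (3, 2)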
