Skip to content

Commit ed511e9

Browse files
WillAyd authored and jreback committed
Deprecated Index.get_duplicates() (#20544)
1 parent 3bb58ac commit ed511e9

File tree

9 files changed

+39
-16
lines changed

9 files changed

+39
-16
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -886,6 +886,7 @@ Deprecations
886886
- :func:`Series.rolling().apply() <pandas.core.window.Rolling.apply>`, :func:`DataFrame.rolling().apply() <pandas.core.window.Rolling.apply>`,
887887
:func:`Series.expanding().apply() <pandas.core.window.Expanding.apply>`, and :func:`DataFrame.expanding().apply() <pandas.core.window.Expanding.apply>` have deprecated passing an ``np.array`` by default. One will need to pass the new ``raw`` parameter to be explicit about what is passed (:issue:`20584`)
888888
- ``DatetimeIndex.offset`` is deprecated. Use ``DatetimeIndex.freq`` instead (:issue:`20716`)
889+
- ``Index.get_duplicates()`` is deprecated and will be removed in a future version. Use ``idx[idx.duplicated()].unique()`` instead (:issue:`20239`)
889890

890891
.. _whatsnew_0230.prior_deprecations:
891892

pandas/core/frame.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3879,7 +3879,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
38793879
index = _ensure_index_from_sequences(arrays, names)
38803880

38813881
if verify_integrity and not index.is_unique:
3882-
duplicates = index.get_duplicates()
3882+
duplicates = index[index.duplicated()].unique()
38833883
raise ValueError('Index has duplicate keys: {dup}'.format(
38843884
dup=duplicates))
38853885

pandas/core/indexes/base.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -1851,6 +1851,9 @@ def get_duplicates(self):
18511851
Returns a sorted list of index elements which appear more than once in
18521852
the index.
18531853
1854+
.. deprecated:: 0.23.0
1855+
Use ``idx[idx.duplicated()].unique()`` instead.
1856+
18541857
Returns
18551858
-------
18561859
array-like
@@ -1897,13 +1900,12 @@ def get_duplicates(self):
18971900
>>> pd.Index(dates).get_duplicates()
18981901
DatetimeIndex([], dtype='datetime64[ns]', freq=None)
18991902
"""
1900-
from collections import defaultdict
1901-
counter = defaultdict(lambda: 0)
1902-
for k in self.values:
1903-
counter[k] += 1
1904-
return sorted(k for k, v in compat.iteritems(counter) if v > 1)
1903+
warnings.warn("'get_duplicates' is deprecated and will be removed in "
1904+
"a future release. You can use "
1905+
"idx[idx.duplicated()].unique() instead",
1906+
FutureWarning, stacklevel=2)
19051907

1906-
_get_duplicates = get_duplicates
1908+
return self[self.duplicated()].unique()
19071909

19081910
def _cleanup(self):
19091911
self._engine.clear_mapping()

pandas/core/indexes/datetimelike.py

-4
Original file line numberDiff line numberDiff line change
@@ -502,10 +502,6 @@ def take(self, indices, axis=0, allow_fill=True,
502502
freq = self.freq if isinstance(self, ABCPeriodIndex) else None
503503
return self._shallow_copy(taken, freq=freq)
504504

505-
def get_duplicates(self):
506-
values = Index.get_duplicates(self)
507-
return self._simple_new(values)
508-
509505
_can_hold_na = True
510506

511507
_na_value = NaT

pandas/core/reshape/concat.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -504,7 +504,7 @@ def _get_concat_axis(self):
504504
def _maybe_check_integrity(self, concat_index):
505505
if self.verify_integrity:
506506
if not concat_index.is_unique:
507-
overlap = concat_index.get_duplicates()
507+
overlap = concat_index[concat_index.duplicated()].unique()
508508
raise ValueError('Indexes have overlapping values: '
509509
'{overlap!s}'.format(overlap=overlap))
510510

pandas/tests/indexes/datetimes/test_datetime.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import warnings
12

23
import pytest
34

@@ -178,7 +179,10 @@ def test_get_duplicates(self):
178179
idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-02',
179180
'2000-01-03', '2000-01-03', '2000-01-04'])
180181

181-
result = idx.get_duplicates()
182+
with warnings.catch_warnings(record=True):
183+
# Deprecated - see GH20239
184+
result = idx.get_duplicates()
185+
182186
ex = DatetimeIndex(['2000-01-02', '2000-01-03'])
183187
tm.assert_index_equal(result, ex)
184188

pandas/tests/indexes/test_base.py

+5
Original file line numberDiff line numberDiff line change
@@ -2068,6 +2068,11 @@ def test_cached_properties_not_settable(self):
20682068
with tm.assert_raises_regex(AttributeError, "Can't set attribute"):
20692069
idx.is_unique = False
20702070

2071+
def test_get_duplicates_deprecated(self):
2072+
idx = pd.Index([1, 2, 3])
2073+
with tm.assert_produces_warning(FutureWarning):
2074+
idx.get_duplicates()
2075+
20712076

20722077
class TestMixedIntIndex(Base):
20732078
# Mostly the tests from common.py for which the results differ

pandas/tests/indexes/test_multi.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -2432,7 +2432,12 @@ def check(nlevels, with_nulls):
24322432
for a in [101, 102]:
24332433
mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]])
24342434
assert not mi.has_duplicates
2435-
assert mi.get_duplicates() == []
2435+
2436+
with warnings.catch_warnings(record=True):
2437+
# Deprecated - see GH20239
2438+
assert mi.get_duplicates().equals(MultiIndex.from_arrays(
2439+
[[], []]))
2440+
24362441
tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(
24372442
2, dtype='bool'))
24382443

@@ -2444,7 +2449,12 @@ def check(nlevels, with_nulls):
24442449
labels=np.random.permutation(list(lab)).T)
24452450
assert len(mi) == (n + 1) * (m + 1)
24462451
assert not mi.has_duplicates
2447-
assert mi.get_duplicates() == []
2452+
2453+
with warnings.catch_warnings(record=True):
2454+
# Deprecated - see GH20239
2455+
assert mi.get_duplicates().equals(MultiIndex.from_arrays(
2456+
[[], []]))
2457+
24482458
tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(
24492459
len(mi), dtype='bool'))
24502460

pandas/tests/indexes/timedeltas/test_timedelta.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import warnings
2+
13
import pytest
24

35
import numpy as np
@@ -145,7 +147,10 @@ def test_get_duplicates(self):
145147
idx = TimedeltaIndex(['1 day', '2 day', '2 day', '3 day', '3day',
146148
'4day'])
147149

148-
result = idx.get_duplicates()
150+
with warnings.catch_warnings(record=True):
151+
# Deprecated - see GH20239
152+
result = idx.get_duplicates()
153+
149154
ex = TimedeltaIndex(['2 day', '3day'])
150155
tm.assert_index_equal(result, ex)
151156

0 commit comments

Comments
 (0)