Skip to content

Commit d19f073

Browse files
committed
API: add return_inverse to pd.unique
1 parent 4ae63aa commit d19f073

File tree

4 files changed

+166
-98
lines changed

4 files changed

+166
-98
lines changed

doc/source/whatsnew/v0.24.0.rst

+18
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,24 @@ Example:
320320
See the :ref:`advanced docs on renaming<advanced.index_names>` for more details.
321321

322322

323+
.. _whatsnew_0240.enhancements.unique:
324+
325+
Changes to the ``unique``-method
326+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
327+
328+
The function :func:`pandas.unique` now supports the keyword ``return_inverse``, which, if set to ``True``,
329+
makes the output a tuple where the second component is an ndarray that contains the
330+
mapping from the indices of the values to their location in the returned unique values.
331+
332+
.. ipython:: python
333+
334+
idx = pd.Index([1, 0, 0, 1])
335+
uniques, inverse = pd.unique(idx, return_inverse=True)
336+
uniques
337+
inverse
338+
reconstruct = pd.Index(uniques[inverse])
339+
reconstruct.equals(idx)
340+
323341
.. _whatsnew_0240.enhancements.other:
324342

325343
Other Enhancements

pandas/core/algorithms.py

+27-2
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,7 @@ def match(to_match, values, na_sentinel=-1):
271271
return result
272272

273273

274-
def unique(values):
274+
def unique(values, return_inverse=False):
275275
"""
276276
Hash table-based unique. Uniques are returned in order
277277
of appearance. This does NOT sort.
@@ -344,18 +344,41 @@ def unique(values):
344344
pandas.Index.unique
345345
pandas.Series.unique
346346
"""
347+
from pandas import Index
347348

348349
values = _ensure_arraylike(values)
349350

350351
if is_extension_array_dtype(values):
351352
# Dispatch to extension dtype's unique.
353+
if return_inverse:
354+
# as long as return_inverse is not part of the EA.unique contract,
355+
# test if this works
356+
try:
357+
# make sure that we're not calling from an Index/Series
358+
# container, as these do not support return_inverse yet
359+
ea_val = getattr(values, 'array', values)
360+
result, inverse = ea_val.unique(return_inverse=return_inverse)
361+
362+
if is_categorical_dtype(values) and isinstance(values, Index):
363+
# pd.unique(CategoricalIndex) returns Index not Categorical
364+
result = Index(result)
365+
return result, inverse
366+
except TypeError:
367+
msg = ('The Extension Array class for type {dtype} does not '
368+
'yet support the unique-method with '
369+
'"return_inverse=True".'.format(dtype=type(values)))
370+
raise NotImplementedError(msg)
352371
return values.unique()
353372

354373
original = values
355374
htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
356375

357376
table = htable(len(values))
358-
uniques = table.unique(values)
377+
if return_inverse:
378+
uniques, inverse = table.unique(values, return_inverse=True)
379+
else:
380+
uniques = table.unique(values)
381+
359382
uniques = _reconstruct_data(uniques, dtype, original)
360383

361384
if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype):
@@ -365,6 +388,8 @@ def unique(values):
365388
# TODO: it must return DatetimeArray with tz in pandas 2.0
366389
uniques = uniques.astype(object).values
367390

391+
if return_inverse:
392+
return uniques, inverse
368393
return uniques
369394

370395

pandas/core/arrays/categorical.py

+24-4
Original file line numberDiff line numberDiff line change
@@ -2249,7 +2249,7 @@ def mode(self, dropna=True):
22492249
codes = sorted(htable.mode_int64(ensure_int64(codes), dropna))
22502250
return self._constructor(values=codes, dtype=self.dtype, fastpath=True)
22512251

2252-
def unique(self):
2252+
def unique(self, return_inverse=False):
22532253
"""
22542254
Return the ``Categorical`` whose ``categories`` and ``codes`` are
22552255
unique. Unused categories are NOT returned.
@@ -2259,9 +2259,22 @@ def unique(self):
22592259
- ordered category: values are sorted by appearance order, categories
22602260
keeps existing order.
22612261
2262+
Parameters
2263+
----------
2264+
return_inverse : boolean, default False
2265+
Whether to return the inverse of the unique values. If True, the
2266+
output will be a tuple where the second component is an
2267+
np.ndarray that contains the mapping between the indices of the
2268+
elements in the calling Categorical and their locations in the
2269+
unique values. See examples for how to reconstruct.
2270+
2271+
.. versionadded:: 0.24.0
2272+
22622273
Returns
22632274
-------
2264-
unique values : ``Categorical``
2275+
uniques : ``Categorical``
2276+
inverse : np.ndarray (if `return_inverse=True`)
2277+
The inverse from the `uniques` back to the calling ``Categorical``.
22652278
22662279
Examples
22672280
--------
@@ -2293,7 +2306,10 @@ def unique(self):
22932306
"""
22942307

22952308
# unlike np.unique, unique1d does not sort
2296-
unique_codes = unique1d(self.codes)
2309+
if return_inverse:
2310+
unique_codes, inverse = unique1d(self.codes, return_inverse=True)
2311+
else:
2312+
unique_codes = unique1d(self.codes, return_inverse=False)
22972313
cat = self.copy()
22982314

22992315
# keep nan in codes
@@ -2303,7 +2319,11 @@ def unique(self):
23032319
take_codes = unique_codes[unique_codes != -1]
23042320
if self.ordered:
23052321
take_codes = np.sort(take_codes)
2306-
return cat.set_categories(cat.categories.take(take_codes))
2322+
result = cat.set_categories(cat.categories.take(take_codes))
2323+
2324+
if return_inverse:
2325+
return result, inverse
2326+
return result
23072327

23082328
def _values_for_factorize(self):
23092329
codes = self.codes.astype('int64')

pandas/tests/test_algos.py

+97-92
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,20 @@
2525
from pandas.util.testing import assert_almost_equal
2626

2727

28+
def assert_series_or_index_or_array_or_categorical_equal(left, right):
29+
if isinstance(left, Series):
30+
tm.assert_series_equal(left, right)
31+
elif isinstance(left, Index):
32+
tm.assert_index_equal(left, right)
33+
elif isinstance(left, np.ndarray):
34+
tm.assert_numpy_array_equal(left, right)
35+
elif isinstance(left, Categorical):
36+
tm.assert_categorical_equal(left, right)
37+
else:
38+
# will fail
39+
assert isinstance(left, (Series, Index, np.ndarray, Categorical))
40+
41+
2842
class TestMatch(object):
2943

3044
def test_ints(self):
@@ -321,17 +335,22 @@ def test_parametrized_factorize_na_value(self, data, na_value):
321335

322336
class TestUnique(object):
323337

324-
def test_ints(self):
325-
arr = np.random.randint(0, 100, size=50)
338+
def test_unique_inverse(self, any_numpy_dtype):
339+
dtype = any_numpy_dtype
340+
arr = np.random.randint(0, 100, size=50).astype(dtype)
326341

327342
result = algos.unique(arr)
328343
assert isinstance(result, np.ndarray)
329344

330-
def test_objects(self):
331-
arr = np.random.randint(0, 100, size=50).astype('O')
345+
# reuse result as expected outcome of return_inverse case
346+
expected_uniques = result.copy()
332347

333-
result = algos.unique(arr)
334-
assert isinstance(result, np.ndarray)
348+
result_uniques, result_inverse = algos.unique(arr, return_inverse=True)
349+
tm.assert_numpy_array_equal(result_uniques, expected_uniques)
350+
351+
# reconstruction can only work if inverse is correct
352+
reconstr = result_uniques[result_inverse]
353+
tm.assert_numpy_array_equal(reconstr, arr, check_dtype=False)
335354

336355
def test_object_refcount_bug(self):
337356
lst = ['A', 'B', 'C', 'D', 'E']
@@ -376,24 +395,26 @@ def test_datetime64_dtype_array_returned(self):
376395
tm.assert_numpy_array_equal(result, expected)
377396
assert result.dtype == expected.dtype
378397

379-
def test_timedelta64_dtype_array_returned(self):
398+
@pytest.mark.parametrize('box', [Index, Series, np.array])
399+
def test_timedelta64_dtype_array_returned(self, box):
380400
# GH 9431
381401
expected = np.array([31200, 45678, 10000], dtype='m8[ns]')
382402

383403
td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678])
384-
result = algos.unique(td_index)
385-
tm.assert_numpy_array_equal(result, expected)
386-
assert result.dtype == expected.dtype
404+
obj = box(td_index)
387405

388-
s = Series(td_index)
389-
result = algos.unique(s)
406+
result = algos.unique(obj)
390407
tm.assert_numpy_array_equal(result, expected)
391-
assert result.dtype == expected.dtype
392408

393-
arr = s.values
394-
result = algos.unique(arr)
395-
tm.assert_numpy_array_equal(result, expected)
396-
assert result.dtype == expected.dtype
409+
# reuse result as expected outcome of return_inverse case
410+
expected_uniques = result.copy()
411+
412+
result_uniques, result_inverse = algos.unique(obj, return_inverse=True)
413+
tm.assert_numpy_array_equal(result_uniques, expected_uniques)
414+
415+
# reconstruction can only work if inverse is correct
416+
reconstr = box(result_uniques[result_inverse])
417+
assert_series_or_index_or_array_or_categorical_equal(reconstr, obj)
397418

398419
def test_uint64_overflow(self):
399420
s = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
@@ -406,78 +427,80 @@ def test_nan_in_object_array(self):
406427
expected = np.array(['a', np.nan, 'c'], dtype=object)
407428
tm.assert_numpy_array_equal(result, expected)
408429

409-
def test_categorical(self):
430+
result_uniques, result_inverse = pd.unique(duplicated_items,
431+
return_inverse=True)
432+
expected_inverse = np.array([0, 1, 2, 2], dtype='int64')
433+
tm.assert_numpy_array_equal(result_inverse, expected_inverse)
434+
435+
@pytest.mark.parametrize('ordered', [True, False])
436+
@pytest.mark.parametrize('box', [lambda x: x, Series, Index],
437+
ids=['Categorical', 'Series', 'Index'])
438+
@pytest.mark.parametrize('method', [lambda x, **kwargs: x.unique(**kwargs),
439+
pd.unique],
440+
ids=['classmethod', 'toplevel'])
441+
def test_categorical(self, method, box, ordered):
410442

411-
# we are expecting to return in the order
412-
# of appearance
413-
expected = Categorical(list('bac'), categories=list('bac'))
443+
categories = list('abc') if ordered else list('bac')
444+
expected = Categorical(list('bac'), categories=categories,
445+
ordered=ordered)
414446

415-
# we are expecting to return in the order
416-
# of the categories
417-
expected_o = Categorical(
418-
list('bac'), categories=list('abc'), ordered=True)
447+
# Index.unique always returns Index
448+
# pd.unique(Index) stays Index (only) for Categorical
449+
expected = box(expected) if box == Index else expected
419450

420451
# GH 15939
421-
c = Categorical(list('baabc'))
422-
result = c.unique()
423-
tm.assert_categorical_equal(result, expected)
452+
c = box(Categorical(list('baabc'), categories=categories,
453+
ordered=ordered))
454+
result = method(c)
424455

425-
result = algos.unique(c)
426-
tm.assert_categorical_equal(result, expected)
456+
assert_series_or_index_or_array_or_categorical_equal(result, expected)
427457

428-
c = Categorical(list('baabc'), ordered=True)
429-
result = c.unique()
430-
tm.assert_categorical_equal(result, expected_o)
458+
if method == pd.unique:
459+
# [Series/Index].unique do not yet support return_inverse=True
431460

432-
result = algos.unique(c)
433-
tm.assert_categorical_equal(result, expected_o)
461+
# reuse result as expected outcome of return_inverse case
462+
expected_uniques = result.copy()
463+
result_uniques, result_inverse = method(c, return_inverse=True)
434464

435-
# Series of categorical dtype
436-
s = Series(Categorical(list('baabc')), name='foo')
437-
result = s.unique()
438-
tm.assert_categorical_equal(result, expected)
465+
assert_series_or_index_or_array_or_categorical_equal(
466+
result_uniques, expected_uniques)
439467

440-
result = pd.unique(s)
441-
tm.assert_categorical_equal(result, expected)
468+
# reconstruction can only work if inverse is correct
469+
reconstr = box(result_uniques[result_inverse])
470+
assert_series_or_index_or_array_or_categorical_equal(reconstr, c)
442471

443-
# CI -> return CI
444-
ci = CategoricalIndex(Categorical(list('baabc'),
445-
categories=list('bac')))
446-
expected = CategoricalIndex(expected)
447-
result = ci.unique()
448-
tm.assert_index_equal(result, expected)
472+
@pytest.mark.parametrize('box', [Series, Index])
473+
@pytest.mark.parametrize('method', [lambda x, **kwargs: x.unique(**kwargs),
474+
pd.unique],
475+
ids=['classmethod', 'toplevel'])
476+
def test_datetime64tz_aware(self, method, box):
477+
# GH 15939
449478

450-
result = pd.unique(ci)
451-
tm.assert_index_equal(result, expected)
479+
ts = Timestamp('20160101', tz='US/Eastern')
480+
obj = box([ts, ts])
452481

453-
def test_datetime64tz_aware(self):
454-
# GH 15939
482+
if box == Series:
483+
expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
484+
tz='US/Eastern')], dtype=object)
485+
else: # Index
486+
expected = Index([ts])
455487

456-
result = Series(
457-
Index([Timestamp('20160101', tz='US/Eastern'),
458-
Timestamp('20160101', tz='US/Eastern')])).unique()
459-
expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
460-
tz='US/Eastern')], dtype=object)
461-
tm.assert_numpy_array_equal(result, expected)
488+
result = method(obj)
489+
assert_series_or_index_or_array_or_categorical_equal(result, expected)
462490

463-
result = Index([Timestamp('20160101', tz='US/Eastern'),
464-
Timestamp('20160101', tz='US/Eastern')]).unique()
465-
expected = DatetimeIndex(['2016-01-01 00:00:00'],
466-
dtype='datetime64[ns, US/Eastern]', freq=None)
467-
tm.assert_index_equal(result, expected)
468-
469-
result = pd.unique(
470-
Series(Index([Timestamp('20160101', tz='US/Eastern'),
471-
Timestamp('20160101', tz='US/Eastern')])))
472-
expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
473-
tz='US/Eastern')], dtype=object)
474-
tm.assert_numpy_array_equal(result, expected)
491+
if method == pd.unique:
492+
# [Series/Index].unique do not yet support return_inverse=True
493+
494+
# reuse result as expected outcome of return_inverse case
495+
expected_uniques = result.copy()
496+
result_uniques, result_inverse = method(obj, return_inverse=True)
475497

476-
result = pd.unique(Index([Timestamp('20160101', tz='US/Eastern'),
477-
Timestamp('20160101', tz='US/Eastern')]))
478-
expected = DatetimeIndex(['2016-01-01 00:00:00'],
479-
dtype='datetime64[ns, US/Eastern]', freq=None)
480-
tm.assert_index_equal(result, expected)
498+
assert_series_or_index_or_array_or_categorical_equal(
499+
result_uniques, expected_uniques)
500+
501+
# reconstruction can only work if inverse is correct
502+
reconstr = box(result_uniques[result_inverse])
503+
assert_series_or_index_or_array_or_categorical_equal(reconstr, obj)
481504

482505
def test_order_of_appearance(self):
483506
# 9346
@@ -491,28 +514,10 @@ def test_order_of_appearance(self):
491514
tm.assert_numpy_array_equal(result,
492515
np.array([2, 1], dtype='int64'))
493516

494-
result = pd.unique(Series([Timestamp('20160101'),
495-
Timestamp('20160101')]))
496-
expected = np.array(['2016-01-01T00:00:00.000000000'],
497-
dtype='datetime64[ns]')
498-
tm.assert_numpy_array_equal(result, expected)
499-
500-
result = pd.unique(Index(
501-
[Timestamp('20160101', tz='US/Eastern'),
502-
Timestamp('20160101', tz='US/Eastern')]))
503-
expected = DatetimeIndex(['2016-01-01 00:00:00'],
504-
dtype='datetime64[ns, US/Eastern]',
505-
freq=None)
506-
tm.assert_index_equal(result, expected)
507-
508517
result = pd.unique(list('aabc'))
509518
expected = np.array(['a', 'b', 'c'], dtype=object)
510519
tm.assert_numpy_array_equal(result, expected)
511520

512-
result = pd.unique(Series(Categorical(list('aabc'))))
513-
expected = Categorical(list('abc'))
514-
tm.assert_categorical_equal(result, expected)
515-
516521
@pytest.mark.parametrize("arg ,expected", [
517522
(('1', '1', '2'), np.array(['1', '2'], dtype=object)),
518523
(('foo',), np.array(['foo'], dtype=object))

0 commit comments

Comments
 (0)