API: add return_inverse to pd.unique

h-vetinari · h-vetinari · commit d19f073bf705 · 2018-12-06T00:15:17.000+01:00
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -320,6 +320,24 @@ Example:
 See the :ref:`advanced docs on renaming<advanced.index_names>` for more details.
 
 
+.. _whatsnew_0240.enhancements.unique:
+
+Changes to the ``unique``-method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The function :func:`pandas.unique` now supports the keyword ``return_inverse``. If ``True``,
+the output is a tuple whose second component is an ndarray that maps each position in
+the input to the location of its value in the returned unique values.
+
+.. ipython:: python
+
+    idx = pd.Index([1, 0, 0, 1])
+    uniques, inverse = pd.unique(idx, return_inverse=True)
+    uniques
+    inverse
+    reconstruct = pd.Index(uniques[inverse])
+    reconstruct.equals(idx)
+
 .. _whatsnew_0240.enhancements.other:
 
 Other Enhancements
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -271,7 +271,7 @@ def match(to_match, values, na_sentinel=-1):
     return result
 
 
-def unique(values):
+def unique(values, return_inverse=False):
     """
     Hash table-based unique. Uniques are returned in order
     of appearance. This does NOT sort.
@@ -344,18 +344,41 @@ def unique(values):
     pandas.Index.unique
     pandas.Series.unique
     """
+    from pandas import Index
 
     values = _ensure_arraylike(values)
 
     if is_extension_array_dtype(values):
         # Dispatch to extension dtype's unique.
+        if return_inverse:
+            # return_inverse is not (yet) part of the EA.unique contract, so
+            # try the call and turn a TypeError into NotImplementedError
+            try:
+                # make sure that we're not calling from an Index/Series
+                # container, as these do not support return_inverse yet
+                ea_val = getattr(values, 'array', values)
+                result, inverse = ea_val.unique(return_inverse=return_inverse)
+
+                if is_categorical_dtype(values) and isinstance(values, Index):
+                    # pd.unique(CategoricalIndex) returns Index not Categorical
+                    result = Index(result)
+                return result, inverse
+            except TypeError:
+                msg = ('The Extension Array class for type {dtype} does not '
+                       'yet support the unique-method with '
+                       '"return_inverse=True".'.format(dtype=type(values)))
+                raise NotImplementedError(msg)
         return values.unique()
 
     original = values
     htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
 
     table = htable(len(values))
-    uniques = table.unique(values)
+    if return_inverse:
+        uniques, inverse = table.unique(values, return_inverse=True)
+    else:
+        uniques = table.unique(values)
+
     uniques = _reconstruct_data(uniques, dtype, original)
 
     if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype):
@@ -365,6 +388,8 @@ def unique(values):
         # TODO: it must return DatetimeArray with tz in pandas 2.0
         uniques = uniques.astype(object).values
 
+    if return_inverse:
+        return uniques, inverse
     return uniques
 
 
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2249,7 +2249,7 @@ def mode(self, dropna=True):
         codes = sorted(htable.mode_int64(ensure_int64(codes), dropna))
         return self._constructor(values=codes, dtype=self.dtype, fastpath=True)
 
-    def unique(self):
+    def unique(self, return_inverse=False):
         """
         Return the ``Categorical`` which ``categories`` and ``codes`` are
         unique. Unused categories are NOT returned.
@@ -2259,9 +2259,22 @@ def unique(self):
         - ordered category: values are sorted by appearance order, categories
           keeps existing order.
 
+        Parameters
+        ----------
+        return_inverse : boolean, default False
+            Whether to return the inverse of the unique values. If True, the
+            output will be a tuple where the second component is an
+            np.ndarray that contains the mapping between the indices of the
+            elements in the calling Categorical and their locations in the
+            unique values. See examples for how to reconstruct.
+
+            .. versionadded:: 0.24.0
+
         Returns
         -------
-        unique values : ``Categorical``
+        uniques : ``Categorical``
+        inverse : np.ndarray (if `return_inverse=True`)
+            The inverse from the `uniques` back to the calling ``Categorical``.
 
         Examples
         --------
@@ -2293,7 +2306,10 @@ def unique(self):
         """
 
         # unlike np.unique, unique1d does not sort
-        unique_codes = unique1d(self.codes)
+        if return_inverse:
+            unique_codes, inverse = unique1d(self.codes, return_inverse=True)
+        else:
+            unique_codes = unique1d(self.codes, return_inverse=False)
         cat = self.copy()
 
         # keep nan in codes
@@ -2303,7 +2319,11 @@ def unique(self):
         take_codes = unique_codes[unique_codes != -1]
         if self.ordered:
             take_codes = np.sort(take_codes)
-        return cat.set_categories(cat.categories.take(take_codes))
+        result = cat.set_categories(cat.categories.take(take_codes))
+
+        if return_inverse:
+            return result, inverse
+        return result
 
     def _values_for_factorize(self):
         codes = self.codes.astype('int64')
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -25,6 +25,20 @@
 from pandas.util.testing import assert_almost_equal
 
 
+def assert_series_or_index_or_array_or_categorical_equal(left, right):
+    if isinstance(left, Series):
+        tm.assert_series_equal(left, right)
+    elif isinstance(left, Index):
+        tm.assert_index_equal(left, right)
+    elif isinstance(left, np.ndarray):
+        tm.assert_numpy_array_equal(left, right)
+    elif isinstance(left, Categorical):
+        tm.assert_categorical_equal(left, right)
+    else:
+        # unsupported type: this assert always fails here and reports it
+        assert isinstance(left, (Series, Index, np.ndarray, Categorical))
+
+
 class TestMatch(object):
 
     def test_ints(self):
@@ -321,17 +335,22 @@ def test_parametrized_factorize_na_value(self, data, na_value):
 
 class TestUnique(object):
 
-    def test_ints(self):
-        arr = np.random.randint(0, 100, size=50)
+    def test_unique_inverse(self, any_numpy_dtype):
+        dtype = any_numpy_dtype
+        arr = np.random.randint(0, 100, size=50).astype(dtype)
 
         result = algos.unique(arr)
         assert isinstance(result, np.ndarray)
 
-    def test_objects(self):
-        arr = np.random.randint(0, 100, size=50).astype('O')
+        # reuse result as expected outcome of return_inverse case
+        expected_uniques = result.copy()
 
-        result = algos.unique(arr)
-        assert isinstance(result, np.ndarray)
+        result_uniques, result_inverse = algos.unique(arr, return_inverse=True)
+        tm.assert_numpy_array_equal(result_uniques, expected_uniques)
+
+        # reconstruction can only work if inverse is correct
+        reconstr = result_uniques[result_inverse]
+        tm.assert_numpy_array_equal(reconstr, arr, check_dtype=False)
 
     def test_object_refcount_bug(self):
         lst = ['A', 'B', 'C', 'D', 'E']
@@ -376,24 +395,26 @@ def test_datetime64_dtype_array_returned(self):
         tm.assert_numpy_array_equal(result, expected)
         assert result.dtype == expected.dtype
 
-    def test_timedelta64_dtype_array_returned(self):
+    @pytest.mark.parametrize('box', [Index, Series, np.array])
+    def test_timedelta64_dtype_array_returned(self, box):
         # GH 9431
         expected = np.array([31200, 45678, 10000], dtype='m8[ns]')
 
         td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678])
-        result = algos.unique(td_index)
-        tm.assert_numpy_array_equal(result, expected)
-        assert result.dtype == expected.dtype
+        obj = box(td_index)
 
-        s = Series(td_index)
-        result = algos.unique(s)
+        result = algos.unique(obj)
         tm.assert_numpy_array_equal(result, expected)
-        assert result.dtype == expected.dtype
 
-        arr = s.values
-        result = algos.unique(arr)
-        tm.assert_numpy_array_equal(result, expected)
-        assert result.dtype == expected.dtype
+        # reuse result as expected outcome of return_inverse case
+        expected_uniques = result.copy()
+
+        result_uniques, result_inverse = algos.unique(obj, return_inverse=True)
+        tm.assert_numpy_array_equal(result_uniques, expected_uniques)
+
+        # reconstruction can only work if inverse is correct
+        reconstr = box(result_uniques[result_inverse])
+        assert_series_or_index_or_array_or_categorical_equal(reconstr, obj)
 
     def test_uint64_overflow(self):
         s = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
@@ -406,78 +427,80 @@ def test_nan_in_object_array(self):
         expected = np.array(['a', np.nan, 'c'], dtype=object)
         tm.assert_numpy_array_equal(result, expected)
 
-    def test_categorical(self):
+        result_uniques, result_inverse = pd.unique(duplicated_items,
+                                                   return_inverse=True)
+        expected_inverse = np.array([0, 1, 2, 2], dtype='int64')
+        tm.assert_numpy_array_equal(result_inverse, expected_inverse)
+
+    @pytest.mark.parametrize('ordered', [True, False])
+    @pytest.mark.parametrize('box', [lambda x: x, Series, Index],
+                             ids=['Categorical', 'Series', 'Index'])
+    @pytest.mark.parametrize('method', [lambda x, **kwargs: x.unique(**kwargs),
+                                        pd.unique],
+                             ids=['classmethod', 'toplevel'])
+    def test_categorical(self, method, box, ordered):
 
-        # we are expecting to return in the order
-        # of appearance
-        expected = Categorical(list('bac'), categories=list('bac'))
+        categories = list('abc') if ordered else list('bac')
+        expected = Categorical(list('bac'), categories=categories,
+                               ordered=ordered)
 
-        # we are expecting to return in the order
-        # of the categories
-        expected_o = Categorical(
-            list('bac'), categories=list('abc'), ordered=True)
+        # Index.unique always returns Index
+        # pd.unique(Index) stays Index (only) for Categorical
+        expected = box(expected) if box == Index else expected
 
         # GH 15939
-        c = Categorical(list('baabc'))
-        result = c.unique()
-        tm.assert_categorical_equal(result, expected)
+        c = box(Categorical(list('baabc'), categories=categories,
+                            ordered=ordered))
+        result = method(c)
 
-        result = algos.unique(c)
-        tm.assert_categorical_equal(result, expected)
+        assert_series_or_index_or_array_or_categorical_equal(result, expected)
 
-        c = Categorical(list('baabc'), ordered=True)
-        result = c.unique()
-        tm.assert_categorical_equal(result, expected_o)
+        if method == pd.unique:
+            # [Series/Index].unique do not yet support return_inverse=True
 
-        result = algos.unique(c)
-        tm.assert_categorical_equal(result, expected_o)
+            # reuse result as expected outcome of return_inverse case
+            expected_uniques = result.copy()
+            result_uniques, result_inverse = method(c, return_inverse=True)
 
-        # Series of categorical dtype
-        s = Series(Categorical(list('baabc')), name='foo')
-        result = s.unique()
-        tm.assert_categorical_equal(result, expected)
+            assert_series_or_index_or_array_or_categorical_equal(
+                result_uniques, expected_uniques)
 
-        result = pd.unique(s)
-        tm.assert_categorical_equal(result, expected)
+            # reconstruction can only work if inverse is correct
+            reconstr = box(result_uniques[result_inverse])
+            assert_series_or_index_or_array_or_categorical_equal(reconstr, c)
 
-        # CI -> return CI
-        ci = CategoricalIndex(Categorical(list('baabc'),
-                                          categories=list('bac')))
-        expected = CategoricalIndex(expected)
-        result = ci.unique()
-        tm.assert_index_equal(result, expected)
+    @pytest.mark.parametrize('box', [Series, Index])
+    @pytest.mark.parametrize('method', [lambda x, **kwargs: x.unique(**kwargs),
+                                        pd.unique],
+                             ids=['classmethod', 'toplevel'])
+    def test_datetime64tz_aware(self, method, box):
+        # GH 15939
 
-        result = pd.unique(ci)
-        tm.assert_index_equal(result, expected)
+        ts = Timestamp('20160101', tz='US/Eastern')
+        obj = box([ts, ts])
 
-    def test_datetime64tz_aware(self):
-        # GH 15939
+        if box == Series:
+            expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
+                                           tz='US/Eastern')], dtype=object)
+        else:  # Index
+            expected = Index([ts])
 
-        result = Series(
-            Index([Timestamp('20160101', tz='US/Eastern'),
-                   Timestamp('20160101', tz='US/Eastern')])).unique()
-        expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
-                                       tz='US/Eastern')], dtype=object)
-        tm.assert_numpy_array_equal(result, expected)
+        result = method(obj)
+        assert_series_or_index_or_array_or_categorical_equal(result, expected)
 
-        result = Index([Timestamp('20160101', tz='US/Eastern'),
-                        Timestamp('20160101', tz='US/Eastern')]).unique()
-        expected = DatetimeIndex(['2016-01-01 00:00:00'],
-                                 dtype='datetime64[ns, US/Eastern]', freq=None)
-        tm.assert_index_equal(result, expected)
-
-        result = pd.unique(
-            Series(Index([Timestamp('20160101', tz='US/Eastern'),
-                          Timestamp('20160101', tz='US/Eastern')])))
-        expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
-                                       tz='US/Eastern')], dtype=object)
-        tm.assert_numpy_array_equal(result, expected)
+        if method == pd.unique:
+            # [Series/Index].unique do not yet support return_inverse=True
+
+            # reuse result as expected outcome of return_inverse case
+            expected_uniques = result.copy()
+            result_uniques, result_inverse = method(obj, return_inverse=True)
 
-        result = pd.unique(Index([Timestamp('20160101', tz='US/Eastern'),
-                                  Timestamp('20160101', tz='US/Eastern')]))
-        expected = DatetimeIndex(['2016-01-01 00:00:00'],
-                                 dtype='datetime64[ns, US/Eastern]', freq=None)
-        tm.assert_index_equal(result, expected)
+            assert_series_or_index_or_array_or_categorical_equal(
+                result_uniques, expected_uniques)
+
+            # reconstruction can only work if inverse is correct
+            reconstr = box(result_uniques[result_inverse])
+            assert_series_or_index_or_array_or_categorical_equal(reconstr, obj)
 
     def test_order_of_appearance(self):
         # 9346
@@ -491,28 +514,10 @@ def test_order_of_appearance(self):
         tm.assert_numpy_array_equal(result,
                                     np.array([2, 1], dtype='int64'))
 
-        result = pd.unique(Series([Timestamp('20160101'),
-                                   Timestamp('20160101')]))
-        expected = np.array(['2016-01-01T00:00:00.000000000'],
-                            dtype='datetime64[ns]')
-        tm.assert_numpy_array_equal(result, expected)
-
-        result = pd.unique(Index(
-            [Timestamp('20160101', tz='US/Eastern'),
-             Timestamp('20160101', tz='US/Eastern')]))
-        expected = DatetimeIndex(['2016-01-01 00:00:00'],
-                                 dtype='datetime64[ns, US/Eastern]',
-                                 freq=None)
-        tm.assert_index_equal(result, expected)
-
         result = pd.unique(list('aabc'))
         expected = np.array(['a', 'b', 'c'], dtype=object)
         tm.assert_numpy_array_equal(result, expected)
 
-        result = pd.unique(Series(Categorical(list('aabc'))))
-        expected = Categorical(list('abc'))
-        tm.assert_categorical_equal(result, expected)
-
     @pytest.mark.parametrize("arg ,expected", [
         (('1', '1', '2'), np.array(['1', '2'], dtype=object)),
         (('foo',), np.array(['foo'], dtype=object))