diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 9812521fe2767..cc8f26bf95a2f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -390,6 +390,7 @@ Other ^^^^^ - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`) +- Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 774bbbe2463e9..344314d829c19 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -439,6 +439,10 @@ def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None): # Dispatch to extension dtype's unique. return values.unique() + if isinstance(values, ABCIndex): + # Dispatch to Index's unique. + return values.unique() + original = values hashtable, values = _get_hashtable_algo(values) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 057a5a627370e..365ec452a7f25 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -16,7 +16,10 @@ is_integer_dtype, is_object_dtype, ) -from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, +) import pandas as pd from pandas import ( @@ -570,19 +573,20 @@ def test_object_refcount_bug(self): for i in range(1000): len(algos.unique(lst)) - def test_on_index_object(self): - mindex = MultiIndex.from_arrays( - [np.arange(5).repeat(5), np.tile(np.arange(5), 5)] - ) - expected = mindex.values - expected.sort() - - mindex = mindex.repeat(2) + def test_index_returned(self, index): + # GH#57043 + index = index.repeat(2) + result = algos.unique(index) - result = pd.unique(mindex) - result.sort() - - tm.assert_almost_equal(result, expected) + # dict.fromkeys preserves the order + unique_values = list(dict.fromkeys(index.values)) + if isinstance(index, MultiIndex): + expected = MultiIndex.from_tuples(unique_values, names=index.names) + else: + expected = Index(unique_values, dtype=index.dtype) + if isinstance(index.dtype, DatetimeTZDtype): + expected = expected.normalize() + tm.assert_index_equal(result, expected, exact=True) def test_dtype_preservation(self, any_numpy_dtype): # GH 15442 @@ -623,7 +627,7 @@ def test_dtype_preservation(self, any_numpy_dtype): def test_datetime64_dtype_array_returned(self): # GH 9431 - expected = np.array( + dt_arr = np.array( [ "2015-01-03T00:00:00.000000000", "2015-01-01T00:00:00.000000000", @@ -639,18 +643,18 @@ def test_datetime64_dtype_array_returned(self): ] ) result = algos.unique(dt_index) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype + expected = to_datetime(dt_arr) + tm.assert_index_equal(result, expected, exact=True) s = Series(dt_index) result = algos.unique(s) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype + tm.assert_numpy_array_equal(result, dt_arr) + assert result.dtype == dt_arr.dtype arr = s.values result = algos.unique(arr) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype + tm.assert_numpy_array_equal(result, dt_arr) + assert result.dtype == dt_arr.dtype def test_datetime_non_ns(self): a = np.array(["2000", "2000", "2001"], dtype="datetime64[s]") @@ -666,22 +670,23 @@ def test_timedelta_non_ns(self): def test_timedelta64_dtype_array_returned(self): # GH 9431 - expected = np.array([31200, 45678, 10000], dtype="m8[ns]") + td_arr = np.array([31200, 45678, 10000], dtype="m8[ns]") td_index = to_timedelta([31200, 45678, 31200, 10000, 45678]) result = algos.unique(td_index) - tm.assert_numpy_array_equal(result, expected) + expected = to_timedelta(td_arr) + tm.assert_index_equal(result, expected) assert result.dtype == expected.dtype s = Series(td_index) result = algos.unique(s) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype + tm.assert_numpy_array_equal(result, td_arr) + assert result.dtype == td_arr.dtype arr = s.values result = algos.unique(arr) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype + tm.assert_numpy_array_equal(result, td_arr) + assert result.dtype == td_arr.dtype def test_uint64_overflow(self): s = Series([1, 2, 2**63, 2**63], dtype=np.uint64)