From c22db31d6369bbc1021c6ae338aa5db97ac69e79 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Thu, 29 Feb 2024 22:17:46 +0800 Subject: [PATCH 1/6] Add test --- pandas/tests/test_algos.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 057a5a627370e..948a4cbece2ab 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -570,6 +570,13 @@ def test_object_refcount_bug(self): for i in range(1000): len(algos.unique(lst)) + def test_index_returned(self, index): + # GH#57043 + index = index.repeat(2) + result = algos.unique(index) + expected = index.unique() + tm.assert_index_equal(result, expected, exact=True) + def test_on_index_object(self): mindex = MultiIndex.from_arrays( [np.arange(5).repeat(5), np.tile(np.arange(5), 5)] From d6c7a29505b89a0fdd69c21fe85d404ce63acaf6 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Thu, 29 Feb 2024 22:21:03 +0800 Subject: [PATCH 2/6] Fix --- pandas/core/algorithms.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 3672cdb13d4a3..664de07649c5f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -438,6 +438,10 @@ def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None): # Dispatch to extension dtype's unique. return values.unique() + if isinstance(values, ABCIndex): + # Dispatch to Index's unique. + return values.unique() + original = values hashtable, values = _get_hashtable_algo(values) From 61cfc069db2e164d94af1d669ef1b8ac40ad2bc7 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Thu, 29 Feb 2024 22:29:16 +0800 Subject: [PATCH 3/6] Adjust tests --- pandas/tests/test_algos.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 948a4cbece2ab..b7bdd42122cd4 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -586,7 +586,7 @@ def test_on_index_object(self): mindex = mindex.repeat(2) - result = pd.unique(mindex) + result = pd.unique(mindex).values result.sort() tm.assert_almost_equal(result, expected) @@ -646,7 +646,7 @@ def test_datetime64_dtype_array_returned(self): ] ) result = algos.unique(dt_index) - tm.assert_numpy_array_equal(result, expected) + tm.assert_index_equal(result, to_datetime(expected)) assert result.dtype == expected.dtype s = Series(dt_index) @@ -677,7 +677,7 @@ def test_timedelta64_dtype_array_returned(self): td_index = to_timedelta([31200, 45678, 31200, 10000, 45678]) result = algos.unique(td_index) - tm.assert_numpy_array_equal(result, expected) + tm.assert_index_equal(result, to_timedelta(expected)) assert result.dtype == expected.dtype s = Series(td_index) From 0864bd418d406a0b86561ac43c0e556103db6612 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Thu, 29 Feb 2024 22:39:18 +0800 Subject: [PATCH 4/6] Add whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a95f0485abd5f..f795093503e76 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -367,6 +367,7 @@ Other ^^^^^ - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`) +- Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) From 3eed933ef06a7e6281d1bc181b390fae4d1b488f Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Sat, 9 Mar 2024 16:17:02 +0800 Subject: [PATCH 5/6] Improve tests --- pandas/tests/test_algos.py | 50 +++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b7bdd42122cd4..167bce6ed9655 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -16,7 +16,10 @@ is_integer_dtype, is_object_dtype, ) -from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, +) import pandas as pd from pandas import ( @@ -574,22 +577,28 @@ def test_index_returned(self, index): # GH#57043 index = index.repeat(2) result = algos.unique(index) - expected = index.unique() + + # dict.fromkeys preserves the order + unique_values = list(dict.fromkeys(index.values)) + if isinstance(index, MultiIndex): + expected = MultiIndex.from_tuples(unique_values, names=index.names) + else: + expected = Index(unique_values, dtype=index.dtype) + if isinstance(index.dtype, DatetimeTZDtype): + expected = expected.normalize() tm.assert_index_equal(result, expected, exact=True) def test_on_index_object(self): mindex = MultiIndex.from_arrays( [np.arange(5).repeat(5), np.tile(np.arange(5), 5)] ) - expected = mindex.values - expected.sort() + expected = mindex mindex = mindex.repeat(2) - result = pd.unique(mindex).values - result.sort() + result = pd.unique(mindex) - tm.assert_almost_equal(result, expected) + tm.assert_index_equal(result, expected, exact=True) def test_dtype_preservation(self, any_numpy_dtype): # GH 15442 @@ -630,7 +639,7 @@ def test_dtype_preservation(self, any_numpy_dtype): def test_datetime64_dtype_array_returned(self): # GH 9431 - expected = np.array( + dt_arr = np.array( [ "2015-01-03T00:00:00.000000000", "2015-01-01T00:00:00.000000000", @@ -646,18 +655,18 @@ def test_datetime64_dtype_array_returned(self): ] ) result = algos.unique(dt_index) - tm.assert_index_equal(result, to_datetime(expected)) - assert result.dtype == expected.dtype + expected = to_datetime(dt_arr) + tm.assert_index_equal(result, expected, exact=True) s = Series(dt_index) result = algos.unique(s) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype + tm.assert_numpy_array_equal(result, dt_arr) + assert result.dtype == dt_arr.dtype arr = s.values result = algos.unique(arr) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype + tm.assert_numpy_array_equal(result, dt_arr) + assert result.dtype == dt_arr.dtype def test_datetime_non_ns(self): a = np.array(["2000", "2000", "2001"], dtype="datetime64[s]") @@ -673,22 +682,23 @@ def test_timedelta_non_ns(self): def test_timedelta64_dtype_array_returned(self): # GH 9431 - expected = np.array([31200, 45678, 10000], dtype="m8[ns]") + td_arr = np.array([31200, 45678, 10000], dtype="m8[ns]") td_index = to_timedelta([31200, 45678, 31200, 10000, 45678]) result = algos.unique(td_index) - tm.assert_index_equal(result, to_timedelta(expected)) + expected = to_timedelta(td_arr) + tm.assert_index_equal(result, expected) assert result.dtype == expected.dtype s = Series(td_index) result = algos.unique(s) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype + tm.assert_numpy_array_equal(result, td_arr) + assert result.dtype == td_arr.dtype arr = s.values result = algos.unique(arr) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype + tm.assert_numpy_array_equal(result, td_arr) + assert result.dtype == td_arr.dtype def test_uint64_overflow(self): s = Series([1, 2, 2**63, 2**63], dtype=np.uint64) From 016b45b5825e0c57ef58193d24a2cb8dd77bd2a2 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Sat, 9 Mar 2024 20:30:54 +0800 Subject: [PATCH 6/6] Remove duplicate test --- pandas/tests/test_algos.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 167bce6ed9655..365ec452a7f25 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -588,18 +588,6 @@ def test_index_returned(self, index): expected = expected.normalize() tm.assert_index_equal(result, expected, exact=True) - def test_on_index_object(self): - mindex = MultiIndex.from_arrays( - [np.arange(5).repeat(5), np.tile(np.arange(5), 5)] - ) - expected = mindex - - mindex = mindex.repeat(2) - - result = pd.unique(mindex) - - tm.assert_index_equal(result, expected, exact=True) - def test_dtype_preservation(self, any_numpy_dtype): # GH 15442 if any_numpy_dtype in (tm.BYTES_DTYPES + tm.STRING_DTYPES):