From 076061bd7cb506f7f7863cd6e2dcc67d68c2cb9b Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Fri, 7 Aug 2020 11:16:39 +0300 Subject: [PATCH 01/60] BUG: attempt initial fix --- pandas/core/indexes/base.py | 10 ++++++++-- pandas/core/resample.py | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a07c3328def54..76d018f909108 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4441,9 +4441,15 @@ def sort_values( """ idx = ensure_key_mapped(self, key) - _as = idx.argsort() + # GH 35584. Place missing values at the end of sorted Index, + # same as Series.sort_values + bad = isna(idx) + good = ~bad + _as = np.arange(len(idx), dtype=np.int64) + _as = np.concatenate([_as[good][idx[good].argsort()], _as[bad]]) + if not ascending: - _as = _as[::-1] + _as[:np.sum(good)] = _as[:np.sum(good)][::-1] sorted_index = self.take(_as) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index fc54128ae5aa6..39c8ed2b4390c 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -287,6 +287,7 @@ def aggregate(self, func, *args, **kwargs): self._set_binner() result, how = self._aggregate(func, *args, **kwargs) if result is None: + print("Here") how = func grouper = None result = self._groupby_and_aggregate(how, grouper, *args, **kwargs) From 3fa93513fe9d3801022ebfa2101a77aa5fd51437 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Fri, 7 Aug 2020 12:22:48 +0300 Subject: [PATCH 02/60] TST: add test --- pandas/tests/indexes/test_common.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index db260b71e7186..5cb7d476d90da 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -13,7 +13,7 @@ from pandas.core.dtypes.common import is_period_dtype, needs_i8_conversion import pandas as pd -from pandas import CategoricalIndex, MultiIndex, RangeIndex +from pandas import CategoricalIndex, Index, MultiIndex, RangeIndex import pandas._testing as tm @@ -391,3 +391,12 @@ def test_astype_preserves_name(self, index, dtype): assert result.names == index.names else: assert result.name == index.name + + +def test_sort_values_with_missing(): + # GH 35584. Test that sort_values works with missing values, + # sort non-missing and places missing at the end + idx_with_nan = Index(["a", None, "c", None, "e"]) + expected = Index(["a", "c", "e", None, None]) + result = idx_with_nan.sort_values() + tm.assert_index_equal(result, expected) From 333c6e404ff20e9d06c85726c866db9e060ee9db Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Fri, 7 Aug 2020 12:27:57 +0300 Subject: [PATCH 03/60] CLN: run black --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 76d018f909108..980b7e08a15f5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4449,7 +4449,7 @@ def sort_values( _as = np.concatenate([_as[good][idx[good].argsort()], _as[bad]]) if not ascending: - _as[:np.sum(good)] = _as[:np.sum(good)][::-1] + _as[: np.sum(good)] = _as[: np.sum(good)][::-1] sorted_index = self.take(_as) From 8414fd052ca86370a2e605deeb91ee5044d42351 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Fri, 7 Aug 2020 14:47:25 +0300 Subject: [PATCH 04/60] CLN: clean up unnecessary print --- pandas/core/resample.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 39c8ed2b4390c..fc54128ae5aa6 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -287,7 +287,6 @@ def aggregate(self, func, *args, **kwargs): self._set_binner() result, how = self._aggregate(func, *args, **kwargs) if result is None: - print("Here") how = func grouper = None result = self._groupby_and_aggregate(how, grouper, *args, **kwargs) From fa1289841ee2de75ef4a247ec919ccb321b78053 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Fri, 7 Aug 2020 16:01:27 +0300 Subject: [PATCH 05/60] exempt MultiIndex from handling missing values --- pandas/core/indexes/base.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 980b7e08a15f5..bdd36b4fae724 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4443,13 +4443,18 @@ def sort_values( # GH 35584. Place missing values at the end of sorted Index, # same as Series.sort_values - bad = isna(idx) - good = ~bad - _as = np.arange(len(idx), dtype=np.int64) - _as = np.concatenate([_as[good][idx[good].argsort()], _as[bad]]) + if not isinstance(self, ABCMultiIndex): + bad = isna(idx) + good = ~bad + _as = np.arange(len(idx), dtype=np.int64) + _as = np.concatenate([_as[good][idx[good].argsort()], _as[bad]]) + if not ascending: + _as[: np.sum(good)] = _as[: np.sum(good)][::-1] + else: + _as = idx.argsort() + if not ascending: + _as = _as[::-1] - if not ascending: - _as[: np.sum(good)] = _as[: np.sum(good)][::-1] sorted_index = self.take(_as) From e2575c9b6248003cfa028d878d6967709acdc9ba Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Fri, 7 Aug 2020 16:47:25 +0300 Subject: [PATCH 06/60] CLN: run black --- pandas/core/indexes/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bdd36b4fae724..eb2ab5cccc188 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4455,7 +4455,6 @@ def sort_values( if not ascending: _as = _as[::-1] - sorted_index = self.take(_as) if return_indexer: From 4293e22d0e57791e53c2b9964717a5f1adcd3894 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Mon, 10 Aug 2020 10:55:40 +0300 Subject: [PATCH 07/60] add na_position kwarg to sort_values --- pandas/core/indexes/base.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index eb2ab5cccc188..153f3e7dc63ab 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4387,7 +4387,11 @@ def asof_locs(self, where, mask): return result def sort_values( - self, return_indexer=False, ascending=True, key: Optional[Callable] = None + self, + return_indexer=False, + ascending=True, + key: Optional[Callable] = None, + na_position: str = "last", ): """ Return a sorted copy of the index. @@ -4410,6 +4414,12 @@ def sort_values( .. versionadded:: 1.1.0 + na_position : {'first' or 'last'}, default 'last' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. + + .. versionadded:: 1.1.1 + Returns ------- sorted_index : pandas.Index @@ -4447,7 +4457,15 @@ def sort_values( bad = isna(idx) good = ~bad _as = np.arange(len(idx), dtype=np.int64) - _as = np.concatenate([_as[good][idx[good].argsort()], _as[bad]]) + if na_position == "last": + _as = np.concatenate([_as[good][idx[good].argsort()], _as[bad]]) + elif na_position == "first": + _as = np.concatenate([_as[bad], _as[good][idx[good].argsort()]]) + else: + raise ValueError( + "Invalid na_position keyword argument value." + f"Can be only 'first' or 'last', {na_position} given." + ) if not ascending: _as[: np.sum(good)] = _as[: np.sum(good)][::-1] else: From 275067e1a11a50eb87918287090207b7b8673b4a Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Mon, 10 Aug 2020 11:18:40 +0300 Subject: [PATCH 08/60] debug na_position logic in sort_values --- pandas/core/indexes/base.py | 6 ++++-- pandas/tests/indexes/interval/test_interval.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 153f3e7dc63ab..5f1351d6d3e2f 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4459,15 +4459,17 @@ def sort_values( _as = np.arange(len(idx), dtype=np.int64) if na_position == "last": _as = np.concatenate([_as[good][idx[good].argsort()], _as[bad]]) + if not ascending: + _as[: np.sum(good)] = _as[: np.sum(good)][::-1] elif na_position == "first": _as = np.concatenate([_as[bad], _as[good][idx[good].argsort()]]) + if not ascending: + _as[np.sum(bad) :] = _as[np.sum(bad) :][::-1] else: raise ValueError( "Invalid na_position keyword argument value." f"Can be only 'first' or 'last', {na_position} given." ) - if not ascending: - _as[: np.sum(good)] = _as[: np.sum(good)][::-1] else: _as = idx.argsort() if not ascending: diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 2755b186f3eae..b5e150f25f543 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -618,7 +618,7 @@ def test_sort_values(self, closed): expected = IntervalIndex([Interval(0, 1), Interval(1, 2), np.nan]) tm.assert_index_equal(result, expected) - result = index.sort_values(ascending=False) + result = index.sort_values(ascending=False, na_position="first") expected = IntervalIndex([np.nan, Interval(1, 2), Interval(0, 1)]) tm.assert_index_equal(result, expected) From 098fe10ccb70bd3427ba537f9b5384a0fac12aa8 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Mon, 10 Aug 2020 13:21:33 +0300 Subject: [PATCH 09/60] CLN: fix na_position type --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5f1351d6d3e2f..1e780a1675751 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4391,7 +4391,7 @@ def sort_values( return_indexer=False, ascending=True, key: Optional[Callable] = None, - na_position: str = "last", + na_position: str_t = "last", ): """ Return a sorted copy of the index. From f0d4c2d0c73210232d7061d7e48faf8d7a229f76 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Mon, 10 Aug 2020 16:42:46 +0300 Subject: [PATCH 10/60] DOC: mv to 1.2.0 since API is changed --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 55570341cf4e8..4bfd1ff0563b8 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -218,7 +218,7 @@ Interval Indexing ^^^^^^^^ -- +- Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) - Missing From 743eca66afc8b52e63cab6815de1b8321d2fb28d Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Tue, 11 Aug 2020 08:47:01 +0300 Subject: [PATCH 11/60] DOC: make the comment more specific --- pandas/core/indexes/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1e780a1675751..3fdd9bc7ff4de 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4451,8 +4451,7 @@ def sort_values( """ idx = ensure_key_mapped(self, key) - # GH 35584. Place missing values at the end of sorted Index, - # same as Series.sort_values + # GH 35584. Sort missing values according to na_position kwarg if not isinstance(self, ABCMultiIndex): bad = isna(idx) good = ~bad From 9d17d7a8a7952ddbf755a53dd1dec62fc859cbc6 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Tue, 11 Aug 2020 10:24:28 +0300 Subject: [PATCH 12/60] TST: parametrize the test, add raise check --- pandas/core/indexes/base.py | 4 ++-- pandas/tests/indexes/test_common.py | 30 +++++++++++++++++++++++------ 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3fdd9bc7ff4de..29896585f36c5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4466,8 +4466,8 @@ def sort_values( _as[np.sum(bad) :] = _as[np.sum(bad) :][::-1] else: raise ValueError( - "Invalid na_position keyword argument value." - f"Can be only 'first' or 'last', {na_position} given." + "Invalid na_position keyword argument value. " + f"Can be only 'first' or 'last', '{na_position}' given." ) else: _as = idx.argsort() diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 5cb7d476d90da..f75a1ef5d498b 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -393,10 +393,28 @@ def test_astype_preserves_name(self, index, dtype): assert result.name == index.name -def test_sort_values_with_missing(): +@pytest.mark.parametrize( + "idx", [Index(["a", None, "c", None, "e"])] +) +@pytest.mark.parametrize( + "na_position, expected", + [ + ("first", Index([None, None, "a", "c", "e"])), + ("last", Index(["a", "c", "e", None, None])), + ("middle", None), + ] +) +def test_sort_values_with_missing(idx, na_position, expected): # GH 35584. Test that sort_values works with missing values, - # sort non-missing and places missing at the end - idx_with_nan = Index(["a", None, "c", None, "e"]) - expected = Index(["a", "c", "e", None, None]) - result = idx_with_nan.sort_values() - tm.assert_index_equal(result, expected) + # sort non-missing and places missing according to na_position + + if na_position not in ["first", "last"]: + with pytest.raises( + ValueError, + match="Invalid na_position keyword argument value. " + f"Can be only 'first' or 'last', '{na_position}' given." + ): + idx.sort_values(na_position=na_position) + else: + result = idx.sort_values(na_position=na_position) + tm.assert_index_equal(result, expected) From 744e5c15bf4298c64971a72c12ac0889c705ad36 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Tue, 11 Aug 2020 10:28:13 +0300 Subject: [PATCH 13/60] CLN: run black --- pandas/tests/indexes/test_common.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index f75a1ef5d498b..22e361627cbb8 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -393,16 +393,14 @@ def test_astype_preserves_name(self, index, dtype): assert result.name == index.name -@pytest.mark.parametrize( - "idx", [Index(["a", None, "c", None, "e"])] -) +@pytest.mark.parametrize("idx", [Index(["a", None, "c", None, "e"])]) @pytest.mark.parametrize( "na_position, expected", [ ("first", Index([None, None, "a", "c", "e"])), ("last", Index(["a", "c", "e", None, None])), ("middle", None), - ] + ], ) def test_sort_values_with_missing(idx, na_position, expected): # GH 35584. Test that sort_values works with missing values, @@ -412,7 +410,7 @@ def test_sort_values_with_missing(idx, na_position, expected): with pytest.raises( ValueError, match="Invalid na_position keyword argument value. " - f"Can be only 'first' or 'last', '{na_position}' given." + f"Can be only 'first' or 'last', '{na_position}' given.", ): idx.sort_values(na_position=na_position) else: From 13fcd9473b667b5295aa592d3b9823a9cb5091f6 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Tue, 11 Aug 2020 11:19:46 +0300 Subject: [PATCH 14/60] CLN: fix versionadded --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 29896585f36c5..255f5d39ec11c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4418,7 +4418,7 @@ def sort_values( Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at the end. - .. versionadded:: 1.1.1 + .. versionadded:: 1.2.0 Returns ------- From 33b02da84e7842c718c70b8940a59a2538c7462c Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 13 Aug 2020 10:37:03 +0300 Subject: [PATCH 15/60] CLN: move na_position before key --- pandas/core/indexes/base.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 255f5d39ec11c..ae5f4097a1968 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4390,8 +4390,8 @@ def sort_values( self, return_indexer=False, ascending=True, - key: Optional[Callable] = None, na_position: str_t = "last", + key: Optional[Callable] = None, ): """ Return a sorted copy of the index. @@ -4405,6 +4405,12 @@ def sort_values( Should the indices that would sort the index be returned. ascending : bool, default True Should the index values be sorted in an ascending order. + na_position : {'first' or 'last'}, default 'last' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. + + .. versionadded:: 1.2.0 + key : callable, optional If not None, apply the key function to the index values before sorting. This is similar to the `key` argument in the @@ -4414,12 +4420,6 @@ def sort_values( .. versionadded:: 1.1.0 - na_position : {'first' or 'last'}, default 'last' - Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at - the end. - - .. versionadded:: 1.2.0 - Returns ------- sorted_index : pandas.Index From 73cc94cf2e1d85819b4fa4b35c91101b19490d5d Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 13 Aug 2020 10:38:13 +0300 Subject: [PATCH 16/60] DOC: add comment for MultiIndex --- pandas/core/indexes/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ae5f4097a1968..f0b63af1ed197 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4470,6 +4470,7 @@ def sort_values( f"Can be only 'first' or 'last', '{na_position}' given." ) else: + # GH 35584. Ignore na_position for MultiIndex _as = idx.argsort() if not ascending: _as = _as[::-1] From 153fb06bf8fa49a2a3f951061bb7b9e95f7f31e8 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 13 Aug 2020 14:35:40 +0300 Subject: [PATCH 17/60] REFACT: switch to nargsort from sorting.py --- pandas/core/indexes/base.py | 23 +++++------------------ pandas/tests/indexes/test_common.py | 3 +-- 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f0b63af1ed197..9d4c190d82fb8 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -86,7 +86,7 @@ import pandas.core.missing as missing from pandas.core.ops import get_op_result_name from pandas.core.ops.invalid import make_invalid_op -from pandas.core.sorting import ensure_key_mapped +from pandas.core.sorting import ensure_key_mapped, nargsort from pandas.core.strings import StringMethods from pandas.io.formats.printing import ( @@ -4452,25 +4452,12 @@ def sort_values( idx = ensure_key_mapped(self, key) # GH 35584. Sort missing values according to na_position kwarg + # ignore na_position for MutiIndex if not isinstance(self, ABCMultiIndex): - bad = isna(idx) - good = ~bad - _as = np.arange(len(idx), dtype=np.int64) - if na_position == "last": - _as = np.concatenate([_as[good][idx[good].argsort()], _as[bad]]) - if not ascending: - _as[: np.sum(good)] = _as[: np.sum(good)][::-1] - elif na_position == "first": - _as = np.concatenate([_as[bad], _as[good][idx[good].argsort()]]) - if not ascending: - _as[np.sum(bad) :] = _as[np.sum(bad) :][::-1] - else: - raise ValueError( - "Invalid na_position keyword argument value. " - f"Can be only 'first' or 'last', '{na_position}' given." - ) + _as = nargsort( + items=idx, ascending=ascending, na_position=na_position, key=key + ) else: - # GH 35584. Ignore na_position for MultiIndex _as = idx.argsort() if not ascending: _as = _as[::-1] diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 22e361627cbb8..bee0610bca88a 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -409,8 +409,7 @@ def test_sort_values_with_missing(idx, na_position, expected): if na_position not in ["first", "last"]: with pytest.raises( ValueError, - match="Invalid na_position keyword argument value. " - f"Can be only 'first' or 'last', '{na_position}' given.", + match=f"invalid na_position: {na_position}", ): idx.sort_values(na_position=na_position) else: From 3481b8186b30dc404025444b5cc24447bf11798e Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 13 Aug 2020 15:39:46 +0300 Subject: [PATCH 18/60] reverse equal els in nargsort of ascending is False --- pandas/core/sorting.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 8bdd466ae6f33..0e2a59d37acae 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -306,9 +306,9 @@ def nargsort( non_nan_idx = idx[~mask] nan_idx = np.nonzero(mask)[0] - if not ascending: - non_nans = non_nans[::-1] - non_nan_idx = non_nan_idx[::-1] + + # GH 35584. Reverse order for equal elements if ascending is False + # to use in Index and Series sort_values indexer = non_nan_idx[non_nans.argsort(kind=kind)] if not ascending: indexer = indexer[::-1] From 9d92dddefe52277dd6f9eb3230c6b4be520717b0 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 13 Aug 2020 16:15:22 +0300 Subject: [PATCH 19/60] switch Series to nargsort --- pandas/core/series.py | 49 ++++++++++-------------------------------- pandas/core/sorting.py | 7 +++++- 2 files changed, 17 insertions(+), 39 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index a8a2d300fa168..3b3cefce0444d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -89,7 +89,7 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexing import check_bool_indexer from pandas.core.internals import SingleBlockManager -from pandas.core.sorting import ensure_key_mapped +from pandas.core.sorting import ensure_key_mapped, nargsort from pandas.core.strings import StringMethods from pandas.core.tools.datetimes import to_datetime @@ -3254,54 +3254,27 @@ def sort_values( "sort in-place you must create a copy" ) - def _try_kind_sort(arr): - arr = ensure_key_mapped(arr, key) - arr = getattr(arr, "_values", arr) - - # easier to ask forgiveness than permission - try: - # if kind==mergesort, it can fail for object dtype - return arr.argsort(kind=kind) - except TypeError: - # stable sort not available for object dtype - # uses the argsort default quicksort - return arr.argsort(kind="quicksort") - - arr = self._values - sorted_index = np.empty(len(self), dtype=np.int32) - - bad = isna(arr) - - good = ~bad - idx = ibase.default_index(len(self)) - - argsorted = _try_kind_sort(self[good]) - if is_list_like(ascending): if len(ascending) != 1: raise ValueError( f"Length of ascending ({len(ascending)}) must be 1 for Series" ) ascending = ascending[0] - if not is_bool(ascending): raise ValueError("ascending must be boolean") - if not ascending: - argsorted = argsorted[::-1] - - if na_position == "last": - n = good.sum() - sorted_index[:n] = idx[good][argsorted] - sorted_index[n:] = idx[bad] - elif na_position == "first": - n = bad.sum() - sorted_index[n:] = idx[good][argsorted] - sorted_index[:n] = idx[bad] + if key is not None: + arr = ensure_key_mapped(self, key) + arr = getattr(arr, "_values", arr) else: - raise ValueError(f"invalid na_position: {na_position}") + arr = self._values + sorted_index = nargsort( + items=arr, ascending=ascending, kind=kind, na_position=na_position, key=None + ) - result = self._constructor(arr[sorted_index], index=self.index[sorted_index]) + result = self._constructor( + self._values[sorted_index], index=self.index[sorted_index] + ) if ignore_index: result.index = ibase.default_index(len(sorted_index)) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 0e2a59d37acae..bf40e1ad6d5ea 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -309,7 +309,12 @@ def nargsort( # GH 35584. Reverse order for equal elements if ascending is False # to use in Index and Series sort_values - indexer = non_nan_idx[non_nans.argsort(kind=kind)] + try: + indexer = non_nan_idx[non_nans.argsort(kind=kind)] + except: + # For compatibility with Series: fall back to quicksort + # when mergesort is unavailable for object element type + indexer = non_nan_idx[non_nans.argsort(kind="quicksort")] if not ascending: indexer = indexer[::-1] # Finally, place the NaNs at the end or the beginning according to From 7ec9e7e3ce719069538636d48cf9883fa338f9f2 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 13 Aug 2020 16:26:44 +0300 Subject: [PATCH 20/60] Revert "reverse equal els in nargsort of ascending is False" Most tests expect the order of equal elements to be preserved when ascending is False. --- pandas/core/sorting.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index bf40e1ad6d5ea..30ca24b2ee110 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -307,14 +307,16 @@ def nargsort( nan_idx = np.nonzero(mask)[0] - # GH 35584. Reverse order for equal elements if ascending is False - # to use in Index and Series sort_values + if not ascending: + non_nans = non_nans[::-1] + non_nan_idx = non_nan_idx[::-1] try: indexer = non_nan_idx[non_nans.argsort(kind=kind)] except: # For compatibility with Series: fall back to quicksort # when mergesort is unavailable for object element type indexer = non_nan_idx[non_nans.argsort(kind="quicksort")] + if not ascending: indexer = indexer[::-1] # Finally, place the NaNs at the end or the beginning according to From 2d16b8d0e25674b7d8085ccf7c5a98ce280d77c9 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 13 Aug 2020 16:41:13 +0300 Subject: [PATCH 21/60] add TypeError to exception catch in nargsort --- pandas/core/sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 30ca24b2ee110..d5e42b6ad6683 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -312,7 +312,7 @@ def nargsort( non_nan_idx = non_nan_idx[::-1] try: indexer = non_nan_idx[non_nans.argsort(kind=kind)] - except: + except TypeError: # For compatibility with Series: fall back to quicksort # when mergesort is unavailable for object element type indexer = non_nan_idx[non_nans.argsort(kind="quicksort")] From 89390402261074184451db525a19ee6bfb7543fc Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 13 Aug 2020 16:46:36 +0300 Subject: [PATCH 22/60] reverse equal els order in nargsort (again) --- pandas/core/sorting.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index d5e42b6ad6683..7572cabfdbae1 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -307,9 +307,6 @@ def nargsort( nan_idx = np.nonzero(mask)[0] - if not ascending: - non_nans = non_nans[::-1] - non_nan_idx = non_nan_idx[::-1] try: indexer = non_nan_idx[non_nans.argsort(kind=kind)] except TypeError: From 066f0af46214ceda5fffec528ae410e721980d18 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Sat, 15 Aug 2020 15:02:02 +0300 Subject: [PATCH 23/60] CLN: isort test_datetime.py --- pandas/tests/series/indexing/test_datetime.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 088f8681feb99..19d5dfa3b3900 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -11,6 +11,7 @@ from pandas import DataFrame, DatetimeIndex, NaT, Series, Timestamp, date_range import pandas._testing as tm + """ Also test support for datetime64[ns] in Series / DataFrame """ From 7e351cd9848048eb29e242f2ddc0b4cac0e51e9e Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Sat, 15 Aug 2020 16:10:08 +0300 Subject: [PATCH 24/60] keep equal el order in nargsort (again) --- pandas/core/sorting.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 7572cabfdbae1..d5e42b6ad6683 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -307,6 +307,9 @@ def nargsort( nan_idx = np.nonzero(mask)[0] + if not ascending: + non_nans = non_nans[::-1] + non_nan_idx = non_nan_idx[::-1] try: indexer = non_nan_idx[non_nans.argsort(kind=kind)] except TypeError: From 79ed8eec0e1fc92c279a75d95e92d7c2c5eee6a2 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Sat, 15 Aug 2020 17:51:28 +0300 Subject: [PATCH 25/60] add equals_reversible to nargsort and a debugging branch --- pandas/core/series.py | 90 ++++++++++++++++++++++++++++++++++-------- pandas/core/sorting.py | 8 +++- 2 files changed, 80 insertions(+), 18 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 3b3cefce0444d..a520af3c13a6f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3254,27 +3254,83 @@ def sort_values( "sort in-place you must create a copy" ) - if is_list_like(ascending): - if len(ascending) != 1: - raise ValueError( - f"Length of ascending ({len(ascending)}) must be 1 for Series" - ) - ascending = ascending[0] - if not is_bool(ascending): - raise ValueError("ascending must be boolean") - - if key is not None: - arr = ensure_key_mapped(self, key) + def _try_kind_sort(arr): + arr = ensure_key_mapped(arr, key) arr = getattr(arr, "_values", arr) + + # easier to ask forgiveness than permission + try: + # if kind==mergesort, it can fail for object dtype + return arr.argsort(kind=kind) + except TypeError: + # stable sort not available for object dtype + # uses the argsort default quicksort + return arr.argsort(kind="quicksort") + + if True: + if is_list_like(ascending): + if len(ascending) != 1: + raise ValueError( + f"Length of ascending ({len(ascending)}) must be 1 for Series" + ) + ascending = ascending[0] + if not is_bool(ascending): + raise ValueError("ascending must be boolean") + + if key is not None: + arr = ensure_key_mapped(self, key) + arr = getattr(arr, "_values", arr) + else: + arr = self._values + sorted_index = nargsort( + items=arr, + ascending=ascending, + kind=kind, + na_position=na_position, + key=None, + equals_reversible=True, + ) + + result = self._constructor( + self._values[sorted_index], index=self.index[sorted_index] + ) else: arr = self._values - sorted_index = nargsort( - items=arr, ascending=ascending, kind=kind, na_position=na_position, key=None - ) + sorted_index = np.empty(len(self), dtype=np.int32) - result = self._constructor( - self._values[sorted_index], index=self.index[sorted_index] - ) + bad = isna(arr) + + good = ~bad + idx = ibase.default_index(len(self)) + + argsorted = _try_kind_sort(self[good]) + + if is_list_like(ascending): + if len(ascending) != 1: + raise ValueError( + f"Length of ascending ({len(ascending)}) must be 1 for Series" + ) + ascending = ascending[0] + + if not is_bool(ascending): + raise ValueError("ascending must be boolean") + + if not ascending: + argsorted = argsorted[::-1] + + if na_position == "last": + n = good.sum() + sorted_index[:n] = idx[good][argsorted] + sorted_index[n:] = idx[bad] + elif na_position == "first": + n = bad.sum() + sorted_index[n:] = idx[good][argsorted] + sorted_index[:n] = idx[bad] + else: + raise ValueError(f"invalid na_position: {na_position}") + result = self._constructor( + arr[sorted_index], index=self.index[sorted_index] + ) if ignore_index: result.index = ibase.default_index(len(sorted_index)) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index d5e42b6ad6683..cf457e1f6c1aa 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -271,6 +271,7 @@ def nargsort( ascending: bool = True, na_position: str = "last", key: Optional[Callable] = None, + equals_reversible: bool = False, ): """ Intended to be a drop-in replacement for np.argsort which handles NaNs. @@ -285,6 +286,11 @@ def nargsort( ascending : bool, default True na_position : {'first', 'last'}, default 'last' key : Optional[Callable], default None + equals_reversible: bool, default False + Whether equal elements should be reversed or keep their order + when ascending is False. True reverses elements when ascending is False. + Added for compatibility between Index and Series + (False for Index, True for Series) """ if key is not None: @@ -307,7 +313,7 @@ def nargsort( nan_idx = np.nonzero(mask)[0] - if not ascending: + if not ascending and not equals_reversible: non_nans = non_nans[::-1] non_nan_idx = non_nan_idx[::-1] try: From 62268e8dd9a6e02be489a44e2f1e10eec85875b8 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Sat, 15 Aug 2020 17:59:47 +0300 Subject: [PATCH 26/60] CLN: remove debug code --- pandas/core/series.py | 95 ++++++++++--------------------------------- 1 file changed, 22 insertions(+), 73 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index a520af3c13a6f..2b7070530c19b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3254,83 +3254,32 @@ def sort_values( "sort in-place you must create a copy" ) - def _try_kind_sort(arr): - arr = ensure_key_mapped(arr, key) - arr = getattr(arr, "_values", arr) - - # easier to ask forgiveness than permission - try: - # if kind==mergesort, it can fail for object dtype - return arr.argsort(kind=kind) - except TypeError: - # stable sort not available for object dtype - # uses the argsort default quicksort - return arr.argsort(kind="quicksort") - - if True: - if is_list_like(ascending): - if len(ascending) != 1: - raise ValueError( - f"Length of ascending ({len(ascending)}) must be 1 for Series" - ) - ascending = ascending[0] - if not is_bool(ascending): - raise ValueError("ascending must be boolean") - - if key is not None: - arr = ensure_key_mapped(self, key) - arr = getattr(arr, "_values", arr) - else: - arr = self._values - sorted_index = nargsort( - items=arr, - ascending=ascending, - kind=kind, - na_position=na_position, - key=None, - equals_reversible=True, - ) + if is_list_like(ascending): + if len(ascending) != 1: + raise ValueError( + f"Length of ascending ({len(ascending)}) must be 1 for Series" + ) + ascending = ascending[0] + if not is_bool(ascending): + raise ValueError("ascending must be boolean") - result = self._constructor( - self._values[sorted_index], index=self.index[sorted_index] - ) + if key is not None: + arr = ensure_key_mapped(self, key) + arr = getattr(arr, "_values", arr) else: arr = self._values - sorted_index = np.empty(len(self), dtype=np.int32) - - bad = isna(arr) - - good = ~bad - idx = ibase.default_index(len(self)) - - argsorted = _try_kind_sort(self[good]) + sorted_index = nargsort( + items=arr, + ascending=ascending, + kind=kind, + na_position=na_position, + key=None, + equals_reversible=True, + ) - if is_list_like(ascending): - if len(ascending) != 1: - raise ValueError( - f"Length of ascending ({len(ascending)}) must be 1 for Series" - ) - ascending = ascending[0] - - if not is_bool(ascending): - raise ValueError("ascending must be boolean") - - if not ascending: - argsorted = argsorted[::-1] - - if na_position == "last": - n = good.sum() - sorted_index[:n] = idx[good][argsorted] - sorted_index[n:] = idx[bad] - elif na_position == "first": - n = bad.sum() - sorted_index[n:] = idx[good][argsorted] - sorted_index[:n] = idx[bad] - else: - raise ValueError(f"invalid na_position: {na_position}") - result = self._constructor( - arr[sorted_index], index=self.index[sorted_index] - ) + result = self._constructor( + self._values[sorted_index], index=self.index[sorted_index] + ) if ignore_index: result.index = ibase.default_index(len(sorted_index)) From 5cab3d4fb5335f71edd72e88f168e347b0391206 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Sat, 15 Aug 2020 18:43:14 +0300 Subject: [PATCH 27/60] CLN: remove extra newline in test_datetime --- pandas/tests/series/indexing/test_datetime.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 19d5dfa3b3900..088f8681feb99 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -11,7 +11,6 @@ from pandas import DataFrame, DatetimeIndex, NaT, Series, Timestamp, date_range import pandas._testing as tm - """ Also test support for datetime64[ns] in Series / DataFrame """ From 30ac9491a1a93a4fe52e0da1b4cc32ba0af5ca24 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Sat, 15 Aug 2020 19:37:11 +0300 Subject: [PATCH 28/60] CLN: alter test_ops to preserve equal element order Preserve equal element order in indexes.period.test_ops.test_order_compat --- pandas/tests/indexes/period/test_ops.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index e7dd76584d780..cc2df54758774 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -175,7 +175,9 @@ def _check_freq(index, expected_index): ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) - exp = np.array([2, 1, 3, 4, 0]) + # GH 35584. Preserve order of equal index elements when + # ascending is False + exp = np.array([2, 1, 3, 0, 4]) tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) _check_freq(ordered, idx) From 0ac0a3cbb6bd3f8fe9fce54ccc6cc6ab682bdbc5 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Sat, 15 Aug 2020 19:38:02 +0300 Subject: [PATCH 29/60] CLN: run black --- pandas/tests/indexes/test_common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index bee0610bca88a..b85392e037c60 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -408,8 +408,7 @@ def test_sort_values_with_missing(idx, na_position, expected): if na_position not in ["first", "last"]: with pytest.raises( - ValueError, - match=f"invalid na_position: {na_position}", + ValueError, match=f"invalid na_position: {na_position}", ): idx.sort_values(na_position=na_position) else: From 051c3e705645c6c6581b2f4886531d0ec442c983 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Fri, 21 Aug 2020 11:33:01 +0300 Subject: [PATCH 30/60] revert changes to Series and connected tests --- pandas/core/series.py | 54 +++++++++++++++++-------- pandas/core/sorting.py | 8 +--- pandas/tests/indexes/period/test_ops.py | 4 +- 3 files changed, 40 insertions(+), 26 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 2b7070530c19b..a8a2d300fa168 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -89,7 +89,7 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexing import check_bool_indexer from pandas.core.internals import SingleBlockManager -from pandas.core.sorting import ensure_key_mapped, nargsort +from pandas.core.sorting import ensure_key_mapped from pandas.core.strings import StringMethods from pandas.core.tools.datetimes import to_datetime @@ -3254,32 +3254,54 @@ def sort_values( "sort in-place you must create a copy" ) + def _try_kind_sort(arr): + arr = ensure_key_mapped(arr, key) + arr = getattr(arr, "_values", arr) + + # easier to ask forgiveness than permission + try: + # if kind==mergesort, it can fail for object dtype + return arr.argsort(kind=kind) + except TypeError: + # stable sort not available for object dtype + # uses the argsort default quicksort + return arr.argsort(kind="quicksort") + + arr = self._values + sorted_index = np.empty(len(self), dtype=np.int32) + + bad = isna(arr) + + good = ~bad + idx = ibase.default_index(len(self)) + + argsorted = _try_kind_sort(self[good]) + if is_list_like(ascending): if len(ascending) != 1: raise ValueError( f"Length of ascending ({len(ascending)}) must be 1 for Series" ) ascending = ascending[0] + if not is_bool(ascending): raise ValueError("ascending must be boolean") - if key is not None: - arr = ensure_key_mapped(self, key) - arr = getattr(arr, "_values", arr) + if not ascending: + argsorted = argsorted[::-1] + + if na_position == "last": + n = good.sum() + sorted_index[:n] = idx[good][argsorted] + sorted_index[n:] = idx[bad] + elif na_position == "first": + n = bad.sum() + sorted_index[n:] = idx[good][argsorted] + sorted_index[:n] = idx[bad] else: - arr = self._values - sorted_index = nargsort( - items=arr, - ascending=ascending, - kind=kind, - na_position=na_position, - key=None, - equals_reversible=True, - ) + raise ValueError(f"invalid na_position: {na_position}") - result = self._constructor( - self._values[sorted_index], index=self.index[sorted_index] - ) + result = self._constructor(arr[sorted_index], index=self.index[sorted_index]) if ignore_index: result.index = ibase.default_index(len(sorted_index)) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index cf457e1f6c1aa..d5e42b6ad6683 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -271,7 +271,6 @@ def nargsort( ascending: bool = True, na_position: str = "last", key: Optional[Callable] = None, - equals_reversible: bool = False, ): """ Intended to be a drop-in replacement for np.argsort which handles NaNs. @@ -286,11 +285,6 @@ def nargsort( ascending : bool, default True na_position : {'first', 'last'}, default 'last' key : Optional[Callable], default None - equals_reversible: bool, default False - Whether equal elements should be reversed or keep their order - when ascending is False. True reverses elements when ascending is False. - Added for compatibility between Index and Series - (False for Index, True for Series) """ if key is not None: @@ -313,7 +307,7 @@ def nargsort( nan_idx = np.nonzero(mask)[0] - if not ascending and not equals_reversible: + if not ascending: non_nans = non_nans[::-1] non_nan_idx = non_nan_idx[::-1] try: diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index cc2df54758774..e7dd76584d780 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -175,9 +175,7 @@ def _check_freq(index, expected_index): ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) - # GH 35584. Preserve order of equal index elements when - # ascending is False - exp = np.array([2, 1, 3, 0, 4]) + exp = np.array([2, 1, 3, 4, 0]) tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) _check_freq(ordered, idx) From dfdee40f29100e35e50bd57c438b33d5d7d42b8e Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Fri, 21 Aug 2020 11:56:26 +0300 Subject: [PATCH 31/60] reimplement test fix for Index.sort_values --- pandas/tests/indexes/period/test_ops.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index e7dd76584d780..cc2df54758774 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -175,7 +175,9 @@ def _check_freq(index, expected_index): ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) - exp = np.array([2, 1, 3, 4, 0]) + # GH 35584. Preserve order of equal index elements when + # ascending is False + exp = np.array([2, 1, 3, 0, 4]) tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) _check_freq(ordered, idx) From acf61709199035a3491f825878c13dba94487c8c Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Fri, 21 Aug 2020 14:16:16 +0300 Subject: [PATCH 32/60] TST: don't expect same indexer for Index and PeriodIndex The new sort_values implementation for Index is stable when sorting in descending order. However, we expect PeriodIndex.sort_values to be unstable (and have tests for this). So until we enforce the same convention for sorting duplicate values in descending order, we can't check that the indexers for these classes are the same. --- pandas/tests/indexes/period/test_ops.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index cc2df54758774..2a4e2534b7157 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -175,11 +175,9 @@ def _check_freq(index, expected_index): ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) - # GH 35584. Preserve order of equal index elements when - # ascending is False - exp = np.array([2, 1, 3, 0, 4]) - tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) - _check_freq(ordered, idx) + # GH 35584. Index sort is now stable when sorting in descending order + # so we don't expect PeriodIndex and Index to return same indexers + # for now. TODO: test indexers after making PeriodIndex stable pidx = PeriodIndex(["2011", "2013", "NaT", "2011"], name="pidx", freq="D") From 4b438526b818e386e2021abe37a6668fec5b6fa4 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Tue, 25 Aug 2020 14:18:35 +0300 Subject: [PATCH 33/60] TST: add Index types, switch to fixture --- pandas/tests/indexes/test_common.py | 77 +++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 16 deletions(-) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index b85392e037c60..f3e0dd2d4a60f 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -13,7 +13,17 @@ from pandas.core.dtypes.common import is_period_dtype, needs_i8_conversion import pandas as pd -from pandas import CategoricalIndex, Index, MultiIndex, RangeIndex +from pandas import ( + CategoricalIndex, + DatetimeIndex, + PeriodIndex, + Index, + Int64Index, + UInt64Index, + MultiIndex, + RangeIndex, + TimedeltaIndex, +) import pandas._testing as tm @@ -393,24 +403,59 @@ def test_astype_preserves_name(self, index, dtype): assert result.name == index.name -@pytest.mark.parametrize("idx", [Index(["a", None, "c", None, "e"])]) -@pytest.mark.parametrize( - "na_position, expected", - [ - ("first", Index([None, None, "a", "c", "e"])), - ("last", Index(["a", "c", "e", None, None])), - ("middle", None), - ], +@pytest.fixture( + params=[ + tm.makeBoolIndex(10), + tm.makeFloatIndex(10), + tm.makeIntervalIndex(10), + tm.makeStringIndex(10), + tm.makeIntIndex(10), + tm.makeUIntIndex(10), + tm.makeRangeIndex(10), + tm.makeCategoricalIndex(10), + tm.makeMultiIndex(10), + tm.makeDateIndex(10), + tm.makePeriodIndex(10), + tm.makeTimedeltaIndex(10), + ] ) -def test_sort_values_with_missing(idx, na_position, expected): - # GH 35584. Test that sort_values works with missing values, - # sort non-missing and places missing according to na_position - +def index_with_missing(request): + ind = request.param + if type(ind) in [DatetimeIndex, PeriodIndex, TimedeltaIndex]: + pytest.xfail("stable descending order sort not implemented") + if type(ind) in [Int64Index, UInt64Index, RangeIndex]: + pytest.xfail("missing values not supported") + if type(ind) in [CategoricalIndex, MultiIndex]: + pytest.xfail("missing value sorting order not defined for index type") + vals = request.param.values + vals[2] = None + vals[5] = None + ind = type(ind)(vals) + return ind + + +@pytest.mark.parametrize("na_position", ["middle"]) +def test_sort_values_invalid_na_position(index_with_missing, na_position): if na_position not in ["first", "last"]: with pytest.raises( ValueError, match=f"invalid na_position: {na_position}", ): - idx.sort_values(na_position=na_position) + index_with_missing.sort_values(na_position=na_position) + + +@pytest.mark.parametrize("na_position", ["first", "last"]) +def test_sort_values_with_missing(index_with_missing, na_position): + # GH 35584. Test that sort_values works with missing values, + # sort non-missing and places missing according to na_position + + missing_count = np.sum(index_with_missing.isna()) + not_na_vals = index_with_missing[index_with_missing.notna()].values + sorted_values = np.sort(not_na_vals) + if na_position == "first": + sorted_values = np.concatenate([[None] * missing_count, sorted_values]) else: - result = idx.sort_values(na_position=na_position) - tm.assert_index_equal(result, expected) + sorted_values = np.concatenate([sorted_values, [None] * missing_count]) + expected = type(index_with_missing)(sorted_values) + + result = index_with_missing.sort_values(na_position=na_position) + tm.assert_index_equal(result, expected) From fb8f703bd6a2ef26937f47c8151578a2134a4f49 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Tue, 25 Aug 2020 14:38:59 +0300 Subject: [PATCH 34/60] CLN: fix import order --- pandas/tests/indexes/test_common.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index f3e0dd2d4a60f..70b646ee95537 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -16,13 +16,12 @@ from pandas import ( CategoricalIndex, DatetimeIndex, - PeriodIndex, - Index, Int64Index, - UInt64Index, MultiIndex, + PeriodIndex, RangeIndex, TimedeltaIndex, + UInt64Index, ) import pandas._testing as tm From fc58c3f9dd568837890d04735dc2a51950d1f6f7 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Tue, 25 Aug 2020 17:55:56 +0300 Subject: [PATCH 35/60] TST: add test for Index and PeriodIndex incompatibility --- pandas/tests/indexes/period/test_ops.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 2a4e2534b7157..3c7b478383ee4 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -174,6 +174,7 @@ def _check_freq(index, expected_index): ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) + _check_freq(ordered, idx) # GH 35584. Index sort is now stable when sorting in descending order # so we don't expect PeriodIndex and Index to return same indexers @@ -333,3 +334,16 @@ def test_freq_setter_deprecated(self): # warning for setter with pytest.raises(AttributeError, match="can't set attribute"): idx.freq = pd.offsets.Day() + + +@pytest.mark.xfail(reason="PeriodIndex.sort_values currently unstable") +def test_order_incompat(): + # GH 35584. The new implementation of sort_values for Index.sort_values + # is stable when sorting in descending order. PeriodIndex.sort_values + # currently isn't stable. xfail should be removed after + # the implementations' behavior is synchronized. + pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A") + iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx") + ordered1, indexer1 = pidx.sort_values(return_indexer=True, ascending=False) + ordered2, indexer2 = iidx.sort_values(return_indexer=True, ascending=False) + tm.assert_numpy_array_equal(indexer1, indexer2) From 1dfd4f2fd5f5f2f2f8ab91d29555cca21b45e091 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 27 Aug 2020 15:41:57 +0300 Subject: [PATCH 36/60] TST: move fixture to pandas/conftest.py --- pandas/conftest.py | 20 ++++++++++++++ pandas/tests/indexes/test_common.py | 42 +---------------------------- 2 files changed, 21 insertions(+), 41 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 0878380d00837..bc4660bcf90d2 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -437,6 +437,26 @@ def index(request): index_fixture2 = index +@pytest.fixture(params=indices_dict.keys()) +def index_with_missing(request): + """ + Fixture for indices with missing values + """ + if request.param in ["datetime", "datetime-tz", "period", "timedelta"]: + pytest.xfail("stable descending order sort not implemented") + if request.param in ["int", "uint", "range", "empty", "repeats"]: + pytest.xfail("missing values not supported") + if request.param in ["categorical", "tuples", "mi-with-dt64tz-level", "multi"]: + pytest.xfail("missing value sorting order not defined for index type") + ind = indices_dict[request.param].copy() + vals = ind.values + print(request.param) + vals[0] = None + vals[-1] = None + ind = type(ind)(vals) + return ind + + # ---------------------------------------------------------------- # Series' # ---------------------------------------------------------------- diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 70b646ee95537..216253aa4839c 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -13,16 +13,7 @@ from pandas.core.dtypes.common import is_period_dtype, needs_i8_conversion import pandas as pd -from pandas import ( - CategoricalIndex, - DatetimeIndex, - Int64Index, - MultiIndex, - PeriodIndex, - RangeIndex, - TimedeltaIndex, - UInt64Index, -) +from pandas import MultiIndex, RangeIndex import pandas._testing as tm @@ -402,37 +393,6 @@ def test_astype_preserves_name(self, index, dtype): assert result.name == index.name -@pytest.fixture( - params=[ - tm.makeBoolIndex(10), - tm.makeFloatIndex(10), - tm.makeIntervalIndex(10), - tm.makeStringIndex(10), - tm.makeIntIndex(10), - tm.makeUIntIndex(10), - tm.makeRangeIndex(10), - tm.makeCategoricalIndex(10), - tm.makeMultiIndex(10), - tm.makeDateIndex(10), - tm.makePeriodIndex(10), - tm.makeTimedeltaIndex(10), - ] -) -def index_with_missing(request): - ind = request.param - if type(ind) in [DatetimeIndex, PeriodIndex, TimedeltaIndex]: - pytest.xfail("stable descending order sort not implemented") - if type(ind) in [Int64Index, UInt64Index, RangeIndex]: - pytest.xfail("missing values not supported") - if type(ind) in [CategoricalIndex, MultiIndex]: - pytest.xfail("missing value sorting order not defined for index type") - vals = request.param.values - vals[2] = None - vals[5] = None - ind = type(ind)(vals) - return ind - - @pytest.mark.parametrize("na_position", ["middle"]) def test_sort_values_invalid_na_position(index_with_missing, na_position): if na_position not in ["first", "last"]: From 65c001ab1d0af2584346340109a0a721fc397800 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 27 Aug 2020 15:42:53 +0300 Subject: [PATCH 37/60] TST: add na_position=None test case --- pandas/tests/indexes/test_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 216253aa4839c..b9eba523b253f 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -393,7 +393,7 @@ def test_astype_preserves_name(self, index, dtype): assert result.name == index.name -@pytest.mark.parametrize("na_position", ["middle"]) +@pytest.mark.parametrize("na_position", [None, "middle"]) def test_sort_values_invalid_na_position(index_with_missing, na_position): if na_position not in ["first", "last"]: with pytest.raises( @@ -405,7 +405,7 @@ def test_sort_values_invalid_na_position(index_with_missing, na_position): @pytest.mark.parametrize("na_position", ["first", "last"]) def test_sort_values_with_missing(index_with_missing, na_position): # GH 35584. Test that sort_values works with missing values, - # sort non-missing and places missing according to na_position + # sort non-missing and place missing according to na_position missing_count = np.sum(index_with_missing.isna()) not_na_vals = index_with_missing[index_with_missing.notna()].values From dd82cc1f81eb76935b625df87c3770b319847af7 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 27 Aug 2020 16:24:24 +0300 Subject: [PATCH 38/60] TST: move index type xfails to test_common.py --- pandas/conftest.py | 18 ++++++++++-------- pandas/tests/indexes/test_common.py | 19 ++++++++++++++++++- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index bc4660bcf90d2..cdcc558e5cf85 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -442,18 +442,20 @@ def index_with_missing(request): """ Fixture for indices with missing values """ - if request.param in ["datetime", "datetime-tz", "period", "timedelta"]: - pytest.xfail("stable descending order sort not implemented") if request.param in ["int", "uint", "range", "empty", "repeats"]: pytest.xfail("missing values not supported") - if request.param in ["categorical", "tuples", "mi-with-dt64tz-level", "multi"]: - pytest.xfail("missing value sorting order not defined for index type") ind = indices_dict[request.param].copy() vals = ind.values - print(request.param) - vals[0] = None - vals[-1] = None - ind = type(ind)(vals) + if type(vals[0]) == tuple: + # For setting missing values in the top level of MultiIndex + vals = ind.tolist() + vals[0] = tuple([None]) + vals[0][1:] + vals[-1] = tuple([None]) + vals[-1][1:] + ind = MultiIndex.from_tuples(vals) + else: + vals[0] = None + vals[-1] = None + ind = type(ind)(vals) return ind diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index b9eba523b253f..8aaaf70889656 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -13,7 +13,14 @@ from pandas.core.dtypes.common import is_period_dtype, needs_i8_conversion import pandas as pd -from pandas import MultiIndex, RangeIndex +from pandas import ( + CategoricalIndex, + DatetimeIndex, + MultiIndex, + PeriodIndex, + RangeIndex, + TimedeltaIndex, +) import pandas._testing as tm @@ -395,6 +402,12 @@ def test_astype_preserves_name(self, index, dtype): @pytest.mark.parametrize("na_position", [None, "middle"]) def test_sort_values_invalid_na_position(index_with_missing, na_position): + + if type(index_with_missing) in [DatetimeIndex, PeriodIndex, TimedeltaIndex]: + pytest.xfail("stable descending order sort not implemented") + elif type(index_with_missing) in [CategoricalIndex, MultiIndex]: + pytest.xfail("missing value sorting order not defined for index type") + if na_position not in ["first", "last"]: with pytest.raises( ValueError, match=f"invalid na_position: {na_position}", @@ -407,6 +420,10 @@ def test_sort_values_with_missing(index_with_missing, na_position): # GH 35584. Test that sort_values works with missing values, # sort non-missing and place missing according to na_position + if type(index_with_missing) in [DatetimeIndex, PeriodIndex, TimedeltaIndex]: + pytest.xfail("stable descending order sort not implemented") + elif type(index_with_missing) in [CategoricalIndex, MultiIndex]: + pytest.xfail("missing value sorting order not defined for index type") missing_count = np.sum(index_with_missing.isna()) not_na_vals = index_with_missing[index_with_missing.notna()].values sorted_values = np.sort(not_na_vals) From a3855adc21a7b33da333d3f9f6ac95abd8fe77b5 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 27 Aug 2020 16:33:52 +0300 Subject: [PATCH 39/60] DOC: remove unnecessary comment in test_ops --- pandas/tests/indexes/period/test_ops.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 3c7b478383ee4..de0c6a63fca88 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -176,10 +176,6 @@ def _check_freq(index, expected_index): tm.assert_index_equal(ordered, expected[::-1]) _check_freq(ordered, idx) - # GH 35584. Index sort is now stable when sorting in descending order - # so we don't expect PeriodIndex and Index to return same indexers - # for now. TODO: test indexers after making PeriodIndex stable - pidx = PeriodIndex(["2011", "2013", "NaT", "2011"], name="pidx", freq="D") result = pidx.sort_values() From 481de599008e38d0e637a49df85973397f601b97 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 27 Aug 2020 17:00:43 +0300 Subject: [PATCH 40/60] DOC: add xref to unstable sort issue --- pandas/tests/indexes/period/test_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index de0c6a63fca88..2d7fcfe7f484f 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -337,7 +337,7 @@ def test_order_incompat(): # GH 35584. The new implementation of sort_values for Index.sort_values # is stable when sorting in descending order. PeriodIndex.sort_values # currently isn't stable. xfail should be removed after - # the implementations' behavior is synchronized. + # the implementations' behavior is synchronized (xref GH 35922) pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A") iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx") ordered1, indexer1 = pidx.sort_values(return_indexer=True, ascending=False) From c7e824049d82df9370126f3c6bf5737f3c8c194a Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Mon, 31 Aug 2020 13:21:28 +0300 Subject: [PATCH 41/60] TST: add prints to debug numpy dev test fails --- pandas/tests/indexes/test_setops.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 1a40fe550be61..df49dc7361589 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -48,8 +48,12 @@ def test_union_different_types(index, index_fixture2): # Union with a non-unique, non-monotonic index raises error # This applies to the boolean index + print(idx1) + print(idx2) idx1 = idx1.sort_values() idx2 = idx2.sort_values() + print(idx1) + print(idx2) assert idx1.union(idx2).dtype == np.dtype("O") assert idx2.union(idx1).dtype == np.dtype("O") From ad29fff2997969370962f12b90a3ba5f80e2d3d3 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Mon, 31 Aug 2020 13:57:26 +0300 Subject: [PATCH 42/60] Revert "TST: add prints to debug numpy dev test fails" This reverts commit c7e824049d82df9370126f3c6bf5737f3c8c194a. --- pandas/tests/indexes/test_setops.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index df49dc7361589..1a40fe550be61 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -48,12 +48,8 @@ def test_union_different_types(index, index_fixture2): # Union with a non-unique, non-monotonic index raises error # This applies to the boolean index - print(idx1) - print(idx2) idx1 = idx1.sort_values() idx2 = idx2.sort_values() - print(idx1) - print(idx2) assert idx1.union(idx2).dtype == np.dtype("O") assert idx2.union(idx1).dtype == np.dtype("O") From 6ae6e8a0b96a6bfb7e90f35102f8eec32f584986 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Wed, 2 Sep 2020 13:37:12 +0300 Subject: [PATCH 43/60] restart tests From 760bda1fb4e52e3f7c0f0f83b6ed3148607e085f Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Wed, 2 Sep 2020 16:57:41 +0300 Subject: [PATCH 44/60] DOC: remove junk from whatsnew --- doc/source/whatsnew/v1.2.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6aefb64113c26..a310ce108fd36 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -217,7 +217,6 @@ Interval Indexing ^^^^^^^^ -<<<<<<< HEAD - Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__geitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`) - Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) From d8f9061d480d33b485882d5c3b6e35d032411301 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Wed, 2 Sep 2020 18:46:50 +0300 Subject: [PATCH 45/60] TST: warp test for debug --- pandas/tests/indexes/test_setops.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 1a40fe550be61..a713ab833c8c0 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -48,8 +48,10 @@ def test_union_different_types(index, index_fixture2): # Union with a non-unique, non-monotonic index raises error # This applies to the boolean index + print(f"KAV lengths before are {len(idx1)} and {len(idx2)}") idx1 = idx1.sort_values() idx2 = idx2.sort_values() + print(f"KAV lengths after are {len(idx1)} and {len(idx2)}") assert idx1.union(idx2).dtype == np.dtype("O") assert idx2.union(idx1).dtype == np.dtype("O") From 2f8bec684a368faa092c5c0d32407498f75dd422 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Wed, 2 Sep 2020 19:25:40 +0300 Subject: [PATCH 46/60] Revert "TST: warp test for debug" This reverts commit d8f9061d480d33b485882d5c3b6e35d032411301. --- pandas/tests/indexes/test_setops.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index a713ab833c8c0..1a40fe550be61 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -48,10 +48,8 @@ def test_union_different_types(index, index_fixture2): # Union with a non-unique, non-monotonic index raises error # This applies to the boolean index - print(f"KAV lengths before are {len(idx1)} and {len(idx2)}") idx1 = idx1.sort_values() idx2 = idx2.sort_values() - print(f"KAV lengths after are {len(idx1)} and {len(idx2)}") assert idx1.union(idx2).dtype == np.dtype("O") assert idx2.union(idx1).dtype == np.dtype("O") From 29d684907ecc654628f3ae2447db825a7aaaa5fa Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Wed, 2 Sep 2020 19:28:31 +0300 Subject: [PATCH 47/60] remove new tests for debug --- pandas/conftest.py | 22 ----------------- pandas/tests/indexes/test_common.py | 37 ----------------------------- 2 files changed, 59 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index cdcc558e5cf85..0878380d00837 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -437,28 +437,6 @@ def index(request): index_fixture2 = index -@pytest.fixture(params=indices_dict.keys()) -def index_with_missing(request): - """ - Fixture for indices with missing values - """ - if request.param in ["int", "uint", "range", "empty", "repeats"]: - pytest.xfail("missing values not supported") - ind = indices_dict[request.param].copy() - vals = ind.values - if type(vals[0]) == tuple: - # For setting missing values in the top level of MultiIndex - vals = ind.tolist() - vals[0] = tuple([None]) + vals[0][1:] - vals[-1] = tuple([None]) + vals[-1][1:] - ind = MultiIndex.from_tuples(vals) - else: - vals[0] = None - vals[-1] = None - ind = type(ind)(vals) - return ind - - # ---------------------------------------------------------------- # Series' # ---------------------------------------------------------------- diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 8aaaf70889656..382f544ea04fa 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -398,40 +398,3 @@ def test_astype_preserves_name(self, index, dtype): assert result.names == index.names else: assert result.name == index.name - - -@pytest.mark.parametrize("na_position", [None, "middle"]) -def test_sort_values_invalid_na_position(index_with_missing, na_position): - - if type(index_with_missing) in [DatetimeIndex, PeriodIndex, TimedeltaIndex]: - pytest.xfail("stable descending order sort not implemented") - elif type(index_with_missing) in [CategoricalIndex, MultiIndex]: - pytest.xfail("missing value sorting order not defined for index type") - - if na_position not in ["first", "last"]: - with pytest.raises( - ValueError, match=f"invalid na_position: {na_position}", - ): - index_with_missing.sort_values(na_position=na_position) - - -@pytest.mark.parametrize("na_position", ["first", "last"]) -def test_sort_values_with_missing(index_with_missing, na_position): - # GH 35584. Test that sort_values works with missing values, - # sort non-missing and place missing according to na_position - - if type(index_with_missing) in [DatetimeIndex, PeriodIndex, TimedeltaIndex]: - pytest.xfail("stable descending order sort not implemented") - elif type(index_with_missing) in [CategoricalIndex, MultiIndex]: - pytest.xfail("missing value sorting order not defined for index type") - missing_count = np.sum(index_with_missing.isna()) - not_na_vals = index_with_missing[index_with_missing.notna()].values - sorted_values = np.sort(not_na_vals) - if na_position == "first": - sorted_values = np.concatenate([[None] * missing_count, sorted_values]) - else: - sorted_values = np.concatenate([sorted_values, [None] * missing_count]) - expected = type(index_with_missing)(sorted_values) - - result = index_with_missing.sort_values(na_position=na_position) - tm.assert_index_equal(result, expected) From 8b81e55f60ea0d0c1fdb51a4ce625e08844c81ac Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 3 Sep 2020 09:51:54 +0300 Subject: [PATCH 48/60] Revert "remove new tests for debug" This reverts commit 29d684907ecc654628f3ae2447db825a7aaaa5fa. --- pandas/conftest.py | 22 +++++++++++++++++ pandas/tests/indexes/test_common.py | 37 +++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/pandas/conftest.py b/pandas/conftest.py index 0878380d00837..cdcc558e5cf85 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -437,6 +437,28 @@ def index(request): index_fixture2 = index +@pytest.fixture(params=indices_dict.keys()) +def index_with_missing(request): + """ + Fixture for indices with missing values + """ + if request.param in ["int", "uint", "range", "empty", "repeats"]: + pytest.xfail("missing values not supported") + ind = indices_dict[request.param].copy() + vals = ind.values + if type(vals[0]) == tuple: + # For setting missing values in the top level of MultiIndex + vals = ind.tolist() + vals[0] = tuple([None]) + vals[0][1:] + vals[-1] = tuple([None]) + vals[-1][1:] + ind = MultiIndex.from_tuples(vals) + else: + vals[0] = None + vals[-1] = None + ind = type(ind)(vals) + return ind + + # ---------------------------------------------------------------- # Series' # ---------------------------------------------------------------- diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 382f544ea04fa..8aaaf70889656 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -398,3 +398,40 @@ def test_astype_preserves_name(self, index, dtype): assert result.names == index.names else: assert result.name == index.name + + +@pytest.mark.parametrize("na_position", [None, "middle"]) +def test_sort_values_invalid_na_position(index_with_missing, na_position): + + if type(index_with_missing) in [DatetimeIndex, PeriodIndex, TimedeltaIndex]: + pytest.xfail("stable descending order sort not implemented") + elif type(index_with_missing) in [CategoricalIndex, MultiIndex]: + pytest.xfail("missing value sorting order not defined for index type") + + if na_position not in ["first", "last"]: + with pytest.raises( + ValueError, match=f"invalid na_position: {na_position}", + ): + index_with_missing.sort_values(na_position=na_position) + + +@pytest.mark.parametrize("na_position", ["first", "last"]) +def test_sort_values_with_missing(index_with_missing, na_position): + # GH 35584. Test that sort_values works with missing values, + # sort non-missing and place missing according to na_position + + if type(index_with_missing) in [DatetimeIndex, PeriodIndex, TimedeltaIndex]: + pytest.xfail("stable descending order sort not implemented") + elif type(index_with_missing) in [CategoricalIndex, MultiIndex]: + pytest.xfail("missing value sorting order not defined for index type") + missing_count = np.sum(index_with_missing.isna()) + not_na_vals = index_with_missing[index_with_missing.notna()].values + sorted_values = np.sort(not_na_vals) + if na_position == "first": + sorted_values = np.concatenate([[None] * missing_count, sorted_values]) + else: + sorted_values = np.concatenate([sorted_values, [None] * missing_count]) + expected = type(index_with_missing)(sorted_values) + + result = index_with_missing.sort_values(na_position=na_position) + tm.assert_index_equal(result, expected) From 5b2a0d387b919c5bef82d1bea14b0bd87c3c3aaf Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 3 Sep 2020 09:53:07 +0300 Subject: [PATCH 49/60] TST: try deep copy in fixture --- pandas/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index cdcc558e5cf85..761abea1d60fe 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -444,7 +444,7 @@ def index_with_missing(request): """ if request.param in ["int", "uint", "range", "empty", "repeats"]: pytest.xfail("missing values not supported") - ind = indices_dict[request.param].copy() + ind = indices_dict[request.param].copy(deep=True) vals = ind.values if type(vals[0]) == tuple: # For setting missing values in the top level of MultiIndex From a3310ba9172db9511e7d5954c4bf498136dd8329 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 3 Sep 2020 10:25:44 +0300 Subject: [PATCH 50/60] TST: try immediate return instead of deep copy --- pandas/conftest.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 761abea1d60fe..8dcfd867e1a8f 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -444,19 +444,18 @@ def index_with_missing(request): """ if request.param in ["int", "uint", "range", "empty", "repeats"]: pytest.xfail("missing values not supported") - ind = indices_dict[request.param].copy(deep=True) + ind = indices_dict[request.param].copy() vals = ind.values if type(vals[0]) == tuple: # For setting missing values in the top level of MultiIndex vals = ind.tolist() vals[0] = tuple([None]) + vals[0][1:] vals[-1] = tuple([None]) + vals[-1][1:] - ind = MultiIndex.from_tuples(vals) + return MultiIndex.from_tuples(vals) else: vals[0] = None vals[-1] = None - ind = type(ind)(vals) - return ind + return type(ind)(vals) # ---------------------------------------------------------------- From e80bc9df8311784495c8645d42d463aa754253f0 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 3 Sep 2020 10:42:24 +0300 Subject: [PATCH 51/60] Revert "TST: try immediate return instead of deep copy" This reverts commit a3310ba9172db9511e7d5954c4bf498136dd8329. --- pandas/conftest.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 8dcfd867e1a8f..761abea1d60fe 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -444,18 +444,19 @@ def index_with_missing(request): """ if request.param in ["int", "uint", "range", "empty", "repeats"]: pytest.xfail("missing values not supported") - ind = indices_dict[request.param].copy() + ind = indices_dict[request.param].copy(deep=True) vals = ind.values if type(vals[0]) == tuple: # For setting missing values in the top level of MultiIndex vals = ind.tolist() vals[0] = tuple([None]) + vals[0][1:] vals[-1] = tuple([None]) + vals[-1][1:] - return MultiIndex.from_tuples(vals) + ind = MultiIndex.from_tuples(vals) else: vals[0] = None vals[-1] = None - return type(ind)(vals) + ind = type(ind)(vals) + return ind # ---------------------------------------------------------------- From 2007699721287a95896c14fa5fb5efecb3f7a761 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 3 Sep 2020 10:43:02 +0300 Subject: [PATCH 52/60] REFACT: add immediate returns --- pandas/conftest.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 761abea1d60fe..53cf930e643c3 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -451,12 +451,11 @@ def index_with_missing(request): vals = ind.tolist() vals[0] = tuple([None]) + vals[0][1:] vals[-1] = tuple([None]) + vals[-1][1:] - ind = MultiIndex.from_tuples(vals) + return MultiIndex.from_tuples(vals) else: vals[0] = None vals[-1] = None - ind = type(ind)(vals) - return ind + return type(ind)(vals) # ---------------------------------------------------------------- From 2e39294b04368945eede4b2527b6cd1483aa7782 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 3 Sep 2020 10:45:55 +0300 Subject: [PATCH 53/60] DOC: add comment to conftest clarifying deep copy --- pandas/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/conftest.py b/pandas/conftest.py index 53cf930e643c3..dc5346356c078 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -444,6 +444,8 @@ def index_with_missing(request): """ if request.param in ["int", "uint", "range", "empty", "repeats"]: pytest.xfail("missing values not supported") + # GH 35538. Use deep copy to avoid illusive bug on np-dev + # Azure pipeline that writes into indices_dict despite copy ind = indices_dict[request.param].copy(deep=True) vals = ind.values if type(vals[0]) == tuple: From d78a9a236cc4917a5a7ebf666b6472a51ebc1ef7 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 3 Sep 2020 12:01:14 +0300 Subject: [PATCH 54/60] restart tests From bfd2e9ccbb4edc40941b7ad0377f1a895f78c03a Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 3 Sep 2020 17:44:49 +0300 Subject: [PATCH 55/60] CLN: remove test-output.xml --- test-output.xml | 1 - 1 file changed, 1 deletion(-) delete mode 100644 test-output.xml diff --git a/test-output.xml b/test-output.xml deleted file mode 100644 index d1cd70e860a69..0000000000000 --- a/test-output.xml +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file From 54a6e821c281e9428bf4ff58e7cae4437f6f7d97 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Sat, 5 Sep 2020 07:24:09 +0300 Subject: [PATCH 56/60] TST: check for MultiIndex through request.param --- pandas/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index dc5346356c078..5474005a63b8e 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -448,7 +448,7 @@ def index_with_missing(request): # Azure pipeline that writes into indices_dict despite copy ind = indices_dict[request.param].copy(deep=True) vals = ind.values - if type(vals[0]) == tuple: + if request.param in ["tuples", "mi-with-dt64tz-level", "multi"]: # For setting missing values in the top level of MultiIndex vals = ind.tolist() vals[0] = tuple([None]) + vals[0][1:] From f60d2a8ff7230875e1a047da08b6ac23090fbb3c Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Sat, 5 Sep 2020 07:26:29 +0300 Subject: [PATCH 57/60] CLN: revert changes to nargsort --- pandas/core/sorting.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index d5e42b6ad6683..8bdd466ae6f33 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -306,17 +306,10 @@ def nargsort( non_nan_idx = idx[~mask] nan_idx = np.nonzero(mask)[0] - if not ascending: non_nans = non_nans[::-1] non_nan_idx = non_nan_idx[::-1] - try: - indexer = non_nan_idx[non_nans.argsort(kind=kind)] - except TypeError: - # For compatibility with Series: fall back to quicksort - # when mergesort is unavailable for object element type - indexer = non_nan_idx[non_nans.argsort(kind="quicksort")] - + indexer = non_nan_idx[non_nans.argsort(kind=kind)] if not ascending: indexer = indexer[::-1] # Finally, place the NaNs at the end or the beginning according to From af92fe8bee9f7dff44d6f1afe28b046d225f855b Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Sat, 5 Sep 2020 07:37:53 +0300 Subject: [PATCH 58/60] DOC: add issue xref to xfail reason in period/test_ops.py --- pandas/tests/indexes/period/test_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 2d7fcfe7f484f..a4171621959d2 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -332,7 +332,7 @@ def test_freq_setter_deprecated(self): idx.freq = pd.offsets.Day() -@pytest.mark.xfail(reason="PeriodIndex.sort_values currently unstable") +@pytest.mark.xfail(reason="PeriodIndex.sort_values currently unstable (GH 35922)") def test_order_incompat(): # GH 35584. The new implementation of sort_values for Index.sort_values # is stable when sorting in descending order. PeriodIndex.sort_values From 6d33657156dd94a7af8d36f50e520d11d393e746 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Sat, 5 Sep 2020 08:20:31 +0300 Subject: [PATCH 59/60] DOC: clarify xfail reasons and add comments --- pandas/tests/indexes/period/test_ops.py | 8 ++++---- pandas/tests/indexes/test_common.py | 11 ++++++++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index a4171621959d2..d1b34c315b682 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -332,11 +332,11 @@ def test_freq_setter_deprecated(self): idx.freq = pd.offsets.Day() -@pytest.mark.xfail(reason="PeriodIndex.sort_values currently unstable (GH 35922)") -def test_order_incompat(): +@pytest.mark.xfail(reason="Datetime-like sort_values currently unstable (GH 35922)") +def test_order_stability_compat(): # GH 35584. The new implementation of sort_values for Index.sort_values - # is stable when sorting in descending order. PeriodIndex.sort_values - # currently isn't stable. xfail should be removed after + # is stable when sorting in descending order. Datetime-like sort_values + # currently aren't stable. xfail should be removed after # the implementations' behavior is synchronized (xref GH 35922) pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A") iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx") diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 8aaaf70889656..243631e8ebdef 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -402,9 +402,11 @@ def test_astype_preserves_name(self, index, dtype): @pytest.mark.parametrize("na_position", [None, "middle"]) def test_sort_values_invalid_na_position(index_with_missing, na_position): - if type(index_with_missing) in [DatetimeIndex, PeriodIndex, TimedeltaIndex]: - pytest.xfail("stable descending order sort not implemented") + # datetime-like indices will get na_position kwarg as part of + # synchronizing duplicate-sorting behavior, because we currently expect + # them, other indices, and Series to sort differently (xref 35922) + pytest.xfail("sort_values does not support na_position kwarg") elif type(index_with_missing) in [CategoricalIndex, MultiIndex]: pytest.xfail("missing value sorting order not defined for index type") @@ -421,7 +423,10 @@ def test_sort_values_with_missing(index_with_missing, na_position): # sort non-missing and place missing according to na_position if type(index_with_missing) in [DatetimeIndex, PeriodIndex, TimedeltaIndex]: - pytest.xfail("stable descending order sort not implemented") + # datetime-like indices will get na_position kwarg as part of + # synchronizing duplicate-sorting behavior, because we currently expect + # them, other indices, and Series to sort differently (xref 35922) + pytest.xfail("sort_values does not support na_position kwarg") elif type(index_with_missing) in [CategoricalIndex, MultiIndex]: pytest.xfail("missing value sorting order not defined for index type") missing_count = np.sum(index_with_missing.isna()) From 49353091097562cd924c1567962833dd3fc2216c Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Sun, 6 Sep 2020 07:24:21 +0300 Subject: [PATCH 60/60] REFACT: switch to isinstance, add blank line --- pandas/tests/indexes/test_common.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 243631e8ebdef..aa6b395176b06 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -402,12 +402,12 @@ def test_astype_preserves_name(self, index, dtype): @pytest.mark.parametrize("na_position", [None, "middle"]) def test_sort_values_invalid_na_position(index_with_missing, na_position): - if type(index_with_missing) in [DatetimeIndex, PeriodIndex, TimedeltaIndex]: + if isinstance(index_with_missing, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): # datetime-like indices will get na_position kwarg as part of # synchronizing duplicate-sorting behavior, because we currently expect # them, other indices, and Series to sort differently (xref 35922) pytest.xfail("sort_values does not support na_position kwarg") - elif type(index_with_missing) in [CategoricalIndex, MultiIndex]: + elif isinstance(index_with_missing, (CategoricalIndex, MultiIndex)): pytest.xfail("missing value sorting order not defined for index type") if na_position not in ["first", "last"]: @@ -422,13 +422,14 @@ def test_sort_values_with_missing(index_with_missing, na_position): # GH 35584. Test that sort_values works with missing values, # sort non-missing and place missing according to na_position - if type(index_with_missing) in [DatetimeIndex, PeriodIndex, TimedeltaIndex]: + if isinstance(index_with_missing, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): # datetime-like indices will get na_position kwarg as part of # synchronizing duplicate-sorting behavior, because we currently expect # them, other indices, and Series to sort differently (xref 35922) pytest.xfail("sort_values does not support na_position kwarg") - elif type(index_with_missing) in [CategoricalIndex, MultiIndex]: + elif isinstance(index_with_missing, (CategoricalIndex, MultiIndex)): pytest.xfail("missing value sorting order not defined for index type") + missing_count = np.sum(index_with_missing.isna()) not_na_vals = index_with_missing[index_with_missing.notna()].values sorted_values = np.sort(not_na_vals)