diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b07351d05defb..d09d3c6832526 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -267,8 +267,9 @@ Interval Indexing ^^^^^^^^ + - Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__geitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`) -- +- Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) - Missing diff --git a/pandas/conftest.py b/pandas/conftest.py index 0878380d00837..5474005a63b8e 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -437,6 +437,29 @@ def index(request): index_fixture2 = index +@pytest.fixture(params=indices_dict.keys()) +def index_with_missing(request): + """ + Fixture for indices with missing values + """ + if request.param in ["int", "uint", "range", "empty", "repeats"]: + pytest.xfail("missing values not supported") + # GH 35538. Use deep copy to avoid illusive bug on np-dev + # Azure pipeline that writes into indices_dict despite copy + ind = indices_dict[request.param].copy(deep=True) + vals = ind.values + if request.param in ["tuples", "mi-with-dt64tz-level", "multi"]: + # For setting missing values in the top level of MultiIndex + vals = ind.tolist() + vals[0] = tuple([None]) + vals[0][1:] + vals[-1] = tuple([None]) + vals[-1][1:] + return MultiIndex.from_tuples(vals) + else: + vals[0] = None + vals[-1] = None + return type(ind)(vals) + + # ---------------------------------------------------------------- # Series' # ---------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 65b5dfb6df911..a1bc8a4659b24 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -88,7 +88,7 @@ import pandas.core.missing as missing from pandas.core.ops import get_op_result_name from pandas.core.ops.invalid import make_invalid_op -from pandas.core.sorting import ensure_key_mapped +from pandas.core.sorting import ensure_key_mapped, nargsort from pandas.core.strings import StringMethods from pandas.io.formats.printing import ( @@ -4443,7 +4443,11 @@ def asof_locs(self, where, mask): return result def sort_values( - self, return_indexer=False, ascending=True, key: Optional[Callable] = None + self, + return_indexer=False, + ascending=True, + na_position: str_t = "last", + key: Optional[Callable] = None, ): """ Return a sorted copy of the index. @@ -4457,6 +4461,12 @@ def sort_values( Should the indices that would sort the index be returned. ascending : bool, default True Should the index values be sorted in an ascending order. + na_position : {'first' or 'last'}, default 'last' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. + + .. versionadded:: 1.2.0 + key : callable, optional If not None, apply the key function to the index values before sorting. This is similar to the `key` argument in the @@ -4497,9 +4507,16 @@ def sort_values( """ idx = ensure_key_mapped(self, key) - _as = idx.argsort() - if not ascending: - _as = _as[::-1] + # GH 35584. Sort missing values according to na_position kwarg + # ignore na_position for MutiIndex + if not isinstance(self, ABCMultiIndex): + _as = nargsort( + items=idx, ascending=ascending, na_position=na_position, key=key + ) + else: + _as = idx.argsort() + if not ascending: + _as = _as[::-1] sorted_index = self.take(_as) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 2755b186f3eae..b5e150f25f543 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -618,7 +618,7 @@ def test_sort_values(self, closed): expected = IntervalIndex([Interval(0, 1), Interval(1, 2), np.nan]) tm.assert_index_equal(result, expected) - result = index.sort_values(ascending=False) + result = index.sort_values(ascending=False, na_position="first") expected = IntervalIndex([np.nan, Interval(1, 2), Interval(0, 1)]) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index e7dd76584d780..d1b34c315b682 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -174,9 +174,6 @@ def _check_freq(index, expected_index): ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) - - exp = np.array([2, 1, 3, 4, 0]) - tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) _check_freq(ordered, idx) pidx = PeriodIndex(["2011", "2013", "NaT", "2011"], name="pidx", freq="D") @@ -333,3 +330,16 @@ def test_freq_setter_deprecated(self): # warning for setter with pytest.raises(AttributeError, match="can't set attribute"): idx.freq = pd.offsets.Day() + + +@pytest.mark.xfail(reason="Datetime-like sort_values currently unstable (GH 35922)") +def test_order_stability_compat(): + # GH 35584. The new implementation of sort_values for Index.sort_values + # is stable when sorting in descending order. Datetime-like sort_values + # currently aren't stable. xfail should be removed after + # the implementations' behavior is synchronized (xref GH 35922) + pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A") + iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx") + ordered1, indexer1 = pidx.sort_values(return_indexer=True, ascending=False) + ordered2, indexer2 = iidx.sort_values(return_indexer=True, ascending=False) + tm.assert_numpy_array_equal(indexer1, indexer2) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index db260b71e7186..aa6b395176b06 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -13,7 +13,14 @@ from pandas.core.dtypes.common import is_period_dtype, needs_i8_conversion import pandas as pd -from pandas import CategoricalIndex, MultiIndex, RangeIndex +from pandas import ( + CategoricalIndex, + DatetimeIndex, + MultiIndex, + PeriodIndex, + RangeIndex, + TimedeltaIndex, +) import pandas._testing as tm @@ -391,3 +398,46 @@ def test_astype_preserves_name(self, index, dtype): assert result.names == index.names else: assert result.name == index.name + + +@pytest.mark.parametrize("na_position", [None, "middle"]) +def test_sort_values_invalid_na_position(index_with_missing, na_position): + if isinstance(index_with_missing, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + # datetime-like indices will get na_position kwarg as part of + # synchronizing duplicate-sorting behavior, because we currently expect + # them, other indices, and Series to sort differently (xref 35922) + pytest.xfail("sort_values does not support na_position kwarg") + elif isinstance(index_with_missing, (CategoricalIndex, MultiIndex)): + pytest.xfail("missing value sorting order not defined for index type") + + if na_position not in ["first", "last"]: + with pytest.raises( + ValueError, match=f"invalid na_position: {na_position}", + ): + index_with_missing.sort_values(na_position=na_position) + + +@pytest.mark.parametrize("na_position", ["first", "last"]) +def test_sort_values_with_missing(index_with_missing, na_position): + # GH 35584. Test that sort_values works with missing values, + # sort non-missing and place missing according to na_position + + if isinstance(index_with_missing, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + # datetime-like indices will get na_position kwarg as part of + # synchronizing duplicate-sorting behavior, because we currently expect + # them, other indices, and Series to sort differently (xref 35922) + pytest.xfail("sort_values does not support na_position kwarg") + elif isinstance(index_with_missing, (CategoricalIndex, MultiIndex)): + pytest.xfail("missing value sorting order not defined for index type") + + missing_count = np.sum(index_with_missing.isna()) + not_na_vals = index_with_missing[index_with_missing.notna()].values + sorted_values = np.sort(not_na_vals) + if na_position == "first": + sorted_values = np.concatenate([[None] * missing_count, sorted_values]) + else: + sorted_values = np.concatenate([sorted_values, [None] * missing_count]) + expected = type(index_with_missing)(sorted_values) + + result = index_with_missing.sort_values(na_position=na_position) + tm.assert_index_equal(result, expected)