Skip to content

Commit ba552ec

Browse files
authored
BUG: allow missing values in Index when calling Index.sort_values (#35604)
1 parent b431e9a commit ba552ec

File tree

6 files changed

+112
-11
lines changed

6 files changed

+112
-11
lines changed

doc/source/whatsnew/v1.2.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -270,8 +270,9 @@ Interval
270270

271271
Indexing
272272
^^^^^^^^
273+
273274
- Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__geitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`)
274-
-
275+
- Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`)
275276
-
276277

277278
Missing

pandas/conftest.py

+23
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,29 @@ def index(request):
437437
index_fixture2 = index
438438

439439

440+
@pytest.fixture(params=indices_dict.keys())
441+
def index_with_missing(request):
442+
"""
443+
Fixture for indices with missing values
444+
"""
445+
if request.param in ["int", "uint", "range", "empty", "repeats"]:
446+
pytest.xfail("missing values not supported")
447+
# GH 35538. Use deep copy to avoid illusive bug on np-dev
448+
# Azure pipeline that writes into indices_dict despite copy
449+
ind = indices_dict[request.param].copy(deep=True)
450+
vals = ind.values
451+
if request.param in ["tuples", "mi-with-dt64tz-level", "multi"]:
452+
# For setting missing values in the top level of MultiIndex
453+
vals = ind.tolist()
454+
vals[0] = tuple([None]) + vals[0][1:]
455+
vals[-1] = tuple([None]) + vals[-1][1:]
456+
return MultiIndex.from_tuples(vals)
457+
else:
458+
vals[0] = None
459+
vals[-1] = None
460+
return type(ind)(vals)
461+
462+
440463
# ----------------------------------------------------------------
441464
# Series'
442465
# ----------------------------------------------------------------

pandas/core/indexes/base.py

+22-5
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@
8888
import pandas.core.missing as missing
8989
from pandas.core.ops import get_op_result_name
9090
from pandas.core.ops.invalid import make_invalid_op
91-
from pandas.core.sorting import ensure_key_mapped
91+
from pandas.core.sorting import ensure_key_mapped, nargsort
9292
from pandas.core.strings import StringMethods
9393

9494
from pandas.io.formats.printing import (
@@ -4443,7 +4443,11 @@ def asof_locs(self, where, mask):
44434443
return result
44444444

44454445
def sort_values(
4446-
self, return_indexer=False, ascending=True, key: Optional[Callable] = None
4446+
self,
4447+
return_indexer=False,
4448+
ascending=True,
4449+
na_position: str_t = "last",
4450+
key: Optional[Callable] = None,
44474451
):
44484452
"""
44494453
Return a sorted copy of the index.
@@ -4457,6 +4461,12 @@ def sort_values(
44574461
Should the indices that would sort the index be returned.
44584462
ascending : bool, default True
44594463
Should the index values be sorted in an ascending order.
4464+
na_position : {'first' or 'last'}, default 'last'
4465+
Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at
4466+
the end.
4467+
4468+
.. versionadded:: 1.2.0
4469+
44604470
key : callable, optional
44614471
If not None, apply the key function to the index values
44624472
before sorting. This is similar to the `key` argument in the
@@ -4497,9 +4507,16 @@ def sort_values(
44974507
"""
44984508
idx = ensure_key_mapped(self, key)
44994509

4500-
_as = idx.argsort()
4501-
if not ascending:
4502-
_as = _as[::-1]
4510+
# GH 35584. Sort missing values according to na_position kwarg
4511+
# ignore na_position for MutiIndex
4512+
if not isinstance(self, ABCMultiIndex):
4513+
_as = nargsort(
4514+
items=idx, ascending=ascending, na_position=na_position, key=key
4515+
)
4516+
else:
4517+
_as = idx.argsort()
4518+
if not ascending:
4519+
_as = _as[::-1]
45034520

45044521
sorted_index = self.take(_as)
45054522

pandas/tests/indexes/interval/test_interval.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -618,7 +618,7 @@ def test_sort_values(self, closed):
618618
expected = IntervalIndex([Interval(0, 1), Interval(1, 2), np.nan])
619619
tm.assert_index_equal(result, expected)
620620

621-
result = index.sort_values(ascending=False)
621+
result = index.sort_values(ascending=False, na_position="first")
622622
expected = IntervalIndex([np.nan, Interval(1, 2), Interval(0, 1)])
623623
tm.assert_index_equal(result, expected)
624624

pandas/tests/indexes/period/test_ops.py

+13-3
Original file line numberDiff line numberDiff line change
@@ -174,9 +174,6 @@ def _check_freq(index, expected_index):
174174

175175
ordered, indexer = idx.sort_values(return_indexer=True, ascending=False)
176176
tm.assert_index_equal(ordered, expected[::-1])
177-
178-
exp = np.array([2, 1, 3, 4, 0])
179-
tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
180177
_check_freq(ordered, idx)
181178

182179
pidx = PeriodIndex(["2011", "2013", "NaT", "2011"], name="pidx", freq="D")
@@ -333,3 +330,16 @@ def test_freq_setter_deprecated(self):
333330
# warning for setter
334331
with pytest.raises(AttributeError, match="can't set attribute"):
335332
idx.freq = pd.offsets.Day()
333+
334+
335+
@pytest.mark.xfail(reason="Datetime-like sort_values currently unstable (GH 35922)")
336+
def test_order_stability_compat():
337+
# GH 35584. The new implementation of sort_values for Index.sort_values
338+
# is stable when sorting in descending order. Datetime-like sort_values
339+
# currently aren't stable. xfail should be removed after
340+
# the implementations' behavior is synchronized (xref GH 35922)
341+
pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A")
342+
iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx")
343+
ordered1, indexer1 = pidx.sort_values(return_indexer=True, ascending=False)
344+
ordered2, indexer2 = iidx.sort_values(return_indexer=True, ascending=False)
345+
tm.assert_numpy_array_equal(indexer1, indexer2)

pandas/tests/indexes/test_common.py

+51-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,14 @@
1313
from pandas.core.dtypes.common import is_period_dtype, needs_i8_conversion
1414

1515
import pandas as pd
16-
from pandas import CategoricalIndex, MultiIndex, RangeIndex
16+
from pandas import (
17+
CategoricalIndex,
18+
DatetimeIndex,
19+
MultiIndex,
20+
PeriodIndex,
21+
RangeIndex,
22+
TimedeltaIndex,
23+
)
1724
import pandas._testing as tm
1825

1926

@@ -391,3 +398,46 @@ def test_astype_preserves_name(self, index, dtype):
391398
assert result.names == index.names
392399
else:
393400
assert result.name == index.name
401+
402+
403+
@pytest.mark.parametrize("na_position", [None, "middle"])
404+
def test_sort_values_invalid_na_position(index_with_missing, na_position):
405+
if isinstance(index_with_missing, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
406+
# datetime-like indices will get na_position kwarg as part of
407+
# synchronizing duplicate-sorting behavior, because we currently expect
408+
# them, other indices, and Series to sort differently (xref 35922)
409+
pytest.xfail("sort_values does not support na_position kwarg")
410+
elif isinstance(index_with_missing, (CategoricalIndex, MultiIndex)):
411+
pytest.xfail("missing value sorting order not defined for index type")
412+
413+
if na_position not in ["first", "last"]:
414+
with pytest.raises(
415+
ValueError, match=f"invalid na_position: {na_position}",
416+
):
417+
index_with_missing.sort_values(na_position=na_position)
418+
419+
420+
@pytest.mark.parametrize("na_position", ["first", "last"])
421+
def test_sort_values_with_missing(index_with_missing, na_position):
422+
# GH 35584. Test that sort_values works with missing values,
423+
# sort non-missing and place missing according to na_position
424+
425+
if isinstance(index_with_missing, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
426+
# datetime-like indices will get na_position kwarg as part of
427+
# synchronizing duplicate-sorting behavior, because we currently expect
428+
# them, other indices, and Series to sort differently (xref 35922)
429+
pytest.xfail("sort_values does not support na_position kwarg")
430+
elif isinstance(index_with_missing, (CategoricalIndex, MultiIndex)):
431+
pytest.xfail("missing value sorting order not defined for index type")
432+
433+
missing_count = np.sum(index_with_missing.isna())
434+
not_na_vals = index_with_missing[index_with_missing.notna()].values
435+
sorted_values = np.sort(not_na_vals)
436+
if na_position == "first":
437+
sorted_values = np.concatenate([[None] * missing_count, sorted_values])
438+
else:
439+
sorted_values = np.concatenate([sorted_values, [None] * missing_count])
440+
expected = type(index_with_missing)(sorted_values)
441+
442+
result = index_with_missing.sort_values(na_position=na_position)
443+
tm.assert_index_equal(result, expected)

0 commit comments

Comments
 (0)