Skip to content

Commit b9c9afe

Browse files
AlexKirkoukarroum
authored and committed
BUG: stabilize sort_values algorithms for Series and time-like Indices (pandas-dev#37310)
1 parent f7fb67d commit b9c9afe

File tree

19 files changed

+69
-107
lines changed

19 files changed

+69
-107
lines changed

doc/source/whatsnew/v1.2.0.rst

+7
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,13 @@ Optional libraries below the lowest tested version may still work, but are not c
309309

310310
See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.
311311

312+
.. _whatsnew_120.api.other:
313+
314+
Other API changes
315+
^^^^^^^^^^^^^^^^^
316+
317+
- Sorting in descending order is now stable for :meth:`Series.sort_values` and for :meth:`Index.sort_values` on DateTime-like :class:`Index` subclasses. This will affect the sort order when sorting a :class:`DataFrame` on multiple columns, sorting with a key function that produces duplicates, or requesting the sorting index when using :meth:`Index.sort_values`. When using :meth:`Series.value_counts`, the count of missing values is no longer always the last in the list of duplicate counts; instead, its position corresponds to its position in the original :class:`Series`. Previously, when using :meth:`Index.sort_values` on DateTime-like :class:`Index` subclasses, NaTs ignored the ``na_position`` argument and were sorted to the beginning. Now they respect ``na_position``, the default being ``last``, same as other :class:`Index` subclasses. (:issue:`35922`)
318+
312319
.. ---------------------------------------------------------------------------
313320
314321
.. _whatsnew_120.deprecations:

pandas/core/algorithms.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -1181,10 +1181,8 @@ def compute(self, method: str) -> Series:
11811181

11821182
# slow method
11831183
if n >= len(self.obj):
1184-
reverse_it = self.keep == "last" or method == "nlargest"
11851184
ascending = method == "nsmallest"
1186-
slc = np.s_[::-1] if reverse_it else np.s_[:]
1187-
return dropped[slc].sort_values(ascending=ascending).head(n)
1185+
return dropped.sort_values(ascending=ascending).head(n)
11881186

11891187
# fast method
11901188
arr, pandas_dtype = _ensure_data(dropped.values)

pandas/core/base.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -933,9 +933,9 @@ def value_counts(
933933
>>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
934934
>>> index.value_counts()
935935
3.0 2
936-
4.0 1
937-
2.0 1
938936
1.0 1
937+
2.0 1
938+
4.0 1
939939
dtype: int64
940940
941941
With `normalize` set to `True`, returns the relative frequency by
@@ -944,9 +944,9 @@ def value_counts(
944944
>>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
945945
>>> s.value_counts(normalize=True)
946946
3.0 0.4
947-
4.0 0.2
948-
2.0 0.2
949947
1.0 0.2
948+
2.0 0.2
949+
4.0 0.2
950950
dtype: float64
951951
952952
**bins**
@@ -957,8 +957,8 @@ def value_counts(
957957
number of half-open bins.
958958
959959
>>> s.value_counts(bins=3)
960-
(2.0, 3.0] 2
961960
(0.996, 2.0] 2
961+
(2.0, 3.0] 2
962962
(3.0, 4.0] 1
963963
dtype: int64
964964
@@ -968,10 +968,10 @@ def value_counts(
968968
969969
>>> s.value_counts(dropna=False)
970970
3.0 2
971-
NaN 1
972-
4.0 1
973-
2.0 1
974971
1.0 1
972+
2.0 1
973+
4.0 1
974+
NaN 1
975975
dtype: int64
976976
"""
977977
result = value_counts(

pandas/core/frame.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -5563,8 +5563,8 @@ def value_counts(
55635563
>>> df.value_counts()
55645564
num_legs num_wings
55655565
4 0 2
5566-
6 0 1
55675566
2 2 1
5567+
6 0 1
55685568
dtype: int64
55695569
55705570
>>> df.value_counts(sort=False)
@@ -5584,8 +5584,8 @@ def value_counts(
55845584
>>> df.value_counts(normalize=True)
55855585
num_legs num_wings
55865586
4 0 0.50
5587-
6 0 0.25
55885587
2 2 0.25
5588+
6 0 0.25
55895589
dtype: float64
55905590
"""
55915591
if subset is None:

pandas/core/generic.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -10106,7 +10106,7 @@ def describe(
1010610106
categorical
1010710107
count 3
1010810108
unique 3
10109-
top f
10109+
top d
1011010110
freq 1
1011110111
1011210112
Excluding numeric columns from a ``DataFrame`` description.

pandas/core/indexes/base.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -4542,9 +4542,7 @@ def sort_values(
45424542

45434543
# GH 35584. Sort missing values according to na_position kwarg
45444544
# ignore na_position for MultiIndex
4545-
if not isinstance(
4546-
self, (ABCMultiIndex, ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex)
4547-
):
4545+
if not isinstance(self, ABCMultiIndex):
45484546
_as = nargsort(
45494547
items=idx, ascending=ascending, na_position=na_position, key=key
45504548
)

pandas/core/series.py

+9-37
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@
9292
from pandas.core.indexing import check_bool_indexer
9393
from pandas.core.internals import SingleBlockManager
9494
from pandas.core.shared_docs import _shared_docs
95-
from pandas.core.sorting import ensure_key_mapped
95+
from pandas.core.sorting import ensure_key_mapped, nargsort
9696
from pandas.core.strings import StringMethods
9797
from pandas.core.tools.datetimes import to_datetime
9898

@@ -3288,29 +3288,6 @@ def sort_values(
32883288
"sort in-place you must create a copy"
32893289
)
32903290

3291-
def _try_kind_sort(arr):
3292-
arr = ensure_key_mapped(arr, key)
3293-
arr = getattr(arr, "_values", arr)
3294-
3295-
# easier to ask forgiveness than permission
3296-
try:
3297-
# if kind==mergesort, it can fail for object dtype
3298-
return arr.argsort(kind=kind)
3299-
except TypeError:
3300-
# stable sort not available for object dtype
3301-
# uses the argsort default quicksort
3302-
return arr.argsort(kind="quicksort")
3303-
3304-
arr = self._values
3305-
sorted_index = np.empty(len(self), dtype=np.int32)
3306-
3307-
bad = isna(arr)
3308-
3309-
good = ~bad
3310-
idx = ibase.default_index(len(self))
3311-
3312-
argsorted = _try_kind_sort(self[good])
3313-
33143291
if is_list_like(ascending):
33153292
if len(ascending) != 1:
33163293
raise ValueError(
@@ -3321,21 +3298,16 @@ def _try_kind_sort(arr):
33213298
if not is_bool(ascending):
33223299
raise ValueError("ascending must be boolean")
33233300

3324-
if not ascending:
3325-
argsorted = argsorted[::-1]
3326-
3327-
if na_position == "last":
3328-
n = good.sum()
3329-
sorted_index[:n] = idx[good][argsorted]
3330-
sorted_index[n:] = idx[bad]
3331-
elif na_position == "first":
3332-
n = bad.sum()
3333-
sorted_index[n:] = idx[good][argsorted]
3334-
sorted_index[:n] = idx[bad]
3335-
else:
3301+
if na_position not in ["first", "last"]:
33363302
raise ValueError(f"invalid na_position: {na_position}")
33373303

3338-
result = self._constructor(arr[sorted_index], index=self.index[sorted_index])
3304+
# GH 35922. Make sorting stable by leveraging nargsort
3305+
values_to_sort = ensure_key_mapped(self, key)._values if key else self._values
3306+
sorted_index = nargsort(values_to_sort, kind, ascending, na_position)
3307+
3308+
result = self._constructor(
3309+
self._values[sorted_index], index=self.index[sorted_index]
3310+
)
33393311

33403312
if ignore_index:
33413313
result.index = ibase.default_index(len(sorted_index))

pandas/tests/arrays/boolean/test_function.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,11 @@ def test_ufunc_reduce_raises(values):
7777
def test_value_counts_na():
7878
arr = pd.array([True, False, pd.NA], dtype="boolean")
7979
result = arr.value_counts(dropna=False)
80-
expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64")
80+
expected = pd.Series([1, 1, 1], index=[False, True, pd.NA], dtype="Int64")
8181
tm.assert_series_equal(result, expected)
8282

8383
result = arr.value_counts(dropna=True)
84-
expected = pd.Series([1, 1], index=[True, False], dtype="Int64")
84+
expected = pd.Series([1, 1], index=[False, True], dtype="Int64")
8585
tm.assert_series_equal(result, expected)
8686

8787

pandas/tests/arrays/string_/test_string.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,7 @@ def test_arrow_roundtrip():
301301
def test_value_counts_na():
302302
arr = pd.array(["a", "b", "a", pd.NA], dtype="string")
303303
result = arr.value_counts(dropna=False)
304-
expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64")
304+
expected = pd.Series([2, 1, 1], index=["a", pd.NA, "b"], dtype="Int64")
305305
tm.assert_series_equal(result, expected)
306306

307307
result = arr.value_counts(dropna=True)

pandas/tests/base/test_value_counts.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -153,16 +153,16 @@ def test_value_counts_bins(index_or_series):
153153
# these return the same
154154
res4 = s1.value_counts(bins=4, dropna=True)
155155
intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
156-
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
156+
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]))
157157
tm.assert_series_equal(res4, exp4)
158158

159159
res4 = s1.value_counts(bins=4, dropna=False)
160160
intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
161-
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
161+
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]))
162162
tm.assert_series_equal(res4, exp4)
163163

164164
res4n = s1.value_counts(bins=4, normalize=True)
165-
exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2]))
165+
exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 1, 3, 2]))
166166
tm.assert_series_equal(res4n, exp4n)
167167

168168
# handle NA's properly
@@ -239,7 +239,11 @@ def test_value_counts_datetime64(index_or_series):
239239
tm.assert_series_equal(result, expected_s)
240240

241241
result = s.value_counts(dropna=False)
242-
expected_s[pd.NaT] = 1
242+
# GH 35922. NaN-like now sorts to the beginning of duplicate counts
243+
idx = pd.to_datetime(
244+
["2010-01-01 00:00:00", "2008-09-09 00:00:00", pd.NaT, "2009-01-01 00:00:00"]
245+
)
246+
expected_s = Series([3, 2, 1, 1], index=idx)
243247
tm.assert_series_equal(result, expected_s)
244248

245249
unique = s.unique()

pandas/tests/extension/base/methods.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,11 @@ def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
125125
result = ser.sort_values(ascending=ascending, key=sort_by_key)
126126
expected = ser.iloc[[2, 0, 1]]
127127
if not ascending:
128-
expected = expected[::-1]
128+
# GH 35922. Expect stable sort
129+
if ser.nunique() == 2:
130+
expected = ser.iloc[[0, 1, 2]]
131+
else:
132+
expected = ser.iloc[[1, 0, 2]]
129133

130134
self.assert_series_equal(result, expected)
131135

pandas/tests/frame/methods/test_describe.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def test_describe_bool_frame(self):
5656
)
5757
result = df.describe()
5858
expected = DataFrame(
59-
{"bool_data_1": [4, 2, True, 2], "bool_data_2": [4, 2, True, 3]},
59+
{"bool_data_1": [4, 2, False, 2], "bool_data_2": [4, 2, True, 3]},
6060
index=["count", "unique", "top", "freq"],
6161
)
6262
tm.assert_frame_equal(result, expected)
@@ -79,7 +79,7 @@ def test_describe_bool_frame(self):
7979
)
8080
result = df.describe()
8181
expected = DataFrame(
82-
{"bool_data": [4, 2, True, 2], "str_data": [4, 3, "a", 2]},
82+
{"bool_data": [4, 2, False, 2], "str_data": [4, 3, "a", 2]},
8383
index=["count", "unique", "top", "freq"],
8484
)
8585
tm.assert_frame_equal(result, expected)

pandas/tests/frame/methods/test_value_counts.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def test_data_frame_value_counts_default():
4848
expected = pd.Series(
4949
data=[2, 1, 1],
5050
index=pd.MultiIndex.from_arrays(
51-
[(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
51+
[(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"]
5252
),
5353
)
5454

@@ -65,7 +65,7 @@ def test_data_frame_value_counts_normalize():
6565
expected = pd.Series(
6666
data=[0.5, 0.25, 0.25],
6767
index=pd.MultiIndex.from_arrays(
68-
[(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
68+
[(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"]
6969
),
7070
)
7171

@@ -78,7 +78,7 @@ def test_data_frame_value_counts_single_col_default():
7878
result = df.value_counts()
7979
expected = pd.Series(
8080
data=[2, 1, 1],
81-
index=pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"]),
81+
index=pd.MultiIndex.from_arrays([[4, 2, 6]], names=["num_legs"]),
8282
)
8383

8484
tm.assert_series_equal(result, expected)

pandas/tests/indexes/datetimes/test_ops.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -231,15 +231,15 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture)
231231
index = DatetimeIndex(index_dates, tz=tz, name="idx")
232232
expected = DatetimeIndex(expected_dates, tz=tz, name="idx")
233233

234-
ordered = index.sort_values()
234+
ordered = index.sort_values(na_position="first")
235235
tm.assert_index_equal(ordered, expected)
236236
assert ordered.freq is None
237237

238238
ordered = index.sort_values(ascending=False)
239239
tm.assert_index_equal(ordered, expected[::-1])
240240
assert ordered.freq is None
241241

242-
ordered, indexer = index.sort_values(return_indexer=True)
242+
ordered, indexer = index.sort_values(return_indexer=True, na_position="first")
243243
tm.assert_index_equal(ordered, expected)
244244

245245
exp = np.array([0, 4, 3, 1, 2])
@@ -249,7 +249,7 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture)
249249
ordered, indexer = index.sort_values(return_indexer=True, ascending=False)
250250
tm.assert_index_equal(ordered, expected[::-1])
251251

252-
exp = np.array([2, 1, 3, 4, 0])
252+
exp = np.array([2, 1, 3, 0, 4])
253253
tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
254254
assert ordered.freq is None
255255

pandas/tests/indexes/period/test_ops.py

+5-9
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ def _check_freq(index, expected_index):
178178

179179
pidx = PeriodIndex(["2011", "2013", "NaT", "2011"], name="pidx", freq="D")
180180

181-
result = pidx.sort_values()
181+
result = pidx.sort_values(na_position="first")
182182
expected = PeriodIndex(["NaT", "2011", "2011", "2013"], name="pidx", freq="D")
183183
tm.assert_index_equal(result, expected)
184184
assert result.freq == "D"
@@ -247,15 +247,15 @@ def test_order(self):
247247
)
248248

249249
for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3)]:
250-
ordered = idx.sort_values()
250+
ordered = idx.sort_values(na_position="first")
251251
tm.assert_index_equal(ordered, expected)
252252
assert ordered.freq == "D"
253253

254254
ordered = idx.sort_values(ascending=False)
255255
tm.assert_index_equal(ordered, expected[::-1])
256256
assert ordered.freq == "D"
257257

258-
ordered, indexer = idx.sort_values(return_indexer=True)
258+
ordered, indexer = idx.sort_values(return_indexer=True, na_position="first")
259259
tm.assert_index_equal(ordered, expected)
260260

261261
exp = np.array([0, 4, 3, 1, 2])
@@ -265,7 +265,7 @@ def test_order(self):
265265
ordered, indexer = idx.sort_values(return_indexer=True, ascending=False)
266266
tm.assert_index_equal(ordered, expected[::-1])
267267

268-
exp = np.array([2, 1, 3, 4, 0])
268+
exp = np.array([2, 1, 3, 0, 4])
269269
tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
270270
assert ordered.freq == "D"
271271

@@ -332,12 +332,8 @@ def test_freq_setter_deprecated(self):
332332
idx.freq = pd.offsets.Day()
333333

334334

335-
@pytest.mark.xfail(reason="Datetime-like sort_values currently unstable (GH 35922)")
336335
def test_order_stability_compat():
337-
# GH 35584. The new implementation of sort_values for Index.sort_values
338-
# is stable when sorting in descending order. Datetime-like sort_values
339-
# currently aren't stable. xfail should be removed after
340-
# the implementations' behavior is synchronized (xref GH 35922)
336+
# GH 35922. sort_values is stable both for normal and datetime-like Index
341337
pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A")
342338
iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx")
343339
ordered1, indexer1 = pidx.sort_values(return_indexer=True, ascending=False)

0 commit comments

Comments
 (0)