Skip to content

Commit 254e260

Browse files
authored
DOC: Clarify that DataFrame.sort_values is stable for sorting by multiple columns or labels (#38426)
1 parent 0db7bd9 commit 254e260

File tree

2 files changed

+41
-19
lines changed

2 files changed

+41
-19
lines changed

pandas/core/generic.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -4376,9 +4376,9 @@ def sort_values(
43764376
If True, perform operation in-place.
43774377
kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
43784378
Choice of sorting algorithm. See also :func:`numpy.sort` for more
4379-
information. `mergesort` is the only stable algorithm. For
4380-
DataFrames, this option is only applied when sorting on a single
4381-
column or label.
4379+
information. `mergesort` is the only stable algorithm. For
4380+
DataFrames, if sorting by multiple columns or labels, this
4381+
argument is ignored, defaulting to a stable sorting algorithm.
43824382
na_position : {'first', 'last'}, default 'last'
43834383
Puts NaNs at the beginning if `first`; `last` puts NaNs at the
43844384
end.

pandas/tests/frame/methods/test_sort_values.py

+38-16
Original file line numberDiff line numberDiff line change
@@ -217,26 +217,48 @@ def test_sort_values_stable_descending_sort(self):
217217
sorted_df = df.sort_values(by="sort_col", kind="mergesort", ascending=False)
218218
tm.assert_frame_equal(df, sorted_df)
219219

220-
def test_sort_values_stable_descending_multicolumn_sort(self):
220+
@pytest.mark.parametrize(
221+
"expected_idx_non_na, ascending",
222+
[
223+
[
224+
[3, 4, 5, 0, 1, 8, 6, 9, 7, 10, 13, 14],
225+
[True, True],
226+
],
227+
[
228+
[0, 3, 4, 5, 1, 8, 6, 7, 10, 13, 14, 9],
229+
[True, False],
230+
],
231+
[
232+
[9, 7, 10, 13, 14, 6, 8, 1, 3, 4, 5, 0],
233+
[False, True],
234+
],
235+
[
236+
[7, 10, 13, 14, 9, 6, 8, 1, 0, 3, 4, 5],
237+
[False, False],
238+
],
239+
],
240+
)
241+
@pytest.mark.parametrize("na_position", ["first", "last"])
242+
def test_sort_values_stable_multicolumn_sort(
243+
self, expected_idx_non_na, ascending, na_position
244+
):
245+
# GH#38426 Clarify that sort_values by multiple columns / labels is stable
221246
df = DataFrame(
222-
{"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]}
223-
)
224-
# test stable mergesort
225-
expected = DataFrame(
226-
{"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 2, 9]},
227-
index=[2, 5, 4, 6, 1, 3, 0],
228-
)
229-
sorted_df = df.sort_values(
230-
["A", "B"], ascending=[0, 1], na_position="first", kind="mergesort"
247+
{
248+
"A": [1, 2, np.nan, 1, 1, 1, 6, 8, 4, 8, 8, np.nan, np.nan, 8, 8],
249+
"B": [9, np.nan, 5, 2, 2, 2, 5, 4, 5, 3, 4, np.nan, np.nan, 4, 4],
250+
}
231251
)
232-
tm.assert_frame_equal(sorted_df, expected)
233-
234-
expected = DataFrame(
235-
{"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 9, 2]},
236-
index=[2, 5, 4, 6, 1, 0, 3],
252+
# Rows with NaN in col "B" have a unique value (or NaN) in "A"; therefore,
253+
# only the rows with NaNs in "A" have to be treated individually:
254+
expected_idx = (
255+
[11, 12, 2] + expected_idx_non_na
256+
if na_position == "first"
257+
else expected_idx_non_na + [2, 11, 12]
237258
)
259+
expected = df.take(expected_idx)
238260
sorted_df = df.sort_values(
239-
["A", "B"], ascending=[0, 0], na_position="first", kind="mergesort"
261+
["A", "B"], ascending=ascending, na_position=na_position
240262
)
241263
tm.assert_frame_equal(sorted_df, expected)
242264

0 commit comments

Comments
 (0)