DOC: Clarify that DataFrame.sort_values is stable for sorting by multiple columns or labels (#38426)

jotasi · web-flow · commit 254e26001223 · 2020-12-22T15:32:56.000-05:00
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -4376,9 +4376,9 @@ def sort_values(
              If True, perform operation in-place.
         kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
              Choice of sorting algorithm. See also ndarray.np.sort for more
-             information.  `mergesort` is the only stable algorithm. For
-             DataFrames, this option is only applied when sorting on a single
-             column or label.
+             information. `mergesort` is the only stable algorithm. For
+             DataFrames, if sorting by multiple columns or labels, this
+             argument is ignored, defaulting to a stable sorting algorithm.
         na_position : {'first', 'last'}, default 'last'
              Puts NaNs at the beginning if `first`; `last` puts NaNs at the
              end.
diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py
@@ -217,26 +217,48 @@ def test_sort_values_stable_descending_sort(self):
         sorted_df = df.sort_values(by="sort_col", kind="mergesort", ascending=False)
         tm.assert_frame_equal(df, sorted_df)
 
-    def test_sort_values_stable_descending_multicolumn_sort(self):
+    @pytest.mark.parametrize(
+        "expected_idx_non_na, ascending",
+        [
+            [
+                [3, 4, 5, 0, 1, 8, 6, 9, 7, 10, 13, 14],
+                [True, True],
+            ],
+            [
+                [0, 3, 4, 5, 1, 8, 6, 7, 10, 13, 14, 9],
+                [True, False],
+            ],
+            [
+                [9, 7, 10, 13, 14, 6, 8, 1, 3, 4, 5, 0],
+                [False, True],
+            ],
+            [
+                [7, 10, 13, 14, 9, 6, 8, 1, 0, 3, 4, 5],
+                [False, False],
+            ],
+        ],
+    )
+    @pytest.mark.parametrize("na_position", ["first", "last"])
+    def test_sort_values_stable_multicolumn_sort(
+        self, expected_idx_non_na, ascending, na_position
+    ):
+        # GH#38426 Clarify that sort_values with multiple columns / labels is stable
         df = DataFrame(
-            {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]}
-        )
-        # test stable mergesort
-        expected = DataFrame(
-            {"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 2, 9]},
-            index=[2, 5, 4, 6, 1, 3, 0],
-        )
-        sorted_df = df.sort_values(
-            ["A", "B"], ascending=[0, 1], na_position="first", kind="mergesort"
+            {
+                "A": [1, 2, np.nan, 1, 1, 1, 6, 8, 4, 8, 8, np.nan, np.nan, 8, 8],
+                "B": [9, np.nan, 5, 2, 2, 2, 5, 4, 5, 3, 4, np.nan, np.nan, 4, 4],
+            }
         )
-        tm.assert_frame_equal(sorted_df, expected)
-
-        expected = DataFrame(
-            {"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 9, 2]},
-            index=[2, 5, 4, 6, 1, 0, 3],
+        # The only row with NaN in "B" but not in "A" (index 1) has a unique "A"
+        # value, so na_position only affects where the rows with NaN in "A" go:
+        expected_idx = (
+            [11, 12, 2] + expected_idx_non_na
+            if na_position == "first"
+            else expected_idx_non_na + [2, 11, 12]
         )
+        expected = df.take(expected_idx)
         sorted_df = df.sort_values(
-            ["A", "B"], ascending=[0, 0], na_position="first", kind="mergesort"
+            ["A", "B"], ascending=ascending, na_position=na_position
         )
         tm.assert_frame_equal(sorted_df, expected)