pandas-dev · jreback · Apr 27, 2020 · Jul 4, 2019 · Jan 28, 2020 · Jan 28, 2020
diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst
@@ -1813,6 +1813,18 @@ argument:
    s.sort_values()
    s.sort_values(na_position='first')
 
+Sorting also supports a ``key`` parameter that takes a callable function
+to apply to the values being sorted.
+
+.. ipython:: python
+
+   s1 = pd.Series(['B', 'a', 'C'])
+   s1.sort_values()
+   s1.sort_values(key=lambda x: x.str.lower())
+
+``key`` will be given the :class:`Series` of values and should return a ``Series``
+or array of the same shape with the transformed values.
+
 .. _basics.sort_indexes_and_values:
 
 By indexes and values

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -36,6 +36,42 @@ For example:
    ser["2014"]
    ser.loc["May 2015"]
 
+.. _whatsnew_110.key_sorting:
+
+Sorting with keys
+^^^^^^^^^^^^^^^^^
+
+We've added a ``key`` argument to the DataFrame and Series sorting methods, including
+:meth:`DataFrame.sort_values`, :meth:`DataFrame.sort_index`, :meth:`Series.sort_values`,
+and :meth:`Series.sort_index`. The ``key`` can be any callable function which is applied
+to each column of a DataFrame before sorting is performed (:issue:`27237`).
+
+.. ipython:: python
+
+   s = pd.Series(['C', 'a', 'B'])
+   s.sort_values()
+
+
+Note how this is sorted with capital letters first. If we apply the ``str.lower()`` method as the ``key``, we get
+
+.. ipython:: python
+
+   s.sort_values(key=lambda x: x.str.lower())
+
+
+When applied to a :class:`DataFrame`, the key is applied per-column to all columns or a subset if
+``by`` is specified, e.g.
+
+.. ipython:: python
+
+   df = pd.DataFrame({'a' : ['C', 'C', 'a', 'a', 'B', 'B'], 'b' : [1, 2, 3, 4, 5, 6]})
+   df.sort_values(by=['a', 'b'], key=lambda col : col.str.lower() if col.name == 'a' else -col)
+
+
+For :meth:`DataFrame.sort_index` with a :class:`MultiIndex`, the key function is applied per level. For
+more details, see examples and documentation in :meth:`DataFrame.sort_values`, :meth:`Series.sort_values`,
+and :meth:`~DataFrame.sort_index`.
+
 .. _whatsnew_110.timestamp_fold_support:
 
 Fold argument support in Timestamp constructor

diff --git a/pandas/_typing.py b/pandas/_typing.py
@@ -75,3 +75,8 @@
 
 # to maintain type information across generic functions and parametrization
 T = TypeVar("T")
+
+# types of vectorized key functions for DataFrame::sort_values and
+# DataFrame::sort_index, among others
+ValueKeyFunc = Optional[Callable[["Series"], Union["Series", AnyArrayLike]]]
+IndexKeyFunc = Optional[Callable[["Index"], Union["Index", AnyArrayLike]]]
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -1221,3 +1221,12 @@ def tick_classes(request):
     Fixture for Tick based datetime offsets available for a time series.
     """
     return request.param
+
+
+@pytest.fixture(params=[None, lambda x: x])
+def sort_by_key(request):
+    """
+    Simple fixture for testing keys in sorting methods.
+    Tests None (no key) and the identity key.
+    """
+    return request.param
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -1,6 +1,6 @@
 import operator
 from shutil import get_terminal_size
-from typing import Dict, Hashable, List, Type, Union, cast
+from typing import Callable, Dict, Hashable, List, Optional, Type, Union, cast
 from warnings import warn
 
 import numpy as np
@@ -1532,7 +1532,13 @@ def argsort(self, ascending=True, kind="quicksort", **kwargs):
         """
         return super().argsort(ascending=ascending, kind=kind, **kwargs)
 
-    def sort_values(self, inplace=False, ascending=True, na_position="last"):
+    def sort_values(
+        self,
+        inplace=False,
+        ascending=True,
+        na_position="last",
+        key: Optional[Callable] = None,
+    ):
         """
         Sort the Categorical by category value returning a new
         Categorical by default.
@@ -1554,6 +1560,15 @@ def sort_values(self, inplace=False, ascending=True, na_position="last"):
         na_position : {'first', 'last'} (optional, default='last')
             'first' puts NaNs at the beginning
             'last' puts NaNs at the end
+        key : callable, optional
+            Apply the key function to the values before sorting.
+            This is similar to the `key` argument in the builtin
+            :meth:`sorted` function, with the notable difference that
+            this `key` function should be *vectorized*. It should expect
+            a ``Categorical`` and return an object with the same shape
+            as the input.
+
+            .. versionadded:: 1.1.0
 
         Returns
         -------
@@ -1610,7 +1625,9 @@ def sort_values(self, inplace=False, ascending=True, na_position="last"):
         if na_position not in ["last", "first"]:
             raise ValueError(f"invalid na_position: {repr(na_position)}")
 
-        sorted_idx = nargsort(self, ascending=ascending, na_position=na_position)
+        sorted_idx = nargsort(
+            self, ascending=ascending, na_position=na_position, key=key
+        )
 
         if inplace:
             self._codes = self._codes[sorted_idx]

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -40,7 +40,17 @@
 from pandas._config import get_option
 
 from pandas._libs import algos as libalgos, lib, properties
-from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Label, Level, Renamer
+from pandas._typing import (
+    Axes,
+    Axis,
+    Dtype,
+    FilePathOrBuffer,
+    IndexKeyFunc,
+    Label,
+    Level,
+    Renamer,
+    ValueKeyFunc,
+)
 from pandas.compat import PY37
 from pandas.compat._optional import import_optional_dependency
 from pandas.compat.numpy import function as nv
@@ -129,6 +139,7 @@
 )
 from pandas.core.ops.missing import dispatch_fill_zeros
 from pandas.core.series import Series
+from pandas.core.sorting import ensure_key_mapped
 
 from pandas.io.common import get_filepath_or_buffer
 from pandas.io.formats import console, format as fmt
@@ -4746,10 +4757,10 @@ def f(vals):
 
     # ----------------------------------------------------------------------
     # Sorting
-
+    # TODO: Just move the sort_values doc here.
     @Substitution(**_shared_doc_kwargs)
     @Appender(NDFrame.sort_values.__doc__)
-    def sort_values(
+    def sort_values(  # type: ignore[override] # NOQA # issue 27237
         self,
         by,
         axis=0,
@@ -4758,6 +4769,7 @@ def sort_values(
         kind="quicksort",
         na_position="last",
         ignore_index=False,
+        key: ValueKeyFunc = None,
     ):
         inplace = validate_bool_kwarg(inplace, "inplace")
         axis = self._get_axis_number(axis)
@@ -4772,19 +4784,30 @@ def sort_values(
             from pandas.core.sorting import lexsort_indexer
 
             keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
-            indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position)
+
+            # need to rewrap columns in Series to apply key function
+            if key is not None:
+                keys = [Series(k, name=name) for (k, name) in zip(keys, by)]
+
+            indexer = lexsort_indexer(
+                keys, orders=ascending, na_position=na_position, key=key
+            )
             indexer = ensure_platform_int(indexer)
         else:
             from pandas.core.sorting import nargsort
 
             by = by[0]
             k = self._get_label_or_level_values(by, axis=axis)
 
+            # need to rewrap column in Series to apply key function
+            if key is not None:
+                k = Series(k)
+
             if isinstance(ascending, (tuple, list)):
                 ascending = ascending[0]
 
             indexer = nargsort(
-                k, kind=kind, ascending=ascending, na_position=na_position
+                k, kind=kind, ascending=ascending, na_position=na_position, key=key
             )
 
         new_data = self._mgr.take(
@@ -4810,6 +4833,7 @@ def sort_index(
         na_position: str = "last",
         sort_remaining: bool = True,
         ignore_index: bool = False,
+        key: IndexKeyFunc = None,
     ):
         """
         Sort object by labels (along an axis).
@@ -4845,6 +4869,16 @@ def sort_index(
 
             .. versionadded:: 1.0.0
 
+        key : callable, optional
+            If not None, apply the key function to the index values
+            before sorting. This is similar to the `key` argument in the
+            builtin :meth:`sorted` function, with the notable difference that
+            this `key` function should be *vectorized*. It should expect an
+            ``Index`` and return an ``Index`` of the same shape. For MultiIndex
+            inputs, the key is applied *per level*.
+
+            .. versionadded:: 1.1.0
+
         Returns
         -------
         DataFrame
@@ -4887,11 +4921,16 @@ def sort_index(
         axis = self._get_axis_number(axis)
         labels = self._get_axis(axis)
 
+        # apply key to each level separately and create a new index
+        if isinstance(labels, ABCMultiIndex):
+            labels = labels.apply_key(key, level=level)
+        else:
+            labels = ensure_key_mapped(labels, key)
+
         # make sure that the axis is lexsorted to start
         # if not we need to reconstruct to get the correct indexer
         labels = labels._sort_levels_monotonic()
         if level is not None:
-
             new_axis, indexer = labels.sortlevel(
                 level, ascending=ascending, sort_remaining=sort_remaining
             )