Skip to content

Commit 9ce3b26

Browse files
DOC: updated inline documentation for key sorting
1 parent af949fd commit 9ce3b26

File tree

10 files changed

+226
-79
lines changed

10 files changed

+226
-79
lines changed

doc/source/whatsnew/v1.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,8 @@ Other enhancements
150150
- Roundtripping DataFrames with nullable integer or string data types to parquet
151151
(:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine
152152
now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`).
153+
- :meth:`DataFrame.sort_values`, :meth:`DataFrame.sort_index`, :meth:`Series.sort_values`, and :meth:`Series.sort_index`
154+
now support the ``key`` argument, which allows for custom sorting orders (:issue:`3942`).
153155

154156
Build Changes
155157
^^^^^^^^^^^^^

pandas/core/arrays/categorical.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -1579,7 +1579,7 @@ def argsort(self, ascending=True, kind="quicksort", *args, **kwargs):
15791579
"""
15801580
return super().argsort(ascending=ascending, kind=kind, *args, **kwargs)
15811581

1582-
def sort_values(self, inplace=False, ascending=True, na_position="last"):
1582+
def sort_values(self, inplace=False, ascending=True, na_position="last", key=None):
15831583
"""
15841584
Sort the Categorical by category value returning a new
15851585
Categorical by default.
@@ -1601,6 +1601,9 @@ def sort_values(self, inplace=False, ascending=True, na_position="last"):
16011601
na_position : {'first', 'last'} (optional, default='last')
16021602
'first' puts NaNs at the beginning
16031603
'last' puts NaNs at the end
1604+
key : Callable, default None
1605+
If not None, apply the key function to every value before
1606+
sorting. Identical to key argument in built-in sorted function.
16041607
16051608
Returns
16061609
-------
@@ -1657,7 +1660,9 @@ def sort_values(self, inplace=False, ascending=True, na_position="last"):
16571660
if na_position not in ["last", "first"]:
16581661
raise ValueError(f"invalid na_position: {na_position!r}")
16591662

1660-
sorted_idx = nargsort(self, ascending=ascending, na_position=na_position)
1663+
sorted_idx = nargsort(
1664+
self, ascending=ascending, na_position=na_position, key=key
1665+
)
16611666

16621667
if inplace:
16631668
self._codes = self._codes[sorted_idx]

pandas/core/frame.py

+10-19
Original file line numberDiff line numberDiff line change
@@ -4704,7 +4704,6 @@ def f(vals):
47044704

47054705
# ----------------------------------------------------------------------
47064706
# Sorting
4707-
47084707
@Substitution(**_shared_doc_kwargs)
47094708
@Appender(NDFrame.sort_values.__doc__)
47104709
def sort_values(
@@ -4715,7 +4714,7 @@ def sort_values(
47154714
inplace=False,
47164715
kind="quicksort",
47174716
na_position="last",
4718-
key = None
4717+
key=None,
47194718
):
47204719
inplace = validate_bool_kwarg(inplace, "inplace")
47214720
axis = self._get_axis_number(axis)
@@ -4729,29 +4728,22 @@ def sort_values(
47294728
if len(by) > 1:
47304729
from pandas.core.sorting import lexsort_indexer
47314730

4732-
if key is not None:
4733-
key_func = np.vectorize(key)
4734-
keys = [key_func(self._get_label_or_level_values(x, axis=axis)) for x in by]
4735-
else:
4736-
keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
4737-
4738-
indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position)
4731+
keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
4732+
indexer = lexsort_indexer(
4733+
keys, orders=ascending, na_position=na_position, key=key
4734+
)
47394735
indexer = ensure_platform_int(indexer)
47404736
else:
47414737
from pandas.core.sorting import nargsort
47424738

47434739
by = by[0]
47444740
k = self._get_label_or_level_values(by, axis=axis)
47454741

4746-
if key is not None:
4747-
key_func = np.vectorize(key)
4748-
k = key_func(k)
4749-
47504742
if isinstance(ascending, (tuple, list)):
47514743
ascending = ascending[0]
47524744

47534745
indexer = nargsort(
4754-
k, kind=kind, ascending=ascending, na_position=na_position
4746+
k, kind=kind, ascending=ascending, na_position=na_position, key=key
47554747
)
47564748

47574749
new_data = self._data.take(
@@ -4774,7 +4766,7 @@ def sort_index(
47744766
kind="quicksort",
47754767
na_position="last",
47764768
sort_remaining=True,
4777-
key=None
4769+
key=None,
47784770
):
47794771

47804772
# TODO: this can be combined with Series.sort_index impl as
@@ -4785,8 +4777,8 @@ def sort_index(
47854777
axis = self._get_axis_number(axis)
47864778
labels = self._get_axis(axis)
47874779
if key is not None:
4788-
labels = labels.map(key)
4789-
4780+
labels = labels.map(key, na_action="ignore")
4781+
47904782
# make sure that the axis is lexsorted to start
47914783
# if not we need to reconstruct to get the correct indexer
47924784
labels = labels._sort_levels_monotonic()
@@ -4798,9 +4790,8 @@ def sort_index(
47984790
elif isinstance(labels, ABCMultiIndex):
47994791
from pandas.core.sorting import lexsort_indexer
48004792

4801-
codes = labels._get_codes_for_sorting()
48024793
indexer = lexsort_indexer(
4803-
codes,
4794+
labels._get_codes_for_sorting(),
48044795
orders=ascending,
48054796
na_position=na_position,
48064797
)

pandas/core/indexes/base.py

+16-3
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from datetime import datetime
22
import operator
33
from textwrap import dedent
4-
from typing import FrozenSet, Union
4+
from typing import Callable, FrozenSet, Optional, Union
55
import warnings
66

77
import numpy as np
@@ -4400,7 +4400,9 @@ def asof_locs(self, where, mask):
44004400

44014401
return result
44024402

4403-
def sort_values(self, return_indexer=False, ascending=True):
4403+
def sort_values(
4404+
self, return_indexer=False, ascending=True, key: Optional[Callable] = None
4405+
):
44044406
"""
44054407
Return a sorted copy of the index.
44064408
@@ -4413,6 +4415,9 @@ def sort_values(self, return_indexer=False, ascending=True):
44134415
Should the indices that would sort the index be returned.
44144416
ascending : bool, default True
44154417
Should the index values be sorted in an ascending order.
4418+
key : Callable, default None
4419+
Apply a key function to the index values before sorting, like the
4420+
``key`` argument of the built-in ``sorted`` function.
44164421
44174422
Returns
44184423
-------
@@ -4443,7 +4448,12 @@ def sort_values(self, return_indexer=False, ascending=True):
44434448
>>> idx.sort_values(ascending=False, return_indexer=True)
44444449
(Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2]))
44454450
"""
4446-
_as = self.argsort()
4451+
if key:
4452+
idx = self.map(key, na_action="ignore")
4453+
else:
4454+
idx = self
4455+
4456+
_as = idx.argsort()
44474457
if not ascending:
44484458
_as = _as[::-1]
44494459

@@ -4553,9 +4563,12 @@ def argsort(self, *args, **kwargs):
45534563
>>> idx[order]
45544564
Index(['a', 'b', 'c', 'd'], dtype='object')
45554565
"""
4566+
45564567
result = self.asi8
4568+
45574569
if result is None:
45584570
result = np.array(self)
4571+
45594572
return result.argsort(*args, **kwargs)
45604573

45614574
_index_shared_docs[

pandas/core/indexes/datetimelike.py

+14-4
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Base and utility classes for tseries type pandas objects.
33
"""
44
import operator
5-
from typing import Set
5+
from typing import Callable, Optional, Set
66

77
import numpy as np
88

@@ -273,12 +273,22 @@ def map(self, mapper, na_action=None):
273273
except Exception:
274274
return self.astype(object).map(mapper)
275275

276-
def sort_values(self, return_indexer=False, ascending=True):
276+
def sort_values(
277+
self, return_indexer=False, ascending=True, key: Optional[Callable] = None
278+
):
277279
"""
278280
Return sorted copy of Index.
279281
"""
282+
if not isinstance(self, Index):
283+
raise TypeError("sort_values must be called on an Index object")
284+
285+
if key:
286+
idx = self.map(key, na_action="ignore")
287+
else:
288+
idx = self
289+
280290
if return_indexer:
281-
_as = self.argsort()
291+
_as = idx.argsort()
282292
if not ascending:
283293
_as = _as[::-1]
284294
sorted_index = self.take(_as)
@@ -287,7 +297,7 @@ def sort_values(self, return_indexer=False, ascending=True):
287297
# NB: using asi8 instead of _ndarray_values matters in numpy 1.18
288298
# because the treatment of NaT has been changed to put NaT last
289299
# instead of first.
290-
sorted_values = np.sort(self.asi8)
300+
sorted_values = np.sort(idx.asi8)
291301
attribs = self._get_attributes_dict()
292302
freq = attribs["freq"]
293303

pandas/core/series.py

+40-6
Original file line numberDiff line numberDiff line change
@@ -2835,7 +2835,7 @@ def sort_values(
28352835
inplace=False,
28362836
kind="quicksort",
28372837
na_position="last",
2838-
key=None
2838+
key=None,
28392839
):
28402840
"""
28412841
Sort by the values.
@@ -2858,6 +2858,9 @@ def sort_values(
28582858
na_position : {'first' or 'last'}, default 'last'
28592859
Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at
28602860
the end.
2861+
key : Callable, default None
2862+
If not None, apply the key function to every value before
2863+
sorting. Identical to key argument in built-in sorted function.
28612864
28622865
Returns
28632866
-------
@@ -2940,6 +2943,22 @@ def sort_values(
29402943
2 d
29412944
0 z
29422945
dtype: object
2946+
2947+
>>> s = pd.Series(['a', 'B', 'c', 'D', 'e'])
2948+
>>> s.sort_values()
2949+
1 B
2950+
3 D
2951+
0 a
2952+
2 c
2953+
4 e
2954+
dtype: object
2955+
>>> s.sort_values(key=str.lower)
2956+
0 a
2957+
1 B
2958+
2 c
2959+
3 D
2960+
4 e
2961+
dtype: object
29432962
"""
29442963
inplace = validate_bool_kwarg(inplace, "inplace")
29452964
# Validate the axis parameter
@@ -3016,7 +3035,7 @@ def sort_index(
30163035
kind="quicksort",
30173036
na_position="last",
30183037
sort_remaining=True,
3019-
key=None
3038+
key: Optional[Callable] = None,
30203039
):
30213040
"""
30223041
Sort Series by index labels.
@@ -3045,6 +3064,9 @@ def sort_index(
30453064
sort_remaining : bool, default True
30463065
If True and sorting by level and index is multilevel, sort by other
30473066
levels too (in order) after sorting by specified level.
3067+
key : Callable, default None
3068+
If not None, apply the key function to every index element before
3069+
sorting. Identical to key argument in built-in sorted function.
30483070
30493071
Returns
30503072
-------
@@ -3127,7 +3149,20 @@ def sort_index(
31273149
baz two 5
31283150
bar two 7
31293151
dtype: int64
3152+
3153+
>>> s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8])
3154+
>>> s.sort_index(key=lambda x: -x)
3155+
7 8
3156+
6 7
3157+
5 6
3158+
4 5
3159+
3 4
3160+
2 3
3161+
1 2
3162+
0 1
3163+
dtype: int64
31303164
"""
3165+
31313166
# TODO: this can be combined with DataFrame.sort_index impl as
31323167
# almost identical
31333168
inplace = validate_bool_kwarg(inplace, "inplace")
@@ -3136,8 +3171,8 @@ def sort_index(
31363171
index = self.index
31373172
true_index = index
31383173
if key is not None:
3139-
index = index.map(key)
3140-
3174+
index = index.map(key, na_action="ignore")
3175+
31413176
if level is not None:
31423177
new_index, indexer = index.sortlevel(
31433178
level, ascending=ascending, sort_remaining=sort_remaining
@@ -3147,10 +3182,9 @@ def sort_index(
31473182
from pandas.core.sorting import lexsort_indexer
31483183

31493184
labels = index._sort_levels_monotonic()
3150-
codes = labels._get_codes_for_sorting()
31513185

31523186
indexer = lexsort_indexer(
3153-
codes,
3187+
labels._get_codes_for_sorting(),
31543188
orders=ascending,
31553189
na_position=na_position,
31563190
)

pandas/core/sorting.py

+28-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
""" miscellaneous sorting / groupby utilities """
2+
from typing import Callable, Optional
3+
24
import numpy as np
35

46
from pandas._libs import algos, hashtable, lib
@@ -187,7 +189,8 @@ def indexer_from_factorized(labels, shape, compress: bool = True):
187189
return get_group_index_sorter(ids, ngroups)
188190

189191

190-
def lexsort_indexer(keys, orders=None, na_position="last"):
192+
def lexsort_indexer(keys, orders=None, na_position="last", key=None):
193+
191194
from pandas.core.arrays import Categorical
192195

193196
labels = []
@@ -197,6 +200,10 @@ def lexsort_indexer(keys, orders=None, na_position="last"):
197200
elif orders is None:
198201
orders = [True] * len(keys)
199202

203+
if key:
204+
key_func = np.vectorize(key)
205+
keys = [key_func(entry) if entry.size != 0 else entry for entry in keys]
206+
200207
for key, order in zip(keys, orders):
201208

202209
# we are already a Categorical
@@ -233,7 +240,13 @@ def lexsort_indexer(keys, orders=None, na_position="last"):
233240
return indexer_from_factorized(labels, shape)
234241

235242

236-
def nargsort(items, kind="quicksort", ascending: bool = True, na_position="last"):
243+
def nargsort(
244+
items,
245+
kind="quicksort",
246+
ascending: bool = True,
247+
na_position="last",
248+
key: Optional[Callable] = None,
249+
):
237250
"""
238251
This is intended to be a drop-in replacement for np.argsort which
239252
handles NaNs. It adds ascending and na_position parameters.
@@ -247,6 +260,19 @@ def nargsort(items, kind="quicksort", ascending: bool = True, na_position="last"
247260
else:
248261
items = np.asanyarray(items)
249262

263+
if key is not None:
264+
key_func = np.vectorize(key)
265+
masked = np.ma.MaskedArray(items, mask)
266+
267+
if masked.size == 0:
268+
vals = np.array([]) # vectorize fails on empty object arrays
269+
else:
270+
vals = np.asarray(key_func(masked)) # revert from masked
271+
272+
return nargsort(
273+
vals, kind=kind, ascending=ascending, na_position=na_position, key=None
274+
)
275+
250276
idx = np.arange(len(items))
251277
non_nans = items[~mask]
252278
non_nan_idx = idx[~mask]

0 commit comments

Comments
 (0)