Skip to content

Commit 2ea0601

Browse files
gfyoung authored and jreback committed
BUG, DEP, DOC: Patch and Align Categorical's Sorting API
Clarifies the meaning of 'sort' in the context of `Categorical` to mean 'organization' rather than 'order', as it is possible to call this method (as well as `sort_values`) when the `Categorical` is unordered.

Also patches a bug in `Categorical.sort_values` in which `na_position` was not being respected when `ascending` was set to `True`. This commit aligns the behaviour with that of `Series`.

Finally, deprecates `sort` in favor of `sort_values`, which is in alignment with what was done with `Series` back in pandas-dev#10726.

Closes pandas-dev#12785

Author: gfyoung <[email protected]>

Closes pandas-dev#12882 from gfyoung/categorical-sort-doc and squashes the following commits:

f324a9c [gfyoung] BUG, DOC, DEP: Patch and Align Categorical's Sorting API
1 parent 6f1ade1 commit 2ea0601

File tree

3 files changed

+129
-99
lines changed

3 files changed

+129
-99
lines changed

doc/source/whatsnew/v0.18.1.txt

+1
Original file line number | Diff line number | Diff line change
@@ -224,6 +224,7 @@ Deprecations
224224
^^^^^^^^^^^^
225225

226226
- The method name ``Index.sym_diff()`` is deprecated and can be replaced by ``Index.symmetric_difference()`` (:issue:`12591`)
227+
- The method name ``Categorical.sort()`` is deprecated in favor of ``Categorical.sort_values()`` (:issue:`12882`)
227228

228229

229230

pandas/core/categorical.py

+67-53
Original file line number | Diff line number | Diff line change
@@ -1157,30 +1157,76 @@ def argsort(self, ascending=True, **kwargs):
11571157
return result
11581158

11591159
def sort_values(self, inplace=False, ascending=True, na_position='last'):
1160-
""" Sorts the Category by category value returning a new Categorical by
1161-
default.
1160+
""" Sorts the Categorical by category value returning a new
1161+
Categorical by default.
11621162
1163-
Only ordered Categoricals can be sorted!
1164-
1165-
Categorical.sort is the equivalent but sorts the Categorical inplace.
1163+
While an ordering is applied to the category values, sorting in this
1164+
context refers more to organizing and grouping together based on
1165+
matching category values. Thus, this function can be called on an
1166+
unordered Categorical instance unlike the functions 'Categorical.min'
1167+
and 'Categorical.max'.
11661168
11671169
Parameters
11681170
----------
11691171
inplace : boolean, default False
11701172
Do operation in place.
11711173
ascending : boolean, default True
1172-
Sort ascending. Passing False sorts descending
1174+
Order ascending. Passing False orders descending. The
1175+
ordering parameter provides the method by which the
1176+
category values are organized.
11731177
na_position : {'first', 'last'} (optional, default='last')
11741178
'first' puts NaNs at the beginning
11751179
'last' puts NaNs at the end
11761180
11771181
Returns
11781182
-------
1179-
y : Category or None
1183+
y : Categorical or None
11801184
11811185
See Also
11821186
--------
1183-
Category.sort
1187+
Categorical.sort
1188+
1189+
Examples
1190+
--------
1191+
>>> c = pd.Categorical([1, 2, 2, 1, 5])
1192+
>>> c
1193+
[1, 2, 2, 1, 5]
1194+
Categories (3, int64): [1, 2, 5]
1195+
>>> c.sort_values()
1196+
[1, 1, 2, 2, 5]
1197+
Categories (3, int64): [1, 2, 5]
1198+
>>> c.sort_values(ascending=False)
1199+
[5, 2, 2, 1, 1]
1200+
Categories (3, int64): [1, 2, 5]
1201+
1202+
Inplace sorting can be done as well:
1203+
1204+
>>> c.sort_values(inplace=True)
1205+
>>> c
1206+
[1, 1, 2, 2, 5]
1207+
Categories (3, int64): [1, 2, 5]
1208+
>>>
1209+
>>> c = pd.Categorical([1, 2, 2, 1, 5])
1210+
1211+
'sort_values' behaviour with NaNs. Note that 'na_position'
1212+
is independent of the 'ascending' parameter:
1213+
1214+
>>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])
1215+
>>> c
1216+
[NaN, 2.0, 2.0, NaN, 5.0]
1217+
Categories (2, int64): [2, 5]
1218+
>>> c.sort_values()
1219+
[2.0, 2.0, 5.0, NaN, NaN]
1220+
Categories (2, int64): [2, 5]
1221+
>>> c.sort_values(ascending=False)
1222+
[5.0, 2.0, 2.0, NaN, NaN]
1223+
Categories (2, int64): [2, 5]
1224+
>>> c.sort_values(na_position='first')
1225+
[NaN, NaN, 2.0, 2.0, 5.0]
1226+
Categories (2, int64): [2, 5]
1227+
>>> c.sort_values(ascending=False, na_position='first')
1228+
[NaN, NaN, 5.0, 2.0, 2.0]
1229+
Categories (2, int64): [2, 5]
11841230
"""
11851231
if na_position not in ['last', 'first']:
11861232
raise ValueError('invalid na_position: {!r}'.format(na_position))
@@ -1193,13 +1239,13 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
11931239
na_mask = (codes == -1)
11941240
if na_mask.any():
11951241
n_nans = len(codes[na_mask])
1196-
if na_position == "first" and not ascending:
1242+
if na_position == "first":
11971243
# in this case sort to the front
11981244
new_codes = codes.copy()
11991245
new_codes[0:n_nans] = -1
12001246
new_codes[n_nans:] = codes[~na_mask]
12011247
codes = new_codes
1202-
elif na_position == "last" and not ascending:
1248+
elif na_position == "last":
12031249
# ... and to the end
12041250
new_codes = codes.copy()
12051251
pos = len(codes) - n_nans
@@ -1215,63 +1261,31 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
12151261

12161262
def order(self, inplace=False, ascending=True, na_position='last'):
12171263
"""
1218-
DEPRECATED: use :meth:`Categorical.sort_values`
1219-
1220-
Sorts the Category by category value returning a new Categorical by
1221-
default.
1222-
1223-
Only ordered Categoricals can be sorted!
1224-
1225-
Categorical.sort is the equivalent but sorts the Categorical inplace.
1226-
1227-
Parameters
1228-
----------
1229-
inplace : boolean, default False
1230-
Do operation in place.
1231-
ascending : boolean, default True
1232-
Sort ascending. Passing False sorts descending
1233-
na_position : {'first', 'last'} (optional, default='last')
1234-
'first' puts NaNs at the beginning
1235-
'last' puts NaNs at the end
1236-
1237-
Returns
1238-
-------
1239-
y : Category or None
1264+
DEPRECATED: use :meth:`Categorical.sort_values`. That function
1265+
is entirely equivalent to this one.
12401266
12411267
See Also
12421268
--------
1243-
Category.sort
1269+
Categorical.sort_values
12441270
"""
12451271
warn("order is deprecated, use sort_values(...)", FutureWarning,
12461272
stacklevel=2)
12471273
return self.sort_values(inplace=inplace, ascending=ascending,
12481274
na_position=na_position)
12491275

12501276
def sort(self, inplace=True, ascending=True, na_position='last'):
1251-
""" Sorts the Category inplace by category value.
1252-
1253-
Only ordered Categoricals can be sorted!
1254-
1255-
Catgorical.order is the equivalent but returns a new Categorical.
1256-
1257-
Parameters
1258-
----------
1259-
ascending : boolean, default True
1260-
Sort ascending. Passing False sorts descending
1261-
inplace : boolean, default False
1262-
Do operation in place.
1263-
na_position : {'first', 'last'} (optional, default='last')
1264-
'first' puts NaNs at the beginning
1265-
'last' puts NaNs at the end
1266-
1267-
Returns
1268-
-------
1269-
y : Category or None
1277+
"""
1278+
DEPRECATED: use :meth:`Categorical.sort_values`. That function
1279+
is just like this one, except that a new Categorical is returned
1280+
by default, so make sure to pass in 'inplace=True' to get
1281+
inplace sorting.
12701282
12711283
See Also
12721284
--------
1273-
Category.sort_values
1285+
Categorical.sort_values
12741286
"""
1287+
warn("sort is deprecated, use sort_values(...)", FutureWarning,
1288+
stacklevel=2)
12751289
return self.sort_values(inplace=inplace, ascending=ascending,
12761290
na_position=na_position)
12771291

pandas/tests/test_categorical.py

+61-46
Original file line number | Diff line number | Diff line change
@@ -1277,12 +1277,11 @@ def test_mode(self):
12771277
exp = Categorical([4], categories=[5, 4, 3, 2, 1], ordered=True)
12781278
self.assertTrue(res.equals(exp))
12791279

1280-
def test_sort(self):
1280+
def test_sort_values(self):
12811281

12821282
# unordered cats are sortable
12831283
cat = Categorical(["a", "b", "b", "a"], ordered=False)
12841284
cat.sort_values()
1285-
cat.sort()
12861285

12871286
cat = Categorical(["a", "c", "b", "d"], ordered=True)
12881287

@@ -1303,10 +1302,62 @@ def test_sort(self):
13031302

13041303
# sort (inplace order)
13051304
cat1 = cat.copy()
1306-
cat1.sort()
1305+
cat1.sort_values(inplace=True)
13071306
exp = np.array(["a", "b", "c", "d"], dtype=object)
13081307
self.assert_numpy_array_equal(cat1.__array__(), exp)
13091308

1309+
# reverse
1310+
cat = Categorical(["a", "c", "c", "b", "d"], ordered=True)
1311+
res = cat.sort_values(ascending=False)
1312+
exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object)
1313+
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
1314+
self.assert_numpy_array_equal(res.__array__(), exp_val)
1315+
self.assert_numpy_array_equal(res.categories, exp_categories)
1316+
1317+
def test_sort_values_na_position(self):
1318+
# see gh-12882
1319+
cat = Categorical([5, 2, np.nan, 2, np.nan], ordered=True)
1320+
exp_categories = np.array([2, 5])
1321+
1322+
exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
1323+
res = cat.sort_values() # default arguments
1324+
self.assert_numpy_array_equal(res.__array__(), exp)
1325+
self.assert_numpy_array_equal(res.categories, exp_categories)
1326+
1327+
exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0])
1328+
res = cat.sort_values(ascending=True, na_position='first')
1329+
self.assert_numpy_array_equal(res.__array__(), exp)
1330+
self.assert_numpy_array_equal(res.categories, exp_categories)
1331+
1332+
exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0])
1333+
res = cat.sort_values(ascending=False, na_position='first')
1334+
self.assert_numpy_array_equal(res.__array__(), exp)
1335+
self.assert_numpy_array_equal(res.categories, exp_categories)
1336+
1337+
exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
1338+
res = cat.sort_values(ascending=True, na_position='last')
1339+
self.assert_numpy_array_equal(res.__array__(), exp)
1340+
self.assert_numpy_array_equal(res.categories, exp_categories)
1341+
1342+
exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan])
1343+
res = cat.sort_values(ascending=False, na_position='last')
1344+
self.assert_numpy_array_equal(res.__array__(), exp)
1345+
self.assert_numpy_array_equal(res.categories, exp_categories)
1346+
1347+
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
1348+
res = cat.sort_values(ascending=False, na_position='last')
1349+
exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
1350+
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
1351+
self.assert_numpy_array_equal(res.__array__(), exp_val)
1352+
self.assert_numpy_array_equal(res.categories, exp_categories)
1353+
1354+
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
1355+
res = cat.sort_values(ascending=False, na_position='first')
1356+
exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
1357+
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
1358+
self.assert_numpy_array_equal(res.__array__(), exp_val)
1359+
self.assert_numpy_array_equal(res.categories, exp_categories)
1360+
13101361
def test_slicing_directly(self):
13111362
cat = Categorical(["a", "b", "c", "d", "a", "b", "c"])
13121363
sliced = cat[3]
@@ -2951,14 +3002,16 @@ def test_count(self):
29513002
result = s.count()
29523003
self.assertEqual(result, 2)
29533004

2954-
def test_sort(self):
3005+
def test_sort_values(self):
29553006

29563007
c = Categorical(["a", "b", "b", "a"], ordered=False)
2957-
cat = Series(c)
3008+
cat = Series(c.copy())
29583009

2959-
# 9816 deprecated
2960-
with tm.assert_produces_warning(FutureWarning):
2961-
c.order()
3010+
# 'order' was deprecated in gh-10726
3011+
# 'sort' was deprecated in gh-12882
3012+
for func in ('order', 'sort'):
3013+
with tm.assert_produces_warning(FutureWarning):
3014+
getattr(c, func)()
29623015

29633016
# sort in the categories order
29643017
expected = Series(
@@ -3024,44 +3077,6 @@ def test_sort(self):
30243077
expected = df.iloc[[2, 1, 5, 4, 3, 0]]
30253078
tm.assert_frame_equal(result, expected)
30263079

3027-
# reverse
3028-
cat = Categorical(["a", "c", "c", "b", "d"], ordered=True)
3029-
res = cat.sort_values(ascending=False)
3030-
exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object)
3031-
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
3032-
self.assert_numpy_array_equal(res.__array__(), exp_val)
3033-
self.assert_numpy_array_equal(res.categories, exp_categories)
3034-
3035-
# some NaN positions
3036-
3037-
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
3038-
res = cat.sort_values(ascending=False, na_position='last')
3039-
exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
3040-
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
3041-
self.assert_numpy_array_equal(res.__array__(), exp_val)
3042-
self.assert_numpy_array_equal(res.categories, exp_categories)
3043-
3044-
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
3045-
res = cat.sort_values(ascending=False, na_position='first')
3046-
exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
3047-
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
3048-
self.assert_numpy_array_equal(res.__array__(), exp_val)
3049-
self.assert_numpy_array_equal(res.categories, exp_categories)
3050-
3051-
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
3052-
res = cat.sort_values(ascending=False, na_position='first')
3053-
exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
3054-
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
3055-
self.assert_numpy_array_equal(res.__array__(), exp_val)
3056-
self.assert_numpy_array_equal(res.categories, exp_categories)
3057-
3058-
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
3059-
res = cat.sort_values(ascending=False, na_position='last')
3060-
exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
3061-
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
3062-
self.assert_numpy_array_equal(res.__array__(), exp_val)
3063-
self.assert_numpy_array_equal(res.categories, exp_categories)
3064-
30653080
def test_slicing(self):
30663081
cat = Series(Categorical([1, 2, 3, 4]))
30673082
reversed = cat[::-1]

0 commit comments

Comments
 (0)