Skip to content

Commit f324a9c

Browse files
committed
BUG, DOC, DEP: Patch and Align Categorical's Sorting API
Clarifies the meaning of 'sort' in the context of Categorical to mean 'organization' rather than 'order', as it is possible to call this method (as well as 'sort_values') when the Categorical is unordered. Also patches a bug in 'Categorical.sort_values' in which 'na_position' was not being respected when 'ascending' was set to 'True'. This commit aligns the behaviour with that of Series. Finally, this commit deprecates 'sort' in favor of 'sort_values', which is in alignment with the Series API as well. Closes gh-12785.
1 parent 2ba977a commit f324a9c

File tree

3 files changed

+129
-99
lines changed

3 files changed

+129
-99
lines changed

doc/source/whatsnew/v0.18.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ Deprecations
224224
^^^^^^^^^^^^
225225

226226
- The method name ``Index.sym_diff()`` is deprecated and can be replaced by ``Index.symmetric_difference()`` (:issue:`12591`)
227+
- The method name ``Categorical.sort()`` is deprecated in favor of ``Categorical.sort_values()`` (:issue:`12882`)
227228

228229

229230

pandas/core/categorical.py

+67-53
Original file line numberDiff line numberDiff line change
@@ -1157,30 +1157,76 @@ def argsort(self, ascending=True, **kwargs):
11571157
return result
11581158

11591159
def sort_values(self, inplace=False, ascending=True, na_position='last'):
1160-
""" Sorts the Category by category value returning a new Categorical by
1161-
default.
1160+
""" Sorts the Categorical by category value returning a new
1161+
Categorical by default.
11621162
1163-
Only ordered Categoricals can be sorted!
1164-
1165-
Categorical.sort is the equivalent but sorts the Categorical inplace.
1163+
While an ordering is applied to the category values, sorting in this
1164+
context refers more to organizing and grouping together based on
1165+
matching category values. Thus, this function can be called on an
1166+
unordered Categorical instance unlike the functions 'Categorical.min'
1167+
and 'Categorical.max'.
11661168
11671169
Parameters
11681170
----------
11691171
inplace : boolean, default False
11701172
Do operation in place.
11711173
ascending : boolean, default True
1172-
Sort ascending. Passing False sorts descending
1174+
Order ascending. Passing False orders descending. The
1175+
ordering parameter provides the method by which the
1176+
category values are organized.
11731177
na_position : {'first', 'last'} (optional, default='last')
11741178
'first' puts NaNs at the beginning
11751179
'last' puts NaNs at the end
11761180
11771181
Returns
11781182
-------
1179-
y : Category or None
1183+
y : Categorical or None
11801184
11811185
See Also
11821186
--------
1183-
Category.sort
1187+
Categorical.sort
1188+
1189+
Examples
1190+
--------
1191+
>>> c = pd.Categorical([1, 2, 2, 1, 5])
1192+
>>> c
1193+
[1, 2, 2, 1, 5]
1194+
Categories (3, int64): [1, 2, 5]
1195+
>>> c.sort_values()
1196+
[1, 1, 2, 2, 5]
1197+
Categories (3, int64): [1, 2, 5]
1198+
>>> c.sort_values(ascending=False)
1199+
[5, 2, 2, 1, 1]
1200+
Categories (3, int64): [1, 2, 5]
1201+
1202+
Inplace sorting can be done as well:
1203+
1204+
>>> c.sort_values(inplace=True)
1205+
>>> c
1206+
[1, 1, 2, 2, 5]
1207+
Categories (3, int64): [1, 2, 5]
1208+
>>>
1209+
>>> c = pd.Categorical([1, 2, 2, 1, 5])
1210+
1211+
'sort_values' behaviour with NaNs. Note that 'na_position'
1212+
is independent of the 'ascending' parameter:
1213+
1214+
>>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])
1215+
>>> c
1216+
[NaN, 2.0, 2.0, NaN, 5.0]
1217+
Categories (2, int64): [2, 5]
1218+
>>> c.sort_values()
1219+
[2.0, 2.0, 5.0, NaN, NaN]
1220+
Categories (2, int64): [2, 5]
1221+
>>> c.sort_values(ascending=False)
1222+
[5.0, 2.0, 2.0, NaN, NaN]
1223+
Categories (2, int64): [2, 5]
1224+
>>> c.sort_values(na_position='first')
1225+
[NaN, NaN, 2.0, 2.0, 5.0]
1226+
Categories (2, int64): [2, 5]
1227+
>>> c.sort_values(ascending=False, na_position='first')
1228+
[NaN, NaN, 5.0, 2.0, 2.0]
1229+
Categories (2, int64): [2, 5]
11841230
"""
11851231
if na_position not in ['last', 'first']:
11861232
raise ValueError('invalid na_position: {!r}'.format(na_position))
@@ -1193,13 +1239,13 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
11931239
na_mask = (codes == -1)
11941240
if na_mask.any():
11951241
n_nans = len(codes[na_mask])
1196-
if na_position == "first" and not ascending:
1242+
if na_position == "first":
11971243
# in this case sort to the front
11981244
new_codes = codes.copy()
11991245
new_codes[0:n_nans] = -1
12001246
new_codes[n_nans:] = codes[~na_mask]
12011247
codes = new_codes
1202-
elif na_position == "last" and not ascending:
1248+
elif na_position == "last":
12031249
# ... and to the end
12041250
new_codes = codes.copy()
12051251
pos = len(codes) - n_nans
@@ -1215,63 +1261,31 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
12151261

12161262
def order(self, inplace=False, ascending=True, na_position='last'):
12171263
"""
1218-
DEPRECATED: use :meth:`Categorical.sort_values`
1219-
1220-
Sorts the Category by category value returning a new Categorical by
1221-
default.
1222-
1223-
Only ordered Categoricals can be sorted!
1224-
1225-
Categorical.sort is the equivalent but sorts the Categorical inplace.
1226-
1227-
Parameters
1228-
----------
1229-
inplace : boolean, default False
1230-
Do operation in place.
1231-
ascending : boolean, default True
1232-
Sort ascending. Passing False sorts descending
1233-
na_position : {'first', 'last'} (optional, default='last')
1234-
'first' puts NaNs at the beginning
1235-
'last' puts NaNs at the end
1236-
1237-
Returns
1238-
-------
1239-
y : Category or None
1264+
DEPRECATED: use :meth:`Categorical.sort_values`. That function
1265+
is entirely equivalent to this one.
12401266
12411267
See Also
12421268
--------
1243-
Category.sort
1269+
Categorical.sort_values
12441270
"""
12451271
warn("order is deprecated, use sort_values(...)", FutureWarning,
12461272
stacklevel=2)
12471273
return self.sort_values(inplace=inplace, ascending=ascending,
12481274
na_position=na_position)
12491275

12501276
def sort(self, inplace=True, ascending=True, na_position='last'):
1251-
""" Sorts the Category inplace by category value.
1252-
1253-
Only ordered Categoricals can be sorted!
1254-
1255-
Catgorical.order is the equivalent but returns a new Categorical.
1256-
1257-
Parameters
1258-
----------
1259-
ascending : boolean, default True
1260-
Sort ascending. Passing False sorts descending
1261-
inplace : boolean, default False
1262-
Do operation in place.
1263-
na_position : {'first', 'last'} (optional, default='last')
1264-
'first' puts NaNs at the beginning
1265-
'last' puts NaNs at the end
1266-
1267-
Returns
1268-
-------
1269-
y : Category or None
1277+
"""
1278+
DEPRECATED: use :meth:`Categorical.sort_values`. That function
1279+
is just like this one, except that a new Categorical is returned
1280+
by default, so make sure to pass in 'inplace=True' to get
1281+
inplace sorting.
12701282
12711283
See Also
12721284
--------
1273-
Category.sort_values
1285+
Categorical.sort_values
12741286
"""
1287+
warn("sort is deprecated, use sort_values(...)", FutureWarning,
1288+
stacklevel=2)
12751289
return self.sort_values(inplace=inplace, ascending=ascending,
12761290
na_position=na_position)
12771291

pandas/tests/test_categorical.py

+61-46
Original file line numberDiff line numberDiff line change
@@ -1277,12 +1277,11 @@ def test_mode(self):
12771277
exp = Categorical([4], categories=[5, 4, 3, 2, 1], ordered=True)
12781278
self.assertTrue(res.equals(exp))
12791279

1280-
def test_sort(self):
1280+
def test_sort_values(self):
12811281

12821282
# unordered cats are sortable
12831283
cat = Categorical(["a", "b", "b", "a"], ordered=False)
12841284
cat.sort_values()
1285-
cat.sort()
12861285

12871286
cat = Categorical(["a", "c", "b", "d"], ordered=True)
12881287

@@ -1303,10 +1302,62 @@ def test_sort(self):
13031302

13041303
# sort (inplace order)
13051304
cat1 = cat.copy()
1306-
cat1.sort()
1305+
cat1.sort_values(inplace=True)
13071306
exp = np.array(["a", "b", "c", "d"], dtype=object)
13081307
self.assert_numpy_array_equal(cat1.__array__(), exp)
13091308

1309+
# reverse
1310+
cat = Categorical(["a", "c", "c", "b", "d"], ordered=True)
1311+
res = cat.sort_values(ascending=False)
1312+
exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object)
1313+
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
1314+
self.assert_numpy_array_equal(res.__array__(), exp_val)
1315+
self.assert_numpy_array_equal(res.categories, exp_categories)
1316+
1317+
def test_sort_values_na_position(self):
1318+
# see gh-12882
1319+
cat = Categorical([5, 2, np.nan, 2, np.nan], ordered=True)
1320+
exp_categories = np.array([2, 5])
1321+
1322+
exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
1323+
res = cat.sort_values() # default arguments
1324+
self.assert_numpy_array_equal(res.__array__(), exp)
1325+
self.assert_numpy_array_equal(res.categories, exp_categories)
1326+
1327+
exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0])
1328+
res = cat.sort_values(ascending=True, na_position='first')
1329+
self.assert_numpy_array_equal(res.__array__(), exp)
1330+
self.assert_numpy_array_equal(res.categories, exp_categories)
1331+
1332+
exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0])
1333+
res = cat.sort_values(ascending=False, na_position='first')
1334+
self.assert_numpy_array_equal(res.__array__(), exp)
1335+
self.assert_numpy_array_equal(res.categories, exp_categories)
1336+
1337+
exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
1338+
res = cat.sort_values(ascending=True, na_position='last')
1339+
self.assert_numpy_array_equal(res.__array__(), exp)
1340+
self.assert_numpy_array_equal(res.categories, exp_categories)
1341+
1342+
exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan])
1343+
res = cat.sort_values(ascending=False, na_position='last')
1344+
self.assert_numpy_array_equal(res.__array__(), exp)
1345+
self.assert_numpy_array_equal(res.categories, exp_categories)
1346+
1347+
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
1348+
res = cat.sort_values(ascending=False, na_position='last')
1349+
exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
1350+
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
1351+
self.assert_numpy_array_equal(res.__array__(), exp_val)
1352+
self.assert_numpy_array_equal(res.categories, exp_categories)
1353+
1354+
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
1355+
res = cat.sort_values(ascending=False, na_position='first')
1356+
exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
1357+
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
1358+
self.assert_numpy_array_equal(res.__array__(), exp_val)
1359+
self.assert_numpy_array_equal(res.categories, exp_categories)
1360+
13101361
def test_slicing_directly(self):
13111362
cat = Categorical(["a", "b", "c", "d", "a", "b", "c"])
13121363
sliced = cat[3]
@@ -2951,14 +3002,16 @@ def test_count(self):
29513002
result = s.count()
29523003
self.assertEqual(result, 2)
29533004

2954-
def test_sort(self):
3005+
def test_sort_values(self):
29553006

29563007
c = Categorical(["a", "b", "b", "a"], ordered=False)
2957-
cat = Series(c)
3008+
cat = Series(c.copy())
29583009

2959-
# 9816 deprecated
2960-
with tm.assert_produces_warning(FutureWarning):
2961-
c.order()
3010+
# 'order' was deprecated in gh-10726
3011+
# 'sort' was deprecated in gh-12882
3012+
for func in ('order', 'sort'):
3013+
with tm.assert_produces_warning(FutureWarning):
3014+
getattr(c, func)()
29623015

29633016
# sort in the categories order
29643017
expected = Series(
@@ -3024,44 +3077,6 @@ def test_sort(self):
30243077
expected = df.iloc[[2, 1, 5, 4, 3, 0]]
30253078
tm.assert_frame_equal(result, expected)
30263079

3027-
# reverse
3028-
cat = Categorical(["a", "c", "c", "b", "d"], ordered=True)
3029-
res = cat.sort_values(ascending=False)
3030-
exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object)
3031-
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
3032-
self.assert_numpy_array_equal(res.__array__(), exp_val)
3033-
self.assert_numpy_array_equal(res.categories, exp_categories)
3034-
3035-
# some NaN positions
3036-
3037-
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
3038-
res = cat.sort_values(ascending=False, na_position='last')
3039-
exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
3040-
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
3041-
self.assert_numpy_array_equal(res.__array__(), exp_val)
3042-
self.assert_numpy_array_equal(res.categories, exp_categories)
3043-
3044-
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
3045-
res = cat.sort_values(ascending=False, na_position='first')
3046-
exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
3047-
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
3048-
self.assert_numpy_array_equal(res.__array__(), exp_val)
3049-
self.assert_numpy_array_equal(res.categories, exp_categories)
3050-
3051-
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
3052-
res = cat.sort_values(ascending=False, na_position='first')
3053-
exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
3054-
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
3055-
self.assert_numpy_array_equal(res.__array__(), exp_val)
3056-
self.assert_numpy_array_equal(res.categories, exp_categories)
3057-
3058-
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
3059-
res = cat.sort_values(ascending=False, na_position='last')
3060-
exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
3061-
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
3062-
self.assert_numpy_array_equal(res.__array__(), exp_val)
3063-
self.assert_numpy_array_equal(res.categories, exp_categories)
3064-
30653080
def test_slicing(self):
30663081
cat = Series(Categorical([1, 2, 3, 4]))
30673082
reversed = cat[::-1]

0 commit comments

Comments
 (0)