Skip to content

Commit 3fe85af

Browse files
jeet63 authored and jreback committed
BUG: incorrect ranking in an ordered categorical
Check for categorical, and then pass the underlying integer codes.

closes #15420

Author: Prasanjit Prakash <[email protected]>

Closes #15422 from ikilledthecat/rank_categorical and squashes the following commits:

a7e573b [Prasanjit Prakash] moved test for categorical, in rank, to top
3ba4e3a [Prasanjit Prakash] corrections after rebasing
c43a029 [Prasanjit Prakash] using if/else construct to pick sorting function for categoricals
f8ec019 [Prasanjit Prakash] ask Categorical for ranking function
40d88c1 [Prasanjit Prakash] return values for rank from categorical object
049c0fc [Prasanjit Prakash] GH#15420 added support for na_option when ranking categorical
5e5bbeb [Prasanjit Prakash] BUG: GH#15420 rank for categoricals
ef999c3 [Prasanjit Prakash] merged with upstream master
fbaba1b [Prasanjit Prakash] return values for rank from categorical object
fa0b4c2 [Prasanjit Prakash] BUG: GH15420 - _rank private method on Categorical
9a6b5cd [Prasanjit Prakash] BUG: GH15420 - _rank private method on Categorical
4220e56 [Prasanjit Prakash] BUG: GH15420 - _rank private method on Categorical
6b70921 [Prasanjit Prakash] GH#15420 move rank inside categoricals
bf4e36c [Prasanjit Prakash] GH#15420 added support for na_option when ranking categorical
ce90207 [Prasanjit Prakash] BUG: GH#15420 rank for categoricals
85b267a [Prasanjit Prakash] Added support for categorical datatype in rank - issue#15420
1 parent 924c166 commit 3fe85af

File tree

4 files changed

+105
-1
lines changed

4 files changed

+105
-1
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -578,6 +578,7 @@ Bug Fixes
578578

579579

580580

581+
- Bug in ``.rank()`` which incorrectly ranks ordered categories (:issue:`15420`)
581582

582583

583584

pandas/core/algorithms.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -973,6 +973,10 @@ def _hashtable_algo(f, values, return_dtype=None):
973973
def _get_data_algo(values, func_map):
974974

975975
f = None
976+
977+
if is_categorical_dtype(values):
978+
values = values._values_for_rank()
979+
976980
if is_float_dtype(values):
977981
f = func_map['float64']
978982
values = _ensure_float64(values)
@@ -988,7 +992,6 @@ def _get_data_algo(values, func_map):
988992
elif is_unsigned_integer_dtype(values):
989993
f = func_map['uint64']
990994
values = _ensure_uint64(values)
991-
992995
else:
993996
values = _ensure_object(values)
994997

pandas/core/categorical.py

+22
Original file line numberDiff line numberDiff line change
@@ -1404,6 +1404,28 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
14041404
return self._constructor(values=codes, categories=self.categories,
14051405
ordered=self.ordered, fastpath=True)
14061406

1407+
def _values_for_rank(self):
    """
    Return the values that ranking should operate on. See GH#15420

    Ordered categorical data is ranked on the basis of its integer
    codes, with the code -1 translated to NaN. Unordered categorical
    data is ranked on the array of its values instead.

    Returns
    -------
    numpy array
        For ordered data: ``self.codes`` itself when no code equals -1,
        otherwise a float64 copy of the codes with NaN in place of
        every -1. For unordered data: ``np.array(self)``.
    """
    if self.ordered:
        values = self.codes
        mask = values == -1
        if mask.any():
            # Integer codes cannot represent NaN, so switch to float64.
            # astype returns a new array, which keeps the assignment
            # below from touching the categorical's own codes.
            values = values.astype('float64')
            values[mask] = np.nan
    else:
        values = np.array(self)
    return values
1428+
14071429
def order(self, inplace=False, ascending=True, na_position='last'):
14081430
"""
14091431
DEPRECATED: use :meth:`Categorical.sort_values`. That function

pandas/tests/series/test_analytics.py

+78
Original file line numberDiff line numberDiff line change
@@ -1057,6 +1057,84 @@ def test_rank(self):
10571057
iranks = iseries.rank()
10581058
assert_series_equal(iranks, exp)
10591059

1060+
def test_rank_categorical(self):
1061+
# GH issue #15420 rank incorrectly orders ordered categories
1062+
1063+
# Test ascending/descending ranking for ordered categoricals
1064+
exp = pd.Series([1., 2., 3., 4., 5., 6.])
1065+
exp_desc = pd.Series([6., 5., 4., 3., 2., 1.])
1066+
ordered = pd.Series(
1067+
['first', 'second', 'third', 'fourth', 'fifth', 'sixth']
1068+
).astype('category', ).cat.set_categories(
1069+
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
1070+
ordered=True
1071+
)
1072+
assert_series_equal(ordered.rank(), exp)
1073+
assert_series_equal(ordered.rank(ascending=False), exp_desc)
1074+
1075+
# Unordered categoricals should be ranked as objects
1076+
unordered = pd.Series(
1077+
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
1078+
).astype('category').cat.set_categories(
1079+
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
1080+
ordered=False
1081+
)
1082+
exp_unordered = pd.Series([2., 4., 6., 3., 1., 5.])
1083+
res = unordered.rank()
1084+
assert_series_equal(res, exp_unordered)
1085+
1086+
# Test na_option for rank data
1087+
na_ser = pd.Series(
1088+
['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN]
1089+
).astype('category', ).cat.set_categories(
1090+
[
1091+
'first', 'second', 'third', 'fourth',
1092+
'fifth', 'sixth', 'seventh'
1093+
],
1094+
ordered=True
1095+
)
1096+
1097+
exp_top = pd.Series([2., 3., 4., 5., 6., 7., 1.])
1098+
exp_bot = pd.Series([1., 2., 3., 4., 5., 6., 7.])
1099+
exp_keep = pd.Series([1., 2., 3., 4., 5., 6., np.NaN])
1100+
1101+
assert_series_equal(na_ser.rank(na_option='top'), exp_top)
1102+
assert_series_equal(na_ser.rank(na_option='bottom'), exp_bot)
1103+
assert_series_equal(na_ser.rank(na_option='keep'), exp_keep)
1104+
1105+
# Test na_option for rank data with ascending False
1106+
exp_top = pd.Series([7., 6., 5., 4., 3., 2., 1.])
1107+
exp_bot = pd.Series([6., 5., 4., 3., 2., 1., 7.])
1108+
exp_keep = pd.Series([6., 5., 4., 3., 2., 1., np.NaN])
1109+
1110+
assert_series_equal(
1111+
na_ser.rank(na_option='top', ascending=False),
1112+
exp_top
1113+
)
1114+
assert_series_equal(
1115+
na_ser.rank(na_option='bottom', ascending=False),
1116+
exp_bot
1117+
)
1118+
assert_series_equal(
1119+
na_ser.rank(na_option='keep', ascending=False),
1120+
exp_keep
1121+
)
1122+
1123+
# Test with pct=True
1124+
na_ser = pd.Series(
1125+
['first', 'second', 'third', 'fourth', np.NaN],
1126+
).astype('category').cat.set_categories(
1127+
['first', 'second', 'third', 'fourth'],
1128+
ordered=True
1129+
)
1130+
exp_top = pd.Series([0.4, 0.6, 0.8, 1., 0.2])
1131+
exp_bot = pd.Series([0.2, 0.4, 0.6, 0.8, 1.])
1132+
exp_keep = pd.Series([0.25, 0.5, 0.75, 1., np.NaN])
1133+
1134+
assert_series_equal(na_ser.rank(na_option='top', pct=True), exp_top)
1135+
assert_series_equal(na_ser.rank(na_option='bottom', pct=True), exp_bot)
1136+
assert_series_equal(na_ser.rank(na_option='keep', pct=True), exp_keep)
1137+
10601138
def test_rank_signature(self):
10611139
s = Series([0, 1])
10621140
s.rank(method='average')

0 commit comments

Comments
 (0)