Skip to content

Commit 6b70921

Browse files
committed
GH#15420 move rank inside categoricals
1 parent bf4e36c commit 6b70921

File tree

3 files changed

+87
-26
lines changed

3 files changed

+87
-26
lines changed

pandas/core/algorithms.py

-6
Original file line numberDiff line numberDiff line change
@@ -988,12 +988,6 @@ def _get_data_algo(values, func_map):
988988
elif is_unsigned_integer_dtype(values):
989989
f = func_map['uint64']
990990
values = _ensure_uint64(values)
991-
992-
elif is_categorical_dtype(values) and values.ordered:
993-
nanMapper = np.vectorize(lambda t: np.NaN if t == -1 else t*1.)
994-
f = func_map['float64']
995-
values = _ensure_float64(nanMapper(values.codes))
996-
997991
else:
998992
values = _ensure_object(values)
999993

pandas/core/categorical.py

+49
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
_coerce_indexer_dtype)
1515
from pandas.types.dtypes import CategoricalDtype
1616
from pandas.types.common import (_ensure_int64,
17+
_ensure_float64,
1718
_ensure_object,
1819
_ensure_platform_int,
1920
is_dtype_equal,
@@ -1404,6 +1405,54 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
14041405
return self._constructor(values=codes, categories=self.categories,
14051406
ordered=self.ordered, fastpath=True)
14061407

1408+
def rank(self, method='average', na_option='keep',
         ascending=True, pct=False):
    """
    Rank the values of this Categorical.

    Ordered categoricals are ranked by their integer category codes, so
    the ranking follows the category order. Unordered categoricals are
    converted to an object array and ranked as objects.

    Parameters
    ----------
    method : {'average', 'min', 'max', 'first', 'dense'},
        default 'average'
        The method by which ties are broken during the ranking. It is
        forwarded unchanged to the ranking routine as ``ties_method``.
    na_option : {'keep', 'top', 'bottom'}, default 'keep'
        The method by which NaNs are placed in the ranking.

        - ``keep``: rank each NaN value with a NaN ranking
        - ``top``: rank NaN values at the top
        - ``bottom``: rank NaN values at the bottom
    ascending : boolean, default True
        Whether or not the elements should be ranked in ascending order.
    pct : boolean, default False
        Whether to return the rankings as ordinal ranks (e.g. 1, 2, 3)
        or in percentile form (e.g. 0.333..., 0.666..., 1).

    Returns
    -------
    Series
        The computed ranks wrapped in a new Series (default index).

    Raises
    ------
    ValueError
        If ``na_option`` is not one of 'keep', 'top' or 'bottom'.
    """
    # Function-scope import kept from the original implementation
    # (presumably to avoid a circular import with pandas.core.series).
    from pandas.core.series import Series

    if na_option not in ('keep', 'top', 'bottom'):
        # Report the parameter under its real name (was 'na_position').
        raise ValueError('invalid na_option: {!r}'.format(na_option))

    if self._ordered:
        # ``astype`` always returns a new array, so ``self._codes`` is never
        # mutated and no separate ``copy()`` is required.
        codes = self._codes.astype(float)
        # A code of -1 marks a missing value; expose it as NaN so that
        # ``na_option`` is honoured by the float ranking routine.
        codes[codes == -1] = np.nan
        ranks = _algos.rank_1d_float64(
            _ensure_float64(codes), ties_method=method,
            na_option=na_option, ascending=ascending, pct=pct
        )
    else:
        # No meaningful category order: rank the values as plain objects.
        values = _ensure_object(self)
        ranks = _algos.rank_1d_object(
            values, ties_method=method,
            na_option=na_option, ascending=ascending, pct=pct
        )
    return Series(ranks)
1455+
14071456
def order(self, inplace=False, ascending=True, na_position='last'):
14081457
"""
14091458
DEPRECATED: use :meth:`Categorical.sort_values`. That function

pandas/tests/series/test_analytics.py

+38-20
Original file line numberDiff line numberDiff line change
@@ -1057,59 +1057,77 @@ def test_rank(self):
10571057
iranks = iseries.rank()
10581058
assert_series_equal(iranks, exp)
10591059

1060+
def test_rank_categorical(self):
    # GH issue #15420 rank incorrectly orders ordered categories
    #
    # Exercises Categorical.rank directly: ordered categoricals must be
    # ranked by category position, unordered ones like plain objects, and
    # na_option / ascending / pct must all be honoured.
    six = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth']
    four = ['first', 'second', 'third', 'fourth']

    # Ascending / descending ranking for ordered categoricals
    ranks_asc = pd.Series([1., 2., 3., 4., 5., 6.])
    ranks_desc = pd.Series([6., 5., 4., 3., 2., 1.])
    ordered = pd.Categorical(six, six, ordered=True)
    assert_series_equal(ordered.rank(), ranks_asc)
    assert_series_equal(ordered.rank(ascending=False), ranks_desc)

    # Unordered categoricals should be ranked as objects
    plain = pd.Series(four)
    unordered = pd.Categorical(four, four, ordered=False)
    assert_series_equal(unordered.rank(), plain.astype(object).rank())

    # na_option with the default (ascending) direction
    with_nan = pd.Categorical(six + [np.NaN], six, ordered=True)
    ascending_cases = [
        ('top', [2., 3., 4., 5., 6., 7., 1.]),
        ('bottom', [1., 2., 3., 4., 5., 6., 7.]),
        ('keep', [1., 2., 3., 4., 5., 6., np.NaN]),
    ]
    for option, expected in ascending_cases:
        assert_series_equal(with_nan.rank(na_option=option),
                            pd.Series(expected))

    # na_option with ascending=False
    descending_cases = [
        ('top', [7., 6., 5., 4., 3., 2., 1.]),
        ('bottom', [6., 5., 4., 3., 2., 1., 7.]),
        ('keep', [6., 5., 4., 3., 2., 1., np.NaN]),
    ]
    for option, expected in descending_cases:
        assert_series_equal(
            with_nan.rank(na_option=option, ascending=False),
            pd.Series(expected)
        )

    # na_option with pct=True
    with_nan = pd.Categorical(four + [np.NaN], four, ordered=True)
    pct_cases = [
        ('top', [0.4, 0.6, 0.8, 1., 0.2]),
        ('bottom', [0.2, 0.4, 0.6, 0.8, 1.]),
        ('keep', [0.25, 0.5, 0.75, 1., np.NaN]),
    ]
    for option, expected in pct_cases:
        assert_series_equal(with_nan.rank(na_option=option, pct=True),
                            pd.Series(expected))
1130+
11131131
def test_rank_signature(self):
11141132
s = Series([0, 1])
11151133
s.rank(method='average')

0 commit comments

Comments
 (0)