Skip to content

Commit 9f3bb24

Browse files
committed
GH#15420 move rank inside categoricals
1 parent 68b42f6 commit 9f3bb24

File tree

3 files changed

+87
-26
lines changed

3 files changed

+87
-26
lines changed

pandas/core/algorithms.py

-6
Original file line numberDiff line numberDiff line change
@@ -988,12 +988,6 @@ def _get_data_algo(values, func_map):
988988
elif is_unsigned_integer_dtype(values):
989989
f = func_map['uint64']
990990
values = _ensure_uint64(values)
991-
992-
elif is_categorical_dtype(values) and values.ordered:
993-
nanMapper = np.vectorize(lambda t: np.NaN if t == -1 else t*1.)
994-
f = func_map['float64']
995-
values = _ensure_float64(nanMapper(values.codes))
996-
997991
else:
998992
values = _ensure_object(values)
999993

pandas/core/categorical.py

+49
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
_coerce_indexer_dtype)
1515
from pandas.types.dtypes import CategoricalDtype
1616
from pandas.types.common import (_ensure_int64,
17+
_ensure_float64,
1718
_ensure_object,
1819
_ensure_platform_int,
1920
is_dtype_equal,
@@ -1364,6 +1365,54 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
13641365
return self._constructor(values=codes, categories=self.categories,
13651366
ordered=self.ordered, fastpath=True)
13661367

1368+
def rank(self, method='average', na_option='keep',
         ascending=True, pct=False):
    """
    Rank the values of this Categorical.

    Ordered categoricals are ranked by their integer codes, i.e. by
    category order. Unordered categoricals are converted to objects and
    ranked as such.

    Parameters
    ----------
    method : {'average', 'min', 'max', 'first', 'dense'},
        default 'average'
        The method by which ties are broken during the ranking.
    na_option : {'keep', 'top', 'bottom'}, default 'keep'
        How missing values are placed in the ranking.
        - ``keep``: rank each missing value with a NaN ranking
        - ``top``: missing values receive the smallest ranks
        - ``bottom``: missing values receive the largest ranks
    ascending : boolean, default True
        Whether or not the elements should be ranked in ascending order.
    pct : boolean, default False
        Whether to return the rankings in ordinal form
        (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).

    Returns
    -------
    ranks : Series
        The computed ranks, wrapped in a new Series.

    Raises
    ------
    ValueError
        If ``na_option`` is not one of 'keep', 'top' or 'bottom'.
    """
    # Imported locally rather than at module level; presumably this
    # avoids a circular import with pandas.core.series -- TODO confirm.
    from pandas.core.series import Series

    if na_option not in ('keep', 'top', 'bottom'):
        raise ValueError('invalid na_option: {!r}'.format(na_option))

    if self._ordered:
        # ``astype`` already returns a new array, so the stored codes are
        # never mutated and no separate ``copy`` is needed.
        codes = self._codes.astype(float)
        # A code of -1 marks a missing value; expose it as NaN so the
        # float ranking routine can apply ``na_option``.
        codes[codes == -1] = np.nan
        ranks = _algos.rank_1d_float64(
            _ensure_float64(codes), ties_method=method,
            na_option=na_option, ascending=ascending, pct=pct
        )
    else:
        # No category order to honour: rank the values as plain objects.
        values = _ensure_object(self)
        ranks = _algos.rank_1d_object(
            values, ties_method=method,
            na_option=na_option, ascending=ascending, pct=pct
        )
    return Series(ranks)
1415+
13671416
def order(self, inplace=False, ascending=True, na_position='last'):
13681417
"""
13691418
DEPRECATED: use :meth:`Categorical.sort_values`. That function

pandas/tests/series/test_analytics.py

+38-20
Original file line numberDiff line numberDiff line change
@@ -1057,59 +1057,77 @@ def test_rank(self):
10571057
iranks = iseries.rank()
10581058
assert_series_equal(iranks, exp)
10591059

1060+
def test_rank_categorical(self):
10601061
# GH issue #15420 rank incorrectly orders ordered categories
1061-
1062+
10621063
# Test ascending/descending ranking for ordered categoricals
10631064
exp = pd.Series([1., 2., 3., 4., 5., 6.])
10641065
exp_desc = pd.Series([6., 5., 4., 3., 2., 1.])
1065-
ser = pd.Series(
1066-
['first', 'second', 'third', 'fourth', 'fifth', 'sixth']
1067-
)
1068-
ordered = ser.astype('category', ).cat.set_categories(
1066+
ordered = pd.Categorical(
1067+
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
10691068
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
10701069
ordered=True
10711070
)
10721071
assert_series_equal(ordered.rank(), exp)
10731072
assert_series_equal(ordered.rank(ascending=False), exp_desc)
10741073

10751074
# Unordered categoricals should be ranked as objects
1076-
unordered = ser.astype('category', ).cat.set_categories(
1077-
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
1075+
unord_ser = pd.Series(['first', 'second', 'third', 'fourth'])
1076+
unordered = pd.Categorical(
1077+
['first', 'second', 'third', 'fourth'],
1078+
['first', 'second', 'third', 'fourth'],
10781079
ordered=False
10791080
)
10801081
res = unordered.rank()
1081-
assert_series_equal(res, unordered.astype(object).rank())
1082+
assert_series_equal(res, unord_ser.astype(object).rank())
10821083

10831084
# Test na_option for rank data
1084-
na_ser = pd.Series(
1085-
['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN]
1086-
).astype('category', ).cat.set_categories(
1087-
[
1088-
'first', 'second', 'third', 'fourth',
1089-
'fifth', 'sixth', 'seventh'
1090-
],
1085+
na_ser = pd.Categorical(
1086+
['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN],
1087+
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
10911088
ordered=True
10921089
)
10931090

10941091
exp_top = pd.Series([2., 3., 4., 5., 6., 7., 1.])
10951092
exp_bot = pd.Series([1., 2., 3., 4., 5., 6., 7.])
10961093
exp_keep = pd.Series([1., 2., 3., 4., 5., 6., np.NaN])
10971094

1095+
assert_series_equal(na_ser.rank(na_option='top'), exp_top)
1096+
assert_series_equal(na_ser.rank(na_option='bottom'), exp_bot)
1097+
assert_series_equal(na_ser.rank(na_option='keep'), exp_keep)
1098+
1099+
# Test na_option for rank data with ascending False
1100+
exp_top = pd.Series([7., 6., 5., 4., 3., 2., 1.])
1101+
exp_bot = pd.Series([6., 5., 4., 3., 2., 1., 7.])
1102+
exp_keep = pd.Series([6., 5., 4., 3., 2., 1., np.NaN])
1103+
10981104
assert_series_equal(
1099-
na_ser.rank(na_option='top'),
1105+
na_ser.rank(na_option='top', ascending=False),
11001106
exp_top
11011107
)
1102-
11031108
assert_series_equal(
1104-
na_ser.rank(na_option='bottom'),
1109+
na_ser.rank(na_option='bottom', ascending=False),
11051110
exp_bot
11061111
)
1107-
11081112
assert_series_equal(
1109-
na_ser.rank(na_option='keep'),
1113+
na_ser.rank(na_option='keep', ascending=False),
11101114
exp_keep
11111115
)
11121116

1117+
# Test with pct=True
1118+
na_ser = pd.Categorical(
1119+
['first', 'second', 'third', 'fourth', np.NaN],
1120+
['first', 'second', 'third', 'fourth'],
1121+
ordered=True
1122+
)
1123+
exp_top = pd.Series([0.4, 0.6, 0.8, 1., 0.2])
1124+
exp_bot = pd.Series([0.2, 0.4, 0.6, 0.8, 1.])
1125+
exp_keep = pd.Series([0.25, 0.5, 0.75, 1., np.NaN])
1126+
1127+
assert_series_equal(na_ser.rank(na_option='top', pct=True), exp_top)
1128+
assert_series_equal(na_ser.rank(na_option='bottom', pct=True), exp_bot)
1129+
assert_series_equal(na_ser.rank(na_option='keep', pct=True), exp_keep)
1130+
11131131
def test_rank_signature(self):
11141132
s = Series([0, 1])
11151133
s.rank(method='average')

0 commit comments

Comments
 (0)