Skip to content

Commit 4220e56

Browse files
committed
BUG: GH15420 - _rank private method on Categorical
1 parent 6b70921 commit 4220e56

File tree

3 files changed

+58
-71
lines changed

3 files changed

+58
-71
lines changed

pandas/core/algorithms.py

+33-23
Original file line numberDiff line numberDiff line change
@@ -598,29 +598,39 @@ def mode(values):
598598
def rank(values, axis=0, method='average', na_option='keep',
599599
ascending=True, pct=False):
600600
"""
601-
Rank the values along a given axis.
602-
603-
Parameters
604-
----------
605-
values : array-like
606-
Array whose values will be ranked. The number of dimensions in this
607-
array must not exceed 2.
608-
axis : int, default 0
609-
Axis over which to perform rankings.
610-
method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
611-
The method by which tiebreaks are broken during the ranking.
612-
na_option : {'keep', 'top'}, default 'keep'
613-
The method by which NaNs are placed in the ranking.
614-
- ``keep``: rank each NaN value with a NaN ranking
615-
- ``top``: replace each NaN with either +/- inf so that they
616-
are ranked at the top
617-
ascending : boolean, default True
618-
Whether or not the elements should be ranked in ascending order.
619-
pct : boolean, default False
620-
Whether or not to display the returned rankings in integer form
621-
(e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
622-
"""
623-
if values.ndim == 1:
601+
Compute numerical data ranks (1 through n) along axis. Equal values are
602+
assigned a rank that is the average of the ranks of those values (under the default ``method='average'``).
603+
604+
Parameters
605+
----------
606+
axis : {0 or 'index', 1 or 'columns'}, default 0
607+
Axis along which to compute the ranking.
608+
method : {'average', 'min', 'max', 'first', 'dense'}
609+
* average: average rank of group
610+
* min: lowest rank in group
611+
* max: highest rank in group
612+
* first: ranks assigned in order they appear in the array
613+
* dense: like 'min', but rank always increases by 1 between groups
614+
numeric_only : boolean, default None
615+
Include only float, int, boolean data. Valid only for DataFrame or
616+
Panel objects. NOTE: ``numeric_only`` is not a parameter in this function's signature.
617+
na_option : {'keep', 'top', 'bottom'}
618+
* keep: leave NA values where they are
619+
* top: smallest rank if ascending
620+
* bottom: smallest rank if descending
621+
ascending : boolean, default True
622+
False for ranks by high (1) to low (N)
623+
pct : boolean, default False
624+
Computes percentage rank of data
625+
626+
Returns
627+
-------
628+
ranks : same type as caller
629+
"""
630+
if is_categorical(values):
631+
ranks = values._rank(axis=axis, method=method, ascending=ascending,
632+
na_option=na_option, pct=pct)
633+
elif values.ndim == 1:
624634
f, values = _get_data_algo(values, _rank1d_functions)
625635
ranks = f(values, ties_method=method, ascending=ascending,
626636
na_option=na_option, pct=pct)

pandas/core/categorical.py

+14-40
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
_coerce_indexer_dtype)
1515
from pandas.types.dtypes import CategoricalDtype
1616
from pandas.types.common import (_ensure_int64,
17-
_ensure_float64,
1817
_ensure_object,
1918
_ensure_platform_int,
2019
is_dtype_equal,
@@ -1405,53 +1404,28 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
14051404
return self._constructor(values=codes, categories=self.categories,
14061405
ordered=self.ordered, fastpath=True)
14071406

1408-
def rank(self, method='average', na_option='keep',
1409-
ascending=True, pct=False):
1407+
def _rank(self, *args, **kwargs):
14101408
"""
1411-
Rank the values along a given axis.
1409+
For correctly ranking ordered categorical data. See GH#15420
1410+
1411+
Ordered categorical data should be ranked on the basis of
1412+
codes.
1413+
1414+
Returns
1415+
-------
1416+
numpy array
14121417
1413-
Parameters
1414-
----------
1415-
values : array-like
1416-
Array whose values will be ranked. The number of dimensions in this
1417-
array must not exceed 2.
1418-
method : {'average', 'min', 'max', 'first', 'dense'},
1419-
default 'average'
1420-
The method by which tiebreaks are broken during the ranking.
1421-
na_option : {'keep', 'top'}, default 'keep'
1422-
The method by which NaNs are placed in the ranking.
1423-
- ``keep``: rank each NaN value with a NaN ranking
1424-
- ``top``: replace each NaN with either +/- inf so that they
1425-
are ranked at the top
1426-
- ``bottom``: replace each NaN with either +/- inf so that they
1427-
are ranked at the bottom
1428-
ascending : boolean, default True
1429-
Whether or not the elements should be ranked in ascending order.
1430-
pct : boolean, default False
1431-
Whether or not to display the returned rankings in integer form
1432-
(e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
14331418
"""
1434-
from pandas.core.series import Series
1435-
if na_option not in ['keep', 'top', 'bottom']:
1436-
raise ValueError('invalid na_position: {!r}'.format(na_option))
1419+
from pandas.core.algorithms import rank
14371420

1438-
codes = self._codes.copy()
1439-
codes = codes.astype(float)
14401421
if self._ordered:
1422+
codes = self._codes.astype('float64')
14411423
na_mask = (codes == -1)
14421424
codes[na_mask] = np.nan
1443-
codes = _ensure_float64(codes)
1444-
ranks = _algos.rank_1d_float64(
1445-
codes, ties_method=method,
1446-
na_option=na_option, ascending=ascending, pct=pct
1447-
)
1425+
ranks = rank(codes, *args, **kwargs)
14481426
else:
1449-
values = _ensure_object(self)
1450-
ranks = _algos.rank_1d_object(
1451-
values, ties_method=method,
1452-
na_option=na_option, ascending=ascending, pct=pct
1453-
)
1454-
return Series(ranks)
1427+
ranks = rank(self.astype('object'), *args, **kwargs)
1428+
return ranks
14551429

14561430
def order(self, inplace=False, ascending=True, na_position='last'):
14571431
"""

pandas/tests/series/test_analytics.py

+11-8
Original file line numberDiff line numberDiff line change
@@ -1063,27 +1063,29 @@ def test_rank_categorical(self):
10631063
# Test ascending/descending ranking for ordered categoricals
10641064
exp = pd.Series([1., 2., 3., 4., 5., 6.])
10651065
exp_desc = pd.Series([6., 5., 4., 3., 2., 1.])
1066-
ordered = pd.Categorical(
1066+
ordered = pd.Series(
10671067
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
1068+
).astype('category').cat.set_categories(
10681069
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
10691070
ordered=True
10701071
)
10711072
assert_series_equal(ordered.rank(), exp)
10721073
assert_series_equal(ordered.rank(ascending=False), exp_desc)
10731074

10741075
# Unordered categoricals should be ranked as objects
1075-
unord_ser = pd.Series(['first', 'second', 'third', 'fourth'])
1076-
unordered = pd.Categorical(
1077-
['first', 'second', 'third', 'fourth'],
1078-
['first', 'second', 'third', 'fourth'],
1076+
unordered = pd.Series(
1077+
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
1078+
).astype('category').cat.set_categories(
1079+
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
10791080
ordered=False
10801081
)
10811082
res = unordered.rank()
1082-
assert_series_equal(res, unord_ser.astype(object).rank())
1083+
assert_series_equal(res, unordered.astype(object).rank())
10831084

10841085
# Test na_option for rank data
1085-
na_ser = pd.Categorical(
1086+
na_ser = pd.Series(
10861087
['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN],
1088+
).astype('category').cat.set_categories(
10871089
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
10881090
ordered=True
10891091
)
@@ -1115,8 +1117,9 @@ def test_rank_categorical(self):
11151117
)
11161118

11171119
# Test with pct=True
1118-
na_ser = pd.Categorical(
1120+
na_ser = pd.Series(
11191121
['first', 'second', 'third', 'fourth', np.NaN],
1122+
).astype('category').cat.set_categories(
11201123
['first', 'second', 'third', 'fourth'],
11211124
ordered=True
11221125
)

0 commit comments

Comments
 (0)