Skip to content

Commit 9e052aa

Browse files
committed
BUG: GH15420 - _rank private method on Categorical
1 parent 9f3bb24 commit 9e052aa

File tree

3 files changed

+58
-71
lines changed

3 files changed

+58
-71
lines changed

pandas/core/algorithms.py

+33-23
Original file line numberDiff line numberDiff line change
@@ -598,29 +598,39 @@ def mode(values):
598598
def rank(values, axis=0, method='average', na_option='keep',
599599
ascending=True, pct=False):
600600
"""
601-
Rank the values along a given axis.
602-
603-
Parameters
604-
----------
605-
values : array-like
606-
Array whose values will be ranked. The number of dimensions in this
607-
array must not exceed 2.
608-
axis : int, default 0
609-
Axis over which to perform rankings.
610-
method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
611-
The method by which tiebreaks are broken during the ranking.
612-
na_option : {'keep', 'top'}, default 'keep'
613-
The method by which NaNs are placed in the ranking.
614-
- ``keep``: rank each NaN value with a NaN ranking
615-
- ``top``: replace each NaN with either +/- inf so that they
616-
are ranked at the top
617-
ascending : boolean, default True
618-
Whether or not the elements should be ranked in ascending order.
619-
pct : boolean, default False
620-
Whether or not to display the returned rankings in integer form
621-
(e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
622-
"""
623-
if values.ndim == 1:
601+
Compute numerical data ranks (1 through n) along axis. Equal values are
602+
assigned a rank that is the average of the ranks of those values.
603+
604+
Parameters
605+
----------
606+
axis : {0 or 'index', 1 or 'columns'}, default 0
607+
axis along which the ranks are computed
608+
method : {'average', 'min', 'max', 'first', 'dense'}
609+
* average: average rank of group
610+
* min: lowest rank in group
611+
* max: highest rank in group
612+
* first: ranks assigned in order they appear in the array
613+
* dense: like 'min', but rank always increases by 1 between groups
614+
numeric_only : boolean, default None
615+
Include only float, int, boolean data. Valid only for DataFrame or
616+
Panel objects
617+
na_option : {'keep', 'top', 'bottom'}
618+
* keep: leave NA values where they are
619+
* top: assign the lowest rank to NA values
620+
* bottom: assign the highest rank to NA values
621+
ascending : boolean, default True
622+
False for ranks by high (1) to low (N)
623+
pct : boolean, default False
624+
Computes percentage rank of data
625+
626+
Returns
627+
-------
628+
ranks : same type as caller
629+
"""
630+
if is_categorical(values):
631+
ranks = values._rank(axis=axis, method=method, ascending=ascending,
632+
na_option=na_option, pct=pct)
633+
elif values.ndim == 1:
624634
f, values = _get_data_algo(values, _rank1d_functions)
625635
ranks = f(values, ties_method=method, ascending=ascending,
626636
na_option=na_option, pct=pct)

pandas/core/categorical.py

+14-40
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
_coerce_indexer_dtype)
1515
from pandas.types.dtypes import CategoricalDtype
1616
from pandas.types.common import (_ensure_int64,
17-
_ensure_float64,
1817
_ensure_object,
1918
_ensure_platform_int,
2019
is_dtype_equal,
@@ -1365,53 +1364,28 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
13651364
return self._constructor(values=codes, categories=self.categories,
13661365
ordered=self.ordered, fastpath=True)
13671366

1368-
def rank(self, method='average', na_option='keep',
1369-
ascending=True, pct=False):
1367+
def _rank(self, *args, **kwargs):
13701368
"""
1371-
Rank the values along a given axis.
1369+
For correctly ranking ordered categorical data. See GH#15420
1370+
1371+
Ordered categorical data should be ranked on the basis of
1372+
codes.
1373+
1374+
Returns
1375+
-------
1376+
numpy array
13721377
1373-
Parameters
1374-
----------
1375-
values : array-like
1376-
Array whose values will be ranked. The number of dimensions in this
1377-
array must not exceed 2.
1378-
method : {'average', 'min', 'max', 'first', 'dense'},
1379-
default 'average'
1380-
The method by which tiebreaks are broken during the ranking.
1381-
na_option : {'keep', 'top'}, default 'keep'
1382-
The method by which NaNs are placed in the ranking.
1383-
- ``keep``: rank each NaN value with a NaN ranking
1384-
- ``top``: replace each NaN with either +/- inf so that they
1385-
are ranked at the top
1386-
- ``bottom``: replace each NaN with either +/- inf so that they
1387-
are ranked at the bottom
1388-
ascending : boolean, default True
1389-
Whether or not the elements should be ranked in ascending order.
1390-
pct : boolean, default False
1391-
Whether or not to display the returned rankings in integer form
1392-
(e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
13931378
"""
1394-
from pandas.core.series import Series
1395-
if na_option not in ['keep', 'top', 'bottom']:
1396-
raise ValueError('invalid na_position: {!r}'.format(na_option))
1379+
from pandas.core.algorithms import rank
13971380

1398-
codes = self._codes.copy()
1399-
codes = codes.astype(float)
14001381
if self._ordered:
1382+
codes = self._codes.astype('float64')
14011383
na_mask = (codes == -1)
14021384
codes[na_mask] = np.nan
1403-
codes = _ensure_float64(codes)
1404-
ranks = _algos.rank_1d_float64(
1405-
codes, ties_method=method,
1406-
na_option=na_option, ascending=ascending, pct=pct
1407-
)
1385+
ranks = rank(codes, *args, **kwargs)
14081386
else:
1409-
values = _ensure_object(self)
1410-
ranks = _algos.rank_1d_object(
1411-
values, ties_method=method,
1412-
na_option=na_option, ascending=ascending, pct=pct
1413-
)
1414-
return Series(ranks)
1387+
ranks = rank(self.astype('object'), *args, **kwargs)
1388+
return ranks
14151389

14161390
def order(self, inplace=False, ascending=True, na_position='last'):
14171391
"""

pandas/tests/series/test_analytics.py

+11-8
Original file line numberDiff line numberDiff line change
@@ -1063,27 +1063,29 @@ def test_rank_categorical(self):
10631063
# Test ascending/descending ranking for ordered categoricals
10641064
exp = pd.Series([1., 2., 3., 4., 5., 6.])
10651065
exp_desc = pd.Series([6., 5., 4., 3., 2., 1.])
1066-
ordered = pd.Categorical(
1066+
ordered = pd.Series(
10671067
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
1068+
).astype('category').cat.set_categories(
10681069
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
10691070
ordered=True
10701071
)
10711072
assert_series_equal(ordered.rank(), exp)
10721073
assert_series_equal(ordered.rank(ascending=False), exp_desc)
10731074

10741075
# Unordered categoricals should be ranked as objects
1075-
unord_ser = pd.Series(['first', 'second', 'third', 'fourth'])
1076-
unordered = pd.Categorical(
1077-
['first', 'second', 'third', 'fourth'],
1078-
['first', 'second', 'third', 'fourth'],
1076+
unordered = pd.Series(
1077+
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
1078+
).astype('category').cat.set_categories(
1079+
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
10791080
ordered=False
10801081
)
10811082
res = unordered.rank()
1082-
assert_series_equal(res, unord_ser.astype(object).rank())
1083+
assert_series_equal(res, unordered.astype(object).rank())
10831084

10841085
# Test na_option for rank data
1085-
na_ser = pd.Categorical(
1086+
na_ser = pd.Series(
10861087
['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN],
1088+
).astype('category').cat.set_categories(
10871089
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
10881090
ordered=True
10891091
)
@@ -1115,8 +1117,9 @@ def test_rank_categorical(self):
11151117
)
11161118

11171119
# Test with pct=True
1118-
na_ser = pd.Categorical(
1120+
na_ser = pd.Series(
11191121
['first', 'second', 'third', 'fourth', np.NaN],
1122+
).astype('category').cat.set_categories(
11201123
['first', 'second', 'third', 'fourth'],
11211124
ordered=True
11221125
)

0 commit comments

Comments
 (0)