BUG: GH15420 - _rank private method on Categorical

jeet63 · jeet63 · commit 4220e565d834 · 2017-02-24T17:05:50.000+05:30
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -598,29 +598,39 @@ def mode(values):
 def rank(values, axis=0, method='average', na_option='keep',
          ascending=True, pct=False):
     """
-    Rank the values along a given axis.
-
-    Parameters
-    ----------
-    values : array-like
-        Array whose values will be ranked. The number of dimensions in this
-        array must not exceed 2.
-    axis : int, default 0
-        Axis over which to perform rankings.
-    method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
-        The method by which tiebreaks are broken during the ranking.
-    na_option : {'keep', 'top'}, default 'keep'
-        The method by which NaNs are placed in the ranking.
-        - ``keep``: rank each NaN value with a NaN ranking
-        - ``top``: replace each NaN with either +/- inf so that they
-                   there are ranked at the top
-    ascending : boolean, default True
-        Whether or not the elements should be ranked in ascending order.
-    pct : boolean, default False
-        Whether or not to the display the returned rankings in integer form
-        (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
-    """
-    if values.ndim == 1:
+    Compute numerical data ranks (1 through n) along axis. Equal values are
+    assigned a rank that is the average of the ranks of those values.
+
+    Parameters
+    ----------
+    values : array-like
+        Array whose values will be ranked. Categorical input is ranked by
+        delegating to its ``_rank`` method.
+    axis : int, default 0
+        Axis over which to perform rankings.
+    method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
+        * average: average rank of group
+        * min: lowest rank in group
+        * max: highest rank in group
+        * first: ranks assigned in order they appear in the array
+        * dense: like 'min', but rank always increases by 1 between groups
+    na_option : {'keep', 'top', 'bottom'}, default 'keep'
+        * keep: leave NA values where they are
+        * top: smallest rank if ascending
+        * bottom: smallest rank if descending
+    ascending : boolean, default True
+        False for ranks by high (1) to low (N)
+    pct : boolean, default False
+        Computes percentage rank of data
+
+    Returns
+    -------
+    ranks : numpy array
+    """
+    if is_categorical(values):
+        ranks = values._rank(axis=axis, method=method, ascending=ascending,
+                             na_option=na_option, pct=pct)
+    elif values.ndim == 1:
         f, values = _get_data_algo(values, _rank1d_functions)
         ranks = f(values, ties_method=method, ascending=ascending,
                   na_option=na_option, pct=pct)
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -14,7 +14,6 @@
                                _coerce_indexer_dtype)
 from pandas.types.dtypes import CategoricalDtype
 from pandas.types.common import (_ensure_int64,
-                                 _ensure_float64,
                                  _ensure_object,
                                  _ensure_platform_int,
                                  is_dtype_equal,
@@ -1405,53 +1404,28 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
             return self._constructor(values=codes, categories=self.categories,
                                      ordered=self.ordered, fastpath=True)
 
-    def rank(self, method='average', na_option='keep',
-             ascending=True, pct=False):
+    def _rank(self, *args, **kwargs):
         """
-        Rank the values along a given axis.
+        Rank the values of this Categorical. See GH#15420
+
+        Ordered categoricals are ranked on the basis of their codes;
+        unordered categoricals are ranked as objects.
+
+        Returns
+        -------
+        numpy array
 
-        Parameters
-        ----------
-        values : array-like
-            Array whose values will be ranked. The number of dimensions in this
-            array must not exceed 2.
-        method : {'average', 'min', 'max', 'first', 'dense'},
-            default 'average'
-            The method by which tiebreaks are broken during the ranking.
-        na_option : {'keep', 'top'}, default 'keep'
-            The method by which NaNs are placed in the ranking.
-            - ``keep``: rank each NaN value with a NaN ranking
-            - ``top``: replace each NaN with either +/- inf so that they
-                       there are ranked at the top
-            - ``bottom``: replace each NaN with either +/- inf so that they
-                       there are ranked at the bottom
-        ascending : boolean, default True
-            Whether or not the elements should be ranked in ascending order.
-        pct : boolean, default False
-            Whether or not to the display the returned rankings in integer form
-            (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
         """
-        from pandas.core.series import Series
-        if na_option not in ['keep', 'top', 'bottom']:
-            raise ValueError('invalid na_position: {!r}'.format(na_option))
+        from pandas.core.algorithms import rank
 
-        codes = self._codes.copy()
-        codes = codes.astype(float)
         if self._ordered:
+            codes = self._codes.astype('float64')
             na_mask = (codes == -1)
             codes[na_mask] = np.nan
-            codes = _ensure_float64(codes)
-            ranks = _algos.rank_1d_float64(
-                codes, ties_method=method,
-                na_option=na_option, ascending=ascending, pct=pct
-            )
+            ranks = rank(codes, *args, **kwargs)
         else:
-            values = _ensure_object(self)
-            ranks = _algos.rank_1d_object(
-                values, ties_method=method,
-                na_option=na_option, ascending=ascending, pct=pct
-            )
-        return Series(ranks)
+            ranks = rank(self.astype('object'), *args, **kwargs)
+        return ranks
 
     def order(self, inplace=False, ascending=True, na_position='last'):
         """
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
@@ -1063,27 +1063,29 @@ def test_rank_categorical(self):
         # Test ascending/descending ranking for ordered categoricals
         exp = pd.Series([1., 2., 3., 4., 5., 6.])
         exp_desc = pd.Series([6., 5., 4., 3., 2., 1.])
-        ordered = pd.Categorical(
+        ordered = pd.Series(
             ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
+        ).astype('category').cat.set_categories(
             ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
             ordered=True
         )
         assert_series_equal(ordered.rank(), exp)
         assert_series_equal(ordered.rank(ascending=False), exp_desc)
 
         # Unordered categoricals should be ranked as objects
-        unord_ser = pd.Series(['first', 'second', 'third', 'fourth'])
-        unordered = pd.Categorical(
-            ['first', 'second', 'third', 'fourth'],
-            ['first', 'second', 'third', 'fourth'],
+        unordered = pd.Series(
+            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
+        ).astype('category').cat.set_categories(
+            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
             ordered=False
         )
         res = unordered.rank()
-        assert_series_equal(res, unord_ser.astype(object).rank())
+        assert_series_equal(res, unordered.astype(object).rank())
 
         # Test na_option for rank data
-        na_ser = pd.Categorical(
+        na_ser = pd.Series(
             ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN],
+        ).astype('category').cat.set_categories(
             ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
             ordered=True
         )
@@ -1115,8 +1117,9 @@ def test_rank_categorical(self):
         )
 
         # Test with pct=True
-        na_ser = pd.Categorical(
+        na_ser = pd.Series(
             ['first', 'second', 'third', 'fourth', np.NaN],
+        ).astype('category').cat.set_categories(
             ['first', 'second', 'third', 'fourth'],
             ordered=True
         )