GH#15420 move rank inside categoricals

jeet63 · jeet63 · commit 9f3bb24caa5d · 2017-02-17T15:38:11.000+05:30
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -988,12 +988,6 @@ def _get_data_algo(values, func_map):
     elif is_unsigned_integer_dtype(values):
         f = func_map['uint64']
         values = _ensure_uint64(values)
-
-    elif is_categorical_dtype(values) and values.ordered:
-        nanMapper = np.vectorize(lambda t: np.NaN if t == -1 else t*1.)
-        f = func_map['float64']
-        values = _ensure_float64(nanMapper(values.codes))
-
     else:
         values = _ensure_object(values)
 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -14,6 +14,7 @@
                                _coerce_indexer_dtype)
 from pandas.types.dtypes import CategoricalDtype
 from pandas.types.common import (_ensure_int64,
+                                 _ensure_float64,
                                  _ensure_object,
                                  _ensure_platform_int,
                                  is_dtype_equal,
@@ -1364,6 +1365,54 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
             return self._constructor(values=codes, categories=self.categories,
                                      ordered=self.ordered, fastpath=True)
 
+    def rank(self, method='average', na_option='keep',
+             ascending=True, pct=False):
+        """
+        Rank the values of this Categorical.
+
+        Ordered categoricals rank by category order, unordered as objects.
+
+        Parameters
+        ----------
+        method : {'average', 'min', 'max', 'first', 'dense'},
+            default 'average'
+            The method by which ties are broken during the ranking.
+        na_option : {'keep', 'top', 'bottom'}, default 'keep'
+            The method by which NaNs are placed in the ranking.
+            - ``keep``: rank each NaN value with a NaN ranking
+            - ``top``: rank NaNs at the top
+            - ``bottom``: rank NaNs at the bottom
+        ascending : boolean, default True
+            Whether or not the elements should be ranked in ascending order.
+        pct : boolean, default False
+            Whether to return the rankings in integer form (e.g. 1, 2, 3)
+            or in percentile form (e.g. 0.333..., 0.666..., 1).
+
+        Returns
+        -------
+        Series
+            The rank of each element.
+
+        Raises
+        ------
+        ValueError
+            If ``na_option`` is not one of 'keep', 'top' or 'bottom'.
+        """
+        from pandas.core.series import Series
+        if na_option not in ['keep', 'top', 'bottom']:
+            raise ValueError('invalid na_option: {!r}'.format(na_option))
+
+        kwargs = dict(ties_method=method, na_option=na_option,
+                      ascending=ascending, pct=pct)
+        if self._ordered:
+            # astype copies, so self._codes is untouched; -1 marks missing.
+            codes = _ensure_float64(self._codes.astype(float))
+            codes[codes == -1] = np.nan
+            ranks = _algos.rank_1d_float64(codes, **kwargs)
+        else:
+            ranks = _algos.rank_1d_object(_ensure_object(self), **kwargs)
+        return Series(ranks)
+
     def order(self, inplace=False, ascending=True, na_position='last'):
         """
         DEPRECATED: use :meth:`Categorical.sort_values`. That function
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
@@ -1057,59 +1057,77 @@ def test_rank(self):
         iranks = iseries.rank()
         assert_series_equal(iranks, exp)
 
+    def test_rank_categorical(self):
         # GH issue #15420 rank incorrectly orders ordered categories
-        
+
         # Test ascending/descending ranking for ordered categoricals
         exp = pd.Series([1., 2., 3., 4., 5., 6.])
         exp_desc = pd.Series([6., 5., 4., 3., 2., 1.])
-        ser = pd.Series(
-            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth']
-        )
-        ordered = ser.astype('category', ).cat.set_categories(
+        ordered = pd.Categorical(
+            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
             ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
             ordered=True
         )
         assert_series_equal(ordered.rank(), exp)
         assert_series_equal(ordered.rank(ascending=False), exp_desc)
 
         # Unordered categoricals should be ranked as objects
-        unordered = ser.astype('category', ).cat.set_categories(
-            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
+        unord_ser = pd.Series(['first', 'second', 'third', 'fourth'])
+        unordered = pd.Categorical(
+            ['first', 'second', 'third', 'fourth'],
+            ['first', 'second', 'third', 'fourth'],
             ordered=False
         )
         res = unordered.rank()
-        assert_series_equal(res, unordered.astype(object).rank())
+        assert_series_equal(res, unord_ser.astype(object).rank())
 
         # Test na_option for rank data
-        na_ser = pd.Series(
-            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN]
-        ).astype('category', ).cat.set_categories(
-            [
-                'first', 'second', 'third', 'fourth',
-                'fifth', 'sixth', 'seventh'
-            ],
+        na_ser = pd.Categorical(
+            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN],
+            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
             ordered=True
         )
 
         exp_top = pd.Series([2., 3., 4., 5., 6., 7., 1.])
         exp_bot = pd.Series([1., 2., 3., 4., 5., 6., 7.])
         exp_keep = pd.Series([1., 2., 3., 4., 5., 6., np.NaN])
 
+        assert_series_equal(na_ser.rank(na_option='top'), exp_top)
+        assert_series_equal(na_ser.rank(na_option='bottom'), exp_bot)
+        assert_series_equal(na_ser.rank(na_option='keep'), exp_keep)
+
+        # Test na_option for rank data with ascending False
+        exp_top = pd.Series([7., 6., 5., 4., 3., 2., 1.])
+        exp_bot = pd.Series([6., 5., 4., 3., 2., 1., 7.])
+        exp_keep = pd.Series([6., 5., 4., 3., 2., 1., np.NaN])
+
         assert_series_equal(
-            na_ser.rank(na_option='top'),
+            na_ser.rank(na_option='top', ascending=False),
             exp_top
         )
-
         assert_series_equal(
-            na_ser.rank(na_option='bottom'),
+            na_ser.rank(na_option='bottom', ascending=False),
             exp_bot
         )
-
         assert_series_equal(
-            na_ser.rank(na_option='keep'),
+            na_ser.rank(na_option='keep', ascending=False),
             exp_keep
         )
 
+        # Test with pct=True
+        na_ser = pd.Categorical(
+            ['first', 'second', 'third', 'fourth', np.NaN],
+            ['first', 'second', 'third', 'fourth'],
+            ordered=True
+        )
+        exp_top = pd.Series([0.4, 0.6, 0.8, 1., 0.2])
+        exp_bot = pd.Series([0.2, 0.4, 0.6, 0.8, 1.])
+        exp_keep = pd.Series([0.25, 0.5, 0.75, 1., np.NaN])
+
+        assert_series_equal(na_ser.rank(na_option='top', pct=True), exp_top)
+        assert_series_equal(na_ser.rank(na_option='bottom', pct=True), exp_bot)
+        assert_series_equal(na_ser.rank(na_option='keep', pct=True), exp_keep)
+
     def test_rank_signature(self):
         s = Series([0, 1])
         s.rank(method='average')