PERF: Rank categorical perf

jeet63 · jreback · commit 1c106c842751 · 2017-03-01T16:52:16.000-05:00
closes pandas-dev#15498 Author: Prasanjit Prakash <jeet@gmail.com> Closes pandas-dev#15518 from ikilledthecat/rank_categorical_perf and squashes the following commits: 30b49b9 [Prasanjit Prakash] PERF: GH15498 - pep8 changes ad38544 [Prasanjit Prakash] PERF: GH15498 - asv tests and whatsnew 1ebdb56 [Prasanjit Prakash] PERF: categorical rank GH#15498 a67cd85 [Prasanjit Prakash] PERF: categorical rank GH#15498 81df7df [Prasanjit Prakash] PERF: categorical rank GH#15498 45dd125 [Prasanjit Prakash] PERF: categorical rank GH#15498 33249b3 [Prasanjit Prakash] PERF: categorical rank GH#15498
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
@@ -63,3 +63,37 @@ def time_value_counts_dropna(self):
 
     def time_rendering(self):
         str(self.sel)
+
+
+class Categoricals3(object):
+    goal_time = 0.2
+
+    def setup(self):
+        N = 100000
+        ncats = 100
+
+        self.s1 = Series(np.array(tm.makeCategoricalIndex(N, ncats)))
+        self.s1_cat = self.s1.astype('category')
+        self.s1_cat_ordered = self.s1.astype('category', ordered=True)
+
+        self.s2 = Series(np.random.randint(0, ncats, size=N))
+        self.s2_cat = self.s2.astype('category')
+        self.s2_cat_ordered = self.s2.astype('category', ordered=True)
+
+    def time_rank_string(self):
+        self.s1.rank()
+
+    def time_rank_string_cat(self):
+        self.s1_cat.rank()
+
+    def time_rank_string_cat_ordered(self):
+        self.s1_cat_ordered.rank()
+
+    def time_rank_int(self):
+        self.s2.rank()
+
+    def time_rank_int_cat(self):
+        self.s2_cat.rank()
+
+    def time_rank_int_cat_ordered(self):
+        self.s2_cat_ordered.rank()
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -562,6 +562,7 @@ Performance Improvements
 - Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)
 - Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`)
 - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. (:issue:`14947`)
+- Improved performance of ``rank()`` for categorical data (:issue:`15498`)
 
 
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -992,6 +992,7 @@ def _get_data_algo(values, func_map):
     elif is_unsigned_integer_dtype(values):
         f = func_map['uint64']
         values = _ensure_uint64(values)
+
     else:
         values = _ensure_object(values)
 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -1416,14 +1416,21 @@ def _values_for_rank(self):
         numpy array
 
         """
+        from pandas import Series
         if self.ordered:
             values = self.codes
             mask = values == -1
             if mask.any():
                 values = values.astype('float64')
                 values[mask] = np.nan
-        else:
+        elif self.categories.is_numeric():
             values = np.array(self)
+        else:
+            # rename each category to its rank among the categories, so the
+            # values handed to rank are floats rather than an object array
+            values = np.array(
+                self.rename_categories(Series(self.categories).rank())
+            )
         return values
 
     def order(self, inplace=False, ascending=True, na_position='last'):
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
@@ -1065,8 +1065,10 @@ def test_rank_categorical(self):
         exp_desc = pd.Series([6., 5., 4., 3., 2., 1.])
         ordered = pd.Series(
             ['first', 'second', 'third', 'fourth', 'fifth', 'sixth']
-        ).astype('category', ).cat.set_categories(
-            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
+        ).astype(
+            'category',
+            categories=['first', 'second', 'third',
+                        'fourth', 'fifth', 'sixth'],
             ordered=True
         )
         assert_series_equal(ordered.rank(), exp)
@@ -1075,19 +1077,33 @@ def test_rank_categorical(self):
         # Unordered categoricals should be ranked as objects
         unordered = pd.Series(
             ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
-        ).astype('category').cat.set_categories(
-            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
+        ).astype(
+            'category',
+            categories=['first', 'second', 'third',
+                        'fourth', 'fifth', 'sixth'],
             ordered=False
         )
         exp_unordered = pd.Series([2., 4., 6., 3., 1., 5.])
         res = unordered.rank()
         assert_series_equal(res, exp_unordered)
 
+        unordered1 = pd.Series(
+            [1, 2, 3, 4, 5, 6],
+        ).astype(
+            'category',
+            categories=[1, 2, 3, 4, 5, 6],
+            ordered=False
+        )
+        exp_unordered1 = pd.Series([1., 2., 3., 4., 5., 6.])
+        res1 = unordered1.rank()
+        assert_series_equal(res1, exp_unordered1)
+
         # Test na_option for rank data
         na_ser = pd.Series(
             ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN]
-        ).astype('category', ).cat.set_categories(
-            [
+        ).astype(
+            'category',
+            categories=[
                 'first', 'second', 'third', 'fourth',
                 'fifth', 'sixth', 'seventh'
             ],
@@ -1123,8 +1139,9 @@ def test_rank_categorical(self):
         # Test with pct=True
         na_ser = pd.Series(
             ['first', 'second', 'third', 'fourth', np.NaN],
-        ).astype('category').cat.set_categories(
-            ['first', 'second', 'third', 'fourth'],
+        ).astype(
+            'category',
+            categories=['first', 'second', 'third', 'fourth'],
             ordered=True
         )
         exp_top = pd.Series([0.4, 0.6, 0.8, 1., 0.2])