From 33249b3bfd0ab2044f3f317a0839a06082a070c1 Mon Sep 17 00:00:00 2001
From: Prasanjit Prakash <jeet@gmail.com>
Date: Sat, 25 Feb 2017 16:31:22 +0530
Subject: [PATCH 1/7] PERF: categorical rank GH#15498

---
 pandas/core/algorithms.py  | 8 +++++---
 pandas/core/categorical.py | 7 +++++--
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index b11927a80fb2e..7f74067f7826b 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -974,9 +974,6 @@ def _get_data_algo(values, func_map):
 
     f = None
 
-    if is_categorical_dtype(values):
-        values = values._values_for_rank()
-
     if is_float_dtype(values):
         f = func_map['float64']
         values = _ensure_float64(values)
@@ -992,6 +989,11 @@ def _get_data_algo(values, func_map):
     elif is_unsigned_integer_dtype(values):
         f = func_map['uint64']
         values = _ensure_uint64(values)
+
+    elif is_categorical_dtype(values):
+        f = func_map['float64']
+        values = values._values_for_rank()
+
     else:
         values = _ensure_object(values)
 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index b88a6b171b316..3a83a485c8c3b 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -1416,14 +1416,17 @@ def _values_for_rank(self):
         numpy array
 
         """
+        from pandas import Series
         if self.ordered:
             values = self.codes
             mask = values == -1
+            values = values.astype('float64')
             if mask.any():
-                values = values.astype('float64')
                 values[mask] = np.nan
         else:
-            values = np.array(self)
+            values = np.array(
+                self.rename_categories(Series(self.categories).rank())
+            )
         return values
 
     def order(self, inplace=False, ascending=True, na_position='last'):

From 45dd125e182acd99862a6269dc0ec2786cd86617 Mon Sep 17 00:00:00 2001
From: Prasanjit Prakash <jeet@gmail.com>
Date: Mon, 27 Feb 2017 19:06:55 +0530
Subject: [PATCH 2/7] PERF: categorical rank GH#15498

no need to rename categories where they are already ordered
---
 pandas/core/algorithms.py             |  7 +++----
 pandas/core/categorical.py            |  4 +++-
 pandas/tests/series/test_analytics.py | 10 ++++++++++
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 7f74067f7826b..55d404f05dd1d 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -974,6 +974,9 @@ def _get_data_algo(values, func_map):
 
     f = None
 
+    if is_categorical_dtype(values):
+        values = values._values_for_rank()
+
     if is_float_dtype(values):
         f = func_map['float64']
         values = _ensure_float64(values)
@@ -990,10 +993,6 @@ def _get_data_algo(values, func_map):
         f = func_map['uint64']
         values = _ensure_uint64(values)
 
-    elif is_categorical_dtype(values):
-        f = func_map['float64']
-        values = values._values_for_rank()
-
     else:
         values = _ensure_object(values)
 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index 3a83a485c8c3b..a8ccae0d374cd 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -1420,9 +1420,11 @@ def _values_for_rank(self):
         if self.ordered:
             values = self.codes
             mask = values == -1
-            values = values.astype('float64')
             if mask.any():
+                values = values.astype('float64')
                 values[mask] = np.nan
+        elif self.categories.is_monotonic:
+            values = np.array(self)
         else:
             values = np.array(
                 self.rename_categories(Series(self.categories).rank())
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index b092e4f084767..1733c515b272c 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -1083,6 +1083,16 @@ def test_rank_categorical(self):
         res = unordered.rank()
         assert_series_equal(res, exp_unordered)
 
+        unordered1 = pd.Series(
+            [1, 2, 3, 4, 5, 6],
+        ).astype('category').cat.set_categories(
+            [1, 2, 3, 4, 5, 6],
+            ordered=False
+        )
+        exp_unordered1 = pd.Series([1., 2., 3., 4., 5., 6.])
+        res1 = unordered1.rank()
+        assert_series_equal(res1, exp_unordered1)
+
         # Test na_option for rank data
         na_ser = pd.Series(
             ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN]

From 81df7dfd57892e61fa46e635ff27470417eb503d Mon Sep 17 00:00:00 2001
From: Prasanjit Prakash <jeet@gmail.com>
Date: Mon, 27 Feb 2017 19:17:11 +0530
Subject: [PATCH 3/7] PERF: categorical rank GH#15498

check for numeric instead of monotonic
---
 pandas/core/categorical.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index a8ccae0d374cd..2326cc3c78b72 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -1423,7 +1423,7 @@ def _values_for_rank(self):
             if mask.any():
                 values = values.astype('float64')
                 values[mask] = np.nan
-        elif self.categories.is_monotonic:
+        elif self.categories.is_numeric():
             values = np.array(self)
         else:
             values = np.array(

From a67cd8503358fe4d5e4a91a9a9d44bec4dbbdd40 Mon Sep 17 00:00:00 2001
From: Prasanjit Prakash <jeet@gmail.com>
Date: Sat, 25 Feb 2017 16:31:22 +0530
Subject: [PATCH 4/7] PERF: categorical rank GH#15498

---
 pandas/core/algorithms.py  | 7 ++++---
 pandas/core/categorical.py | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 55d404f05dd1d..7f74067f7826b 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -974,9 +974,6 @@ def _get_data_algo(values, func_map):
 
     f = None
 
-    if is_categorical_dtype(values):
-        values = values._values_for_rank()
-
     if is_float_dtype(values):
         f = func_map['float64']
         values = _ensure_float64(values)
@@ -993,6 +990,10 @@ def _get_data_algo(values, func_map):
         f = func_map['uint64']
         values = _ensure_uint64(values)
 
+    elif is_categorical_dtype(values):
+        f = func_map['float64']
+        values = values._values_for_rank()
+
     else:
         values = _ensure_object(values)
 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index 2326cc3c78b72..f7526c5d2f537 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -1420,8 +1420,8 @@ def _values_for_rank(self):
         if self.ordered:
             values = self.codes
             mask = values == -1
+            values = values.astype('float64')
             if mask.any():
-                values = values.astype('float64')
                 values[mask] = np.nan
         elif self.categories.is_numeric():
             values = np.array(self)

From 1ebdb5686acf0410f1b94bce247856571a3b2cb5 Mon Sep 17 00:00:00 2001
From: Prasanjit Prakash <jeet@gmail.com>
Date: Mon, 27 Feb 2017 19:06:55 +0530
Subject: [PATCH 5/7] PERF: categorical rank GH#15498

no need to rename categories where they are already ordered
---
 pandas/core/algorithms.py  | 7 +++----
 pandas/core/categorical.py | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 7f74067f7826b..55d404f05dd1d 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -974,6 +974,9 @@ def _get_data_algo(values, func_map):
 
     f = None
 
+    if is_categorical_dtype(values):
+        values = values._values_for_rank()
+
     if is_float_dtype(values):
         f = func_map['float64']
         values = _ensure_float64(values)
@@ -990,10 +993,6 @@ def _get_data_algo(values, func_map):
         f = func_map['uint64']
         values = _ensure_uint64(values)
 
-    elif is_categorical_dtype(values):
-        f = func_map['float64']
-        values = values._values_for_rank()
-
     else:
         values = _ensure_object(values)
 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index f7526c5d2f537..2326cc3c78b72 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -1420,8 +1420,8 @@ def _values_for_rank(self):
         if self.ordered:
             values = self.codes
             mask = values == -1
-            values = values.astype('float64')
             if mask.any():
+                values = values.astype('float64')
                 values[mask] = np.nan
         elif self.categories.is_numeric():
             values = np.array(self)

From ad3854471abe85b2abbfedf32d34756cc80e6e1d Mon Sep 17 00:00:00 2001
From: Prasanjit Prakash <jeet@gmail.com>
Date: Tue, 28 Feb 2017 17:22:05 +0530
Subject: [PATCH 6/7] PERF: GH15498 - asv tests and whatsnew

---
 asv_bench/benchmarks/categoricals.py  | 43 +++++++++++++++++++++++++++
 doc/source/whatsnew/v0.20.0.txt       |  1 +
 pandas/core/categorical.py            |  2 ++
 pandas/tests/series/test_analytics.py | 25 +++++++++-------
 4 files changed, 61 insertions(+), 10 deletions(-)

diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
index cca652c68cf15..4f0246b27716e 100644
--- a/asv_bench/benchmarks/categoricals.py
+++ b/asv_bench/benchmarks/categoricals.py
@@ -63,3 +63,46 @@ def time_value_counts_dropna(self):
 
     def time_rendering(self):
         str(self.sel)
+
+
+class Categoricals3(object):
+    goal_time = 0.2
+
+    def setup(self):
+        n = 100
+
+        strng = pd.util.testing.makeCategoricalIndex(n)
+        self.s1 = pd.Series(strng)
+
+        dt = pd.util.testing.makeDateIndex(n)
+        self.s2 = pd.Series(dt).astype('category', categories=dt)
+        self.s2o = pd.Series(dt).astype('category', categories=dt, ordered=True)
+
+        fl = pd.util.testing.makeFloatIndex(n)
+        self.s3 = pd.Series(fl).astype('category', categories=fl)
+        self.s3o = pd.Series(fl).astype('category', categories=fl, ordered=True)
+
+        intg = pd.util.testing.makeIntIndex(n)
+        self.s4 = pd.Series(intg).astype('category', categories=intg)
+        self.s4o = pd.Series(intg).astype('category', categories=intg, ordered=True)
+
+    def time_rank_string_unordered(self):
+        self.s1.rank()
+
+    def time_rank_dt_unordered(self):
+        self.s2.rank()
+
+    def time_rank_dt_ordered(self):
+        self.s2o.rank()
+
+    def time_rank_float_unordered(self):
+        self.s3.rank()
+
+    def time_rank_float_ordered(self):
+        self.s3o.rank()
+
+    def time_rank_int_unordered(self):
+        self.s4.rank()
+
+    def time_rank_int_ordered(self):
+        self.s4o.rank()
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 54df7514a882d..6e9dfb92dfd90 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -562,6 +562,7 @@ Performance Improvements
 - Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)
 - Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`)
 - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. (:issue:`14947`)
+- Improved performance of ``rank()`` for categorical data (:issue:`15498`)
 
 
 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index 2326cc3c78b72..d5dce250275d9 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -1426,6 +1426,8 @@ def _values_for_rank(self):
         elif self.categories.is_numeric():
             values = np.array(self)
         else:
+            # rename each category to its rank so the values form a float
+            # array, instead of passing an object array to rank
             values = np.array(
                 self.rename_categories(Series(self.categories).rank())
             )
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index 1733c515b272c..c8e83ea8c730e 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -1065,8 +1065,9 @@ def test_rank_categorical(self):
         exp_desc = pd.Series([6., 5., 4., 3., 2., 1.])
         ordered = pd.Series(
             ['first', 'second', 'third', 'fourth', 'fifth', 'sixth']
-        ).astype('category', ).cat.set_categories(
-            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
+        ).astype(
+            'category',
+            categories=['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
             ordered=True
         )
         assert_series_equal(ordered.rank(), exp)
@@ -1075,8 +1076,9 @@ def test_rank_categorical(self):
         # Unordered categoricals should be ranked as objects
         unordered = pd.Series(
             ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
-        ).astype('category').cat.set_categories(
-            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
+        ).astype(
+            'category',
+            categories=['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
             ordered=False
         )
         exp_unordered = pd.Series([2., 4., 6., 3., 1., 5.])
@@ -1085,8 +1087,9 @@ def test_rank_categorical(self):
 
         unordered1 = pd.Series(
             [1, 2, 3, 4, 5, 6],
-        ).astype('category').cat.set_categories(
-            [1, 2, 3, 4, 5, 6],
+        ).astype(
+            'category',
+            categories=[1, 2, 3, 4, 5, 6],
             ordered=False
         )
         exp_unordered1 = pd.Series([1., 2., 3., 4., 5., 6.])
@@ -1096,8 +1099,9 @@ def test_rank_categorical(self):
         # Test na_option for rank data
         na_ser = pd.Series(
             ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN]
-        ).astype('category', ).cat.set_categories(
-            [
+        ).astype(
+            'category',
+            categories=[
                 'first', 'second', 'third', 'fourth',
                 'fifth', 'sixth', 'seventh'
             ],
@@ -1133,8 +1137,9 @@ def test_rank_categorical(self):
         # Test with pct=True
         na_ser = pd.Series(
             ['first', 'second', 'third', 'fourth', np.NaN],
-        ).astype('category').cat.set_categories(
-            ['first', 'second', 'third', 'fourth'],
+        ).astype(
+            'category',
+            categories=['first', 'second', 'third', 'fourth'],
             ordered=True
         )
         exp_top = pd.Series([0.4, 0.6, 0.8, 1., 0.2])

From 30b49b9726e2a3b9c8f3194ab29f687821067a03 Mon Sep 17 00:00:00 2001
From: Prasanjit Prakash <jeet@gmail.com>
Date: Wed, 1 Mar 2017 18:45:51 +0530
Subject: [PATCH 7/7] PERF: GH15498 - pep8 changes

---
 asv_bench/benchmarks/categoricals.py  | 9 ++++++---
 pandas/tests/series/test_analytics.py | 6 ++++--
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
index 4f0246b27716e..6955fe7260e8b 100644
--- a/asv_bench/benchmarks/categoricals.py
+++ b/asv_bench/benchmarks/categoricals.py
@@ -76,15 +76,18 @@ def setup(self):
 
         dt = pd.util.testing.makeDateIndex(n)
         self.s2 = pd.Series(dt).astype('category', categories=dt)
-        self.s2o = pd.Series(dt).astype('category', categories=dt, ordered=True)
+        self.s2o = pd.Series(dt).astype('category', categories=dt,
+                                        ordered=True)
 
         fl = pd.util.testing.makeFloatIndex(n)
         self.s3 = pd.Series(fl).astype('category', categories=fl)
-        self.s3o = pd.Series(fl).astype('category', categories=fl, ordered=True)
+        self.s3o = pd.Series(fl).astype('category', categories=fl,
+                                        ordered=True)
 
         intg = pd.util.testing.makeIntIndex(n)
         self.s4 = pd.Series(intg).astype('category', categories=intg)
-        self.s4o = pd.Series(intg).astype('category', categories=intg, ordered=True)
+        self.s4o = pd.Series(intg).astype('category', categories=intg,
+                                          ordered=True)
 
     def time_rank_string_unordered(self):
         self.s1.rank()
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index c8e83ea8c730e..b6985abb64e40 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -1067,7 +1067,8 @@ def test_rank_categorical(self):
             ['first', 'second', 'third', 'fourth', 'fifth', 'sixth']
         ).astype(
             'category',
-            categories=['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
+            categories=['first', 'second', 'third',
+                        'fourth', 'fifth', 'sixth'],
             ordered=True
         )
         assert_series_equal(ordered.rank(), exp)
@@ -1078,7 +1079,8 @@ def test_rank_categorical(self):
             ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
         ).astype(
             'category',
-            categories=['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
+            categories=['first', 'second', 'third',
+                        'fourth', 'fifth', 'sixth'],
             ordered=False
         )
         exp_unordered = pd.Series([2., 4., 6., 3., 1., 5.])