From 33249b3bfd0ab2044f3f317a0839a06082a070c1 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Sat, 25 Feb 2017 16:31:22 +0530 Subject: [PATCH 1/7] PERF: categorical rank GH#15498 --- pandas/core/algorithms.py | 8 +++++--- pandas/core/categorical.py | 7 +++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b11927a80fb2e..7f74067f7826b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -974,9 +974,6 @@ def _get_data_algo(values, func_map): f = None - if is_categorical_dtype(values): - values = values._values_for_rank() - if is_float_dtype(values): f = func_map['float64'] values = _ensure_float64(values) @@ -992,6 +989,11 @@ def _get_data_algo(values, func_map): elif is_unsigned_integer_dtype(values): f = func_map['uint64'] values = _ensure_uint64(values) + + elif is_categorical_dtype(values): + f = func_map['float64'] + values = values._values_for_rank() + else: values = _ensure_object(values) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index b88a6b171b316..3a83a485c8c3b 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1416,14 +1416,17 @@ def _values_for_rank(self): numpy array """ + from pandas import Series if self.ordered: values = self.codes mask = values == -1 + values = values.astype('float64') if mask.any(): - values = values.astype('float64') values[mask] = np.nan else: - values = np.array(self) + values = np.array( + self.rename_categories(Series(self.categories).rank()) + ) return values def order(self, inplace=False, ascending=True, na_position='last'): From 45dd125e182acd99862a6269dc0ec2786cd86617 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Mon, 27 Feb 2017 19:06:55 +0530 Subject: [PATCH 2/7] PERF: categorical rank GH#15498 no need to rename categories where they are already ordered --- pandas/core/algorithms.py | 7 +++---- pandas/core/categorical.py | 4 +++- 
pandas/tests/series/test_analytics.py | 10 ++++++++++ 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 7f74067f7826b..55d404f05dd1d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -974,6 +974,9 @@ def _get_data_algo(values, func_map): f = None + if is_categorical_dtype(values): + values = values._values_for_rank() + if is_float_dtype(values): f = func_map['float64'] values = _ensure_float64(values) @@ -990,10 +993,6 @@ def _get_data_algo(values, func_map): f = func_map['uint64'] values = _ensure_uint64(values) - elif is_categorical_dtype(values): - f = func_map['float64'] - values = values._values_for_rank() - else: values = _ensure_object(values) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 3a83a485c8c3b..a8ccae0d374cd 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1420,9 +1420,11 @@ def _values_for_rank(self): if self.ordered: values = self.codes mask = values == -1 - values = values.astype('float64') if mask.any(): + values = values.astype('float64') values[mask] = np.nan + elif self.categories.is_monotonic: + values = np.array(self) else: values = np.array( self.rename_categories(Series(self.categories).rank()) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index b092e4f084767..1733c515b272c 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1083,6 +1083,16 @@ def test_rank_categorical(self): res = unordered.rank() assert_series_equal(res, exp_unordered) + unordered1 = pd.Series( + [1, 2, 3, 4, 5, 6], + ).astype('category').cat.set_categories( + [1, 2, 3, 4, 5, 6], + ordered=False + ) + exp_unordered1 = pd.Series([1., 2., 3., 4., 5., 6.]) + res1 = unordered1.rank() + assert_series_equal(res1, exp_unordered1) + # Test na_option for rank data na_ser = pd.Series( ['first', 'second', 'third', 'fourth', 'fifth', 
'sixth', np.NaN] From 81df7dfd57892e61fa46e635ff27470417eb503d Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Mon, 27 Feb 2017 19:17:11 +0530 Subject: [PATCH 3/7] PERF: categorical rank GH#15498 check for numeric instead of monotonic --- pandas/core/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index a8ccae0d374cd..2326cc3c78b72 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1423,7 +1423,7 @@ def _values_for_rank(self): if mask.any(): values = values.astype('float64') values[mask] = np.nan - elif self.categories.is_monotonic: + elif self.categories.is_numeric(): values = np.array(self) else: values = np.array( From a67cd8503358fe4d5e4a91a9a9d44bec4dbbdd40 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Sat, 25 Feb 2017 16:31:22 +0530 Subject: [PATCH 4/7] PERF: categorical rank GH#15498 --- pandas/core/algorithms.py | 7 ++++--- pandas/core/categorical.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 55d404f05dd1d..7f74067f7826b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -974,9 +974,6 @@ def _get_data_algo(values, func_map): f = None - if is_categorical_dtype(values): - values = values._values_for_rank() - if is_float_dtype(values): f = func_map['float64'] values = _ensure_float64(values) @@ -993,6 +990,10 @@ def _get_data_algo(values, func_map): f = func_map['uint64'] values = _ensure_uint64(values) + elif is_categorical_dtype(values): + f = func_map['float64'] + values = values._values_for_rank() + else: values = _ensure_object(values) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 2326cc3c78b72..f7526c5d2f537 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1420,8 +1420,8 @@ def _values_for_rank(self): if self.ordered: values = self.codes mask = values == -1 + 
values = values.astype('float64') if mask.any(): - values = values.astype('float64') values[mask] = np.nan elif self.categories.is_numeric(): values = np.array(self) From 1ebdb5686acf0410f1b94bce247856571a3b2cb5 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Mon, 27 Feb 2017 19:06:55 +0530 Subject: [PATCH 5/7] PERF: categorical rank GH#15498 no need to rename categories where they are already ordered --- pandas/core/algorithms.py | 7 +++---- pandas/core/categorical.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 7f74067f7826b..55d404f05dd1d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -974,6 +974,9 @@ def _get_data_algo(values, func_map): f = None + if is_categorical_dtype(values): + values = values._values_for_rank() + if is_float_dtype(values): f = func_map['float64'] values = _ensure_float64(values) @@ -990,10 +993,6 @@ def _get_data_algo(values, func_map): f = func_map['uint64'] values = _ensure_uint64(values) - elif is_categorical_dtype(values): - f = func_map['float64'] - values = values._values_for_rank() - else: values = _ensure_object(values) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index f7526c5d2f537..2326cc3c78b72 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1420,8 +1420,8 @@ def _values_for_rank(self): if self.ordered: values = self.codes mask = values == -1 - values = values.astype('float64') if mask.any(): + values = values.astype('float64') values[mask] = np.nan elif self.categories.is_numeric(): values = np.array(self) From ad3854471abe85b2abbfedf32d34756cc80e6e1d Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Tue, 28 Feb 2017 17:22:05 +0530 Subject: [PATCH 6/7] PERF: GH15498 - asv tests and whatsnew --- asv_bench/benchmarks/categoricals.py | 43 +++++++++++++++++++++++++++ doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/categorical.py | 2 ++ 
pandas/tests/series/test_analytics.py | 25 +++++++++------- 4 files changed, 61 insertions(+), 10 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index cca652c68cf15..4f0246b27716e 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -63,3 +63,46 @@ def time_value_counts_dropna(self): def time_rendering(self): str(self.sel) + + +class Categoricals3(object): + goal_time = 0.2 + + def setup(self): + n = 100 + + strng = pd.util.testing.makeCategoricalIndex(n) + self.s1 = pd.Series(strng) + + dt = pd.util.testing.makeDateIndex(n) + self.s2 = pd.Series(dt).astype('category', categories=dt) + self.s2o = pd.Series(dt).astype('category', categories=dt, ordered=True) + + fl = pd.util.testing.makeFloatIndex(n) + self.s3 = pd.Series(fl).astype('category', categories=fl) + self.s3o = pd.Series(fl).astype('category', categories=fl, ordered=True) + + intg = pd.util.testing.makeIntIndex(n) + self.s4 = pd.Series(intg).astype('category', categories=intg) + self.s4o = pd.Series(intg).astype('category', categories=intg, ordered=True) + + def time_rank_string_unordered(self): + self.s1.rank() + + def time_rank_dt_unordered(self): + self.s2.rank() + + def time_rank_dt_ordered(self): + self.s2o.rank() + + def time_rank_float_unordered(self): + self.s3.rank() + + def time_rank_float_ordered(self): + self.s3o.rank() + + def time_rank_int_unordered(self): + self.s4.rank() + + def time_rank_int_ordered(self): + self.s4o.rank() diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 54df7514a882d..6e9dfb92dfd90 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -562,6 +562,7 @@ Performance Improvements - Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`) - Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`) - When reading buffer object in 
``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. (:issue:`14947`) +- Improved performance of ``rank()`` for categorical data (:issue:`15498`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 2326cc3c78b72..d5dce250275d9 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1426,6 +1426,8 @@ def _values_for_rank(self): elif self.categories.is_numeric(): values = np.array(self) else: + # reorder the categories (so rank can use the float codes) + # instead of passing an object array to rank values = np.array( self.rename_categories(Series(self.categories).rank()) ) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 1733c515b272c..c8e83ea8c730e 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1065,8 +1065,9 @@ def test_rank_categorical(self): exp_desc = pd.Series([6., 5., 4., 3., 2., 1.]) ordered = pd.Series( ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'] - ).astype('category', ).cat.set_categories( - ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], + ).astype( + 'category', + categories=['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ordered=True ) assert_series_equal(ordered.rank(), exp) @@ -1075,8 +1076,9 @@ def test_rank_categorical(self): # Unordered categoricals should be ranked as objects unordered = pd.Series( ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], - ).astype('category').cat.set_categories( - ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], + ).astype( + 'category', + categories=['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ordered=False ) exp_unordered = pd.Series([2., 4., 6., 3., 1., 5.]) @@ -1085,8 +1087,9 @@ def test_rank_categorical(self): unordered1 = pd.Series( [1, 2, 3, 4, 5, 6], - ).astype('category').cat.set_categories( - [1, 2, 3, 4, 5, 6], + ).astype( + 'category', + categories=[1, 2, 3, 
4, 5, 6], ordered=False ) exp_unordered1 = pd.Series([1., 2., 3., 4., 5., 6.]) @@ -1096,8 +1099,9 @@ def test_rank_categorical(self): # Test na_option for rank data na_ser = pd.Series( ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN] - ).astype('category', ).cat.set_categories( - [ + ).astype( + 'category', + categories=[ 'first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh' ], @@ -1133,8 +1137,9 @@ def test_rank_categorical(self): # Test with pct=True na_ser = pd.Series( ['first', 'second', 'third', 'fourth', np.NaN], - ).astype('category').cat.set_categories( - ['first', 'second', 'third', 'fourth'], + ).astype( + 'category', + categories=['first', 'second', 'third', 'fourth'], ordered=True ) exp_top = pd.Series([0.4, 0.6, 0.8, 1., 0.2]) From 30b49b9726e2a3b9c8f3194ab29f687821067a03 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Wed, 1 Mar 2017 18:45:51 +0530 Subject: [PATCH 7/7] PERF: GH15498 - pep8 changes --- asv_bench/benchmarks/categoricals.py | 9 ++++++--- pandas/tests/series/test_analytics.py | 6 ++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 4f0246b27716e..6955fe7260e8b 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -76,15 +76,18 @@ def setup(self): dt = pd.util.testing.makeDateIndex(n) self.s2 = pd.Series(dt).astype('category', categories=dt) - self.s2o = pd.Series(dt).astype('category', categories=dt, ordered=True) + self.s2o = pd.Series(dt).astype('category', categories=dt, + ordered=True) fl = pd.util.testing.makeFloatIndex(n) self.s3 = pd.Series(fl).astype('category', categories=fl) - self.s3o = pd.Series(fl).astype('category', categories=fl, ordered=True) + self.s3o = pd.Series(fl).astype('category', categories=fl, + ordered=True) intg = pd.util.testing.makeIntIndex(n) self.s4 = pd.Series(intg).astype('category', categories=intg) - self.s4o = 
pd.Series(intg).astype('category', categories=intg, ordered=True) + self.s4o = pd.Series(intg).astype('category', categories=intg, + ordered=True) def time_rank_string_unordered(self): self.s1.rank() diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index c8e83ea8c730e..b6985abb64e40 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1067,7 +1067,8 @@ def test_rank_categorical(self): ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'] ).astype( 'category', - categories=['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], + categories=['first', 'second', 'third', + 'fourth', 'fifth', 'sixth'], ordered=True ) assert_series_equal(ordered.rank(), exp) @@ -1078,7 +1079,8 @@ def test_rank_categorical(self): ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ).astype( 'category', - categories=['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], + categories=['first', 'second', 'third', + 'fourth', 'fifth', 'sixth'], ordered=False ) exp_unordered = pd.Series([2., 4., 6., 3., 1., 5.])