Skip to content

Commit 1c106c8

Browse files
jeet63 authored and jreback committed
PERF: Rank categorical perf
Closes pandas-dev#15498.

Author: Prasanjit Prakash <[email protected]>

Closes pandas-dev#15518 from ikilledthecat/rank_categorical_perf and squashes the following commits:

- 30b49b9 [Prasanjit Prakash] PERF: GH15498 - pep8 changes
- ad38544 [Prasanjit Prakash] PERF: GH15498 - asv tests and whatsnew
- 1ebdb56 [Prasanjit Prakash] PERF: categorical rank GH#15498
- a67cd85 [Prasanjit Prakash] PERF: categorical rank GH#15498
- 81df7df [Prasanjit Prakash] PERF: categorical rank GH#15498
- 45dd125 [Prasanjit Prakash] PERF: categorical rank GH#15498
- 33249b3 [Prasanjit Prakash] PERF: categorical rank GH#15498
1 parent 2340fb8 commit 1c106c8

File tree

5 files changed

+69
-9
lines changed

5 files changed

+69
-9
lines changed

asv_bench/benchmarks/categoricals.py

+34
Original file line numberDiff line numberDiff line change
@@ -63,3 +63,37 @@ def time_value_counts_dropna(self):
6363

6464
def time_rendering(self):
6565
str(self.sel)
66+
67+
68+
class Categoricals3(object):
    """Benchmarks for ``rank()`` on plain and categorical Series.

    ``setup`` builds two base series -- one from the values of a generated
    categorical index, one of random integers -- plus an unordered and an
    ordered categorical version of each.  Every ``time_rank_*`` method ranks
    exactly one of those six series.
    """

    goal_time = 0.2

    def setup(self):
        """Create the six series that the ``time_rank_*`` methods rank."""
        n_rows = 100000
        n_categories = 100

        # Values of a generated categorical index, held as a plain array
        # (presumably strings, per the ``time_rank_string*`` method names).
        index_values = np.array(tm.makeCategoricalIndex(n_rows, n_categories))
        self.s1 = Series(index_values)
        self.s1_cat = self.s1.astype('category')
        self.s1_cat_ordered = self.s1.astype('category', ordered=True)

        # Random integers drawn from [0, n_categories).
        int_values = np.random.randint(0, n_categories, size=n_rows)
        self.s2 = Series(int_values)
        self.s2_cat = self.s2.astype('category')
        self.s2_cat_ordered = self.s2.astype('category', ordered=True)

    def time_rank_string(self):
        """Rank the non-categorical series built from the index values."""
        self.s1.rank()

    def time_rank_string_cat(self):
        """Rank the unordered categorical version of ``s1``."""
        self.s1_cat.rank()

    def time_rank_string_cat_ordered(self):
        """Rank the ordered categorical version of ``s1``."""
        self.s1_cat_ordered.rank()

    def time_rank_int(self):
        """Rank the non-categorical integer series."""
        self.s2.rank()

    def time_rank_int_cat(self):
        """Rank the unordered categorical version of ``s2``."""
        self.s2_cat.rank()

    def time_rank_int_cat_ordered(self):
        """Rank the ordered categorical version of ``s2``."""
        self.s2_cat_ordered.rank()

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -562,6 +562,7 @@ Performance Improvements
562562
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)
563563
- Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`)
564564
- When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. (:issue:`14947`)
565+
- Improved performance of ``rank()`` for categorical data (:issue:`15498`)
565566

566567

567568

pandas/core/algorithms.py

+1
Original file line numberDiff line numberDiff line change
@@ -992,6 +992,7 @@ def _get_data_algo(values, func_map):
992992
elif is_unsigned_integer_dtype(values):
993993
f = func_map['uint64']
994994
values = _ensure_uint64(values)
995+
995996
else:
996997
values = _ensure_object(values)
997998

pandas/core/categorical.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -1416,14 +1416,21 @@ def _values_for_rank(self):
14161416
numpy array
14171417
14181418
"""
1419+
from pandas import Series
14191420
if self.ordered:
14201421
values = self.codes
14211422
mask = values == -1
14221423
if mask.any():
14231424
values = values.astype('float64')
14241425
values[mask] = np.nan
1425-
else:
1426+
elif self.categories.is_numeric():
14261427
values = np.array(self)
1428+
else:
1429+
# reorder the categories (so rank can use the float codes)
1430+
# instead of passing an object array to rank
1431+
values = np.array(
1432+
self.rename_categories(Series(self.categories).rank())
1433+
)
14271434
return values
14281435

14291436
def order(self, inplace=False, ascending=True, na_position='last'):

pandas/tests/series/test_analytics.py

+25-8
Original file line numberDiff line numberDiff line change
@@ -1065,8 +1065,10 @@ def test_rank_categorical(self):
10651065
exp_desc = pd.Series([6., 5., 4., 3., 2., 1.])
10661066
ordered = pd.Series(
10671067
['first', 'second', 'third', 'fourth', 'fifth', 'sixth']
1068-
).astype('category', ).cat.set_categories(
1069-
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
1068+
).astype(
1069+
'category',
1070+
categories=['first', 'second', 'third',
1071+
'fourth', 'fifth', 'sixth'],
10701072
ordered=True
10711073
)
10721074
assert_series_equal(ordered.rank(), exp)
@@ -1075,19 +1077,33 @@ def test_rank_categorical(self):
10751077
# Unordered categoricals should be ranked as objects
10761078
unordered = pd.Series(
10771079
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
1078-
).astype('category').cat.set_categories(
1079-
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
1080+
).astype(
1081+
'category',
1082+
categories=['first', 'second', 'third',
1083+
'fourth', 'fifth', 'sixth'],
10801084
ordered=False
10811085
)
10821086
exp_unordered = pd.Series([2., 4., 6., 3., 1., 5.])
10831087
res = unordered.rank()
10841088
assert_series_equal(res, exp_unordered)
10851089

1090+
unordered1 = pd.Series(
1091+
[1, 2, 3, 4, 5, 6],
1092+
).astype(
1093+
'category',
1094+
categories=[1, 2, 3, 4, 5, 6],
1095+
ordered=False
1096+
)
1097+
exp_unordered1 = pd.Series([1., 2., 3., 4., 5., 6.])
1098+
res1 = unordered1.rank()
1099+
assert_series_equal(res1, exp_unordered1)
1100+
10861101
# Test na_option for rank data
10871102
na_ser = pd.Series(
10881103
['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN]
1089-
).astype('category', ).cat.set_categories(
1090-
[
1104+
).astype(
1105+
'category',
1106+
categories=[
10911107
'first', 'second', 'third', 'fourth',
10921108
'fifth', 'sixth', 'seventh'
10931109
],
@@ -1123,8 +1139,9 @@ def test_rank_categorical(self):
11231139
# Test with pct=True
11241140
na_ser = pd.Series(
11251141
['first', 'second', 'third', 'fourth', np.NaN],
1126-
).astype('category').cat.set_categories(
1127-
['first', 'second', 'third', 'fourth'],
1142+
).astype(
1143+
'category',
1144+
categories=['first', 'second', 'third', 'fourth'],
11281145
ordered=True
11291146
)
11301147
exp_top = pd.Series([0.4, 0.6, 0.8, 1., 0.2])

0 commit comments

Comments
 (0)