Skip to content

Commit ad38544

Browse files
committed
PERF: GH15498 - asv tests and whatsnew
1 parent 1ebdb56 commit ad38544

File tree

4 files changed

+61
-10
lines changed

4 files changed

+61
-10
lines changed

asv_bench/benchmarks/categoricals.py

+43
Original file line numberDiff line numberDiff line change
@@ -63,3 +63,46 @@ def time_value_counts_dropna(self):
6363

6464
def time_rendering(self):
6565
str(self.sel)
66+
67+
68+
class Categoricals3(object):
    """Benchmarks that time ``Series.rank()`` on categorical Series.

    ``setup`` builds seven Series: ``s1`` from a generated categorical
    index, and an unordered/ordered pair for each of the generated
    datetime (``s2``/``s2o``), float (``s3``/``s3o``) and integer
    (``s4``/``s4o``) indexes.  Every ``time_*`` method ranks exactly one
    of them.
    """

    goal_time = 0.2

    def setup(self):
        """Create the categorical Series ranked by the ``time_*`` methods."""
        size = 100
        testing = pd.util.testing

        # s1 wraps the generated CategoricalIndex directly; no astype() call.
        self.s1 = pd.Series(testing.makeCategoricalIndex(size))

        # For the remaining dtypes the generated index serves both as the
        # data and as the explicit list of categories.  The plain attribute
        # holds the default-ordering cast, the "o" suffix the ordered one.
        makers = (
            ('s2', testing.makeDateIndex),
            ('s3', testing.makeFloatIndex),
            ('s4', testing.makeIntIndex),
        )
        for attr, make_index in makers:
            index = make_index(size)
            setattr(self, attr,
                    pd.Series(index).astype('category', categories=index))
            setattr(self, attr + 'o',
                    pd.Series(index).astype('category', categories=index,
                                            ordered=True))

    def time_rank_string_unordered(self):
        """Rank ``s1`` (built from the categorical index)."""
        self.s1.rank()

    def time_rank_dt_unordered(self):
        """Rank ``s2`` (datetime categories, cast without ``ordered``)."""
        self.s2.rank()

    def time_rank_dt_ordered(self):
        """Rank ``s2o`` (datetime categories, ``ordered=True``)."""
        self.s2o.rank()

    def time_rank_float_unordered(self):
        """Rank ``s3`` (float categories, cast without ``ordered``)."""
        self.s3.rank()

    def time_rank_float_ordered(self):
        """Rank ``s3o`` (float categories, ``ordered=True``)."""
        self.s3o.rank()

    def time_rank_int_unordered(self):
        """Rank ``s4`` (integer categories, cast without ``ordered``)."""
        self.s4.rank()

    def time_rank_int_ordered(self):
        """Rank ``s4o`` (integer categories, ``ordered=True``)."""
        self.s4o.rank()

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -562,6 +562,7 @@ Performance Improvements
562562
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)
563563
- Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`)
564564
- When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. (:issue:`14947`)
565+
- Improved performance of ``rank()`` for categorical data (:issue:`15498`)
565566

566567

567568

pandas/core/categorical.py

+2
Original file line numberDiff line numberDiff line change
@@ -1426,6 +1426,8 @@ def _values_for_rank(self):
14261426
elif self.categories.is_numeric():
14271427
values = np.array(self)
14281428
else:
1429+
# reorder the categories (so rank can use the float codes)
1430+
# instead of passing an object array to rank
14291431
values = np.array(
14301432
self.rename_categories(Series(self.categories).rank())
14311433
)

pandas/tests/series/test_analytics.py

+15-10
Original file line numberDiff line numberDiff line change
@@ -1065,8 +1065,9 @@ def test_rank_categorical(self):
10651065
exp_desc = pd.Series([6., 5., 4., 3., 2., 1.])
10661066
ordered = pd.Series(
10671067
['first', 'second', 'third', 'fourth', 'fifth', 'sixth']
1068-
).astype('category', ).cat.set_categories(
1069-
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
1068+
).astype(
1069+
'category',
1070+
categories=['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
10701071
ordered=True
10711072
)
10721073
assert_series_equal(ordered.rank(), exp)
@@ -1075,8 +1076,9 @@ def test_rank_categorical(self):
10751076
# Unordered categoricals should be ranked as objects
10761077
unordered = pd.Series(
10771078
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
1078-
).astype('category').cat.set_categories(
1079-
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
1079+
).astype(
1080+
'category',
1081+
categories=['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
10801082
ordered=False
10811083
)
10821084
exp_unordered = pd.Series([2., 4., 6., 3., 1., 5.])
@@ -1085,8 +1087,9 @@ def test_rank_categorical(self):
10851087

10861088
unordered1 = pd.Series(
10871089
[1, 2, 3, 4, 5, 6],
1088-
).astype('category').cat.set_categories(
1089-
[1, 2, 3, 4, 5, 6],
1090+
).astype(
1091+
'category',
1092+
categories=[1, 2, 3, 4, 5, 6],
10901093
ordered=False
10911094
)
10921095
exp_unordered1 = pd.Series([1., 2., 3., 4., 5., 6.])
@@ -1096,8 +1099,9 @@ def test_rank_categorical(self):
10961099
# Test na_option for rank data
10971100
na_ser = pd.Series(
10981101
['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN]
1099-
).astype('category', ).cat.set_categories(
1100-
[
1102+
).astype(
1103+
'category',
1104+
categories=[
11011105
'first', 'second', 'third', 'fourth',
11021106
'fifth', 'sixth', 'seventh'
11031107
],
@@ -1133,8 +1137,9 @@ def test_rank_categorical(self):
11331137
# Test with pct=True
11341138
na_ser = pd.Series(
11351139
['first', 'second', 'third', 'fourth', np.NaN],
1136-
).astype('category').cat.set_categories(
1137-
['first', 'second', 'third', 'fourth'],
1140+
).astype(
1141+
'category',
1142+
categories=['first', 'second', 'third', 'fourth'],
11381143
ordered=True
11391144
)
11401145
exp_top = pd.Series([0.4, 0.6, 0.8, 1., 0.2])

0 commit comments

Comments
 (0)