Skip to content

Commit 44322d1

Browse files
qwhelan authored and TomAugspurger committed
PERF: significantly improve performance of MultiIndex.shape (#27384)
* PERF: significantly improve performance of MultiIndex.shape
* BENCH: add benchmarks for cached Index properties
1 parent f1b9fc1 commit 44322d1

File tree

4 files changed

+83
-5
lines changed

4 files changed

+83
-5
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import pandas as pd
2+
3+
4+
class IndexCache:
    """
    Benchmarks for uncached reads of Index attributes.

    ``setup`` builds an index of the requested type and then replaces its
    ``_cache`` with an empty dict; every ``time_*`` method performs a single
    attribute read on that index.
    """

    # asv timing controls. NOTE(review): ``number = 1`` presumably keeps each
    # sample to a single call so that only the first read after the cache
    # reset in ``setup`` is measured -- confirm against the asv documentation.
    number = 1
    repeat = (3, 100, 20)

    # A single parameter axis: the name of the index flavour to build.
    params = [
        [
            "DatetimeIndex",
            "Float64Index",
            "IntervalIndex",
            "Int64Index",
            "MultiIndex",
            "PeriodIndex",
            "RangeIndex",
            "TimedeltaIndex",
            "UInt64Index",
        ]
    ]
    param_names = ["index_type"]

    def setup(self, index_type):
        """
        Build ``self.idx`` for ``index_type`` and empty its ``_cache``.

        Raises
        ------
        ValueError
            If ``index_type`` is not one of the supported names.
        """
        size = 10 ** 5

        # Constructors are wrapped in lambdas so that only the requested
        # index is actually built.
        builders = {
            "MultiIndex": lambda: pd.MultiIndex.from_product(
                [pd.date_range("1/1/2000", freq="T", periods=size // 2), ["a", "b"]]
            ),
            "DatetimeIndex": lambda: pd.date_range(
                "1/1/2000", freq="T", periods=size
            ),
            "Int64Index": lambda: pd.Index(range(size)),
            "PeriodIndex": lambda: pd.period_range(
                "1/1/2000", freq="T", periods=size
            ),
            "RangeIndex": lambda: pd.RangeIndex(start=0, stop=size),
            "IntervalIndex": lambda: pd.IntervalIndex.from_arrays(
                range(size), range(1, size + 1)
            ),
            "TimedeltaIndex": lambda: pd.TimedeltaIndex(range(size)),
            "Float64Index": lambda: pd.Float64Index(range(size)),
            "UInt64Index": lambda: pd.UInt64Index(range(size)),
        }

        try:
            build = builders[index_type]
        except (KeyError, TypeError):
            # Unknown (or unhashable) type name: same bare error as before.
            raise ValueError from None

        self.idx = build()
        # Every flavour must end up with exactly ``size`` entries.
        assert len(self.idx) == size
        # Start each benchmark from an empty cache.
        self.idx._cache = {}

    def time_values(self, index_type):
        """Time a read of ``_values``."""
        self.idx._values

    def time_shape(self, index_type):
        """Time a read of ``shape``."""
        self.idx.shape

    def time_is_monotonic(self, index_type):
        """Time a read of ``is_monotonic``."""
        self.idx.is_monotonic

    def time_is_monotonic_decreasing(self, index_type):
        """Time a read of ``is_monotonic_decreasing``."""
        self.idx.is_monotonic_decreasing

    def time_is_monotonic_increasing(self, index_type):
        """Time a read of ``is_monotonic_increasing``."""
        self.idx.is_monotonic_increasing

    def time_is_unique(self, index_type):
        """Time a read of ``is_unique``."""
        self.idx.is_unique

    def time_engine(self, index_type):
        """Time a read of ``_engine``."""
        self.idx._engine

    def time_inferred_type(self, index_type):
        """Time a read of ``inferred_type``."""
        self.idx.inferred_type

    def time_is_all_dates(self, index_type):
        """Time a read of ``is_all_dates``."""
        self.idx.is_all_dates

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1003,6 +1003,7 @@ Performance improvements
10031003
- For :meth:`to_datetime` changed default value of cache parameter to ``True`` (:issue:`26043`)
10041004
- Improved performance of :class:`DatetimeIndex` and :class:`PeriodIndex` slicing given non-unique, monotonic data (:issue:`27136`).
10051005
- Improved performance of :meth:`pd.read_json` for index-oriented data. (:issue:`26773`)
1006+
- Improved performance of :meth:`MultiIndex.shape` (:issue:`27384`).
10061007
10071008
.. _whatsnew_0250.bug_fixes:
10081009

pandas/core/indexes/base.py

+7
Original file line numberDiff line numberDiff line change
@@ -5640,6 +5640,13 @@ def _add_logical_methods_disabled(cls):
56405640
cls.all = make_invalid_op("all")
56415641
cls.any = make_invalid_op("any")
56425642

5643+
@property
def shape(self):
    """
    Return the shape of the underlying data as a tuple.

    The result is a one-element tuple whose only entry is ``len(self)``.
    """
    n_entries = len(self)
    return (n_entries,)
5649+
56435650

56445651
Index._add_numeric_methods_disabled()
56455652
Index._add_logical_methods()

pandas/core/indexes/interval.py

-5
Original file line numberDiff line numberDiff line change
@@ -405,11 +405,6 @@ def size(self):
405405
# Avoid materializing ndarray[Interval]
406406
return self._data.size
407407

408-
@property
409-
def shape(self):
410-
# Avoid materializing ndarray[Interval]: report the shape of the backing ``_data`` instead.
411-
return self._data.shape
412-
413408
@property
414409
def itemsize(self):
415410
msg = (

0 commit comments

Comments
 (0)