From 50322fe183a9629f93a31093c6c83f4fb136e8f7 Mon Sep 17 00:00:00 2001 From: Gianluca Ficarelli Date: Tue, 23 Apr 2024 17:08:12 +0200 Subject: [PATCH 1/2] PERF: MultiIndex.memory_usage shouldn't trigger the index engine Ignore the index engine when it isn't already cached. --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/indexes/base.py | 5 +++-- pandas/core/indexes/multi.py | 5 +++-- pandas/tests/base/test_misc.py | 25 +++++++++++++++++++++++++ 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 781b3b2282a87..ffc16f1e365ea 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -345,6 +345,7 @@ Performance improvements - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) - Performance improvement in unary methods on a :class:`RangeIndex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57825`) +- Performance improvement in :meth:`MultiIndex.memory_usage` to ignore the index engine when it isn't already cached. (:issue:`58385`) .. --------------------------------------------------------------------------- .. _whatsnew_300.bug_fixes: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5654111132b5e..e08b585920779 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4863,8 +4863,9 @@ def _from_join_target(self, result: np.ndarray) -> ArrayLike: def memory_usage(self, deep: bool = False) -> int: result = self._memory_usage(deep=deep) - # include our engine hashtable - result += self._engine.sizeof(deep=deep) + # include our engine hashtable, only if it's already cached + if "_engine" in self._cache: + result += self._engine.sizeof(deep=deep) return result @final diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 21ce9b759f2df..c8e16fad00d5b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1391,8 +1391,9 @@ def _nbytes(self, deep: bool = False) -> int: names_nbytes = sum(getsizeof(i, objsize) for i in self.names) result = level_nbytes + label_nbytes + names_nbytes - # include our engine hashtable - result += self._engine.sizeof(deep=deep) + # include our engine hashtable, only if it's already cached + if "_engine" in self._cache: + result += self._engine.sizeof(deep=deep) return result # -------------------------------------------------------------------- diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index f6a4396ca5be0..078ed7b0d3ef9 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -142,6 +142,31 @@ def test_memory_usage_components_narrow_series(any_real_numpy_dtype): assert total_usage == non_index_usage + index_usage +def test_memory_usage_doesnt_trigger_engine(index): + index._cache.clear() + assert "_engine" not in index._cache + + res_without_engine = index.memory_usage() + assert "_engine" not in index._cache + + # explicitly load and cache the engine + _ = index._engine + assert "_engine" in index._cache + + res_with_engine = index.memory_usage() + + # the engine doesn't affect the result even when initialized with values, because + # index._engine.sizeof() doesn't consider the content of index._engine.values + assert res_with_engine == res_without_engine + + if len(index) == 0: + assert res_without_engine == 0 + assert res_with_engine == 0 + else: + assert res_without_engine > 0 + assert res_with_engine > 0 + + def test_searchsorted(request, index_or_series_obj): # numpy.searchsorted calls obj.searchsorted under the hood. # See gh-12238 From 2b42ea0d7bc8d432bd1a57099509f3a9e4d2ad81 Mon Sep 17 00:00:00 2001 From: Gianluca Ficarelli Date: Wed, 24 Apr 2024 09:16:08 +0200 Subject: [PATCH 2/2] Move test, sort whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/tests/base/test_misc.py | 25 ------------------------- pandas/tests/indexes/test_old_base.py | 24 ++++++++++++++++++++++++ 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ffc16f1e365ea..723bb6a12060c 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -332,6 +332,7 @@ Performance improvements - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`) - Performance improvement in :meth:`Index.to_frame` returning a :class:`RangeIndex` columns of a :class:`Index` when possible. (:issue:`58018`) - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) +- Performance improvement in :meth:`MultiIndex.memory_usage` to ignore the index engine when it isn't already cached. (:issue:`58385`) - Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`) - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) - Performance improvement in :meth:`RangeIndex.argmin` and :meth:`RangeIndex.argmax` (:issue:`57823`) @@ -345,7 +346,6 @@ Performance improvements - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) - Performance improvement in unary methods on a :class:`RangeIndex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57825`) -- Performance improvement in :meth:`MultiIndex.memory_usage` to ignore the index engine when it isn't already cached. (:issue:`58385`) .. --------------------------------------------------------------------------- .. _whatsnew_300.bug_fixes: diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 078ed7b0d3ef9..f6a4396ca5be0 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -142,31 +142,6 @@ def test_memory_usage_components_narrow_series(any_real_numpy_dtype): assert total_usage == non_index_usage + index_usage -def test_memory_usage_doesnt_trigger_engine(index): - index._cache.clear() - assert "_engine" not in index._cache - - res_without_engine = index.memory_usage() - assert "_engine" not in index._cache - - # explicitly load and cache the engine - _ = index._engine - assert "_engine" in index._cache - - res_with_engine = index.memory_usage() - - # the engine doesn't affect the result even when initialized with values, because - # index._engine.sizeof() doesn't consider the content of index._engine.values - assert res_with_engine == res_without_engine - - if len(index) == 0: - assert res_without_engine == 0 - assert res_with_engine == 0 - else: - assert res_without_engine > 0 - assert res_with_engine > 0 - - def test_searchsorted(request, index_or_series_obj): # numpy.searchsorted calls obj.searchsorted under the hood. # See gh-12238 diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 9b4470021cc1d..b929616c814ee 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -326,6 +326,30 @@ def test_memory_usage(self, index): if index.inferred_type == "object": assert result3 > result2 + def test_memory_usage_doesnt_trigger_engine(self, index): + index._cache.clear() + assert "_engine" not in index._cache + + res_without_engine = index.memory_usage() + assert "_engine" not in index._cache + + # explicitly load and cache the engine + _ = index._engine + assert "_engine" in index._cache + + res_with_engine = index.memory_usage() + + # the empty engine doesn't affect the result even when initialized with values, + # because engine.sizeof() doesn't consider the content of engine.values + assert res_with_engine == res_without_engine + + if len(index) == 0: + assert res_without_engine == 0 + assert res_with_engine == 0 + else: + assert res_without_engine > 0 + assert res_with_engine > 0 + def test_argsort(self, index): if isinstance(index, CategoricalIndex): pytest.skip(f"{type(self).__name__} separately tested")