From 50322fe183a9629f93a31093c6c83f4fb136e8f7 Mon Sep 17 00:00:00 2001
From: Gianluca Ficarelli <gianluca.ficarelli@epfl.ch>
Date: Tue, 23 Apr 2024 17:08:12 +0200
Subject: [PATCH 1/2] PERF: MultiIndex.memory_usage shouldn't trigger the index
 engine

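Only add the engine's size to the result when the engine is already cached, so
that calling memory_usage no longer builds the index engine as a side effect.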
---
 doc/source/whatsnew/v3.0.0.rst |  1 +
 pandas/core/indexes/base.py    |  5 +++--
 pandas/core/indexes/multi.py   |  5 +++--
 pandas/tests/base/test_misc.py | 25 +++++++++++++++++++++++++
 4 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 781b3b2282a87..ffc16f1e365ea 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -345,6 +345,7 @@ Performance improvements
 - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
 - Performance improvement in indexing operations for string dtypes (:issue:`56997`)
 - Performance improvement in unary methods on a :class:`RangeIndex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57825`)
+- Performance improvement in :meth:`MultiIndex.memory_usage` to ignore the index engine when it isn't already cached. (:issue:`58385`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_300.bug_fixes:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 5654111132b5e..e08b585920779 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -4863,8 +4863,9 @@ def _from_join_target(self, result: np.ndarray) -> ArrayLike:
     def memory_usage(self, deep: bool = False) -> int:
         result = self._memory_usage(deep=deep)
 
-        # include our engine hashtable
-        result += self._engine.sizeof(deep=deep)
+        # include our engine hashtable, only if it's already cached
+        if "_engine" in self._cache:
+            result += self._engine.sizeof(deep=deep)
         return result
 
     @final
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 21ce9b759f2df..c8e16fad00d5b 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -1391,8 +1391,9 @@ def _nbytes(self, deep: bool = False) -> int:
         names_nbytes = sum(getsizeof(i, objsize) for i in self.names)
         result = level_nbytes + label_nbytes + names_nbytes
 
-        # include our engine hashtable
-        result += self._engine.sizeof(deep=deep)
+        # include our engine hashtable, only if it's already cached
+        if "_engine" in self._cache:
+            result += self._engine.sizeof(deep=deep)
         return result
 
     # --------------------------------------------------------------------
diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py
index f6a4396ca5be0..078ed7b0d3ef9 100644
--- a/pandas/tests/base/test_misc.py
+++ b/pandas/tests/base/test_misc.py
@@ -142,6 +142,31 @@ def test_memory_usage_components_narrow_series(any_real_numpy_dtype):
     assert total_usage == non_index_usage + index_usage
 
 
+def test_memory_usage_doesnt_trigger_engine(index):
+    index._cache.clear()
+    assert "_engine" not in index._cache
+
+    res_without_engine = index.memory_usage()
+    assert "_engine" not in index._cache
+
+    # explicitly load and cache the engine
+    _ = index._engine
+    assert "_engine" in index._cache
+
+    res_with_engine = index.memory_usage()
+
+    # the engine doesn't affect the result even when initialized with values, because
+    # index._engine.sizeof() doesn't consider the content of index._engine.values
+    assert res_with_engine == res_without_engine
+
+    if len(index) == 0:
+        assert res_without_engine == 0
+        assert res_with_engine == 0
+    else:
+        assert res_without_engine > 0
+        assert res_with_engine > 0
+
+
 def test_searchsorted(request, index_or_series_obj):
     # numpy.searchsorted calls obj.searchsorted under the hood.
     # See gh-12238

From 2b42ea0d7bc8d432bd1a57099509f3a9e4d2ad81 Mon Sep 17 00:00:00 2001
From: Gianluca Ficarelli <gianluca.ficarelli@epfl.ch>
Date: Wed, 24 Apr 2024 09:16:08 +0200
Subject: [PATCH 2/2] Move test, sort whatsnew

---
 doc/source/whatsnew/v3.0.0.rst        |  2 +-
 pandas/tests/base/test_misc.py        | 25 -------------------------
 pandas/tests/indexes/test_old_base.py | 24 ++++++++++++++++++++++++
 3 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index ffc16f1e365ea..723bb6a12060c 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -332,6 +332,7 @@ Performance improvements
 - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
 - Performance improvement in :meth:`Index.to_frame` returning a :class:`RangeIndex` columns of a :class:`Index` when possible. (:issue:`58018`)
 - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`)
+- Performance improvement in :meth:`MultiIndex.memory_usage` to ignore the index engine when it isn't already cached. (:issue:`58385`)
 - Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`)
 - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`)
 - Performance improvement in :meth:`RangeIndex.argmin` and :meth:`RangeIndex.argmax` (:issue:`57823`)
@@ -345,7 +346,6 @@ Performance improvements
 - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
 - Performance improvement in indexing operations for string dtypes (:issue:`56997`)
 - Performance improvement in unary methods on a :class:`RangeIndex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57825`)
-- Performance improvement in :meth:`MultiIndex.memory_usage` to ignore the index engine when it isn't already cached. (:issue:`58385`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_300.bug_fixes:
diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py
index 078ed7b0d3ef9..f6a4396ca5be0 100644
--- a/pandas/tests/base/test_misc.py
+++ b/pandas/tests/base/test_misc.py
@@ -142,31 +142,6 @@ def test_memory_usage_components_narrow_series(any_real_numpy_dtype):
     assert total_usage == non_index_usage + index_usage
 
 
-def test_memory_usage_doesnt_trigger_engine(index):
-    index._cache.clear()
-    assert "_engine" not in index._cache
-
-    res_without_engine = index.memory_usage()
-    assert "_engine" not in index._cache
-
-    # explicitly load and cache the engine
-    _ = index._engine
-    assert "_engine" in index._cache
-
-    res_with_engine = index.memory_usage()
-
-    # the engine doesn't affect the result even when initialized with values, because
-    # index._engine.sizeof() doesn't consider the content of index._engine.values
-    assert res_with_engine == res_without_engine
-
-    if len(index) == 0:
-        assert res_without_engine == 0
-        assert res_with_engine == 0
-    else:
-        assert res_without_engine > 0
-        assert res_with_engine > 0
-
-
 def test_searchsorted(request, index_or_series_obj):
     # numpy.searchsorted calls obj.searchsorted under the hood.
     # See gh-12238
diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py
index 9b4470021cc1d..b929616c814ee 100644
--- a/pandas/tests/indexes/test_old_base.py
+++ b/pandas/tests/indexes/test_old_base.py
@@ -326,6 +326,30 @@ def test_memory_usage(self, index):
         if index.inferred_type == "object":
             assert result3 > result2
 
+    def test_memory_usage_doesnt_trigger_engine(self, index):
+        index._cache.clear()
+        assert "_engine" not in index._cache
+
+        res_without_engine = index.memory_usage()
+        assert "_engine" not in index._cache
+
+        # explicitly load and cache the engine
+        _ = index._engine
+        assert "_engine" in index._cache
+
+        res_with_engine = index.memory_usage()
+
+        # the freshly cached, still-empty engine doesn't affect the result even though
+        # it is initialized with values, because engine.sizeof() ignores engine.values
+        assert res_with_engine == res_without_engine
+
+        if len(index) == 0:
+            assert res_without_engine == 0
+            assert res_with_engine == 0
+        else:
+            assert res_without_engine > 0
+            assert res_with_engine > 0
+
     def test_argsort(self, index):
         if isinstance(index, CategoricalIndex):
             pytest.skip(f"{type(self).__name__} separately tested")