diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 47ebd962b367c..81e310a2c44f1 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -285,6 +285,8 @@ Performance improvements - ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`) - Performance improvement in :meth:`pd.to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`) - Performance improvement in setting values on a :class:`IntervalArray` (:issue:`36310`) +- The internal index method :meth:`~Index._shallow_copy` now makes the new index and original index share cached attributes, + avoiding creating these again, if created on either. This can speed up operations that depend on creating copies of existing indexes (:issue:`36840`) - Performance improvement in :meth:`RollingGroupby.count` (:issue:`35625`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d603797370ce3..4967e13a9855a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -561,12 +561,12 @@ def _shallow_copy(self, values=None, name: Label = no_default): name : Label, defaults to self.name """ name = self.name if name is no_default else name - cache = self._cache.copy() if values is None else {} - if values is None: - values = self._values - result = self._simple_new(values, name=name) - result._cache = cache + if values is not None: + return self._simple_new(values, name=name) + + result = self._simple_new(self._values, name=name) + result._cache = self._cache return result def is_(self, other) -> bool: diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 5128c644e6bcb..4440238dbd493 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -671,17 +671,15 @@ def _with_freq(self, freq): def _shallow_copy(self, values=None, name: Label = lib.no_default): name = self.name if name is lib.no_default else name - cache = self._cache.copy() if values is None else {} - if values is None: - values = self._data - - if isinstance(values, np.ndarray): + if values is not None: # TODO: We would rather not get here - values = type(self._data)(values, dtype=self.dtype) + if isinstance(values, np.ndarray): + values = type(self._data)(values, dtype=self.dtype) + return self._simple_new(values, name=name) - result = type(self)._simple_new(values, name=name) - result._cache = cache + result = self._simple_new(self._data, name=name) + result._cache = self._cache return result # -------------------------------------------------------------------- diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index a56f6a5bb0340..4a877621a94c2 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -335,12 +335,12 @@ def _shallow_copy( self, values: Optional[IntervalArray] = None, name: Label = lib.no_default ): name = self.name if name is lib.no_default else name - cache = self._cache.copy() if values is None else {} - if values is None: - values = self._data - result = self._simple_new(values, name=name) - result._cache = cache + if values is not None: + return self._simple_new(values, name=name) + + result = self._simple_new(self._data, name=name) + result._cache = self._cache return result @cache_readonly diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 27b60747015de..adf7a75b33b38 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -260,12 +260,12 @@ def _has_complex_internals(self) -> bool: def _shallow_copy(self, values=None, name: Label = no_default): name = name if name is not no_default else self.name - cache = self._cache.copy() if values is None else {} - if values is None: - values = self._data - result = self._simple_new(values, name=name) - result._cache = cache + if values is not None: + return self._simple_new(values, name=name) + + result = self._simple_new(self._data, name=name) + result._cache = self._cache return result def _maybe_convert_timedelta(self, other): diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 4dffda2605ef7..d5d9a9b5bc0a3 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -397,13 +397,13 @@ def __iter__(self): def _shallow_copy(self, values=None, name: Label = no_default): name = self.name if name is no_default else name - if values is None: - result = self._simple_new(self._range, name=name) - result._cache = self._cache.copy() - return result - else: + if values is not None: return Int64Index._simple_new(values, name=name) + result = self._simple_new(self._range, name=name) + result._cache = self._cache + return result + @doc(Int64Index.copy) def copy(self, name=None, deep=False, dtype=None, names=None): name = self._validate_names(name=name, names=names, deep=deep)[0] diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index b8468a5acf277..2dc2fe6d2ad07 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -128,7 +128,8 @@ def test_memory_usage(index_or_series_obj): ) if len(obj) == 0: - assert res_deep == res == 0 + expected = 0 if isinstance(obj, Index) else 80 + assert res_deep == res == expected elif is_object or is_categorical: # only deep will pick them up assert res_deep > res diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index c40f7b1bc2120..73d2e99d3ff5e 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -935,28 +935,22 @@ def test_contains_requires_hashable_raises(self): with pytest.raises(TypeError, match=msg): {} in idx._engine - def test_copy_copies_cache(self): - # GH32898 + def test_copy_shares_cache(self): + # GH32898, GH36840 idx = self.create_index() idx.get_loc(idx[0]) # populates the _cache. copy = idx.copy() - # check that the copied cache is a copy of the original - assert idx._cache == copy._cache - assert idx._cache is not copy._cache - # cache values should reference the same object - for key, val in idx._cache.items(): - assert copy._cache[key] is val, key + assert copy._cache is idx._cache - def test_shallow_copy_copies_cache(self): - # GH32669 + def test_shallow_copy_shares_cache(self): + # GH32669, GH36840 idx = self.create_index() idx.get_loc(idx[0]) # populates the _cache. shallow_copy = idx._shallow_copy() - # check that the shallow_copied cache is a copy of the original - assert idx._cache == shallow_copy._cache - assert idx._cache is not shallow_copy._cache - # cache values should reference the same object - for key, val in idx._cache.items(): - assert shallow_copy._cache[key] is val, key + assert shallow_copy._cache is idx._cache + + shallow_copy = idx._shallow_copy(idx._data) + assert shallow_copy._cache is not idx._cache + assert shallow_copy._cache == {}