From 112737033457a766fb4d2fa402108970ce989bee Mon Sep 17 00:00:00 2001 From: tp Date: Thu, 12 Mar 2020 22:38:17 +0000 Subject: [PATCH 1/3] PERF: MultiIndex._shallow_copy --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/indexes/multi.py | 8 ++++++-- pandas/core/indexes/period.py | 6 ++++-- pandas/core/indexes/range.py | 6 ++++-- pandas/tests/indexes/common.py | 12 ++++++++++++ pandas/tests/indexes/multi/test_names.py | 1 + 6 files changed, 28 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5b6f70be478c2..48eff0543ad4d 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -190,7 +190,7 @@ Performance improvements - Performance improvement in flex arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=0`` (:issue:`31296`) - The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index, avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of - existing indexes (:issue:`28584`, :issue:`32640`) + existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0bb88145646ed..012ed5d7780c6 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -276,6 +276,7 @@ def __new__( raise ValueError("Must pass non-zero number of levels/codes") result = object.__new__(MultiIndex) + result._cache = {} # we've already validated levels and codes, so shortcut here result._set_levels(levels, copy=copy, validate=False) @@ -689,7 +690,7 @@ def __len__(self) -> int: # -------------------------------------------------------------------- # Levels Methods - @cache_readonly + @property def levels(self): # Use cache_readonly to ensure that self.get_locs doesn't repeatedly # create new IndexEngine @@ -991,7 +992,10 @@ def _shallow_copy(self, values=None, **kwargs): # discards freq kwargs.pop("freq", None) return MultiIndex.from_tuples(values, names=names, **kwargs) - return self.copy(**kwargs) + + result = self.copy(**kwargs) + result._cache = self._cache.copy() + return result def _shallow_copy_with_infer(self, values, **kwargs): # On equal MultiIndexes the difference is empty. diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 8ae792d3f63b5..f83234f1aac0b 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -251,11 +251,13 @@ def _has_complex_internals(self): def _shallow_copy(self, values=None, name: Label = no_default): name = name if name is not no_default else self.name - + cache = self._cache.copy() if values is None else {} if values is None: values = self._data - return self._simple_new(values, name=name) + result = self._simple_new(values, name=name) + result._cache = cache + return result def _maybe_convert_timedelta(self, other): """ diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index c21d8df2476b3..2c038564f4e6f 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -141,7 +141,7 @@ def _simple_new(cls, values: range, name: Label = None) -> "RangeIndex": result._range = values result.name = name - + result._cache = {} result._reset_identity() return result @@ -391,7 +391,9 @@ def _shallow_copy(self, values=None, name: Label = no_default): name = self.name if name is no_default else name if values is None: - return self._simple_new(self._range, name=name) + result = self._simple_new(self._range, name=name) + result._cache = self._cache.copy() + return result else: return Int64Index._simple_new(values, name=name) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index e5af0d9c03979..484f42aaf9594 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -919,3 +919,15 @@ def test_contains_requires_hashable_raises(self): with pytest.raises(TypeError): {} in idx._engine + + def test_shallow_copy_copies_cache(self): + # GH32669 + idx = self.create_index() + idx.get_loc(idx[0]) # initiate the _engine etc. + shallow_copy = idx._shallow_copy() + # check that the shallow_copied cache is a copy of the original + assert idx._cache == shallow_copy._cache + assert idx._cache is not shallow_copy._cache + # the cache values should reference the same objects + for key, val in idx._cache.items(): + assert shallow_copy._cache[key] is val diff --git a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py index 479b5ef0211a0..10b90f5c9fb7e 100644 --- a/pandas/tests/indexes/multi/test_names.py +++ b/pandas/tests/indexes/multi/test_names.py @@ -7,6 +7,7 @@ def check_level_names(index, names): assert [level.name for level in index.levels] == list(names) + assert index.names == list(names) def test_slice_keep_name(): From dcbcb1b41e239daaac60a0fe835a0279cc168240 Mon Sep 17 00:00:00 2001 From: tp Date: Fri, 13 Mar 2020 15:46:16 +0000 Subject: [PATCH 2/3] keep cache_readonly --- pandas/core/indexes/multi.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 012ed5d7780c6..26eb5938fe54e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -690,7 +690,7 @@ def __len__(self) -> int: # -------------------------------------------------------------------- # Levels Methods - @property + @cache_readonly def levels(self): # Use cache_readonly to ensure that self.get_locs doesn't repeatedly # create new IndexEngine @@ -994,7 +994,10 @@ def _shallow_copy(self, values=None, **kwargs): return MultiIndex.from_tuples(values, names=names, **kwargs) result = self.copy(**kwargs) - result._cache = self._cache.copy() + cache = self._cache.copy() + if "levels" in cache: + cache["levels"] = FrozenList(lev._shallow_copy() for lev in cache["levels"]) + result._cache = cache return result def _shallow_copy_with_infer(self, values, **kwargs): From 9c5a56c906bb1a11d5b719c4b80f5f23419a7f6b Mon Sep 17 00:00:00 2001 From: tp Date: Fri, 13 Mar 2020 21:11:19 +0000 Subject: [PATCH 3/3] better comments and tests --- pandas/core/indexes/multi.py | 8 ++++---- pandas/tests/indexes/common.py | 7 ++++--- pandas/tests/indexes/multi/test_names.py | 1 - 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 26eb5938fe54e..122097f4478d7 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -994,10 +994,10 @@ def _shallow_copy(self, values=None, **kwargs): return MultiIndex.from_tuples(values, names=names, **kwargs) result = self.copy(**kwargs) - cache = self._cache.copy() - if "levels" in cache: - cache["levels"] = FrozenList(lev._shallow_copy() for lev in cache["levels"]) - result._cache = cache + result._cache = self._cache.copy() + # GH32669 + if "levels" in result._cache: + del result._cache["levels"] return result def _shallow_copy_with_infer(self, values, **kwargs): diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 484f42aaf9594..c13385c135e9f 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -923,11 +923,12 @@ def test_contains_requires_hashable_raises(self): def test_shallow_copy_copies_cache(self): # GH32669 idx = self.create_index() - idx.get_loc(idx[0]) # initiate the _engine etc. + idx.get_loc(idx[0]) # populates the _cache. shallow_copy = idx._shallow_copy() + # check that the shallow_copied cache is a copy of the original assert idx._cache == shallow_copy._cache assert idx._cache is not shallow_copy._cache - # the cache values should reference the same objects + # cache values should reference the same object for key, val in idx._cache.items(): - assert shallow_copy._cache[key] is val + assert shallow_copy._cache[key] is val, key diff --git a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py index 10b90f5c9fb7e..479b5ef0211a0 100644 --- a/pandas/tests/indexes/multi/test_names.py +++ b/pandas/tests/indexes/multi/test_names.py @@ -7,7 +7,6 @@ def check_level_names(index, names): assert [level.name for level in index.levels] == list(names) - assert index.names == list(names) def test_slice_keep_name():