From 7e0c3170c5f20b224226f58e244e01d2a6e72801 Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 9 Mar 2020 22:02:59 +0000 Subject: [PATCH 1/2] PERF: copy cached attributes on index shallow_copy --- doc/source/whatsnew/v1.1.0.rst | 4 +++- pandas/core/indexes/base.py | 12 ++++++++---- pandas/core/indexes/numeric.py | 8 ++------ 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index e745bf3f5feed..f20c3df027fba 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -187,7 +187,9 @@ Performance improvements - Performance improvement in :class:`Timedelta` constructor (:issue:`30543`) - Performance improvement in :class:`Timestamp` constructor (:issue:`30543`) - Performance improvement in flex arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=0`` (:issue:`31296`) -- +- The internal :meth:`Index._shallow_copy` now copies cached attributes over to the new index, + avoiding creating these again on the new index. This can speed up many operations + that depend on creating copies of existing indexes (:issue:`28584`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3eab757311ccb..0dc5440259999 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,7 +1,7 @@ from datetime import datetime import operator from textwrap import dedent -from typing import TYPE_CHECKING, Any, FrozenSet, Hashable, Union +from typing import TYPE_CHECKING, Any, Dict, FrozenSet, Hashable, Union import warnings import numpy as np @@ -250,6 +250,7 @@ def _outer_indexer(self, left, right): _typ = "index" _data: Union[ExtensionArray, np.ndarray] + _cache: Dict[str, Any] _id = None _name: Label = None # MultiIndex.levels previously allowed setting the index name. We @@ -468,6 +469,7 @@ def _simple_new(cls, values, name: Label = None): # we actually set this value too. result._index_data = values result._name = name + result._cache = {} return result._reset_identity() @@ -499,10 +501,12 @@ def _shallow_copy(self, values=None, name: Label = no_default): """ name = self.name if name is no_default else name - if values is None: - values = self.values + cache = self._cache if values is None else {} + values = self.values if values is None else values - return self._simple_new(values, name=name) + result = self._simple_new(values, name=name) + result._cache = cache + return result def _shallow_copy_with_infer(self, values, **kwargs): """ diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 6c250ccd09a51..2d1d69772c100 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -104,15 +104,11 @@ def _maybe_cast_slice_bound(self, label, side, kind): @Appender(Index._shallow_copy.__doc__) def _shallow_copy(self, values=None, name: Label = lib.no_default): - name = name if name is not lib.no_default else self.name - if values is not None and not self._can_hold_na and values.dtype.kind == "f": + name = self.name if name is lib.no_default else name # Ensure we are not returning an Int64Index with float data: return Float64Index._simple_new(values, name=name) - - if values is None: - values = self.values - return type(self)._simple_new(values, name=name) + return super()._shallow_copy(values=values, name=name) def _convert_for_op(self, value): """ From 635a73e89a6dc0a902087fb77bd5e34baeb012cb Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 9 Mar 2020 23:10:08 +0000 Subject: [PATCH 2/2] make new cache instead of referencing the old cache --- pandas/core/indexes/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 0dc5440259999..17a1827fda027 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -500,9 +500,9 @@ def _shallow_copy(self, values=None, name: Label = no_default): name : Label, defaults to self.name """ name = self.name if name is no_default else name - - cache = self._cache if values is None else {} - values = self.values if values is None else values + cache = self._cache.copy() if values is None else {} + if values is None: + values = self.values result = self._simple_new(values, name=name) result._cache = cache