From 4558947094ca4bfae4fe091c53f128af69f802c6 Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 11 Mar 2020 23:20:13 +0000 Subject: [PATCH 1/2] PERF: _shallow_copy copies cache --- doc/source/whatsnew/v1.1.0.rst | 6 +++--- pandas/core/indexes/category.py | 12 ++++-------- pandas/core/indexes/datetimelike.py | 5 ++++- pandas/core/indexes/datetimes.py | 1 + pandas/core/indexes/interval.py | 12 ++++++++---- pandas/core/indexes/period.py | 1 + pandas/core/indexes/timedeltas.py | 1 + 7 files changed, 22 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c473500c205d8..d7e8c4b4dac79 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -188,9 +188,9 @@ Performance improvements - Performance improvement in :class:`Timedelta` constructor (:issue:`30543`) - Performance improvement in :class:`Timestamp` constructor (:issue:`30543`) - Performance improvement in flex arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=0`` (:issue:`31296`) -- The internal :meth:`Index._shallow_copy` now copies cached attributes over to the new index, - avoiding creating these again on the new index. This can speed up many operations - that depend on creating copies of existing indexes (:issue:`28584`) +- The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index, + avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of + existing indexes (:issue:`28584`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 5997843f7ac6d..6388f2007cb12 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -233,6 +233,7 @@ def _simple_new(cls, values: Categorical, name: Label = None): result._data = values result.name = name + result._cache = {} result._reset_identity() result._no_setting_name = False @@ -242,14 +243,9 @@ def _simple_new(cls, values: Categorical, name: Label = None): @Appender(Index._shallow_copy.__doc__) def _shallow_copy(self, values=None, name: Label = no_default): - name = self.name if name is no_default else name - - if values is None: - values = self.values - - cat = Categorical(values, dtype=self.dtype) - - return type(self)._simple_new(cat, name=name) + if values is not None: + values = Categorical(values, dtype=self.dtype) + return super()._shallow_copy(values=values, name=name) def _is_dtype_compat(self, other) -> bool: """ diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 894e1d95a17bc..2355fd00e7d9b 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -618,6 +618,7 @@ def _set_freq(self, freq): def _shallow_copy(self, values=None, name: Label = lib.no_default): name = self.name if name is lib.no_default else name + cache = self._cache if values is None else {} if values is None: values = self._data @@ -636,7 +637,9 @@ def _shallow_copy(self, values=None, name: Label = lib.no_default): del attributes["freq"] attributes["name"] = name - return type(self)._simple_new(values, **attributes) + result = self._simple_new(values, **attributes) + result._cache = cache + return result # -------------------------------------------------------------------- # Set Operation Methods diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 185ad8e4c365a..c8035a9de432b 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -268,6 +268,7 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None): result = object.__new__(cls) result._data = dtarr result.name = name + result._cache = {} result._no_setting_name = False # For groupby perf. See note in indexes/base about _index_data result._index_data = dtarr._data diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index efdaf5a331f5f..d3b2b5f9d8ac0 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -243,6 +243,7 @@ def _simple_new(cls, array: IntervalArray, name: Label = None): result = IntervalMixin.__new__(cls) result._data = array result.name = name + result._cache = {} result._no_setting_name = False result._reset_identity() return result @@ -332,12 +333,15 @@ def from_tuples( # -------------------------------------------------------------------- @Appender(Index._shallow_copy.__doc__) - def _shallow_copy(self, values=None, **kwargs): + def _shallow_copy(self, values=None, name: Label = lib.no_default): + name = self.name if name is lib.no_default else name + cache = self._cache if values is None else {} if values is None: values = self._data - attributes = self._get_attributes_dict() - attributes.update(kwargs) - return self._simple_new(values, **attributes) + + result = self._simple_new(values, name=name) + result._cache = cache + return result @cache_readonly def _isnan(self): diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 9ab80c0b6145c..8ae792d3f63b5 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -233,6 +233,7 @@ def _simple_new(cls, values: PeriodArray, name: Label = None): # For groupby perf. See note in indexes/base about _index_data result._index_data = values._data result.name = name + result._cache = {} result._reset_identity() return result diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 5e4a8e83bd95b..a6d9d4dfc330b 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -180,6 +180,7 @@ def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): result = object.__new__(cls) result._data = values result._name = name + result._cache = {} # For groupby perf. See note in indexes/base about _index_data result._index_data = values._data From caccd834d6da60d1844d5396b2436a7b10a77b08 Mon Sep 17 00:00:00 2001 From: tp Date: Thu, 12 Mar 2020 09:52:06 +0000 Subject: [PATCH 2/2] Add issue number --- doc/source/whatsnew/v1.1.0.rst | 6 +++--- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/interval.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index d7e8c4b4dac79..52919f966f284 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -89,9 +89,9 @@ Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - :meth:`DataFrame.swaplevels` now raises a ``TypeError`` if the axis is not a :class:`MultiIndex`. Previously a ``AttributeError`` was raised (:issue:`31126`) -- :meth:`DataFrameGroupby.mean` and :meth:`SeriesGroupby.mean` (and similarly for :meth:`~DataFrameGroupby.median`, :meth:`~DataFrameGroupby.std`` and :meth:`~DataFrameGroupby.var``) +- :meth:`DataFrameGroupby.mean` and :meth:`SeriesGroupby.mean` (and similarly for :meth:`~DataFrameGroupby.median`, :meth:`~DataFrameGroupby.std` and :meth:`~DataFrameGroupby.var`) now raise a ``TypeError`` if a not-accepted keyword argument is passed into it. - Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median``) (:issue:`31485`) + Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`) - :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`) - Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`) - @@ -190,7 +190,7 @@ Performance improvements - Performance improvement in flex arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=0`` (:issue:`31296`) - The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index, avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of - existing indexes (:issue:`28584`) + existing indexes (:issue:`28584`, :issue:`32640`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 2355fd00e7d9b..a5902916991ca 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -618,7 +618,7 @@ def _set_freq(self, freq): def _shallow_copy(self, values=None, name: Label = lib.no_default): name = self.name if name is lib.no_default else name - cache = self._cache if values is None else {} + cache = self._cache.copy() if values is None else {} if values is None: values = self._data diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index d3b2b5f9d8ac0..80d58de6f1437 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -335,7 +335,7 @@ def from_tuples( @Appender(Index._shallow_copy.__doc__) def _shallow_copy(self, values=None, name: Label = lib.no_default): name = self.name if name is lib.no_default else name - cache = self._cache if values is None else {} + cache = self._cache.copy() if values is None else {} if values is None: values = self._data