From 866d94f610d5fa5b2576dbb7dd6a41f466d6d7e1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 28 Aug 2024 13:40:21 -0700 Subject: [PATCH 1/4] PERF: CategoricalDtype.update_dtype --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/dtypes/dtypes.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index da0d85b7bb529..7b4229b059d76 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -527,6 +527,7 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`) - Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`) - Performance improvement in :meth:`to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . (:issue:`58248`) +- Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`?`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) - Performance improvement in unary methods on a :class:`RangeIndex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. 
(:issue:`57825`) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index c0587d36bcb5a..536f98e14930c 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -611,6 +611,8 @@ def update_dtype(self, dtype: str_type | CategoricalDtype) -> CategoricalDtype: dtype = cast(CategoricalDtype, dtype) # update categories/ordered unless they've been explicitly passed as None + if dtype.categories is not None and dtype.ordered is not None: + return dtype new_categories = ( dtype.categories if dtype.categories is not None else self.categories ) From f98f17e6234c2424161a63e242695f8a589eddc0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 28 Aug 2024 13:48:44 -0700 Subject: [PATCH 2/4] Add whatsnew number add comment --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/dtypes/dtypes.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7b4229b059d76..865abaeb9d130 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -526,8 +526,8 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`) - Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`) +- Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`) - Performance improvement in :meth:`to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . 
(:issue:`58248`) -- Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`?`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) - Performance improvement in unary methods on a :class:`RangeIndex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57825`) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 536f98e14930c..339791dde05f4 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -612,6 +612,7 @@ def update_dtype(self, dtype: str_type | CategoricalDtype) -> CategoricalDtype: # update categories/ordered unless they've been explicitly passed as None if dtype.categories is not None and dtype.ordered is not None: + # Avoid re-validation in CategoricalDtype constructor return dtype new_categories = ( dtype.categories if dtype.categories is not None else self.categories From 4302ea015ecd6fcb67a188dfd3ab0a4a232024f2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 28 Aug 2024 14:21:32 -0700 Subject: [PATCH 3/4] Fix unit test --- pandas/tests/dtypes/test_dtypes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 7c7da41124b83..4c65e1de76a7f 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -222,7 +222,9 @@ def test_repr_range_categories(self): def test_update_dtype(self): # GH 27338 - result = CategoricalDtype(["a"]).update_dtype(Categorical(["b"], ordered=True)) + result = CategoricalDtype(["a"]).update_dtype( + CategoricalDtype(["b"], ordered=True) + ) expected = CategoricalDtype(["b"], ordered=True) assert result == expected From 
a119f6ede4dcdca8c2c6b512eda00dbf6ed91476 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 30 Aug 2024 11:16:13 -0700 Subject: [PATCH 4/4] short circuit only for the dtype --- pandas/core/dtypes/dtypes.py | 6 +++++- pandas/tests/dtypes/test_dtypes.py | 4 +--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 339791dde05f4..6e1e3dcb9cb3f 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -611,7 +611,11 @@ def update_dtype(self, dtype: str_type | CategoricalDtype) -> CategoricalDtype: dtype = cast(CategoricalDtype, dtype) # update categories/ordered unless they've been explicitly passed as None - if dtype.categories is not None and dtype.ordered is not None: + if ( + isinstance(dtype, CategoricalDtype) + and dtype.categories is not None + and dtype.ordered is not None + ): # Avoid re-validation in CategoricalDtype constructor return dtype new_categories = ( diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 4c65e1de76a7f..7c7da41124b83 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -222,9 +222,7 @@ def test_repr_range_categories(self): def test_update_dtype(self): # GH 27338 - result = CategoricalDtype(["a"]).update_dtype( - CategoricalDtype(["b"], ordered=True) - ) + result = CategoricalDtype(["a"]).update_dtype(Categorical(["b"], ordered=True)) expected = CategoricalDtype(["b"], ordered=True) assert result == expected