From 47dcdf7ecdbd4451fcb568d4507d7e74b5b1a334 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 17 Feb 2023 19:13:27 -0500 Subject: [PATCH 01/11] PERF: DataFrame.clip --- asv_bench/benchmarks/frame_methods.py | 18 ++++++++++++++++++ pandas/core/generic.py | 18 +++++++----------- pandas/tests/copy_view/test_clip.py | 6 +++--- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 11b30ce601be6..07713c331f626 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -17,6 +17,24 @@ from .pandas_vb_common import tm +class Clip: + params = [ + ["float64", "Float64", "float64[pyarrow]"], + [True, False], + ] + param_names = ["dtype", "hasna"] + + def setup(self, dtype, hasna): + data = np.random.randn(100_000, 10) + df = DataFrame(data, dtype=dtype) + if hasna: + df.iloc[2, :] = None + self.df = df + + def time_clip(self, dtype, hasna): + self.df.clip(-1.0, 1.0) + + class GetNumericData: def setup(self): self.df = DataFrame(np.random.randn(10000, 25)) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0369de8db1339..d0d89ddd8a6d8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7989,20 +7989,16 @@ def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): ): raise ValueError("Cannot use an NA value as a clip threshold") - result = self - mask = isna(self._values) - - with np.errstate(all="ignore"): + def clip_values(arr, lower=lower, upper=upper): if upper is not None: - subset = self <= upper - result = result.where(subset, upper, axis=None, inplace=False) + arr[arr > upper] = upper if lower is not None: - subset = self >= lower - result = result.where(subset, lower, axis=None, inplace=False) - - if np.any(mask): - result[mask] = np.nan + arr[arr < lower] = lower + return arr + result = self if inplace else self.copy() + result_mgr = result._mgr.apply(clip_values) + result = self._constructor(result_mgr) if inplace: return self._update_inplace(result) else: diff --git a/pandas/tests/copy_view/test_clip.py b/pandas/tests/copy_view/test_clip.py index 6626f724ab7f3..f6487d631b0f1 100644 --- a/pandas/tests/copy_view/test_clip.py +++ b/pandas/tests/copy_view/test_clip.py @@ -12,10 +12,10 @@ def test_clip_inplace_reference(using_copy_on_write): view = df[:] df.clip(lower=2, inplace=True) - # Clip not actually inplace right now but could be - assert not np.shares_memory(get_array(df, "a"), arr_a) - if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a"), arr_a) assert df._mgr._has_no_reference(0) assert view._mgr._has_no_reference(0) tm.assert_frame_equal(df_copy, view) + else: + assert np.shares_memory(get_array(df, "a"), arr_a) From 74d94dc86a33192676e4fe57b2f67e4778c6f9b0 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 17 Feb 2023 19:36:06 -0500 Subject: [PATCH 02/11] whatsnew --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 78422ec686da8..96112d78d4c6a 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1115,6 +1115,7 @@ Performance improvements - Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`) - Performance improvement in :func:`to_datetime` when parsing strings with timezone offsets (:issue:`50107`) - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`) +- Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`) - Performance improvement for :meth:`Series.replace` with categorical dtype (:issue:`49404`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) - Performance improvement for indexing operations with nullable and arrow dtypes (:issue:`49420`, :issue:`51316`) From cde3a9502f109ed4d2b6dbd2cace270fb483b26f Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 17 Feb 2023 20:06:54 -0500 Subject: [PATCH 03/11] finalize --- pandas/core/generic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d0d89ddd8a6d8..508abb2a782a1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7989,7 +7989,7 @@ def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): ): raise ValueError("Cannot use an NA value as a clip threshold") - def clip_values(arr, lower=lower, upper=upper): + def blk_func(arr, lower=lower, upper=upper): if upper is not None: arr[arr > upper] = upper if lower is not None: @@ -7997,8 +7997,8 @@ def clip_values(arr, lower=lower, upper=upper): return arr result = self if inplace else self.copy() - result_mgr = result._mgr.apply(clip_values) - result = self._constructor(result_mgr) + result_mgr = result._mgr.apply(blk_func) + result = self._constructor(result_mgr).__finalize__(self) if inplace: return self._update_inplace(result) else: From c13fd1a7a8ff347ffedbdfadcb789d6a55a42af9 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 18 Feb 2023 07:38:36 -0500 Subject: [PATCH 04/11] fix COW test failure --- pandas/core/generic.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 508abb2a782a1..59d8492512d49 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7996,7 +7996,10 @@ def blk_func(arr, lower=lower, upper=upper): arr[arr < lower] = lower return arr - result = self if inplace else self.copy() + if inplace and not using_copy_on_write(): + result = self + else: + result = self.copy() result_mgr = result._mgr.apply(blk_func) result = self._constructor(result_mgr).__finalize__(self) if inplace: From ff9467ef9182c13d2043476d317d038e5e2a403d Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 18 Feb 2023 21:28:44 -0500 Subject: [PATCH 05/11] use BlockManager.where --- pandas/core/generic.py | 24 +++++++++++------------- pandas/tests/copy_view/test_clip.py | 6 +++--- pandas/tests/frame/methods/test_clip.py | 7 +++++++ 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 59d8492512d49..7786578da7c67 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7989,19 +7989,17 @@ def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): ): raise ValueError("Cannot use an NA value as a clip threshold") - def blk_func(arr, lower=lower, upper=upper): - if upper is not None: - arr[arr > upper] = upper - if lower is not None: - arr[arr < lower] = lower - return arr - - if inplace and not using_copy_on_write(): - result = self - else: - result = self.copy() - result_mgr = result._mgr.apply(blk_func) - result = self._constructor(result_mgr).__finalize__(self) + mgr = self._mgr + mask = isna(self) + + if upper is not None: + cond = mask | (self <= upper) + mgr = mgr.where(other=upper, cond=cond, align=False) + if lower is not None: + cond = mask | (self >= lower) + mgr = mgr.where(other=lower, cond=cond, align=False) + + result = self._constructor(mgr).__finalize__(self) if inplace: return self._update_inplace(result) else: diff --git a/pandas/tests/copy_view/test_clip.py b/pandas/tests/copy_view/test_clip.py index f6487d631b0f1..6626f724ab7f3 100644 --- a/pandas/tests/copy_view/test_clip.py +++ b/pandas/tests/copy_view/test_clip.py @@ -12,10 +12,10 @@ def test_clip_inplace_reference(using_copy_on_write): view = df[:] df.clip(lower=2, inplace=True) + # Clip not actually inplace right now but could be + assert not np.shares_memory(get_array(df, "a"), arr_a) + if using_copy_on_write: - assert not np.shares_memory(get_array(df, "a"), arr_a) assert df._mgr._has_no_reference(0) assert view._mgr._has_no_reference(0) tm.assert_frame_equal(df_copy, view) - else: - assert np.shares_memory(get_array(df, "a"), arr_a) diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py index f8d9adf44dbc2..da13711d607c5 100644 --- a/pandas/tests/frame/methods/test_clip.py +++ b/pandas/tests/frame/methods/test_clip.py @@ -164,3 +164,10 @@ def test_clip_with_na_args(self, float_frame): result = df.clip(lower=t, axis=0) expected = DataFrame({"col_0": [9, -3, 0, 6, 5], "col_1": [2, -4, 6, 8, 3]}) tm.assert_frame_equal(result, expected) + + def test_clip_int_data_with_float_bound(self): + # GH51472 + df = DataFrame({"a": [1, 2, 3]}) + result = df.clip(lower=1.5) + expected = DataFrame({"a": [1.5, 2.0, 3.0]}) + tm.assert_frame_equal(result, expected) From 39c447710cbc9d6c0e385fb282abca148df67339 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 18 Feb 2023 22:02:36 -0500 Subject: [PATCH 06/11] update asv --- asv_bench/benchmarks/frame_methods.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 07713c331f626..33fbc23085e23 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -20,18 +20,15 @@ class Clip: params = [ ["float64", "Float64", "float64[pyarrow]"], - [True, False], ] - param_names = ["dtype", "hasna"] + param_names = ["dtype"] - def setup(self, dtype, hasna): + def setup(self, dtype): data = np.random.randn(100_000, 10) df = DataFrame(data, dtype=dtype) - if hasna: - df.iloc[2, :] = None self.df = df - def time_clip(self, dtype, hasna): + def time_clip(self, dtype): self.df.clip(-1.0, 1.0) From 6eb43d0da496901a7ef3f2c607e22d027d6d7e1e Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 19 Feb 2023 19:33:15 -0500 Subject: [PATCH 07/11] add inplace --- pandas/core/generic.py | 22 +++++++++++++++------- pandas/tests/copy_view/test_clip.py | 6 +++--- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7786578da7c67..837d35b6a3d26 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7990,14 +7990,22 @@ def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): raise ValueError("Cannot use an NA value as a clip threshold") mgr = self._mgr - mask = isna(self) - if upper is not None: - cond = mask | (self <= upper) - mgr = mgr.where(other=upper, cond=cond, align=False) - if lower is not None: - cond = mask | (self >= lower) - mgr = mgr.where(other=lower, cond=cond, align=False) + if inplace: + if upper is not None: + cond = self <= upper + mgr = mgr.putmask(mask=cond, new=upper, align=False) + if lower is not None: + cond = self >= lower + mgr = mgr.putmask(mask=cond, new=lower, align=False) + else: + mask = isna(self) + if upper is not None: + cond = mask | (self <= upper) + mgr = mgr.where(other=upper, cond=cond, align=False) + if lower is not None: + cond = mask | (self >= lower) + mgr = mgr.where(other=lower, cond=cond, align=False) result = self._constructor(mgr).__finalize__(self) if inplace: diff --git a/pandas/tests/copy_view/test_clip.py b/pandas/tests/copy_view/test_clip.py index 6626f724ab7f3..f6487d631b0f1 100644 --- a/pandas/tests/copy_view/test_clip.py +++ b/pandas/tests/copy_view/test_clip.py @@ -12,10 +12,10 @@ def test_clip_inplace_reference(using_copy_on_write): view = df[:] df.clip(lower=2, inplace=True) - # Clip not actually inplace right now but could be - assert not np.shares_memory(get_array(df, "a"), arr_a) - if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a"), arr_a) assert df._mgr._has_no_reference(0) assert view._mgr._has_no_reference(0) tm.assert_frame_equal(df_copy, view) + else: + assert np.shares_memory(get_array(df, "a"), arr_a) From 70fbf81be8c13439ad8dc7ea3d5778e43eeea7db Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 20 Feb 2023 19:46:59 -0500 Subject: [PATCH 08/11] update inplace tests --- pandas/core/generic.py | 4 ++-- pandas/tests/copy_view/test_clip.py | 8 +++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ade4542618de8..1759dafaace2d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8007,11 +8007,11 @@ def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): cond = mask | (self >= lower) mgr = mgr.where(other=lower, cond=cond, align=False) - result = self._constructor(mgr).__finalize__(self) + result = self._constructor(mgr) if inplace: return self._update_inplace(result) else: - return result + return result.__finalize__(self) @final def _clip_with_one_bound(self, threshold, method, axis, inplace): diff --git a/pandas/tests/copy_view/test_clip.py b/pandas/tests/copy_view/test_clip.py index 50e8c36012c07..30140ed4ddb6d 100644 --- a/pandas/tests/copy_view/test_clip.py +++ b/pandas/tests/copy_view/test_clip.py @@ -28,13 +28,12 @@ def test_clip_inplace_reference_no_op(using_copy_on_write): view = df[:] df.clip(lower=0, inplace=True) + assert np.shares_memory(get_array(df, "a"), arr_a) + if using_copy_on_write: - assert np.shares_memory(get_array(df, "a"), arr_a) assert not df._mgr._has_no_reference(0) assert not view._mgr._has_no_reference(0) tm.assert_frame_equal(df_copy, view) - else: - assert not np.shares_memory(get_array(df, "a"), arr_a) def test_clip_inplace(using_copy_on_write): @@ -42,8 +41,7 @@ def test_clip_inplace(using_copy_on_write): arr_a = get_array(df, "a") df.clip(lower=2, inplace=True) - # Clip not actually inplace right now but could be - assert not np.shares_memory(get_array(df, "a"), arr_a) + assert np.shares_memory(get_array(df, "a"), arr_a) if using_copy_on_write: assert df._mgr._has_no_reference(0) From 1026357f7a6b59c66da69054be55d426ece05679 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 20 Feb 2023 20:07:52 -0500 Subject: [PATCH 09/11] fix --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fa9da2f61d2fd..ec02606e15d9c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7999,10 +7999,10 @@ def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): if inplace: if upper is not None: - cond = self <= upper + cond = self > upper mgr = mgr.putmask(mask=cond, new=upper, align=False) if lower is not None: - cond = self >= lower + cond = self < lower mgr = mgr.putmask(mask=cond, new=lower, align=False) else: mask = isna(self) From 3dfa04d4ac97bde1dfc3bfdf17f31283a1adf124 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 21 Feb 2023 20:04:41 -0500 Subject: [PATCH 10/11] add comments --- pandas/core/generic.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ec02606e15d9c..6dcd977465f7c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7998,6 +7998,8 @@ def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): mgr = self._mgr if inplace: + # cond (for putmask) identifies values to be updated. + # exclude boundary as values at the boundary should be no-ops. if upper is not None: cond = self > upper mgr = mgr.putmask(mask=cond, new=upper, align=False) @@ -8005,6 +8007,8 @@ def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): cond = self < lower mgr = mgr.putmask(mask=cond, new=lower, align=False) else: + # cond (for where) identifies values to be left as-is. + # include boundary as values at the boundary should be no-ops. mask = isna(self) if upper is not None: cond = mask | (self <= upper) From 7dc0df4a9357de02fc178a5b61d5d397d13be401 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 22 Feb 2023 05:38:24 -0500 Subject: [PATCH 11/11] move whatsnew to 2.1.0 --- doc/source/whatsnew/v2.0.0.rst | 1 - doc/source/whatsnew/v2.1.0.rst | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index d0986be66ff7f..c0082b451c95d 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1122,7 +1122,6 @@ Performance improvements - Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`) - Performance improvement in :func:`to_datetime` when parsing strings with timezone offsets (:issue:`50107`) - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`) -- Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`) - Performance improvement for :meth:`Series.replace` with categorical dtype (:issue:`49404`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) - Performance improvement for indexing operations with nullable and arrow dtypes (:issue:`49420`, :issue:`51316`) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index b83f317814ad9..aeaafbc4c125d 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -100,7 +100,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- +- Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`) - .. ---------------------------------------------------------------------------