# Patch summary (free-form preamble before the first "diff --git" header).
#
# Files touched:
#   * asv_bench/benchmarks/frame_methods.py
#       Adds a `Clip` benchmark class, parametrized over the dtypes
#       "float64", "Float64" and "float64[pyarrow]", that times
#       `df.clip(-1.0, 1.0)` on a 100_000 x 10 frame of random normal data.
#   * doc/source/whatsnew/v2.1.0.rst
#       Adds a "Performance improvements" entry for DataFrame.clip /
#       Series.clip referencing issue 51472.
#   * pandas/core/generic.py  (`_clip_with_scalar`)
#       Replaces the `self.where(...)` + "re-insert NaN through a mask"
#       implementation with calls on `self._mgr`:
#         - inplace=True : `mgr.putmask` with strict comparisons
#           (`self > upper`, `self < lower`), so values equal to a bound
#           are not rewritten.
#         - inplace=False: `mgr.where` with `isna(self) | (self <= upper)`
#           and `isna(self) | (self >= lower)` as the keep-condition, so NA
#           positions and boundary values are left as-is.
#       The result is rebuilt with `self._constructor(mgr)`; the
#       non-inplace return value is now passed through `__finalize__(self)`.
#       NOTE(review): the inplace branch does not build an NA mask;
#       presumably NA entries compare False against the bounds and are
#       therefore never selected by putmask -- confirm for all dtypes.
#       NOTE(review): the removed code ran under `np.errstate(all="ignore")`;
#       the new code does not -- confirm no warnings surface.
#   * pandas/tests/copy_view/test_clip.py
#       Updates expectations: after `clip(..., inplace=True)` the column
#       array is now asserted to share memory with the original array,
#       except in `test_clip_inplace_reference` under copy-on-write, where
#       it is asserted not to.
#   * pandas/tests/frame/methods/test_clip.py
#       Adds `test_clip_int_data_with_float_bound`: clipping integer data
#       [1, 2, 3] with lower=1.5 is expected to give [1.5, 2.0, 3.0].
#
# NOTE(review): line breaks and indentation below were reconstructed from a
# whitespace-collapsed copy of this patch. Blank lines were placed so that
# every hunk matches its "@@ -a,b +c,d @@" line counts; verify context
# whitespace against the target tree before applying.
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index 11b30ce601be6..33fbc23085e23 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -17,6 +17,21 @@
 from .pandas_vb_common import tm
 
 
+class Clip:
+    params = [
+        ["float64", "Float64", "float64[pyarrow]"],
+    ]
+    param_names = ["dtype"]
+
+    def setup(self, dtype):
+        data = np.random.randn(100_000, 10)
+        df = DataFrame(data, dtype=dtype)
+        self.df = df
+
+    def time_clip(self, dtype):
+        self.df.clip(-1.0, 1.0)
+
+
 class GetNumericData:
     def setup(self):
         self.df = DataFrame(np.random.randn(10000, 25))
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index b83f317814ad9..aeaafbc4c125d 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -100,7 +100,7 @@ Deprecations
 
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
--
+- Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 33dc62564d34e..f25ff81e375fc 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -7998,24 +7998,33 @@ def _clip_with_scalar(self, lower, upper, inplace: bool_t = False):
         ):
             raise ValueError("Cannot use an NA value as a clip threshold")
 
-        result = self
-        mask = isna(self._values)
+        mgr = self._mgr
 
-        with np.errstate(all="ignore"):
+        if inplace:
+            # cond (for putmask) identifies values to be updated.
+            # exclude boundary as values at the boundary should be no-ops.
             if upper is not None:
-                subset = self <= upper
-                result = result.where(subset, upper, axis=None, inplace=False)
+                cond = self > upper
+                mgr = mgr.putmask(mask=cond, new=upper, align=False)
             if lower is not None:
-                subset = self >= lower
-                result = result.where(subset, lower, axis=None, inplace=False)
-
-        if np.any(mask):
-            result[mask] = np.nan
+                cond = self < lower
+                mgr = mgr.putmask(mask=cond, new=lower, align=False)
+        else:
+            # cond (for where) identifies values to be left as-is.
+            # include boundary as values at the boundary should be no-ops.
+            mask = isna(self)
+            if upper is not None:
+                cond = mask | (self <= upper)
+                mgr = mgr.where(other=upper, cond=cond, align=False)
+            if lower is not None:
+                cond = mask | (self >= lower)
+                mgr = mgr.where(other=lower, cond=cond, align=False)
 
+        result = self._constructor(mgr)
         if inplace:
             return self._update_inplace(result)
         else:
-            return result
+            return result.__finalize__(self)
 
     @final
     def _clip_with_one_bound(self, threshold, method, axis, inplace):
diff --git a/pandas/tests/copy_view/test_clip.py b/pandas/tests/copy_view/test_clip.py
index 3eacd62714ab6..30140ed4ddb6d 100644
--- a/pandas/tests/copy_view/test_clip.py
+++ b/pandas/tests/copy_view/test_clip.py
@@ -12,13 +12,13 @@ def test_clip_inplace_reference(using_copy_on_write):
     view = df[:]
     df.clip(lower=2, inplace=True)
 
-    # Clip not actually inplace right now but could be
-    assert not np.shares_memory(get_array(df, "a"), arr_a)
-
     if using_copy_on_write:
+        assert not np.shares_memory(get_array(df, "a"), arr_a)
         assert df._mgr._has_no_reference(0)
         assert view._mgr._has_no_reference(0)
         tm.assert_frame_equal(df_copy, view)
+    else:
+        assert np.shares_memory(get_array(df, "a"), arr_a)
 
 
 def test_clip_inplace_reference_no_op(using_copy_on_write):
@@ -28,13 +28,12 @@ def test_clip_inplace_reference_no_op(using_copy_on_write):
     view = df[:]
     df.clip(lower=0, inplace=True)
 
+    assert np.shares_memory(get_array(df, "a"), arr_a)
+
     if using_copy_on_write:
-        assert np.shares_memory(get_array(df, "a"), arr_a)
         assert not df._mgr._has_no_reference(0)
         assert not view._mgr._has_no_reference(0)
         tm.assert_frame_equal(df_copy, view)
-    else:
-        assert not np.shares_memory(get_array(df, "a"), arr_a)
 
 
 def test_clip_inplace(using_copy_on_write):
@@ -42,8 +41,7 @@ def test_clip_inplace(using_copy_on_write):
     arr_a = get_array(df, "a")
     df.clip(lower=2, inplace=True)
 
-    # Clip not actually inplace right now but could be
-    assert not np.shares_memory(get_array(df, "a"), arr_a)
+    assert np.shares_memory(get_array(df, "a"), arr_a)
 
     if using_copy_on_write:
         assert df._mgr._has_no_reference(0)
diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py
index f8d9adf44dbc2..da13711d607c5 100644
--- a/pandas/tests/frame/methods/test_clip.py
+++ b/pandas/tests/frame/methods/test_clip.py
@@ -164,3 +164,10 @@ def test_clip_with_na_args(self, float_frame):
         result = df.clip(lower=t, axis=0)
         expected = DataFrame({"col_0": [9, -3, 0, 6, 5], "col_1": [2, -4, 6, 8, 3]})
         tm.assert_frame_equal(result, expected)
+
+    def test_clip_int_data_with_float_bound(self):
+        # GH51472
+        df = DataFrame({"a": [1, 2, 3]})
+        result = df.clip(lower=1.5)
+        expected = DataFrame({"a": [1.5, 2.0, 3.0]})
+        tm.assert_frame_equal(result, expected)