From 497c5619d2046e0b94b6a67cfbbf7526b1190dcd Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 18 Feb 2023 17:15:33 -0500 Subject: [PATCH 1/9] PERF: Implement round on the block level --- asv_bench/benchmarks/frame_methods.py | 18 ++++++++++++++++++ doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/core/frame.py | 14 +++++++++++--- pandas/core/internals/blocks.py | 16 ++++++++++++++++ 4 files changed, 46 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 11b30ce601be6..690f446b8d9d0 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -739,4 +739,22 @@ def time_memory_usage_object_dtype(self): self.df2.memory_usage(deep=True) +class Round: + def setup(self): + self.df = DataFrame(np.random.randn(1000, 100)) + self.df_t = self.df.transpose(copy=True) + + def time_round(self): + self.df.round() + + def time_round_transposed(self): + self.df_t.round() + + def peakmem_round(self): + self.df.round() + + def peakmem_round_transposed(self): + self.df_t.round() + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 78422ec686da8..482ddbdce6b3a 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1153,7 +1153,7 @@ Performance improvements - Fixed a reference leak in :func:`read_hdf` (:issue:`37441`) - Fixed a memory leak in :meth:`DataFrame.to_json` and :meth:`Series.to_json` when serializing datetimes and timedeltas (:issue:`40443`) - Decreased memory usage in many :class:`DataFrameGroupBy` methods (:issue:`51090`) -- +- Performance improvement in :meth:`DataFrame.round` for an integer ``decimals`` parameter (:issue:`17254`) .. --------------------------------------------------------------------------- ..
_whatsnew_200.bug_fixes: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fc0a2463e7702..7921c6efa2487 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9915,6 +9915,7 @@ def _dict_round(df: DataFrame, decimals): yield _series_round(vals, decimals[col]) except KeyError: yield vals + df._clear_item_cache() def _series_round(ser: Series, decimals: int) -> Series: if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype): @@ -9923,6 +9924,9 @@ def _series_round(ser: Series, decimals: int) -> Series: nv.validate_round(args, kwargs) + new_cols = None + new_mgr = None + if isinstance(decimals, (dict, Series)): if isinstance(decimals, Series) and not decimals.index.is_unique: raise ValueError("Index of decimals must be unique") @@ -9932,15 +9936,19 @@ def _series_round(ser: Series, decimals: int) -> Series: raise TypeError("Values in decimals must be integers") new_cols = list(_dict_round(self, decimals)) elif is_integer(decimals): - # Dispatch to Series.round - new_cols = [_series_round(v, decimals) for _, v in self.items()] + # Dispatch to Block.round + new_mgr = self._mgr.apply("round", decimals=decimals) else: raise TypeError("decimals must be an integer, a dict-like or a Series") - if len(new_cols) > 0: + if new_cols is not None and len(new_cols) > 0: return self._constructor( concat(new_cols, axis=1), index=self.index, columns=self.columns ).__finalize__(self, method="round") + elif new_mgr is not None: + return self._constructor( + new_mgr, index=self.index, columns=self.columns + ).__finalize__(self, method="round") else: return self.copy(deep=False) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1aba48371b430..4427fd637116a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1397,6 +1397,22 @@ def quantile( result = ensure_block_shape(result, ndim=2) return new_block_2d(result, placement=self._mgr_locs) + def round(self, decimals) -> Block: + """ + Rounds the 
values. + If the block is not of an integer or float dtype, nothing happens + + Parameters + ---------- + decimals: Number of decimal places to round to. + Caller is responsible for validating this + """ + # TODO: EAs? + # Maybe fallback to numpy + if not self.is_numeric or self.is_bool: + return self.copy(deep=None) + return new_block_2d(self.values.round(decimals), placement=self._mgr_locs) + # --------------------------------------------------------------------- # Abstract Methods Overridden By EABackedBlock and NumpyBlock From 1a2c35f5fd46f05d4e4b021e70410078e2c3980a Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 20 Feb 2023 11:57:26 -0500 Subject: [PATCH 2/9] update --- pandas/core/frame.py | 14 +++++--------- pandas/core/internals/array_manager.py | 3 +++ pandas/core/internals/blocks.py | 11 ++++++----- pandas/core/internals/managers.py | 7 +++++++ 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7921c6efa2487..f7192105a55e4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9915,7 +9915,6 @@ def _dict_round(df: DataFrame, decimals): yield _series_round(vals, decimals[col]) except KeyError: yield vals - df._clear_item_cache() def _series_round(ser: Series, decimals: int) -> Series: if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype): @@ -9924,9 +9923,6 @@ def _series_round(ser: Series, decimals: int) -> Series: nv.validate_round(args, kwargs) - new_cols = None - new_mgr = None - if isinstance(decimals, (dict, Series)): if isinstance(decimals, Series) and not decimals.index.is_unique: raise ValueError("Index of decimals must be unique") @@ -9937,7 +9933,11 @@ def _series_round(ser: Series, decimals: int) -> Series: new_cols = list(_dict_round(self, decimals)) elif is_integer(decimals): # Dispatch to Block.round - new_mgr = self._mgr.apply("round", decimals=decimals) + return self._constructor( + 
self._mgr.round(decimals=decimals, using_cow=using_copy_on_write()), + index=self.index, + columns=self.columns, + ).__finalize__(self, method="round") else: raise TypeError("decimals must be an integer, a dict-like or a Series") @@ -9945,10 +9945,6 @@ def _series_round(ser: Series, decimals: int) -> Series: return self._constructor( concat(new_cols, axis=1), index=self.index, columns=self.columns ).__finalize__(self, method="round") - elif new_mgr is not None: - return self._constructor( - new_mgr, index=self.index, columns=self.columns - ).__finalize__(self, method="round") else: return self.copy(deep=False) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 3c3ff2f313186..0f4ec97b8475e 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -323,6 +323,9 @@ def where(self: T, other, cond, align: bool) -> T: cond=cond, ) + def round(self: T, decimals: int, using_cow: bool = False) -> T: + return self.apply_with_block("round", decimals=decimals, using_cow=using_cow) + def setitem(self: T, indexer, value) -> T: return self.apply_with_block("setitem", indexer=indexer, value=value) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4427fd637116a..b34ecb2b02433 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1397,20 +1397,21 @@ def quantile( result = ensure_block_shape(result, ndim=2) return new_block_2d(result, placement=self._mgr_locs) - def round(self, decimals) -> Block: + def round(self, decimals, using_cow=False) -> Block: """ Rounds the values. If the block is not of an integer or float dtype, nothing happens Parameters ---------- - decimals: Number of decimal places to round to. + decimals: int, + Number of decimal places to round to. Caller is responsible for validating this + using_cow: bool, + Whether Copy on Write is enabled right now """ - # TODO: EAs? 
- # Maybe fallback to numpy if not self.is_numeric or self.is_bool: - return self.copy(deep=None) + return self.copy(deep=None if using_cow else True) return new_block_2d(self.values.round(decimals), placement=self._mgr_locs) # --------------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 664a122015ba5..74ccd1251ab16 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -355,6 +355,13 @@ def where(self: T, other, cond, align: bool) -> T: using_cow=using_copy_on_write(), ) + def round(self: T, decimals: int, using_cow: bool = False) -> T: + return self.apply( + "round", + decimals=decimals, + using_cow=using_cow, + ) + def setitem(self: T, indexer, value) -> T: """ Set values with indexer. From 81d9ac646ca0f598e479b21d1faa748010179608 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 20 Feb 2023 12:12:56 -0500 Subject: [PATCH 3/9] update benchmarks --- asv_bench/benchmarks/frame_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 690f446b8d9d0..761a6f7e9c9c1 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -741,7 +741,7 @@ def time_memory_usage_object_dtype(self): class Round: def setup(self): - self.df = DataFrame(np.random.randn(1000, 100)) + self.df = DataFrame(np.random.randn(10000, 10)) self.df_t = self.df.transpose(copy=True) def time_round(self): From 1e6485cbf92e708cf15399c1740a5c57c7ea70d1 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 22 Feb 2023 11:54:58 -0500 Subject: [PATCH 4/9] fix typing --- pandas/core/internals/blocks.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 
b34ecb2b02433..98c7e81c6bcfd 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1397,10 +1397,12 @@ def quantile( result = ensure_block_shape(result, ndim=2) return new_block_2d(result, placement=self._mgr_locs) - def round(self, decimals, using_cow=False) -> Block: + def round(self, decimals: int, using_cow: bool = False) -> Block: """ Rounds the values. - If the block is not of an integer or float dtype, nothing happens + If the block is not of an integer or float dtype, nothing happens. + This is consistent with DataFrame.round behavior. + (Note: Series.round would raise) Parameters ---------- From f1584387fa3b7559222cd939c5f2f51d4227c848 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 22 Feb 2023 18:25:14 -0500 Subject: [PATCH 5/9] fix typing --- pandas/core/internals/blocks.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 98c7e81c6bcfd..db24d4aa94b75 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1413,8 +1413,15 @@ def round(self, decimals: int, using_cow: bool = False) -> Block: Whether Copy on Write is enabled right now """ if not self.is_numeric or self.is_bool: - return self.copy(deep=None if using_cow else True) - return new_block_2d(self.values.round(decimals), placement=self._mgr_locs) + return self.copy(deep=using_cow) + # TODO: round only defined on BaseMaskedArray + # Series also does this, so would need to fix both places + # error: Item "ExtensionArray" of "Union[ndarray[Any, Any], ExtensionArray]" + # has no attribute "round" + return new_block_2d( + self.values.round(decimals), # type: ignore[union-attr] + placement=self._mgr_locs, + ) # --------------------------------------------------------------------- # Abstract Methods Overridden By EABackedBlock and NumpyBlock From a472d796248bc71aab7419d7e4fb91fa7efd3ed6 Mon Sep
17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 27 Feb 2023 07:00:50 -0500 Subject: [PATCH 6/9] Update blocks.py --- pandas/core/internals/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index db24d4aa94b75..4c120c609b6bc 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1413,7 +1413,7 @@ def round(self, decimals: int, using_cow: bool = False) -> Block: Whether Copy on Write is enabled right now """ if not self.is_numeric or self.is_bool: - return self.copy(deep=using_cow) + return self.copy(deep=not using_cow) # TODO: round only defined on BaseMaskedArray # Series also does this, so would need to fix both places # error: Item "ExtensionArray" of "Union[ndarray[Any, Any], ExtensionArray]" From f628b06019bf62d9033836dff93fbf41e2860a93 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 28 Feb 2023 17:46:27 -0500 Subject: [PATCH 7/9] address code review --- pandas/core/frame.py | 2 -- pandas/core/internals/blocks.py | 8 ++++++++ pandas/tests/copy_view/test_methods.py | 11 ++++++++--- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d6a447c5605de..21a6f312adf35 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9937,8 +9937,6 @@ def _series_round(ser: Series, decimals: int) -> Series: # Dispatch to Block.round return self._constructor( self._mgr.round(decimals=decimals, using_cow=using_copy_on_write()), - index=self.index, - columns=self.columns, ).__finalize__(self, method="round") else: raise TypeError("decimals must be an integer, a dict-like or a Series") diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index d82d14a6594b7..f347a20dfd7a7 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1438,6 +1438,13 @@ 
def round(self, decimals: int, using_cow: bool = False) -> Block: """ if not self.is_numeric or self.is_bool: return self.copy(deep=not using_cow) + # If self.values is a numpy array of type int + # and decimals >= 0, this is a no-op and numpy + # returns the same array, so need to set refs + # https://github.com/numpy/numpy/blob/486878b37fc7439a3b2b87747f50db9b62fea8eb/numpy/core/src/multiarray/calculation.c#L625-L636 + refs = None + if not self.is_extension and self.dtype.kind == "i" and decimals >= 0: + refs = self.refs # TODO: round only defined on BaseMaskedArray # Series also does this, so would need to fix both places # error: Item "ExtensionArray" of "Union[ndarray[Any, Any], ExtensionArray]" @@ -1445,6 +1452,7 @@ def round(self, decimals: int, using_cow: bool = False) -> Block: return new_block_2d( self.values.round(decimals), # type: ignore[union-attr] placement=self._mgr_locs, + refs=refs, ) # --------------------------------------------------------------------- diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 7042d6e4f9478..a24c2cb9e3a6c 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -986,20 +986,25 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, using_array_manag assert np.shares_memory(get_array(obj, "a"), get_array(view, "a")) -def test_round(using_copy_on_write): +@pytest.mark.parametrize("decimals", [-1, 0, 1]) +def test_round(using_copy_on_write, decimals): df = DataFrame({"a": [1, 2], "b": "c"}) - df2 = df.round() df_orig = df.copy() + df2 = df.round(decimals=decimals) if using_copy_on_write: assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + if decimals >= 0: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) else: assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) df2.iloc[0, 1] = "d" + df2.iloc[0, 0] = 4 if 
using_copy_on_write: assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + if decimals >= 0: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) tm.assert_frame_equal(df, df_orig) From d0fabca71c3b8c8ac092cd97d05659fe67102356 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 28 Feb 2023 19:34:28 -0500 Subject: [PATCH 8/9] address code review --- pandas/core/internals/blocks.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5ecfe6cccbe37..cba1a35210e0d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1464,20 +1464,22 @@ def round(self, decimals: int, using_cow: bool = False) -> Block: """ if not self.is_numeric or self.is_bool: return self.copy(deep=not using_cow) - # If self.values is a numpy array of type int - # and decimals >= 0, this is a no-op and numpy - # returns the same array, so need to set refs - # https://github.com/numpy/numpy/blob/486878b37fc7439a3b2b87747f50db9b62fea8eb/numpy/core/src/multiarray/calculation.c#L625-L636 refs = None - if not self.is_extension and self.dtype.kind == "i" and decimals >= 0: - refs = self.refs # TODO: round only defined on BaseMaskedArray # Series also does this, so would need to fix both places # error: Item "ExtensionArray" of "Union[ndarray[Any, Any], ExtensionArray]" # has no attribute "round" - return new_block_2d( - self.values.round(decimals), # type: ignore[union-attr] - placement=self._mgr_locs, + values = self.values.round(decimals) # type: ignore[union-attr] + if values is self.values: + refs = self.refs + if not using_cow: + # Normally would need to do this before, but + # numpy only returns same array when round operation + # is no-op + # https://github.com/numpy/numpy/blob/486878b37fc7439a3b2b87747f50db9b62fea8eb/numpy/core/src/multiarray/calculation.c#L625-L636 + values = 
values.copy() + return self.make_block_same_class( + values, refs=refs, ) From 15db50063ad5328378fe3360f49be6e2adffb46d Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 1 Mar 2023 10:24:17 -0500 Subject: [PATCH 9/9] updates --- pandas/core/internals/blocks.py | 5 +---- pandas/tests/copy_view/test_methods.py | 7 +++++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index cba1a35210e0d..6ed8f8632fb68 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1478,10 +1478,7 @@ def round(self, decimals: int, using_cow: bool = False) -> Block: # is no-op # https://github.com/numpy/numpy/blob/486878b37fc7439a3b2b87747f50db9b62fea8eb/numpy/core/src/multiarray/calculation.c#L625-L636 values = values.copy() - return self.make_block_same_class( - values, - refs=refs, - ) + return self.make_block_same_class(values, refs=refs) # --------------------------------------------------------------------- # Abstract Methods Overridden By EABackedBlock and NumpyBlock diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index a24c2cb9e3a6c..b30f8ab4c7b9c 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -994,8 +994,12 @@ def test_round(using_copy_on_write, decimals): if using_copy_on_write: assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + # TODO: Make inplace by using out parameter of ndarray.round? 
if decimals >= 0: + # Ensure lazy copy if no-op assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) else: assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) @@ -1003,8 +1007,7 @@ def test_round(using_copy_on_write, decimals): df2.iloc[0, 0] = 4 if using_copy_on_write: assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - if decimals >= 0: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) tm.assert_frame_equal(df, df_orig)