Skip to content

Backport PR #51498: PERF: Implement round on the block level #51729

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,4 +739,22 @@ def time_memory_usage_object_dtype(self):
self.df2.memory_usage(deep=True)


class Round:
def setup(self):
self.df = DataFrame(np.random.randn(10000, 10))
self.df_t = self.df.transpose(copy=True)

def time_round(self):
self.df.round()

def time_round_transposed(self):
self.df_t.round()

def peakmem_round(self):
self.df.round()

def peakmem_round_transposed(self):
self.df_t.round()


from .pandas_vb_common import setup # noqa: F401 isort:skip
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1169,6 +1169,7 @@ Performance improvements
- Fixed a reference leak in :func:`read_hdf` (:issue:`37441`)
- Fixed a memory leak in :meth:`DataFrame.to_json` and :meth:`Series.to_json` when serializing datetimes and timedeltas (:issue:`40443`)
- Decreased memory usage in many :class:`DataFrameGroupBy` methods (:issue:`51090`)
- Performance improvement in :meth:`DataFrame.round` for an integer ``decimal`` parameter (:issue:`17254`)
- Performance improvement in :meth:`DataFrame.replace` and :meth:`Series.replace` when using a large dict for ``to_replace`` (:issue:`6697`)
- Memory improvement in :class:`StataReader` when reading seekable files (:issue:`48922`)

Expand Down
8 changes: 5 additions & 3 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9927,12 +9927,14 @@ def _series_round(ser: Series, decimals: int) -> Series:
raise TypeError("Values in decimals must be integers")
new_cols = list(_dict_round(self, decimals))
elif is_integer(decimals):
# Dispatch to Series.round
new_cols = [_series_round(v, decimals) for _, v in self.items()]
# Dispatch to Block.round
return self._constructor(
self._mgr.round(decimals=decimals, using_cow=using_copy_on_write()),
).__finalize__(self, method="round")
else:
raise TypeError("decimals must be an integer, a dict-like or a Series")

if len(new_cols) > 0:
if new_cols is not None and len(new_cols) > 0:
return self._constructor(
concat(new_cols, axis=1), index=self.index, columns=self.columns
).__finalize__(self, method="round")
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,9 @@ def where(self: T, other, cond, align: bool) -> T:
cond=cond,
)

def round(self: T, decimals: int, using_cow: bool = False) -> T:
return self.apply_with_block("round", decimals=decimals, using_cow=using_cow)

def setitem(self: T, indexer, value) -> T:
return self.apply_with_block("setitem", indexer=indexer, value=value)

Expand Down
33 changes: 33 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1462,6 +1462,39 @@ def quantile(
result = ensure_block_shape(result, ndim=2)
return new_block_2d(result, placement=self._mgr_locs)

def round(self, decimals: int, using_cow: bool = False) -> Block:
"""
Rounds the values.
If the block is not of an integer or float dtype, nothing happens.
This is consistent with DataFrame.round behavivor.
(Note: Series.round would raise)

Parameters
----------
decimals: int,
Number of decimal places to round to.
Caller is responsible for validating this
using_cow: bool,
Whether Copy on Write is enabled right now
"""
if not self.is_numeric or self.is_bool:
return self.copy(deep=not using_cow)
refs = None
# TODO: round only defined on BaseMaskedArray
# Series also does this, so would need to fix both places
# error: Item "ExtensionArray" of "Union[ndarray[Any, Any], ExtensionArray]"
# has no attribute "round"
values = self.values.round(decimals) # type: ignore[union-attr]
if values is self.values:
refs = self.refs
if not using_cow:
# Normally would need to do this before, but
# numpy only returns same array when round operation
# is no-op
# https://github.com/numpy/numpy/blob/486878b37fc7439a3b2b87747f50db9b62fea8eb/numpy/core/src/multiarray/calculation.c#L625-L636
values = values.copy()
return self.make_block_same_class(values, refs=refs)

# ---------------------------------------------------------------------
# Abstract Methods Overridden By EABackedBlock and NumpyBlock

Expand Down
7 changes: 7 additions & 0 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,13 @@ def where(self: T, other, cond, align: bool) -> T:
using_cow=using_copy_on_write(),
)

def round(self: T, decimals: int, using_cow: bool = False) -> T:
return self.apply(
"round",
decimals=decimals,
using_cow=using_cow,
)

def setitem(self: T, indexer, value) -> T:
"""
Set values with indexer.
Expand Down
14 changes: 11 additions & 3 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -986,20 +986,28 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, using_array_manag
assert np.shares_memory(get_array(obj, "a"), get_array(view, "a"))


def test_round(using_copy_on_write):
@pytest.mark.parametrize("decimals", [-1, 0, 1])
def test_round(using_copy_on_write, decimals):
df = DataFrame({"a": [1, 2], "b": "c"})
df2 = df.round()
df_orig = df.copy()
df2 = df.round(decimals=decimals)

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
# TODO: Make inplace by using out parameter of ndarray.round?
if decimals >= 0:
# Ensure lazy copy if no-op
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))

df2.iloc[0, 1] = "d"
df2.iloc[0, 0] = 4
if using_copy_on_write:
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
tm.assert_frame_equal(df, df_orig)


Expand Down