Skip to content

Commit 067f86f

Browse files
authored
PERF: BlockManager.equals blockwise (#35357)
1 parent 319a6d3 commit 067f86f

File tree

3 files changed

+61
-41
lines changed

3 files changed

+61
-41
lines changed

pandas/core/dtypes/missing.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from pandas._libs import lib
1111
import pandas._libs.missing as libmissing
1212
from pandas._libs.tslibs import NaT, iNaT
13-
from pandas._typing import DtypeObj
13+
from pandas._typing import ArrayLike, DtypeObj
1414

1515
from pandas.core.dtypes.common import (
1616
DT64NS_DTYPE,
@@ -484,6 +484,18 @@ def _array_equivalent_object(left, right, strict_nan):
484484
return True
485485

486486

487+
def array_equals(left: ArrayLike, right: ArrayLike) -> bool:
    """
    ExtensionArray-compatible counterpart of ``array_equivalent``.

    Parameters
    ----------
    left, right : ArrayLike
        The two arrays to compare.

    Returns
    -------
    bool
        False when the dtypes of ``left`` and ``right`` differ. Otherwise the
        result of ``left.equals(right)`` if ``left`` is an ExtensionArray, or
        of ``array_equivalent(left, right, dtype_equal=True)`` if it is not.
    """
    # Mismatched dtypes short-circuit before any element-wise comparison.
    if not is_dtype_equal(left.dtype, right.dtype):
        return False

    # ExtensionArrays supply their own notion of equality.
    if isinstance(left, ABCExtensionArray):
        return left.equals(right)

    # dtype equality was already established above, so tell
    # array_equivalent that it need not re-check.
    return array_equivalent(left, right, dtype_equal=True)
497+
498+
487499
def _infer_fill_value(val):
488500
"""
489501
infer the fill value for the nan/NaT from the provided

pandas/core/internals/managers.py

+4-23
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
from pandas.core.dtypes.common import (
2020
DT64NS_DTYPE,
2121
is_datetimelike_v_numeric,
22-
is_dtype_equal,
2322
is_extension_array_dtype,
2423
is_list_like,
2524
is_numeric_v_string_like,
@@ -28,10 +27,9 @@
2827
from pandas.core.dtypes.concat import concat_compat
2928
from pandas.core.dtypes.dtypes import ExtensionDtype
3029
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
31-
from pandas.core.dtypes.missing import array_equivalent, isna
30+
from pandas.core.dtypes.missing import array_equals, isna
3231

3332
import pandas.core.algorithms as algos
34-
from pandas.core.arrays import ExtensionArray
3533
from pandas.core.arrays.sparse import SparseDtype
3634
from pandas.core.base import PandasObject
3735
import pandas.core.common as com
@@ -49,7 +47,7 @@
4947
get_block_type,
5048
make_block,
5149
)
52-
from pandas.core.internals.ops import operate_blockwise
50+
from pandas.core.internals.ops import blockwise_all, operate_blockwise
5351

5452
# TODO: flexible with index=None and/or items=None
5553

@@ -1449,26 +1447,9 @@ def equals(self, other: "BlockManager") -> bool:
14491447
return False
14501448
left = self.blocks[0].values
14511449
right = other.blocks[0].values
1452-
if not is_dtype_equal(left.dtype, right.dtype):
1453-
return False
1454-
elif isinstance(left, ExtensionArray):
1455-
return left.equals(right)
1456-
else:
1457-
return array_equivalent(left, right)
1450+
return array_equals(left, right)
14581451

1459-
for i in range(len(self.items)):
1460-
# Check column-wise, return False if any column doesn't match
1461-
left = self.iget_values(i)
1462-
right = other.iget_values(i)
1463-
if not is_dtype_equal(left.dtype, right.dtype):
1464-
return False
1465-
elif isinstance(left, ExtensionArray):
1466-
if not left.equals(right):
1467-
return False
1468-
else:
1469-
if not array_equivalent(left, right, dtype_equal=True):
1470-
return False
1471-
return True
1452+
return blockwise_all(self, other, array_equals)
14721453

14731454
def unstack(self, unstacker, fill_value) -> "BlockManager":
14741455
"""

pandas/core/internals/ops.py

+44-17
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
from typing import TYPE_CHECKING, List, Tuple
1+
from collections import namedtuple
2+
from typing import TYPE_CHECKING, Iterator, List, Tuple
23

34
import numpy as np
45

@@ -9,13 +10,17 @@
910
from pandas.core.internals.managers import BlockManager # noqa:F401
1011

1112

12-
def operate_blockwise(
13-
left: "BlockManager", right: "BlockManager", array_op
14-
) -> "BlockManager":
13+
BlockPairInfo = namedtuple(
14+
"BlockPairInfo", ["lvals", "rvals", "locs", "left_ea", "right_ea", "rblk"],
15+
)
16+
17+
18+
def _iter_block_pairs(
19+
left: "BlockManager", right: "BlockManager"
20+
) -> Iterator[BlockPairInfo]:
1521
# At this point we have already checked the parent DataFrames for
1622
# assert rframe._indexed_same(lframe)
1723

18-
res_blks: List["Block"] = []
1924
for n, blk in enumerate(left.blocks):
2025
locs = blk.mgr_locs
2126
blk_vals = blk.values
@@ -34,21 +39,32 @@ def operate_blockwise(
3439
right_ea = not isinstance(rblk.values, np.ndarray)
3540

3641
lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea)
42+
info = BlockPairInfo(lvals, rvals, locs, left_ea, right_ea, rblk)
43+
yield info
3744

38-
res_values = array_op(lvals, rvals)
39-
if left_ea and not right_ea and hasattr(res_values, "reshape"):
40-
res_values = res_values.reshape(1, -1)
41-
nbs = rblk._split_op_result(res_values)
4245

43-
# Assertions are disabled for performance, but should hold:
44-
# if right_ea or left_ea:
45-
# assert len(nbs) == 1
46-
# else:
47-
# assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape)
46+
def operate_blockwise(
47+
left: "BlockManager", right: "BlockManager", array_op
48+
) -> "BlockManager":
49+
# At this point we have already checked the parent DataFrames for
50+
# assert rframe._indexed_same(lframe)
51+
52+
res_blks: List["Block"] = []
53+
for lvals, rvals, locs, left_ea, right_ea, rblk in _iter_block_pairs(left, right):
54+
res_values = array_op(lvals, rvals)
55+
if left_ea and not right_ea and hasattr(res_values, "reshape"):
56+
res_values = res_values.reshape(1, -1)
57+
nbs = rblk._split_op_result(res_values)
58+
59+
# Assertions are disabled for performance, but should hold:
60+
# if right_ea or left_ea:
61+
# assert len(nbs) == 1
62+
# else:
63+
# assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape)
4864

49-
_reset_block_mgr_locs(nbs, locs)
65+
_reset_block_mgr_locs(nbs, locs)
5066

51-
res_blks.extend(nbs)
67+
res_blks.extend(nbs)
5268

5369
# Assertions are disabled for performance, but should hold:
5470
# slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array}
@@ -85,7 +101,7 @@ def _get_same_shape_values(
85101
# Require that the indexing into lvals be slice-like
86102
assert rblk.mgr_locs.is_slice_like, rblk.mgr_locs
87103

88-
# TODO(EA2D): with 2D EAs pnly this first clause would be needed
104+
# TODO(EA2D): with 2D EAs only this first clause would be needed
89105
if not (left_ea or right_ea):
90106
lvals = lvals[rblk.mgr_locs.indexer, :]
91107
assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape)
@@ -102,3 +118,14 @@ def _get_same_shape_values(
102118
rvals = rvals[0, :]
103119

104120
return lvals, rvals
121+
122+
123+
def blockwise_all(left: "BlockManager", right: "BlockManager", op) -> bool:
    """
    Blockwise `all` reduction.

    Calls ``op(lvals, rvals)`` on each pair of values yielded by
    ``_iter_block_pairs(left, right)`` and stops at the first falsy result.

    Returns
    -------
    bool
        True if ``op`` returned a truthy value for every pair (including
        the case where no pairs are yielded), False otherwise.
    """
    # ``all`` over a generator short-circuits on the first falsy result,
    # so no further block pairs are evaluated after a mismatch.
    return all(
        op(pair.lvals, pair.rvals) for pair in _iter_block_pairs(left, right)
    )

0 commit comments

Comments
 (0)