From 61555f61705e10576863e2d99166171a4680184e Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 20 Jul 2020 13:18:45 -0700 Subject: [PATCH 1/6] PERF: blockwise equals --- pandas/core/internals/managers.py | 33 +++++++++------------- pandas/core/internals/ops.py | 47 ++++++++++++++++++++----------- 2 files changed, 44 insertions(+), 36 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 895385b170c91..3cfc06e5af3b6 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -49,7 +49,7 @@ get_block_type, make_block, ) -from pandas.core.internals.ops import operate_blockwise +from pandas.core.internals.ops import blockwise_all, operate_blockwise # TODO: flexible with index=None and/or items=None @@ -1422,32 +1422,25 @@ def equals(self, other: "BlockManager") -> bool: if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): return False - if self.ndim == 1: - # For SingleBlockManager (i.e.Series) - if other.ndim != 1: - return False - left = self.blocks[0].values - right = other.blocks[0].values + def blk_func(left, right): if not is_dtype_equal(left.dtype, right.dtype): return False elif isinstance(left, ExtensionArray): return left.equals(right) + elif isinstance(right, ExtensionArray): + return False else: - return array_equivalent(left, right) + return array_equivalent(left, right, dtype_equal=True) - for i in range(len(self.items)): - # Check column-wise, return False if any column doesn't match - left = self.iget_values(i) - right = other.iget_values(i) - if not is_dtype_equal(left.dtype, right.dtype): + if self.ndim == 1: + # For SingleBlockManager (i.e.Series) + if other.ndim != 1: return False - elif isinstance(left, ExtensionArray): - if not left.equals(right): - return False - else: - if not array_equivalent(left, right, dtype_equal=True): - return False - return True + left = self.blocks[0].values + right = other.blocks[0].values + return blk_func(left, right) + + return blockwise_all(self, other, blk_func) def unstack(self, unstacker, fill_value) -> "BlockManager": """ diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index fd9a9a5ef6c93..650f080b4ba05 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -9,13 +9,10 @@ from pandas.core.internals.blocks import Block # noqa:F401 -def operate_blockwise( - left: "BlockManager", right: "BlockManager", array_op -) -> "BlockManager": +def _iter_block_pairs(left: "BlockManager", right: "BlockManager"): # At this point we have already checked the parent DataFrames for # assert rframe._indexed_same(lframe) - res_blks: List["Block"] = [] for n, blk in enumerate(left.blocks): locs = blk.mgr_locs blk_vals = blk.values @@ -34,21 +31,31 @@ def operate_blockwise( right_ea = not isinstance(rblk.values, np.ndarray) lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea) + yield lvals, rvals, locs, left_ea, right_ea, rblk - res_values = array_op(lvals, rvals) - if left_ea and not right_ea and hasattr(res_values, "reshape"): - res_values = res_values.reshape(1, -1) - nbs = rblk._split_op_result(res_values) - # Assertions are disabled for performance, but should hold: - # if right_ea or left_ea: - # assert len(nbs) == 1 - # else: - # assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) +def operate_blockwise( + left: "BlockManager", right: "BlockManager", array_op +) -> "BlockManager": + # At this point we have already checked the parent DataFrames for + # assert rframe._indexed_same(lframe) + + res_blks: List["Block"] = [] + for lvals, rvals, locs, left_ea, right_ea, rblk in _iter_block_pairs(left, right): + res_values = array_op(lvals, rvals) + if left_ea and not right_ea and hasattr(res_values, "reshape"): + res_values = res_values.reshape(1, -1) + nbs = rblk._split_op_result(res_values) - _reset_block_mgr_locs(nbs, locs) + # Assertions are disabled for performance, but should hold: + # if right_ea or left_ea: + # assert len(nbs) == 1 + # else: + # assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) - res_blks.extend(nbs) + _reset_block_mgr_locs(nbs, locs) + + res_blks.extend(nbs) # Assertions are disabled for performance, but should hold: # slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array} @@ -85,7 +92,7 @@ def _get_same_shape_values( # Require that the indexing into lvals be slice-like assert rblk.mgr_locs.is_slice_like, rblk.mgr_locs - # TODO(EA2D): with 2D EAs pnly this first clause would be needed + # TODO(EA2D): with 2D EAs only this first clause would be needed if not (left_ea or right_ea): lvals = lvals[rblk.mgr_locs.indexer, :] assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) @@ -102,3 +109,11 @@ def _get_same_shape_values( rvals = rvals[0, :] return lvals, rvals + + +def blockwise_all(left: "BlockManager", right: "BlockManager", op) -> bool: + for lvals, rvals, _, _, _, _ in _iter_block_pairs(left, right): + res = op(lvals, rvals) + if not res: + return False + return True From 8b44750124c95eb8e2ab4d57ef95bfa8ad0afb0c Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 20 Jul 2020 20:17:08 -0700 Subject: [PATCH 2/6] docstring --- pandas/core/internals/ops.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index 650f080b4ba05..c9eaf0a2d9bbb 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -112,6 +112,9 @@ def _get_same_shape_values( def blockwise_all(left: "BlockManager", right: "BlockManager", op) -> bool: + """ + Blockwise `all` reduction. + """ for lvals, rvals, _, _, _, _ in _iter_block_pairs(left, right): res = op(lvals, rvals) if not res: From 528fd859d6f5e366ac0e1cdbf82a5dc4c2403509 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 22 Jul 2020 16:37:44 -0700 Subject: [PATCH 3/6] Use namedtuple --- pandas/core/internals/ops.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index c9eaf0a2d9bbb..c2c0f5fd30821 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -1,3 +1,4 @@ +from collections import namedtuple from typing import TYPE_CHECKING, List, Tuple import numpy as np @@ -9,6 +10,11 @@ from pandas.core.internals.blocks import Block # noqa:F401 +BlockPairInfo = namedtuple( + "BlockPairInfo", ["lvals", "rvals", "locs", "left_ea", "right_ea", "rblk"], +) + + def _iter_block_pairs(left: "BlockManager", right: "BlockManager"): # At this point we have already checked the parent DataFrames for # assert rframe._indexed_same(lframe) @@ -31,7 +37,8 @@ def _iter_block_pairs(left: "BlockManager", right: "BlockManager"): right_ea = not isinstance(rblk.values, np.ndarray) lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea) - yield lvals, rvals, locs, left_ea, right_ea, rblk + info = BlockPairInfo(lvals, rvals, locs, left_ea, right_ea, rblk) + yield info def operate_blockwise( @@ -115,8 +122,8 @@ def blockwise_all(left: "BlockManager", right: "BlockManager", op) -> bool: """ Blockwise `all` reduction. """ - for lvals, rvals, _, _, _, _ in _iter_block_pairs(left, right): - res = op(lvals, rvals) + for info in _iter_block_pairs(left, right): + res = op(info.lvals, info.rvals) if not res: return False return True From 11aeb304a95c52af032e18809fd22c981891256e Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 4 Aug 2020 20:14:46 -0700 Subject: [PATCH 4/6] annotate, make array_equals a module-level func --- pandas/core/internals/managers.py | 20 ++++---------------- pandas/core/internals/ops.py | 25 +++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3cfc06e5af3b6..41d549c119091 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -19,7 +19,6 @@ from pandas.core.dtypes.common import ( DT64NS_DTYPE, is_datetimelike_v_numeric, - is_dtype_equal, is_extension_array_dtype, is_list_like, is_numeric_v_string_like, @@ -28,10 +27,9 @@ from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -from pandas.core.dtypes.missing import array_equivalent, isna +from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algos -from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseDtype from pandas.core.base import PandasObject import pandas.core.common as com @@ -49,7 +47,7 @@ get_block_type, make_block, ) -from pandas.core.internals.ops import blockwise_all, operate_blockwise +from pandas.core.internals.ops import array_equals, blockwise_all, operate_blockwise # TODO: flexible with index=None and/or items=None @@ -1422,25 +1420,15 @@ def equals(self, other: "BlockManager") -> bool: if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): return False - def blk_func(left, right): - if not is_dtype_equal(left.dtype, right.dtype): - return False - elif isinstance(left, ExtensionArray): - return left.equals(right) - elif isinstance(right, ExtensionArray): - return False - else: - return array_equivalent(left, right, dtype_equal=True) - if self.ndim == 1: # For SingleBlockManager (i.e.Series) if other.ndim != 1: return False left = self.blocks[0].values right = other.blocks[0].values - return blk_func(left, right) + return array_equals(left, right) - return blockwise_all(self, other, blk_func) + return blockwise_all(self, other, array_equals) def unstack(self, unstacker, fill_value) -> "BlockManager": """ diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index 168a7e46c0a30..4a29ae8c55a0d 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -1,10 +1,15 @@ from collections import namedtuple -from typing import TYPE_CHECKING, List, Tuple +from typing import TYPE_CHECKING, Generator, List, Tuple import numpy as np from pandas._typing import ArrayLike +from pandas.core.dtypes.common import is_dtype_equal +from pandas.core.dtypes.missing import array_equivalent + +from pandas.core.arrays import ExtensionArray + if TYPE_CHECKING: from pandas.core.internals.blocks import Block # noqa:F401 from pandas.core.internals.managers import BlockManager # noqa:F401 @@ -15,7 +20,9 @@ ) -def _iter_block_pairs(left: "BlockManager", right: "BlockManager"): +def _iter_block_pairs( + left: "BlockManager", right: "BlockManager" +) -> Generator[BlockPairInfo, None, None]: # At this point we have already checked the parent DataFrames for # assert rframe._indexed_same(lframe) @@ -127,3 +134,17 @@ def blockwise_all(left: "BlockManager", right: "BlockManager", op) -> bool: if not res: return False return True + + +def array_equals(left: ArrayLike, right: ArrayLike) -> bool: + """ + ExtensionArray-compatible implementation of array_equivalent. + """ + if not is_dtype_equal(left.dtype, right.dtype): + return False + elif isinstance(left, ExtensionArray): + return left.equals(right) + elif isinstance(right, ExtensionArray): + return False + else: + return array_equivalent(left, right, dtype_equal=True) From 8b21d3e53e689128d458ba693562b2ad6a5af5b1 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 5 Aug 2020 08:04:38 -0700 Subject: [PATCH 5/6] generator->iterator --- pandas/core/dtypes/missing.py | 16 +++++++++++++++- pandas/core/internals/managers.py | 4 ++-- pandas/core/internals/ops.py | 23 ++--------------------- 3 files changed, 19 insertions(+), 24 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 8551ce9f14e6c..bba35a850494f 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -10,7 +10,7 @@ from pandas._libs import lib import pandas._libs.missing as libmissing from pandas._libs.tslibs import NaT, iNaT -from pandas._typing import DtypeObj +from pandas._typing import ArrayLike, DtypeObj from pandas.core.dtypes.common import ( DT64NS_DTYPE, @@ -484,6 +484,20 @@ def _array_equivalent_object(left, right, strict_nan): return True +def array_equals(left: ArrayLike, right: ArrayLike) -> bool: + """ + ExtensionArray-compatible implementation of array_equivalent. + """ + if not is_dtype_equal(left.dtype, right.dtype): + return False + elif isinstance(left, ABCExtensionArray): + return left.equals(right) + elif isinstance(right, ABCExtensionArray): + return False + else: + return array_equivalent(left, right, dtype_equal=True) + + def _infer_fill_value(val): """ infer the fill value for the nan/NaT from the provided diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 41d549c119091..b8079d4af8e29 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -27,7 +27,7 @@ from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import array_equals, isna import pandas.core.algorithms as algos from pandas.core.arrays.sparse import SparseDtype @@ -47,7 +47,7 @@ get_block_type, make_block, ) -from pandas.core.internals.ops import array_equals, blockwise_all, operate_blockwise +from pandas.core.internals.ops import blockwise_all, operate_blockwise # TODO: flexible with index=None and/or items=None diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index 4a29ae8c55a0d..ae4892c720d5b 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -1,15 +1,10 @@ from collections import namedtuple -from typing import TYPE_CHECKING, Generator, List, Tuple +from typing import TYPE_CHECKING, Iterator, List, Tuple import numpy as np from pandas._typing import ArrayLike -from pandas.core.dtypes.common import is_dtype_equal -from pandas.core.dtypes.missing import array_equivalent - -from pandas.core.arrays import ExtensionArray - if TYPE_CHECKING: from pandas.core.internals.blocks import Block # noqa:F401 from pandas.core.internals.managers import BlockManager # noqa:F401 @@ -22,7 +17,7 @@ def _iter_block_pairs( left: "BlockManager", right: "BlockManager" -) -> Generator[BlockPairInfo, None, None]: +) -> Iterator[BlockPairInfo]: # At this point we have already checked the parent DataFrames for # assert rframe._indexed_same(lframe) @@ -134,17 +129,3 @@ def blockwise_all(left: "BlockManager", right: "BlockManager", op) -> bool: if not res: return False return True - - -def array_equals(left: ArrayLike, right: ArrayLike) -> bool: - """ - ExtensionArray-compatible implementation of array_equivalent. - """ - if not is_dtype_equal(left.dtype, right.dtype): - return False - elif isinstance(left, ExtensionArray): - return left.equals(right) - elif isinstance(right, ExtensionArray): - return False - else: - return array_equivalent(left, right, dtype_equal=True) From 5e95f66e0c81c8ca076e3d10cb0b7d0e8e0ac1e5 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 6 Aug 2020 08:30:43 -0700 Subject: [PATCH 6/6] remove unnecessary check --- pandas/core/dtypes/missing.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index bba35a850494f..f59bb31af2828 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -492,8 +492,6 @@ def array_equals(left: ArrayLike, right: ArrayLike) -> bool: return False elif isinstance(left, ABCExtensionArray): return left.equals(right) - elif isinstance(right, ABCExtensionArray): - return False else: return array_equivalent(left, right, dtype_equal=True)