From 1697252a73785bb4ad1bfff82304d5c37534897f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 17 Mar 2020 09:59:21 -0700 Subject: [PATCH 01/32] PERF: block-wise arithmetic for frame-with-frame --- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/ops/__init__.py | 72 +++++++++++++++++++++- pandas/core/ops/array_ops.py | 2 +- pandas/tests/arithmetic/common.py | 9 ++- pandas/tests/arithmetic/test_datetime64.py | 7 ++- 5 files changed, 83 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 105d9581b1a25..46f2c239b4193 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1299,7 +1299,7 @@ def _addsub_object_array(self, other: np.ndarray, op): result : same class as self """ assert op in [operator.add, operator.sub] - if len(other) == 1: + if len(other) == 1 and self.ndim == other.ndim == 1: return op(self, other[0]) warnings.warn( diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 3153a9ac28c10..f80ad80e15d7b 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -5,7 +5,7 @@ """ import datetime import operator -from typing import TYPE_CHECKING, Optional, Set, Tuple +from typing import TYPE_CHECKING, List, Optional, Set, Tuple import numpy as np @@ -58,6 +58,7 @@ if TYPE_CHECKING: from pandas import DataFrame # noqa:F401 + from pandas.core.internals.blocks import Block # noqa: F401 # ----------------------------------------------------------------------------- # constants @@ -353,6 +354,70 @@ def fill_binop(left, right, fill_value): # Dispatch logic +def operate_blockwise(left, right, array_op): + assert right._indexed_same(left) + + res_blks: List["Block"] = [] + rmgr = right._data + for n, blk in enumerate(left._data.blocks): + locs = blk.mgr_locs + + blk_vals = blk.values + + if not isinstance(blk_vals, np.ndarray): + # 1D EA + assert len(locs) == 1, locs + rser = right.iloc[:, locs[0]] + rvals = extract_array(rser, extract_numpy=True) + res_values = array_op(blk_vals, rvals) + nbs = blk._split_op_result(res_values) + res_blks.extend(nbs) + continue + + rblks = rmgr._slice_take_blocks_ax0(locs.indexer) + + for k, rblk in enumerate(rblks): + lvals = blk_vals[rblk.mgr_locs.indexer, :] + rvals = rblk.values + + if not isinstance(rvals, np.ndarray): + # 1D EA + assert lvals.shape[0] == 1, lvals.shape + lvals = lvals[0, :] + res_values = array_op(lvals, rvals) + nbs = rblk._split_op_result(res_values) + assert len(nbs) == 1 + nb = nbs[0] + nb.mgr_locs = locs.as_array[nb.mgr_locs] + res_blks.append(nb) + continue + + assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + + res_values = array_op(lvals, rvals) + assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) + nbs = rblk._split_op_result(res_values) + for nb in nbs: + # TODO: maybe optimize by sticking with slices? 
+ nb_mgr_locs = nb.mgr_locs + nblocs = locs.as_array[nb_mgr_locs.indexer] + nb.mgr_locs = nblocs + assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape) + assert all(x in locs.as_array for x in nb.mgr_locs.as_array) + + res_blks.extend(nbs) + + slocs = set(y for nb in res_blks for y in nb.mgr_locs.as_array) + nlocs = sum(len(nb.mgr_locs.as_array) for nb in res_blks) + assert nlocs == len(left.columns), (nlocs, len(left.columns)) + assert len(slocs) == nlocs, (len(slocs), nlocs) + assert slocs == set(range(nlocs)), slocs + + # TODO: once this is working, pass do_integrity_check=False + new_mgr = type(rmgr)(res_blks, axes=rmgr.axes) + return new_mgr + + def dispatch_to_series(left, right, func, str_rep=None, axis=None): """ Evaluate the frame operation func(left, right) by evaluating @@ -385,8 +450,9 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): elif isinstance(right, ABCDataFrame): assert right._indexed_same(left) - def column_op(a, b): - return {i: func(a.iloc[:, i], b.iloc[:, i]) for i in range(len(a.columns))} + array_op = get_array_op(func, str_rep=str_rep) + bm = operate_blockwise(left, right, array_op) + return type(left)(bm) elif isinstance(right, ABCSeries) and axis == "columns": # We only get here if called via _combine_series_frame, diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index e285c53d9813e..dcef6d8f3c981 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -132,7 +132,7 @@ def masked_arith_op(x: np.ndarray, y, op): return result -def define_na_arithmetic_op(op, str_rep: str): +def define_na_arithmetic_op(op, str_rep: Optional[str]): def na_op(x, y): return na_arithmetic_op(x, y, op, str_rep) diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py index ccc49adc5da82..755fbd0d9036c 100644 --- a/pandas/tests/arithmetic/common.py +++ b/pandas/tests/arithmetic/common.py @@ -70,7 +70,14 @@ def assert_invalid_comparison(left, right, box): result = right != left tm.assert_equal(result, ~expected) - msg = "Invalid comparison between|Cannot compare type|not supported between" + msg = "|".join( + [ + "Invalid comparison between", + "Cannot compare type", + "not supported between", + "invalid type promotion", + ] + ) with pytest.raises(TypeError, match=msg): left < right with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index f7211ab5f9fd4..5cadf8bff51f1 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -964,7 +964,9 @@ def test_dt64arr_sub_dt64object_array(self, box_with_array, tz_naive_fixture): obj = tm.box_expected(dti, box_with_array) expected = tm.box_expected(expected, box_with_array) - warn = PerformanceWarning if box_with_array is not pd.DataFrame else None + warn = None + if box_with_array is not pd.DataFrame or tz_naive_fixture is None: + warn = PerformanceWarning with tm.assert_produces_warning(warn): result = obj - obj.astype(object) tm.assert_equal(result, expected) @@ -1388,8 +1390,7 @@ def test_dt64arr_add_mixed_offset_array(self, box_with_array): s = DatetimeIndex([Timestamp("2000-1-1"), Timestamp("2000-2-1")]) s = tm.box_expected(s, box_with_array) - warn = None if box_with_array is pd.DataFrame else PerformanceWarning - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): other = pd.Index([pd.offsets.DateOffset(years=1), pd.offsets.MonthEnd()]) other = 
tm.box_expected(other, box_with_array) result = s + other From 30a836d6ce7a7ef0c9f01daca99cb182bebbbcfa Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 17 Mar 2020 10:46:22 -0700 Subject: [PATCH 02/32] lint fixup --- pandas/core/ops/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index f80ad80e15d7b..e127be3b7d644 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -407,7 +407,7 @@ def operate_blockwise(left, right, array_op): res_blks.extend(nbs) - slocs = set(y for nb in res_blks for y in nb.mgr_locs.as_array) + slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array} nlocs = sum(len(nb.mgr_locs.as_array) for nb in res_blks) assert nlocs == len(left.columns), (nlocs, len(left.columns)) assert len(slocs) == nlocs, (len(slocs), nlocs) From 4334353b53ead13ffb777cb869a17e18687be4c2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 17 Mar 2020 19:14:02 -0700 Subject: [PATCH 03/32] troubleshoot npdev build --- pandas/core/ops/array_ops.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index dcef6d8f3c981..4338746f05386 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -5,6 +5,7 @@ from functools import partial import operator from typing import Any, Optional +import warnings import numpy as np @@ -163,15 +164,18 @@ def na_arithmetic_op(left, right, op, str_rep: Optional[str], is_cmp: bool = Fal """ import pandas.core.computation.expressions as expressions - try: - result = expressions.evaluate(op, str_rep, left, right) - except TypeError: - if is_cmp: - # numexpr failed on comparison op, e.g. ndarray[float] > datetime - # In this case we do not fall back to the masked op, as that - # will handle complex numbers incorrectly, see GH#32047 - raise - result = masked_arith_op(left, right, op) + with warnings.catch_warnings(): + # suppress warnings from numpy about element-wise comparison + warnings.simplefilter("ignore", DeprecationWarning) + try: + result = expressions.evaluate(op, str_rep, left, right) + except TypeError: + if is_cmp: + # numexpr failed on comparison op, e.g. 
ndarray[float] > datetime + # In this case we do not fall back to the masked op, as that + # will handle complex numbers incorrectly, see GH#32047 + raise + result = masked_arith_op(left, right, op) if is_cmp and (is_scalar(result) or result is NotImplemented): # numpy returned a scalar instead of operating element-wise From 713a776e551186e9e2e4d480f362072b60042f65 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 24 Mar 2020 19:25:57 -0700 Subject: [PATCH 04/32] comment --- pandas/core/arrays/datetimelike.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 6ca45f9acec91..2fdb33b7b8a8f 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1297,10 +1297,11 @@ def _addsub_object_array(self, other: np.ndarray, op): """ assert op in [operator.add, operator.sub] if len(other) == 1 and self.ndim == other.ndim == 1: + # If both 1D then broadcasting is unambiguous return op(self, other[0]) warnings.warn( - "Adding/subtracting array of DateOffsets to " + "Adding/subtracting object-dtype array to " f"{type(self).__name__} not vectorized", PerformanceWarning, ) @@ -1308,7 +1309,7 @@ def _addsub_object_array(self, other: np.ndarray, op): # Caller is responsible for broadcasting if necessary assert self.shape == other.shape, (self.shape, other.shape) - res_values = op(self.astype("O"), np.array(other)) + res_values = op(self.astype("O"), np.asarray(other)) result = array(res_values.ravel()) result = extract_array(result, extract_numpy=True).reshape(self.shape) return result From 95ef3adddd6554ff1aa21ba3304cc9b175f6c49b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 24 Mar 2020 20:45:20 -0700 Subject: [PATCH 05/32] checkpoint passing --- pandas/core/ops/__init__.py | 51 +++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index caf420f06a7fb..4fdd18418f423 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -304,43 +304,62 @@ def operate_blockwise(left, right, array_op): blk_vals = blk.values + left_ea = not isinstance(blk_vals, np.ndarray) + if not isinstance(blk_vals, np.ndarray): # 1D EA assert len(locs) == 1, locs - rser = right.iloc[:, locs[0]] - rvals = extract_array(rser, extract_numpy=True) + rblks = rmgr._slice_take_blocks_ax0(locs.indexer) + assert len(rblks) == 1, rblks + rblk = rblks[0] + assert rblk.shape[0] == 1, rblk.shape + + rvals = rblk.values + right_ea = not isinstance(rvals, np.ndarray) + if not right_ea: + assert rvals.shape[0] == 1, rvals.shape + rvals = rvals[0, :] + #rser = right.iloc[:, locs[0]] + #rvals = extract_array(rser, extract_numpy=True) res_values = array_op(blk_vals, rvals) nbs = blk._split_op_result(res_values) + # Setting nb.mgr_locs is unnecessary here, but harmless res_blks.extend(nbs) continue rblks = rmgr._slice_take_blocks_ax0(locs.indexer) for k, rblk in enumerate(rblks): - lvals = blk_vals[rblk.mgr_locs.indexer, :] rvals = rblk.values + right_ea = not isinstance(rvals, np.ndarray) - if not isinstance(rvals, np.ndarray): + lvals = blk_vals[rblk.mgr_locs.indexer, :] + + if not (left_ea or right_ea): + assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + #elif left_ea and right_ea: + # assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + elif right_ea: # 1D EA assert lvals.shape[0] == 1, lvals.shape lvals = lvals[0, :] - res_values = array_op(lvals, rvals) - nbs 
= rblk._split_op_result(res_values) - assert len(nbs) == 1 - nb = nbs[0] - nb.mgr_locs = locs.as_array[nb.mgr_locs] - res_blks.append(nb) - continue - - assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + else: + assert False # should be unreachable ATM + #assert rvals.shape[0] == 1, rvals.shape + #rvals = rvals[0, :] res_values = array_op(lvals, rvals) - assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) nbs = rblk._split_op_result(res_values) + + # Debugging assertions + if right_ea: # or left_ea + assert len(nbs) == 1 + else: + assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) + for nb in nbs: # TODO: maybe optimize by sticking with slices? - nb_mgr_locs = nb.mgr_locs - nblocs = locs.as_array[nb_mgr_locs.indexer] + nblocs = locs.as_array[nb.mgr_locs.indexer] nb.mgr_locs = nblocs assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape) assert all(x in locs.as_array for x in nb.mgr_locs.as_array) From 61e5cd6260d3a94f195fa5466a6c9cc0a6bd7607 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 25 Mar 2020 09:29:34 -0700 Subject: [PATCH 06/32] checkpoint passing --- pandas/core/ops/__init__.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 4fdd18418f423..687d1b8bb7891 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -306,7 +306,7 @@ def operate_blockwise(left, right, array_op): left_ea = not isinstance(blk_vals, np.ndarray) - if not isinstance(blk_vals, np.ndarray): + if False:#left_ea: # 1D EA assert len(locs) == 1, locs rblks = rmgr._slice_take_blocks_ax0(locs.indexer) @@ -319,8 +319,6 @@ def operate_blockwise(left, right, array_op): if not right_ea: assert rvals.shape[0] == 1, rvals.shape rvals = rvals[0, :] - #rser = right.iloc[:, locs[0]] - #rvals = extract_array(rser, extract_numpy=True) res_values = array_op(blk_vals, rvals) nbs = blk._split_op_result(res_values) # Setting nb.mgr_locs is unnecessary here, but harmless @@ -329,30 +327,38 @@ def operate_blockwise(left, right, array_op): rblks = rmgr._slice_take_blocks_ax0(locs.indexer) + if left_ea: + assert len(locs) == 1, locs + assert len(rblks) == 1, rblks + assert rblks[0].shape[0] == 1, rblks[0].shape + for k, rblk in enumerate(rblks): rvals = rblk.values right_ea = not isinstance(rvals, np.ndarray) - lvals = blk_vals[rblk.mgr_locs.indexer, :] + #lvals = blk_vals[rblk.mgr_locs.indexer, :] if not (left_ea or right_ea): + lvals = blk_vals[rblk.mgr_locs.indexer, :] + assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + elif left_ea and right_ea: + lvals = blk_vals assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) - #elif left_ea and right_ea: - # assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) elif right_ea: # 1D EA + lvals = blk_vals[rblk.mgr_locs.indexer, :] assert lvals.shape[0] == 1, lvals.shape lvals = lvals[0, :] else: - assert False # should be unreachable ATM - #assert rvals.shape[0] == 1, rvals.shape - #rvals = rvals[0, :] + lvals = blk_vals + assert rvals.shape[0] == 1, rvals.shape + rvals = rvals[0, :] res_values = array_op(lvals, rvals) nbs = rblk._split_op_result(res_values) # Debugging assertions - if right_ea: # or left_ea + if right_ea or left_ea: assert len(nbs) == 1 else: assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) From 89c3d7bdef4a1b89f14161ac4db3a87611aa3cec Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 25 Mar 2020 10:03:31 -0700 Subject: 
[PATCH 07/32] refactor --- pandas/core/ops/__init__.py | 75 ++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 43 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 687d1b8bb7891..b285c5494ee71 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -4,7 +4,7 @@ This is not a public API. """ import operator -from typing import TYPE_CHECKING, List, Optional, Set +from typing import TYPE_CHECKING, List, Optional, Set, Tuple import numpy as np @@ -297,34 +297,41 @@ def fill_binop(left, right, fill_value): def operate_blockwise(left, right, array_op): assert right._indexed_same(left) + def get_same_shape_values( + lblk: "Block", rblk: "Block", left_ea: bool, right_ea: bool + ) -> Tuple[ArrayLike, ArrayLike]: + """ + Slice lblk.values to align with rblk. Squeeze if we have EAs. + """ + lvals = lblk.values + rvals = rblk.values + + # TODO(EA2D): with 2D EAs pnly this first clause would be needed + if not (left_ea or right_ea): + lvals = lvals[rblk.mgr_locs.indexer, :] + assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + elif left_ea and right_ea: + assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + elif right_ea: + # lvals are 2D, rvals are 1D + lvals = lvals[rblk.mgr_locs.indexer, :] + assert lvals.shape[0] == 1, lvals.shape + lvals = lvals[0, :] + else: + # lvals are 1D, rvals are 2D + assert rvals.shape[0] == 1, rvals.shape + rvals = rvals[0, :] + + return lvals, rvals + res_blks: List["Block"] = [] rmgr = right._data for n, blk in enumerate(left._data.blocks): locs = blk.mgr_locs - blk_vals = blk.values left_ea = not isinstance(blk_vals, np.ndarray) - if False:#left_ea: - # 1D EA - assert len(locs) == 1, locs - rblks = rmgr._slice_take_blocks_ax0(locs.indexer) - assert len(rblks) == 1, rblks - rblk = rblks[0] - assert rblk.shape[0] == 1, rblk.shape - - rvals = rblk.values - right_ea = not isinstance(rvals, np.ndarray) - if not right_ea: - assert rvals.shape[0] == 1, rvals.shape - rvals = rvals[0, :] - res_values = array_op(blk_vals, rvals) - nbs = blk._split_op_result(res_values) - # Setting nb.mgr_locs is unnecessary here, but harmless - res_blks.extend(nbs) - continue - rblks = rmgr._slice_take_blocks_ax0(locs.indexer) if left_ea: @@ -333,38 +340,20 @@ def operate_blockwise(left, right, array_op): assert rblks[0].shape[0] == 1, rblks[0].shape for k, rblk in enumerate(rblks): - rvals = rblk.values - right_ea = not isinstance(rvals, np.ndarray) - - #lvals = blk_vals[rblk.mgr_locs.indexer, :] - - if not (left_ea or right_ea): - lvals = blk_vals[rblk.mgr_locs.indexer, :] - assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) - elif left_ea and right_ea: - lvals = blk_vals - assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) - elif right_ea: - # 1D EA - lvals = blk_vals[rblk.mgr_locs.indexer, :] - assert lvals.shape[0] == 1, lvals.shape - lvals = lvals[0, :] - else: - lvals = blk_vals - assert rvals.shape[0] == 1, rvals.shape - rvals = rvals[0, :] + right_ea = not isinstance(rblk.values, np.ndarray) + + lvals, rvals = get_same_shape_values(blk, rblk, left_ea, right_ea) res_values = array_op(lvals, rvals) nbs = rblk._split_op_result(res_values) - # Debugging assertions if right_ea or left_ea: assert len(nbs) == 1 else: assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) for nb in nbs: - # TODO: maybe optimize by sticking with slices? 
+ # Reset mgr_locs to correspond to our original DataFrame nblocs = locs.as_array[nb.mgr_locs.indexer] nb.mgr_locs = nblocs assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape) From e348e464457556b1678c898c9cdc7366cd0b3877 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 25 Mar 2020 10:11:25 -0700 Subject: [PATCH 08/32] blackify --- pandas/tests/frame/test_arithmetic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 18cd2a4b0c90b..0b73583002d5c 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1,7 +1,6 @@ from collections import deque from datetime import datetime import operator -import re import numpy as np import pytest @@ -47,10 +46,11 @@ def check(df, df2): ) tm.assert_frame_equal(result, expected) - msg = "|".join([ + msgs = [ r"Invalid comparison between dtype=datetime64\[ns\] and ndarray", "invalid type promotion", - ]) + ] + msg = "|".join(msgs) with pytest.raises(TypeError, match=msg): x >= y with pytest.raises(TypeError, match=msg): From 2b1ba182144759b68e1a7b4eb8f3bfca4f2a05fb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 31 Mar 2020 14:44:03 -0700 Subject: [PATCH 09/32] disable assertions for perf --- pandas/core/ops/__init__.py | 19 ++++++++++--------- pandas/tests/frame/test_arithmetic.py | 1 + 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index b285c5494ee71..3911380c7f795 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -356,19 +356,20 @@ def get_same_shape_values( # Reset mgr_locs to correspond to our original DataFrame nblocs = locs.as_array[nb.mgr_locs.indexer] nb.mgr_locs = nblocs - assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape) - assert all(x in locs.as_array for x in nb.mgr_locs.as_array) + # Assertions are disabled for performance, but should hold: + # assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape) + # assert all(x in locs.as_array for x in nb.mgr_locs.as_array) res_blks.extend(nbs) - slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array} - nlocs = sum(len(nb.mgr_locs.as_array) for nb in res_blks) - assert nlocs == len(left.columns), (nlocs, len(left.columns)) - assert len(slocs) == nlocs, (len(slocs), nlocs) - assert slocs == set(range(nlocs)), slocs + # Assertions are disabled for performance, but should hold: + # slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array} + # nlocs = sum(len(nb.mgr_locs.as_array) for nb in res_blks) + # assert nlocs == len(left.columns), (nlocs, len(left.columns)) + # assert len(slocs) == nlocs, (len(slocs), nlocs) + # assert slocs == set(range(nlocs)), slocs - # TODO: once this is working, pass do_integrity_check=False - new_mgr = type(rmgr)(res_blks, axes=rmgr.axes) + new_mgr = type(rmgr)(res_blks, axes=rmgr.axes, do_integrity_check=False) return new_mgr diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 01ea3fc8676de..a5d696be24d0a 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1,6 +1,7 @@ from collections import deque from datetime import datetime import operator +import re import numpy as np import pytest From 91c86a35072e87ec5fa16fb41a89f4c8c7cb65df Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 31 Mar 2020 21:05:01 -0700 Subject: [PATCH 10/32] asv --- asv_bench/benchmarks/arithmetic.py | 48 
++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index 5a8b109c21858..b53ef9505990d 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -80,6 +80,54 @@ def time_frame_op_with_series_axis0(self, opname): getattr(self.df, opname)(self.ser, axis=0) +class FrameWithFrameWide: + # Many-columns, mixed dtypes + + params = [ + [ + operator.add, + operator.sub, + operator.mul, + operator.truediv, + operator.floordiv, + operator.pow, + operator.mod, + operator.eq, + operator.ne, + operator.gt, + operator.ge, + operator.lt, + operator.le, + ] + ] + param_names = ["op"] + + def setup(self, op): + # we choose dtypes so as to make the blocks + # a) not perfectly match between right and left + # b) appreciably bigger than single columns + arr = np.random.randn(10 ** 6).reshape(500, 2000).astype(np.float64) + df = pd.DataFrame(arr) + df[1000] = df[1000].astype(np.float32) + df.iloc[:, 1000:] = df.iloc[:, 1000:].astype(np.float32) + + # TODO: GH#33198 the setting here shoudlnt need two steps + df2 = pd.DataFrame(arr) + df2[1000] = df2[1000].astype(np.int64) + df2.iloc[:, 500:1500] = df2.iloc[:, 500:1500].astype(np.int64) + + self.left = df + self.right = df + + def time_op_different_blocks(self, op): + # blocks (and dtypes) are not aligned + op(self.left, self.right) + + def time_op_same_blocks(self, op): + # blocks (and dtypes) are aligned + op(self.left, self.left) + + class Ops: params = [[True, False], ["default", 1]] From 2034084db2a999691bcf28f424fc520dadcdeead Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 1 Apr 2020 08:03:28 -0700 Subject: [PATCH 11/32] whatsnew --- doc/source/whatsnew/v1.1.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 25f847c698278..d424315f5e416 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -274,7 +274,8 @@ Performance improvements :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`, :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). - Performance improvement in :meth:`Series.sum` for nullable (integer and boolean) dtypes (:issue:`30982`). - +- Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`) +- .. --------------------------------------------------------------------------- From 0c12d35e5cb4737fae982a8df1aa90fc9332c1d6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 3 Apr 2020 10:14:00 -0700 Subject: [PATCH 12/32] revert warning suppression --- pandas/core/ops/array_ops.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index f419c548b27c1..c17955457245b 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -6,7 +6,6 @@ from functools import partial import operator from typing import Any, Optional, Tuple -import warnings import numpy as np @@ -159,17 +158,14 @@ def na_arithmetic_op(left, right, op, str_rep: Optional[str], is_cmp: bool = Fal """ import pandas.core.computation.expressions as expressions - with warnings.catch_warnings(): - # suppress warnings from numpy about element-wise comparison - warnings.simplefilter("ignore", DeprecationWarning) - try: - result = expressions.evaluate(op, str_rep, left, right) - except TypeError: - if is_cmp: - # numexpr failed on comparison op, e.g. 
ndarray[float] > datetime - # In this case we do not fall back to the masked op, as that - # will handle complex numbers incorrectly, see GH#32047 - raise + try: + result = expressions.evaluate(op, str_rep, left, right) + except TypeError: + if is_cmp: + # numexpr failed on comparison op, e.g. ndarray[float] > datetime + # In this case we do not fall back to the masked op, as that + # will handle complex numbers incorrectly, see GH#32047 + raise result = masked_arith_op(left, right, op) if is_cmp and (is_scalar(result) or result is NotImplemented): From 9727562801ac6a1a98bd4ee348fe3666ec49e801 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 3 Apr 2020 10:51:49 -0700 Subject: [PATCH 13/32] Fixupm indentation --- pandas/core/ops/array_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index c17955457245b..05ec48c206b3c 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -166,7 +166,7 @@ def na_arithmetic_op(left, right, op, str_rep: Optional[str], is_cmp: bool = Fal # In this case we do not fall back to the masked op, as that # will handle complex numbers incorrectly, see GH#32047 raise - result = masked_arith_op(left, right, op) + result = masked_arith_op(left, right, op) if is_cmp and (is_scalar(result) or result is NotImplemented): # numpy returned a scalar instead of operating element-wise From 42bbbf3c7ee0ea3d5352191c380972842bf0edd4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 4 Apr 2020 15:42:26 -0700 Subject: [PATCH 14/32] suppress warning --- pandas/core/ops/array_ops.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 05ec48c206b3c..efd0500d351f4 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -6,6 +6,7 @@ from functools import partial import operator from typing import Any, Optional, Tuple +import warnings import numpy as np @@ -308,8 +309,13 @@ def comparison_op( res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) else: - with np.errstate(all="ignore"): - res_values = na_arithmetic_op(lvalues, rvalues, op, str_rep, is_cmp=True) + with warnings.catch_warnings(): + # suppress warnings from numpy about element-wise comparison + warnings.simplefilter("ignore", DeprecationWarning) + with np.errstate(all="ignore"): + res_values = na_arithmetic_op( + lvalues, rvalues, op, str_rep, is_cmp=True + ) return res_values From 0d958a3b6022cec977fc9b5f6fc802da62d2f284 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Apr 2020 10:34:43 -0700 Subject: [PATCH 15/32] update asv --- asv_bench/benchmarks/arithmetic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index 0473743dabf46..c5ad533ed82b1 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -127,11 +127,13 @@ def setup(self, op): df = pd.DataFrame(arr) df[1000] = df[1000].astype(np.float32) df.iloc[:, 1000:] = df.iloc[:, 1000:].astype(np.float32) + df._consolidate_inplace() # TODO: GH#33198 the setting here shoudlnt need two steps df2 = pd.DataFrame(arr) df2[1000] = df2[1000].astype(np.int64) df2.iloc[:, 500:1500] = df2.iloc[:, 500:1500].astype(np.int64) + df2._consolidate_inplace() self.left = df self.right = df From 56eef516d9fc83d9dfa82c41b61c0d86d7aaf202 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Apr 2020 11:09:41 -0700 Subject: [PATCH 16/32] 
_data->_mgr --- pandas/core/ops/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index bd12fe122bbf0..4d7f4fb026ae8 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -325,8 +325,8 @@ def get_same_shape_values( return lvals, rvals res_blks: List["Block"] = [] - rmgr = right._data - for n, blk in enumerate(left._data.blocks): + rmgr = right._mgr + for n, blk in enumerate(left._mgr.blocks): locs = blk.mgr_locs blk_vals = blk.values From ae744b74a082b7305c81b53a02f6ab3762afa2fd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 9 Apr 2020 14:33:38 -0700 Subject: [PATCH 17/32] update to use faspath constructor --- pandas/core/frame.py | 4 +++- pandas/core/ops/__init__.py | 19 +++++++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c3018861bce57..e7732265beea1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -442,6 +442,7 @@ def __init__( mgr = self._init_mgr( data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy ) + elif isinstance(data, dict): mgr = init_dict(data, index, columns, dtype=dtype) elif isinstance(data, ma.MaskedArray): @@ -5453,10 +5454,11 @@ def _construct_result(self, result) -> "DataFrame": ------- DataFrame """ - out = self._constructor(result, index=self.index, copy=False) + out = self._constructor(result, copy=False) # Pin columns instead of passing to constructor for compat with # non-unique columns case out.columns = self.columns + out.index = self.index return out def combine( diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 4d7f4fb026ae8..8bd6e4244e818 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -332,12 +332,14 @@ def get_same_shape_values( left_ea = not isinstance(blk_vals, np.ndarray) + # TODO: joris says this is costly, see if we can optimize rblks = rmgr._slice_take_blocks_ax0(locs.indexer) - if left_ea: - assert len(locs) == 1, locs - assert len(rblks) == 1, rblks - assert rblks[0].shape[0] == 1, rblks[0].shape + # Assertions are disabled for performance, but should hold: + # if left_ea: + # assert len(locs) == 1, locs + # assert len(rblks) == 1, rblks + # assert rblks[0].shape[0] == 1, rblks[0].shape for k, rblk in enumerate(rblks): right_ea = not isinstance(rblk.values, np.ndarray) @@ -347,10 +349,11 @@ def get_same_shape_values( res_values = array_op(lvals, rvals) nbs = rblk._split_op_result(res_values) - if right_ea or left_ea: - assert len(nbs) == 1 - else: - assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) + # Assertions are disabled for performance, but should hold: + # if right_ea or left_ea: + # assert len(nbs) == 1 + # else: + # assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) for nb in nbs: # Reset mgr_locs to correspond to our original DataFrame From f42c40334812f2765f19565d03e82e9263ce1039 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 10 Apr 2020 11:00:55 -0700 Subject: [PATCH 18/32] update import --- pandas/core/ops/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index fede06a8970b9..5f6bc230917a4 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -10,7 +10,7 @@ from pandas._libs import lib from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op # noqa:F401 -from pandas._typing 
import Level +from pandas._typing import ArrayLike, Level from pandas.util._decorators import Appender from pandas.core.dtypes.common import is_list_like From 8a2807efdc0e3bd14f44a8dc5efbd54226346647 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 10 Apr 2020 11:01:13 -0700 Subject: [PATCH 19/32] remove unused import --- pandas/core/internals/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 75c935cdf2e60..80573f32b936e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -8,7 +8,7 @@ from pandas._libs import NaT, algos as libalgos, lib, writers import pandas._libs.internals as libinternals -from pandas._libs.tslibs import Timedelta, conversion +from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import tz_compare from pandas._typing import ArrayLike from pandas.util._validators import validate_bool_kwarg From fd10fb6cf39619e9a0e66affa5865f6f7755e22f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 10 Apr 2020 14:11:26 -0700 Subject: [PATCH 20/32] rebase compat --- pandas/core/ops/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 5f6bc230917a4..6b703dd7f40aa 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -347,6 +347,8 @@ def get_same_shape_values( lvals, rvals = get_same_shape_values(blk, rblk, left_ea, right_ea) res_values = array_op(lvals, rvals) + if left_ea and not right_ea and hasattr(res_values, "reshape"): + res_values = res_values.reshape(1, -1) nbs = rblk._split_op_result(res_values) # Assertions are disabled for performance, but should hold: From 7150e87363cc205f0df380c06812de1ca264d8c3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 11 Apr 2020 18:09:00 -0700 Subject: [PATCH 21/32] slice instead of take --- pandas/core/internals/managers.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index d1293974b776a..175f0b5332f12 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1322,6 +1322,7 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): if allow_fill and fill_value is None: _, fill_value = maybe_promote(blk.dtype) + # TODO: Any cases where we can optimize this to slice? return [ blk.take_nd( slobj, @@ -1369,11 +1370,18 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): blocks.append(newblk) else: - blocks.append( - blk.take_nd( - blklocs[mgr_locs.indexer], axis=0, new_mgr_locs=mgr_locs, - ) - ) + taker = blklocs[mgr_locs.indexer] + # TODO: taker.max() probably isnt the Technically Correct + # way of calling this? + taker = lib.maybe_indices_to_slice(taker, taker.max()) + + if isinstance(taker, slice): + nb = blk.getitem_block(taker) + nb.mgr_locs = mgr_locs + else: + # TODO: just use getitem_block anyway? 
+ nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) + blocks.append(nb) return blocks From 0ca2125d85f1b68ad4bc03fc462278b5bb387700 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 14 Apr 2020 07:06:21 -0700 Subject: [PATCH 22/32] Dummy commit to force CI From 2bfc30885b6e3a0bcf1d60ab8b0e731b90c1579e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 15 Apr 2020 07:22:30 -0700 Subject: [PATCH 23/32] update call bound --- pandas/core/internals/managers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index bdeb7820d2762..12f7fd3decb43 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1371,9 +1371,9 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): else: taker = blklocs[mgr_locs.indexer] - # TODO: taker.max() probably isnt the Technically Correct + # TODO: taker.max()+1 probably isnt the Technically Correct # way of calling this? - taker = lib.maybe_indices_to_slice(taker, taker.max()) + taker = lib.maybe_indices_to_slice(taker, taker.max() + 1) if isinstance(taker, slice): nb = blk.getitem_block(taker) From d5ad2a079c1e5ddd097f9bdff32c1ecf8dce890e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 15 Apr 2020 11:49:57 -0700 Subject: [PATCH 24/32] update max_len --- pandas/core/internals/managers.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 12f7fd3decb43..089a12ff40bd9 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1322,7 +1322,6 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): if allow_fill and fill_value is None: _, fill_value = maybe_promote(blk.dtype) - # TODO: Any cases where we can optimize this to slice? return [ blk.take_nd( slobj, @@ -1371,13 +1370,11 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): else: taker = blklocs[mgr_locs.indexer] - # TODO: taker.max()+1 probably isnt the Technically Correct - # way of calling this? - taker = lib.maybe_indices_to_slice(taker, taker.max() + 1) + max_len = max(len(mgr_locs), taker.max() + 1) + taker = lib.maybe_indices_to_slice(taker, max_len) if isinstance(taker, slice): - nb = blk.getitem_block(taker) - nb.mgr_locs = mgr_locs + nb = blk.getitem_block(taker, new_mgr_locs=mgr_locs) else: # TODO: just use getitem_block anyway? nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) From e78570d3c73411bfb9dff3dd4926333ded488d04 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 17 Apr 2020 13:25:25 -0700 Subject: [PATCH 25/32] never take --- pandas/core/internals/managers.py | 24 ++++++++++++++++++++---- pandas/core/ops/__init__.py | 3 +-- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b51944d0d9c26..8aa31bbb01720 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1253,12 +1253,21 @@ def reindex_indexer( return type(self).from_blocks(new_blocks, new_axes) - def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): + def _slice_take_blocks_ax0( + self, slice_or_indexer, fill_value=lib.no_default, only_slice: bool = False + ): """ Slice/take blocks along axis=0. 
Overloaded for SingleBlock + Parameters + ---------- + slice_or_indexer : slice, ndarray[bool], or list-like of ints + fill_value : scalar, default lib.no_default + only_slice : bool, default False + If True, we always return views on existing arrays, never copies. + Returns ------- new_blocks : list of Block @@ -1331,14 +1340,21 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): else: taker = blklocs[mgr_locs.indexer] max_len = max(len(mgr_locs), taker.max() + 1) - taker = lib.maybe_indices_to_slice(taker, max_len) + if only_slice: + taker = lib.maybe_indices_to_slice(taker, max_len) if isinstance(taker, slice): nb = blk.getitem_block(taker, new_mgr_locs=mgr_locs) + blocks.append(nb) + elif only_slice: + # GH#33597 slice instead of take, so we get + # views instead of copies + for i, ml in zip(taker, mgr_locs): + nb = blk.getitem_block([i], new_mgr_locs=ml) + blocks.append(nb) else: - # TODO: just use getitem_block anyway? nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) - blocks.append(nb) + blocks.append(nb) return blocks diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 4065ac4ff68f8..8b1b0415fe4e3 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -332,8 +332,7 @@ def get_same_shape_values( left_ea = not isinstance(blk_vals, np.ndarray) - # TODO: joris says this is costly, see if we can optimize - rblks = rmgr._slice_take_blocks_ax0(locs.indexer) + rblks = rmgr._slice_take_blocks_ax0(locs.indexer, only_slice=True) # Assertions are disabled for performance, but should hold: # if left_ea: From 30f655b662fef6943ef3e46c6b1288b25240a67b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 25 Apr 2020 14:45:43 -0700 Subject: [PATCH 26/32] REF: move operate_blockwise to new file --- pandas/core/internals/managers.py | 2 + pandas/core/ops/__init__.py | 88 +----------------------------- pandas/core/ops/blockwise.py | 91 +++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 85 deletions(-) create mode 100644 pandas/core/ops/blockwise.py diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 44e877eecd172..c3ce288e3ad9b 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1318,6 +1318,8 @@ def _slice_take_blocks_ax0( blocks.append(newblk) else: + # GH#32779 to avoid the performance penalty of copying, + # we may try to only slice taker = blklocs[mgr_locs.indexer] max_len = max(len(mgr_locs), taker.max() + 1) if only_slice: diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 4c532a0e22132..0b3775f876b39 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -4,13 +4,13 @@ This is not a public API. 
""" import operator -from typing import TYPE_CHECKING, List, Optional, Set, Tuple +from typing import TYPE_CHECKING, Optional, Set import numpy as np from pandas._libs import lib from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op # noqa:F401 -from pandas._typing import ArrayLike, Level +from pandas._typing import Level from pandas.util._decorators import Appender from pandas.core.dtypes.common import is_list_like @@ -26,6 +26,7 @@ logical_op, ) from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY # noqa:F401 +from pandas.core.ops.blockwise import operate_blockwise from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.dispatch import should_series_dispatch from pandas.core.ops.docstrings import ( @@ -294,89 +295,6 @@ def fill_binop(left, right, fill_value): # Dispatch logic -def operate_blockwise(left, right, array_op): - assert right._indexed_same(left) - - def get_same_shape_values( - lblk: "Block", rblk: "Block", left_ea: bool, right_ea: bool - ) -> Tuple[ArrayLike, ArrayLike]: - """ - Slice lblk.values to align with rblk. Squeeze if we have EAs. - """ - lvals = lblk.values - rvals = rblk.values - - # TODO(EA2D): with 2D EAs pnly this first clause would be needed - if not (left_ea or right_ea): - lvals = lvals[rblk.mgr_locs.indexer, :] - assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) - elif left_ea and right_ea: - assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) - elif right_ea: - # lvals are 2D, rvals are 1D - lvals = lvals[rblk.mgr_locs.indexer, :] - assert lvals.shape[0] == 1, lvals.shape - lvals = lvals[0, :] - else: - # lvals are 1D, rvals are 2D - assert rvals.shape[0] == 1, rvals.shape - rvals = rvals[0, :] - - return lvals, rvals - - res_blks: List["Block"] = [] - rmgr = right._mgr - for n, blk in enumerate(left._mgr.blocks): - locs = blk.mgr_locs - blk_vals = blk.values - - left_ea = not isinstance(blk_vals, np.ndarray) - - rblks = rmgr._slice_take_blocks_ax0(locs.indexer, only_slice=True) - - # Assertions are disabled for performance, but should hold: - # if left_ea: - # assert len(locs) == 1, locs - # assert len(rblks) == 1, rblks - # assert rblks[0].shape[0] == 1, rblks[0].shape - - for k, rblk in enumerate(rblks): - right_ea = not isinstance(rblk.values, np.ndarray) - - lvals, rvals = get_same_shape_values(blk, rblk, left_ea, right_ea) - - res_values = array_op(lvals, rvals) - if left_ea and not right_ea and hasattr(res_values, "reshape"): - res_values = res_values.reshape(1, -1) - nbs = rblk._split_op_result(res_values) - - # Assertions are disabled for performance, but should hold: - # if right_ea or left_ea: - # assert len(nbs) == 1 - # else: - # assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) - - for nb in nbs: - # Reset mgr_locs to correspond to our original DataFrame - nblocs = locs.as_array[nb.mgr_locs.indexer] - nb.mgr_locs = nblocs - # Assertions are disabled for performance, but should hold: - # assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape) - # assert all(x in locs.as_array for x in nb.mgr_locs.as_array) - - res_blks.extend(nbs) - - # Assertions are disabled for performance, but should hold: - # slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array} - # nlocs = sum(len(nb.mgr_locs.as_array) for nb in res_blks) - # assert nlocs == len(left.columns), (nlocs, len(left.columns)) - # assert len(slocs) == nlocs, (len(slocs), nlocs) - # assert slocs == set(range(nlocs)), slocs - - new_mgr = type(rmgr)(res_blks, axes=rmgr.axes, 
do_integrity_check=False) - return new_mgr - - def dispatch_to_series(left, right, func, str_rep=None, axis=None): """ Evaluate the frame operation func(left, right) by evaluating diff --git a/pandas/core/ops/blockwise.py b/pandas/core/ops/blockwise.py new file mode 100644 index 0000000000000..509d8b9cc2d7b --- /dev/null +++ b/pandas/core/ops/blockwise.py @@ -0,0 +1,91 @@ +from typing import TYPE_CHECKING, List, Tuple + +import numpy as np + +from pandas._typing import ArrayLike + +if TYPE_CHECKING: + from pandas.core.internals.blocks import Block # noqa:F401 + + +def operate_blockwise(left, right, array_op): + assert right._indexed_same(left) + + def get_same_shape_values( + lblk: "Block", rblk: "Block", left_ea: bool, right_ea: bool + ) -> Tuple[ArrayLike, ArrayLike]: + """ + Slice lblk.values to align with rblk. Squeeze if we have EAs. + """ + lvals = lblk.values + rvals = rblk.values + + # TODO(EA2D): with 2D EAs pnly this first clause would be needed + if not (left_ea or right_ea): + lvals = lvals[rblk.mgr_locs.indexer, :] + assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + elif left_ea and right_ea: + assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + elif right_ea: + # lvals are 2D, rvals are 1D + lvals = lvals[rblk.mgr_locs.indexer, :] + assert lvals.shape[0] == 1, lvals.shape + lvals = lvals[0, :] + else: + # lvals are 1D, rvals are 2D + assert rvals.shape[0] == 1, rvals.shape + rvals = rvals[0, :] + + return lvals, rvals + + res_blks: List["Block"] = [] + rmgr = right._mgr + for n, blk in enumerate(left._mgr.blocks): + locs = blk.mgr_locs + blk_vals = blk.values + + left_ea = not isinstance(blk_vals, np.ndarray) + + rblks = rmgr._slice_take_blocks_ax0(locs.indexer, only_slice=True) + + # Assertions are disabled for performance, but should hold: + # if left_ea: + # assert len(locs) == 1, locs + # assert len(rblks) == 1, rblks + # assert rblks[0].shape[0] == 1, rblks[0].shape + + for k, rblk in enumerate(rblks): + right_ea = not isinstance(rblk.values, np.ndarray) + + lvals, rvals = get_same_shape_values(blk, rblk, left_ea, right_ea) + + res_values = array_op(lvals, rvals) + if left_ea and not right_ea and hasattr(res_values, "reshape"): + res_values = res_values.reshape(1, -1) + nbs = rblk._split_op_result(res_values) + + # Assertions are disabled for performance, but should hold: + # if right_ea or left_ea: + # assert len(nbs) == 1 + # else: + # assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) + + for nb in nbs: + # Reset mgr_locs to correspond to our original DataFrame + nblocs = locs.as_array[nb.mgr_locs.indexer] + nb.mgr_locs = nblocs + # Assertions are disabled for performance, but should hold: + # assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape) + # assert all(x in locs.as_array for x in nb.mgr_locs.as_array) + + res_blks.extend(nbs) + + # Assertions are disabled for performance, but should hold: + # slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array} + # nlocs = sum(len(nb.mgr_locs.as_array) for nb in res_blks) + # assert nlocs == len(left.columns), (nlocs, len(left.columns)) + # assert len(slocs) == nlocs, (len(slocs), nlocs) + # assert slocs == set(range(nlocs)), slocs + + new_mgr = type(rmgr)(res_blks, axes=rmgr.axes, do_integrity_check=False) + return new_mgr From 30d658082d98e32320fe750c0c756ff8f386beb3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 25 Apr 2020 16:33:52 -0700 Subject: [PATCH 27/32] ndim compat --- pandas/core/arrays/datetimelike.py | 3 ++- pandas/core/ops/__init__.py | 1 - 2 
files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a864ab03e0ac5..77521e9a71c15 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1359,8 +1359,9 @@ def _addsub_object_array(self, other: np.ndarray, op): result : same class as self """ assert op in [operator.add, operator.sub] - if len(other) == 1 and self.ndim == other.ndim == 1: + if len(other) == 1: # If both 1D then broadcasting is unambiguous + # TODO(EA2D): require self.ndim == other.ndim here return op(self, other[0]) warnings.warn( diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 0b3775f876b39..6fe33534b0bb3 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -58,7 +58,6 @@ if TYPE_CHECKING: from pandas import DataFrame # noqa:F401 - from pandas.core.internals.blocks import Block # noqa: F401 # ----------------------------------------------------------------------------- # constants From f86deb4d8ef7962b416472e258c6e9e68ae438a4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 26 Apr 2020 12:40:35 -0700 Subject: [PATCH 28/32] separate out helper function --- pandas/core/ops/blockwise.py | 57 ++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/pandas/core/ops/blockwise.py b/pandas/core/ops/blockwise.py index 509d8b9cc2d7b..2340a8a485b3d 100644 --- a/pandas/core/ops/blockwise.py +++ b/pandas/core/ops/blockwise.py @@ -11,33 +11,6 @@ def operate_blockwise(left, right, array_op): assert right._indexed_same(left) - def get_same_shape_values( - lblk: "Block", rblk: "Block", left_ea: bool, right_ea: bool - ) -> Tuple[ArrayLike, ArrayLike]: - """ - Slice lblk.values to align with rblk. Squeeze if we have EAs. - """ - lvals = lblk.values - rvals = rblk.values - - # TODO(EA2D): with 2D EAs pnly this first clause would be needed - if not (left_ea or right_ea): - lvals = lvals[rblk.mgr_locs.indexer, :] - assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) - elif left_ea and right_ea: - assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) - elif right_ea: - # lvals are 2D, rvals are 1D - lvals = lvals[rblk.mgr_locs.indexer, :] - assert lvals.shape[0] == 1, lvals.shape - lvals = lvals[0, :] - else: - # lvals are 1D, rvals are 2D - assert rvals.shape[0] == 1, rvals.shape - rvals = rvals[0, :] - - return lvals, rvals - res_blks: List["Block"] = [] rmgr = right._mgr for n, blk in enumerate(left._mgr.blocks): @@ -57,7 +30,7 @@ def get_same_shape_values( for k, rblk in enumerate(rblks): right_ea = not isinstance(rblk.values, np.ndarray) - lvals, rvals = get_same_shape_values(blk, rblk, left_ea, right_ea) + lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea) res_values = array_op(lvals, rvals) if left_ea and not right_ea and hasattr(res_values, "reshape"): @@ -89,3 +62,31 @@ def get_same_shape_values( new_mgr = type(rmgr)(res_blks, axes=rmgr.axes, do_integrity_check=False) return new_mgr + + +def _get_same_shape_values( + lblk: "Block", rblk: "Block", left_ea: bool, right_ea: bool +) -> Tuple[ArrayLike, ArrayLike]: + """ + Slice lblk.values to align with rblk. Squeeze if we have EAs. 
+ """ + lvals = lblk.values + rvals = rblk.values + + # TODO(EA2D): with 2D EAs pnly this first clause would be needed + if not (left_ea or right_ea): + lvals = lvals[rblk.mgr_locs.indexer, :] + assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + elif left_ea and right_ea: + assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + elif right_ea: + # lvals are 2D, rvals are 1D + lvals = lvals[rblk.mgr_locs.indexer, :] + assert lvals.shape[0] == 1, lvals.shape + lvals = lvals[0, :] + else: + # lvals are 1D, rvals are 2D + assert rvals.shape[0] == 1, rvals.shape + rvals = rvals[0, :] + + return lvals, rvals From 0c46531063ea61dbd4447ca68b7947e426b71ccf Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 11 May 2020 13:38:38 -0700 Subject: [PATCH 29/32] update per comments --- pandas/_libs/internals.pyx | 2 +- pandas/core/arrays/datetimelike.py | 4 +++ pandas/core/internals/managers.py | 27 +++++++++++++++------ pandas/core/ops/blockwise.py | 6 ++++- pandas/tests/arithmetic/test_timedelta64.py | 3 +-- 5 files changed, 30 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 1e53b789aa05c..dc8b0dd39bb82 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -48,7 +48,7 @@ cdef class BlockPlacement: else: # Cython memoryview interface requires ndarray to be writeable. arr = np.require(val, dtype=np.int64, requirements='W') - assert arr.ndim == 1 + assert arr.ndim == 1, arr.shape self._as_array = arr self._has_array = True diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index f1e01186e99f0..b4c5868065eaa 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -95,6 +95,10 @@ def _validate_comparison_value(self, other): @unpack_zerodim_and_defer(opname) def wrapper(self, other): + if self.ndim > 1 and getattr(other, "shape", None) == self.shape: + # TODO: handle 2D-like listlikes + return op(self.ravel(), other.ravel()).reshape(self.shape) + try: other = _validate_comparison_value(self, other) except InvalidComparison: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 5e6d2ccd38301..238c68d6b6cd8 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1247,6 +1247,7 @@ def _slice_take_blocks_ax0( fill_value : scalar, default lib.no_default only_slice : bool, default False If True, we always return views on existing arrays, never copies. + This is used when called from ops.blockwise.operate_blockwise. 
Returns ------- @@ -1271,14 +1272,24 @@ def _slice_take_blocks_ax0( if allow_fill and fill_value is None: _, fill_value = maybe_promote(blk.dtype) - return [ - blk.take_nd( - slobj, - axis=0, - new_mgr_locs=slice(0, sllen), - fill_value=fill_value, - ) - ] + if not allow_fill and only_slice: + # GH#33597 slice instead of take, so we get + # views instead of copies + blocks = [] + for i, ml in enumerate(slobj): + nb = blk.getitem_block([ml], new_mgr_locs=i) + print(nb.shape, np.values.shape) + blocks.append(nb) + return blocks + else: + return [ + blk.take_nd( + slobj, + axis=0, + new_mgr_locs=slice(0, sllen), + fill_value=fill_value, + ) + ] if sl_type in ("slice", "mask"): blknos = self.blknos[slobj] diff --git a/pandas/core/ops/blockwise.py b/pandas/core/ops/blockwise.py index 2340a8a485b3d..153cd2a0a0c76 100644 --- a/pandas/core/ops/blockwise.py +++ b/pandas/core/ops/blockwise.py @@ -9,7 +9,8 @@ def operate_blockwise(left, right, array_op): - assert right._indexed_same(left) + # At this point we have already checked + # assert right._indexed_same(left) res_blks: List["Block"] = [] rmgr = right._mgr @@ -73,6 +74,9 @@ def _get_same_shape_values( lvals = lblk.values rvals = rblk.values + # Require that the indexing into lvals be slice-like + assert rblk.mgr_locs.is_slice_like, rblk.mgr_locs + # TODO(EA2D): with 2D EAs pnly this first clause would be needed if not (left_ea or right_ea): lvals = lvals[rblk.mgr_locs.indexer, :] diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index d540ff923c929..080fdf53b46e6 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -552,8 +552,7 @@ def test_tda_add_dt64_object_array(self, box_with_array, tz_naive_fixture): obj = tm.box_expected(tdi, box) other = tm.box_expected(dti, box) - warn = PerformanceWarning if box is not pd.DataFrame else None - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): result = obj + other.astype(object) tm.assert_equal(result, other) From 32e70d880e22ee7a850669a9ff3742087c81c468 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 12 May 2020 09:49:25 -0700 Subject: [PATCH 30/32] update per comments --- asv_bench/benchmarks/arithmetic.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index d670566ff160d..e4743601ef207 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -106,19 +106,10 @@ class FrameWithFrameWide: params = [ [ + # GH#32779 has discussion of which operators are included here operator.add, - operator.sub, - operator.mul, - operator.truediv, operator.floordiv, - operator.pow, - operator.mod, - operator.eq, - operator.ne, operator.gt, - operator.ge, - operator.lt, - operator.le, ] ] param_names = ["op"] @@ -127,13 +118,20 @@ def setup(self, op): # we choose dtypes so as to make the blocks # a) not perfectly match between right and left # b) appreciably bigger than single columns - arr = np.random.randn(10 ** 6).reshape(500, 2000).astype(np.float64) - df = pd.DataFrame(arr) - df[1000] = df[1000].astype(np.float32) - df.iloc[:, 1000:] = df.iloc[:, 1000:].astype(np.float32) + n_cols = 2000 + n_rows = 500 + + # construct dataframe with 2 blocks + arr1 = np.random.randn(n_rows, int(n_cols / 2)).astype("f8") + arr2 = np.random.randn(n_rows, int(n_cols / 2)).astype("f4") + df = pd.concat( + [pd.DataFrame(arr1), 
pd.DataFrame(arr2)], axis=1, ignore_index=True, + ) + # should already be the case, but just to be sure df._consolidate_inplace() # TODO: GH#33198 the setting here shoudlnt need two steps + arr = np.random.randn(10 ** 6).reshape(n_rows, n_cols).astype(np.float64) df2 = pd.DataFrame(arr) df2[1000] = df2[1000].astype(np.int64) df2.iloc[:, 500:1500] = df2.iloc[:, 500:1500].astype(np.int64) From 41e8e789ece3476bc7998665a6a0a7d10e7a1f4f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 16 May 2020 09:54:53 -0700 Subject: [PATCH 31/32] update asv --- asv_bench/benchmarks/arithmetic.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index e4743601ef207..08a11ba2607a5 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -131,14 +131,19 @@ def setup(self, op): df._consolidate_inplace() # TODO: GH#33198 the setting here shoudlnt need two steps - arr = np.random.randn(10 ** 6).reshape(n_rows, n_cols).astype(np.float64) - df2 = pd.DataFrame(arr) - df2[1000] = df2[1000].astype(np.int64) - df2.iloc[:, 500:1500] = df2.iloc[:, 500:1500].astype(np.int64) + arr1 = np.random.randn(n_rows, int(n_cols / 4)).astype("f8") + arr2 = np.random.randn(n_rows, int(n_cols / 2)).astype("i8") + arr3 = np.random.randn(n_rows, int(n_cols / 4)).astype("f8") + df2 = pd.concat( + [pd.DataFrame(arr1), pd.DataFrame(arr2), pd.DataFrame(arr3)], + axis=1, + ignore_index=True, + ) + # should already be the case, but just to be sure df2._consolidate_inplace() self.left = df - self.right = df + self.right = df2 def time_op_different_blocks(self, op): # blocks (and dtypes) are not aligned From 8c4f951684183333070a9f9ae32b19c7c9c9853a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 17 May 2020 14:34:47 -0700 Subject: [PATCH 32/32] requested edits --- pandas/core/internals/managers.py | 9 ++++----- pandas/core/ops/blockwise.py | 20 +++++++++++++------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 8cf0dbd8bd7d8..590b92481feca 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1311,11 +1311,10 @@ def _slice_take_blocks_ax0( if not allow_fill and only_slice: # GH#33597 slice instead of take, so we get # views instead of copies - blocks = [] - for i, ml in enumerate(slobj): - nb = blk.getitem_block([ml], new_mgr_locs=i) - print(nb.shape, np.values.shape) - blocks.append(nb) + blocks = [ + blk.getitem_block([ml], new_mgr_locs=i) + for i, ml in enumerate(slobj) + ] return blocks else: return [ diff --git a/pandas/core/ops/blockwise.py b/pandas/core/ops/blockwise.py index 153cd2a0a0c76..f41a30b136637 100644 --- a/pandas/core/ops/blockwise.py +++ b/pandas/core/ops/blockwise.py @@ -44,13 +44,7 @@ def operate_blockwise(left, right, array_op): # else: # assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) - for nb in nbs: - # Reset mgr_locs to correspond to our original DataFrame - nblocs = locs.as_array[nb.mgr_locs.indexer] - nb.mgr_locs = nblocs - # Assertions are disabled for performance, but should hold: - # assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape) - # assert all(x in locs.as_array for x in nb.mgr_locs.as_array) + _reset_block_mgr_locs(nbs, locs) res_blks.extend(nbs) @@ -65,6 +59,18 @@ def operate_blockwise(left, right, array_op): return new_mgr +def _reset_block_mgr_locs(nbs: List["Block"], locs): + """ + Reset 
mgr_locs to correspond to our original DataFrame. + """ + for nb in nbs: + nblocs = locs.as_array[nb.mgr_locs.indexer] + nb.mgr_locs = nblocs + # Assertions are disabled for performance, but should hold: + # assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape) + # assert all(x in locs.as_array for x in nb.mgr_locs.as_array) + + def _get_same_shape_values( lblk: "Block", rblk: "Block", left_ea: bool, right_ea: bool ) -> Tuple[ArrayLike, ArrayLike]:
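
Taken together, the series replaces the per-column dict comprehension in dispatch_to_series with operate_blockwise (pandas/core/ops/blockwise.py) for frame-with-frame operations, pairing each left block with right-hand blocks obtained via _slice_take_blocks_ax0(..., only_slice=True) so the right operands are views rather than copies. Below is a minimal sketch of exercising the new path, modeled on the FrameWithFrameWide asv case added in the series; the row/column counts and dtype mix are illustrative assumptions, not values taken from the patches.

import numpy as np
import pandas as pd

n_rows, n_cols = 500, 2_000

# Two float blocks on the left, similar to the benchmark's setup step.
left = pd.concat(
    [
        pd.DataFrame(np.random.randn(n_rows, n_cols // 2).astype("f8")),
        pd.DataFrame(np.random.randn(n_rows, n_cols // 2).astype("f4")),
    ],
    axis=1,
    ignore_index=True,
)

# Three blocks on the right so the block boundaries do not line up with left's.
right = pd.concat(
    [
        pd.DataFrame(np.random.randn(n_rows, n_cols // 4).astype("f8")),
        pd.DataFrame(np.random.randn(n_rows, n_cols // 2).astype("i8")),
        pd.DataFrame(np.random.randn(n_rows, n_cols // 4).astype("f8")),
    ],
    axis=1,
    ignore_index=True,
)

# DataFrame <op> DataFrame now evaluates block-by-block instead of building
# one intermediate Series per column and stitching the results back together.
result = left + right
assert result.shape == (n_rows, n_cols)

This is roughly the workload the new time_op_different_blocks / time_op_same_blocks benchmarks measure, and the case the v1.1.0 whatsnew entry about arithmetic between two DataFrame objects refers to.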