From 1697252a73785bb4ad1bfff82304d5c37534897f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 17 Mar 2020 09:59:21 -0700 Subject: [PATCH 01/31] PERF: block-wise arithmetic for frame-with-frame --- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/ops/__init__.py | 72 +++++++++++++++++++++- pandas/core/ops/array_ops.py | 2 +- pandas/tests/arithmetic/common.py | 9 ++- pandas/tests/arithmetic/test_datetime64.py | 7 ++- 5 files changed, 83 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 105d9581b1a25..46f2c239b4193 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1299,7 +1299,7 @@ def _addsub_object_array(self, other: np.ndarray, op): result : same class as self """ assert op in [operator.add, operator.sub] - if len(other) == 1: + if len(other) == 1 and self.ndim == other.ndim == 1: return op(self, other[0]) warnings.warn( diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 3153a9ac28c10..f80ad80e15d7b 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -5,7 +5,7 @@ """ import datetime import operator -from typing import TYPE_CHECKING, Optional, Set, Tuple +from typing import TYPE_CHECKING, List, Optional, Set, Tuple import numpy as np @@ -58,6 +58,7 @@ if TYPE_CHECKING: from pandas import DataFrame # noqa:F401 + from pandas.core.internals.blocks import Block # noqa: F401 # ----------------------------------------------------------------------------- # constants @@ -353,6 +354,70 @@ def fill_binop(left, right, fill_value): # Dispatch logic +def operate_blockwise(left, right, array_op): + assert right._indexed_same(left) + + res_blks: List["Block"] = [] + rmgr = right._data + for n, blk in enumerate(left._data.blocks): + locs = blk.mgr_locs + + blk_vals = blk.values + + if not isinstance(blk_vals, np.ndarray): + # 1D EA + assert len(locs) == 1, locs + rser = right.iloc[:, locs[0]] + rvals = extract_array(rser, extract_numpy=True) + res_values = array_op(blk_vals, rvals) + nbs = blk._split_op_result(res_values) + res_blks.extend(nbs) + continue + + rblks = rmgr._slice_take_blocks_ax0(locs.indexer) + + for k, rblk in enumerate(rblks): + lvals = blk_vals[rblk.mgr_locs.indexer, :] + rvals = rblk.values + + if not isinstance(rvals, np.ndarray): + # 1D EA + assert lvals.shape[0] == 1, lvals.shape + lvals = lvals[0, :] + res_values = array_op(lvals, rvals) + nbs = rblk._split_op_result(res_values) + assert len(nbs) == 1 + nb = nbs[0] + nb.mgr_locs = locs.as_array[nb.mgr_locs] + res_blks.append(nb) + continue + + assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + + res_values = array_op(lvals, rvals) + assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) + nbs = rblk._split_op_result(res_values) + for nb in nbs: + # TODO: maybe optimize by sticking with slices? + nb_mgr_locs = nb.mgr_locs + nblocs = locs.as_array[nb_mgr_locs.indexer] + nb.mgr_locs = nblocs + assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape) + assert all(x in locs.as_array for x in nb.mgr_locs.as_array) + + res_blks.extend(nbs) + + slocs = set(y for nb in res_blks for y in nb.mgr_locs.as_array) + nlocs = sum(len(nb.mgr_locs.as_array) for nb in res_blks) + assert nlocs == len(left.columns), (nlocs, len(left.columns)) + assert len(slocs) == nlocs, (len(slocs), nlocs) + assert slocs == set(range(nlocs)), slocs + + # TODO: once this is working, pass do_integrity_check=False + new_mgr = type(rmgr)(res_blks, axes=rmgr.axes) + return new_mgr + + def dispatch_to_series(left, right, func, str_rep=None, axis=None): """ Evaluate the frame operation func(left, right) by evaluating @@ -385,8 +450,9 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): elif isinstance(right, ABCDataFrame): assert right._indexed_same(left) - def column_op(a, b): - return {i: func(a.iloc[:, i], b.iloc[:, i]) for i in range(len(a.columns))} + array_op = get_array_op(func, str_rep=str_rep) + bm = operate_blockwise(left, right, array_op) + return type(left)(bm) elif isinstance(right, ABCSeries) and axis == "columns": # We only get here if called via _combine_series_frame, diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index e285c53d9813e..dcef6d8f3c981 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -132,7 +132,7 @@ def masked_arith_op(x: np.ndarray, y, op): return result -def define_na_arithmetic_op(op, str_rep: str): +def define_na_arithmetic_op(op, str_rep: Optional[str]): def na_op(x, y): return na_arithmetic_op(x, y, op, str_rep) diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py index ccc49adc5da82..755fbd0d9036c 100644 --- a/pandas/tests/arithmetic/common.py +++ b/pandas/tests/arithmetic/common.py @@ -70,7 +70,14 @@ def assert_invalid_comparison(left, right, box): result = right != left tm.assert_equal(result, ~expected) - msg = "Invalid comparison between|Cannot compare type|not supported between" + msg = "|".join( + [ + "Invalid comparison between", + "Cannot compare type", + "not supported between", + "invalid type promotion", + ] + ) with pytest.raises(TypeError, match=msg): left < right with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index f7211ab5f9fd4..5cadf8bff51f1 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -964,7 +964,9 @@ def test_dt64arr_sub_dt64object_array(self, box_with_array, tz_naive_fixture): obj = tm.box_expected(dti, box_with_array) expected = tm.box_expected(expected, box_with_array) - warn = PerformanceWarning if box_with_array is not pd.DataFrame else None + warn = None + if box_with_array is not pd.DataFrame or tz_naive_fixture is None: + warn = PerformanceWarning with tm.assert_produces_warning(warn): result = obj - obj.astype(object) tm.assert_equal(result, expected) @@ -1388,8 +1390,7 @@ def test_dt64arr_add_mixed_offset_array(self, box_with_array): s = DatetimeIndex([Timestamp("2000-1-1"), Timestamp("2000-2-1")]) s = tm.box_expected(s, box_with_array) - warn = None if box_with_array is pd.DataFrame else PerformanceWarning - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): other = pd.Index([pd.offsets.DateOffset(years=1), pd.offsets.MonthEnd()]) other = tm.box_expected(other, box_with_array) result = s + other From 30a836d6ce7a7ef0c9f01daca99cb182bebbbcfa Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 17 Mar 2020 10:46:22 -0700 Subject: [PATCH 02/31] lint fixup --- pandas/core/ops/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index f80ad80e15d7b..e127be3b7d644 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -407,7 +407,7 @@ def operate_blockwise(left, right, array_op): res_blks.extend(nbs) - slocs = set(y for nb in res_blks for y in nb.mgr_locs.as_array) + slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array} nlocs = sum(len(nb.mgr_locs.as_array) for nb in res_blks) assert nlocs == len(left.columns), (nlocs, len(left.columns)) assert len(slocs) == nlocs, (len(slocs), nlocs) From 4334353b53ead13ffb777cb869a17e18687be4c2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 17 Mar 2020 19:14:02 -0700 Subject: [PATCH 03/31] troubleshoot npdev build --- pandas/core/ops/array_ops.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index dcef6d8f3c981..4338746f05386 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -5,6 +5,7 @@ from functools import partial import operator from typing import Any, Optional +import warnings import numpy as np @@ -163,15 +164,18 @@ def na_arithmetic_op(left, right, op, str_rep: Optional[str], is_cmp: bool = Fal """ import pandas.core.computation.expressions as expressions - try: - result = expressions.evaluate(op, str_rep, left, right) - except TypeError: - if is_cmp: - # numexpr failed on comparison op, e.g. ndarray[float] > datetime - # In this case we do not fall back to the masked op, as that - # will handle complex numbers incorrectly, see GH#32047 - raise - result = masked_arith_op(left, right, op) + with warnings.catch_warnings(): + # suppress warnings from numpy about element-wise comparison + warnings.simplefilter("ignore", DeprecationWarning) + try: + result = expressions.evaluate(op, str_rep, left, right) + except TypeError: + if is_cmp: + # numexpr failed on comparison op, e.g. ndarray[float] > datetime + # In this case we do not fall back to the masked op, as that + # will handle complex numbers incorrectly, see GH#32047 + raise + result = masked_arith_op(left, right, op) if is_cmp and (is_scalar(result) or result is NotImplemented): # numpy returned a scalar instead of operating element-wise From 713a776e551186e9e2e4d480f362072b60042f65 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 24 Mar 2020 19:25:57 -0700 Subject: [PATCH 04/31] comment --- pandas/core/arrays/datetimelike.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 6ca45f9acec91..2fdb33b7b8a8f 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1297,10 +1297,11 @@ def _addsub_object_array(self, other: np.ndarray, op): """ assert op in [operator.add, operator.sub] if len(other) == 1 and self.ndim == other.ndim == 1: + # If both 1D then broadcasting is unambiguous return op(self, other[0]) warnings.warn( - "Adding/subtracting array of DateOffsets to " + "Adding/subtracting object-dtype array to " f"{type(self).__name__} not vectorized", PerformanceWarning, ) @@ -1308,7 +1309,7 @@ def _addsub_object_array(self, other: np.ndarray, op): # Caller is responsible for broadcasting if necessary assert self.shape == other.shape, (self.shape, other.shape) - res_values = op(self.astype("O"), np.array(other)) + res_values = op(self.astype("O"), np.asarray(other)) result = array(res_values.ravel()) result = extract_array(result, extract_numpy=True).reshape(self.shape) return result From 95ef3adddd6554ff1aa21ba3304cc9b175f6c49b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 24 Mar 2020 20:45:20 -0700 Subject: [PATCH 05/31] checkpoint passing --- pandas/core/ops/__init__.py | 51 +++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index caf420f06a7fb..4fdd18418f423 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -304,43 +304,62 @@ def operate_blockwise(left, right, array_op): blk_vals = blk.values + left_ea = not isinstance(blk_vals, np.ndarray) + if not isinstance(blk_vals, np.ndarray): # 1D EA assert len(locs) == 1, locs - rser = right.iloc[:, locs[0]] - rvals = extract_array(rser, extract_numpy=True) + rblks = rmgr._slice_take_blocks_ax0(locs.indexer) + assert len(rblks) == 1, rblks + rblk = rblks[0] + assert rblk.shape[0] == 1, rblk.shape + + rvals = rblk.values + right_ea = not isinstance(rvals, np.ndarray) + if not right_ea: + assert rvals.shape[0] == 1, rvals.shape + rvals = rvals[0, :] + #rser = right.iloc[:, locs[0]] + #rvals = extract_array(rser, extract_numpy=True) res_values = array_op(blk_vals, rvals) nbs = blk._split_op_result(res_values) + # Setting nb.mgr_locs is unnecessary here, but harmless res_blks.extend(nbs) continue rblks = rmgr._slice_take_blocks_ax0(locs.indexer) for k, rblk in enumerate(rblks): - lvals = blk_vals[rblk.mgr_locs.indexer, :] rvals = rblk.values + right_ea = not isinstance(rvals, np.ndarray) - if not isinstance(rvals, np.ndarray): + lvals = blk_vals[rblk.mgr_locs.indexer, :] + + if not (left_ea or right_ea): + assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + #elif left_ea and right_ea: + # assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + elif right_ea: # 1D EA assert lvals.shape[0] == 1, lvals.shape lvals = lvals[0, :] - res_values = array_op(lvals, rvals) - nbs = rblk._split_op_result(res_values) - assert len(nbs) == 1 - nb = nbs[0] - nb.mgr_locs = locs.as_array[nb.mgr_locs] - res_blks.append(nb) - continue - - assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + else: + assert False # should be unreachable ATM + #assert rvals.shape[0] == 1, rvals.shape + #rvals = rvals[0, :] res_values = array_op(lvals, rvals) - assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) nbs = rblk._split_op_result(res_values) + + # Debugging assertions + if right_ea: # or left_ea + assert len(nbs) == 1 + else: + assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) + for nb in nbs: # TODO: maybe optimize by sticking with slices? - nb_mgr_locs = nb.mgr_locs - nblocs = locs.as_array[nb_mgr_locs.indexer] + nblocs = locs.as_array[nb.mgr_locs.indexer] nb.mgr_locs = nblocs assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape) assert all(x in locs.as_array for x in nb.mgr_locs.as_array) From 61e5cd6260d3a94f195fa5466a6c9cc0a6bd7607 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 25 Mar 2020 09:29:34 -0700 Subject: [PATCH 06/31] checkpoint passing --- pandas/core/ops/__init__.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 4fdd18418f423..687d1b8bb7891 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -306,7 +306,7 @@ def operate_blockwise(left, right, array_op): left_ea = not isinstance(blk_vals, np.ndarray) - if not isinstance(blk_vals, np.ndarray): + if False:#left_ea: # 1D EA assert len(locs) == 1, locs rblks = rmgr._slice_take_blocks_ax0(locs.indexer) @@ -319,8 +319,6 @@ def operate_blockwise(left, right, array_op): if not right_ea: assert rvals.shape[0] == 1, rvals.shape rvals = rvals[0, :] - #rser = right.iloc[:, locs[0]] - #rvals = extract_array(rser, extract_numpy=True) res_values = array_op(blk_vals, rvals) nbs = blk._split_op_result(res_values) # Setting nb.mgr_locs is unnecessary here, but harmless @@ -329,30 +327,38 @@ def operate_blockwise(left, right, array_op): rblks = rmgr._slice_take_blocks_ax0(locs.indexer) + if left_ea: + assert len(locs) == 1, locs + assert len(rblks) == 1, rblks + assert rblks[0].shape[0] == 1, rblks[0].shape + for k, rblk in enumerate(rblks): rvals = rblk.values right_ea = not isinstance(rvals, np.ndarray) - lvals = blk_vals[rblk.mgr_locs.indexer, :] + #lvals = blk_vals[rblk.mgr_locs.indexer, :] if not (left_ea or right_ea): + lvals = blk_vals[rblk.mgr_locs.indexer, :] + assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + elif left_ea and right_ea: + lvals = blk_vals assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) - #elif left_ea and right_ea: - # assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) elif right_ea: # 1D EA + lvals = blk_vals[rblk.mgr_locs.indexer, :] assert lvals.shape[0] == 1, lvals.shape lvals = lvals[0, :] else: - assert False # should be unreachable ATM - #assert rvals.shape[0] == 1, rvals.shape - #rvals = rvals[0, :] + lvals = blk_vals + assert rvals.shape[0] == 1, rvals.shape + rvals = rvals[0, :] res_values = array_op(lvals, rvals) nbs = rblk._split_op_result(res_values) # Debugging assertions - if right_ea: # or left_ea + if right_ea or left_ea: assert len(nbs) == 1 else: assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) From 89c3d7bdef4a1b89f14161ac4db3a87611aa3cec Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 25 Mar 2020 10:03:31 -0700 Subject: [PATCH 07/31] refactor --- pandas/core/ops/__init__.py | 75 ++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 43 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 687d1b8bb7891..b285c5494ee71 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -4,7 +4,7 @@ This is not a public API. """ import operator -from typing import TYPE_CHECKING, List, Optional, Set +from typing import TYPE_CHECKING, List, Optional, Set, Tuple import numpy as np @@ -297,34 +297,41 @@ def fill_binop(left, right, fill_value): def operate_blockwise(left, right, array_op): assert right._indexed_same(left) + def get_same_shape_values( + lblk: "Block", rblk: "Block", left_ea: bool, right_ea: bool + ) -> Tuple[ArrayLike, ArrayLike]: + """ + Slice lblk.values to align with rblk. Squeeze if we have EAs. + """ + lvals = lblk.values + rvals = rblk.values + + # TODO(EA2D): with 2D EAs pnly this first clause would be needed + if not (left_ea or right_ea): + lvals = lvals[rblk.mgr_locs.indexer, :] + assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + elif left_ea and right_ea: + assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + elif right_ea: + # lvals are 2D, rvals are 1D + lvals = lvals[rblk.mgr_locs.indexer, :] + assert lvals.shape[0] == 1, lvals.shape + lvals = lvals[0, :] + else: + # lvals are 1D, rvals are 2D + assert rvals.shape[0] == 1, rvals.shape + rvals = rvals[0, :] + + return lvals, rvals + res_blks: List["Block"] = [] rmgr = right._data for n, blk in enumerate(left._data.blocks): locs = blk.mgr_locs - blk_vals = blk.values left_ea = not isinstance(blk_vals, np.ndarray) - if False:#left_ea: - # 1D EA - assert len(locs) == 1, locs - rblks = rmgr._slice_take_blocks_ax0(locs.indexer) - assert len(rblks) == 1, rblks - rblk = rblks[0] - assert rblk.shape[0] == 1, rblk.shape - - rvals = rblk.values - right_ea = not isinstance(rvals, np.ndarray) - if not right_ea: - assert rvals.shape[0] == 1, rvals.shape - rvals = rvals[0, :] - res_values = array_op(blk_vals, rvals) - nbs = blk._split_op_result(res_values) - # Setting nb.mgr_locs is unnecessary here, but harmless - res_blks.extend(nbs) - continue - rblks = rmgr._slice_take_blocks_ax0(locs.indexer) if left_ea: @@ -333,38 +340,20 @@ def operate_blockwise(left, right, array_op): assert rblks[0].shape[0] == 1, rblks[0].shape for k, rblk in enumerate(rblks): - rvals = rblk.values - right_ea = not isinstance(rvals, np.ndarray) - - #lvals = blk_vals[rblk.mgr_locs.indexer, :] - - if not (left_ea or right_ea): - lvals = blk_vals[rblk.mgr_locs.indexer, :] - assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) - elif left_ea and right_ea: - lvals = blk_vals - assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) - elif right_ea: - # 1D EA - lvals = blk_vals[rblk.mgr_locs.indexer, :] - assert lvals.shape[0] == 1, lvals.shape - lvals = lvals[0, :] - else: - lvals = blk_vals - assert rvals.shape[0] == 1, rvals.shape - rvals = rvals[0, :] + right_ea = not isinstance(rblk.values, np.ndarray) + + lvals, rvals = get_same_shape_values(blk, rblk, left_ea, right_ea) res_values = array_op(lvals, rvals) nbs = rblk._split_op_result(res_values) - # Debugging assertions if right_ea or left_ea: assert len(nbs) == 1 else: assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) for nb in nbs: - # TODO: maybe optimize by sticking with slices? + # Reset mgr_locs to correspond to our original DataFrame nblocs = locs.as_array[nb.mgr_locs.indexer] nb.mgr_locs = nblocs assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape) From e348e464457556b1678c898c9cdc7366cd0b3877 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 25 Mar 2020 10:11:25 -0700 Subject: [PATCH 08/31] blackify --- pandas/tests/frame/test_arithmetic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 18cd2a4b0c90b..0b73583002d5c 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1,7 +1,6 @@ from collections import deque from datetime import datetime import operator -import re import numpy as np import pytest @@ -47,10 +46,11 @@ def check(df, df2): ) tm.assert_frame_equal(result, expected) - msg = "|".join([ + msgs = [ r"Invalid comparison between dtype=datetime64\[ns\] and ndarray", "invalid type promotion", - ]) + ] + msg = "|".join(msgs) with pytest.raises(TypeError, match=msg): x >= y with pytest.raises(TypeError, match=msg): From 2b1ba182144759b68e1a7b4eb8f3bfca4f2a05fb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 31 Mar 2020 14:44:03 -0700 Subject: [PATCH 09/31] disable assertions for perf --- pandas/core/ops/__init__.py | 19 ++++++++++--------- pandas/tests/frame/test_arithmetic.py | 1 + 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index b285c5494ee71..3911380c7f795 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -356,19 +356,20 @@ def get_same_shape_values( # Reset mgr_locs to correspond to our original DataFrame nblocs = locs.as_array[nb.mgr_locs.indexer] nb.mgr_locs = nblocs - assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape) - assert all(x in locs.as_array for x in nb.mgr_locs.as_array) + # Assertions are disabled for performance, but should hold: + # assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape) + # assert all(x in locs.as_array for x in nb.mgr_locs.as_array) res_blks.extend(nbs) - slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array} - nlocs = sum(len(nb.mgr_locs.as_array) for nb in res_blks) - assert nlocs == len(left.columns), (nlocs, len(left.columns)) - assert len(slocs) == nlocs, (len(slocs), nlocs) - assert slocs == set(range(nlocs)), slocs + # Assertions are disabled for performance, but should hold: + # slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array} + # nlocs = sum(len(nb.mgr_locs.as_array) for nb in res_blks) + # assert nlocs == len(left.columns), (nlocs, len(left.columns)) + # assert len(slocs) == nlocs, (len(slocs), nlocs) + # assert slocs == set(range(nlocs)), slocs - # TODO: once this is working, pass do_integrity_check=False - new_mgr = type(rmgr)(res_blks, axes=rmgr.axes) + new_mgr = type(rmgr)(res_blks, axes=rmgr.axes, do_integrity_check=False) return new_mgr diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 01ea3fc8676de..a5d696be24d0a 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1,6 +1,7 @@ from collections import deque from datetime import datetime import operator +import re import numpy as np import pytest From 91c86a35072e87ec5fa16fb41a89f4c8c7cb65df Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 31 Mar 2020 21:05:01 -0700 Subject: [PATCH 10/31] asv --- asv_bench/benchmarks/arithmetic.py | 48 ++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index 5a8b109c21858..b53ef9505990d 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -80,6 +80,54 @@ def time_frame_op_with_series_axis0(self, opname): getattr(self.df, opname)(self.ser, axis=0) +class FrameWithFrameWide: + # Many-columns, mixed dtypes + + params = [ + [ + operator.add, + operator.sub, + operator.mul, + operator.truediv, + operator.floordiv, + operator.pow, + operator.mod, + operator.eq, + operator.ne, + operator.gt, + operator.ge, + operator.lt, + operator.le, + ] + ] + param_names = ["op"] + + def setup(self, op): + # we choose dtypes so as to make the blocks + # a) not perfectly match between right and left + # b) appreciably bigger than single columns + arr = np.random.randn(10 ** 6).reshape(500, 2000).astype(np.float64) + df = pd.DataFrame(arr) + df[1000] = df[1000].astype(np.float32) + df.iloc[:, 1000:] = df.iloc[:, 1000:].astype(np.float32) + + # TODO: GH#33198 the setting here shoudlnt need two steps + df2 = pd.DataFrame(arr) + df2[1000] = df2[1000].astype(np.int64) + df2.iloc[:, 500:1500] = df2.iloc[:, 500:1500].astype(np.int64) + + self.left = df + self.right = df + + def time_op_different_blocks(self, op): + # blocks (and dtypes) are not aligned + op(self.left, self.right) + + def time_op_same_blocks(self, op): + # blocks (and dtypes) are aligned + op(self.left, self.left) + + class Ops: params = [[True, False], ["default", 1]] From 2034084db2a999691bcf28f424fc520dadcdeead Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 1 Apr 2020 08:03:28 -0700 Subject: [PATCH 11/31] whatsnew --- doc/source/whatsnew/v1.1.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 25f847c698278..d424315f5e416 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -274,7 +274,8 @@ Performance improvements :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`, :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). - Performance improvement in :meth:`Series.sum` for nullable (integer and boolean) dtypes (:issue:`30982`). - +- Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`) +- .. --------------------------------------------------------------------------- From 0c12d35e5cb4737fae982a8df1aa90fc9332c1d6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 3 Apr 2020 10:14:00 -0700 Subject: [PATCH 12/31] revert warning suppression --- pandas/core/ops/array_ops.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index f419c548b27c1..c17955457245b 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -6,7 +6,6 @@ from functools import partial import operator from typing import Any, Optional, Tuple -import warnings import numpy as np @@ -159,17 +158,14 @@ def na_arithmetic_op(left, right, op, str_rep: Optional[str], is_cmp: bool = Fal """ import pandas.core.computation.expressions as expressions - with warnings.catch_warnings(): - # suppress warnings from numpy about element-wise comparison - warnings.simplefilter("ignore", DeprecationWarning) - try: - result = expressions.evaluate(op, str_rep, left, right) - except TypeError: - if is_cmp: - # numexpr failed on comparison op, e.g. ndarray[float] > datetime - # In this case we do not fall back to the masked op, as that - # will handle complex numbers incorrectly, see GH#32047 - raise + try: + result = expressions.evaluate(op, str_rep, left, right) + except TypeError: + if is_cmp: + # numexpr failed on comparison op, e.g. ndarray[float] > datetime + # In this case we do not fall back to the masked op, as that + # will handle complex numbers incorrectly, see GH#32047 + raise result = masked_arith_op(left, right, op) if is_cmp and (is_scalar(result) or result is NotImplemented): From 9727562801ac6a1a98bd4ee348fe3666ec49e801 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 3 Apr 2020 10:51:49 -0700 Subject: [PATCH 13/31] Fixupm indentation --- pandas/core/ops/array_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index c17955457245b..05ec48c206b3c 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -166,7 +166,7 @@ def na_arithmetic_op(left, right, op, str_rep: Optional[str], is_cmp: bool = Fal # In this case we do not fall back to the masked op, as that # will handle complex numbers incorrectly, see GH#32047 raise - result = masked_arith_op(left, right, op) + result = masked_arith_op(left, right, op) if is_cmp and (is_scalar(result) or result is NotImplemented): # numpy returned a scalar instead of operating element-wise From 42bbbf3c7ee0ea3d5352191c380972842bf0edd4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 4 Apr 2020 15:42:26 -0700 Subject: [PATCH 14/31] suppress warning --- pandas/core/ops/array_ops.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 05ec48c206b3c..efd0500d351f4 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -6,6 +6,7 @@ from functools import partial import operator from typing import Any, Optional, Tuple +import warnings import numpy as np @@ -308,8 +309,13 @@ def comparison_op( res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) else: - with np.errstate(all="ignore"): - res_values = na_arithmetic_op(lvalues, rvalues, op, str_rep, is_cmp=True) + with warnings.catch_warnings(): + # suppress warnings from numpy about element-wise comparison + warnings.simplefilter("ignore", DeprecationWarning) + with np.errstate(all="ignore"): + res_values = na_arithmetic_op( + lvalues, rvalues, op, str_rep, is_cmp=True + ) return res_values From 0d958a3b6022cec977fc9b5f6fc802da62d2f284 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Apr 2020 10:34:43 -0700 Subject: [PATCH 15/31] update asv --- asv_bench/benchmarks/arithmetic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index 0473743dabf46..c5ad533ed82b1 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -127,11 +127,13 @@ def setup(self, op): df = pd.DataFrame(arr) df[1000] = df[1000].astype(np.float32) df.iloc[:, 1000:] = df.iloc[:, 1000:].astype(np.float32) + df._consolidate_inplace() # TODO: GH#33198 the setting here shoudlnt need two steps df2 = pd.DataFrame(arr) df2[1000] = df2[1000].astype(np.int64) df2.iloc[:, 500:1500] = df2.iloc[:, 500:1500].astype(np.int64) + df2._consolidate_inplace() self.left = df self.right = df From 56eef516d9fc83d9dfa82c41b61c0d86d7aaf202 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Apr 2020 11:09:41 -0700 Subject: [PATCH 16/31] _data->_mgr --- pandas/core/ops/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index bd12fe122bbf0..4d7f4fb026ae8 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -325,8 +325,8 @@ def get_same_shape_values( return lvals, rvals res_blks: List["Block"] = [] - rmgr = right._data - for n, blk in enumerate(left._data.blocks): + rmgr = right._mgr + for n, blk in enumerate(left._mgr.blocks): locs = blk.mgr_locs blk_vals = blk.values From ae744b74a082b7305c81b53a02f6ab3762afa2fd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 9 Apr 2020 14:33:38 -0700 Subject: [PATCH 17/31] update to use faspath constructor --- pandas/core/frame.py | 4 +++- pandas/core/ops/__init__.py | 19 +++++++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c3018861bce57..e7732265beea1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -442,6 +442,7 @@ def __init__( mgr = self._init_mgr( data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy ) + elif isinstance(data, dict): mgr = init_dict(data, index, columns, dtype=dtype) elif isinstance(data, ma.MaskedArray): @@ -5453,10 +5454,11 @@ def _construct_result(self, result) -> "DataFrame": ------- DataFrame """ - out = self._constructor(result, index=self.index, copy=False) + out = self._constructor(result, copy=False) # Pin columns instead of passing to constructor for compat with # non-unique columns case out.columns = self.columns + out.index = self.index return out def combine( diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 4d7f4fb026ae8..8bd6e4244e818 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -332,12 +332,14 @@ def get_same_shape_values( left_ea = not isinstance(blk_vals, np.ndarray) + # TODO: joris says this is costly, see if we can optimize rblks = rmgr._slice_take_blocks_ax0(locs.indexer) - if left_ea: - assert len(locs) == 1, locs - assert len(rblks) == 1, rblks - assert rblks[0].shape[0] == 1, rblks[0].shape + # Assertions are disabled for performance, but should hold: + # if left_ea: + # assert len(locs) == 1, locs + # assert len(rblks) == 1, rblks + # assert rblks[0].shape[0] == 1, rblks[0].shape for k, rblk in enumerate(rblks): right_ea = not isinstance(rblk.values, np.ndarray) @@ -347,10 +349,11 @@ def get_same_shape_values( res_values = array_op(lvals, rvals) nbs = rblk._split_op_result(res_values) - if right_ea or left_ea: - assert len(nbs) == 1 - else: - assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) + # Assertions are disabled for performance, but should hold: + # if right_ea or left_ea: + # assert len(nbs) == 1 + # else: + # assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) for nb in nbs: # Reset mgr_locs to correspond to our original DataFrame From f42c40334812f2765f19565d03e82e9263ce1039 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 10 Apr 2020 11:00:55 -0700 Subject: [PATCH 18/31] update import --- pandas/core/ops/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index fede06a8970b9..5f6bc230917a4 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -10,7 +10,7 @@ from pandas._libs import lib from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op # noqa:F401 -from pandas._typing import Level +from pandas._typing import ArrayLike, Level from pandas.util._decorators import Appender from pandas.core.dtypes.common import is_list_like From 8a2807efdc0e3bd14f44a8dc5efbd54226346647 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 10 Apr 2020 11:01:13 -0700 Subject: [PATCH 19/31] remove unused import --- pandas/core/internals/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 75c935cdf2e60..80573f32b936e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -8,7 +8,7 @@ from pandas._libs import NaT, algos as libalgos, lib, writers import pandas._libs.internals as libinternals -from pandas._libs.tslibs import Timedelta, conversion +from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import tz_compare from pandas._typing import ArrayLike from pandas.util._validators import validate_bool_kwarg From fd10fb6cf39619e9a0e66affa5865f6f7755e22f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 10 Apr 2020 14:11:26 -0700 Subject: [PATCH 20/31] rebase compat --- pandas/core/ops/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 5f6bc230917a4..6b703dd7f40aa 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -347,6 +347,8 @@ def get_same_shape_values( lvals, rvals = get_same_shape_values(blk, rblk, left_ea, right_ea) res_values = array_op(lvals, rvals) + if left_ea and not right_ea and hasattr(res_values, "reshape"): + res_values = res_values.reshape(1, -1) nbs = rblk._split_op_result(res_values) # Assertions are disabled for performance, but should hold: From 7150e87363cc205f0df380c06812de1ca264d8c3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 11 Apr 2020 18:09:00 -0700 Subject: [PATCH 21/31] slice instead of take --- pandas/core/internals/managers.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index d1293974b776a..175f0b5332f12 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1322,6 +1322,7 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): if allow_fill and fill_value is None: _, fill_value = maybe_promote(blk.dtype) + # TODO: Any cases where we can optimize this to slice? return [ blk.take_nd( slobj, @@ -1369,11 +1370,18 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): blocks.append(newblk) else: - blocks.append( - blk.take_nd( - blklocs[mgr_locs.indexer], axis=0, new_mgr_locs=mgr_locs, - ) - ) + taker = blklocs[mgr_locs.indexer] + # TODO: taker.max() probably isnt the Technically Correct + # way of calling this? + taker = lib.maybe_indices_to_slice(taker, taker.max()) + + if isinstance(taker, slice): + nb = blk.getitem_block(taker) + nb.mgr_locs = mgr_locs + else: + # TODO: just use getitem_block anyway? + nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) + blocks.append(nb) return blocks From 0ca2125d85f1b68ad4bc03fc462278b5bb387700 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 14 Apr 2020 07:06:21 -0700 Subject: [PATCH 22/31] Dummy commit to force CI From 2bfc30885b6e3a0bcf1d60ab8b0e731b90c1579e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 15 Apr 2020 07:22:30 -0700 Subject: [PATCH 23/31] update call bound --- pandas/core/internals/managers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index bdeb7820d2762..12f7fd3decb43 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1371,9 +1371,9 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): else: taker = blklocs[mgr_locs.indexer] - # TODO: taker.max() probably isnt the Technically Correct + # TODO: taker.max()+1 probably isnt the Technically Correct # way of calling this? - taker = lib.maybe_indices_to_slice(taker, taker.max()) + taker = lib.maybe_indices_to_slice(taker, taker.max() + 1) if isinstance(taker, slice): nb = blk.getitem_block(taker) From d5ad2a079c1e5ddd097f9bdff32c1ecf8dce890e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 15 Apr 2020 11:49:57 -0700 Subject: [PATCH 24/31] update max_len --- pandas/core/internals/managers.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 12f7fd3decb43..089a12ff40bd9 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1322,7 +1322,6 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): if allow_fill and fill_value is None: _, fill_value = maybe_promote(blk.dtype) - # TODO: Any cases where we can optimize this to slice? return [ blk.take_nd( slobj, @@ -1371,13 +1370,11 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): else: taker = blklocs[mgr_locs.indexer] - # TODO: taker.max()+1 probably isnt the Technically Correct - # way of calling this? - taker = lib.maybe_indices_to_slice(taker, taker.max() + 1) + max_len = max(len(mgr_locs), taker.max() + 1) + taker = lib.maybe_indices_to_slice(taker, max_len) if isinstance(taker, slice): - nb = blk.getitem_block(taker) - nb.mgr_locs = mgr_locs + nb = blk.getitem_block(taker, new_mgr_locs=mgr_locs) else: # TODO: just use getitem_block anyway? nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) From 92007e8c508c5ca6b2b98cb52999268b5e2e8b44 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 15 Apr 2020 18:58:59 -0700 Subject: [PATCH 25/31] revert --- asv_bench/benchmarks/arithmetic.py | 50 -------- doc/source/whatsnew/v1.1.0.rst | 3 +- pandas/_libs/tslibs/period.pyx | 108 +++++++----------- pandas/core/arrays/datetimelike.py | 7 +- pandas/core/frame.py | 25 ++-- pandas/core/generic.py | 12 +- pandas/core/indexes/category.py | 8 +- pandas/core/indexing.py | 32 ++++-- pandas/core/internals/blocks.py | 55 --------- pandas/core/internals/concat.py | 19 ++- pandas/core/ops/__init__.py | 98 ++-------------- pandas/core/ops/array_ops.py | 68 +---------- pandas/tests/arithmetic/common.py | 9 +- pandas/tests/arithmetic/test_datetime64.py | 7 +- pandas/tests/extension/test_external_block.py | 6 - pandas/tests/frame/test_api.py | 13 ++- pandas/tests/frame/test_arithmetic.py | 8 +- pandas/tests/indexing/test_scalar.py | 40 +++++++ pandas/tests/internals/test_internals.py | 107 +++++++++-------- .../tests/resample/test_resampler_grouper.py | 10 +- pandas/tests/series/test_api.py | 13 ++- pandas/util/_decorators.py | 8 +- 22 files changed, 261 insertions(+), 445 deletions(-) diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index c5ad533ed82b1..2745db58e83e3 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -97,56 +97,6 @@ def time_frame_op_with_series_axis0(self, opname): getattr(self.df, opname)(self.ser, axis=0) -class FrameWithFrameWide: - # Many-columns, mixed dtypes - - params = [ - [ - operator.add, - operator.sub, - operator.mul, - operator.truediv, - operator.floordiv, - operator.pow, - operator.mod, - operator.eq, - operator.ne, - operator.gt, - operator.ge, - operator.lt, - operator.le, - ] - ] - param_names = ["op"] - - def setup(self, op): - # we choose dtypes so as to make the blocks - # a) not perfectly match between right and left - # b) appreciably bigger than single columns - arr = np.random.randn(10 ** 6).reshape(500, 2000).astype(np.float64) - df = pd.DataFrame(arr) - df[1000] = df[1000].astype(np.float32) - df.iloc[:, 1000:] = df.iloc[:, 1000:].astype(np.float32) - df._consolidate_inplace() - - # TODO: GH#33198 the setting here shoudlnt need two steps - df2 = pd.DataFrame(arr) - df2[1000] = df2[1000].astype(np.int64) - df2.iloc[:, 500:1500] = df2.iloc[:, 500:1500].astype(np.int64) - df2._consolidate_inplace() - - self.left = df - self.right = df - - def time_op_different_blocks(self, op): - # blocks (and dtypes) are not aligned - op(self.left, self.right) - - def time_op_same_blocks(self, op): - # blocks (and dtypes) are aligned - op(self.left, self.left) - - class Ops: params = [[True, False], ["default", 1]] diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c7b9d80890daf..82c43811c0444 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -379,7 +379,7 @@ Performance improvements :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`, :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). - Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). -- Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`) + .. --------------------------------------------------------------------------- @@ -467,6 +467,7 @@ Indexing - Bug in :meth:`DatetimeIndex.get_loc` raising ``KeyError`` with converted-integer key instead of the user-passed key (:issue:`31425`) - Bug in :meth:`Series.xs` incorrectly returning ``Timestamp`` instead of ``datetime64`` in some object-dtype cases (:issue:`31630`) - Bug in :meth:`DataFrame.iat` incorrectly returning ``Timestamp`` instead of ``datetime`` in some object-dtype cases (:issue:`32809`) +- Bug in :meth:`DataFrame.at` when either columns or index is non-unique (:issue:`33041`) - Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` when indexing with an integer key on a object-dtype :class:`Index` that is not all-integers (:issue:`31905`) - Bug in :meth:`DataFrame.iloc.__setitem__` on a :class:`DataFrame` with duplicate columns incorrectly setting values for all matching columns (:issue:`15686`, :issue:`22036`) - Bug in :meth:`DataFrame.loc:` and :meth:`Series.loc` with a :class:`DatetimeIndex`, :class:`TimedeltaIndex`, or :class:`PeriodIndex` incorrectly allowing lookups of non-matching datetime-like dtypes (:issue:`32650`) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index dd745f840d0ab..c4a7df0017619 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -806,24 +806,22 @@ cdef int64_t get_period_ordinal(npy_datetimestruct *dts, int freq) nogil: return unix_date_to_week(unix_date, freq - FR_WK) -cdef void get_date_info(int64_t ordinal, int freq, - npy_datetimestruct *dts) nogil: +cdef void get_date_info(int64_t ordinal, int freq, npy_datetimestruct *dts) nogil: cdef: - int64_t unix_date - double abstime + int64_t unix_date, nanos + npy_datetimestruct dts2 unix_date = get_unix_date(ordinal, freq) - abstime = get_abs_time(freq, unix_date, ordinal) - - while abstime < 0: - abstime += 86400 - unix_date -= 1 + nanos = get_time_nanos(freq, unix_date, ordinal) - while abstime >= 86400: - abstime -= 86400 - unix_date += 1 + pandas_datetime_to_datetimestruct(unix_date, NPY_FR_D, dts) - date_info_from_days_and_time(dts, unix_date, abstime) + dt64_to_dtstruct(nanos, &dts2) + dts.hour = dts2.hour + dts.min = dts2.min + dts.sec = dts2.sec + dts.us = dts2.us + dts.ps = dts2.ps cdef int64_t get_unix_date(int64_t period_ordinal, int freq) nogil: @@ -855,74 +853,50 @@ cdef int64_t get_unix_date(int64_t period_ordinal, int freq) nogil: @cython.cdivision -cdef void date_info_from_days_and_time(npy_datetimestruct *dts, - int64_t unix_date, - double abstime) nogil: +cdef int64_t get_time_nanos(int freq, int64_t unix_date, int64_t ordinal) nogil: """ - Set the instance's value using the given date and time. + Find the number of nanoseconds after midnight on the given unix_date + that the ordinal represents in the given frequency. Parameters ---------- - dts : npy_datetimestruct* + freq : int unix_date : int64_t - days elapsed since datetime(1970, 1, 1) - abstime : double - seconds elapsed since beginning of day described by unix_date + ordinal : int64_t - Notes - ----- - Updates dts inplace + Returns + ------- + int64_t """ cdef: - int inttime - int hour, minute - double second, subsecond_fraction - - # Bounds check - # The calling function is responsible for ensuring that - # abstime >= 0.0 and abstime <= 86400 - - # Calculate the date - pandas_datetime_to_datetimestruct(unix_date, NPY_FR_D, dts) + int64_t sub, factor - # Calculate the time - inttime = abstime - hour = inttime / 3600 - minute = (inttime % 3600) / 60 - second = abstime - (hour * 3600 + minute * 60) + freq = get_freq_group(freq) - dts.hour = hour - dts.min = minute - dts.sec = second - - subsecond_fraction = second - dts.sec - dts.us = int((subsecond_fraction) * 1e6) - dts.ps = int(((subsecond_fraction) * 1e6 - dts.us) * 1e6) + if freq <= FR_DAY: + return 0 + elif freq == FR_NS: + factor = 1 -@cython.cdivision -cdef double get_abs_time(int freq, int64_t unix_date, int64_t ordinal) nogil: - cdef: - int freq_index, day_index, base_index - int64_t per_day, start_ord - double unit, result + elif freq == FR_US: + factor = 10**3 - if freq <= FR_DAY: - return 0 + elif freq == FR_MS: + factor = 10**6 - freq_index = freq // 1000 - day_index = FR_DAY // 1000 - base_index = FR_SEC // 1000 + elif freq == FR_SEC: + factor = 10 **9 - per_day = get_daytime_conversion_factor(day_index, freq_index) - unit = get_daytime_conversion_factor(freq_index, base_index) + elif freq == FR_MIN: + factor = 10**9 * 60 - if base_index < freq_index: - unit = 1 / unit + else: + # We must have freq == FR_HR + factor = 10**9 * 3600 - start_ord = unix_date * per_day - result = (unit * (ordinal - start_ord)) - return result + sub = ordinal - unix_date * 24 * 3600 * 10**9 / factor + return sub * factor cdef int get_yq(int64_t ordinal, int freq, int *quarter, int *year): @@ -1176,11 +1150,7 @@ cdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) except? -1: if ordinal == NPY_NAT: return NPY_NAT - if freq == 11000: - # Microsecond, avoid get_date_info to prevent floating point errors - pandas_datetime_to_datetimestruct(ordinal, NPY_FR_us, &dts) - else: - get_date_info(ordinal, freq, &dts) + get_date_info(ordinal, freq, &dts) check_dts_bounds(&dts) return dtstruct_to_dt64(&dts) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 7384dfd20ebfc..ece92acae6461 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1291,12 +1291,11 @@ def _addsub_object_array(self, other: np.ndarray, op): result : same class as self """ assert op in [operator.add, operator.sub] - if len(other) == 1 and self.ndim == other.ndim == 1: - # If both 1D then broadcasting is unambiguous + if len(other) == 1: return op(self, other[0]) warnings.warn( - "Adding/subtracting object-dtype array to " + "Adding/subtracting array of DateOffsets to " f"{type(self).__name__} not vectorized", PerformanceWarning, ) @@ -1304,7 +1303,7 @@ def _addsub_object_array(self, other: np.ndarray, op): # Caller is responsible for broadcasting if necessary assert self.shape == other.shape, (self.shape, other.shape) - res_values = op(self.astype("O"), np.asarray(other)) + res_values = op(self.astype("O"), np.array(other)) result = array(res_values.ravel()) result = extract_array(result, extract_numpy=True).reshape(self.shape) return result diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 66a728c8ef208..85bb47485a2e7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -452,7 +452,6 @@ def __init__( mgr = self._init_mgr( data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy ) - elif isinstance(data, dict): mgr = init_dict(data, index, columns, dtype=dtype) elif isinstance(data, ma.MaskedArray): @@ -2587,7 +2586,7 @@ def _ixs(self, i: int, axis: int = 0): label = self.columns[i] values = self._mgr.iget(i) - result = self._box_col_values(values, label) + result = self._box_col_values(values, i) # this is a cached value, mark it so result._set_as_cached(label, self) @@ -2692,7 +2691,7 @@ def _getitem_bool_array(self, key): def _getitem_multilevel(self, key): # self.columns is a MultiIndex loc = self.columns.get_loc(key) - if isinstance(loc, (slice, Series, np.ndarray, Index)): + if isinstance(loc, (slice, np.ndarray)): new_columns = self.columns[loc] result_columns = maybe_droplevels(new_columns, key) if self._is_mixed_type: @@ -2725,7 +2724,8 @@ def _getitem_multilevel(self, key): result._set_is_copy(self) return result else: - return self._get_item_cache(key) + # loc is neither a slice nor ndarray, so must be an int + return self._ixs(loc, axis=1) def _get_value(self, index, col, takeable: bool = False): """ @@ -2916,19 +2916,15 @@ def _ensure_valid_index(self, value): value.index.copy(), axis=1, fill_value=np.nan ) - def _box_item_values(self, key, values): - items = self.columns[self.columns.get_loc(key)] - if values.ndim == 2: - return self._constructor(values.T, columns=items, index=self.index) - else: - return self._box_col_values(values, items) - - def _box_col_values(self, values, items): + def _box_col_values(self, values, loc: int) -> Series: """ Provide boxed values for a column. """ + # Lookup in columns so that if e.g. a str datetime was passed + # we attach the Timestamp object as the name. + name = self.columns[loc] klass = self._constructor_sliced - return klass(values, index=self.index, name=items, fastpath=True) + return klass(values, index=self.index, name=name, fastpath=True) # ---------------------------------------------------------------------- # Unsorted @@ -5536,11 +5532,10 @@ def _construct_result(self, result) -> "DataFrame": ------- DataFrame """ - out = self._constructor(result, copy=False) + out = self._constructor(result, index=self.index, copy=False) # Pin columns instead of passing to constructor for compat with # non-unique columns case out.columns = self.columns - out.index = self.index return out def combine( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6a4f83427310e..2878204f5ee79 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3558,8 +3558,13 @@ def _get_item_cache(self, item): cache = self._item_cache res = cache.get(item) if res is None: - values = self._mgr.get(item) - res = self._box_item_values(item, values) + # All places that call _get_item_cache have unique columns, + # pending resolution of GH#33047 + + loc = self.columns.get_loc(item) + values = self._mgr.iget(loc) + res = self._box_col_values(values, loc) + cache[item] = res res._set_as_cached(item, self) @@ -3567,9 +3572,6 @@ def _get_item_cache(self, item): res._is_copy = self._is_copy return res - def _box_item_values(self, key, values): - raise AbstractMethodError(self) - def _slice(self: FrameOrSeries, slobj: slice, axis=0) -> FrameOrSeries: """ Construct a slice of this container. diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 635bf32639075..ba1a9a4e08fa0 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -25,6 +25,7 @@ from pandas.core.algorithms import take_1d from pandas.core.arrays.categorical import Categorical, contains, recode_for_categories import pandas.core.common as com +from pandas.core.construction import extract_array import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name from pandas.core.indexes.extension import ExtensionIndex, inherit_names @@ -198,8 +199,13 @@ def __new__( data = [] assert isinstance(dtype, CategoricalDtype), dtype - if not isinstance(data, Categorical) or data.dtype != dtype: + data = extract_array(data, extract_numpy=True) + + if not isinstance(data, Categorical): data = Categorical(data, dtype=dtype) + elif isinstance(dtype, CategoricalDtype) and dtype != data.dtype: + # we want to silently ignore dtype='category' + data = data._set_dtype(dtype) data = data.copy() if copy else data diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index b74399ed86fbd..dd072cf00ed20 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2045,6 +2045,7 @@ def __setitem__(self, key, value): key = _tuplify(self.ndim, key) if len(key) != self.ndim: raise ValueError("Not enough indexers for scalar access (setting)!") + key = list(self._convert_key(key, is_setter=True)) self.obj._set_value(*key, value=value, takeable=self._takeable) @@ -2064,15 +2065,32 @@ def _convert_key(self, key, is_setter: bool = False): return key + @property + def _axes_are_unique(self) -> bool: + # Only relevant for self.ndim == 2 + assert self.ndim == 2 + return self.obj.index.is_unique and self.obj.columns.is_unique + def __getitem__(self, key): - if self.ndim != 1 or not is_scalar(key): - # FIXME: is_scalar check is a kludge - return super().__getitem__(key) - # Like Index.get_value, but we do not allow positional fallback - obj = self.obj - loc = obj.index.get_loc(key) - return obj.index._get_values_for_loc(obj, loc, key) + if self.ndim == 2 and not self._axes_are_unique: + # GH#33041 fall back to .loc + if not isinstance(key, tuple) or not all(is_scalar(x) for x in key): + raise ValueError("Invalid call for scalar access (getting)!") + return self.obj.loc[key] + + return super().__getitem__(key) + + def __setitem__(self, key, value): + if self.ndim == 2 and not self._axes_are_unique: + # GH#33041 fall back to .loc + if not isinstance(key, tuple) or not all(is_scalar(x) for x in key): + raise ValueError("Invalid call for scalar access (setting)!") + + self.obj.loc[key] = value + return + + return super().__setitem__(key, value) @doc(IndexingMixin.iat) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 185b0f4da2627..ebdc331a673ab 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -48,7 +48,6 @@ is_timedelta64_dtype, pandas_dtype, ) -from pandas.core.dtypes.concat import concat_categorical, concat_datetime from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -110,7 +109,6 @@ class Block(PandasObject): _can_consolidate = True _verify_integrity = True _validate_ndim = True - _concatenator = staticmethod(np.concatenate) def __init__(self, values, placement, ndim=None): self.ndim = self._check_ndim(values, ndim) @@ -309,16 +307,6 @@ def shape(self): def dtype(self): return self.values.dtype - def concat_same_type(self, to_concat): - """ - Concatenate list of single blocks of the same type. - """ - values = self._concatenator( - [blk.values for blk in to_concat], axis=self.ndim - 1 - ) - placement = self.mgr_locs if self.ndim == 2 else slice(len(values)) - return self.make_block_same_class(values, placement=placement) - def iget(self, i): return self.values[i] @@ -1770,14 +1758,6 @@ def _slice(self, slicer): return self.values[slicer] - def concat_same_type(self, to_concat): - """ - Concatenate list of single blocks of the same type. - """ - values = self._holder._concat_same_type([blk.values for blk in to_concat]) - placement = self.mgr_locs if self.ndim == 2 else slice(len(values)) - return self.make_block_same_class(values, placement=placement) - def fillna(self, value, limit=None, inplace=False, downcast=None): values = self.values if inplace else self.values.copy() values = values.fillna(value=value, limit=limit) @@ -2258,20 +2238,6 @@ def diff(self, n: int, axis: int = 0) -> List["Block"]: new_values = new_values.astype("timedelta64[ns]") return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] - def concat_same_type(self, to_concat): - # need to handle concat([tz1, tz2]) here, since DatetimeArray - # only handles cases where all the tzs are the same. - # Instead of placing the condition here, it could also go into the - # is_uniform_join_units check, but I'm not sure what is better. - if len({x.dtype for x in to_concat}) > 1: - values = concat_datetime([x.values for x in to_concat]) - - values = values.astype(object, copy=False) - placement = self.mgr_locs if self.ndim == 2 else slice(len(values)) - - return self.make_block(values, placement=placement) - return super().concat_same_type(to_concat) - def fillna(self, value, limit=None, inplace=False, downcast=None): # We support filling a DatetimeTZ with a `value` whose timezone # is different by coercing to object. @@ -2642,7 +2608,6 @@ class CategoricalBlock(ExtensionBlock): is_categorical = True _verify_integrity = True _can_hold_na = True - _concatenator = staticmethod(concat_categorical) should_store = Block.should_store @@ -2656,26 +2621,6 @@ def __init__(self, values, placement, ndim=None): def _holder(self): return Categorical - def concat_same_type(self, to_concat): - """ - Concatenate list of single blocks of the same type. - - Note that this CategoricalBlock._concat_same_type *may* not - return a CategoricalBlock. When the categories in `to_concat` - differ, this will return an object ndarray. - - If / when we decide we don't like that behavior: - - 1. Change Categorical._concat_same_type to use union_categoricals - 2. Delete this method. - """ - values = self._concatenator( - [blk.values for blk in to_concat], axis=self.ndim - 1 - ) - placement = self.mgr_locs if self.ndim == 2 else slice(len(values)) - # not using self.make_block_same_class as values can be object dtype - return self.make_block(values, placement=placement) - def replace( self, to_replace, diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 720e6799a3bf3..37e081aeba3f6 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,6 +1,7 @@ # TODO: Needs a better name; too many modules are already called "concat" from collections import defaultdict import copy +from typing import List import numpy as np @@ -61,8 +62,18 @@ def concatenate_block_managers( values = values.view() b = b.make_block_same_class(values, placement=placement) elif _is_uniform_join_units(join_units): - b = join_units[0].block.concat_same_type([ju.block for ju in join_units]) - b.mgr_locs = placement + blk = join_units[0].block + vals = [ju.block.values for ju in join_units] + + if not blk.is_extension or blk.is_datetimetz or blk.is_categorical: + # datetimetz and categorical can have the same type but multiple + # dtypes, concatting does not necessarily preserve dtype + values = concat_compat(vals, axis=blk.ndim - 1) + else: + # TODO(EA2D): special-casing not needed with 2D EAs + values = concat_compat(vals) + + b = make_block(values, placement=placement, ndim=blk.ndim) else: b = make_block( _concatenate_join_units(join_units, concat_axis, copy=copy), @@ -419,13 +430,15 @@ def _get_empty_dtype_and_na(join_units): raise AssertionError(msg) -def _is_uniform_join_units(join_units) -> bool: +def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool: """ Check if the join units consist of blocks of uniform type that can be concatenated using Block.concat_same_type instead of the generic _concatenate_join_units (which uses `concat_compat`). """ + # TODO: require dtype match in addition to same type? e.g. DatetimeTZBlock + # cannot necessarily join return ( # all blocks need to have the same type all(type(ju.block) is type(join_units[0].block) for ju in join_units) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 6b703dd7f40aa..9a7c9fdadf90d 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -4,13 +4,13 @@ This is not a public API. """ import operator -from typing import TYPE_CHECKING, List, Optional, Set, Tuple +from typing import TYPE_CHECKING, Optional, Set import numpy as np from pandas._libs import lib from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op # noqa:F401 -from pandas._typing import ArrayLike, Level +from pandas._typing import Level from pandas.util._decorators import Appender from pandas.core.dtypes.common import is_list_like @@ -57,7 +57,6 @@ if TYPE_CHECKING: from pandas import DataFrame # noqa:F401 - from pandas.core.internals.blocks import Block # noqa: F401 # ----------------------------------------------------------------------------- # constants @@ -294,90 +293,6 @@ def fill_binop(left, right, fill_value): # Dispatch logic -def operate_blockwise(left, right, array_op): - assert right._indexed_same(left) - - def get_same_shape_values( - lblk: "Block", rblk: "Block", left_ea: bool, right_ea: bool - ) -> Tuple[ArrayLike, ArrayLike]: - """ - Slice lblk.values to align with rblk. Squeeze if we have EAs. - """ - lvals = lblk.values - rvals = rblk.values - - # TODO(EA2D): with 2D EAs pnly this first clause would be needed - if not (left_ea or right_ea): - lvals = lvals[rblk.mgr_locs.indexer, :] - assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) - elif left_ea and right_ea: - assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) - elif right_ea: - # lvals are 2D, rvals are 1D - lvals = lvals[rblk.mgr_locs.indexer, :] - assert lvals.shape[0] == 1, lvals.shape - lvals = lvals[0, :] - else: - # lvals are 1D, rvals are 2D - assert rvals.shape[0] == 1, rvals.shape - rvals = rvals[0, :] - - return lvals, rvals - - res_blks: List["Block"] = [] - rmgr = right._mgr - for n, blk in enumerate(left._mgr.blocks): - locs = blk.mgr_locs - blk_vals = blk.values - - left_ea = not isinstance(blk_vals, np.ndarray) - - # TODO: joris says this is costly, see if we can optimize - rblks = rmgr._slice_take_blocks_ax0(locs.indexer) - - # Assertions are disabled for performance, but should hold: - # if left_ea: - # assert len(locs) == 1, locs - # assert len(rblks) == 1, rblks - # assert rblks[0].shape[0] == 1, rblks[0].shape - - for k, rblk in enumerate(rblks): - right_ea = not isinstance(rblk.values, np.ndarray) - - lvals, rvals = get_same_shape_values(blk, rblk, left_ea, right_ea) - - res_values = array_op(lvals, rvals) - if left_ea and not right_ea and hasattr(res_values, "reshape"): - res_values = res_values.reshape(1, -1) - nbs = rblk._split_op_result(res_values) - - # Assertions are disabled for performance, but should hold: - # if right_ea or left_ea: - # assert len(nbs) == 1 - # else: - # assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) - - for nb in nbs: - # Reset mgr_locs to correspond to our original DataFrame - nblocs = locs.as_array[nb.mgr_locs.indexer] - nb.mgr_locs = nblocs - # Assertions are disabled for performance, but should hold: - # assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape) - # assert all(x in locs.as_array for x in nb.mgr_locs.as_array) - - res_blks.extend(nbs) - - # Assertions are disabled for performance, but should hold: - # slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array} - # nlocs = sum(len(nb.mgr_locs.as_array) for nb in res_blks) - # assert nlocs == len(left.columns), (nlocs, len(left.columns)) - # assert len(slocs) == nlocs, (len(slocs), nlocs) - # assert slocs == set(range(nlocs)), slocs - - new_mgr = type(rmgr)(res_blks, axes=rmgr.axes, do_integrity_check=False) - return new_mgr - - def dispatch_to_series(left, right, func, str_rep=None, axis=None): """ Evaluate the frame operation func(left, right) by evaluating @@ -410,9 +325,8 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): elif isinstance(right, ABCDataFrame): assert right._indexed_same(left) - array_op = get_array_op(func, str_rep=str_rep) - bm = operate_blockwise(left, right, array_op) - return type(left)(bm) + def column_op(a, b): + return {i: func(a.iloc[:, i], b.iloc[:, i]) for i in range(len(a.columns))} elif isinstance(right, ABCSeries) and axis == "columns": # We only get here if called via _combine_series_frame, @@ -596,11 +510,13 @@ def _combine_series_frame(left, right, func, axis: int, str_rep: str): if axis == 0: values = right._values if isinstance(values, np.ndarray): + # TODO(EA2D): no need to special-case with 2D EAs # We can operate block-wise values = values.reshape(-1, 1) + values = np.broadcast_to(values, left.shape) array_op = get_array_op(func, str_rep=str_rep) - bm = left._mgr.apply(array_op, right=values.T) + bm = left._mgr.apply(array_op, right=values.T, align_keys=["right"]) return type(left)(bm) new_data = dispatch_to_series(left, right, func) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index efd0500d351f4..a1d853e38e757 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -6,7 +6,6 @@ from functools import partial import operator from typing import Any, Optional, Tuple -import warnings import numpy as np @@ -76,14 +75,7 @@ def masked_arith_op(x: np.ndarray, y, op): result = np.empty(x.size, dtype=dtype) if len(x) != len(y): - if not _can_broadcast(x, y): - raise ValueError(x.shape, y.shape) - - # Call notna on pre-broadcasted y for performance - ymask = notna(y) - y = np.broadcast_to(y, x.shape) - ymask = np.broadcast_to(ymask, x.shape) - + raise ValueError(x.shape, y.shape) else: ymask = notna(y) @@ -128,7 +120,7 @@ def masked_arith_op(x: np.ndarray, y, op): return result -def define_na_arithmetic_op(op, str_rep: Optional[str]): +def define_na_arithmetic_op(op, str_rep: str): def na_op(x, y): return na_arithmetic_op(x, y, op, str_rep) @@ -212,51 +204,6 @@ def arithmetic_op(left: ArrayLike, right: Any, op, str_rep: str): return res_values -def _broadcast_comparison_op(lvalues, rvalues, op) -> np.ndarray: - """ - Broadcast a comparison operation between two 2D arrays. - - Parameters - ---------- - lvalues : np.ndarray or ExtensionArray - rvalues : np.ndarray or ExtensionArray - - Returns - ------- - np.ndarray[bool] - """ - if isinstance(rvalues, np.ndarray): - rvalues = np.broadcast_to(rvalues, lvalues.shape) - result = comparison_op(lvalues, rvalues, op) - else: - result = np.empty(lvalues.shape, dtype=bool) - for i in range(len(lvalues)): - result[i, :] = comparison_op(lvalues[i], rvalues[:, 0], op) - return result - - -def _can_broadcast(lvalues, rvalues) -> bool: - """ - Check if we can broadcast rvalues to match the shape of lvalues. - - Parameters - ---------- - lvalues : np.ndarray or ExtensionArray - rvalues : np.ndarray or ExtensionArray - - Returns - ------- - bool - """ - # We assume that lengths dont match - if lvalues.ndim == rvalues.ndim == 2: - # See if we can broadcast unambiguously - if lvalues.shape[1] == rvalues.shape[-1]: - if rvalues.shape[0] == 1: - return True - return False - - def comparison_op( left: ArrayLike, right: Any, op, str_rep: Optional[str] = None, ) -> ArrayLike: @@ -288,8 +235,6 @@ def comparison_op( # We are not catching all listlikes here (e.g. frozenset, tuple) # The ambiguous case is object-dtype. See GH#27803 if len(lvalues) != len(rvalues): - if _can_broadcast(lvalues, rvalues): - return _broadcast_comparison_op(lvalues, rvalues, op) raise ValueError( "Lengths must match to compare", lvalues.shape, rvalues.shape ) @@ -309,13 +254,8 @@ def comparison_op( res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) else: - with warnings.catch_warnings(): - # suppress warnings from numpy about element-wise comparison - warnings.simplefilter("ignore", DeprecationWarning) - with np.errstate(all="ignore"): - res_values = na_arithmetic_op( - lvalues, rvalues, op, str_rep, is_cmp=True - ) + with np.errstate(all="ignore"): + res_values = na_arithmetic_op(lvalues, rvalues, op, str_rep, is_cmp=True) return res_values diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py index 755fbd0d9036c..ccc49adc5da82 100644 --- a/pandas/tests/arithmetic/common.py +++ b/pandas/tests/arithmetic/common.py @@ -70,14 +70,7 @@ def assert_invalid_comparison(left, right, box): result = right != left tm.assert_equal(result, ~expected) - msg = "|".join( - [ - "Invalid comparison between", - "Cannot compare type", - "not supported between", - "invalid type promotion", - ] - ) + msg = "Invalid comparison between|Cannot compare type|not supported between" with pytest.raises(TypeError, match=msg): left < right with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index db2ac092a5750..56c5647d865d3 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -964,9 +964,7 @@ def test_dt64arr_sub_dt64object_array(self, box_with_array, tz_naive_fixture): obj = tm.box_expected(dti, box_with_array) expected = tm.box_expected(expected, box_with_array) - warn = None - if box_with_array is not pd.DataFrame or tz_naive_fixture is None: - warn = PerformanceWarning + warn = PerformanceWarning if box_with_array is not pd.DataFrame else None with tm.assert_produces_warning(warn): result = obj - obj.astype(object) tm.assert_equal(result, expected) @@ -1446,7 +1444,8 @@ def test_dt64arr_add_mixed_offset_array(self, box_with_array): s = DatetimeIndex([Timestamp("2000-1-1"), Timestamp("2000-2-1")]) s = tm.box_expected(s, box_with_array) - with tm.assert_produces_warning(PerformanceWarning): + warn = None if box_with_array is pd.DataFrame else PerformanceWarning + with tm.assert_produces_warning(warn): other = pd.Index([pd.offsets.DateOffset(years=1), pd.offsets.MonthEnd()]) other = tm.box_expected(other, box_with_array) result = s + other diff --git a/pandas/tests/extension/test_external_block.py b/pandas/tests/extension/test_external_block.py index 9925fd51561ae..1843126898f3d 100644 --- a/pandas/tests/extension/test_external_block.py +++ b/pandas/tests/extension/test_external_block.py @@ -32,12 +32,6 @@ def df(): return pd.DataFrame(block_manager) -def test_concat_dataframe(df): - # GH17728 - res = pd.concat([df, df]) - assert isinstance(res._mgr.blocks[1], CustomBlock) - - def test_concat_axis1(df): # GH17954 df2 = pd.DataFrame({"c": [0.1, 0.2, 0.3]}) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 4149485be181d..ec8613faaa663 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -529,7 +529,18 @@ async def test_tab_complete_warning(self, ip): code = "import pandas as pd; df = pd.DataFrame()" await ip.run_code(code) - with tm.assert_produces_warning(None): + + # TODO: remove it when Ipython updates + # GH 33567, jedi version raises Deprecation warning in Ipython + import jedi + + if jedi.__version__ < "0.17.0": + warning = tm.assert_produces_warning(None) + else: + warning = tm.assert_produces_warning( + DeprecationWarning, check_stacklevel=False + ) + with warning: with provisionalcompleter("ignore"): list(ip.Completer.completions("df.", 1)) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 11dbeb93eee37..d929d3e030508 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -49,11 +49,9 @@ def check(df, df2): ) tm.assert_frame_equal(result, expected) - msgs = [ - r"Invalid comparison between dtype=datetime64\[ns\] and ndarray", - "invalid type promotion", - ] - msg = "|".join(msgs) + msg = re.escape( + "Invalid comparison between dtype=datetime64[ns] and ndarray" + ) with pytest.raises(TypeError, match=msg): x >= y with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index 61d109344568c..216d554e22b49 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -128,6 +128,46 @@ def test_imethods_with_dups(self): result = df.iat[2, 0] assert result == 2 + def test_frame_at_with_duplicate_axes(self): + # GH#33041 + arr = np.random.randn(6).reshape(3, 2) + df = DataFrame(arr, columns=["A", "A"]) + + result = df.at[0, "A"] + expected = df.iloc[0] + + tm.assert_series_equal(result, expected) + + result = df.T.at["A", 0] + tm.assert_series_equal(result, expected) + + # setter + df.at[1, "A"] = 2 + expected = Series([2.0, 2.0], index=["A", "A"], name=1) + tm.assert_series_equal(df.iloc[1], expected) + + def test_frame_at_with_duplicate_axes_requires_scalar_lookup(self): + # GH#33041 check that falling back to loc doesn't allow non-scalar + # args to slip in + + arr = np.random.randn(6).reshape(3, 2) + df = DataFrame(arr, columns=["A", "A"]) + + msg = "Invalid call for scalar access" + with pytest.raises(ValueError, match=msg): + df.at[[1, 2]] + with pytest.raises(ValueError, match=msg): + df.at[1, ["A"]] + with pytest.raises(ValueError, match=msg): + df.at[:, "A"] + + with pytest.raises(ValueError, match=msg): + df.at[[1, 2]] = 1 + with pytest.raises(ValueError, match=msg): + df.at[1, ["A"]] = 1 + with pytest.raises(ValueError, match=msg): + df.at[:, "A"] = 1 + def test_series_at_raises_type_error(self): # at should not fallback # GH 7814 diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 58a84f5a267bc..f1d4c865a0ced 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -329,45 +329,48 @@ def test_categorical_block_pickle(self): smgr2 = tm.round_trip_pickle(smgr) tm.assert_series_equal(Series(smgr), Series(smgr2)) - def test_get(self): + def test_iget(self): cols = Index(list("abc")) values = np.random.rand(3, 3) block = make_block(values=values.copy(), placement=np.arange(3)) mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)]) - tm.assert_almost_equal(mgr.get("a").internal_values(), values[0]) - tm.assert_almost_equal(mgr.get("b").internal_values(), values[1]) - tm.assert_almost_equal(mgr.get("c").internal_values(), values[2]) + tm.assert_almost_equal(mgr.iget(0).internal_values(), values[0]) + tm.assert_almost_equal(mgr.iget(1).internal_values(), values[1]) + tm.assert_almost_equal(mgr.iget(2).internal_values(), values[2]) def test_set(self): mgr = create_mgr("a,b,c: int", item_shape=(3,)) mgr.set("d", np.array(["foo"] * 3)) mgr.set("b", np.array(["bar"] * 3)) - tm.assert_numpy_array_equal(mgr.get("a").internal_values(), np.array([0] * 3)) + tm.assert_numpy_array_equal(mgr.iget(0).internal_values(), np.array([0] * 3)) tm.assert_numpy_array_equal( - mgr.get("b").internal_values(), np.array(["bar"] * 3, dtype=np.object_) + mgr.iget(1).internal_values(), np.array(["bar"] * 3, dtype=np.object_) ) - tm.assert_numpy_array_equal(mgr.get("c").internal_values(), np.array([2] * 3)) + tm.assert_numpy_array_equal(mgr.iget(2).internal_values(), np.array([2] * 3)) tm.assert_numpy_array_equal( - mgr.get("d").internal_values(), np.array(["foo"] * 3, dtype=np.object_) + mgr.iget(3).internal_values(), np.array(["foo"] * 3, dtype=np.object_) ) def test_set_change_dtype(self, mgr): mgr.set("baz", np.zeros(N, dtype=bool)) mgr.set("baz", np.repeat("foo", N)) - assert mgr.get("baz").dtype == np.object_ + idx = mgr.items.get_loc("baz") + assert mgr.iget(idx).dtype == np.object_ mgr2 = mgr.consolidate() mgr2.set("baz", np.repeat("foo", N)) - assert mgr2.get("baz").dtype == np.object_ + idx = mgr2.items.get_loc("baz") + assert mgr2.iget(idx).dtype == np.object_ mgr2.set("quux", tm.randn(N).astype(int)) - assert mgr2.get("quux").dtype == np.int_ + idx = mgr2.items.get_loc("quux") + assert mgr2.iget(idx).dtype == np.int_ mgr2.set("quux", tm.randn(N)) - assert mgr2.get("quux").dtype == np.float_ + assert mgr2.iget(idx).dtype == np.float_ def test_copy(self, mgr): cp = mgr.copy(deep=False) @@ -430,8 +433,8 @@ def test_as_array_datetime(self): def test_as_array_datetime_tz(self): mgr = create_mgr("h: M8[ns, US/Eastern]; g: M8[ns, CET]") - assert mgr.get("h").dtype == "datetime64[ns, US/Eastern]" - assert mgr.get("g").dtype == "datetime64[ns, CET]" + assert mgr.iget(0).dtype == "datetime64[ns, US/Eastern]" + assert mgr.iget(1).dtype == "datetime64[ns, CET]" assert mgr.as_array().dtype == "object" @pytest.mark.parametrize("t", ["float16", "float32", "float64", "int32", "int64"]) @@ -441,26 +444,26 @@ def test_astype(self, t): t = np.dtype(t) tmgr = mgr.astype(t) - assert tmgr.get("c").dtype.type == t - assert tmgr.get("d").dtype.type == t - assert tmgr.get("e").dtype.type == t + assert tmgr.iget(0).dtype.type == t + assert tmgr.iget(1).dtype.type == t + assert tmgr.iget(2).dtype.type == t # mixed mgr = create_mgr("a,b: object; c: bool; d: datetime; e: f4; f: f2; g: f8") t = np.dtype(t) tmgr = mgr.astype(t, errors="ignore") - assert tmgr.get("c").dtype.type == t - assert tmgr.get("e").dtype.type == t - assert tmgr.get("f").dtype.type == t - assert tmgr.get("g").dtype.type == t + assert tmgr.iget(2).dtype.type == t + assert tmgr.iget(4).dtype.type == t + assert tmgr.iget(5).dtype.type == t + assert tmgr.iget(6).dtype.type == t - assert tmgr.get("a").dtype.type == np.object_ - assert tmgr.get("b").dtype.type == np.object_ + assert tmgr.iget(0).dtype.type == np.object_ + assert tmgr.iget(1).dtype.type == np.object_ if t != np.int64: - assert tmgr.get("d").dtype.type == np.datetime64 + assert tmgr.iget(3).dtype.type == np.datetime64 else: - assert tmgr.get("d").dtype.type == t + assert tmgr.iget(3).dtype.type == t def test_convert(self): def _compare(old_mgr, new_mgr): @@ -497,11 +500,11 @@ def _compare(old_mgr, new_mgr): mgr.set("b", np.array(["2."] * N, dtype=np.object_)) mgr.set("foo", np.array(["foo."] * N, dtype=np.object_)) new_mgr = mgr.convert(numeric=True) - assert new_mgr.get("a").dtype == np.int64 - assert new_mgr.get("b").dtype == np.float64 - assert new_mgr.get("foo").dtype == np.object_ - assert new_mgr.get("f").dtype == np.int64 - assert new_mgr.get("g").dtype == np.float64 + assert new_mgr.iget(0).dtype == np.int64 + assert new_mgr.iget(1).dtype == np.float64 + assert new_mgr.iget(2).dtype == np.object_ + assert new_mgr.iget(3).dtype == np.int64 + assert new_mgr.iget(4).dtype == np.float64 mgr = create_mgr( "a,b,foo: object; f: i4; bool: bool; dt: datetime; i: i8; g: f8; h: f2" @@ -510,15 +513,15 @@ def _compare(old_mgr, new_mgr): mgr.set("b", np.array(["2."] * N, dtype=np.object_)) mgr.set("foo", np.array(["foo."] * N, dtype=np.object_)) new_mgr = mgr.convert(numeric=True) - assert new_mgr.get("a").dtype == np.int64 - assert new_mgr.get("b").dtype == np.float64 - assert new_mgr.get("foo").dtype == np.object_ - assert new_mgr.get("f").dtype == np.int32 - assert new_mgr.get("bool").dtype == np.bool_ - assert new_mgr.get("dt").dtype.type, np.datetime64 - assert new_mgr.get("i").dtype == np.int64 - assert new_mgr.get("g").dtype == np.float64 - assert new_mgr.get("h").dtype == np.float16 + assert new_mgr.iget(0).dtype == np.int64 + assert new_mgr.iget(1).dtype == np.float64 + assert new_mgr.iget(2).dtype == np.object_ + assert new_mgr.iget(3).dtype == np.int32 + assert new_mgr.iget(4).dtype == np.bool_ + assert new_mgr.iget(5).dtype.type, np.datetime64 + assert new_mgr.iget(6).dtype == np.int64 + assert new_mgr.iget(7).dtype == np.float64 + assert new_mgr.iget(8).dtype == np.float16 def test_invalid_ea_block(self): with pytest.raises(AssertionError, match="block.size != values.size"): @@ -620,16 +623,16 @@ def test_reindex_items(self): assert reindexed.nblocks == 2 tm.assert_index_equal(reindexed.items, pd.Index(["g", "c", "a", "d"])) tm.assert_almost_equal( - mgr.get("g").internal_values(), reindexed.get("g").internal_values() + mgr.iget(6).internal_values(), reindexed.iget(0).internal_values() ) tm.assert_almost_equal( - mgr.get("c").internal_values(), reindexed.get("c").internal_values() + mgr.iget(2).internal_values(), reindexed.iget(1).internal_values() ) tm.assert_almost_equal( - mgr.get("a").internal_values(), reindexed.get("a").internal_values() + mgr.iget(0).internal_values(), reindexed.iget(2).internal_values() ) tm.assert_almost_equal( - mgr.get("d").internal_values(), reindexed.get("d").internal_values() + mgr.iget(3).internal_values(), reindexed.iget(3).internal_values() ) def test_get_numeric_data(self): @@ -645,13 +648,15 @@ def test_get_numeric_data(self): numeric.items, pd.Index(["int", "float", "complex", "bool"]) ) tm.assert_almost_equal( - mgr.get("float").internal_values(), numeric.get("float").internal_values() + mgr.iget(mgr.items.get_loc("float")).internal_values(), + numeric.iget(numeric.items.get_loc("float")).internal_values(), ) # Check sharing numeric.set("float", np.array([100.0, 200.0, 300.0])) tm.assert_almost_equal( - mgr.get("float").internal_values(), np.array([100.0, 200.0, 300.0]) + mgr.iget(mgr.items.get_loc("float")).internal_values(), + np.array([100.0, 200.0, 300.0]), ) numeric2 = mgr.get_numeric_data(copy=True) @@ -660,7 +665,8 @@ def test_get_numeric_data(self): ) numeric2.set("float", np.array([1000.0, 2000.0, 3000.0])) tm.assert_almost_equal( - mgr.get("float").internal_values(), np.array([100.0, 200.0, 300.0]) + mgr.iget(mgr.items.get_loc("float")).internal_values(), + np.array([100.0, 200.0, 300.0]), ) def test_get_bool_data(self): @@ -674,19 +680,22 @@ def test_get_bool_data(self): bools = mgr.get_bool_data() tm.assert_index_equal(bools.items, pd.Index(["bool"])) tm.assert_almost_equal( - mgr.get("bool").internal_values(), bools.get("bool").internal_values() + mgr.iget(mgr.items.get_loc("bool")).internal_values(), + bools.iget(bools.items.get_loc("bool")).internal_values(), ) bools.set("bool", np.array([True, False, True])) tm.assert_numpy_array_equal( - mgr.get("bool").internal_values(), np.array([True, False, True]) + mgr.iget(mgr.items.get_loc("bool")).internal_values(), + np.array([True, False, True]), ) # Check sharing bools2 = mgr.get_bool_data(copy=True) bools2.set("bool", np.array([False, True, False])) tm.assert_numpy_array_equal( - mgr.get("bool").internal_values(), np.array([True, False, True]) + mgr.iget(mgr.items.get_loc("bool")).internal_values(), + np.array([True, False, True]), ) def test_unicode_repr_doesnt_raise(self): diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 03c1445e099a0..035698687cfc2 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -28,7 +28,15 @@ async def test_tab_complete_ipython6_warning(ip): ) await ip.run_code(code) - with tm.assert_produces_warning(None): + # TODO: remove it when Ipython updates + # GH 33567, jedi version raises Deprecation warning in Ipython + import jedi + + if jedi.__version__ < "0.17.0": + warning = tm.assert_produces_warning(None) + else: + warning = tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False) + with warning: with provisionalcompleter("ignore"): list(ip.Completer.completions("rs.", 1)) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 302ca8d1aa43e..a6430b4525d4a 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -491,7 +491,18 @@ async def test_tab_complete_warning(self, ip): code = "import pandas as pd; s = pd.Series()" await ip.run_code(code) - with tm.assert_produces_warning(None): + + # TODO: remove it when Ipython updates + # GH 33567, jedi version raises Deprecation warning in Ipython + import jedi + + if jedi.__version__ < "0.17.0": + warning = tm.assert_produces_warning(None) + else: + warning = tm.assert_produces_warning( + DeprecationWarning, check_stacklevel=False + ) + with warning: with provisionalcompleter("ignore"): list(ip.Completer.completions("s.", 1)) diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index 17815c437249b..b7bdbde5bac5e 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -237,15 +237,13 @@ def _format_argument_list(allow_args: Union[List[str], int]): elif allow_args == 1: return " except for the first argument" elif isinstance(allow_args, int): - return " except for the first {num_args} arguments".format(num_args=allow_args) + return f" except for the first {allow_args} arguments" elif len(allow_args) == 1: - return " except for the argument '{arg}'".format(arg=allow_args[0]) + return f" except for the argument '{allow_args[0]}'" else: last = allow_args[-1] args = ", ".join(["'" + x + "'" for x in allow_args[:-1]]) - return " except for the arguments {args} and '{last}'".format( - args=args, last=last - ) + return f" except for the arguments {args} and '{last}'" def deprecate_nonkeyword_arguments( From df471d8a923502be2fbd0be6394915bcd3722496 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 16 Apr 2020 13:25:45 -0700 Subject: [PATCH 26/31] PERF: always slice when indexing on columns --- pandas/_libs/internals.pyx | 64 ++++++++++++++++++++++-- pandas/core/internals/managers.py | 19 +++---- pandas/tests/internals/test_internals.py | 10 +++- 3 files changed, 80 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index d3d8bead88d08..bc681570fef94 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -196,13 +196,27 @@ cdef class BlockPlacement: return self._as_slice + def to_slices(self): + """ + Decompose a BlockPlacement into a list of BlockPlacements, each of + which is slice-like. + + Returns + ------- + List[BlockPlacement] + """ + if self.is_slice_like: + return [self] + slices = indexer_as_slices(self.indexer) + return [BlockPlacement(x) for x in slices] + cdef slice slice_canonize(slice s): """ Convert slice to canonical bounded form. """ cdef: - Py_ssize_t start = 0, stop = 0, step = 1, length + Py_ssize_t start = 0, stop = 0, step = 1 if s.step is None: step = 1 @@ -234,8 +248,8 @@ cdef slice slice_canonize(slice s): if stop > start: stop = start - if start < 0 or (stop < 0 and s.stop is not None): - raise ValueError("unbounded slice") + if start < 0 or (stop < 0 and s.stop is not None and step > 0): + raise ValueError("unbounded slice", start, stop, step, s) if stop < 0: return slice(start, None, step) @@ -318,6 +332,50 @@ cdef slice_getitem(slice slc, ind): return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INT64)[ind] +@cython.boundscheck(False) +@cython.wraparound(False) +cdef list indexer_as_slices(int64_t[:] vals): + """ + Convert an indexer to a list of slices. + """ + # TODO: there may be more efficient ways to decompose an indexer into slices + cdef: + Py_ssize_t i, n, start, stop + int64_t d + + if vals is None: + raise TypeError("vals must be ndarray") + + n = vals.shape[0] + + if n == 0: + return [] + + if n == 1: + return [slice(vals[0], vals[0] + 1, 1)] + + # n >= 2 + d = vals[1] - vals[0] + + if d == 0: + # i guess we have duplicate values (TODO: how?) + return [slice(vals[0], vals[0] + 1, 1)] + indexer_as_slices(vals[1:]) + + for i in range(2, n): + if vals[i] - vals[i - 1] != d: + # we need to start a new slice + start = vals[0] + stop = vals[i - 1] + d + return [slice(start, stop, d)] + indexer_as_slices(vals[i:]) + + start = vals[0] + stop = start + n * d + if stop < 0 and d < 0: + return [slice(start, None, d)] + else: + return [slice(start, stop, d)] + + @cython.boundscheck(False) @cython.wraparound(False) cdef slice indexer_as_slice(int64_t[:] vals): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b57fe76846274..b1eab61fd256b 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1324,7 +1324,6 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=True): if blkno == -1: # If we've got here, fill_value was not lib.no_default - blocks.append( self._make_na_block(placement=mgr_locs, fill_value=fill_value) ) @@ -1343,15 +1342,17 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): else: taker = blklocs[mgr_locs.indexer] - max_len = max(len(mgr_locs), taker.max() + 1) - taker = lib.maybe_indices_to_slice(taker, max_len) - if isinstance(taker, slice): - nb = blk.getitem_block(taker, new_mgr_locs=mgr_locs) - else: - # TODO: just use getitem_block anyway? - nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) - blocks.append(nb) + bp = libinternals.BlockPlacement(taker) + bps = bp.to_slices() + + left, right = 0, 0 + for sub in bps: + right += len(sub) + new_locs = mgr_locs[left:right] + b = blk.getitem_block(sub.indexer, new_mgr_locs=new_locs) + blocks.append(b) + left += len(sub) return blocks diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index f1d4c865a0ced..14323ca624a6f 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -620,7 +620,8 @@ def test_reindex_items(self): mgr = create_mgr("a: f8; b: i8; c: f8; d: i8; e: f8; f: bool; g: f8-2") reindexed = mgr.reindex_axis(["g", "c", "a", "d"], axis=0) - assert reindexed.nblocks == 2 + assert reindexed.nblocks == 3 + assert all(b.mgr_locs.is_slice_like for b in reindexed) tm.assert_index_equal(reindexed.items, pd.Index(["g", "c", "a", "d"])) tm.assert_almost_equal( mgr.iget(6).internal_values(), reindexed.iget(0).internal_values() @@ -957,6 +958,13 @@ def test_unbounded_slice_raises(self, slc): with pytest.raises(ValueError, match=msg): BlockPlacement(slc) + def test_slice_canonize_negative_stop(self): + # GH#???? negative stop is OK with negative step and positive start + slc = slice(3, -1, -2) + + bp = BlockPlacement(slc) + assert bp.indexer == slice(3, None, -2) + @pytest.mark.parametrize( "slc", [ From b323ea6d8a011779499f0c1d81fd06d8c4b6d35f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 16 Apr 2020 13:58:57 -0700 Subject: [PATCH 27/31] typo fixup --- pandas/tests/internals/test_internals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 14323ca624a6f..5673735d1c70c 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -621,7 +621,7 @@ def test_reindex_items(self): reindexed = mgr.reindex_axis(["g", "c", "a", "d"], axis=0) assert reindexed.nblocks == 3 - assert all(b.mgr_locs.is_slice_like for b in reindexed) + assert all(b.mgr_locs.is_slice_like for b in reindexed.blocks) tm.assert_index_equal(reindexed.items, pd.Index(["g", "c", "a", "d"])) tm.assert_almost_equal( mgr.iget(6).internal_values(), reindexed.iget(0).internal_values() From 96b7d6a05611c80f8a6999c7fab837f4cba44189 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 20 Apr 2020 17:55:31 -0700 Subject: [PATCH 28/31] avoid take in one more case --- pandas/core/internals/managers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a650392592091..a5df168fb78c4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1258,7 +1258,10 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): if sllen == 0: return [] return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] - elif not allow_fill or self.ndim == 1: + elif not allow_fill: + return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] + + elif self.ndim == 1: if allow_fill and fill_value is None: _, fill_value = maybe_promote(blk.dtype) @@ -1302,6 +1305,7 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): # A non-consolidatable block, it's easy, because there's # only one item and each mgr loc is a copy of that single # item. + # TODO(EA2D): special case unnecessary with 2D EAs for mgr_loc in mgr_locs: newblk = blk.copy(deep=False) newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1) From 7fad75264c231ae4cb6591b37b636d72cf72abd7 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 24 Jul 2021 12:58:56 -0700 Subject: [PATCH 29/31] update --- pandas/core/internals/managers.py | 33 ++++++++++++++++---------- pandas/tests/frame/indexing/test_xs.py | 5 ++-- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a888649d3ed98..6f39924ca14cd 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -711,6 +711,7 @@ def _slice_take_blocks_ax0( only_slice : bool, default False If True, we always return views on existing arrays, never copies. This is used when called from ops.blockwise.operate_blockwise. + Ignored; TODO: remove argument. Returns ------- @@ -736,7 +737,7 @@ def _slice_take_blocks_ax0( if allow_fill and fill_value is None: fill_value = blk.fill_value - if not allow_fill and only_slice: + if not allow_fill: # GH#33597 slice instead of take, so we get # views instead of copies blocks = [ @@ -773,8 +774,7 @@ def _slice_take_blocks_ax0( # When filling blknos, make sure blknos is updated before appending to # blocks list, that way new blkno is exactly len(blocks). blocks = [] - group = not only_slice - for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group): + for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=False): if blkno == -1: # If we've got here, fill_value was not lib.no_default @@ -800,24 +800,31 @@ def _slice_take_blocks_ax0( # we may try to only slice taker = blklocs[mgr_locs.indexer] max_len = max(len(mgr_locs), taker.max() + 1) - if only_slice: - taker = lib.maybe_indices_to_slice(taker, max_len) + taker = lib.maybe_indices_to_slice(taker, max_len) if isinstance(taker, slice): + bp = libinternals.BlockPlacement(taker) + bps = bp.to_slices() + assert len(bps) == 1 # just checking nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs) blocks.append(nb) - elif only_slice: + else: # GH#33597 slice instead of take, so we get # views instead of copies - for i, ml in zip(taker, mgr_locs): - slc = slice(i, i + 1) - bp = BlockPlacement(ml) - nb = blk.getitem_block_columns(slc, new_mgr_locs=bp) + bp = libinternals.BlockPlacement(taker) + bps = bp.to_slices() + left, right = 0, 0 + + for sub in bps: + right += len(sub) + new_locs = mgr_locs[left:right] + # we have isinstance(sub.indexer, slice) + nb = blk.getitem_block_columns( + sub.indexer, new_mgr_locs=new_locs + ) # We have np.shares_memory(nb.values, blk.values) blocks.append(nb) - else: - nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) - blocks.append(nb) + left += len(sub) return blocks diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index d2704876c31c5..8c7f05d40552d 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -374,12 +374,11 @@ def test_xs_droplevel_false_view(self, using_array_manager): expected = DataFrame({"a": [1]}) tm.assert_frame_equal(result, expected) - # with mixed dataframe, modifying the parent doesn't modify result - # TODO the "split" path behaves differently here as with single block + # with mixed dataframe, we still get a view on the parent df = DataFrame([[1, 2.5, "a"]], columns=Index(["a", "b", "c"])) result = df.xs("a", axis=1, drop_level=False) df.iloc[0, 0] = 2 - expected = DataFrame({"a": [1]}) + expected = DataFrame({"a": [2]}) tm.assert_frame_equal(result, expected) def test_xs_list_indexer_droplevel_false(self): From 201e57da521de7efd12cd526e0263b0fca5d42d7 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 14 Aug 2021 12:07:14 -0700 Subject: [PATCH 30/31] fix last 2 failing tests --- pandas/core/frame.py | 3 +++ pandas/core/generic.py | 1 - pandas/core/internals/managers.py | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 823de2133f0b3..2310ca52ec9ae 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4558,6 +4558,9 @@ def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy) columns, method, copy, level, fill_value, limit, tolerance ) + # If we have made a copy, no need to make another one + copy = False + index = axes["index"] if index is not None: frame = frame._reindex_index( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b2a4fecbf084d..31e5303ccd975 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4817,7 +4817,6 @@ def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: axes, level, limit, tolerance, method, fill_value, copy ).__finalize__(self, method="reindex") - @final def _reindex_axes( self: FrameOrSeries, axes, level, limit, tolerance, method, fill_value, copy ) -> FrameOrSeries: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3d9d7b5f5f2a9..87bba084269b6 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -665,6 +665,8 @@ def reindex_indexer( new_blocks = self._slice_take_blocks_ax0( indexer, fill_value=fill_value, only_slice=only_slice ) + if copy: + new_blocks = [x.copy() for x in new_blocks] else: new_blocks = [ blk.take_nd( From 69f08cb406ebfcbd2bc6524789ccc271a49f05e0 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 14 Aug 2021 17:03:39 -0700 Subject: [PATCH 31/31] mypy fixup --- pandas/_libs/internals.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index 3feefe7ac8ff4..b9f9a0948ff33 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -42,6 +42,7 @@ class BlockPlacement: def __len__(self) -> int: ... def delete(self, loc) -> BlockPlacement: ... def append(self, others: list[BlockPlacement]) -> BlockPlacement: ... + def to_slices(self) -> list[BlockPlacement]: ... class SharedBlock: _mgr_locs: BlockPlacement