Skip to content

REF: avoid broadcasting of Series to DataFrame in ops for ArrayManager #40482

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

12 changes: 9 additions & 3 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,10 @@
)
from pandas.core.array_algos.take import take_2d_multi
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays import (
ExtensionArray,
TimedeltaArray,
)
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.construction import (
extract_array,
Expand Down Expand Up @@ -6701,8 +6704,11 @@ def _dispatch_frame_op(self, right, func: Callable, axis: Optional[int] = None):
assert right.index.equals(self.columns)

right = right._values
# maybe_align_as_frame ensures we do not have an ndarray here
assert not isinstance(right, np.ndarray)
# BlockManager only: maybe_align_as_frame ensures we do not have
# an ndarray here (`assert not isinstance(right, np.ndarray)`)

if isinstance(right, TimedeltaArray):
right = right._data
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. this is AM-only right?
  2. wont the array_op call just re-wrap this?
  3. if not, will something like DataFrame[int] + Series[timedelta64ns] fail to raise because of this?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. if this is a hack as the commit message suggests, then a comment would be worthwhile

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #40482 (comment) for the general explanation of in which case this happens. It was not my intention to keep this, I am hoping to find a better solution.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yah I saw that, thought about commenting, but decided that "last time i handled that by reshaping to 2D" wouldn't be something you'd find that helpful.


with np.errstate(all="ignore"):
arrays = [
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,10 @@ def _as_manager(self: FrameOrSeries, typ: str) -> FrameOrSeries:
# fastpath of passing a manager doesn't check the option/manager class
return self._constructor(new_mgr).__finalize__(self)

@property
def _using_array_manager(self):
return isinstance(self._mgr, ArrayManager)

# ----------------------------------------------------------------------
# attrs and flags

Expand Down
5 changes: 4 additions & 1 deletion pandas/core/ops/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,10 @@ def to_series(right):
left, right = left.align(
right, join="outer", axis=axis, level=level, copy=False
)
right = _maybe_align_series_as_frame(left, right, axis)

if not left._using_array_manager:
# for BlockManager maybe broadcast Series to a DataFrame
right = _maybe_align_series_as_frame(left, right, axis)

return left, right

Expand Down
6 changes: 5 additions & 1 deletion pandas/core/ops/array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,10 @@ def _masked_arith_op(x: np.ndarray, y, op):

# mask is only meaningful for x
result = np.empty(x.size, dtype=x.dtype)
mask = notna(xrav)
if isna(y):
mask = np.zeros(x.size, dtype=bool)
else:
mask = notna(xrav)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is just an optimization, independent of the broadcasting thing?


# 1 ** np.nan is 1. So we have to unmask those.
if op is pow:
Expand Down Expand Up @@ -291,6 +294,7 @@ def na_logical_op(x: np.ndarray, y, op):
y = ensure_object(y)
result = libops.vec_binop(x.ravel(), y.ravel(), op)
else:
x = ensure_object(x)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if we're doing this both here and on L293, then might as well do it just once after L289

Do we have cases that get here with x not object dtype?

# let null fall thru
assert lib.is_scalar(y)
if not isna(y):
Expand Down
23 changes: 18 additions & 5 deletions pandas/tests/arithmetic/test_datetime64.py
Original file line number Diff line number Diff line change
Expand Up @@ -1506,7 +1506,13 @@ def test_dt64arr_add_sub_DateOffset(self, box_with_array):
@pytest.mark.parametrize("op", [operator.add, roperator.radd, operator.sub])
@pytest.mark.parametrize("box_other", [True, False])
def test_dt64arr_add_sub_offset_array(
self, tz_naive_fixture, box_with_array, box_other, op, other
self,
tz_naive_fixture,
box_with_array,
box_other,
op,
other,
using_array_manager,
):
# GH#18849
# GH#10699 array of offsets
Expand All @@ -1522,7 +1528,10 @@ def test_dt64arr_add_sub_offset_array(
if box_other:
other = tm.box_expected(other, box_with_array)

with tm.assert_produces_warning(PerformanceWarning):
warn = PerformanceWarning
if box_with_array is pd.DataFrame and not box_other and using_array_manager:
warn = None
with tm.assert_produces_warning(warn):
res = op(dtarr, other)

tm.assert_equal(res, expected)
Expand Down Expand Up @@ -2452,7 +2461,7 @@ def test_dti_addsub_offset_arraylike(

@pytest.mark.parametrize("other_box", [pd.Index, np.array])
def test_dti_addsub_object_arraylike(
self, tz_naive_fixture, box_with_array, other_box
self, tz_naive_fixture, box_with_array, other_box, using_array_manager
):
tz = tz_naive_fixture

Expand All @@ -2464,14 +2473,18 @@ def test_dti_addsub_object_arraylike(
expected = DatetimeIndex(["2017-01-31", "2017-01-06"], tz=tz_naive_fixture)
expected = tm.box_expected(expected, xbox)

with tm.assert_produces_warning(PerformanceWarning):
warn = PerformanceWarning
if box_with_array is pd.DataFrame and using_array_manager:
warn = None

with tm.assert_produces_warning(warn):
result = dtarr + other
tm.assert_equal(result, expected)

expected = DatetimeIndex(["2016-12-31", "2016-12-29"], tz=tz_naive_fixture)
expected = tm.box_expected(expected, xbox)

with tm.assert_produces_warning(PerformanceWarning):
with tm.assert_produces_warning(warn):
result = dtarr - other
tm.assert_equal(result, expected)

Expand Down
75 changes: 45 additions & 30 deletions pandas/tests/arithmetic/test_timedelta64.py
Original file line number Diff line number Diff line change
Expand Up @@ -1323,7 +1323,7 @@ def test_td64arr_sub_timedeltalike(self, two_hours, box_with_array):
# ------------------------------------------------------------------
# __add__/__sub__ with DateOffsets and arrays of DateOffsets

def test_td64arr_add_offset_index(self, names, box_with_array):
def test_td64arr_add_offset_index(self, names, box_with_array, using_array_manager):
# GH#18849, GH#19744
box = box_with_array
exname = get_expected_name(box, names)
Expand All @@ -1338,17 +1338,21 @@ def test_td64arr_add_offset_index(self, names, box_with_array):
tdi = tm.box_expected(tdi, box)
expected = tm.box_expected(expected, box)

with tm.assert_produces_warning(PerformanceWarning):
warn = PerformanceWarning
if box is DataFrame and using_array_manager:
warn = None

with tm.assert_produces_warning(warn):
res = tdi + other
tm.assert_equal(res, expected)

with tm.assert_produces_warning(PerformanceWarning):
with tm.assert_produces_warning(warn):
res2 = other + tdi
tm.assert_equal(res2, expected)

# TODO: combine with test_td64arr_add_offset_index by parametrizing
# over second box?
def test_td64arr_add_offset_array(self, box_with_array):
def test_td64arr_add_offset_array(self, box_with_array, using_array_manager):
# GH#18849
box = box_with_array
tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"])
Expand All @@ -1361,15 +1365,19 @@ def test_td64arr_add_offset_array(self, box_with_array):
tdi = tm.box_expected(tdi, box)
expected = tm.box_expected(expected, box)

with tm.assert_produces_warning(PerformanceWarning):
warn = PerformanceWarning
if box is DataFrame and using_array_manager:
warn = None

with tm.assert_produces_warning(warn):
res = tdi + other
tm.assert_equal(res, expected)

with tm.assert_produces_warning(PerformanceWarning):
with tm.assert_produces_warning(warn):
res2 = other + tdi
tm.assert_equal(res2, expected)

def test_td64arr_sub_offset_index(self, names, box_with_array):
def test_td64arr_sub_offset_index(self, names, box_with_array, using_array_manager):
# GH#18824, GH#19744
box = box_with_array
xbox = box if box not in [tm.to_array, pd.array] else pd.Index
Expand All @@ -1385,11 +1393,15 @@ def test_td64arr_sub_offset_index(self, names, box_with_array):
tdi = tm.box_expected(tdi, box)
expected = tm.box_expected(expected, xbox)

with tm.assert_produces_warning(PerformanceWarning):
warn = PerformanceWarning
if box is DataFrame and using_array_manager:
warn = None

with tm.assert_produces_warning(warn):
res = tdi - other
tm.assert_equal(res, expected)

def test_td64arr_sub_offset_array(self, box_with_array):
def test_td64arr_sub_offset_array(self, box_with_array, using_array_manager):
# GH#18824
tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"])
other = np.array([offsets.Hour(n=1), offsets.Minute(n=-2)])
Expand All @@ -1401,11 +1413,17 @@ def test_td64arr_sub_offset_array(self, box_with_array):
tdi = tm.box_expected(tdi, box_with_array)
expected = tm.box_expected(expected, box_with_array)

with tm.assert_produces_warning(PerformanceWarning):
warn = PerformanceWarning
if box_with_array is DataFrame and using_array_manager:
warn = None

with tm.assert_produces_warning(warn):
res = tdi - other
tm.assert_equal(res, expected)

def test_td64arr_with_offset_series(self, names, box_with_array):
def test_td64arr_with_offset_series(
self, names, box_with_array, using_array_manager
):
# GH#18849
box = box_with_array
box2 = Series if box in [pd.Index, tm.to_array, pd.array] else box
Expand All @@ -1418,18 +1436,22 @@ def test_td64arr_with_offset_series(self, names, box_with_array):
obj = tm.box_expected(tdi, box)
expected_add = tm.box_expected(expected_add, box2)

with tm.assert_produces_warning(PerformanceWarning):
warn = PerformanceWarning
if box is DataFrame and using_array_manager:
warn = None

with tm.assert_produces_warning(warn):
res = obj + other
tm.assert_equal(res, expected_add)

with tm.assert_produces_warning(PerformanceWarning):
with tm.assert_produces_warning(warn):
res2 = other + obj
tm.assert_equal(res2, expected_add)

expected_sub = Series([tdi[n] - other[n] for n in range(len(tdi))], name=exname)
expected_sub = tm.box_expected(expected_sub, box2)

with tm.assert_produces_warning(PerformanceWarning):
with tm.assert_produces_warning(warn):
res3 = obj - other
tm.assert_equal(res3, expected_sub)

Expand Down Expand Up @@ -1460,7 +1482,7 @@ def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box_with_array):
# ------------------------------------------------------------------
# Unsorted

def test_td64arr_add_sub_object_array(self, box_with_array):
def test_td64arr_add_sub_object_array(self, box_with_array, using_array_manager):
box = box_with_array
xbox = np.ndarray if box is pd.array else box

Expand All @@ -1469,7 +1491,11 @@ def test_td64arr_add_sub_object_array(self, box_with_array):

other = np.array([Timedelta(days=1), offsets.Day(2), Timestamp("2000-01-04")])

with tm.assert_produces_warning(PerformanceWarning):
warn = PerformanceWarning
if box is DataFrame and using_array_manager:
warn = None

with tm.assert_produces_warning(warn):
result = tdarr + other

expected = pd.Index(
Expand All @@ -1480,10 +1506,10 @@ def test_td64arr_add_sub_object_array(self, box_with_array):

msg = "unsupported operand type|cannot subtract a datelike"
with pytest.raises(TypeError, match=msg):
with tm.assert_produces_warning(PerformanceWarning):
with tm.assert_produces_warning(warn):
tdarr - other

with tm.assert_produces_warning(PerformanceWarning):
with tm.assert_produces_warning(warn):
result = other - tdarr

expected = pd.Index([Timedelta(0), Timedelta(0), Timestamp("2000-01-01")])
Expand Down Expand Up @@ -2051,9 +2077,7 @@ def test_td64arr_rmul_numeric_array(self, box_with_array, vector, any_real_dtype
[np.array([20, 30, 40]), pd.Index([20, 30, 40]), Series([20, 30, 40])],
ids=lambda x: type(x).__name__,
)
def test_td64arr_div_numeric_array(
self, box_with_array, vector, any_real_dtype, using_array_manager
):
def test_td64arr_div_numeric_array(self, box_with_array, vector, any_real_dtype):
# GH#4521
# divide/multiply by integers
xbox = get_upcast_box(box_with_array, vector)
Expand Down Expand Up @@ -2089,15 +2113,6 @@ def test_td64arr_div_numeric_array(
expected = pd.Index(expected) # do dtype inference
expected = tm.box_expected(expected, xbox)
assert tm.get_dtype(expected) == "m8[ns]"

if using_array_manager and box_with_array is DataFrame:
# TODO the behaviour is buggy here (third column with all-NaT
# as result doesn't get preserved as timedelta64 dtype).
# Reported at https://github.com/pandas-dev/pandas/issues/39750
# Changing the expected instead of xfailing to continue to test
# the correct behaviour for the other columns
expected[2] = Series([NaT, NaT], dtype=object)

tm.assert_equal(result, expected)

with pytest.raises(TypeError, match=pattern):
Expand Down
38 changes: 29 additions & 9 deletions pandas/tests/frame/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1204,7 +1204,12 @@ def test_combineFrame(self, float_frame, mixed_float_frame, mixed_int_frame):
_check_mixed_float(added, dtype="float64")

def test_combine_series(
self, float_frame, mixed_float_frame, mixed_int_frame, datetime_frame
self,
float_frame,
mixed_float_frame,
mixed_int_frame,
datetime_frame,
using_array_manager,
):

# Series
Expand All @@ -1227,7 +1232,14 @@ def test_combine_series(

# no upcast needed
added = mixed_float_frame + series
assert np.all(added.dtypes == series.dtype)
if using_array_manager:
# INFO(ArrayManager) addition is performed column-wise, and
# apparently in numpy the dtype of array gets priority in
# arithmetic casting rules (array[float32] + scalar[float64] gives
# array[float32] and not array[float64])
assert np.all(added.dtypes == mixed_float_frame.dtypes)
else:
assert np.all(added.dtypes == series.dtype)

# vs mix (upcast) as needed
added = mixed_float_frame + series.astype("float32")
Expand Down Expand Up @@ -1587,7 +1599,7 @@ def test_inplace_ops_identity2(self, op):
expected = id(df)
assert id(df) == expected

def test_alignment_non_pandas(self):
def test_alignment_non_pandas(self, using_array_manager):
index = ["A", "B", "C"]
columns = ["X", "Y", "Z"]
df = DataFrame(np.random.randn(3, 3), index=index, columns=columns)
Expand All @@ -1600,13 +1612,21 @@ def test_alignment_non_pandas(self):
range(1, 4),
]:

expected = DataFrame({"X": val, "Y": val, "Z": val}, index=df.index)
tm.assert_frame_equal(align(df, val, "index")[1], expected)
if not using_array_manager:
expected = DataFrame({"X": val, "Y": val, "Z": val}, index=df.index)
tm.assert_frame_equal(align(df, val, "index")[1], expected)
else:
expected = Series(val, index=index)
tm.assert_series_equal(align(df, val, "index")[1], expected)

expected = DataFrame(
{"X": [1, 1, 1], "Y": [2, 2, 2], "Z": [3, 3, 3]}, index=df.index
)
tm.assert_frame_equal(align(df, val, "columns")[1], expected)
if not using_array_manager:
expected = DataFrame(
{"X": [1, 1, 1], "Y": [2, 2, 2], "Z": [3, 3, 3]}, index=df.index
)
tm.assert_frame_equal(align(df, val, "columns")[1], expected)
else:
expected = Series(val, index=columns)
tm.assert_series_equal(align(df, val, "columns")[1], expected)

# length mismatch
msg = "Unable to coerce to Series, length must be 3: given 2"
Expand Down