Skip to content

BUG: flex op with DataFrame, Series and ea vs ndarray #34277

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
May 25, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 27 additions & 17 deletions pandas/core/ops/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
)

if TYPE_CHECKING:
from pandas import DataFrame # noqa:F401
from pandas import DataFrame, Series # noqa:F401

# -----------------------------------------------------------------------------
# constants
Expand Down Expand Up @@ -467,19 +467,7 @@ def _combine_series_frame(left, right, func, axis: int):
# We assume that self.align(other, ...) has already been called

rvalues = right._values
if isinstance(rvalues, np.ndarray):
# TODO(EA2D): no need to special-case with 2D EAs
# We can operate block-wise
if axis == 0:
rvalues = rvalues.reshape(-1, 1)
else:
rvalues = rvalues.reshape(1, -1)

rvalues = np.broadcast_to(rvalues, left.shape)

array_op = get_array_op(func)
bm = left._mgr.apply(array_op, right=rvalues.T, align_keys=["right"])
return type(left)(bm)
assert not isinstance(rvalues, np.ndarray) # handled by align_series_as_frame

if axis == 0:
new_data = dispatch_to_series(left, right, func)
Expand Down Expand Up @@ -575,6 +563,7 @@ def to_series(right):
left, right = left.align(
right, join="outer", axis=axis, level=level, copy=False
)
right = _maybe_align_series_as_frame(left, right, axis)

return left, right

Expand Down Expand Up @@ -635,6 +624,25 @@ def _frame_arith_method_with_reindex(
return result.reindex(join_columns, axis=1)


def _maybe_align_series_as_frame(frame: "DataFrame", series: "Series", axis: int):
"""
If the Series operand is not EA-dtype, we can broadcast to 2D and operate
blockwise.
"""
rvalues = series._values
if not isinstance(rvalues, np.ndarray):
# TODO(EA2D): no need to special-case with 2D EAs
return series

if axis == 0:
rvalues = rvalues.reshape(-1, 1)
else:
rvalues = rvalues.reshape(1, -1)

rvalues = np.broadcast_to(rvalues, frame.shape)
return type(frame)(rvalues, index=frame.index, columns=frame.columns)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need to turn this into a DataFrame?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The alternative is to re-implement the ensure-shape-shape-values code in ops.blockwise (see the first commit, which did it that way)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does it need to be the same shape? The block op or array op should be able to handle this broadcasting automatically themselves?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ndarray handles broadcasting, but DTA/TDA dont. I checked with seberg a few months ago who said there isnt a perf penalty to operating on the broadcasted ndarray.

ive got plans to make it so we dont need to wrap the ndarray in a DataFrame (which we actually do in _align_method_FRAME too)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but DTA/TDA dont.

You mean the "hidden" 2D version of DTA/TDA? (EAs in general handle broadcasting)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

EAs in general handle broadcasting

im not sure thats accurate in general (in particular the op(ea_len_1, arr_len_n)), but that can be discussed separately

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(in particular the op(ea_len_1, arr_len_n)

Ah, yes, indeed, that's a broadcasting that will not be generally supported (we probably should though, worth to open an issue about that).

Now, I was thinking about the op(frame_with_EA_column, array_len_n) case, where the array is aligned on the columns of the frame. Which means that each EA (1 column) gets a scalar, I think (op(EA, scalar) ?
Or, we might want to change this to an array of len 1 in case the scalar loses information? (which can be the case where the 1D array-like is an EA / Series[EA] instead of a numpy array)



def _arith_method_FRAME(cls: Type["DataFrame"], op, special: bool):
# This is the only function where `special` can be either True or False
op_name = _get_op_name(op, special)
Expand All @@ -656,6 +664,11 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
):
return _frame_arith_method_with_reindex(self, other, op)

if isinstance(other, ABCSeries) and fill_value is not None:
# TODO: We could allow this in cases where we end up going
# through the DataFrame path
raise NotImplementedError(f"fill_value {fill_value} not supported.")

# TODO: why are we passing flex=True instead of flex=not special?
# 15 tests fail if we pass flex=not special instead
self, other = _align_method_FRAME(self, other, axis, flex=True, level=level)
Expand All @@ -665,9 +678,6 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
new_data = self._combine_frame(other, na_op, fill_value)

elif isinstance(other, ABCSeries):
if fill_value is not None:
raise NotImplementedError(f"fill_value {fill_value} not supported.")

axis = self._get_axis_number(axis) if axis is not None else 1
new_data = _combine_series_frame(self, other, op, axis=axis)
else:
Expand Down
6 changes: 1 addition & 5 deletions pandas/tests/arithmetic/test_timedelta64.py
Original file line number Diff line number Diff line change
Expand Up @@ -1470,8 +1470,6 @@ def test_td64arr_add_sub_object_array(self, box_with_array):
[pd.Timedelta(days=2), pd.Timedelta(days=4), pd.Timestamp("2000-01-07")]
)
expected = tm.box_expected(expected, box_with_array)
if box_with_array is pd.DataFrame:
expected = expected.astype(object)
tm.assert_equal(result, expected)

msg = "unsupported operand type|cannot subtract a datelike"
Expand All @@ -1486,8 +1484,6 @@ def test_td64arr_add_sub_object_array(self, box_with_array):
[pd.Timedelta(0), pd.Timedelta(0), pd.Timestamp("2000-01-01")]
)
expected = tm.box_expected(expected, box_with_array)
if box_with_array is pd.DataFrame:
expected = expected.astype(object)
tm.assert_equal(result, expected)


Expand Down Expand Up @@ -2012,7 +2008,7 @@ def test_td64arr_div_numeric_array(self, box_with_array, vector, any_real_dtype)
tm.assert_equal(result, expected)

pattern = (
"true_divide cannot use operands|"
"true_divide'? cannot use operands|"
"cannot perform __div__|"
"cannot perform __truediv__|"
"unsupported operand|"
Expand Down
40 changes: 35 additions & 5 deletions pandas/tests/frame/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,20 @@ def test_df_flex_cmp_constant_return_types_empty(self, opname):
result = getattr(empty, opname)(const).dtypes.value_counts()
tm.assert_series_equal(result, pd.Series([2], index=[np.dtype(bool)]))

def test_df_flex_cmp_ea_dtype_with_ndarray_series(self):
ii = pd.IntervalIndex.from_breaks([1, 2, 3])
df = pd.DataFrame({"A": ii, "B": ii})

ser = pd.Series([0, 0])
res = df.eq(ser, axis=0)

expected = pd.DataFrame({"A": [False, False], "B": [False, False]})
tm.assert_frame_equal(res, expected)

ser2 = pd.Series([1, 2], index=["A", "B"])
res2 = df.eq(ser2, axis=1)
tm.assert_frame_equal(res2, expected)


# -------------------------------------------------------------------
# Arithmetic
Expand Down Expand Up @@ -1410,12 +1424,13 @@ def test_alignment_non_pandas(self):
range(1, 4),
]:

tm.assert_series_equal(
align(df, val, "index")[1], Series([1, 2, 3], index=df.index)
)
tm.assert_series_equal(
align(df, val, "columns")[1], Series([1, 2, 3], index=df.columns)
expected = DataFrame({"X": val, "Y": val, "Z": val}, index=df.index)
tm.assert_frame_equal(align(df, val, "index")[1], expected)

expected = DataFrame(
{"X": [1, 1, 1], "Y": [2, 2, 2], "Z": [3, 3, 3]}, index=df.index
)
tm.assert_frame_equal(align(df, val, "columns")[1], expected)

# length mismatch
msg = "Unable to coerce to Series, length must be 3: given 2"
Expand Down Expand Up @@ -1484,3 +1499,18 @@ def test_pow_nan_with_zero():

result = left["A"] ** right["A"]
tm.assert_series_equal(result, expected["A"])


def test_dataframe_series_extension_dtypes():
# https://github.com/pandas-dev/pandas/issues/34311
df = pd.DataFrame(np.random.randint(0, 100, (10, 3)), columns=["a", "b", "c"])
ser = pd.Series([1, 2, 3], index=["a", "b", "c"])

expected = df.to_numpy("int64") + ser.to_numpy("int64").reshape(-1, 3)
expected = pd.DataFrame(expected, columns=df.columns, dtype="Int64")

df_ea = df.astype("Int64")
result = df_ea + ser
tm.assert_frame_equal(result, expected)
result = df_ea + ser.astype("Int64")
tm.assert_frame_equal(result, expected)