From c4194e95844a0a02477d220d1d78bdc497b131fd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 17 Mar 2021 13:02:13 +0100 Subject: [PATCH 1/4] REF: avoid broadcasting of Series to DataFrame in ops for ArrayManager --- pandas/core/frame.py | 2 - pandas/core/ops/__init__.py | 6 +- pandas/tests/arithmetic/test_datetime64.py | 23 +++++-- pandas/tests/arithmetic/test_timedelta64.py | 75 ++++++++++++--------- 4 files changed, 68 insertions(+), 38 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9fdd979ce8eca..0d04517bb4b15 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6674,8 +6674,6 @@ def _dispatch_frame_op(self, right, func: Callable, axis: Optional[int] = None): assert right.index.equals(self.columns) right = right._values - # maybe_align_as_frame ensures we do not have an ndarray here - assert not isinstance(right, np.ndarray) arrays = [ array_op(_left, _right) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 8ace64fedacb9..61a3c786f1907 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -306,7 +306,11 @@ def to_series(right): left, right = left.align( right, join="outer", axis=axis, level=level, copy=False ) - right = _maybe_align_series_as_frame(left, right, axis) + + from pandas.core.internals import BlockManager + + if isinstance(left._mgr, BlockManager): + right = _maybe_align_series_as_frame(left, right, axis) return left, right diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index f75b3800f623f..e664b9a58798a 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1506,7 +1506,13 @@ def test_dt64arr_add_sub_DateOffset(self, box_with_array): @pytest.mark.parametrize("op", [operator.add, roperator.radd, operator.sub]) @pytest.mark.parametrize("box_other", [True, False]) def test_dt64arr_add_sub_offset_array( - self, tz_naive_fixture, box_with_array, box_other, op, other + self, + tz_naive_fixture, + box_with_array, + box_other, + op, + other, + using_array_manager, ): # GH#18849 # GH#10699 array of offsets @@ -1522,7 +1528,10 @@ def test_dt64arr_add_sub_offset_array( if box_other: other = tm.box_expected(other, box_with_array) - with tm.assert_produces_warning(PerformanceWarning): + warn = PerformanceWarning + if box_with_array is pd.DataFrame and not box_other and using_array_manager: + warn = None + with tm.assert_produces_warning(warn): res = op(dtarr, other) tm.assert_equal(res, expected) @@ -2451,7 +2460,7 @@ def test_dti_addsub_offset_arraylike( @pytest.mark.parametrize("other_box", [pd.Index, np.array]) def test_dti_addsub_object_arraylike( - self, tz_naive_fixture, box_with_array, other_box + self, tz_naive_fixture, box_with_array, other_box, using_array_manager ): tz = tz_naive_fixture @@ -2463,14 +2472,18 @@ def test_dti_addsub_object_arraylike( expected = DatetimeIndex(["2017-01-31", "2017-01-06"], tz=tz_naive_fixture) expected = tm.box_expected(expected, xbox) - with tm.assert_produces_warning(PerformanceWarning): + warn = PerformanceWarning + if box_with_array is pd.DataFrame and using_array_manager: + warn = None + + with tm.assert_produces_warning(warn): result = dtarr + other tm.assert_equal(result, expected) expected = DatetimeIndex(["2016-12-31", "2016-12-29"], tz=tz_naive_fixture) expected = tm.box_expected(expected, xbox) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(warn): result = dtarr - other tm.assert_equal(result, expected) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index daebdb542bc20..6e81879c3497b 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -1323,7 +1323,7 @@ def test_td64arr_sub_timedeltalike(self, two_hours, box_with_array): # ------------------------------------------------------------------ # __add__/__sub__ with DateOffsets and arrays of DateOffsets - def test_td64arr_add_offset_index(self, names, box_with_array): + def test_td64arr_add_offset_index(self, names, box_with_array, using_array_manager): # GH#18849, GH#19744 box = box_with_array exname = get_expected_name(box, names) @@ -1338,17 +1338,21 @@ def test_td64arr_add_offset_index(self, names, box_with_array): tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, box) - with tm.assert_produces_warning(PerformanceWarning): + warn = PerformanceWarning + if box is DataFrame and using_array_manager: + warn = None + + with tm.assert_produces_warning(warn): res = tdi + other tm.assert_equal(res, expected) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(warn): res2 = other + tdi tm.assert_equal(res2, expected) # TODO: combine with test_td64arr_add_offset_index by parametrizing # over second box? - def test_td64arr_add_offset_array(self, box_with_array): + def test_td64arr_add_offset_array(self, box_with_array, using_array_manager): # GH#18849 box = box_with_array tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"]) @@ -1361,15 +1365,19 @@ def test_td64arr_add_offset_array(self, box_with_array): tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, box) - with tm.assert_produces_warning(PerformanceWarning): + warn = PerformanceWarning + if box is DataFrame and using_array_manager: + warn = None + + with tm.assert_produces_warning(warn): res = tdi + other tm.assert_equal(res, expected) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(warn): res2 = other + tdi tm.assert_equal(res2, expected) - def test_td64arr_sub_offset_index(self, names, box_with_array): + def test_td64arr_sub_offset_index(self, names, box_with_array, using_array_manager): # GH#18824, GH#19744 box = box_with_array xbox = box if box not in [tm.to_array, pd.array] else pd.Index @@ -1385,11 +1393,15 @@ def test_td64arr_sub_offset_index(self, names, box_with_array): tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, xbox) - with tm.assert_produces_warning(PerformanceWarning): + warn = PerformanceWarning + if box is DataFrame and using_array_manager: + warn = None + + with tm.assert_produces_warning(warn): res = tdi - other tm.assert_equal(res, expected) - def test_td64arr_sub_offset_array(self, box_with_array): + def test_td64arr_sub_offset_array(self, box_with_array, using_array_manager): # GH#18824 tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"]) other = np.array([offsets.Hour(n=1), offsets.Minute(n=-2)]) @@ -1401,11 +1413,17 @@ def test_td64arr_sub_offset_array(self, box_with_array): tdi = tm.box_expected(tdi, box_with_array) expected = tm.box_expected(expected, box_with_array) - with tm.assert_produces_warning(PerformanceWarning): + warn = PerformanceWarning + if box_with_array is DataFrame and using_array_manager: + warn = None + + with tm.assert_produces_warning(warn): res = tdi - other tm.assert_equal(res, expected) - def test_td64arr_with_offset_series(self, names, box_with_array): + def test_td64arr_with_offset_series( + self, names, box_with_array, using_array_manager + ): # GH#18849 box = box_with_array box2 = Series if box in [pd.Index, tm.to_array, pd.array] else box @@ -1418,18 +1436,22 @@ def test_td64arr_with_offset_series(self, names, box_with_array): obj = tm.box_expected(tdi, box) expected_add = tm.box_expected(expected_add, box2) - with tm.assert_produces_warning(PerformanceWarning): + warn = PerformanceWarning + if box is DataFrame and using_array_manager: + warn = None + + with tm.assert_produces_warning(warn): res = obj + other tm.assert_equal(res, expected_add) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(warn): res2 = other + obj tm.assert_equal(res2, expected_add) expected_sub = Series([tdi[n] - other[n] for n in range(len(tdi))], name=exname) expected_sub = tm.box_expected(expected_sub, box2) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(warn): res3 = obj - other tm.assert_equal(res3, expected_sub) @@ -1460,7 +1482,7 @@ def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box_with_array): # ------------------------------------------------------------------ # Unsorted - def test_td64arr_add_sub_object_array(self, box_with_array): + def test_td64arr_add_sub_object_array(self, box_with_array, using_array_manager): box = box_with_array xbox = np.ndarray if box is pd.array else box @@ -1469,7 +1491,11 @@ def test_td64arr_add_sub_object_array(self, box_with_array): other = np.array([Timedelta(days=1), offsets.Day(2), Timestamp("2000-01-04")]) - with tm.assert_produces_warning(PerformanceWarning): + warn = PerformanceWarning + if box is DataFrame and using_array_manager: + warn = None + + with tm.assert_produces_warning(warn): result = tdarr + other expected = pd.Index( @@ -1480,10 +1506,10 @@ def test_td64arr_add_sub_object_array(self, box_with_array): msg = "unsupported operand type|cannot subtract a datelike" with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(warn): tdarr - other - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(warn): result = other - tdarr expected = pd.Index([Timedelta(0), Timedelta(0), Timestamp("2000-01-01")]) @@ -2051,9 +2077,7 @@ def test_td64arr_rmul_numeric_array(self, box_with_array, vector, any_real_dtype [np.array([20, 30, 40]), pd.Index([20, 30, 40]), Series([20, 30, 40])], ids=lambda x: type(x).__name__, ) - def test_td64arr_div_numeric_array( - self, box_with_array, vector, any_real_dtype, using_array_manager - ): + def test_td64arr_div_numeric_array(self, box_with_array, vector, any_real_dtype): # GH#4521 # divide/multiply by integers xbox = get_upcast_box(box_with_array, vector) @@ -2089,15 +2113,6 @@ def test_td64arr_div_numeric_array( expected = pd.Index(expected) # do dtype inference expected = tm.box_expected(expected, xbox) assert tm.get_dtype(expected) == "m8[ns]" - - if using_array_manager and box_with_array is DataFrame: - # TODO the behaviour is buggy here (third column with all-NaT - # as result doesn't get preserved as timedelta64 dtype). - # Reported at https://github.com/pandas-dev/pandas/issues/39750 - # Changing the expected instead of xfailing to continue to test - # the correct behaviour for the other columns - expected[2] = Series([NaT, NaT], dtype=object) - tm.assert_equal(result, expected) with pytest.raises(TypeError, match=pattern): From 120010ada6040060051b9131a47ca7e0bc117473 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 17 Mar 2021 14:32:27 +0100 Subject: [PATCH 2/4] hack for NaT --- pandas/core/frame.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0d04517bb4b15..2471f6fa46675 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -144,7 +144,10 @@ ) from pandas.core.array_algos.take import take_2d_multi from pandas.core.arraylike import OpsMixin -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import ( + ExtensionArray, + TimedeltaArray, +) from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.construction import ( extract_array, @@ -6675,6 +6678,9 @@ def _dispatch_frame_op(self, right, func: Callable, axis: Optional[int] = None): right = right._values + if isinstance(right, TimedeltaArray): + right = right._data + arrays = [ array_op(_left, _right) for _left, _right in zip(self._iter_column_arrays(), right) From c59d85ef2568fb447b7dbf7cae2ea1d07215e282 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 31 Mar 2021 11:48:34 +0200 Subject: [PATCH 3/4] update for comments --- pandas/core/frame.py | 2 ++ pandas/core/generic.py | 4 ++++ pandas/core/ops/__init__.py | 5 ++--- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5c022d81454fe..6745abe769f24 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6704,6 +6704,8 @@ def _dispatch_frame_op(self, right, func: Callable, axis: Optional[int] = None): assert right.index.equals(self.columns) right = right._values + # BlockManager only: maybe_align_as_frame ensures we do not have + # an ndarray here (`assert not isinstance(right, np.ndarray)`) if isinstance(right, TimedeltaArray): right = right._data diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e47fa0eb45d94..0ccace88d7735 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -316,6 +316,10 @@ def _as_manager(self: FrameOrSeries, typ: str) -> FrameOrSeries: # fastpath of passing a manager doesn't check the option/manager class return self._constructor(new_mgr).__finalize__(self) + @property + def _using_array_manager(self): + return isinstance(self._mgr, ArrayManager) + # ---------------------------------------------------------------------- # attrs and flags diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 417706fa19ede..25f0493666f06 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -310,9 +310,8 @@ def to_series(right): right, join="outer", axis=axis, level=level, copy=False ) - from pandas.core.internals import BlockManager - - if isinstance(left._mgr, BlockManager): + if not left._using_array_manager: + # for BlockManager maybe broadcast Series to a DataFrame right = _maybe_align_series_as_frame(left, right, axis) return left, right From f53bf766e931b0b1648abeaf9efb72de5c64f644 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 31 Mar 2021 17:39:34 +0200 Subject: [PATCH 4/4] fixes --- pandas/core/ops/array_ops.py | 6 ++++- pandas/tests/frame/test_arithmetic.py | 38 ++++++++++++++++++++------- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 1d7c16de0c05d..405d88fae21a4 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -118,7 +118,10 @@ def _masked_arith_op(x: np.ndarray, y, op): # mask is only meaningful for x result = np.empty(x.size, dtype=x.dtype) - mask = notna(xrav) + if isna(y): + mask = np.zeros(x.size, dtype=bool) + else: + mask = notna(xrav) # 1 ** np.nan is 1. So we have to unmask those. if op is pow: @@ -291,6 +294,7 @@ def na_logical_op(x: np.ndarray, y, op): y = ensure_object(y) result = libops.vec_binop(x.ravel(), y.ravel(), op) else: + x = ensure_object(x) # let null fall thru assert lib.is_scalar(y) if not isna(y): diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index c6816fa6481f4..b0730fca47008 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1204,7 +1204,12 @@ def test_combineFrame(self, float_frame, mixed_float_frame, mixed_int_frame): _check_mixed_float(added, dtype="float64") def test_combine_series( - self, float_frame, mixed_float_frame, mixed_int_frame, datetime_frame + self, + float_frame, + mixed_float_frame, + mixed_int_frame, + datetime_frame, + using_array_manager, ): # Series @@ -1227,7 +1232,14 @@ def test_combine_series( # no upcast needed added = mixed_float_frame + series - assert np.all(added.dtypes == series.dtype) + if using_array_manager: + # INFO(ArrayManager) addition is performed column-wise, and + # apparently in numpy the dtype of array gets priority in + # arithmetic casting rules (array[float32] + scalar[float64] gives + # array[float32] and not array[float64]) + assert np.all(added.dtypes == mixed_float_frame.dtypes) + else: + assert np.all(added.dtypes == series.dtype) # vs mix (upcast) as needed added = mixed_float_frame + series.astype("float32") @@ -1587,7 +1599,7 @@ def test_inplace_ops_identity2(self, op): expected = id(df) assert id(df) == expected - def test_alignment_non_pandas(self): + def test_alignment_non_pandas(self, using_array_manager): index = ["A", "B", "C"] columns = ["X", "Y", "Z"] df = DataFrame(np.random.randn(3, 3), index=index, columns=columns) @@ -1600,13 +1612,21 @@ def test_alignment_non_pandas(self): range(1, 4), ]: - expected = DataFrame({"X": val, "Y": val, "Z": val}, index=df.index) - tm.assert_frame_equal(align(df, val, "index")[1], expected) + if not using_array_manager: + expected = DataFrame({"X": val, "Y": val, "Z": val}, index=df.index) + tm.assert_frame_equal(align(df, val, "index")[1], expected) + else: + expected = Series(val, index=index) + tm.assert_series_equal(align(df, val, "index")[1], expected) - expected = DataFrame( - {"X": [1, 1, 1], "Y": [2, 2, 2], "Z": [3, 3, 3]}, index=df.index - ) - tm.assert_frame_equal(align(df, val, "columns")[1], expected) + if not using_array_manager: + expected = DataFrame( + {"X": [1, 1, 1], "Y": [2, 2, 2], "Z": [3, 3, 3]}, index=df.index + ) + tm.assert_frame_equal(align(df, val, "columns")[1], expected) + else: + expected = Series(val, index=columns) + tm.assert_series_equal(align(df, val, "columns")[1], expected) # length mismatch msg = "Unable to coerce to Series, length must be 3: given 2"