diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 25f847c698278..41ede51b18d27 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -266,6 +266,7 @@ Performance improvements - Performance improvement in :class:`Timedelta` constructor (:issue:`30543`) - Performance improvement in :class:`Timestamp` constructor (:issue:`30543`) - Performance improvement in flex arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=0`` (:issue:`31296`) +- Performance improvement in arithmetic operations between :class:`DataFrame` and :class:`Series` (:issue:`32997`) - The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index, avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`) diff --git a/pandas/core/array_algos/npcompat.py b/pandas/core/array_algos/npcompat.py new file mode 100644 index 0000000000000..b7d71cfb312f2 --- /dev/null +++ b/pandas/core/array_algos/npcompat.py @@ -0,0 +1,34 @@ +""" +Implementations of high-level numpy functions that are ExtensionArray-compatible. +""" +import numpy as np + +from pandas._typing import ArrayLike + + +def tile(arr: ArrayLike, shape) -> ArrayLike: + raise NotImplementedError + + +def broadcast_to(arr: ArrayLike, shape, orient=None) -> ArrayLike: + if isinstance(arr, np.ndarray): + values = arr + else: + # ExtensionArray + values = arr._values_for_factorize()[0] + + # TODO: if we are ndim==size==1 it shouldnt matter whether rowlike/columnlike? 
+ if values.ndim == 1 and orient is not None: + # Support treating a 1-dimensional array as either a row or column + assert orient in ["rowlike", "columnlike"] + if orient == "rowlike": + values = values.reshape(1, -1) + else: + values = values.reshape(-1, 1) + + btvalues = np.broadcast_to(values, shape) + if isinstance(arr, np.ndarray): + result = btvalues + else: + result = type(arr)._from_factorized(btvalues, arr) + return result diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 442d4ca8cef6d..41669eec2e1d0 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -321,7 +321,8 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, int]: @classmethod def _from_factorized(cls, values, original: "BooleanArray") -> "BooleanArray": - return cls._from_sequence(values, dtype=original.dtype) + mask = values == -1 + return cls(values.astype(bool), mask) _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a153b4e06157b..0b59ebc6bc446 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -455,6 +455,12 @@ def ravel(self, *args, **kwargs): data = self._data.ravel(*args, **kwargs) return type(self)(data, dtype=self.dtype) + @property + def T(self): + # Note: we drop any freq + data = self._data.T + return type(self)(data, dtype=self.dtype) + @property def _box_func(self): """ @@ -561,7 +567,7 @@ def __getitem__(self, key): else: key = check_array_indexer(self, key) - is_period = is_period_dtype(self) + is_period = is_period_dtype(self.dtype) if is_period: freq = self.freq else: @@ -577,7 +583,7 @@ def __getitem__(self, key): freq = self.freq result = getitem(key) - if result.ndim > 1: + if result.ndim > 1 and not is_period and not is_datetime64tz_dtype(self.dtype): # To support MPL which performs slicing with 2 dim # even though it only has 1 dim by definition return result @@ 
-1208,9 +1214,13 @@ def _add_timedelta_arraylike(self, other): self_i8 = self.asi8 other_i8 = other.asi8 + # TODO: do we need to worry about these having the same row/column order? new_values = checked_add_with_arr( - self_i8, other_i8, arr_mask=self._isnan, b_mask=other._isnan - ) + self_i8.ravel(), + other_i8.ravel(), + arr_mask=self._isnan.ravel(), + b_mask=other._isnan.ravel(), + ).reshape(self.shape) if self._hasnans or other._hasnans: mask = (self._isnan) | (other._isnan) new_values[mask] = iNaT diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index be9cc53d33d6f..59278df4cf647 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -624,10 +624,20 @@ def _addsub_int_array( assert op in [operator.add, operator.sub] if op is operator.sub: other = -other - res_values = algos.checked_add_with_arr(self.asi8, other, arr_mask=self._isnan) + + mask = self._isnan + if self.ndim == self.size == 1 and other.ndim == 2: + # TODO: more general case? should this be handled by DataFrame + # op before we get here? 
+ arr = np.broadcast_to(self._data[:, None], other.shape) + self = type(self)(arr, freq=self.freq) + + res_values = algos.checked_add_with_arr( + self.asi8.ravel(), other.ravel(), arr_mask=self._isnan, + ) res_values = res_values.view("i8") - res_values[self._isnan] = iNaT - return type(self)(res_values, freq=self.freq) + res_values[mask] = iNaT + return type(self)(res_values.reshape(self.shape), freq=self.freq) def _add_offset(self, other): assert not isinstance(other, Tick) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a25426c5c99cc..ca0d6424eb92b 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -39,7 +39,7 @@ from pandas.core.algorithms import checked_add_with_arr from pandas.core.arrays import datetimelike as dtl import pandas.core.common as com -from pandas.core.construction import extract_array +from pandas.core.construction import array, extract_array from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import Tick @@ -521,8 +521,16 @@ def __truediv__(self, other): # Note: we do not do type inference on the result, so either # an object array or numeric-dtyped (if numpy does inference) # will be returned. GH#23829 + # FIXME: the above comment is no longer accurate... 
sometimes result = [self[n] / other[n] for n in range(len(self))] result = np.array(result) + if self.ndim == 2: + # FIXME: kludge, just trying to get the tests passing + res = extract_array(array(result.ravel()), extract_numpy=True) + result = res.reshape(result.shape) + if result.dtype.kind == "m": + # TODO: no real reason for this, but we test it + result = np.asarray(result) return result else: diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 8aaf828787179..bb5f9d6061fb2 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -32,6 +32,7 @@ validate_dtype_freq, ) import pandas.core.common as com +from pandas.core.indexers import deprecate_ndim_indexing import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( InvalidIndexError, @@ -350,6 +351,17 @@ def _int64index(self) -> Int64Index: # ------------------------------------------------------------------------ # Index Methods + def __getitem__(self, key): + # PeriodArray.__getitem__ returns PeriodArray for 2D lookups, + # so we need to issue deprecation warning and cast here + result = super().__getitem__(key) + + if isinstance(result, PeriodIndex) and result._data.ndim == 2: + # this is not actually a valid Index object + deprecate_ndim_indexing(result._data) + return result._data._data + return result + def __array_wrap__(self, result, context=None): """ Gets called after a ufunc. 
Needs additional handling as diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 182a5b14a1242..0b782256bafd8 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -410,11 +410,7 @@ def apply(self: T, f, filter=None, align_keys=None, **kwargs) -> T: if f == "where": align_copy = True - aligned_args = { - k: kwargs[k] - for k in align_keys - if isinstance(kwargs[k], (ABCSeries, ABCDataFrame)) - } + aligned_args = {k: kwargs[k] for k in align_keys} for b in self.blocks: if filter is not None: @@ -426,8 +422,20 @@ def apply(self: T, f, filter=None, align_keys=None, **kwargs) -> T: b_items = self.items[b.mgr_locs.indexer] for k, obj in aligned_args.items(): - axis = obj._info_axis_number - kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)._values + if isinstance(obj, (ABCSeries, ABCDataFrame)): + axis = obj._info_axis_number + kwargs[k] = obj.reindex( + b_items, axis=axis, copy=align_copy + )._values + else: + # We should have an ndarray or ExtensionArray + if obj.ndim == 2: + # FIXME: kludge; shouldnt need the ndim restriction + assert obj.shape[0] == self.shape[0], ( + obj.shape, + self.shape, + ) + kwargs[k] = obj[b.mgr_locs.indexer] if callable(f): applied = b.apply(f, **kwargs) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 10dcb59977cdd..b35a382ef9224 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -17,6 +17,7 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna +from pandas.core.array_algos.npcompat import broadcast_to from pandas.core.construction import extract_array from pandas.core.ops.array_ops import ( arithmetic_op, @@ -333,6 +334,16 @@ def column_op(a, b): # in which case we specifically want to operate row-by-row assert right.index.equals(left.columns) + rvals = right._values + if hasattr(rvals, "reshape"): + # i.e. 
ndarray, DatetimeArray, TimedeltaArray, PeriodArray + right = broadcast_to(rvals, left.shape, orient="rowlike").T + + array_op = get_array_op(func, str_rep=str_rep) + bm = left._data.apply(array_op, right=right, align_keys=["right"]) + return type(left)(bm) + + # still needed for two tests with PeriodArray if right.dtype == "timedelta64[ns]": # ensure we treat NaT values as the correct dtype # Note: we do not do this unconditionally as it may be lossy or @@ -343,7 +354,7 @@ def column_op(a, b): return {i: func(a.iloc[:, i], b[i]) for i in range(len(a.columns))} else: - + # FIXME: this will be wrong for Categorical `b` def column_op(a, b): return {i: func(a.iloc[:, i], b.iloc[i]) for i in range(len(a.columns))} diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index f7211ab5f9fd4..90d2f7d2930bf 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1065,6 +1065,7 @@ def test_dt64arr_add_sub_parr( "unsupported operand", "descriptor.*requires", "ufunc.*cannot use operands", + "Addition/subtraction of integers and integer-arrays", ] ) assert_invalid_addsub_type(dtarr, parr, msg) @@ -1417,7 +1418,10 @@ def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, box_with_array): other = np.array([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) - warn = None if box_with_array is pd.DataFrame else PerformanceWarning + warn = PerformanceWarning + if box_with_array is pd.DataFrame and tz is not None: + warn = None + with tm.assert_produces_warning(warn): res = dtarr + other expected = DatetimeIndex( @@ -2378,7 +2382,10 @@ def test_dti_addsub_object_arraylike( expected = pd.DatetimeIndex(["2017-01-31", "2017-01-06"], tz=tz_naive_fixture) expected = tm.box_expected(expected, xbox) - warn = None if box_with_array is pd.DataFrame else PerformanceWarning + warn = PerformanceWarning + if box_with_array is pd.DataFrame and tz is not None: + warn = None + with 
tm.assert_produces_warning(warn): result = dtarr + other tm.assert_equal(result, expected) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index beb16c9549cc4..4dba0329de59b 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -1066,7 +1066,13 @@ def test_td64arr_sub_periodlike(self, box_with_array, tdi_freq, pi_freq): # TODO: parametrize over box for pi? tdi = tm.box_expected(tdi, box_with_array) - msg = "cannot subtract|unsupported operand type" + msg = "|".join( + [ + "cannot subtract", + "unsupported operand type", + "Addition/subtraction of integers and integer-arrays", + ] + ) with pytest.raises(TypeError, match=msg): tdi - pi @@ -1318,14 +1324,11 @@ def test_td64arr_add_offset_index(self, names, box): tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, box) - # The DataFrame operation is transposed and so operates as separate - # scalar operations, which do not issue a PerformanceWarning - warn = PerformanceWarning if box is not pd.DataFrame else None - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): res = tdi + other tm.assert_equal(res, expected) - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): res2 = other + tdi tm.assert_equal(res2, expected) @@ -1344,14 +1347,11 @@ def test_td64arr_add_offset_array(self, box_with_array): tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, box) - # The DataFrame operation is transposed and so operates as separate - # scalar operations, which do not issue a PerformanceWarning - warn = PerformanceWarning if box is not pd.DataFrame else None - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): res = tdi + other tm.assert_equal(res, expected) - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): res2 = 
other + tdi tm.assert_equal(res2, expected) @@ -1380,10 +1380,7 @@ def test_td64arr_sub_offset_index(self, names, box_with_array): tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, xbox) - # The DataFrame operation is transposed and so operates as separate - # scalar operations, which do not issue a PerformanceWarning - warn = PerformanceWarning if box is not pd.DataFrame else None - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): res = tdi - other tm.assert_equal(res, expected) @@ -1399,10 +1396,7 @@ def test_td64arr_sub_offset_array(self, box_with_array): tdi = tm.box_expected(tdi, box_with_array) expected = tm.box_expected(expected, box_with_array) - # The DataFrame operation is transposed and so operates as separate - # scalar operations, which do not issue a PerformanceWarning - warn = None if box_with_array is pd.DataFrame else PerformanceWarning - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): res = tdi - other tm.assert_equal(res, expected) @@ -1473,28 +1467,31 @@ def test_td64arr_add_sub_object_array(self, box_with_array): [pd.Timedelta(days=1), pd.offsets.Day(2), pd.Timestamp("2000-01-04")] ) - warn = PerformanceWarning if box_with_array is not pd.DataFrame else None - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): result = tdarr + other expected = pd.Index( [pd.Timedelta(days=2), pd.Timedelta(days=4), pd.Timestamp("2000-01-07")] ) expected = tm.box_expected(expected, box_with_array) + if box_with_array is pd.DataFrame: + expected = expected.astype(object) tm.assert_equal(result, expected) msg = "unsupported operand type|cannot subtract a datelike" with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): tdarr - other - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): result 
= other - tdarr expected = pd.Index( [pd.Timedelta(0), pd.Timedelta(0), pd.Timestamp("2000-01-01")] ) expected = tm.box_expected(expected, box_with_array) + if box_with_array is pd.DataFrame: + expected = expected.astype(object) tm.assert_equal(result, expected) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index ec21898852888..3e82e9d9fa37f 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -324,3 +324,10 @@ def test_transpose(self, data): self.assert_frame_equal(result, expected) self.assert_frame_equal(np.transpose(np.transpose(df)), df) self.assert_frame_equal(np.transpose(np.transpose(df[["A"]])), df[["A"]]) + + def test_factorize_roundtrip(self, data): + # GH#32673 + values = data._values_for_factorize()[0] + result = type(data)._from_factorized(values, data) + + self.assert_equal(result, data) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 1f026e405dc17..d576228674968 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -65,7 +65,9 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): @classmethod def _from_factorized(cls, values, original): - return cls([UserDict(x) for x in values if x != ()]) + return cls( + [UserDict(x) if x != () else original.dtype.na_value for x in values] + ) def __getitem__(self, item): if isinstance(item, numbers.Integral): diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index 3aa188098620d..38666a1709092 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -4,6 +4,7 @@ from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd +import pandas._testing as tm from pandas.core.arrays import DatetimeArray from pandas.tests.extension import base @@ -201,6 +202,13 @@ def test_unstack(self, obj): result = ser.unstack(0) 
self.assert_equal(result, expected) + def test_factorize_roundtrip(self, data): + # GH#32673, for DTA we dont preserve freq + values = data._values_for_factorize()[0] + result = type(data)._from_factorized(values, data) + + tm.assert_numpy_array_equal(result.asi8, data.asi8) + class TestSetitem(BaseDatetimeTests, base.BaseSetitemTests): pass diff --git a/pandas/tests/frame/common.py b/pandas/tests/frame/common.py index 463a140972ab5..778f07ee6223c 100644 --- a/pandas/tests/frame/common.py +++ b/pandas/tests/frame/common.py @@ -6,13 +6,13 @@ def _check_mixed_float(df, dtype=None): elif isinstance(dtype, dict): dtypes.update(dtype) if dtypes.get("A"): - assert df.dtypes["A"] == dtypes["A"] + assert df.dtypes["A"] == dtypes["A"], (df.dtypes, dtypes) if dtypes.get("B"): - assert df.dtypes["B"] == dtypes["B"] + assert df.dtypes["B"] == dtypes["B"], (df.dtypes, dtypes) if dtypes.get("C"): - assert df.dtypes["C"] == dtypes["C"] + assert df.dtypes["C"] == dtypes["C"], (df.dtypes, dtypes) if dtypes.get("D"): - assert df.dtypes["D"] == dtypes["D"] + assert df.dtypes["D"] == dtypes["D"], (df.dtypes, dtypes) def _check_mixed_int(df, dtype=None): @@ -22,10 +22,10 @@ def _check_mixed_int(df, dtype=None): elif isinstance(dtype, dict): dtypes.update(dtype) if dtypes.get("A"): - assert df.dtypes["A"] == dtypes["A"] + assert df.dtypes["A"] == dtypes["A"], (df.dtypes, dtypes) if dtypes.get("B"): - assert df.dtypes["B"] == dtypes["B"] + assert df.dtypes["B"] == dtypes["B"], (df.dtypes, dtypes) if dtypes.get("C"): - assert df.dtypes["C"] == dtypes["C"] + assert df.dtypes["C"] == dtypes["C"], (df.dtypes, dtypes) if dtypes.get("D"): - assert df.dtypes["D"] == dtypes["D"] + assert df.dtypes["D"] == dtypes["D"], (df.dtypes, dtypes) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 89f8bc433419b..4f5cedb189085 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -613,13 +613,6 @@ def 
test_df_arith_2d_array_rowlike_broadcasts(self, all_arithmetic_operators): expected = pd.DataFrame(exvals, columns=df.columns, index=df.index) - if opname in ["__rmod__", "__rfloordiv__"]: - # exvals will have dtypes [f8, i8, i8] so expected will be - # all-f8, but the DataFrame operation will return mixed dtypes - # use exvals[-1].dtype instead of "i8" for compat with 32-bit - # systems/pythons - expected[False] = expected[False].astype(exvals[-1].dtype) - result = getattr(df, opname)(rowlike) tm.assert_frame_equal(result, expected) @@ -1040,9 +1033,8 @@ def test_combine_series( assert "E" in larger_added assert np.isnan(larger_added["E"]).all() - # no upcast needed added = mixed_float_frame + series - _check_mixed_float(added) + assert np.all(added.dtypes == series.dtype) # vs mix (upcast) as needed added = mixed_float_frame + series.astype("float32") diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 1340f514e31ce..ba1b3e9d0ca8e 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -266,23 +266,24 @@ def test_scalar_na_logical_ops_corners(self): result = s & list(s) tm.assert_series_equal(result, expected) + def test_scalar_na_logical_ops_corners_aligns(self): + s = Series([2, 3, 4, 5, 6, 7, 8, 9, datetime(2005, 1, 1)]) + s[::2] = np.nan d = DataFrame({"A": s}) - # TODO: Fix this exception - needs to be fixed! 
(see GH5035) - # (previously this was a TypeError because series returned - # NotImplemented - # this is an alignment issue; these are equivalent - # https://github.com/pandas-dev/pandas/issues/5284 + expected = DataFrame(False, index=range(9), columns=["A"] + list(range(9))) - with pytest.raises(TypeError): - d.__and__(s, axis="columns") - with pytest.raises(TypeError): - d.__and__(s, axis=1) + result = d.__and__(s, axis="columns") + tm.assert_frame_equal(result, expected) - with pytest.raises(TypeError): - s & d - with pytest.raises(TypeError): - d & s + result = d.__and__(s, axis=1) + tm.assert_frame_equal(result, expected) + + result = s & d + tm.assert_frame_equal(result, expected) + + result = d & s + tm.assert_frame_equal(result, expected) expected = (s & s).to_frame("A") result = d.__and__(s, axis="index")