From d6bf5055015d1a94a39e5d078f806088192d54f5 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 15 Nov 2022 14:41:23 -0800 Subject: [PATCH 1/6] API: dont do type inference on arithmetic results --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/_libs/tslibs/timedeltas.pyx | 90 +++++++++++++++++++++ pandas/core/arrays/timedeltas.py | 61 +++++--------- pandas/core/indexes/base.py | 6 +- pandas/core/ops/__init__.py | 34 ++++++-- pandas/core/series.py | 7 +- pandas/tests/arithmetic/test_numeric.py | 2 + pandas/tests/arithmetic/test_object.py | 16 +++- pandas/tests/arithmetic/test_timedelta64.py | 50 ++++++++++-- 9 files changed, 207 insertions(+), 60 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 032bcf09244e5..85edcc8d8d705 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -341,6 +341,7 @@ Other API changes - Passing a sequence containing a type that cannot be converted to :class:`Timedelta` to :func:`to_timedelta` or to the :class:`Series` or :class:`DataFrame` constructor with ``dtype="timedelta64[ns]"`` or to :class:`TimedeltaIndex` now raises ``TypeError`` instead of ``ValueError`` (:issue:`49525`) - Changed behavior of :class:`Index` constructor with sequence containing at least one ``NaT`` and everything else either ``None`` or ``NaN`` to infer ``datetime64[ns]`` dtype instead of ``object``, matching :class:`Series` behavior (:issue:`49340`) - Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`) +- Changed behavior of :class:`Index`, :class:`Series`, and :class:`DataFrame` arithmetic methods when working with object-dtypes, the results no longer do type inference on the result of the array operations (:issue:`??`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 071cfb7cf541a..cef0f98dd9d41 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1996,6 +1996,96 @@ class Timedelta(_Timedelta): return div, other - div * self +def truediv_object_array(ndarray left, ndarray right): + cdef: + ndarray[object] result = np.empty((left).shape, dtype=object) + object td64 # really timedelta64 if we find a way to declare that + object obj, res_value + _Timedelta td + Py_ssize_t i + bint seen_numeric = False, seen_td = False + + for i in range(len(left)): + td64 = left[i] + obj = right[i] + + if get_timedelta64_value(td64) == NPY_NAT: + # td here should be interpreted as a td64 NaT + if _should_cast_to_timedelta(obj): + res_value = np.nan + seen_numeric = True + else: + # if its a number then let numpy handle division, otherwise + # numpy will raise + res_value = td64 / obj + seen_td = True + else: + td = Timedelta(td64) + res_value = td / obj + if is_float_object(res_value) or is_integer_object(res_value): + seen_numeric = True + else: + seen_td = True + + result[i] = res_value + + if not seen_numeric: + # if we haven't seen any numeric results, we have all-td64, so we + # can cast back + return result.astype(left.dtype) + elif not seen_td: + # if we haven't seen any timedelta results, we have all-numeric, and + # can cast + return result.astype(np.float64) + return result + + +def floordiv_object_array(ndarray left, ndarray right): + cdef: + ndarray[object] result = np.empty((left).shape, dtype=object) + object td64 # really timedelta64 if we find a way to declare that + object obj, res_value + _Timedelta td + Py_ssize_t i + bint seen_numeric = False, seen_td = False + + for i in range(len(left)): + td64 = left[i] + obj = right[i] + + if get_timedelta64_value(td64) == NPY_NAT: + # td here should be interpreted as a td64 NaT + if 
_should_cast_to_timedelta(obj): + res_value = np.nan + seen_numeric = True + else: + # if its a number then let numpy handle division, otherwise + # numpy will raise + res_value = td64 // obj + seen_td = True + else: + td = Timedelta(td64) + res_value = td // obj + if is_float_object(res_value) or is_integer_object(res_value): + seen_numeric = True + else: + seen_td = True + + result[i] = res_value + + # We can't leave this inference to numpy because it will see [td64, int] + # and cast that to all-td64 + if not seen_numeric: + # if we haven't seen any numeric results, we have all-td64, so we + # can cast back + return result.astype(left.dtype) + elif not seen_td: + # if we haven't seen any timedelta results, we have all-numeric, and + # can cast + return result.astype(np.int64) + return result + + cdef bint is_any_td_scalar(object obj): """ Cython equivalent for `isinstance(obj, (timedelta, np.timedelta64, Tick))` diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index fe7cade1711d0..6cbfd8f418c6e 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -32,8 +32,10 @@ from pandas._libs.tslibs.fields import get_timedelta_field from pandas._libs.tslibs.timedeltas import ( array_to_timedelta64, + floordiv_object_array, ints_to_pytimedelta, parse_timedelta_unit, + truediv_object_array, ) from pandas._typing import ( AxisInt, @@ -57,12 +59,14 @@ is_timedelta64_dtype, pandas_dtype, ) +from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.missing import isna from pandas.core import nanops from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com +from pandas.core.construction import extract_array from pandas.core.ops.common import unpack_zerodim_and_defer if TYPE_CHECKING: @@ -495,31 +499,16 @@ def __truediv__(self, other): return self._ndarray / other elif is_object_dtype(other.dtype): - # We operate 
on raveled arrays to avoid problems in inference - # on NaT - # TODO: tests with non-nano - srav = self.ravel() - orav = other.ravel() - result_list = [srav[n] / orav[n] for n in range(len(srav))] - result = np.array(result_list).reshape(self.shape) - - # We need to do dtype inference in order to keep DataFrame ops - # behavior consistent with Series behavior - inferred = lib.infer_dtype(result, skipna=False) - if inferred == "timedelta": - flat = result.ravel() - result = type(self)._from_sequence(flat).reshape(result.shape) - elif inferred == "floating": - result = result.astype(float) - elif inferred == "datetime": - # GH#39750 this occurs when result is all-NaT, in which case - # we want to interpret these NaTs as td64. - # We construct an all-td64NaT result. - # error: Incompatible types in assignment (expression has type - # "TimedeltaArray", variable has type "ndarray[Any, - # dtype[floating[_64Bit]]]") - result = self * np.nan # type: ignore[assignment] + other = extract_array(other, extract_numpy=True) + if self.ndim > 1: + res_cols = [left / right for left, right in zip(self, other)] + res_cols2 = [x.reshape(1, -1) for x in res_cols] + result = concat_compat(res_cols2, axis=0) + else: + result = truediv_object_array(self._ndarray, other) + if result.dtype.kind == "m": + result = type(self)(result) return result else: @@ -619,24 +608,16 @@ def __floordiv__(self, other): return result elif is_object_dtype(other.dtype): + other = extract_array(other, extract_numpy=True) # error: Incompatible types in assignment (expression has type # "List[Any]", variable has type "ndarray") - srav = self.ravel() - orav = other.ravel() - res_list = [srav[n] // orav[n] for n in range(len(srav))] - result_flat = np.asarray(res_list) - inferred = lib.infer_dtype(result_flat, skipna=False) - - result = result_flat.reshape(self.shape) - - if inferred == "timedelta": - result, _ = sequence_to_td64ns(result) - return type(self)(result) - if inferred == "datetime": - # GH#39750 
occurs when result is all-NaT, which in this - # case should be interpreted as td64nat. This can only - # occur when self is all-td64nat - return self * np.nan + if self.ndim > 1: + result = np.array([left // right for left, right in zip(self, other)]) + else: + result = floordiv_object_array(self._ndarray, other) + + if result.dtype.kind == "m": + result = type(self)(result) return result elif is_integer_dtype(other.dtype) or is_float_dtype(other.dtype): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 068ff7a0bf1c9..f78cbfd614b4f 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6672,10 +6672,10 @@ def _logical_method(self, other, op): def _construct_result(self, result, name): if isinstance(result, tuple): return ( - Index._with_infer(result[0], name=name), - Index._with_infer(result[1], name=name), + Index._with_infer(result[0], name=name, dtype=result[0].dtype), + Index._with_infer(result[1], name=name, dtype=result[1].dtype), ) - return Index._with_infer(result, name=name) + return Index(result, name=name, dtype=result.dtype) def _arith_method(self, other, op): if ( diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 4007d3cfa46da..20835feffadde 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -232,18 +232,27 @@ def align_method_FRAME( def to_series(right): msg = "Unable to coerce to Series, length must be {req_len}: given {given_len}" + + # pass dtype to avoid doing inference, which would break consistency + # with Index/Series ops + dtype = None + if getattr(right, "dtype", None) == object: + # can't pass right.dtype unconditionally as that would break on e.g. 
+ # datetime64[h] ndarray + dtype = object + if axis is not None and left._get_axis_name(axis) == "index": if len(left.index) != len(right): raise ValueError( msg.format(req_len=len(left.index), given_len=len(right)) ) - right = left._constructor_sliced(right, index=left.index) + right = left._constructor_sliced(right, index=left.index, dtype=dtype) else: if len(left.columns) != len(right): raise ValueError( msg.format(req_len=len(left.columns), given_len=len(right)) ) - right = left._constructor_sliced(right, index=left.columns) + right = left._constructor_sliced(right, index=left.columns, dtype=dtype) return right if isinstance(right, np.ndarray): @@ -252,13 +261,25 @@ def to_series(right): right = to_series(right) elif right.ndim == 2: + # We need to pass dtype=right.dtype to retain object dtype + # otherwise we lose consistency with Index and array ops + dtype = None + if getattr(right, "dtype", None) == object: + # can't pass right.dtype unconditionally as that would break on e.g. + # datetime64[h] ndarray + dtype = object + if right.shape == left.shape: - right = left._constructor(right, index=left.index, columns=left.columns) + right = left._constructor( + right, index=left.index, columns=left.columns, dtype=dtype + ) elif right.shape[0] == left.shape[0] and right.shape[1] == 1: # Broadcast across columns right = np.broadcast_to(right, left.shape) - right = left._constructor(right, index=left.index, columns=left.columns) + right = left._constructor( + right, index=left.index, columns=left.columns, dtype=dtype + ) elif right.shape[1] == left.shape[1] and right.shape[0] == 1: # Broadcast along rows @@ -409,7 +430,10 @@ def _maybe_align_series_as_frame(frame: DataFrame, series: Series, axis: AxisInt rvalues = rvalues.reshape(1, -1) rvalues = np.broadcast_to(rvalues, frame.shape) - return type(frame)(rvalues, index=frame.index, columns=frame.columns) + # pass dtype to avoid doing inference + return type(frame)( + rvalues, index=frame.index, 
columns=frame.columns, dtype=series.dtype + ) def flex_arith_method_FRAME(op): diff --git a/pandas/core/series.py b/pandas/core/series.py index 44732b9060ff9..59713c5c36197 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2996,9 +2996,10 @@ def _construct_result( assert isinstance(res2, Series) return (res1, res2) - # We do not pass dtype to ensure that the Series constructor - # does inference in the case where `result` has object-dtype. - out = self._constructor(result, index=self.index) + # We pass dtype to ensure the constructor does not do dtype inference + out = self._constructor( + result, index=self.index, dtype=getattr(result, "dtype", None) + ) out = out.__finalize__(self) # Set the result's name after __finalize__ is called because __finalize__ diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 529dd6baa70c0..1311d34c4c0f5 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -320,6 +320,8 @@ def test_add_sub_datetimedeltalike_invalid( r"operand type\(s\) all returned NotImplemented from __array_ufunc__", "can only perform ops with numeric values", "cannot subtract DatetimeArray from ndarray", + # pd.Timedelta(1) + Index([0, 1, 2]) + "Cannot add or subtract Timedelta from integers", ] ) assert_invalid_addsub_type(left, other, msg) diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index e107ff6b65c0f..f6a0d6b61f080 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -182,12 +182,14 @@ def test_operators_na_handling(self): @pytest.mark.parametrize("dtype", [None, object]) def test_series_with_dtype_radd_timedelta(self, dtype): # note this test is _not_ aimed at timedelta64-dtyped Series + # as of 2.0 we retain object dtype when ser.dtype == object ser = Series( [pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.Timedelta("3 days")], dtype=dtype, ) 
expected = Series( - [pd.Timedelta("4 days"), pd.Timedelta("5 days"), pd.Timedelta("6 days")] + [pd.Timedelta("4 days"), pd.Timedelta("5 days"), pd.Timedelta("6 days")], + dtype=dtype, ) result = pd.Timedelta("3 days") + ser @@ -227,7 +229,10 @@ def test_mixed_timezone_series_ops_object(self): name="xxx", ) assert ser2.dtype == object - exp = Series([pd.Timedelta("2 days"), pd.Timedelta("4 days")], name="xxx") + # as of 2.0 we preserve object dtype + exp = Series( + [pd.Timedelta("2 days"), pd.Timedelta("4 days")], name="xxx", dtype=object + ) tm.assert_series_equal(ser2 - ser, exp) tm.assert_series_equal(ser - ser2, -exp) @@ -238,7 +243,12 @@ def test_mixed_timezone_series_ops_object(self): ) assert ser.dtype == object - exp = Series([pd.Timedelta("01:30:00"), pd.Timedelta("02:30:00")], name="xxx") + # as of 2.0 we preserve object dtype + exp = Series( + [pd.Timedelta("01:30:00"), pd.Timedelta("02:30:00")], + name="xxx", + dtype=object, + ) tm.assert_series_equal(ser + pd.Timedelta("00:30:00"), exp) tm.assert_series_equal(pd.Timedelta("00:30:00") + ser, exp) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 14d50acf3eadf..10ab332e739ec 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -31,6 +31,7 @@ Int64Index, UInt64Index, ) +from pandas.core.arrays import PandasArray from pandas.tests.arithmetic.common import ( assert_invalid_addsub_type, assert_invalid_comparison, @@ -1413,7 +1414,8 @@ def test_td64arr_add_sub_object_array(self, box_with_array): expected = pd.Index( [Timedelta(days=2), Timedelta(days=4), Timestamp("2000-01-07")] ) - expected = tm.box_expected(expected, xbox) + # as of 2.0 we preserve object dtype in the DataFrame case + expected = tm.box_expected(expected, xbox).astype(object) tm.assert_equal(result, expected) msg = "unsupported operand type|cannot subtract a datelike" @@ -1425,7 +1427,8 @@ def 
test_td64arr_add_sub_object_array(self, box_with_array): result = other - tdarr expected = pd.Index([Timedelta(0), Timedelta(0), Timestamp("2000-01-01")]) - expected = tm.box_expected(expected, xbox) + # as of 2.0 we preserve object dtype in the DataFrame case + expected = tm.box_expected(expected, xbox).astype(object) tm.assert_equal(result, expected) @@ -1698,6 +1701,39 @@ def test_tdarr_div_length_mismatch(self, box_with_array): with pytest.raises(ValueError, match=msg): other / rng + def test_td64_div_object_mixed_result(self, box_with_array): + # Case where having a NaT in the result instead of timedelta64("NaT") + # is misleading + orig = timedelta_range("1 Day", periods=3).insert(1, NaT) + tdi = tm.box_expected(orig, box_with_array, transpose=False) + + other = np.array([orig[0], 1.5, 2.0, orig[2]], dtype=object) + other = tm.box_expected(other, box_with_array, transpose=False) + + res = tdi / other + + expected = pd.Index( + [1.0, np.timedelta64("NaT", "ns"), orig[0], 1.5], dtype=object + ) + expected = tm.box_expected(expected, box_with_array, transpose=False) + if isinstance(expected, PandasArray): + expected = expected.to_numpy() + tm.assert_equal(res, expected) + if box_with_array is DataFrame: + # We have a np.timedelta64(NaT), not pd.NaT + assert isinstance(res.iloc[1, 0], np.timedelta64) + + res = tdi // other + + expected = pd.Index([1, np.timedelta64("NaT", "ns"), orig[0], 1], dtype=object) + expected = tm.box_expected(expected, box_with_array, transpose=False) + if isinstance(expected, PandasArray): + expected = expected.to_numpy() + tm.assert_equal(res, expected) + if box_with_array is DataFrame: + # We have a np.timedelta64(NaT), not pd.NaT + assert isinstance(res.iloc[1, 0], np.timedelta64) + # ------------------------------------------------------------------ # __floordiv__, __rfloordiv__ @@ -1991,9 +2027,8 @@ def test_td64arr_div_numeric_array( expected = [tdser.iloc[0, n] / vector[n] for n in range(len(vector))] else: expected = [tdser[n] 
/ vector[n] for n in range(len(tdser))] - expected = pd.Index(expected) # do dtype inference + expected = pd.Index(expected) expected = tm.box_expected(expected, xbox) - assert tm.get_dtype(expected) == "m8[ns]" tm.assert_equal(result, expected) @@ -2061,11 +2096,14 @@ def test_td64arr_all_nat_div_object_dtype_numeric(self, box_with_array): left = tm.box_expected(tdi, box_with_array) right = np.array([2, 2.0], dtype=object) + expected = tdi + expected = tm.box_expected(expected, box_with_array) + result = left / right - tm.assert_equal(result, left) + tm.assert_equal(result, expected) result = left // right - tm.assert_equal(result, left) + tm.assert_equal(result, expected) class TestTimedelta64ArrayLikeArithmetic: From b46e3375183470ded1c86453a368d0245316f623 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 23 Nov 2022 11:03:29 -0800 Subject: [PATCH 2/6] mypy fixup --- pandas/_libs/tslibs/timedeltas.pyi | 6 ++++++ pandas/core/arrays/timedeltas.py | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi index f41bea11985f2..c9904e4592329 100644 --- a/pandas/_libs/tslibs/timedeltas.pyi +++ b/pandas/_libs/tslibs/timedeltas.pyi @@ -76,6 +76,12 @@ def delta_to_nanoseconds( reso: int = ..., # NPY_DATETIMEUNIT round_ok: bool = ..., ) -> int: ... +def floordiv_object_array( + left: np.ndarray, right: npt.NDArray[np.object_] +) -> np.ndarray: ... +def truediv_object_array( + left: np.ndarray, right: npt.NDArray[np.object_] +) -> np.ndarray: ... 
class Timedelta(timedelta): _creso: int diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 9c69a3b3ab7e8..a31f5596de2d9 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -523,7 +523,7 @@ def __truediv__(self, other): result = truediv_object_array(self._ndarray, other) if result.dtype.kind == "m": - result = type(self)(result) + return type(self)(result) return result else: @@ -632,7 +632,7 @@ def __floordiv__(self, other): result = floordiv_object_array(self._ndarray, other) if result.dtype.kind == "m": - result = type(self)(result) + return type(self)(result) return result elif is_integer_dtype(other.dtype) or is_float_dtype(other.dtype): From a84676341d08aaf79a939316d6660a30622c6c02 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 29 Nov 2022 16:58:49 -0800 Subject: [PATCH 3/6] use concat_compat --- pandas/core/arrays/timedeltas.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a31f5596de2d9..5843259b3933c 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -624,10 +624,10 @@ def __floordiv__(self, other): elif is_object_dtype(other.dtype): other = extract_array(other, extract_numpy=True) - # error: Incompatible types in assignment (expression has type - # "List[Any]", variable has type "ndarray") if self.ndim > 1: - result = np.array([left // right for left, right in zip(self, other)]) + res_cols = [left // right for left, right in zip(self, other)] + res_cols2 = [x.reshape(1, -1) for x in res_cols] + result = concat_compat(res_cols2, axis=0) else: result = floordiv_object_array(self._ndarray, other) From 40c5e768d8e6ec7f9d99a2f3356c0511ba3871af Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 1 Dec 2022 10:38:24 -0800 Subject: [PATCH 4/6] dont infer in TimedeltaArray --- pandas/_libs/tslibs/timedeltas.pyx | 32 --------------------- 
pandas/_testing/__init__.py | 16 ++++++----- pandas/core/arrays/timedeltas.py | 9 ++---- pandas/tests/arithmetic/test_timedelta64.py | 15 ++++++---- 4 files changed, 21 insertions(+), 51 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index eaddb0e520b56..c17bbec3fd901 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -2030,7 +2030,6 @@ def truediv_object_array(ndarray left, ndarray right): object obj, res_value _Timedelta td Py_ssize_t i - bint seen_numeric = False, seen_td = False for i in range(len(left)): td64 = left[i] @@ -2040,30 +2039,16 @@ def truediv_object_array(ndarray left, ndarray right): # td here should be interpreted as a td64 NaT if _should_cast_to_timedelta(obj): res_value = np.nan - seen_numeric = True else: # if its a number then let numpy handle division, otherwise # numpy will raise res_value = td64 / obj - seen_td = True else: td = Timedelta(td64) res_value = td / obj - if is_float_object(res_value) or is_integer_object(res_value): - seen_numeric = True - else: - seen_td = True result[i] = res_value - if not seen_numeric: - # if we haven't seen any numeric results, we have all-td64, so we - # can cast back - return result.astype(left.dtype) - elif not seen_td: - # if we haven't seen any timedelta results, we have all-numeric, and - # can cast - return result.astype(np.float64) return result @@ -2074,7 +2059,6 @@ def floordiv_object_array(ndarray left, ndarray right): object obj, res_value _Timedelta td Py_ssize_t i - bint seen_numeric = False, seen_td = False for i in range(len(left)): td64 = left[i] @@ -2084,32 +2068,16 @@ def floordiv_object_array(ndarray left, ndarray right): # td here should be interpreted as a td64 NaT if _should_cast_to_timedelta(obj): res_value = np.nan - seen_numeric = True else: # if its a number then let numpy handle division, otherwise # numpy will raise res_value = td64 // obj - seen_td = True else: td = Timedelta(td64) 
res_value = td // obj - if is_float_object(res_value) or is_integer_object(res_value): - seen_numeric = True - else: - seen_td = True result[i] = res_value - # We can't leave this inference to numpy because it will see [td64, int] - # and cast that to all-td64 - if not seen_numeric: - # if we haven't seen any numeric results, we have all-td64, so we - # can cast back - return result.astype(left.dtype) - elif not seen_td: - # if we haven't seen any timedelta results, we have all-numeric, and - # can cast - return result.astype(np.int64) return result diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 02ee13d60427e..d1f9610eaa545 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -254,7 +254,7 @@ def equalContents(arr1, arr2) -> bool: return frozenset(arr1) == frozenset(arr2) -def box_expected(expected, box_cls, transpose: bool = True): +def box_expected(expected, box_cls, transpose: bool = True, dtype=None): """ Helper function to wrap the expected output of a test in a given box_class. @@ -270,24 +270,26 @@ def box_expected(expected, box_cls, transpose: bool = True): if box_cls is pd.array: if isinstance(expected, RangeIndex): # pd.array would return an IntegerArray - expected = PandasArray(np.asarray(expected._values)) + expected = PandasArray(np.asarray(expected._values, dtype=dtype)) else: - expected = pd.array(expected, copy=False) + expected = pd.array(expected, copy=False, dtype=dtype) elif box_cls is Index: - expected = Index._with_infer(expected) + expected = Index(expected, dtype=dtype) elif box_cls is Series: - expected = Series(expected) + expected = Series(expected, dtype=dtype) elif box_cls is DataFrame: - expected = Series(expected).to_frame() + expected = Series(expected, dtype=dtype).to_frame() if transpose: # for vector operations, we need a DataFrame to be a single-row, # not a single-column, in order to operate against non-DataFrame # vectors of the same length. 
But convert to two rows to avoid # single-row special cases in datetime arithmetic expected = expected.T + if dtype is not None: + expected = expected.astype(dtype) expected = pd.concat([expected] * 2, ignore_index=True) elif box_cls is np.ndarray or box_cls is np.array: - expected = np.array(expected) + expected = np.array(expected, dtype=dtype) elif box_cls is to_array: expected = to_array(expected) else: diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 5843259b3933c..6e4d321cb28f2 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -59,7 +59,6 @@ is_timedelta64_dtype, pandas_dtype, ) -from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.missing import isna from pandas.core import nanops @@ -518,12 +517,10 @@ def __truediv__(self, other): if self.ndim > 1: res_cols = [left / right for left, right in zip(self, other)] res_cols2 = [x.reshape(1, -1) for x in res_cols] - result = concat_compat(res_cols2, axis=0) + result = np.concatenate(res_cols2, axis=0) else: result = truediv_object_array(self._ndarray, other) - if result.dtype.kind == "m": - return type(self)(result) return result else: @@ -627,12 +624,10 @@ def __floordiv__(self, other): if self.ndim > 1: res_cols = [left // right for left, right in zip(self, other)] res_cols2 = [x.reshape(1, -1) for x in res_cols] - result = concat_compat(res_cols2, axis=0) + result = np.concatenate(res_cols2, axis=0) else: result = floordiv_object_array(self._ndarray, other) - if result.dtype.kind == "m": - return type(self)(result) return result elif is_integer_dtype(other.dtype) or is_float_dtype(other.dtype): diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 10ab332e739ec..b1afd682d77a9 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -1668,7 +1668,7 @@ def test_td64arr_div_td64_ndarray(self, box_with_array): 
tm.assert_equal(result, expected) result = rng / other.astype(object) - tm.assert_equal(result, expected) + tm.assert_equal(result, expected.astype(object)) result = rng / list(other) tm.assert_equal(result, expected) @@ -2027,8 +2027,11 @@ def test_td64arr_div_numeric_array( expected = [tdser.iloc[0, n] / vector[n] for n in range(len(vector))] else: expected = [tdser[n] / vector[n] for n in range(len(tdser))] - expected = pd.Index(expected) - expected = tm.box_expected(expected, xbox) + expected = [ + x if x is not NaT else np.timedelta64("NaT", "ns") for x in expected + ] + expected = pd.Index(expected, dtype=object) + expected = tm.box_expected(expected, xbox, dtype=object) tm.assert_equal(result, expected) @@ -2096,8 +2099,10 @@ def test_td64arr_all_nat_div_object_dtype_numeric(self, box_with_array): left = tm.box_expected(tdi, box_with_array) right = np.array([2, 2.0], dtype=object) - expected = tdi - expected = tm.box_expected(expected, box_with_array) + expected = pd.Index([np.timedelta64("NaT", "ns")] * 2, dtype=object) + expected = tm.box_expected(expected, box_with_array, dtype=object).astype( + object + ) result = left / right tm.assert_equal(result, expected) From 95e0b3992c167a5225eb7660ced84e591a9751ba Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 8 Dec 2022 16:22:14 -0800 Subject: [PATCH 5/6] update addsub --- pandas/core/arrays/datetimelike.py | 6 +--- pandas/core/arrays/timedeltas.py | 1 + pandas/tests/arithmetic/test_datetime64.py | 25 ++++++------- pandas/tests/arithmetic/test_period.py | 7 ++-- pandas/tests/arithmetic/test_timedelta64.py | 39 +++++++++++---------- 5 files changed, 36 insertions(+), 42 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c0a65a97d31f8..b254bad2feebb 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1353,11 +1353,7 @@ def _addsub_object_array(self, other: np.ndarray, op): assert self.shape == other.shape, (self.shape, 
other.shape) res_values = op(self.astype("O"), np.asarray(other)) - - ext_arr = pd_array(res_values.ravel()) - result = cast(np.ndarray, extract_array(ext_arr, extract_numpy=True)) - result = result.reshape(self.shape) - return result + return res_values @unpack_zerodim_and_defer("__add__") def __add__(self, other): diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 6e4d321cb28f2..1702dd5dcfb7a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -628,6 +628,7 @@ def __floordiv__(self, other): else: result = floordiv_object_array(self._ndarray, other) + assert result.dtype == object return result elif is_integer_dtype(other.dtype) or is_float_dtype(other.dtype): diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index b4f1c5404d178..e840668167f99 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -34,10 +34,6 @@ date_range, ) import pandas._testing as tm -from pandas.core.arrays import ( - DatetimeArray, - TimedeltaArray, -) from pandas.core.ops import roperator from pandas.tests.arithmetic.common import ( assert_cannot_add, @@ -1023,7 +1019,7 @@ def test_dt64arr_sub_dt64object_array(self, box_with_array, tz_naive_fixture): expected = dti - dti obj = tm.box_expected(dti, box_with_array) - expected = tm.box_expected(expected, box_with_array) + expected = tm.box_expected(expected, box_with_array).astype(object) with tm.assert_produces_warning(PerformanceWarning): result = obj - obj.astype(object) @@ -1572,10 +1568,13 @@ def test_dt64arr_add_sub_offset_array( other = np.array([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) expected = DatetimeIndex([op(dti[n], other[n]) for n in range(len(dti))]) - expected = tm.box_expected(expected, box_with_array) + expected = tm.box_expected(expected, box_with_array).astype(object) if box_other: other = tm.box_expected(other, box_with_array) + if 
box_with_array is pd.array and op is roperator.radd: + # We expect a PandasArray, not ndarray[object] here + expected = pd.array(expected, dtype=object) with tm.assert_produces_warning(PerformanceWarning): res = op(dtarr, other) @@ -2373,7 +2372,7 @@ def test_dti_addsub_offset_arraylike( expected = DatetimeIndex( [op(dti[n], other[n]) for n in range(len(dti))], name=names[2], freq="infer" ) - expected = tm.box_expected(expected, xbox) + expected = tm.box_expected(expected, xbox).astype(object) tm.assert_equal(res, expected) @pytest.mark.parametrize("other_box", [pd.Index, np.array]) @@ -2388,14 +2387,14 @@ def test_dti_addsub_object_arraylike( xbox = get_upcast_box(dtarr, other) expected = DatetimeIndex(["2017-01-31", "2017-01-06"], tz=tz_naive_fixture) - expected = tm.box_expected(expected, xbox) + expected = tm.box_expected(expected, xbox).astype(object) with tm.assert_produces_warning(PerformanceWarning): result = dtarr + other tm.assert_equal(result, expected) expected = DatetimeIndex(["2016-12-31", "2016-12-29"], tz=tz_naive_fixture) - expected = tm.box_expected(expected, xbox) + expected = tm.box_expected(expected, xbox).astype(object) with tm.assert_produces_warning(PerformanceWarning): result = dtarr - other @@ -2435,15 +2434,11 @@ def test_dt64arr_addsub_object_dtype_2d(): with tm.assert_produces_warning(PerformanceWarning): expected = (dta[:, 0] + other[:, 0]).reshape(-1, 1) - assert isinstance(result, DatetimeArray) - assert result.freq is None - tm.assert_numpy_array_equal(result._data, expected._data) + tm.assert_numpy_array_equal(result, expected) with tm.assert_produces_warning(PerformanceWarning): # Case where we expect to get a TimedeltaArray back result2 = dta - dta.astype(object) - assert isinstance(result2, TimedeltaArray) assert result2.shape == (4, 1) - assert result2.freq is None - assert (result2.asi8 == 0).all() + assert all(td.value == 0 for td in result2.ravel()) diff --git a/pandas/tests/arithmetic/test_period.py 
b/pandas/tests/arithmetic/test_period.py index 56ad0d622cfb6..7fdb7423d9a1d 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -839,7 +839,7 @@ def test_pi_add_offset_array(self, box): pd.offsets.QuarterEnd(n=-2, startingMonth=12), ] ) - expected = PeriodIndex([Period("2015Q2"), Period("2015Q4")]) + expected = PeriodIndex([Period("2015Q2"), Period("2015Q4")]).astype(object) with tm.assert_produces_warning(PerformanceWarning): res = pi + offs @@ -872,6 +872,7 @@ def test_pi_sub_offset_array(self, box): ) expected = PeriodIndex([pi[n] - other[n] for n in range(len(pi))]) + expected = expected.astype(object) with tm.assert_produces_warning(PerformanceWarning): res = pi - other @@ -1301,13 +1302,13 @@ def test_parr_add_sub_object_array(self): expected = PeriodIndex( ["2001-01-01", "2001-01-03", "2001-01-05"], freq="D" - ).array + )._data.astype(object) tm.assert_equal(result, expected) with tm.assert_produces_warning(PerformanceWarning): result = parr - other - expected = PeriodIndex(["2000-12-30"] * 3, freq="D").array + expected = PeriodIndex(["2000-12-30"] * 3, freq="D")._data.astype(object) tm.assert_equal(result, expected) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index d3c81427ab584..bc5ef573cfd5c 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -584,7 +584,7 @@ def test_tda_add_dt64_object_array(self, box_with_array, tz_naive_fixture): with tm.assert_produces_warning(PerformanceWarning): result = obj + other.astype(object) - tm.assert_equal(result, other) + tm.assert_equal(result, other.astype(object)) # ------------------------------------------------------------- # Binary operations TimedeltaIndex and timedelta-like @@ -1296,8 +1296,8 @@ def test_td64arr_add_sub_offset_index(self, names, box_with_array): ) tdi = tm.box_expected(tdi, box) - expected = tm.box_expected(expected, box) - 
expected_sub = tm.box_expected(expected_sub, box) + expected = tm.box_expected(expected, box).astype(object, copy=False) + expected_sub = tm.box_expected(expected_sub, box).astype(object, copy=False) with tm.assert_produces_warning(PerformanceWarning): res = tdi + other @@ -1325,7 +1325,7 @@ def test_td64arr_add_sub_offset_array(self, box_with_array): ) tdi = tm.box_expected(tdi, box) - expected = tm.box_expected(expected, box) + expected = tm.box_expected(expected, box).astype(object) with tm.assert_produces_warning(PerformanceWarning): res = tdi + other @@ -1335,7 +1335,7 @@ def test_td64arr_add_sub_offset_array(self, box_with_array): res2 = other + tdi tm.assert_equal(res2, expected) - expected_sub = tm.box_expected(expected_sub, box_with_array) + expected_sub = tm.box_expected(expected_sub, box_with_array).astype(object) with tm.assert_produces_warning(PerformanceWarning): res_sub = tdi - other tm.assert_equal(res_sub, expected_sub) @@ -1349,9 +1349,11 @@ def test_td64arr_with_offset_series(self, names, box_with_array): tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0]) other = Series([offsets.Hour(n=1), offsets.Minute(n=-2)], name=names[1]) - expected_add = Series([tdi[n] + other[n] for n in range(len(tdi))], name=exname) + expected_add = Series( + [tdi[n] + other[n] for n in range(len(tdi))], name=exname, dtype=object + ) obj = tm.box_expected(tdi, box) - expected_add = tm.box_expected(expected_add, box2) + expected_add = tm.box_expected(expected_add, box2).astype(object) with tm.assert_produces_warning(PerformanceWarning): res = obj + other @@ -1361,8 +1363,10 @@ def test_td64arr_with_offset_series(self, names, box_with_array): res2 = other + obj tm.assert_equal(res2, expected_add) - expected_sub = Series([tdi[n] - other[n] for n in range(len(tdi))], name=exname) - expected_sub = tm.box_expected(expected_sub, box2) + expected_sub = Series( + [tdi[n] - other[n] for n in range(len(tdi))], name=exname, dtype=object + ) + expected_sub = 
tm.box_expected(expected_sub, box2).astype(object) with tm.assert_produces_warning(PerformanceWarning): res3 = obj - other @@ -1395,7 +1399,7 @@ def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box_with_array): # ------------------------------------------------------------------ # Unsorted - def test_td64arr_add_sub_object_array(self, box_with_array, using_array_manager): + def test_td64arr_add_sub_object_array(self, box_with_array): box = box_with_array xbox = np.ndarray if box is pd.array else box @@ -1410,12 +1414,7 @@ def test_td64arr_add_sub_object_array(self, box_with_array, using_array_manager) expected = pd.Index( [Timedelta(days=2), Timedelta(days=4), Timestamp("2000-01-07")] ) - expected = tm.box_expected(expected, xbox) - if not using_array_manager: - # TODO: avoid mismatched behavior. This occurs bc inference - # can happen within TimedeltaArray method, which means results - # depend on whether we split blocks. - expected = expected.astype(object) + expected = tm.box_expected(expected, xbox).astype(object) tm.assert_equal(result, expected) msg = "unsupported operand type|cannot subtract a datelike" @@ -1427,9 +1426,7 @@ def test_td64arr_add_sub_object_array(self, box_with_array, using_array_manager) result = other - tdarr expected = pd.Index([Timedelta(0), Timedelta(0), Timestamp("2000-01-01")]) - expected = tm.box_expected(expected, xbox) - if not using_array_manager: - expected = expected.astype(object) + expected = tm.box_expected(expected, xbox).astype(object) tm.assert_equal(result, expected) @@ -1822,6 +1819,10 @@ def test_td64arr_mod_tdscalar(self, box_with_array, three_days): warn = None if box_with_array is DataFrame and isinstance(three_days, pd.DateOffset): warn = PerformanceWarning + # TODO: making expected be object here a result of DataFrame.__divmod__ + # being defined in a naive way that does not dispatch to the underlying + # array's __divmod__ + expected = expected.astype(object) with tm.assert_produces_warning(warn): 
result = divmod(tdarr, three_days) From 2c33c84e6d232bdc7ea37a201da9fc4b7b9ba933 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 13 Dec 2022 10:59:26 -0800 Subject: [PATCH 6/6] avoid messing with box_expected --- pandas/_testing/__init__.py | 16 +++++++--------- pandas/tests/arithmetic/test_timedelta64.py | 12 +++++++----- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index d1f9610eaa545..43020ae471f10 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -254,7 +254,7 @@ def equalContents(arr1, arr2) -> bool: return frozenset(arr1) == frozenset(arr2) -def box_expected(expected, box_cls, transpose: bool = True, dtype=None): +def box_expected(expected, box_cls, transpose: bool = True): """ Helper function to wrap the expected output of a test in a given box_class. @@ -270,26 +270,24 @@ def box_expected(expected, box_cls, transpose: bool = True, dtype=None): if box_cls is pd.array: if isinstance(expected, RangeIndex): # pd.array would return an IntegerArray - expected = PandasArray(np.asarray(expected._values, dtype=dtype)) + expected = PandasArray(np.asarray(expected._values)) else: - expected = pd.array(expected, copy=False, dtype=dtype) + expected = pd.array(expected, copy=False) elif box_cls is Index: - expected = Index(expected, dtype=dtype) + expected = Index(expected) elif box_cls is Series: - expected = Series(expected, dtype=dtype) + expected = Series(expected) elif box_cls is DataFrame: - expected = Series(expected, dtype=dtype).to_frame() + expected = Series(expected).to_frame() if transpose: # for vector operations, we need a DataFrame to be a single-row, # not a single-column, in order to operate against non-DataFrame # vectors of the same length. 
But convert to two rows to avoid # single-row special cases in datetime arithmetic expected = expected.T - if dtype is not None: - expected = expected.astype(dtype) expected = pd.concat([expected] * 2, ignore_index=True) elif box_cls is np.ndarray or box_cls is np.array: - expected = np.array(expected, dtype=dtype) + expected = np.array(expected) elif box_cls is to_array: expected = to_array(expected) else: diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index bc5ef573cfd5c..4e537c8c4c993 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -2027,13 +2027,16 @@ def test_td64arr_div_numeric_array( result = tdser / vector.astype(object) if box_with_array is DataFrame: expected = [tdser.iloc[0, n] / vector[n] for n in range(len(vector))] + expected = tm.box_expected(expected, xbox).astype(object) else: expected = [tdser[n] / vector[n] for n in range(len(tdser))] expected = [ x if x is not NaT else np.timedelta64("NaT", "ns") for x in expected ] - expected = pd.Index(expected, dtype=object) - expected = tm.box_expected(expected, xbox, dtype=object) + if xbox is tm.to_array: + expected = tm.to_array(expected).astype(object) + else: + expected = xbox(expected, dtype=object) tm.assert_equal(result, expected) @@ -2102,9 +2105,8 @@ def test_td64arr_all_nat_div_object_dtype_numeric(self, box_with_array): right = np.array([2, 2.0], dtype=object) expected = pd.Index([np.timedelta64("NaT", "ns")] * 2, dtype=object) - expected = tm.box_expected(expected, box_with_array, dtype=object).astype( - object - ) + if box_with_array is not pd.Index: + expected = tm.box_expected(expected, box_with_array).astype(object) result = left / right tm.assert_equal(result, expected)