diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 7838ef8df4164..a3ba0557bc31c 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -367,6 +367,7 @@ Other API changes - Passing a sequence containing a type that cannot be converted to :class:`Timedelta` to :func:`to_timedelta` or to the :class:`Series` or :class:`DataFrame` constructor with ``dtype="timedelta64[ns]"`` or to :class:`TimedeltaIndex` now raises ``TypeError`` instead of ``ValueError`` (:issue:`49525`) - Changed behavior of :class:`Index` constructor with sequence containing at least one ``NaT`` and everything else either ``None`` or ``NaN`` to infer ``datetime64[ns]`` dtype instead of ``object``, matching :class:`Series` behavior (:issue:`49340`) - :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of a :class:`Int64Index` (:issue:`49745`) +- Changed behavior of :class:`Index`, :class:`Series`, and :class:`DataFrame` arithmetic methods when working with object-dtypes; the results no longer do type inference on the result of the array operations. Use ``result.infer_objects()`` to do type inference on the result (:issue:`49999`) - Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`) - Changed behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype, this now fills new columns with integer dtypes instead of casting to datetimelike (:issue:`49842`) - Files are now closed when encountering an exception in :func:`read_json` (:issue:`49921`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index dc0359426f07c..7ee9d8ff91b6c 100644 --- a/pandas/core/indexes/base.py +++ 
b/pandas/core/indexes/base.py @@ -6615,10 +6615,10 @@ def _logical_method(self, other, op): def _construct_result(self, result, name): if isinstance(result, tuple): return ( - Index._with_infer(result[0], name=name), - Index._with_infer(result[1], name=name), + Index(result[0], name=name, dtype=result[0].dtype), + Index(result[1], name=name, dtype=result[1].dtype), ) - return Index._with_infer(result, name=name) + return Index(result, name=name, dtype=result.dtype) def _arith_method(self, other, op): if ( diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index bfedaca093a8e..e514bdcac5265 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -230,18 +230,27 @@ def align_method_FRAME( def to_series(right): msg = "Unable to coerce to Series, length must be {req_len}: given {given_len}" + + # pass dtype to avoid doing inference, which would break consistency + # with Index/Series ops + dtype = None + if getattr(right, "dtype", None) == object: + # can't pass right.dtype unconditionally as that would break on e.g. 
+ # datetime64[h] ndarray + dtype = object + if axis is not None and left._get_axis_name(axis) == "index": if len(left.index) != len(right): raise ValueError( msg.format(req_len=len(left.index), given_len=len(right)) ) - right = left._constructor_sliced(right, index=left.index) + right = left._constructor_sliced(right, index=left.index, dtype=dtype) else: if len(left.columns) != len(right): raise ValueError( msg.format(req_len=len(left.columns), given_len=len(right)) ) - right = left._constructor_sliced(right, index=left.columns) + right = left._constructor_sliced(right, index=left.columns, dtype=dtype) return right if isinstance(right, np.ndarray): @@ -250,13 +259,25 @@ def to_series(right): right = to_series(right) elif right.ndim == 2: + # We need to pass dtype=right.dtype to retain object dtype + # otherwise we lose consistency with Index and array ops + dtype = None + if getattr(right, "dtype", None) == object: + # can't pass right.dtype unconditionally as that would break on e.g. + # datetime64[h] ndarray + dtype = object + if right.shape == left.shape: - right = left._constructor(right, index=left.index, columns=left.columns) + right = left._constructor( + right, index=left.index, columns=left.columns, dtype=dtype + ) elif right.shape[0] == left.shape[0] and right.shape[1] == 1: # Broadcast across columns right = np.broadcast_to(right, left.shape) - right = left._constructor(right, index=left.index, columns=left.columns) + right = left._constructor( + right, index=left.index, columns=left.columns, dtype=dtype + ) elif right.shape[1] == left.shape[1] and right.shape[0] == 1: # Broadcast along rows @@ -406,7 +427,10 @@ def _maybe_align_series_as_frame(frame: DataFrame, series: Series, axis: AxisInt rvalues = rvalues.reshape(1, -1) rvalues = np.broadcast_to(rvalues, frame.shape) - return type(frame)(rvalues, index=frame.index, columns=frame.columns) + # pass dtype to avoid doing inference + return type(frame)( + rvalues, index=frame.index, 
columns=frame.columns, dtype=rvalues.dtype + ) def flex_arith_method_FRAME(op): diff --git a/pandas/core/series.py b/pandas/core/series.py index 1e5f565934b50..bf5a530a28b28 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2995,9 +2995,10 @@ def _construct_result( assert isinstance(res2, Series) return (res1, res2) - # We do not pass dtype to ensure that the Series constructor - # does inference in the case where `result` has object-dtype. - out = self._constructor(result, index=self.index) + # TODO: result should always be ArrayLike, but this fails for some + # JSONArray tests + dtype = getattr(result, "dtype", None) + out = self._constructor(result, index=self.index, dtype=dtype) out = out.__finalize__(self) # Set the result's name after __finalize__ is called because __finalize__ diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index f2af85c2e388d..529dd6baa70c0 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -1147,9 +1147,6 @@ def test_numarr_with_dtype_add_nan(self, dtype, box_with_array): ser = tm.box_expected(ser, box) expected = tm.box_expected(expected, box) - if box is Index and dtype is object: - # TODO: avoid this; match behavior with Series - expected = expected.astype(np.float64) result = np.nan + ser tm.assert_equal(result, expected) @@ -1165,9 +1162,6 @@ def test_numarr_with_dtype_add_int(self, dtype, box_with_array): ser = tm.box_expected(ser, box) expected = tm.box_expected(expected, box) - if box is Index and dtype is object: - # TODO: avoid this; match behavior with Series - expected = expected.astype(np.int64) result = 1 + ser tm.assert_equal(result, expected) diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index e107ff6b65c0f..cba2b9be255fb 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -187,7 +187,8 @@ def 
test_series_with_dtype_radd_timedelta(self, dtype): dtype=dtype, ) expected = Series( - [pd.Timedelta("4 days"), pd.Timedelta("5 days"), pd.Timedelta("6 days")] + [pd.Timedelta("4 days"), pd.Timedelta("5 days"), pd.Timedelta("6 days")], + dtype=dtype, ) result = pd.Timedelta("3 days") + ser @@ -227,7 +228,9 @@ def test_mixed_timezone_series_ops_object(self): name="xxx", ) assert ser2.dtype == object - exp = Series([pd.Timedelta("2 days"), pd.Timedelta("4 days")], name="xxx") + exp = Series( + [pd.Timedelta("2 days"), pd.Timedelta("4 days")], name="xxx", dtype=object + ) tm.assert_series_equal(ser2 - ser, exp) tm.assert_series_equal(ser - ser2, -exp) @@ -238,7 +241,11 @@ def test_mixed_timezone_series_ops_object(self): ) assert ser.dtype == object - exp = Series([pd.Timedelta("01:30:00"), pd.Timedelta("02:30:00")], name="xxx") + exp = Series( + [pd.Timedelta("01:30:00"), pd.Timedelta("02:30:00")], + name="xxx", + dtype=object, + ) tm.assert_series_equal(ser + pd.Timedelta("00:30:00"), exp) tm.assert_series_equal(pd.Timedelta("00:30:00") + ser, exp) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 1fb1e96cea94b..f3ea741607692 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -1394,7 +1394,7 @@ def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box_with_array): # ------------------------------------------------------------------ # Unsorted - def test_td64arr_add_sub_object_array(self, box_with_array): + def test_td64arr_add_sub_object_array(self, box_with_array, using_array_manager): box = box_with_array xbox = np.ndarray if box is pd.array else box @@ -1410,6 +1410,11 @@ def test_td64arr_add_sub_object_array(self, box_with_array): [Timedelta(days=2), Timedelta(days=4), Timestamp("2000-01-07")] ) expected = tm.box_expected(expected, xbox) + if not using_array_manager: + # TODO: avoid mismatched behavior. 
This occurs bc inference + # can happen within TimedeltaArray method, which means results + # depend on whether we split blocks. + expected = expected.astype(object) tm.assert_equal(result, expected) msg = "unsupported operand type|cannot subtract a datelike" @@ -1422,6 +1427,8 @@ def test_td64arr_add_sub_object_array(self, box_with_array): expected = pd.Index([Timedelta(0), Timedelta(0), Timestamp("2000-01-01")]) expected = tm.box_expected(expected, xbox) + if not using_array_manager: + expected = expected.astype(object) tm.assert_equal(result, expected)