From b10a812f9a4dfd0279a753b80e7ab1af5119535e Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 1 Dec 2022 13:52:49 -0800 Subject: [PATCH 1/4] API: dont do inference on object-dtype arithmetic results --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/indexes/base.py | 6 ++-- pandas/core/ops/__init__.py | 34 ++++++++++++++++++--- pandas/core/series.py | 7 +++-- pandas/tests/arithmetic/test_object.py | 13 ++++++-- pandas/tests/arithmetic/test_timedelta64.py | 9 +++++- 6 files changed, 55 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index df82bcd37e971..e0a5f9e45acb7 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -348,6 +348,7 @@ Other API changes - Passing a sequence containing a type that cannot be converted to :class:`Timedelta` to :func:`to_timedelta` or to the :class:`Series` or :class:`DataFrame` constructor with ``dtype="timedelta64[ns]"`` or to :class:`TimedeltaIndex` now raises ``TypeError`` instead of ``ValueError`` (:issue:`49525`) - Changed behavior of :class:`Index` constructor with sequence containing at least one ``NaT`` and everything else either ``None`` or ``NaN`` to infer ``datetime64[ns]`` dtype instead of ``object``, matching :class:`Series` behavior (:issue:`49340`) - :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of a :class:`Int64Index` (:issue:`49745`) +- Changed behavior of :class:`Index`, :class:`Series`, and :class:`DataFrame` arithmetic methods when working with object-dtypes, the results no longer do type inference on the result of the array operations (:issue:`49714`) - Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`) - Changed 
behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype, this now fills new columns with integer dtypes instead of casting to datetimelike (:issue:`49842`) - :meth:`DataFrame.values`, :meth:`DataFrame.to_numpy`, :meth:`DataFrame.xs`, :meth:`DataFrame.reindex`, :meth:`DataFrame.fillna`, and :meth:`DataFrame.replace` no longer silently consolidate the underlying arrays; do ``df = df.copy()`` to ensure consolidation (:issue:`49356`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 01a1ebd459616..fc27c7cc1c818 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6582,10 +6582,10 @@ def _logical_method(self, other, op): def _construct_result(self, result, name): if isinstance(result, tuple): return ( - Index._with_infer(result[0], name=name), - Index._with_infer(result[1], name=name), + Index(result[0], name=name, dtype=result[0].dtype), + Index(result[1], name=name, dtype=result[1].dtype), ) - return Index._with_infer(result, name=name) + return Index(result, name=name, dtype=result.dtype) def _arith_method(self, other, op): if ( diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index af27ff67599ac..eb325fea9cbd0 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -232,18 +232,27 @@ def align_method_FRAME( def to_series(right): msg = "Unable to coerce to Series, length must be {req_len}: given {given_len}" + + # pass dtype to avoid doing inference, which would break consistency + # with Index/Series ops + dtype = None + if getattr(right, "dtype", None) == object: + # can't pass right.dtype unconditionally as that would break on e.g. 
+ # datetime64[h] ndarray + dtype = object + if axis is not None and left._get_axis_name(axis) == "index": if len(left.index) != len(right): raise ValueError( msg.format(req_len=len(left.index), given_len=len(right)) ) - right = left._constructor_sliced(right, index=left.index) + right = left._constructor_sliced(right, index=left.index, dtype=dtype) else: if len(left.columns) != len(right): raise ValueError( msg.format(req_len=len(left.columns), given_len=len(right)) ) - right = left._constructor_sliced(right, index=left.columns) + right = left._constructor_sliced(right, index=left.columns, dtype=dtype) return right if isinstance(right, np.ndarray): @@ -252,13 +261,25 @@ def to_series(right): right = to_series(right) elif right.ndim == 2: + # We need to pass dtype=right.dtype to retain object dtype + # otherwise we lose consistency with Index and array ops + dtype = None + if getattr(right, "dtype", None) == object: + # can't pass right.dtype unconditionally as that would break on e.g. + # datetime64[h] ndarray + dtype = object + if right.shape == left.shape: - right = left._constructor(right, index=left.index, columns=left.columns) + right = left._constructor( + right, index=left.index, columns=left.columns, dtype=dtype + ) elif right.shape[0] == left.shape[0] and right.shape[1] == 1: # Broadcast across columns right = np.broadcast_to(right, left.shape) - right = left._constructor(right, index=left.index, columns=left.columns) + right = left._constructor( + right, index=left.index, columns=left.columns, dtype=dtype + ) elif right.shape[1] == left.shape[1] and right.shape[0] == 1: # Broadcast along rows @@ -411,7 +432,10 @@ def _maybe_align_series_as_frame(frame: DataFrame, series: Series, axis: AxisInt rvalues = rvalues.reshape(1, -1) rvalues = np.broadcast_to(rvalues, frame.shape) - return type(frame)(rvalues, index=frame.index, columns=frame.columns) + # pass dtype to avoid doing inference + return type(frame)( + rvalues, index=frame.index, 
columns=frame.columns, dtype=rvalues.dtype + ) def flex_arith_method_FRAME(op): diff --git a/pandas/core/series.py b/pandas/core/series.py index 48bc07ca022ee..456c828b264e4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2995,9 +2995,10 @@ def _construct_result( assert isinstance(res2, Series) return (res1, res2) - # We do not pass dtype to ensure that the Series constructor - # does inference in the case where `result` has object-dtype. - out = self._constructor(result, index=self.index) + # TODO: result should always be ArrayLike, but this fails for some + # JSONArray tests + dtype = getattr(result, "dtype", None) + out = self._constructor(result, index=self.index, dtype=dtype) out = out.__finalize__(self) # Set the result's name after __finalize__ is called because __finalize__ diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index e107ff6b65c0f..cba2b9be255fb 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -187,7 +187,8 @@ def test_series_with_dtype_radd_timedelta(self, dtype): dtype=dtype, ) expected = Series( - [pd.Timedelta("4 days"), pd.Timedelta("5 days"), pd.Timedelta("6 days")] + [pd.Timedelta("4 days"), pd.Timedelta("5 days"), pd.Timedelta("6 days")], + dtype=dtype, ) result = pd.Timedelta("3 days") + ser @@ -227,7 +228,9 @@ def test_mixed_timezone_series_ops_object(self): name="xxx", ) assert ser2.dtype == object - exp = Series([pd.Timedelta("2 days"), pd.Timedelta("4 days")], name="xxx") + exp = Series( + [pd.Timedelta("2 days"), pd.Timedelta("4 days")], name="xxx", dtype=object + ) tm.assert_series_equal(ser2 - ser, exp) tm.assert_series_equal(ser - ser2, -exp) @@ -238,7 +241,11 @@ def test_mixed_timezone_series_ops_object(self): ) assert ser.dtype == object - exp = Series([pd.Timedelta("01:30:00"), pd.Timedelta("02:30:00")], name="xxx") + exp = Series( + [pd.Timedelta("01:30:00"), pd.Timedelta("02:30:00")], + name="xxx", + 
dtype=object, + ) tm.assert_series_equal(ser + pd.Timedelta("00:30:00"), exp) tm.assert_series_equal(pd.Timedelta("00:30:00") + ser, exp) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 14d50acf3eadf..ab24994ebf9e2 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -1398,7 +1398,7 @@ def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box_with_array): # ------------------------------------------------------------------ # Unsorted - def test_td64arr_add_sub_object_array(self, box_with_array): + def test_td64arr_add_sub_object_array(self, box_with_array, using_array_manager): box = box_with_array xbox = np.ndarray if box is pd.array else box @@ -1414,6 +1414,11 @@ def test_td64arr_add_sub_object_array(self, box_with_array): [Timedelta(days=2), Timedelta(days=4), Timestamp("2000-01-07")] ) expected = tm.box_expected(expected, xbox) + if not using_array_manager: + # TODO: avoid mismatched behavior. This occurs bc inference + # can happen within TimedeltaArray method, which means results + # depend on whether we split blocks. 
+ expected = expected.astype(object) tm.assert_equal(result, expected) msg = "unsupported operand type|cannot subtract a datelike" @@ -1426,6 +1431,8 @@ def test_td64arr_add_sub_object_array(self, box_with_array): expected = pd.Index([Timedelta(0), Timedelta(0), Timestamp("2000-01-01")]) expected = tm.box_expected(expected, xbox) + if not using_array_manager: + expected = expected.astype(object) tm.assert_equal(result, expected) From a8075c997a48822ca0d18e95b52d5ef5a8eaa7ff Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 6 Dec 2022 13:56:50 -0800 Subject: [PATCH 2/4] suggest infer_objects --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index e0a5f9e45acb7..243225a32a00b 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -348,7 +348,7 @@ Other API changes - Passing a sequence containing a type that cannot be converted to :class:`Timedelta` to :func:`to_timedelta` or to the :class:`Series` or :class:`DataFrame` constructor with ``dtype="timedelta64[ns]"`` or to :class:`TimedeltaIndex` now raises ``TypeError`` instead of ``ValueError`` (:issue:`49525`) - Changed behavior of :class:`Index` constructor with sequence containing at least one ``NaT`` and everything else either ``None`` or ``NaN`` to infer ``datetime64[ns]`` dtype instead of ``object``, matching :class:`Series` behavior (:issue:`49340`) - :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of a :class:`Int64Index` (:issue:`49745`) -- Changed behavior of :class:`Index`, :class:`Series`, and :class:`DataFrame` arithmetic methods when working with object-dtypes, the results no longer do type inference on the result of the array operations (:issue:`49714`) +- Changed behavior of :class:`Index`, :class:`Series`, and :class:`DataFrame` arithmetic 
methods when working with object-dtypes, the results no longer do type inference on the result of the array operations; use ``result.infer_objects()`` to do type inference on the result (:issue:`49999`, :issue:`49714`) - Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`) - Changed behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype, this now fills new columns with integer dtypes instead of casting to datetimelike (:issue:`49842`) - :meth:`DataFrame.values`, :meth:`DataFrame.to_numpy`, :meth:`DataFrame.xs`, :meth:`DataFrame.reindex`, :meth:`DataFrame.fillna`, and :meth:`DataFrame.replace` no longer silently consolidate the underlying arrays; do ``df = df.copy()`` to ensure consolidation (:issue:`49356`) From d5c6c9e8479876033092c9dead519b572c6ee49f Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 6 Dec 2022 15:05:15 -0800 Subject: [PATCH 3/4] remove special case --- pandas/tests/arithmetic/test_numeric.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index f2af85c2e388d..d0d1a46893483 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -1147,9 +1147,6 @@ def test_numarr_with_dtype_add_nan(self, dtype, box_with_array): ser = tm.box_expected(ser, box) expected = tm.box_expected(expected, box) - if box is Index and dtype is object: - # TODO: avoid this; match behavior with Series - expected = expected.astype(np.float64) result = np.nan + ser tm.assert_equal(result, expected) From 3ab3c98d68ba719c62df2765071067b136b2746c Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 6 Dec 2022 16:32:10 -0800 Subject: [PATCH 4/4] de-special-case --- pandas/tests/arithmetic/test_numeric.py | 3 --- 1 file changed, 3
deletions(-) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index d0d1a46893483..529dd6baa70c0 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -1162,9 +1162,6 @@ def test_numarr_with_dtype_add_int(self, dtype, box_with_array): ser = tm.box_expected(ser, box) expected = tm.box_expected(expected, box) - if box is Index and dtype is object: - # TODO: avoid this; match behavior with Series - expected = expected.astype(np.int64) result = 1 + ser tm.assert_equal(result, expected)