Skip to content

API: dont do inference on object-dtype arithmetic results #49999

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Dec 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,7 @@ Other API changes
- Passing a sequence containing a type that cannot be converted to :class:`Timedelta` to :func:`to_timedelta` or to the :class:`Series` or :class:`DataFrame` constructor with ``dtype="timedelta64[ns]"`` or to :class:`TimedeltaIndex` now raises ``TypeError`` instead of ``ValueError`` (:issue:`49525`)
- Changed behavior of :class:`Index` constructor with sequence containing at least one ``NaT`` and everything else either ``None`` or ``NaN`` to infer ``datetime64[ns]`` dtype instead of ``object``, matching :class:`Series` behavior (:issue:`49340`)
- :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of a :class:`Int64Index` (:issue:`49745`)
- Changed behavior of :class:`Index`, :class:`Series`, and :class:`DataFrame` arithmetic methods when working with object-dtypes, the results no longer do type inference on the result of the array operations, use ``result.infer_objects()`` to do type inference on the result (:issue:`49999`)
- Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`)
- Changed behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype, this now fills new columns with integer dtypes instead of casting to datetimelike (:issue:`49842`)
- Files are now closed when encountering an exception in :func:`read_json` (:issue:`49921`)
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6615,10 +6615,10 @@ def _logical_method(self, other, op):
def _construct_result(self, result, name):
if isinstance(result, tuple):
return (
Index._with_infer(result[0], name=name),
Index._with_infer(result[1], name=name),
Index(result[0], name=name, dtype=result[0].dtype),
Index(result[1], name=name, dtype=result[1].dtype),
)
return Index._with_infer(result, name=name)
return Index(result, name=name, dtype=result.dtype)

def _arith_method(self, other, op):
if (
Expand Down
34 changes: 29 additions & 5 deletions pandas/core/ops/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,18 +230,27 @@ def align_method_FRAME(

def to_series(right):
msg = "Unable to coerce to Series, length must be {req_len}: given {given_len}"

# pass dtype to avoid doing inference, which would break consistency
# with Index/Series ops
dtype = None
if getattr(right, "dtype", None) == object:
# can't pass right.dtype unconditionally as that would break on e.g.
# datetime64[h] ndarray
dtype = object

if axis is not None and left._get_axis_name(axis) == "index":
if len(left.index) != len(right):
raise ValueError(
msg.format(req_len=len(left.index), given_len=len(right))
)
right = left._constructor_sliced(right, index=left.index)
right = left._constructor_sliced(right, index=left.index, dtype=dtype)
else:
if len(left.columns) != len(right):
raise ValueError(
msg.format(req_len=len(left.columns), given_len=len(right))
)
right = left._constructor_sliced(right, index=left.columns)
right = left._constructor_sliced(right, index=left.columns, dtype=dtype)
return right

if isinstance(right, np.ndarray):
Expand All @@ -250,13 +259,25 @@ def to_series(right):
right = to_series(right)

elif right.ndim == 2:
# We need to pass dtype=right.dtype to retain object dtype
# otherwise we lose consistency with Index and array ops
dtype = None
if getattr(right, "dtype", None) == object:
# can't pass right.dtype unconditionally as that would break on e.g.
# datetime64[h] ndarray
dtype = object

if right.shape == left.shape:
right = left._constructor(right, index=left.index, columns=left.columns)
right = left._constructor(
right, index=left.index, columns=left.columns, dtype=dtype
)

elif right.shape[0] == left.shape[0] and right.shape[1] == 1:
# Broadcast across columns
right = np.broadcast_to(right, left.shape)
right = left._constructor(right, index=left.index, columns=left.columns)
right = left._constructor(
right, index=left.index, columns=left.columns, dtype=dtype
)

elif right.shape[1] == left.shape[1] and right.shape[0] == 1:
# Broadcast along rows
Expand Down Expand Up @@ -406,7 +427,10 @@ def _maybe_align_series_as_frame(frame: DataFrame, series: Series, axis: AxisInt
rvalues = rvalues.reshape(1, -1)

rvalues = np.broadcast_to(rvalues, frame.shape)
return type(frame)(rvalues, index=frame.index, columns=frame.columns)
# pass dtype to avoid doing inference
return type(frame)(
rvalues, index=frame.index, columns=frame.columns, dtype=rvalues.dtype
)


def flex_arith_method_FRAME(op):
Expand Down
7 changes: 4 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2995,9 +2995,10 @@ def _construct_result(
assert isinstance(res2, Series)
return (res1, res2)

# We do not pass dtype to ensure that the Series constructor
# does inference in the case where `result` has object-dtype.
out = self._constructor(result, index=self.index)
# TODO: result should always be ArrayLike, but this fails for some
# JSONArray tests
dtype = getattr(result, "dtype", None)
out = self._constructor(result, index=self.index, dtype=dtype)
out = out.__finalize__(self)

# Set the result's name after __finalize__ is called because __finalize__
Expand Down
6 changes: 0 additions & 6 deletions pandas/tests/arithmetic/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -1147,9 +1147,6 @@ def test_numarr_with_dtype_add_nan(self, dtype, box_with_array):

ser = tm.box_expected(ser, box)
expected = tm.box_expected(expected, box)
if box is Index and dtype is object:
# TODO: avoid this; match behavior with Series
expected = expected.astype(np.float64)

result = np.nan + ser
tm.assert_equal(result, expected)
Expand All @@ -1165,9 +1162,6 @@ def test_numarr_with_dtype_add_int(self, dtype, box_with_array):

ser = tm.box_expected(ser, box)
expected = tm.box_expected(expected, box)
if box is Index and dtype is object:
# TODO: avoid this; match behavior with Series
expected = expected.astype(np.int64)

result = 1 + ser
tm.assert_equal(result, expected)
Expand Down
13 changes: 10 additions & 3 deletions pandas/tests/arithmetic/test_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,8 @@ def test_series_with_dtype_radd_timedelta(self, dtype):
dtype=dtype,
)
expected = Series(
[pd.Timedelta("4 days"), pd.Timedelta("5 days"), pd.Timedelta("6 days")]
[pd.Timedelta("4 days"), pd.Timedelta("5 days"), pd.Timedelta("6 days")],
dtype=dtype,
)

result = pd.Timedelta("3 days") + ser
Expand Down Expand Up @@ -227,7 +228,9 @@ def test_mixed_timezone_series_ops_object(self):
name="xxx",
)
assert ser2.dtype == object
exp = Series([pd.Timedelta("2 days"), pd.Timedelta("4 days")], name="xxx")
exp = Series(
[pd.Timedelta("2 days"), pd.Timedelta("4 days")], name="xxx", dtype=object
)
tm.assert_series_equal(ser2 - ser, exp)
tm.assert_series_equal(ser - ser2, -exp)

Expand All @@ -238,7 +241,11 @@ def test_mixed_timezone_series_ops_object(self):
)
assert ser.dtype == object

exp = Series([pd.Timedelta("01:30:00"), pd.Timedelta("02:30:00")], name="xxx")
exp = Series(
[pd.Timedelta("01:30:00"), pd.Timedelta("02:30:00")],
name="xxx",
dtype=object,
)
tm.assert_series_equal(ser + pd.Timedelta("00:30:00"), exp)
tm.assert_series_equal(pd.Timedelta("00:30:00") + ser, exp)

Expand Down
9 changes: 8 additions & 1 deletion pandas/tests/arithmetic/test_timedelta64.py
Original file line number Diff line number Diff line change
Expand Up @@ -1394,7 +1394,7 @@ def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box_with_array):
# ------------------------------------------------------------------
# Unsorted

def test_td64arr_add_sub_object_array(self, box_with_array):
def test_td64arr_add_sub_object_array(self, box_with_array, using_array_manager):
box = box_with_array
xbox = np.ndarray if box is pd.array else box

Expand All @@ -1410,6 +1410,11 @@ def test_td64arr_add_sub_object_array(self, box_with_array):
[Timedelta(days=2), Timedelta(days=4), Timestamp("2000-01-07")]
)
expected = tm.box_expected(expected, xbox)
if not using_array_manager:
# TODO: avoid mismatched behavior. This occurs bc inference
# can happen within TimedeltaArray method, which means results
# depend on whether we split blocks.
expected = expected.astype(object)
tm.assert_equal(result, expected)

msg = "unsupported operand type|cannot subtract a datelike"
Expand All @@ -1422,6 +1427,8 @@ def test_td64arr_add_sub_object_array(self, box_with_array):

expected = pd.Index([Timedelta(0), Timedelta(0), Timestamp("2000-01-01")])
expected = tm.box_expected(expected, xbox)
if not using_array_manager:
expected = expected.astype(object)
tm.assert_equal(result, expected)


Expand Down