Skip to content

API: don't do type inference on arithmetic results #49714

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Dec 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,7 @@ Other API changes
- Passing a sequence containing a type that cannot be converted to :class:`Timedelta` to :func:`to_timedelta` or to the :class:`Series` or :class:`DataFrame` constructor with ``dtype="timedelta64[ns]"`` or to :class:`TimedeltaIndex` now raises ``TypeError`` instead of ``ValueError`` (:issue:`49525`)
- Changed behavior of :class:`Index` constructor with sequence containing at least one ``NaT`` and everything else either ``None`` or ``NaN`` to infer ``datetime64[ns]`` dtype instead of ``object``, matching :class:`Series` behavior (:issue:`49340`)
- :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of a :class:`Int64Index` (:issue:`49745`)
- Changed behavior of :class:`Index`, :class:`Series`, and :class:`DataFrame` arithmetic methods when working with object-dtypes, the results no longer do type inference on the result of the array operations, use ``result.infer_objects()`` to do type inference on the result (:issue:`49999`)
- Changed behavior of :class:`Index`, :class:`Series`, and :class:`DataFrame` arithmetic methods when working with object-dtypes: the results no longer do type inference on the result of the array operations. Use ``result.infer_objects(copy=False)`` to do type inference on the result (:issue:`49999`, :issue:`49714`)
- Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`)
- Changed behavior of :class:`Series` and :class:`DataFrame` constructors when given an integer dtype and floating-point data that is not round numbers, this now raises ``ValueError`` instead of silently retaining the float dtype; do ``Series(data)`` or ``DataFrame(data)`` to get the old behavior, and ``Series(data).astype(dtype)`` or ``DataFrame(data).astype(dtype)`` to get the specified dtype (:issue:`49599`)
- Changed behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype, this now fills new columns with integer dtypes instead of casting to datetimelike (:issue:`49842`)
Expand Down
6 changes: 6 additions & 0 deletions pandas/_libs/tslibs/timedeltas.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,12 @@ def delta_to_nanoseconds(
reso: int = ..., # NPY_DATETIMEUNIT
round_ok: bool = ...,
) -> int: ...
# Elementwise floor division (``left[i] // right[i]``), implemented in
# timedeltas.pyx. The result is an ndarray allocated with object dtype and
# returned as-is, i.e. without any dtype inference.
def floordiv_object_array(
left: np.ndarray, right: npt.NDArray[np.object_]
) -> np.ndarray: ...
# Elementwise true division (``left[i] / right[i]``), implemented in
# timedeltas.pyx. The result is an ndarray allocated with object dtype and
# returned as-is, i.e. without any dtype inference.
def truediv_object_array(
left: np.ndarray, right: npt.NDArray[np.object_]
) -> np.ndarray: ...

class Timedelta(timedelta):
_creso: int
Expand Down
58 changes: 58 additions & 0 deletions pandas/_libs/tslibs/timedeltas.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2030,6 +2030,64 @@ class Timedelta(_Timedelta):
return div, other - div * self


def truediv_object_array(ndarray left, ndarray right):
    """
    Divide each element of ``left`` by the matching element of ``right``.

    Parameters
    ----------
    left : ndarray
        Dividends. Each element is read as a timedelta64 scalar.
        NOTE(review): elements are fetched with a single index, so this
        looks like it expects 1-D input -- confirm against callers.
    right : ndarray
        Divisors, taken positionally in lockstep with ``left``.

    Returns
    -------
    ndarray[object]
        Object-dtype array with the same shape as ``left`` holding the
        per-element quotients. No dtype inference is done on the result.
    """
    cdef:
        ndarray[object] out = np.empty((<object>left).shape, dtype=object)
        object numer  # really timedelta64 if we find a way to declare that
        object denom, quotient
        _Timedelta td
        Py_ssize_t pos

    for pos in range(len(left)):
        numer = left[pos]
        denom = right[pos]

        if get_timedelta64_value(numer) != NPY_NAT:
            # Ordinary value: defer to Timedelta's own division.
            td = Timedelta(numer)
            quotient = td / denom
        elif _should_cast_to_timedelta(denom):
            # numer is a td64 NaT and the divisor is to be treated as a
            # timedelta, so the quotient is a missing float.
            quotient = np.nan
        else:
            # numer is a td64 NaT: if the divisor is a number, numpy handles
            # the division; for anything else numpy will raise.
            quotient = numer / denom

        out[pos] = quotient

    return out


def floordiv_object_array(ndarray left, ndarray right):
    """
    Floor-divide each element of ``left`` by the matching element of ``right``.

    Parameters
    ----------
    left : ndarray
        Dividends. Each element is read as a timedelta64 scalar.
        NOTE(review): elements are fetched with a single index, so this
        looks like it expects 1-D input -- confirm against callers.
    right : ndarray
        Divisors, taken positionally in lockstep with ``left``.

    Returns
    -------
    ndarray[object]
        Object-dtype array with the same shape as ``left`` holding the
        per-element floor quotients. No dtype inference is done on the
        result.
    """
    cdef:
        ndarray[object] out = np.empty((<object>left).shape, dtype=object)
        object numer  # really timedelta64 if we find a way to declare that
        object denom, quotient
        _Timedelta td
        Py_ssize_t pos

    for pos in range(len(left)):
        numer = left[pos]
        denom = right[pos]

        if get_timedelta64_value(numer) != NPY_NAT:
            # Ordinary value: defer to Timedelta's own floor division.
            td = Timedelta(numer)
            quotient = td // denom
        elif _should_cast_to_timedelta(denom):
            # numer is a td64 NaT and the divisor is to be treated as a
            # timedelta, so the quotient is a missing float.
            quotient = np.nan
        else:
            # numer is a td64 NaT: if the divisor is a number, numpy handles
            # the division; for anything else numpy will raise.
            quotient = numer // denom

        out[pos] = quotient

    return out


cdef bint is_any_td_scalar(object obj):
"""
Cython equivalent for `isinstance(obj, (timedelta, np.timedelta64, Tick))`
Expand Down
6 changes: 1 addition & 5 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -1373,11 +1373,7 @@ def _addsub_object_array(self, other: np.ndarray, op):
assert self.shape == other.shape, (self.shape, other.shape)

res_values = op(self.astype("O"), np.asarray(other))

ext_arr = pd_array(res_values.ravel())
result = cast(np.ndarray, extract_array(ext_arr, extract_numpy=True))
result = result.reshape(self.shape)
return result
return res_values

def _accumulate(self, name: str, *, skipna: bool = True, **kwargs):

Expand Down
61 changes: 19 additions & 42 deletions pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@
from pandas._libs.tslibs.fields import get_timedelta_field
from pandas._libs.tslibs.timedeltas import (
array_to_timedelta64,
floordiv_object_array,
ints_to_pytimedelta,
parse_timedelta_unit,
truediv_object_array,
)
from pandas._typing import (
AxisInt,
Expand Down Expand Up @@ -63,6 +65,7 @@
from pandas.core.arrays import datetimelike as dtl
from pandas.core.arrays._ranges import generate_regular_range
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.ops.common import unpack_zerodim_and_defer

if TYPE_CHECKING:
Expand Down Expand Up @@ -528,30 +531,13 @@ def __truediv__(self, other):
return self._ndarray / other

elif is_object_dtype(other.dtype):
# We operate on raveled arrays to avoid problems in inference
# on NaT
# TODO: tests with non-nano
srav = self.ravel()
orav = other.ravel()
result_list = [srav[n] / orav[n] for n in range(len(srav))]
result = np.array(result_list).reshape(self.shape)

# We need to do dtype inference in order to keep DataFrame ops
# behavior consistent with Series behavior
inferred = lib.infer_dtype(result, skipna=False)
if inferred == "timedelta":
flat = result.ravel()
result = type(self)._from_sequence(flat).reshape(result.shape)
elif inferred == "floating":
result = result.astype(float)
elif inferred == "datetime":
# GH#39750 this occurs when result is all-NaT, in which case
# we want to interpret these NaTs as td64.
# We construct an all-td64NaT result.
# error: Incompatible types in assignment (expression has type
# "TimedeltaArray", variable has type "ndarray[Any,
# dtype[floating[_64Bit]]]")
result = self * np.nan # type: ignore[assignment]
other = extract_array(other, extract_numpy=True)
if self.ndim > 1:
res_cols = [left / right for left, right in zip(self, other)]
res_cols2 = [x.reshape(1, -1) for x in res_cols]
result = np.concatenate(res_cols2, axis=0)
else:
result = truediv_object_array(self._ndarray, other)

return result

Expand Down Expand Up @@ -652,24 +638,15 @@ def __floordiv__(self, other):
return result

elif is_object_dtype(other.dtype):
# error: Incompatible types in assignment (expression has type
# "List[Any]", variable has type "ndarray")
srav = self.ravel()
orav = other.ravel()
res_list = [srav[n] // orav[n] for n in range(len(srav))]
result_flat = np.asarray(res_list)
inferred = lib.infer_dtype(result_flat, skipna=False)

result = result_flat.reshape(self.shape)

if inferred == "timedelta":
result, _ = sequence_to_td64ns(result)
return type(self)(result)
if inferred == "datetime":
# GH#39750 occurs when result is all-NaT, which in this
# case should be interpreted as td64nat. This can only
# occur when self is all-td64nat
return self * np.nan
other = extract_array(other, extract_numpy=True)
if self.ndim > 1:
res_cols = [left // right for left, right in zip(self, other)]
res_cols2 = [x.reshape(1, -1) for x in res_cols]
result = np.concatenate(res_cols2, axis=0)
else:
result = floordiv_object_array(self._ndarray, other)

assert result.dtype == object
return result

elif is_integer_dtype(other.dtype) or is_float_dtype(other.dtype):
Expand Down
25 changes: 10 additions & 15 deletions pandas/tests/arithmetic/test_datetime64.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,6 @@
date_range,
)
import pandas._testing as tm
from pandas.core.arrays import (
DatetimeArray,
TimedeltaArray,
)
from pandas.core.ops import roperator
from pandas.tests.arithmetic.common import (
assert_cannot_add,
Expand Down Expand Up @@ -1023,7 +1019,7 @@ def test_dt64arr_sub_dt64object_array(self, box_with_array, tz_naive_fixture):
expected = dti - dti

obj = tm.box_expected(dti, box_with_array)
expected = tm.box_expected(expected, box_with_array)
expected = tm.box_expected(expected, box_with_array).astype(object)

with tm.assert_produces_warning(PerformanceWarning):
result = obj - obj.astype(object)
Expand Down Expand Up @@ -1572,10 +1568,13 @@ def test_dt64arr_add_sub_offset_array(

other = np.array([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)])
expected = DatetimeIndex([op(dti[n], other[n]) for n in range(len(dti))])
expected = tm.box_expected(expected, box_with_array)
expected = tm.box_expected(expected, box_with_array).astype(object)

if box_other:
other = tm.box_expected(other, box_with_array)
if box_with_array is pd.array and op is roperator.radd:
# We expect a PandasArray, not ndarray[object] here
expected = pd.array(expected, dtype=object)

with tm.assert_produces_warning(PerformanceWarning):
res = op(dtarr, other)
Expand Down Expand Up @@ -2373,7 +2372,7 @@ def test_dti_addsub_offset_arraylike(
expected = DatetimeIndex(
[op(dti[n], other[n]) for n in range(len(dti))], name=names[2], freq="infer"
)
expected = tm.box_expected(expected, xbox)
expected = tm.box_expected(expected, xbox).astype(object)
tm.assert_equal(res, expected)

@pytest.mark.parametrize("other_box", [pd.Index, np.array])
Expand All @@ -2388,14 +2387,14 @@ def test_dti_addsub_object_arraylike(
xbox = get_upcast_box(dtarr, other)

expected = DatetimeIndex(["2017-01-31", "2017-01-06"], tz=tz_naive_fixture)
expected = tm.box_expected(expected, xbox)
expected = tm.box_expected(expected, xbox).astype(object)

with tm.assert_produces_warning(PerformanceWarning):
result = dtarr + other
tm.assert_equal(result, expected)

expected = DatetimeIndex(["2016-12-31", "2016-12-29"], tz=tz_naive_fixture)
expected = tm.box_expected(expected, xbox)
expected = tm.box_expected(expected, xbox).astype(object)

with tm.assert_produces_warning(PerformanceWarning):
result = dtarr - other
Expand Down Expand Up @@ -2435,15 +2434,11 @@ def test_dt64arr_addsub_object_dtype_2d():
with tm.assert_produces_warning(PerformanceWarning):
expected = (dta[:, 0] + other[:, 0]).reshape(-1, 1)

assert isinstance(result, DatetimeArray)
assert result.freq is None
tm.assert_numpy_array_equal(result._ndarray, expected._ndarray)
tm.assert_numpy_array_equal(result, expected)

with tm.assert_produces_warning(PerformanceWarning):
# Case where we expect to get a TimedeltaArray back
result2 = dta - dta.astype(object)

assert isinstance(result2, TimedeltaArray)
assert result2.shape == (4, 1)
assert result2.freq is None
assert (result2.asi8 == 0).all()
assert all(td.value == 0 for td in result2.ravel())
2 changes: 2 additions & 0 deletions pandas/tests/arithmetic/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,8 @@ def test_add_sub_datetimedeltalike_invalid(
r"operand type\(s\) all returned NotImplemented from __array_ufunc__",
"can only perform ops with numeric values",
"cannot subtract DatetimeArray from ndarray",
# pd.Timedelta(1) + Index([0, 1, 2])
"Cannot add or subtract Timedelta from integers",
]
)
assert_invalid_addsub_type(left, other, msg)
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/arithmetic/test_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ def test_operators_na_handling(self):
@pytest.mark.parametrize("dtype", [None, object])
def test_series_with_dtype_radd_timedelta(self, dtype):
# note this test is _not_ aimed at timedelta64-dtyped Series
# as of 2.0 we retain object dtype when ser.dtype == object
ser = Series(
[pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.Timedelta("3 days")],
dtype=dtype,
Expand Down
7 changes: 4 additions & 3 deletions pandas/tests/arithmetic/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -839,7 +839,7 @@ def test_pi_add_offset_array(self, box):
pd.offsets.QuarterEnd(n=-2, startingMonth=12),
]
)
expected = PeriodIndex([Period("2015Q2"), Period("2015Q4")])
expected = PeriodIndex([Period("2015Q2"), Period("2015Q4")]).astype(object)

with tm.assert_produces_warning(PerformanceWarning):
res = pi + offs
Expand Down Expand Up @@ -872,6 +872,7 @@ def test_pi_sub_offset_array(self, box):
)

expected = PeriodIndex([pi[n] - other[n] for n in range(len(pi))])
expected = expected.astype(object)

with tm.assert_produces_warning(PerformanceWarning):
res = pi - other
Expand Down Expand Up @@ -1301,13 +1302,13 @@ def test_parr_add_sub_object_array(self):

expected = PeriodIndex(
["2001-01-01", "2001-01-03", "2001-01-05"], freq="D"
).array
)._data.astype(object)
tm.assert_equal(result, expected)

with tm.assert_produces_warning(PerformanceWarning):
result = parr - other

expected = PeriodIndex(["2000-12-30"] * 3, freq="D").array
expected = PeriodIndex(["2000-12-30"] * 3, freq="D")._data.astype(object)
tm.assert_equal(result, expected)


Expand Down
Loading