diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 1894ce4ee12d9..d7d4538cbe4e0 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -951,6 +951,8 @@ Datetimelike - Bug in :func:`Timestamp.utctimetuple` raising a ``TypeError`` (:issue:`32174`) - Bug in :func:`to_datetime` was raising ``ValueError`` when parsing mixed-offset :class:`Timestamp` with ``errors='ignore'`` (:issue:`50585`) - Bug in :func:`to_datetime` was incorrectly handling floating-point inputs within 1 ``unit`` of the overflow boundaries (:issue:`50183`) +- Bug in :func:`to_datetime` with unit of "Y" or "M" giving incorrect results, not matching pointwise :class:`Timestamp` results (:issue:`50870`) +- Timedelta ^^^^^^^^^ diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 5f7fb05876b35..3dcb56c76c4a7 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -220,19 +220,6 @@ def format_array_from_datetime( return result -cdef int64_t _wrapped_cast_from_unit(object val, str unit) except? 
-1: - """ - Call cast_from_unit and re-raise OverflowError as OutOfBoundsDatetime - """ - # See also timedeltas._maybe_cast_from_unit - try: - return cast_from_unit(val, unit) - except OverflowError as err: - raise OutOfBoundsDatetime( - f"cannot convert input {val} with the unit '{unit}'" - ) from err - - def array_with_unit_to_datetime( ndarray[object] values, str unit, @@ -302,7 +289,7 @@ def array_with_unit_to_datetime( if val != val or val == NPY_NAT: iresult[i] = NPY_NAT else: - iresult[i] = _wrapped_cast_from_unit(val, unit) + iresult[i] = cast_from_unit(val, unit) elif isinstance(val, str): if len(val) == 0 or val in nat_strings: @@ -317,7 +304,7 @@ def array_with_unit_to_datetime( f"non convertible value {val} with the unit '{unit}'" ) - iresult[i] = _wrapped_cast_from_unit(fval, unit) + iresult[i] = cast_from_unit(fval, unit) else: # TODO: makes more sense as TypeError, but that would be an @@ -362,7 +349,7 @@ cdef _array_with_unit_to_datetime_object_fallback(ndarray[object] values, str un else: try: oresult[i] = Timestamp(val, unit=unit) - except OverflowError: + except OutOfBoundsDatetime: oresult[i] = val elif isinstance(val, str): diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 30f60c392167b..5b636ff69a6a6 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -108,22 +108,41 @@ cdef int64_t cast_from_unit(object ts, str unit) except? -1: if ts is None: return m - if unit in ["Y", "M"] and is_float_object(ts) and not ts.is_integer(): - # GH#47267 it is clear that 2 "M" corresponds to 1970-02-01, - # but not clear what 2.5 "M" corresponds to, so we will - # disallow that case. 
- raise ValueError( - f"Conversion of non-round float with unit={unit} " - "is ambiguous" - ) + if unit in ["Y", "M"]: + if is_float_object(ts) and not ts.is_integer(): + # GH#47267 it is clear that 2 "M" corresponds to 1970-02-01, + # but not clear what 2.5 "M" corresponds to, so we will + # disallow that case. + raise ValueError( + f"Conversion of non-round float with unit={unit} " + "is ambiguous" + ) + # GH#47266 go through np.datetime64 to avoid weird results e.g. with "Y" + # and 150 we'd get 2120-01-01 09:00:00 + if is_float_object(ts): + ts = int(ts) + dt64obj = np.datetime64(ts, unit) + return get_datetime64_nanos(dt64obj, NPY_FR_ns) # cast the unit, multiply base/frace separately # to avoid precision issues from float -> int - base = ts + try: + base = ts + except OverflowError as err: + raise OutOfBoundsDatetime( + f"cannot convert input {ts} with the unit '{unit}'" + ) from err + frac = ts - base if p: frac = round(frac, p) - return (base * m) + (frac * m) + + try: + return (base * m) + (frac * m) + except OverflowError as err: + raise OutOfBoundsDatetime( + f"cannot convert input {ts} with the unit '{unit}'" + ) from err cpdef inline (int64_t, int) precision_from_unit(str unit): @@ -278,25 +297,13 @@ cdef _TSObject convert_to_tsobject(object ts, tzinfo tz, str unit, if ts == NPY_NAT: obj.value = NPY_NAT else: - if unit in ["Y", "M"]: - # GH#47266 cast_from_unit leads to weird results e.g. with "Y" - # and 150 we'd get 2120-01-01 09:00:00 - ts = np.datetime64(ts, unit) - return convert_to_tsobject(ts, tz, None, False, False) - - ts = ts * cast_from_unit(None, unit) + ts = cast_from_unit(ts, unit) obj.value = ts pandas_datetime_to_datetimestruct(ts, NPY_FR_ns, &obj.dts) elif is_float_object(ts): if ts != ts or ts == NPY_NAT: obj.value = NPY_NAT else: - if unit in ["Y", "M"]: - if ts == int(ts): - # GH#47266 Avoid cast_from_unit, which would give weird results - # e.g. 
with "Y" and 150.0 we'd get 2120-01-01 09:00:00 - return convert_to_tsobject(int(ts), tz, unit, False, False) - ts = cast_from_unit(ts, unit) obj.value = ts pandas_datetime_to_datetimestruct(ts, NPY_FR_ns, &obj.dts) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 8f9dd1fe02c19..feae4d1c28f83 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -373,7 +373,7 @@ cdef _maybe_cast_from_unit(ts, str unit): # assert unit not in ["Y", "y", "M"] try: ts = cast_from_unit(ts, unit) - except OverflowError as err: + except OutOfBoundsDatetime as err: raise OutOfBoundsTimedelta( f"Cannot cast {ts} from {unit} to 'ns' without overflow." ) from err diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index 4b7d2bc4b57d4..c6ceb2fcb0ebd 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -55,6 +55,15 @@ def test_constructor_from_date_second_reso(self): ts = Timestamp(obj) assert ts.unit == "s" + @pytest.mark.parametrize("typ", [int, float]) + def test_construct_from_int_float_with_unit_out_of_bound_raises(self, typ): + # GH#50870 make sure we get an OutOfBoundsDatetime instead of OverflowError + val = typ(150000000) + + msg = f"cannot convert input {val} with the unit 'D'" + with pytest.raises(OutOfBoundsDatetime, match=msg): + Timestamp(val, unit="D") + @pytest.mark.parametrize("typ", [int, float]) def test_constructor_int_float_with_YM_unit(self, typ): # GH#47266 avoid the conversions in cast_from_unit diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 3509c82d2af6d..dfbe78e53de40 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1701,6 +1701,23 @@ def test_to_datetime_fixed_offset(self): class TestToDatetimeUnit: + @pytest.mark.parametrize("unit", ["Y", "M"]) + 
@pytest.mark.parametrize("item", [150, float(150)]) + def test_to_datetime_month_or_year_unit_int(self, cache, unit, item): + # GH#50870 Note we have separate tests that pd.Timestamp gets these right + ts = Timestamp(item, unit=unit) + expected = DatetimeIndex([ts]) + + result = to_datetime([item], unit=unit, cache=cache) + tm.assert_index_equal(result, expected) + + # TODO: this should also work + # result = to_datetime(np.array([item]), unit=unit, cache=cache) + # tm.assert_index_equal(result, expected) + + result = to_datetime(np.array([item], dtype=object), unit=unit, cache=cache) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("unit", ["Y", "M"]) def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): # GH#50301