Skip to content

BUG: to_datetime with Y or M unit not matching Timestamp #50870

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -951,6 +951,8 @@ Datetimelike
- Bug in :func:`Timestamp.utctimetuple` raising a ``TypeError`` (:issue:`32174`)
- Bug in :func:`to_datetime` was raising ``ValueError`` when parsing mixed-offset :class:`Timestamp` with ``errors='ignore'`` (:issue:`50585`)
- Bug in :func:`to_datetime` was incorrectly handling floating-point inputs within 1 ``unit`` of the overflow boundaries (:issue:`50183`)
- Bug in :func:`to_datetime` with unit of "Y" or "M" giving incorrect results, not matching pointwise :class:`Timestamp` results (:issue:`50870`)
-

Timedelta
^^^^^^^^^
Expand Down
19 changes: 3 additions & 16 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -220,19 +220,6 @@ def format_array_from_datetime(
return result


cdef int64_t _wrapped_cast_from_unit(object val, str unit) except? -1:
"""
Call cast_from_unit and re-raise OverflowError as OutOfBoundsDatetime
"""
# See also timedeltas._maybe_cast_from_unit
try:
return cast_from_unit(val, unit)
except OverflowError as err:
raise OutOfBoundsDatetime(
f"cannot convert input {val} with the unit '{unit}'"
) from err


def array_with_unit_to_datetime(
ndarray[object] values,
str unit,
Expand Down Expand Up @@ -302,7 +289,7 @@ def array_with_unit_to_datetime(
if val != val or val == NPY_NAT:
iresult[i] = NPY_NAT
else:
iresult[i] = _wrapped_cast_from_unit(val, unit)
iresult[i] = cast_from_unit(val, unit)

elif isinstance(val, str):
if len(val) == 0 or val in nat_strings:
Expand All @@ -317,7 +304,7 @@ def array_with_unit_to_datetime(
f"non convertible value {val} with the unit '{unit}'"
)

iresult[i] = _wrapped_cast_from_unit(fval, unit)
iresult[i] = cast_from_unit(fval, unit)

else:
# TODO: makes more sense as TypeError, but that would be an
Expand Down Expand Up @@ -362,7 +349,7 @@ cdef _array_with_unit_to_datetime_object_fallback(ndarray[object] values, str un
else:
try:
oresult[i] = Timestamp(val, unit=unit)
except OverflowError:
except OutOfBoundsDatetime:
oresult[i] = val

elif isinstance(val, str):
Expand Down
53 changes: 30 additions & 23 deletions pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -108,22 +108,41 @@ cdef int64_t cast_from_unit(object ts, str unit) except? -1:
if ts is None:
return m

if unit in ["Y", "M"] and is_float_object(ts) and not ts.is_integer():
# GH#47267 it is clear that 2 "M" corresponds to 1970-02-01,
# but not clear what 2.5 "M" corresponds to, so we will
# disallow that case.
raise ValueError(
f"Conversion of non-round float with unit={unit} "
"is ambiguous"
)
if unit in ["Y", "M"]:
if is_float_object(ts) and not ts.is_integer():
# GH#47267 it is clear that 2 "M" corresponds to 1970-02-01,
# but not clear what 2.5 "M" corresponds to, so we will
# disallow that case.
raise ValueError(
f"Conversion of non-round float with unit={unit} "
"is ambiguous"
)
# GH#47266 go through np.datetime64 to avoid weird results e.g. with "Y"
# and 150 we'd get 2120-01-01 09:00:00
if is_float_object(ts):
ts = int(ts)
dt64obj = np.datetime64(ts, unit)
return get_datetime64_nanos(dt64obj, NPY_FR_ns)

# cast the unit, multiply base/frac separately
# to avoid precision issues from float -> int
base = <int64_t>ts
try:
base = <int64_t>ts
except OverflowError as err:
raise OutOfBoundsDatetime(
f"cannot convert input {ts} with the unit '{unit}'"
) from err

frac = ts - base
if p:
frac = round(frac, p)
return <int64_t>(base * m) + <int64_t>(frac * m)

try:
return <int64_t>(base * m) + <int64_t>(frac * m)
except OverflowError as err:
raise OutOfBoundsDatetime(
f"cannot convert input {ts} with the unit '{unit}'"
) from err


cpdef inline (int64_t, int) precision_from_unit(str unit):
Expand Down Expand Up @@ -278,25 +297,13 @@ cdef _TSObject convert_to_tsobject(object ts, tzinfo tz, str unit,
if ts == NPY_NAT:
obj.value = NPY_NAT
else:
if unit in ["Y", "M"]:
# GH#47266 cast_from_unit leads to weird results e.g. with "Y"
# and 150 we'd get 2120-01-01 09:00:00
ts = np.datetime64(ts, unit)
return convert_to_tsobject(ts, tz, None, False, False)

ts = ts * cast_from_unit(None, unit)
ts = cast_from_unit(ts, unit)
obj.value = ts
pandas_datetime_to_datetimestruct(ts, NPY_FR_ns, &obj.dts)
elif is_float_object(ts):
if ts != ts or ts == NPY_NAT:
obj.value = NPY_NAT
else:
if unit in ["Y", "M"]:
if ts == int(ts):
# GH#47266 Avoid cast_from_unit, which would give weird results
# e.g. with "Y" and 150.0 we'd get 2120-01-01 09:00:00
return convert_to_tsobject(int(ts), tz, unit, False, False)

ts = cast_from_unit(ts, unit)
obj.value = ts
pandas_datetime_to_datetimestruct(ts, NPY_FR_ns, &obj.dts)
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/timedeltas.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ cdef _maybe_cast_from_unit(ts, str unit):
# assert unit not in ["Y", "y", "M"]
try:
ts = cast_from_unit(ts, unit)
except OverflowError as err:
except OutOfBoundsDatetime as err:
raise OutOfBoundsTimedelta(
f"Cannot cast {ts} from {unit} to 'ns' without overflow."
) from err
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/scalar/timestamp/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,15 @@ def test_constructor_from_date_second_reso(self):
ts = Timestamp(obj)
assert ts.unit == "s"

@pytest.mark.parametrize("typ", [int, float])
def test_construct_from_int_float_with_unit_out_of_bound_raises(self, typ):
# GH#50870 make sure we get an OutOfBoundsDatetime instead of OverflowError
val = typ(150000000)

msg = f"cannot convert input {val} with the unit 'D'"
with pytest.raises(OutOfBoundsDatetime, match=msg):
Timestamp(val, unit="D")

@pytest.mark.parametrize("typ", [int, float])
def test_constructor_int_float_with_YM_unit(self, typ):
# GH#47266 avoid the conversions in cast_from_unit
Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1701,6 +1701,23 @@ def test_to_datetime_fixed_offset(self):


class TestToDatetimeUnit:
@pytest.mark.parametrize("unit", ["Y", "M"])
@pytest.mark.parametrize("item", [150, float(150)])
def test_to_datetime_month_or_year_unit_int(self, cache, unit, item):
# GH#50870 Note we have separate tests that pd.Timestamp gets these right
ts = Timestamp(item, unit=unit)
expected = DatetimeIndex([ts])

result = to_datetime([item], unit=unit, cache=cache)
tm.assert_index_equal(result, expected)

# TODO: this should also work
# result = to_datetime(np.array([item]), unit=unit, cache=cache)
# tm.assert_index_equal(result, expected)

result = to_datetime(np.array([item], dtype=object), unit=unit, cache=cache)
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize("unit", ["Y", "M"])
def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit):
# GH#50301
Expand Down