Skip to content

REF: to_datetime handle non-object cases outside cython #50263

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandas/_libs/tslib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def format_array_from_datetime(
reso: int = ..., # NPY_DATETIMEUNIT
) -> npt.NDArray[np.object_]: ...
def array_with_unit_to_datetime(
values: np.ndarray,
values: npt.NDArray[np.object_],
unit: str,
errors: str = ...,
) -> tuple[np.ndarray, tzinfo | None]: ...
Expand Down
57 changes: 5 additions & 52 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ import_datetime()

cimport numpy as cnp
from numpy cimport (
float64_t,
int64_t,
ndarray,
)
Expand Down Expand Up @@ -231,7 +230,7 @@ def format_array_from_datetime(


def array_with_unit_to_datetime(
ndarray values,
ndarray[object] values,
str unit,
str errors="coerce"
):
Expand Down Expand Up @@ -266,70 +265,24 @@ def array_with_unit_to_datetime(
cdef:
Py_ssize_t i, n=len(values)
int64_t mult
int prec = 0
ndarray[float64_t] fvalues
bint is_ignore = errors=="ignore"
bint is_coerce = errors=="coerce"
bint is_raise = errors=="raise"
bint need_to_iterate = True
ndarray[int64_t] iresult
ndarray[object] oresult
ndarray mask
object tz = None

assert is_ignore or is_coerce or is_raise

if unit == "ns":
if issubclass(values.dtype.type, (np.integer, np.float_)):
result = values.astype("M8[ns]", copy=False)
else:
result, tz = array_to_datetime(
values.astype(object, copy=False),
errors=errors,
)
result, tz = array_to_datetime(
values.astype(object, copy=False),
errors=errors,
)
return result, tz

mult, _ = precision_from_unit(unit)

if is_raise:
# try a quick conversion to i8/f8
# if we have nulls that are not type-compat
# then need to iterate

if values.dtype.kind in ["i", "f", "u"]:
iresult = values.astype("i8", copy=False)
# fill missing values by comparing to NPY_NAT
mask = iresult == NPY_NAT
# Trying to Convert NaN to integer results in undefined
# behaviour, so handle it explicitly (see GH #48705)
if values.dtype.kind == "f":
mask |= values != values
iresult[mask] = 0
fvalues = iresult.astype("f8") * mult
need_to_iterate = False

if not need_to_iterate:
# check the bounds
if (fvalues < Timestamp.min.value).any() or (
(fvalues > Timestamp.max.value).any()
):
raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'")

if values.dtype.kind in ["i", "u"]:
result = (iresult * mult).astype("M8[ns]")

elif values.dtype.kind == "f":
fresult = (values * mult).astype("f8")
fresult[mask] = 0
if prec:
fresult = round(fresult, prec)
result = fresult.astype("M8[ns]", copy=False)

iresult = result.view("i8")
iresult[mask] = NPY_NAT

return result, tz

result = np.empty(n, dtype="M8[ns]")
iresult = result.view("i8")

Expand Down
45 changes: 44 additions & 1 deletion pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,13 @@
OutOfBoundsDatetime,
Timedelta,
Timestamp,
astype_overflowsafe,
iNaT,
nat_strings,
parsing,
timezones as libtimezones,
)
from pandas._libs.tslibs.conversion import precision_from_unit
from pandas._libs.tslibs.parsing import (
DateParseError,
format_is_iso,
Expand Down Expand Up @@ -557,7 +559,48 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
tz_parsed = None
else:
arg = np.asarray(arg)
arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)

if arg.dtype.kind in ["i", "u"]:
# Note we can't do "f" here because that could induce unwanted
# rounding GH#14156, GH#20445
arr = arg.astype(f"datetime64[{unit}]", copy=False)
try:
arr = astype_overflowsafe(arr, np.dtype("M8[ns]"), copy=False)
except OutOfBoundsDatetime:
if errors == "raise":
raise
arg = arg.astype(object)
return _to_datetime_with_unit(arg, unit, name, utc, errors)
tz_parsed = None

elif arg.dtype.kind == "f":
mult, _ = precision_from_unit(unit)

iresult = arg.astype("i8")
mask = np.isnan(arg) | (arg == iNaT)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are we confident that np.isnan(arg) addresses the issue for which values != values needed to be added? if so, just for my understanding, why could it not have been used in tslib.pyx? I'm guessing it was for the performance penalty of calling a Python function within Cython?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are we confident that np.isnan(arg) addresses the issue for which values != values needed to be added?

im pretty confident, yes

why could it not have been used in tslib.pyx? I'm guessing it was for the performance penalty of calling a Python function within Cython?

it should work just fine in cython too.

Copy link
Member

@MarcoGorelli MarcoGorelli Dec 15, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this be

Suggested change
mask = np.isnan(arg) | (arg == iNaT)
mask = np.isnan(arg) | (iresult == iNaT)

?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the only test which seems to require that is pandas/tests/tools/test_to_datetime.py::TestToDatetimeUnit::test_to_datetime_unit_fractional_seconds, and it doesn't seem to make a difference there so maybe it's OK

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldnt make a difference

iresult[mask] = 0

fvalues = iresult.astype("f8") * mult

if (fvalues < Timestamp.min.value).any() or (
fvalues > Timestamp.max.value
).any():
if errors != "raise":
arg = arg.astype(object)
return _to_datetime_with_unit(arg, unit, name, utc, errors)
raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'")

# TODO: is fresult meaningfully different from fvalues?
fresult = (arg * mult).astype("f8")
fresult[mask] = 0

arr = fresult.astype("M8[ns]", copy=False)
arr[mask] = np.datetime64("NaT", "ns")

tz_parsed = None
else:
arg = arg.astype(object, copy=False)
arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)

if errors == "ignore":
# Index constructor _may_ infer to DatetimeIndex
Expand Down