Skip to content

DEPR: to_datetime string behavior with unit #58407

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ Removal of prior version deprecations/changes
- :func:`concat` no longer ignores empty objects when determining output dtypes (:issue:`39122`)
- :func:`concat` with all-NA entries no longer ignores the dtype of those entries when determining the result dtype (:issue:`40893`)
- :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`)
- :func:`to_datetime` with a ``unit`` specified no longer parses strings into floats, instead parses them the same way as without ``unit`` (:issue:`50735`)
- :meth:`DataFrame.groupby` with ``as_index=False`` and aggregation methods will no longer exclude from the result the groupings that do not arise from the input (:issue:`49519`)
- :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`)
- :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`)
Expand Down
6 changes: 1 addition & 5 deletions pandas/_libs/tslib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,6 @@ def format_array_from_datetime(
na_rep: str | float = ...,
reso: int = ..., # NPY_DATETIMEUNIT
) -> npt.NDArray[np.object_]: ...
def array_with_unit_to_datetime(
values: npt.NDArray[np.object_],
unit: str,
errors: str = ...,
) -> tuple[np.ndarray, tzinfo | None]: ...
def first_non_null(values: np.ndarray) -> int: ...
def array_to_datetime(
values: npt.NDArray[np.object_],
Expand All @@ -24,6 +19,7 @@ def array_to_datetime(
yearfirst: bool = ...,
utc: bool = ...,
creso: int = ...,
unit_for_numerics: str | None = ...,
) -> tuple[np.ndarray, tzinfo | None]: ...

# returned ndarray may be object dtype or datetime64[ns]
Expand Down
127 changes: 11 additions & 116 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
import warnings

from pandas.util._exceptions import find_stack_level

cimport cython

from datetime import timezone
Expand Down Expand Up @@ -234,117 +230,6 @@ def format_array_from_datetime(
return result


def array_with_unit_to_datetime(
ndarray[object] values,
str unit,
str errors="coerce"
):
"""
Convert the ndarray to datetime according to the time unit.

This function converts an array of objects into a numpy array of
datetime64[ns]. It returns the converted array
and also returns the timezone offset

if errors:
- raise: return converted values or raise OutOfBoundsDatetime
if out of range on the conversion or
ValueError for other conversions (e.g. a string)
- ignore: not accepted here; only 'raise' and 'coerce' pass the assertion below
- coerce: NaT for non-convertibles

Parameters
----------
values : ndarray
Date-like objects to convert.
unit : str
Time unit to use during conversion.
errors : str, default 'coerce'
Error behavior when parsing.

Returns
-------
result : ndarray of m8 values
tz : parsed timezone offset or None
"""
cdef:
Py_ssize_t i, n=len(values)
bint is_coerce = errors == "coerce"
bint is_raise = errors == "raise"
ndarray[int64_t] iresult
tzinfo tz = None
double fval

assert is_coerce or is_raise

if unit == "ns":
result, tz = array_to_datetime(
values.astype(object, copy=False),
errors=errors,
creso=NPY_FR_ns,
)
return result, tz

result = np.empty(n, dtype="M8[ns]")
iresult = result.view("i8")

for i in range(n):
val = values[i]

try:
if checknull_with_nat_and_na(val):
iresult[i] = NPY_NAT

elif is_integer_object(val) or is_float_object(val):

if val != val or val == NPY_NAT:
iresult[i] = NPY_NAT
else:
iresult[i] = cast_from_unit(val, unit)

elif isinstance(val, str):
if len(val) == 0 or val in nat_strings:
iresult[i] = NPY_NAT

else:

try:
fval = float(val)
except ValueError:
raise ValueError(
f"non convertible value {val} with the unit '{unit}'"
)
warnings.warn(
"The behavior of 'to_datetime' with 'unit' when parsing "
"strings is deprecated. In a future version, strings will "
"be parsed as datetime strings, matching the behavior "
"without a 'unit'. To retain the old behavior, explicitly "
"cast ints or floats to numeric type before calling "
"to_datetime.",
FutureWarning,
stacklevel=find_stack_level(),
)

iresult[i] = cast_from_unit(fval, unit)

else:
# TODO: makes more sense as TypeError, but that would be an
# API change.
raise ValueError(
f"unit='{unit}' not valid with non-numerical val='{val}'"
)

except (ValueError, TypeError) as err:
if is_raise:
err.args = (f"{err}, at position {i}",)
raise
else:
# is_coerce
iresult[i] = NPY_NAT

return result, tz


@cython.wraparound(False)
@cython.boundscheck(False)
def first_non_null(values: ndarray) -> int:
Expand Down Expand Up @@ -376,6 +261,7 @@ cpdef array_to_datetime(
bint yearfirst=False,
bint utc=False,
NPY_DATETIMEUNIT creso=NPY_FR_ns,
str unit_for_numerics=None,
):
"""
Converts a 1D array of date-like values to a numpy array of either:
Expand Down Expand Up @@ -404,6 +290,7 @@ cpdef array_to_datetime(
indicator whether the dates should be UTC
creso : NPY_DATETIMEUNIT, default NPY_FR_ns
Set to NPY_FR_GENERIC to infer a resolution.
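unit_for_numerics : str or None, default None
Unit passed to cast_from_unit when converting numeric values.
If None, the unit abbreviation corresponding to ``creso`` is used.
Should not be combined with a non-default ``creso``.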

Returns
-------
Expand Down Expand Up @@ -434,6 +321,13 @@ cpdef array_to_datetime(
abbrev = "ns"
else:
abbrev = npy_unit_to_abbrev(creso)

if unit_for_numerics is not None:
# either creso or unit_for_numerics should be passed, not both
assert creso == NPY_FR_ns
else:
unit_for_numerics = abbrev

result = np.empty((<object>values).shape, dtype=f"M8[{abbrev}]")
iresult = result.view("i8").ravel()

Expand Down Expand Up @@ -485,7 +379,8 @@ cpdef array_to_datetime(
creso = state.creso

# we now need to parse this as if unit=unit_for_numerics
# (which defaults to abbrev when the caller did not pass it)
iresult[i] = cast_from_unit(val, abbrev, out_reso=creso)
iresult[i] = cast_from_unit(val, unit_for_numerics, out_reso=creso)

state.found_other = True

elif isinstance(val, str):
Expand Down
9 changes: 7 additions & 2 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
"""
arg = extract_array(arg, extract_numpy=True)

# GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime
# GH#30050 pass an ndarray to tslib.array_to_datetime
# because it expects an ndarray argument
if isinstance(arg, IntegerArray):
arr = arg.astype(f"datetime64[{unit}]")
Expand Down Expand Up @@ -519,7 +519,12 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
tz_parsed = None
else:
arg = arg.astype(object, copy=False)
arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
arr, tz_parsed = tslib.array_to_datetime(
arg,
utc=utc,
errors=errors,
unit_for_numerics=unit,
)

result = DatetimeIndex(arr, name=name)
if not isinstance(result, DatetimeIndex):
Expand Down
33 changes: 15 additions & 18 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1705,22 +1705,24 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit):
# GH#50301
# Match Timestamp behavior in disallowing non-round floats with
# Y or M unit
warn_msg = "strings will be parsed as datetime strings"
msg = f"Conversion of non-round float with unit={unit} is ambiguous"
with pytest.raises(ValueError, match=msg):
to_datetime([1.5], unit=unit, errors="raise")
with pytest.raises(ValueError, match=msg):
to_datetime(np.array([1.5]), unit=unit, errors="raise")

msg = r"Given date string \"1.5\" not likely a datetime, at position 0"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match=warn_msg):
to_datetime(["1.5"], unit=unit, errors="raise")
to_datetime(["1.5"], unit=unit, errors="raise")

res = to_datetime([1.5], unit=unit, errors="coerce")
expected = Index([NaT], dtype="M8[ns]")
tm.assert_index_equal(res, expected)

with tm.assert_produces_warning(FutureWarning, match=warn_msg):
res = to_datetime(["1.5"], unit=unit, errors="coerce")
# In 3.0, the string "1.5" is parsed as it would be without unit,
# which fails. With errors="coerce" this becomes NaT.
res = to_datetime(["1.5"], unit=unit, errors="coerce")
expected = to_datetime([NaT])
tm.assert_index_equal(res, expected)

# round floats are OK
Expand All @@ -1735,17 +1737,6 @@ def test_unit(self, cache):
with pytest.raises(ValueError, match=msg):
to_datetime([1], unit="D", format="%Y%m%d", cache=cache)

def test_unit_str(self, cache):
# GH 57051
# Test that strs aren't dropping precision to 32-bit accidentally.
with tm.assert_produces_warning(
FutureWarning,
match="'to_datetime' with 'unit' when parsing strings is deprecated",
):
res = to_datetime(["1704660000"], unit="s", origin="unix")
expected = to_datetime([1704660000], unit="s", origin="unix")
tm.assert_index_equal(res, expected)

def test_unit_array_mixed_nans(self, cache):
values = [11111111111111111, 1, 1.0, iNaT, NaT, np.nan, "NaT", ""]

Expand Down Expand Up @@ -1774,7 +1765,7 @@ def test_unit_array_mixed_nans_large_int(self, cache):
def test_to_datetime_invalid_str_not_out_of_bounds_valuerror(self, cache):
# if we have a string, then we raise a ValueError
# and NOT an OutOfBoundsDatetime
msg = "non convertible value foo with the unit 's'"
msg = "Unknown datetime string format, unable to parse: foo, at position 0"
with pytest.raises(ValueError, match=msg):
to_datetime("foo", errors="raise", unit="s", cache=cache)

Expand Down Expand Up @@ -1909,7 +1900,13 @@ def test_to_datetime_unit_na_values(self):

@pytest.mark.parametrize("bad_val", ["foo", 111111111])
def test_to_datetime_unit_invalid(self, bad_val):
msg = f"{bad_val} with the unit 'D'"
if bad_val == "foo":
msg = (
"Unknown datetime string format, unable to parse: "
f"{bad_val}, at position 2"
)
else:
msg = "cannot convert input 111111111 with the unit 'D', at position 2"
with pytest.raises(ValueError, match=msg):
to_datetime([1, 2, bad_val], unit="D")

Expand Down