Skip to content

Commit 719e02d

Browse files
jbrockmendel authored and pmhatre1 committed
DEPR: to_datetime string behavior with unit (pandas-dev#58407)
* DEPR: to_datetime string behavior with unit
* remove outdated test
1 parent 9dee7f3 commit 719e02d

File tree

5 files changed

+35
-141
lines changed

5 files changed

+35
-141
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,7 @@ Removal of prior version deprecations/changes
214214
- :func:`concat` no longer ignores empty objects when determining output dtypes (:issue:`39122`)
215215
- :func:`concat` with all-NA entries no longer ignores the dtype of those entries when determining the result dtype (:issue:`40893`)
216216
- :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`)
217+
- :func:`to_datetime` with a ``unit`` specified no longer parses strings into floats; instead, it parses them the same way as it does without ``unit`` (:issue:`50735`)
217218
- :meth:`DataFrame.groupby` with ``as_index=False`` and aggregation methods will no longer exclude from the result the groupings that do not arise from the input (:issue:`49519`)
218219
- :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`)
219220
- :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`)

pandas/_libs/tslib.pyi

+1-5
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,6 @@ def format_array_from_datetime(
1111
na_rep: str | float = ...,
1212
reso: int = ..., # NPY_DATETIMEUNIT
1313
) -> npt.NDArray[np.object_]: ...
14-
def array_with_unit_to_datetime(
15-
values: npt.NDArray[np.object_],
16-
unit: str,
17-
errors: str = ...,
18-
) -> tuple[np.ndarray, tzinfo | None]: ...
1914
def first_non_null(values: np.ndarray) -> int: ...
2015
def array_to_datetime(
2116
values: npt.NDArray[np.object_],
@@ -24,6 +19,7 @@ def array_to_datetime(
2419
yearfirst: bool = ...,
2520
utc: bool = ...,
2621
creso: int = ...,
22+
unit_for_numerics: str | None = ...,
2723
) -> tuple[np.ndarray, tzinfo | None]: ...
2824

2925
# returned ndarray may be object dtype or datetime64[ns]

pandas/_libs/tslib.pyx

+11-116
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,3 @@
1-
import warnings
2-
3-
from pandas.util._exceptions import find_stack_level
4-
51
cimport cython
62

73
from datetime import timezone
@@ -234,117 +230,6 @@ def format_array_from_datetime(
234230
return result
235231

236232

237-
def array_with_unit_to_datetime(
238-
ndarray[object] values,
239-
str unit,
240-
str errors="coerce"
241-
):
242-
"""
243-
Convert the ndarray to datetime according to the time unit.
244-
245-
This function converts an array of objects into a numpy array of
246-
datetime64[ns]. It returns the converted array
247-
and also returns the timezone offset
248-
249-
if errors:
250-
- raise: return converted values or raise OutOfBoundsDatetime
251-
if out of range on the conversion or
252-
ValueError for other conversions (e.g. a string)
253-
- ignore: return non-convertible values as the same unit
254-
- coerce: NaT for non-convertibles
255-
256-
Parameters
257-
----------
258-
values : ndarray
259-
Date-like objects to convert.
260-
unit : str
261-
Time unit to use during conversion.
262-
errors : str, default 'raise'
263-
Error behavior when parsing.
264-
265-
Returns
266-
-------
267-
result : ndarray of m8 values
268-
tz : parsed timezone offset or None
269-
"""
270-
cdef:
271-
Py_ssize_t i, n=len(values)
272-
bint is_coerce = errors == "coerce"
273-
bint is_raise = errors == "raise"
274-
ndarray[int64_t] iresult
275-
tzinfo tz = None
276-
double fval
277-
278-
assert is_coerce or is_raise
279-
280-
if unit == "ns":
281-
result, tz = array_to_datetime(
282-
values.astype(object, copy=False),
283-
errors=errors,
284-
creso=NPY_FR_ns,
285-
)
286-
return result, tz
287-
288-
result = np.empty(n, dtype="M8[ns]")
289-
iresult = result.view("i8")
290-
291-
for i in range(n):
292-
val = values[i]
293-
294-
try:
295-
if checknull_with_nat_and_na(val):
296-
iresult[i] = NPY_NAT
297-
298-
elif is_integer_object(val) or is_float_object(val):
299-
300-
if val != val or val == NPY_NAT:
301-
iresult[i] = NPY_NAT
302-
else:
303-
iresult[i] = cast_from_unit(val, unit)
304-
305-
elif isinstance(val, str):
306-
if len(val) == 0 or val in nat_strings:
307-
iresult[i] = NPY_NAT
308-
309-
else:
310-
311-
try:
312-
fval = float(val)
313-
except ValueError:
314-
raise ValueError(
315-
f"non convertible value {val} with the unit '{unit}'"
316-
)
317-
warnings.warn(
318-
"The behavior of 'to_datetime' with 'unit' when parsing "
319-
"strings is deprecated. In a future version, strings will "
320-
"be parsed as datetime strings, matching the behavior "
321-
"without a 'unit'. To retain the old behavior, explicitly "
322-
"cast ints or floats to numeric type before calling "
323-
"to_datetime.",
324-
FutureWarning,
325-
stacklevel=find_stack_level(),
326-
)
327-
328-
iresult[i] = cast_from_unit(fval, unit)
329-
330-
else:
331-
# TODO: makes more sense as TypeError, but that would be an
332-
# API change.
333-
raise ValueError(
334-
f"unit='{unit}' not valid with non-numerical val='{val}'"
335-
)
336-
337-
except (ValueError, TypeError) as err:
338-
if is_raise:
339-
err.args = (f"{err}, at position {i}",)
340-
raise
341-
else:
342-
# is_coerce
343-
iresult[i] = NPY_NAT
344-
345-
return result, tz
346-
347-
348233
@cython.wraparound(False)
349234
@cython.boundscheck(False)
350235
def first_non_null(values: ndarray) -> int:
@@ -376,6 +261,7 @@ cpdef array_to_datetime(
376261
bint yearfirst=False,
377262
bint utc=False,
378263
NPY_DATETIMEUNIT creso=NPY_FR_ns,
264+
str unit_for_numerics=None,
379265
):
380266
"""
381267
Converts a 1D array of date-like values to a numpy array of either:
@@ -404,6 +290,7 @@ cpdef array_to_datetime(
404290
indicator whether the dates should be UTC
405291
creso : NPY_DATETIMEUNIT, default NPY_FR_ns
406292
Set to NPY_FR_GENERIC to infer a resolution.
293+
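unit_for_numerics : str or None, default None
Unit used to interpret numeric values. If None, the unit corresponding to ``creso`` is used.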
407294
408295
Returns
409296
-------
@@ -434,6 +321,13 @@ cpdef array_to_datetime(
434321
abbrev = "ns"
435322
else:
436323
abbrev = npy_unit_to_abbrev(creso)
324+
325+
if unit_for_numerics is not None:
326+
# either creso or unit_for_numerics should be passed, not both
327+
assert creso == NPY_FR_ns
328+
else:
329+
unit_for_numerics = abbrev
330+
437331
result = np.empty((<object>values).shape, dtype=f"M8[{abbrev}]")
438332
iresult = result.view("i8").ravel()
439333

@@ -485,7 +379,8 @@ cpdef array_to_datetime(
485379
creso = state.creso
486380

487381
# we now need to parse this as if unit=unit_for_numerics
488-
iresult[i] = cast_from_unit(val, abbrev, out_reso=creso)
382+
iresult[i] = cast_from_unit(val, unit_for_numerics, out_reso=creso)
383+
489384
state.found_other = True
490385

491386
elif isinstance(val, str):

pandas/core/tools/datetimes.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -481,7 +481,7 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
481481
"""
482482
arg = extract_array(arg, extract_numpy=True)
483483

484-
# GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime
484+
# GH#30050 pass an ndarray to tslib.array_to_datetime
485485
# because it expects an ndarray argument
486486
if isinstance(arg, IntegerArray):
487487
arr = arg.astype(f"datetime64[{unit}]")
@@ -519,7 +519,12 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
519519
tz_parsed = None
520520
else:
521521
arg = arg.astype(object, copy=False)
522-
arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
522+
arr, tz_parsed = tslib.array_to_datetime(
523+
arg,
524+
utc=utc,
525+
errors=errors,
526+
unit_for_numerics=unit,
527+
)
523528

524529
result = DatetimeIndex(arr, name=name)
525530
if not isinstance(result, DatetimeIndex):

pandas/tests/tools/test_to_datetime.py

+15-18
Original file line numberDiff line numberDiff line change
@@ -1705,22 +1705,24 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit):
17051705
# GH#50301
17061706
# Match Timestamp behavior in disallowing non-round floats with
17071707
# Y or M unit
1708-
warn_msg = "strings will be parsed as datetime strings"
17091708
msg = f"Conversion of non-round float with unit={unit} is ambiguous"
17101709
with pytest.raises(ValueError, match=msg):
17111710
to_datetime([1.5], unit=unit, errors="raise")
17121711
with pytest.raises(ValueError, match=msg):
17131712
to_datetime(np.array([1.5]), unit=unit, errors="raise")
1713+
1714+
msg = r"Given date string \"1.5\" not likely a datetime, at position 0"
17141715
with pytest.raises(ValueError, match=msg):
1715-
with tm.assert_produces_warning(FutureWarning, match=warn_msg):
1716-
to_datetime(["1.5"], unit=unit, errors="raise")
1716+
to_datetime(["1.5"], unit=unit, errors="raise")
17171717

17181718
res = to_datetime([1.5], unit=unit, errors="coerce")
17191719
expected = Index([NaT], dtype="M8[ns]")
17201720
tm.assert_index_equal(res, expected)
17211721

1722-
with tm.assert_produces_warning(FutureWarning, match=warn_msg):
1723-
res = to_datetime(["1.5"], unit=unit, errors="coerce")
1722+
# In 3.0, the string "1.5" is parsed as it would be without unit,
1723+
# which fails. With errors="coerce" this becomes NaT.
1724+
res = to_datetime(["1.5"], unit=unit, errors="coerce")
1725+
expected = to_datetime([NaT])
17241726
tm.assert_index_equal(res, expected)
17251727

17261728
# round floats are OK
@@ -1735,17 +1737,6 @@ def test_unit(self, cache):
17351737
with pytest.raises(ValueError, match=msg):
17361738
to_datetime([1], unit="D", format="%Y%m%d", cache=cache)
17371739

1738-
def test_unit_str(self, cache):
1739-
# GH 57051
1740-
# Test that strs aren't dropping precision to 32-bit accidentally.
1741-
with tm.assert_produces_warning(
1742-
FutureWarning,
1743-
match="'to_datetime' with 'unit' when parsing strings is deprecated",
1744-
):
1745-
res = to_datetime(["1704660000"], unit="s", origin="unix")
1746-
expected = to_datetime([1704660000], unit="s", origin="unix")
1747-
tm.assert_index_equal(res, expected)
1748-
17491740
def test_unit_array_mixed_nans(self, cache):
17501741
values = [11111111111111111, 1, 1.0, iNaT, NaT, np.nan, "NaT", ""]
17511742

@@ -1774,7 +1765,7 @@ def test_unit_array_mixed_nans_large_int(self, cache):
17741765
def test_to_datetime_invalid_str_not_out_of_bounds_valuerror(self, cache):
17751766
# if we have a string, then we raise a ValueError
17761767
# and NOT an OutOfBoundsDatetime
1777-
msg = "non convertible value foo with the unit 's'"
1768+
msg = "Unknown datetime string format, unable to parse: foo, at position 0"
17781769
with pytest.raises(ValueError, match=msg):
17791770
to_datetime("foo", errors="raise", unit="s", cache=cache)
17801771

@@ -1909,7 +1900,13 @@ def test_to_datetime_unit_na_values(self):
19091900

19101901
@pytest.mark.parametrize("bad_val", ["foo", 111111111])
19111902
def test_to_datetime_unit_invalid(self, bad_val):
1912-
msg = f"{bad_val} with the unit 'D'"
1903+
if bad_val == "foo":
1904+
msg = (
1905+
"Unknown datetime string format, unable to parse: "
1906+
f"{bad_val}, at position 2"
1907+
)
1908+
else:
1909+
msg = "cannot convert input 111111111 with the unit 'D', at position 2"
19131910
with pytest.raises(ValueError, match=msg):
19141911
to_datetime([1, 2, bad_val], unit="D")
19151912

0 commit comments

Comments
 (0)