Skip to content

Commit 34f6c7e

Browse files
Backport PR #31520: REGR: to_datetime, unique with OOB values (#31540)
Co-authored-by: Tom Augspurger <[email protected]>
1 parent 0dfcdc8 commit 34f6c7e

File tree

5 files changed

+44
-12
lines changed

5 files changed

+44
-12
lines changed

doc/source/whatsnew/v1.0.1.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ Categorical
2525

2626
Datetimelike
2727
^^^^^^^^^^^^
28-
-
29-
-
28+
- Fixed regression in :meth:`to_datetime` when parsing non-nanosecond resolution datetimes (:issue:`31491`)
29+
- Fixed bug in :meth:`to_datetime` raising when ``cache=True`` and out-of-bounds values are present (:issue:`31491`)
3030

3131
Timedelta
3232
^^^^^^^^^

pandas/core/algorithms.py

+6
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
is_categorical_dtype,
3030
is_complex_dtype,
3131
is_datetime64_any_dtype,
32+
is_datetime64_dtype,
3233
is_datetime64_ns_dtype,
3334
is_extension_array_dtype,
3435
is_float_dtype,
@@ -191,6 +192,11 @@ def _reconstruct_data(values, dtype, original):
191192
if isinstance(original, ABCIndexClass):
192193
values = values.astype(object, copy=False)
193194
elif dtype is not None:
195+
if is_datetime64_dtype(dtype):
196+
dtype = "datetime64[ns]"
197+
elif is_timedelta64_dtype(dtype):
198+
dtype = "timedelta64[ns]"
199+
194200
values = values.astype(dtype, copy=False)
195201

196202
return values

pandas/core/tools/datetimes.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -603,7 +603,9 @@ def to_datetime(
603603
cache : bool, default True
604604
If True, use a cache of unique, converted dates to apply the datetime
605605
conversion. May produce significant speed-up when parsing duplicate
606-
date strings, especially ones with timezone offsets.
606+
date strings, especially ones with timezone offsets. The cache is only
607+
used when there are at least 50 values. The presence of out-of-bounds
608+
values will render the cache unusable and may slow down parsing.
607609
608610
.. versionadded:: 0.23.0
609611
@@ -735,7 +737,17 @@ def to_datetime(
735737
convert_listlike = partial(convert_listlike, name=arg.name)
736738
result = convert_listlike(arg, format)
737739
elif is_list_like(arg):
738-
cache_array = _maybe_cache(arg, format, cache, convert_listlike)
740+
try:
741+
cache_array = _maybe_cache(arg, format, cache, convert_listlike)
742+
except tslibs.OutOfBoundsDatetime:
743+
# caching attempts to create a DatetimeIndex, which may raise
744+
# an OutOfBoundsDatetime error. If that's the desired behavior, then just re-raise...
745+
if errors == "raise":
746+
raise
747+
# ... otherwise, continue without the cache.
748+
from pandas import Series
749+
750+
cache_array = Series([], dtype=object) # just an empty array
739751
if not cache_array.empty:
740752
result = _convert_and_box_cache(arg, cache_array)
741753
else:

pandas/tests/indexes/datetimes/test_tools.py

+10-8
Original file line numberDiff line numberDiff line change
@@ -559,9 +559,14 @@ def test_to_datetime_dt64s_out_of_bounds(self, cache, dt):
559559
assert pd.to_datetime(dt, errors="coerce", cache=cache) is NaT
560560

561561
@pytest.mark.parametrize("cache", [True, False])
562-
def test_to_datetime_array_of_dt64s(self, cache):
563-
dts = [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")]
564-
562+
@pytest.mark.parametrize("unit", ["s", "D"])
563+
def test_to_datetime_array_of_dt64s(self, cache, unit):
564+
# https://github.com/pandas-dev/pandas/issues/31491
565+
# Need at least 50 to ensure cache is used.
566+
dts = [
567+
np.datetime64("2000-01-01", unit),
568+
np.datetime64("2000-01-02", unit),
569+
] * 30
565570
# Assuming all datetimes are in bounds, to_datetime() returns
566571
# an array that is equal to Timestamp() parsing
567572
tm.assert_index_equal(
@@ -579,11 +584,8 @@ def test_to_datetime_array_of_dt64s(self, cache):
579584
tm.assert_index_equal(
580585
pd.to_datetime(dts_with_oob, errors="coerce", cache=cache),
581586
pd.DatetimeIndex(
582-
[
583-
Timestamp(dts_with_oob[0]).asm8,
584-
Timestamp(dts_with_oob[1]).asm8,
585-
pd.NaT,
586-
]
587+
[Timestamp(dts_with_oob[0]).asm8, Timestamp(dts_with_oob[1]).asm8] * 30
588+
+ [pd.NaT],
587589
),
588590
)
589591

pandas/tests/test_algos.py

+12
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,18 @@ def test_datetime64_dtype_array_returned(self):
420420
tm.assert_numpy_array_equal(result, expected)
421421
assert result.dtype == expected.dtype
422422

423+
def test_datetime_non_ns(self):
424+
a = np.array(["2000", "2000", "2001"], dtype="datetime64[s]")
425+
result = pd.unique(a)
426+
expected = np.array(["2000", "2001"], dtype="datetime64[ns]")
427+
tm.assert_numpy_array_equal(result, expected)
428+
429+
def test_timedelta_non_ns(self):
430+
a = np.array(["2000", "2000", "2001"], dtype="timedelta64[s]")
431+
result = pd.unique(a)
432+
expected = np.array([2000000000000, 2001000000000], dtype="timedelta64[ns]")
433+
tm.assert_numpy_array_equal(result, expected)
434+
423435
def test_timedelta64_dtype_array_returned(self):
424436
# GH 9431
425437
expected = np.array([31200, 45678, 10000], dtype="m8[ns]")

0 commit comments

Comments
 (0)