Skip to content

Commit efca541

Browse files
committed
REGR: to_datetime, unique with OOB values
Closes pandas-dev#31491
1 parent ba08390 commit efca541

File tree

5 files changed

+43
-12
lines changed

5 files changed

+43
-12
lines changed

doc/source/whatsnew/v1.0.1.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ Categorical
2525

2626
Datetimelike
2727
^^^^^^^^^^^^
28-
-
29-
-
28+
- Fixed regression in :meth:`to_datetime` when parsing non-nanosecond resolution datetimes (:issue:`31491`)
29+
- Fixed bug in :meth:`to_datetime` raising when ``cache=True`` and out-of-bounds values are present (:issue:`31491`)
3030

3131
Timedelta
3232
^^^^^^^^^

pandas/core/algorithms.py

+6
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
is_categorical_dtype,
3030
is_complex_dtype,
3131
is_datetime64_any_dtype,
32+
is_datetime64_dtype,
3233
is_datetime64_ns_dtype,
3334
is_extension_array_dtype,
3435
is_float_dtype,
@@ -191,6 +192,11 @@ def _reconstruct_data(values, dtype, original):
191192
if isinstance(original, ABCIndexClass):
192193
values = values.astype(object, copy=False)
193194
elif dtype is not None:
195+
if is_datetime64_dtype(dtype):
196+
dtype = "datetime64[ns]"
197+
elif is_timedelta64_dtype(dtype):
198+
dtype = "timedelta64[ns]"
199+
194200
values = values.astype(dtype, copy=False)
195201

196202
return values

pandas/core/tools/datetimes.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -602,7 +602,9 @@ def to_datetime(
602602
cache : bool, default True
603603
If True, use a cache of unique, converted dates to apply the datetime
604604
conversion. May produce significant speed-up when parsing duplicate
605-
date strings, especially ones with timezone offsets.
605+
date strings, especially ones with timezone offsets. The cache is only
606+
used when there are at least 50 values. The presence of out-of-bounds
607+
values will render the cache unusable and may slow down parsing.
606608
607609
.. versionadded:: 0.23.0
608610
@@ -734,7 +736,17 @@ def to_datetime(
734736
convert_listlike = partial(convert_listlike, name=arg.name)
735737
result = convert_listlike(arg, format)
736738
elif is_list_like(arg):
737-
cache_array = _maybe_cache(arg, format, cache, convert_listlike)
739+
try:
740+
cache_array = _maybe_cache(arg, format, cache, convert_listlike)
741+
except tslibs.OutOfBoundsDatetime:
742+
# caching attempts to create a DatetimeIndex, which may raise
743+
# OutOfBoundsDatetime. If that's the desired behavior, then just re-raise...
744+
if errors == "raise":
745+
raise
746+
# ... otherwise, continue without the cache.
747+
from pandas import Series
748+
749+
cache_array = Series([], dtype=object) # just an empty array
738750
if not cache_array.empty:
739751
result = _convert_and_box_cache(arg, cache_array)
740752
else:

pandas/tests/indexes/datetimes/test_tools.py

+9-8
Original file line numberDiff line numberDiff line change
@@ -559,9 +559,13 @@ def test_to_datetime_dt64s_out_of_bounds(self, cache, dt):
559559
assert pd.to_datetime(dt, errors="coerce", cache=cache) is NaT
560560

561561
@pytest.mark.parametrize("cache", [True, False])
562-
def test_to_datetime_array_of_dt64s(self, cache):
563-
dts = [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")]
564-
562+
@pytest.mark.parametrize("unit", ["s", "D"])
563+
def test_to_datetime_array_of_dt64s(self, cache, unit):
564+
# Need at least 50 values to ensure the cache is used.
565+
dts = [
566+
np.datetime64("2000-01-01", unit),
567+
np.datetime64("2000-01-02", unit),
568+
] * 30
565569
# Assuming all datetimes are in bounds, to_datetime() returns
566570
# an array that is equal to Timestamp() parsing
567571
tm.assert_index_equal(
@@ -579,11 +583,8 @@ def test_to_datetime_array_of_dt64s(self, cache):
579583
tm.assert_index_equal(
580584
pd.to_datetime(dts_with_oob, errors="coerce", cache=cache),
581585
pd.DatetimeIndex(
582-
[
583-
Timestamp(dts_with_oob[0]).asm8,
584-
Timestamp(dts_with_oob[1]).asm8,
585-
pd.NaT,
586-
]
586+
[Timestamp(dts_with_oob[0]).asm8, Timestamp(dts_with_oob[1]).asm8] * 30
587+
+ [pd.NaT],
587588
),
588589
)
589590

pandas/tests/test_algos.py

+12
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,18 @@ def test_datetime64_dtype_array_returned(self):
420420
tm.assert_numpy_array_equal(result, expected)
421421
assert result.dtype == expected.dtype
422422

423+
def test_datetime_non_ns(self):
424+
a = np.array(["2000", "2000", "2001"], dtype="datetime64[s]")
425+
result = pd.unique(a)
426+
expected = np.array(["2000", "2001"], dtype="datetime64[ns]")
427+
tm.assert_numpy_array_equal(result, expected)
428+
429+
def test_timedelta_non_ns(self):
430+
a = np.array(["2000", "2000", "2001"], dtype="timedelta64[s]")
431+
result = pd.unique(a)
432+
expected = np.array([2000000000000, 2001000000000], dtype="timedelta64[ns]")
433+
tm.assert_numpy_array_equal(result, expected)
434+
423435
def test_timedelta64_dtype_array_returned(self):
424436
# GH 9431
425437
expected = np.array([31200, 45678, 10000], dtype="m8[ns]")

0 commit comments

Comments
 (0)