diff --git a/doc/source/whatsnew/v1.0.1.rst b/doc/source/whatsnew/v1.0.1.rst index ff8433c7cafd9..95fab6a18ffe1 100644 --- a/doc/source/whatsnew/v1.0.1.rst +++ b/doc/source/whatsnew/v1.0.1.rst @@ -25,8 +25,8 @@ Categorical Datetimelike ^^^^^^^^^^^^ -- -- +- Fixed regression in :func:`to_datetime` when parsing non-nanosecond resolution datetimes (:issue:`31491`) +- Fixed bug in :func:`to_datetime` raising when ``cache=True`` and out-of-bounds values are present (:issue:`31491`) Timedelta ^^^^^^^^^ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8af9e2cc9790f..886b0a3c5fec1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -29,6 +29,7 @@ is_categorical_dtype, is_complex_dtype, is_datetime64_any_dtype, + is_datetime64_dtype, is_datetime64_ns_dtype, is_extension_array_dtype, is_float_dtype, @@ -191,6 +192,11 @@ def _reconstruct_data(values, dtype, original): if isinstance(original, ABCIndexClass): values = values.astype(object, copy=False) elif dtype is not None: + if is_datetime64_dtype(dtype): + dtype = "datetime64[ns]" + elif is_timedelta64_dtype(dtype): + dtype = "timedelta64[ns]" + values = values.astype(dtype, copy=False) return values diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 0cf0f943ae442..6d45ddd29d783 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -602,7 +602,9 @@ def to_datetime( cache : bool, default True If True, use a cache of unique, converted dates to apply the datetime conversion. May produce significant speed-up when parsing duplicate - date strings, especially ones with timezone offsets. + date strings, especially ones with timezone offsets. The cache is only + used when there are at least 50 values. The presence of out-of-bounds + values will render the cache unusable and may slow down parsing. .. 
versionadded:: 0.23.0 @@ -734,7 +736,17 @@ def to_datetime( convert_listlike = partial(convert_listlike, name=arg.name) result = convert_listlike(arg, format) elif is_list_like(arg): - cache_array = _maybe_cache(arg, format, cache, convert_listlike) + try: + cache_array = _maybe_cache(arg, format, cache, convert_listlike) + except tslibs.OutOfBoundsDatetime: + # caching attempts to create a DatetimeIndex, which may raise + # OutOfBoundsDatetime. If that's the desired behavior, then just reraise... + if errors == "raise": + raise + # ... otherwise, continue without the cache. + from pandas import Series + + cache_array = Series([], dtype=object) # empty, so the cache is skipped if not cache_array.empty: result = _convert_and_box_cache(arg, cache_array) else: diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 7abf810e6bcfc..df3a49fb7c292 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -559,9 +559,14 @@ def test_to_datetime_dt64s_out_of_bounds(self, cache, dt): assert pd.to_datetime(dt, errors="coerce", cache=cache) is NaT @pytest.mark.parametrize("cache", [True, False]) - def test_to_datetime_array_of_dt64s(self, cache): - dts = [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")] - + @pytest.mark.parametrize("unit", ["s", "D"]) + def test_to_datetime_array_of_dt64s(self, cache, unit): + # https://github.com/pandas-dev/pandas/issues/31491 + # Need at least 50 values to ensure the cache is used. 
+ dts = [ + np.datetime64("2000-01-01", unit), + np.datetime64("2000-01-02", unit), + ] * 30 # Assuming all datetimes are in bounds, to_datetime() returns # an array that is equal to Timestamp() parsing tm.assert_index_equal( @@ -579,11 +584,8 @@ def test_to_datetime_array_of_dt64s(self, cache): tm.assert_index_equal( pd.to_datetime(dts_with_oob, errors="coerce", cache=cache), pd.DatetimeIndex( - [ - Timestamp(dts_with_oob[0]).asm8, - Timestamp(dts_with_oob[1]).asm8, - pd.NaT, - ] + [Timestamp(dts_with_oob[0]).asm8, Timestamp(dts_with_oob[1]).asm8] * 30 + + [pd.NaT], ), ) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6c7f8c9b0475e..a1de9c435c9ba 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -420,6 +420,18 @@ def test_datetime64_dtype_array_returned(self): tm.assert_numpy_array_equal(result, expected) assert result.dtype == expected.dtype + def test_datetime_non_ns(self): + a = np.array(["2000", "2000", "2001"], dtype="datetime64[s]") + result = pd.unique(a) + expected = np.array(["2000", "2001"], dtype="datetime64[ns]") + tm.assert_numpy_array_equal(result, expected) + + def test_timedelta_non_ns(self): + a = np.array(["2000", "2000", "2001"], dtype="timedelta64[s]") + result = pd.unique(a) + expected = np.array([2000000000000, 2001000000000], dtype="timedelta64[ns]") + tm.assert_numpy_array_equal(result, expected) + def test_timedelta64_dtype_array_returned(self): # GH 9431 expected = np.array([31200, 45678, 10000], dtype="m8[ns]")