diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 06620c2ad0dca..b84cacfd74840 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -11,7 +11,10 @@ from typing import ( import numpy as np -from pandas._typing import ArrayLike +from pandas._typing import ( + ArrayLike, + DtypeObj, +) # placeholder until we can specify np.ndarray[object, ndim=2] ndarray_obj_2d = np.ndarray @@ -73,6 +76,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: Literal[False] = ..., convert_to_nullable_integer: Literal[False] = ..., + dtype_if_all_nat: DtypeObj | None = ..., ) -> np.ndarray: ... @overload @@ -85,6 +89,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: bool = ..., convert_to_nullable_integer: Literal[True] = ..., + dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @overload @@ -97,6 +102,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: bool = ..., convert_to_nullable_integer: bool = ..., + dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @overload @@ -109,6 +115,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: Literal[True] = ..., convert_to_nullable_integer: bool = ..., + dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @overload @@ -121,6 +128,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: bool = ..., convert_to_nullable_integer: bool = ..., + dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @overload diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4d184ee13e3db..73ff3b85ca46b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -84,6 +84,10 @@ from pandas._libs.util cimport ( ) from pandas._libs.tslib import array_to_datetime +from pandas._libs.tslibs import ( + OutOfBoundsDatetime, + OutOfBoundsTimedelta, +) from pandas._libs.tslibs.period import Period from pandas._libs.missing cimport ( @@ -1640,7 +1644,8 @@ def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]: # convert *every* string array if len(objs): try: - array_to_datetime(objs, errors="raise") + # require_iso8601 as in maybe_infer_to_datetimelike + array_to_datetime(objs, errors="raise", require_iso8601=True) return "datetime", seen_str except (ValueError, TypeError): pass @@ -2322,7 +2327,8 @@ def maybe_convert_objects(ndarray[object] objects, bint convert_timedelta=False, bint convert_period=False, bint convert_interval=False, - bint convert_to_nullable_integer=False) -> "ArrayLike": + bint convert_to_nullable_integer=False, + object dtype_if_all_nat=None) -> "ArrayLike": """ Type inference function-- convert object array to proper dtype @@ -2351,6 +2357,8 @@ def maybe_convert_objects(ndarray[object] objects, convert_to_nullable_integer : bool, default False If an array-like object contains only integer values (and NaN) is encountered, whether to convert and return an IntegerArray. + dtype_if_all_nat : np.dtype, ExtensionDtype, or None, default None + Dtype to cast to if we have all-NaT. Returns ------- @@ -2419,8 +2427,12 @@ def maybe_convert_objects(ndarray[object] objects, seen.float_ = True elif is_timedelta(val): if convert_timedelta: - itimedeltas[i] = convert_to_timedelta64(val, "ns").view("i8") seen.timedelta_ = True + try: + itimedeltas[i] = convert_to_timedelta64(val, "ns").view("i8") + except OutOfBoundsTimedelta: + seen.object_ = True + break else: seen.object_ = True break @@ -2457,8 +2469,12 @@ def maybe_convert_objects(ndarray[object] objects, break else: seen.datetime_ = True - idatetimes[i] = convert_to_tsobject( - val, None, None, 0, 0).value + try: + idatetimes[i] = convert_to_tsobject( + val, None, None, 0, 0).value + except OutOfBoundsDatetime: + seen.object_ = True + break else: seen.object_ = True break @@ -2546,8 +2562,13 @@ def maybe_convert_objects(ndarray[object] objects, elif seen.nat_: if not seen.numeric_: if convert_datetime and convert_timedelta: - # TODO: array full of NaT ambiguity resolve here needed - pass + dtype = dtype_if_all_nat + if dtype is not None: + # otherwise we keep object dtype + result = _infer_all_nats( + dtype, datetimes, timedeltas + ) + elif convert_datetime: result = datetimes elif convert_timedelta: @@ -2586,8 +2607,13 @@ def maybe_convert_objects(ndarray[object] objects, elif seen.nat_: if not seen.numeric_: if convert_datetime and convert_timedelta: - # TODO: array full of NaT ambiguity resolve here needed - pass + dtype = dtype_if_all_nat + if dtype is not None: + # otherwise we keep object dtype + result = _infer_all_nats( + dtype, datetimes, timedeltas + ) + elif convert_datetime: result = datetimes elif convert_timedelta: @@ -2618,6 +2644,26 @@ def maybe_convert_objects(ndarray[object] objects, return objects +cdef _infer_all_nats(dtype, ndarray datetimes, ndarray timedeltas): + """ + If we have all-NaT values, cast these to the given dtype. + """ + if isinstance(dtype, np.dtype): + if dtype == "M8[ns]": + result = datetimes + elif dtype == "m8[ns]": + result = timedeltas + else: + raise ValueError(dtype) + else: + # ExtensionDtype + cls = dtype.construct_array_type() + i8vals = np.empty(len(datetimes), dtype="i8") + i8vals.fill(NPY_NAT) + result = cls(i8vals, dtype=dtype) + return result + + class NoDefault(Enum): # We make this an Enum # 1) because it round-trips through pickle correctly (see GH#40397) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 7e0b26391e132..3c541a309e42a 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -665,6 +665,57 @@ def test_maybe_convert_objects_datetime(self): ) tm.assert_numpy_array_equal(out, exp) + def test_maybe_convert_objects_dtype_if_all_nat(self): + arr = np.array([pd.NaT, pd.NaT], dtype=object) + out = lib.maybe_convert_objects( + arr, convert_datetime=True, convert_timedelta=True + ) + # no dtype_if_all_nat passed -> we dont guess + tm.assert_numpy_array_equal(out, arr) + + out = lib.maybe_convert_objects( + arr, + convert_datetime=True, + convert_timedelta=True, + dtype_if_all_nat=np.dtype("timedelta64[ns]"), + ) + exp = np.array(["NaT", "NaT"], dtype="timedelta64[ns]") + tm.assert_numpy_array_equal(out, exp) + + out = lib.maybe_convert_objects( + arr, + convert_datetime=True, + convert_timedelta=True, + dtype_if_all_nat=np.dtype("datetime64[ns]"), + ) + exp = np.array(["NaT", "NaT"], dtype="datetime64[ns]") + tm.assert_numpy_array_equal(out, exp) + + def test_maybe_convert_objects_dtype_if_all_nat_invalid(self): + # we accept datetime64[ns], timedelta64[ns], and EADtype + arr = np.array([pd.NaT, pd.NaT], dtype=object) + + with pytest.raises(ValueError, match="int64"): + lib.maybe_convert_objects( + arr, + convert_datetime=True, + convert_timedelta=True, + dtype_if_all_nat=np.dtype("int64"), + ) + + @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) + def test_maybe_convert_objects_datetime_overflow_safe(self, dtype): + stamp = datetime(2363, 10, 4) # Enterprise-D launch date + if dtype == "timedelta64[ns]": + stamp = stamp - datetime(1970, 1, 1) + arr = np.array([stamp], dtype=object) + + out = lib.maybe_convert_objects( + arr, convert_datetime=True, convert_timedelta=True + ) + # no OutOfBoundsDatetime/OutOfBoundsTimedeltas + tm.assert_numpy_array_equal(out, arr) + def test_maybe_convert_objects_timedelta64_nat(self): obj = np.timedelta64("NaT", "ns") arr = np.array([obj], dtype=object)