diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index c76555f9ef417..267b3bd72cbfc 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -249,6 +249,7 @@ Other API changes - Passing a sequence containing ``datetime`` objects and ``date`` objects to :class:`Series` constructor will return with ``object`` dtype instead of ``datetime64[ns]`` dtype, consistent with :class:`Index` behavior (:issue:`49341`) - Passing strings that cannot be parsed as datetimes to :class:`Series` or :class:`DataFrame` with ``dtype="datetime64[ns]"`` will raise instead of silently ignoring the keyword and returning ``object`` dtype (:issue:`24435`) - Passing a sequence containing a type that cannot be converted to :class:`Timedelta` to :func:`to_timedelta` or to the :class:`Series` or :class:`DataFrame` constructor with ``dtype="timedelta64[ns]"`` or to :class:`TimedeltaIndex` now raises ``TypeError`` instead of ``ValueError`` (:issue:`49525`) +- Changed behavior of :class:`Index` constructor with sequence containing at least one ``NaT`` and everything else either ``None`` or ``NaN`` to infer ``datetime64[ns]`` dtype instead of ``object``, matching :class:`Series` behavior (:issue:`49340`) - .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1b871bf0b745f..ffe5ec67bf17f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2454,6 +2454,16 @@ def maybe_convert_objects(ndarray[object] objects, object val float64_t fnan = np.nan + if dtype_if_all_nat is not None: + # in practice we don't expect to ever pass dtype_if_all_nat + # without both convert_datetime and convert_timedelta, so disallow + # it to avoid needing to handle it below. 
+ if not convert_datetime or not convert_timedelta: + raise ValueError( + "Cannot specify 'dtype_if_all_nat' without convert_datetime=True " + "and convert_timedelta=True" + ) + n = len(objects) floats = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_FLOAT64, 0) @@ -2491,7 +2501,7 @@ def maybe_convert_objects(ndarray[object] objects, if not (convert_datetime or convert_timedelta or convert_period): seen.object_ = True break - elif val is np.nan: + elif util.is_nan(val): seen.nan_ = True mask[i] = True floats[i] = complexes[i] = val @@ -2641,6 +2651,38 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True + if seen.nat_: + if not seen.object_ and not seen.numeric_ and not seen.bool_: + # all NaT, None, or nan (at least one NaT) + # see GH#49340 for discussion of desired behavior + dtype = dtype_if_all_nat + if cnp.PyArray_DescrCheck(dtype): + # i.e. isinstance(dtype, np.dtype) + if dtype.kind not in ["m", "M"]: + raise ValueError(dtype) + else: + res = np.empty((objects).shape, dtype=dtype) + res[:] = NPY_NAT + return res + elif dtype is not None: + # EA, we don't expect to get here, but _could_ implement + raise NotImplementedError(dtype) + elif convert_datetime and convert_timedelta: + # we don't guess + seen.object_ = True + elif convert_datetime: + res = np.empty((objects).shape, dtype="M8[ns]") + res[:] = NPY_NAT + return res + elif convert_timedelta: + res = np.empty((objects).shape, dtype="m8[ns]") + res[:] = NPY_NAT + return res + else: + seen.object_ = True + else: + seen.object_ = True + if not seen.object_: result = None if not safe: @@ -2666,20 +2708,6 @@ def maybe_convert_objects(ndarray[object] objects, elif seen.timedelta_: if not seen.numeric_: result = timedeltas - elif seen.nat_: - if not seen.numeric_: - if convert_datetime and convert_timedelta: - dtype = dtype_if_all_nat - if dtype is not None: - # otherwise we keep object dtype - result = _infer_all_nats( - dtype, datetimes, timedeltas - ) - - elif convert_datetime: - result = 
datetimes - elif convert_timedelta: - result = timedeltas else: if seen.complex_: result = complexes @@ -2711,20 +2739,6 @@ def maybe_convert_objects(ndarray[object] objects, elif seen.timedelta_: if not seen.numeric_: result = timedeltas - elif seen.nat_: - if not seen.numeric_: - if convert_datetime and convert_timedelta: - dtype = dtype_if_all_nat - if dtype is not None: - # otherwise we keep object dtype - result = _infer_all_nats( - dtype, datetimes, timedeltas - ) - - elif convert_datetime: - result = datetimes - elif convert_timedelta: - result = timedeltas else: if seen.complex_: if not seen.int_: @@ -2751,27 +2765,6 @@ def maybe_convert_objects(ndarray[object] objects, return objects -cdef _infer_all_nats(dtype, ndarray datetimes, ndarray timedeltas): - """ - If we have all-NaT values, cast these to the given dtype. - """ - if cnp.PyArray_DescrCheck(dtype): - # i.e. isinstance(dtype, np.dtype): - if dtype == "M8[ns]": - result = datetimes - elif dtype == "m8[ns]": - result = timedeltas - else: - raise ValueError(dtype) - else: - # ExtensionDtype - cls = dtype.construct_array_type() - i8vals = cnp.PyArray_EMPTY(1, datetimes.shape, cnp.NPY_INT64, 0) - i8vals.fill(NPY_NAT) - result = cls(i8vals, dtype=dtype) - return result - - class _NoDefault(Enum): # We make this an Enum # 1) because it round-trips through pickle correctly (see GH#40397) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 793f407b78714..a931e1d875908 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1231,34 +1231,23 @@ def maybe_infer_to_datetimelike( if not len(v): return value - inferred_type = lib.infer_datetimelike_array(ensure_object(v)) - - if inferred_type in ["period", "interval", "timedelta", "datetime"]: - # Incompatible return value type (got "Union[ExtensionArray, ndarray]", - # expected "Union[ndarray, DatetimeArray, TimedeltaArray, PeriodArray, - # IntervalArray]") - return lib.maybe_convert_objects( # type: 
ignore[return-value] - v, - convert_period=True, - convert_interval=True, - convert_timedelta=True, - convert_datetime=True, - dtype_if_all_nat=np.dtype("M8[ns]"), - ) - - elif inferred_type == "nat": - # if all NaT, return as datetime - # only reached if we have at least 1 NaT and the rest (NaT or None or np.nan) - # This is slightly different from what we'd get with maybe_convert_objects, - # which only converts of all-NaT - from pandas.core.arrays.datetimes import sequence_to_datetimes - - # Incompatible types in assignment (expression has type "DatetimeArray", - # variable has type "ndarray[Any, Any]") - value = sequence_to_datetimes(v) # type: ignore[assignment] - assert value.dtype == "M8[ns]" - - return value + out = lib.maybe_convert_objects( + v, + convert_period=True, + convert_interval=True, + convert_timedelta=True, + convert_datetime=True, + dtype_if_all_nat=np.dtype("M8[ns]"), + ) + if out.dtype.kind in ["i", "u", "f", "b", "c"]: + # Here we do not convert numeric dtypes, as if we wanted that, + # numpy would have done it for us. 
+ # See also _maybe_cast_data_without_dtype + return v + # Incompatible return value type (got "Union[ExtensionArray, ndarray[Any, Any]]", + # expected "Union[ndarray[Any, Any], DatetimeArray, TimedeltaArray, PeriodArray, + # IntervalArray]") + return out # type: ignore[return-value] def maybe_cast_to_datetime( diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 0353d9e23c86e..50fe8379ffa06 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -701,6 +701,29 @@ def test_convert_int_overflow(self, value): result = lib.maybe_convert_objects(arr) tm.assert_numpy_array_equal(arr, result) + @pytest.mark.parametrize("val", [None, np.nan, float("nan")]) + @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"]) + def test_maybe_convert_objects_nat_inference(self, val, dtype): + dtype = np.dtype(dtype) + vals = np.array([pd.NaT, val], dtype=object) + result = lib.maybe_convert_objects( + vals, + convert_datetime=True, + convert_timedelta=True, + dtype_if_all_nat=dtype, + ) + assert result.dtype == dtype + assert np.isnat(result).all() + + result = lib.maybe_convert_objects( + vals[::-1], + convert_datetime=True, + convert_timedelta=True, + dtype_if_all_nat=dtype, + ) + assert result.dtype == dtype + assert np.isnat(result).all() + @pytest.mark.parametrize( "value, expected_dtype", [ diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index 9a57e3e08a59c..b718c33e666d7 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -39,6 +39,24 @@ class TestIndexConstructorInference: + @pytest.mark.parametrize("val", [NaT, None, np.nan, float("nan")]) + def test_infer_nat(self, val): + # GH#49340 all NaT/None/nan and at least 1 NaT -> datetime64[ns], + # matching Series behavior + values = [NaT, val] + + idx = Index(values) + assert idx.dtype == "datetime64[ns]" and idx.isna().all() + + idx = Index(values[::-1]) 
+ assert idx.dtype == "datetime64[ns]" and idx.isna().all() + + idx = Index(np.array(values, dtype=object)) + assert idx.dtype == "datetime64[ns]" and idx.isna().all() + + idx = Index(np.array(values, dtype=object)[::-1]) + assert idx.dtype == "datetime64[ns]" and idx.isna().all() + @pytest.mark.parametrize("na_value", [None, np.nan]) @pytest.mark.parametrize("vtype", [list, tuple, iter]) def test_construction_list_tuples_nan(self, na_value, vtype):