diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 59cc709359a8d..dee793f5ef002 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -214,6 +214,7 @@ Removal of prior version deprecations/changes - :func:`concat` no longer ignores empty objects when determining output dtypes (:issue:`39122`) - :func:`concat` with all-NA entries no longer ignores the dtype of those entries when determining the result dtype (:issue:`40893`) - :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`) +- :func:`to_datetime` with a ``unit`` specified no longer parses strings into floats, instead parses them the same way as without ``unit`` (:issue:`50735`) - :meth:`DataFrame.groupby` with ``as_index=False`` and aggregation methods will no longer exclude from the result the groupings that do not arise from the input (:issue:`49519`) - :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`) - :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`) diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 5a340c1d88bc4..7e3372a80db9d 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -11,11 +11,6 @@ def format_array_from_datetime( na_rep: str | float = ..., reso: int = ..., # NPY_DATETIMEUNIT ) -> npt.NDArray[np.object_]: ... -def array_with_unit_to_datetime( - values: npt.NDArray[np.object_], - unit: str, - errors: str = ..., -) -> tuple[np.ndarray, tzinfo | None]: ... def first_non_null(values: np.ndarray) -> int: ... 
def array_to_datetime( values: npt.NDArray[np.object_], @@ -24,6 +19,7 @@ def array_to_datetime( yearfirst: bool = ..., utc: bool = ..., creso: int = ..., + unit_for_numerics: str | None = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... # returned ndarray may be object dtype or datetime64[ns] diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index aecf9f2e46bd4..dca3ba0ce49b3 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -1,7 +1,3 @@ -import warnings - -from pandas.util._exceptions import find_stack_level - cimport cython from datetime import timezone @@ -234,117 +230,6 @@ def format_array_from_datetime( return result -def array_with_unit_to_datetime( - ndarray[object] values, - str unit, - str errors="coerce" -): - """ - Convert the ndarray to datetime according to the time unit. - - This function converts an array of objects into a numpy array of - datetime64[ns]. It returns the converted array - and also returns the timezone offset - - if errors: - - raise: return converted values or raise OutOfBoundsDatetime - if out of range on the conversion or - ValueError for other conversions (e.g. a string) - - ignore: return non-convertible values as the same unit - - coerce: NaT for non-convertibles - - Parameters - ---------- - values : ndarray - Date-like objects to convert. - unit : str - Time unit to use during conversion. - errors : str, default 'raise' - Error behavior when parsing. 
- - Returns - ------- - result : ndarray of m8 values - tz : parsed timezone offset or None - """ - cdef: - Py_ssize_t i, n=len(values) - bint is_coerce = errors == "coerce" - bint is_raise = errors == "raise" - ndarray[int64_t] iresult - tzinfo tz = None - double fval - - assert is_coerce or is_raise - - if unit == "ns": - result, tz = array_to_datetime( - values.astype(object, copy=False), - errors=errors, - creso=NPY_FR_ns, - ) - return result, tz - - result = np.empty(n, dtype="M8[ns]") - iresult = result.view("i8") - - for i in range(n): - val = values[i] - - try: - if checknull_with_nat_and_na(val): - iresult[i] = NPY_NAT - - elif is_integer_object(val) or is_float_object(val): - - if val != val or val == NPY_NAT: - iresult[i] = NPY_NAT - else: - iresult[i] = cast_from_unit(val, unit) - - elif isinstance(val, str): - if len(val) == 0 or val in nat_strings: - iresult[i] = NPY_NAT - - else: - - try: - fval = float(val) - except ValueError: - raise ValueError( - f"non convertible value {val} with the unit '{unit}'" - ) - warnings.warn( - "The behavior of 'to_datetime' with 'unit' when parsing " - "strings is deprecated. In a future version, strings will " - "be parsed as datetime strings, matching the behavior " - "without a 'unit'. To retain the old behavior, explicitly " - "cast ints or floats to numeric type before calling " - "to_datetime.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - iresult[i] = cast_from_unit(fval, unit) - - else: - # TODO: makes more sense as TypeError, but that would be an - # API change. 
- raise ValueError( - f"unit='{unit}' not valid with non-numerical val='{val}'" - ) - - except (ValueError, TypeError) as err: - if is_raise: - err.args = (f"{err}, at position {i}",) - raise - else: - # is_coerce - iresult[i] = NPY_NAT - - return result, tz - - @cython.wraparound(False) @cython.boundscheck(False) def first_non_null(values: ndarray) -> int: @@ -376,6 +261,7 @@ cpdef array_to_datetime( bint yearfirst=False, bint utc=False, NPY_DATETIMEUNIT creso=NPY_FR_ns, + str unit_for_numerics=None, ): """ Converts a 1D array of date-like values to a numpy array of either: @@ -404,6 +290,7 @@ cpdef array_to_datetime( indicator whether the dates should be UTC creso : NPY_DATETIMEUNIT, default NPY_FR_ns Set to NPY_FR_GENERIC to infer a resolution. + unit_for_numerics : str or None, default None Returns ------- @@ -434,6 +321,13 @@ cpdef array_to_datetime( abbrev = "ns" else: abbrev = npy_unit_to_abbrev(creso) + + if unit_for_numerics is not None: + # either creso or unit_for_numerics should be passed, not both + assert creso == NPY_FR_ns + else: + unit_for_numerics = abbrev + result = np.empty((values).shape, dtype=f"M8[{abbrev}]") iresult = result.view("i8").ravel() @@ -485,7 +379,8 @@ cpdef array_to_datetime( creso = state.creso # we now need to parse this as if unit=abbrev - iresult[i] = cast_from_unit(val, abbrev, out_reso=creso) + iresult[i] = cast_from_unit(val, unit_for_numerics, out_reso=creso) + state.found_other = True elif isinstance(val, str): diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index df7a6cdb1ea52..b01cdb335ec46 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -481,7 +481,7 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: """ arg = extract_array(arg, extract_numpy=True) - # GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime + # GH#30050 pass an ndarray to tslib.array_to_datetime # because it expects an ndarray argument if isinstance(arg,
IntegerArray): arr = arg.astype(f"datetime64[{unit}]") @@ -519,7 +519,12 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: tz_parsed = None else: arg = arg.astype(object, copy=False) - arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) + arr, tz_parsed = tslib.array_to_datetime( + arg, + utc=utc, + errors=errors, + unit_for_numerics=unit, + ) result = DatetimeIndex(arr, name=name) if not isinstance(result, DatetimeIndex): diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 7ce02c12ac1ca..f4042acd05dc3 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1705,22 +1705,24 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): # GH#50301 # Match Timestamp behavior in disallowing non-round floats with # Y or M unit - warn_msg = "strings will be parsed as datetime strings" msg = f"Conversion of non-round float with unit={unit} is ambiguous" with pytest.raises(ValueError, match=msg): to_datetime([1.5], unit=unit, errors="raise") with pytest.raises(ValueError, match=msg): to_datetime(np.array([1.5]), unit=unit, errors="raise") + + msg = r"Given date string \"1.5\" not likely a datetime, at position 0" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - to_datetime(["1.5"], unit=unit, errors="raise") + to_datetime(["1.5"], unit=unit, errors="raise") res = to_datetime([1.5], unit=unit, errors="coerce") expected = Index([NaT], dtype="M8[ns]") tm.assert_index_equal(res, expected) - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - res = to_datetime(["1.5"], unit=unit, errors="coerce") + # In 3.0, the string "1.5" is parsed as it would be without unit, + # which fails. With errors="coerce" this becomes NaT.
+ res = to_datetime(["1.5"], unit=unit, errors="coerce") + expected = to_datetime([NaT]) tm.assert_index_equal(res, expected) # round floats are OK @@ -1735,17 +1737,6 @@ def test_unit(self, cache): with pytest.raises(ValueError, match=msg): to_datetime([1], unit="D", format="%Y%m%d", cache=cache) - def test_unit_str(self, cache): - # GH 57051 - # Test that strs aren't dropping precision to 32-bit accidentally. - with tm.assert_produces_warning( - FutureWarning, - match="'to_datetime' with 'unit' when parsing strings is deprecated", - ): - res = to_datetime(["1704660000"], unit="s", origin="unix") - expected = to_datetime([1704660000], unit="s", origin="unix") - tm.assert_index_equal(res, expected) - def test_unit_array_mixed_nans(self, cache): values = [11111111111111111, 1, 1.0, iNaT, NaT, np.nan, "NaT", ""] @@ -1774,7 +1765,7 @@ def test_unit_array_mixed_nans_large_int(self, cache): def test_to_datetime_invalid_str_not_out_of_bounds_valuerror(self, cache): # if we have a string, then we raise a ValueError # and NOT an OutOfBoundsDatetime - msg = "non convertible value foo with the unit 's'" + msg = "Unknown datetime string format, unable to parse: foo, at position 0" with pytest.raises(ValueError, match=msg): to_datetime("foo", errors="raise", unit="s", cache=cache) @@ -1909,7 +1900,13 @@ def test_to_datetime_unit_na_values(self): @pytest.mark.parametrize("bad_val", ["foo", 111111111]) def test_to_datetime_unit_invalid(self, bad_val): - msg = f"{bad_val} with the unit 'D'" + if bad_val == "foo": + msg = ( + "Unknown datetime string format, unable to parse: " + f"{bad_val}, at position 2" + ) + else: + msg = "cannot convert input 111111111 with the unit 'D', at position 2" with pytest.raises(ValueError, match=msg): to_datetime([1, 2, bad_val], unit="D")