From cf71fab4192542bbd39f2d50220b0bc9a98a1778 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 20 Jan 2023 13:44:42 -0800 Subject: [PATCH] DEPR: casting strings to float in to_datetime with unit --- doc/source/whatsnew/v2.0.0.rst | 2 ++ pandas/_libs/tslib.pyx | 14 ++++++++++++++ pandas/io/json/_json.py | 4 +++- pandas/tests/groupby/test_value_counts.py | 8 ++------ pandas/tests/tools/test_to_datetime.py | 7 +++++-- 5 files changed, 26 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index bbecf3fee01f3..2551e57616421 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -614,6 +614,8 @@ Deprecations - :meth:`Index.is_floating` has been deprecated. Use :func:`pandas.api.types.is_float_dtype` instead (:issue:`50042`) - :meth:`Index.holds_integer` has been deprecated. Use :func:`pandas.api.types.infer_dtype` instead (:issue:`50243`) - :meth:`Index.is_categorical` has been deprecated. Use :func:`pandas.api.types.is_categorical_dtype` instead (:issue:`50042`) +- Deprecated the behavior of :func:`to_datetime` with ``unit`` when parsing strings; in a future version these will be parsed as datetimes (matching unit-less behavior) instead of being cast to floats. To retain the old behavior, cast strings to numeric types before calling :func:`to_datetime` (:issue:`50735`) +- .. --------------------------------------------------------------------------- .. 
_whatsnew_200.prior_deprecations: diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 1515965644092..65bc5ca4eb99b 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -1,3 +1,7 @@ +import warnings + +from pandas.util._exceptions import find_stack_level + cimport cython from datetime import timezone @@ -316,6 +320,16 @@ def array_with_unit_to_datetime( raise ValueError( f"non convertible value {val} with the unit '{unit}'" ) + warnings.warn( + "The behavior of 'to_datetime' with 'unit' when parsing " + "strings is deprecated. In a future version, strings will " + "be parsed as datetime strings, matching the behavior " + "without a 'unit'. To retain the old behavior, explicitly " + "cast ints or floats to numeric type before calling " + "to_datetime.", + FutureWarning, + stacklevel=find_stack_level(), + ) iresult[i] = _wrapped_cast_from_unit(fval, unit) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index aa1342d0f135f..ae9206447dd75 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1213,7 +1213,9 @@ def _try_convert_to_date(self, data): if new_data.dtype == "object": try: new_data = data.astype("int64") - except (TypeError, ValueError, OverflowError): + except OverflowError: + return data, False + except (TypeError, ValueError): pass # ignore numbers that are out of range diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 3d1228d65ac7c..ae4b74fc814da 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -135,9 +135,7 @@ def test_series_groupby_value_counts_with_grouper(utc): } ).drop([3]) - df["Datetime"] = to_datetime( - df["Timestamp"].apply(lambda t: str(t)), utc=utc, unit="s" - ) + df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s") dfg = df.groupby(Grouper(freq="1D", key="Datetime")) # have to sort on index because of unstable sort on values xref GH9212 @@ -1010,9 +1008,7 @@ 
def test_value_counts_time_grouper(utc): } ).drop([3]) - df["Datetime"] = to_datetime( - df["Timestamp"].apply(lambda t: str(t)), utc=utc, unit="s" - ) + df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s") gb = df.groupby(Grouper(freq="1D", key="Datetime")) result = gb.value_counts() dates = to_datetime( diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 3509c82d2af6d..b81e753b007bc 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1706,11 +1706,13 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): # GH#50301 # Match Timestamp behavior in disallowing non-round floats with # Y or M unit + warn_msg = "strings will be parsed as datetime strings" msg = f"Conversion of non-round float with unit={unit} is ambiguous" with pytest.raises(ValueError, match=msg): to_datetime([1.5], unit=unit, errors="raise") with pytest.raises(ValueError, match=msg): - to_datetime(["1.5"], unit=unit, errors="raise") + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + to_datetime(["1.5"], unit=unit, errors="raise") # with errors="ignore" we also end up raising within the Timestamp # constructor; this may not be ideal @@ -1725,7 +1727,8 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): expected = Index([NaT], dtype="M8[ns]") tm.assert_index_equal(res, expected) - res = to_datetime(["1.5"], unit=unit, errors="coerce") + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + res = to_datetime(["1.5"], unit=unit, errors="coerce") tm.assert_index_equal(res, expected) # round floats are OK