From 7455e87e6da6a35a9621a567c12fdd9f3b29b111 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 16 Dec 2022 10:21:48 -0800 Subject: [PATCH 1/4] BUG: to_datetime with M or Y unit and non-round float --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/_libs/tslib.pyx | 46 ++++++++++++++++++++++++-- pandas/tests/tools/test_to_datetime.py | 22 ++++++++++++ 3 files changed, 67 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 215e9c2a85bba..754051796d0c2 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -486,6 +486,7 @@ Other API changes new DataFrame (shallow copy) instead of the original DataFrame, consistent with other methods to get a full slice (for example ``df.loc[:]`` or ``df[:]``) (:issue:`49469`) - Disallow computing ``cumprod`` for :class:`Timedelta` object; previously this returned incorrect values (:issue:`50246`) +- :func:`to_datetime` with ``unit`` of either "Y" or "M" will now raise if a sequence contains a non-round ``float`` value, matching the ``Timestamp`` behavior (:issue:`??`) - .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 6f0ab6eb0d532..3934ba4bbdbea 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -276,9 +276,13 @@ def array_with_unit_to_datetime( ndarray[object] oresult ndarray mask object tz = None + bint is_ym + float fval assert is_ignore or is_coerce or is_raise + is_ym = unit in "YM" + if unit == "ns": if issubclass(values.dtype.type, (np.integer, np.float_)): result = values.astype("M8[ns]", copy=False) @@ -345,6 +349,18 @@ def array_with_unit_to_datetime( if val != val or val == NPY_NAT: iresult[i] = NPY_NAT else: + if is_ym and is_float_object(val) and val != int(val): + # Analogous to GH#47266 for Timestamp + if is_raise: + raise ValueError( + f"Conversion of non-round float with unit={unit} " + "is ambiguous" + ) + elif is_ignore: + raise AssertionError + iresult[i] = NPY_NAT + continue + try: iresult[i] = cast_from_unit(val, unit) except OverflowError: @@ -361,8 +377,33 @@ def array_with_unit_to_datetime( iresult[i] = NPY_NAT else: + + try: + fval = float(val) + except ValueError: + if is_raise: + raise ValueError( + f"non convertible value {val} with the unit '{unit}'" + ) + elif is_ignore: + raise AssertionError + iresult[i] = NPY_NAT + continue + + if is_ym and fval != int(fval): + # Analogous to GH#47266 for Timestamp + if is_raise: + raise ValueError( + f"Conversion of non-round float with unit={unit} " + "is ambiguous" + ) + elif is_ignore: + raise AssertionError + iresult[i] = NPY_NAT + continue + try: - iresult[i] = cast_from_unit(float(val), unit) + iresult[i] = cast_from_unit(fval, unit) except ValueError: if is_raise: raise ValueError( @@ -400,6 +441,7 @@ def array_with_unit_to_datetime( # and are in ignore mode # redo as object + # TODO: fix subtle differences between this and no-unit code oresult = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_OBJECT, 0) for i in range(n): val = values[i] @@ -412,7 +454,7 @@ def array_with_unit_to_datetime( oresult[i] = NaT else: try: - oresult[i] = Timestamp(cast_from_unit(val, unit)) + oresult[i] = Timestamp(val, unit=unit) except OverflowError: oresult[i] = val diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index c4c60b6c4231e..4b7ced59fd636 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1421,6 +1421,28 @@ def test_to_datetime_fixed_offset(self): class TestToDatetimeUnit: + @pytest.mark.parametrize("unit", ["Y", "M"]) + def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): + # Match Timestamp behavior in disallowing non-round floats with + # Y or M unit + msg = f"Conversion of non-round float with unit={unit} is ambiguous" + with pytest.raises(ValueError, match=msg): + to_datetime([1.5], unit=unit, errors="raise") + + # with errors="ignore" we also end up raising within the Timestamp + # constructor; this may not be ideal + with pytest.raises(ValueError, match=msg): + to_datetime([1.5], unit=unit, errors="ignore") + + res = to_datetime([1.5], unit=unit, errors="coerce") + expected = Index([NaT], dtype="M8[ns]") + tm.assert_index_equal(res, expected) + + # round floats are OK + res = to_datetime([1.0], unit=unit) + expected = to_datetime([1], unit=unit) + tm.assert_index_equal(res, expected) + def test_unit(self, cache): # GH 11758 # test proper behavior with errors From 811aec90eaf610f7856c2878bc140642c828c7e1 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 17 Dec 2022 10:58:19 -0800 Subject: [PATCH 2/4] use is_integer --- pandas/_libs/tslib.pyx | 4 ++-- pandas/tests/tools/test_to_datetime.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 3934ba4bbdbea..ccbbb4436423a 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -349,7 +349,7 @@ def array_with_unit_to_datetime( if val != val or val == NPY_NAT: iresult[i] = NPY_NAT else: - if is_ym and is_float_object(val) and val != int(val): + if is_ym and is_float_object(val) and not val.is_integer(): # Analogous to GH#47266 for Timestamp if is_raise: raise ValueError( @@ -390,7 +390,7 @@ def array_with_unit_to_datetime( iresult[i] = NPY_NAT continue - if is_ym and fval != int(fval): + if is_ym and not fval.is_integer(): # Analogous to GH#47266 for Timestamp if is_raise: raise ValueError( diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 4b7ced59fd636..b9735b41c2ad0 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1423,6 +1423,7 @@ def test_to_datetime_fixed_offset(self): class TestToDatetimeUnit: @pytest.mark.parametrize("unit", ["Y", "M"]) def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): + # GH#50301 # Match Timestamp behavior in disallowing non-round floats with # Y or M unit msg = f"Conversion of non-round float with unit={unit} is ambiguous" From 3d6d2a0200292e1328bf7a3d3599e3249b339634 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 22 Dec 2022 09:22:53 -0800 Subject: [PATCH 3/4] GH ref --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 032eeee2f16b6..ba5c3c64b4a06 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -489,7 +489,7 @@ Other API changes new DataFrame (shallow copy) instead of the original DataFrame, consistent with other methods to get a full slice (for example ``df.loc[:]`` or ``df[:]``) (:issue:`49469`) - Disallow computing ``cumprod`` for :class:`Timedelta` object; previously this returned incorrect values (:issue:`50246`) -- :func:`to_datetime` with ``unit`` of either "Y" or "M" will now raise if a sequence contains a non-round ``float`` value, matching the ``Timestamp`` behavior (:issue:`??`) +- :func:`to_datetime` with ``unit`` of either "Y" or "M" will now raise if a sequence contains a non-round ``float`` value, matching the ``Timestamp`` behavior (:issue:`50301`) - .. --------------------------------------------------------------------------- From f008c847fb46039957553f2a0abe5d1ab7096ec6 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 27 Dec 2022 15:23:28 -0800 Subject: [PATCH 4/4] added test --- pandas/tests/tools/test_to_datetime.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 35447419f5372..559e4602a37e1 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1497,16 +1497,25 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): msg = f"Conversion of non-round float with unit={unit} is ambiguous" with pytest.raises(ValueError, match=msg): to_datetime([1.5], unit=unit, errors="raise") + with pytest.raises(ValueError, match=msg): + to_datetime(["1.5"], unit=unit, errors="raise") # with errors="ignore" we also end up raising within the Timestamp # constructor; this may not be ideal with pytest.raises(ValueError, match=msg): to_datetime([1.5], unit=unit, errors="ignore") + # TODO: we are NOT consistent with the Timestamp behavior in the + # float-like string case + # with pytest.raises(ValueError, match=msg): + # to_datetime(["1.5"], unit=unit, errors="ignore") res = to_datetime([1.5], unit=unit, errors="coerce") expected = Index([NaT], dtype="M8[ns]") tm.assert_index_equal(res, expected) + res = to_datetime(["1.5"], unit=unit, errors="coerce") + tm.assert_index_equal(res, expected) + # round floats are OK res = to_datetime([1.0], unit=unit) expected = to_datetime([1], unit=unit)