From de3e43c0e5f86f6316b466e3a799c3d03c7570cc Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 16 Jan 2023 18:19:19 -0800 Subject: [PATCH 1/3] REF: fewer paths through datetime parsing code --- doc/source/user_guide/timeseries.rst | 2 +- pandas/_libs/tslibs/parsing.pyx | 32 +++++++++++-------- pandas/tests/frame/test_block_internals.py | 2 +- .../indexes/datetimes/test_constructors.py | 2 +- .../indexes/datetimes/test_date_range.py | 2 +- .../scalar/timestamp/test_constructors.py | 2 +- pandas/tests/series/test_constructors.py | 2 +- pandas/tests/tools/test_to_datetime.py | 9 +++--- .../tseries/frequencies/test_inference.py | 4 +-- 9 files changed, 31 insertions(+), 26 deletions(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 7e1368061322b..df2508397ff34 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -292,7 +292,7 @@ The default behavior, ``errors='raise'``, is to raise when unparsable: .. code-block:: ipython In [2]: pd.to_datetime(['2009/07/31', 'asd'], errors='raise') - ValueError: Unknown string format + ValueError: Unknown datetime string format Pass ``errors='ignore'`` to return the original input when unparsable: diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 9e593ec64f7d2..99b50f86971db 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -287,12 +287,9 @@ def parse_datetime_string( pass try: - dt = du_parse(date_string, default=_DEFAULT_DATETIME, - dayfirst=dayfirst, yearfirst=yearfirst) - except TypeError: - # following may be raised from dateutil - # TypeError: 'NoneType' object is not iterable - raise ValueError(f'Given date string "{date_string}" not likely a datetime') + dt, _ = dateutil_parse(date_string, default=_DEFAULT_DATETIME, + dayfirst=dayfirst, yearfirst=yearfirst, + ignoretz=False) except OverflowError as err: # with e.g. "08335394550" dateutil raises when trying to pass # year=8335394550 to datetime.replace @@ -418,10 +415,8 @@ def parse_datetime_string_with_reso( dayfirst=dayfirst, yearfirst=yearfirst, ignoretz=False) except (ValueError, OverflowError) as err: - # TODO: allow raise of errors within instead - raise DateParseError(err) - if parsed is None: - raise DateParseError(f"Could not parse {date_string}") + # e.g. "day is out of range for month" raised in default.replace + raise DateParseError(err) from err return parsed, reso @@ -472,6 +467,8 @@ cpdef bint _does_string_look_like_datetime(str py_string): return True +# TODO: declaring date_string as str (and avoiding assertion below) +# breaks tests, not clear why cdef object _parse_dateabbr_string(object date_string, datetime default, str freq=None): cdef: @@ -608,7 +605,7 @@ cpdef quarter_to_myear(int year, int quarter, str freq): cdef dateutil_parse( str timestr, - object default, + datetime default, bint ignoretz=False, bint dayfirst=False, bint yearfirst=False, @@ -625,7 +622,9 @@ cdef dateutil_parse( res, _ = DEFAULTPARSER._parse(timestr, dayfirst=dayfirst, yearfirst=yearfirst) if res is None: - raise ValueError(f"Unknown datetime string format, unable to parse: {timestr}") + raise DateParseError( + f"Unknown datetime string format, unable to parse: {timestr}" + ) for attr in ["year", "month", "day", "hour", "minute", "second", "microsecond"]: @@ -635,7 +634,7 @@ cdef dateutil_parse( reso = attr if reso is None: - raise ValueError(f"Unable to parse datetime string: {timestr}") + raise DateParseError(f"Unable to parse datetime string: {timestr}") if reso == "microsecond": if repl["microsecond"] == 0: @@ -643,7 +642,12 @@ cdef dateutil_parse( elif repl["microsecond"] % 1000 == 0: reso = "millisecond" - ret = default.replace(**repl) + try: + ret = default.replace(**repl) + except ValueError as err: + # we re-raise to match dateutil's exception message + raise ValueError(str(err) + ": " + timestr) from err + if res.weekday is not None and not res.day: ret = ret + relativedelta.relativedelta(weekday=res.weekday) if not ignoretz: diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index f2de6b607d737..5fca8d0568a67 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -259,7 +259,7 @@ def f(dtype): f("float64") # 10822 - msg = "^Unknown string format: aa, at position 0$" + msg = "^Unknown datetime string format, unable to parse: aa, at position 0$" with pytest.raises(ValueError, match=msg): f("M8[ns]") diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index e1ada9f10c261..1d82d3d9c5b9d 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -1037,7 +1037,7 @@ def test_from_freq_recreate_from_data(self, freq): def test_datetimeindex_constructor_misc(self): arr = ["1/1/2005", "1/2/2005", "Jn 3, 2005", "2005-01-04"] - msg = r"(\(')?Unknown string format(:', 'Jn 3, 2005'\))?" + msg = r"(\(')?Unknown datetime string format(:', 'Jn 3, 2005'\))?" with pytest.raises(ValueError, match=msg): DatetimeIndex(arr) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 11bc785c66b70..8979d99675589 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -980,7 +980,7 @@ def test_misc(self): def test_date_parse_failure(self): badly_formed_date = "2007/100/1" - msg = "Unknown string format: 2007/100/1" + msg = "Unknown datetime string format, unable to parse: 2007/100/1" with pytest.raises(ValueError, match=msg): Timestamp(badly_formed_date) diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index 8ee92e28b78bf..4b7d2bc4b57d4 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -439,7 +439,7 @@ def test_constructor_nanosecond(self, result): @pytest.mark.parametrize("z", ["Z0", "Z00"]) def test_constructor_invalid_Z0_isostring(self, z): # GH 8910 - msg = f"Unknown string format: 2014-11-02 01:00{z}" + msg = f"Unknown datetime string format, unable to parse: 2014-11-02 01:00{z}" with pytest.raises(ValueError, match=msg): Timestamp(f"2014-11-02 01:00{z}") diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 05e40e20f1226..1ff43707ed849 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -79,7 +79,7 @@ def test_infer_with_date_and_datetime(self): def test_unparseable_strings_with_dt64_dtype(self): # pre-2.0 these would be silently ignored and come back with object dtype vals = ["aa"] - msg = "^Unknown string format: aa, at position 0$" + msg = "^Unknown datetime string format, unable to parse: aa, at position 0$" with pytest.raises(ValueError, match=msg): Series(vals, dtype="datetime64[ns]") diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index fcbe33a555f4f..3684c291873e3 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2477,7 +2477,7 @@ def test_string_na_nat_conversion_malformed(self, cache): malformed = np.array(["1/100/2000", np.nan], dtype=object) # GH 10636, default is now 'raise' - msg = r"Unknown string format:|day is out of range for month" + msg = r"Unknown datetime string format" with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning( UserWarning, match="Could not infer format" @@ -3218,9 +3218,10 @@ def test_invalid_origins_tzinfo(self): def test_incorrect_value_exception(self): # GH47495 - with pytest.raises( - ValueError, match="Unknown string format: yesterday, at position 1" - ): + msg = ( + "Unknown datetime string format, unable to parse: yesterday, at position 1" + ) + with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning( UserWarning, match="Could not infer format" ): diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index 773936c9517e8..3badebb224aee 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -387,7 +387,7 @@ def test_invalid_index_types_unicode(): # see gh-10822 # # Odd error message on conversions to datetime for unicode. - msg = "Unknown string format" + msg = "Unknown datetime string format" with pytest.raises(ValueError, match=msg): frequencies.infer_freq(tm.makeStringIndex(10)) @@ -422,7 +422,7 @@ def test_series_invalid_type(end): def test_series_inconvertible_string(): # see gh-6407 - msg = "Unknown string format" + msg = "Unknown datetime string format" with pytest.raises(ValueError, match=msg): frequencies.infer_freq(Series(["foo", "bar"])) From e3548f75f6497638660c31a755a490583c97c861 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 16 Jan 2023 19:03:35 -0800 Subject: [PATCH 2/3] dont catch in conversion.pyx --- pandas/_libs/tslibs/conversion.pyx | 13 +++---------- pandas/tests/io/pytables/test_select.py | 2 +- pandas/tests/tools/test_to_datetime.py | 2 +- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index aacb06fe36037..e0d2debbfb200 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -538,16 +538,9 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit, maybe_localize_tso(obj, tz, obj.creso) return obj - try: - dt = parse_datetime_string( - ts, dayfirst=dayfirst, yearfirst=yearfirst - ) - except ValueError as err: - if "out of range for month" in str(err): - # dateutil raised when constructing a datetime object, - # let's give a nicer exception message - raise ValueError("could not convert string to Timestamp") from err - raise + dt = parse_datetime_string( + ts, dayfirst=dayfirst, yearfirst=yearfirst + ) return convert_datetime_to_tsobject(dt, tz) diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py index b0c9b85e7ad05..efaa907ec98b9 100644 --- a/pandas/tests/io/pytables/test_select.py +++ b/pandas/tests/io/pytables/test_select.py @@ -594,7 +594,7 @@ def test_frame_select(setup_path): # invalid terms df = tm.makeTimeDataFrame() store.append("df_time", df) - msg = "could not convert string to Timestamp" + msg = "day is out of range for month: 0" with pytest.raises(ValueError, match=msg): store.select("df_time", "index>0") diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 3684c291873e3..bb71a537ab58f 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2791,7 +2791,7 @@ def test_day_not_in_month_coerce(self, cache, arg, format, warning): assert isna(to_datetime(arg, errors="coerce", format=format, cache=cache)) def test_day_not_in_month_raise(self, cache): - msg = "could not convert string to Timestamp" + msg = "day is out of range for month: 2015-02-29, at position 0" with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning( UserWarning, match="Could not infer format" From 61b95efb81cf8b5099c964a2baccf140b26dc57d Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 17 Jan 2023 08:23:57 -0800 Subject: [PATCH 3/3] REF: move exception handling within dateutil_parse --- pandas/_libs/tslibs/parsing.pyx | 37 ++++++++++++++------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 99b50f86971db..ad52dd9949091 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -286,16 +286,10 @@ def parse_datetime_string( except ValueError: pass - try: - dt, _ = dateutil_parse(date_string, default=_DEFAULT_DATETIME, - dayfirst=dayfirst, yearfirst=yearfirst, - ignoretz=False) - except OverflowError as err: - # with e.g. "08335394550" dateutil raises when trying to pass - # year=8335394550 to datetime.replace - raise OutOfBoundsDatetime( - f'Parsing "{date_string}" to datetime overflows' - ) from err + dt, _ = dateutil_parse(date_string, default=_DEFAULT_DATETIME, + dayfirst=dayfirst, yearfirst=yearfirst, + ignoretz=False) + if dt.tzinfo is not None: # dateutil can return a datetime with a tzoffset outside of (-24H, 24H) # bounds, which is invalid (can be constructed, but raises if we call @@ -410,13 +404,9 @@ def parse_datetime_string_with_reso( except ValueError: pass - try: - parsed, reso = dateutil_parse(date_string, _DEFAULT_DATETIME, - dayfirst=dayfirst, yearfirst=yearfirst, - ignoretz=False) - except (ValueError, OverflowError) as err: - # e.g. "day is out of range for month" raised in default.replace - raise DateParseError(err) from err + parsed, reso = dateutil_parse(date_string, _DEFAULT_DATETIME, + dayfirst=dayfirst, yearfirst=yearfirst, + ignoretz=False) return parsed, reso @@ -467,8 +457,6 @@ cpdef bint _does_string_look_like_datetime(str py_string): return True -# TODO: declaring date_string as str (and avoiding assertion below) -# breaks tests, not clear why cdef object _parse_dateabbr_string(object date_string, datetime default, str freq=None): cdef: @@ -616,7 +604,7 @@ cdef dateutil_parse( str attr datetime ret object res - object reso = None + str reso = None dict repl = {} res, _ = DEFAULTPARSER._parse(timestr, dayfirst=dayfirst, yearfirst=yearfirst) @@ -645,8 +633,15 @@ cdef dateutil_parse( try: ret = default.replace(**repl) except ValueError as err: + # e.g. "day is out of range for month" # we re-raise to match dateutil's exception message - raise ValueError(str(err) + ": " + timestr) from err + raise DateParseError(str(err) + ": " + timestr) from err + except OverflowError as err: + # with e.g. "08335394550" dateutil raises when trying to pass + # year=8335394550 to datetime.replace + raise OutOfBoundsDatetime( + f'Parsing "{timestr}" to datetime overflows' + ) from err if res.weekday is not None and not res.day: ret = ret + relativedelta.relativedelta(weekday=res.weekday)