diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 598def4e1d9fa..840c302c08f21 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -188,7 +188,7 @@ def ints_to_pydatetime(const int64_t[:] arr, object tz=None, object freq=None, return result -def _test_parse_iso8601(object ts): +def _test_parse_iso8601(ts: str): """ TESTING ONLY: Parse string into Timestamp using iso8601 parser. Used only for testing, actual construction uses `convert_str_to_tsobject` @@ -204,6 +204,9 @@ def _test_parse_iso8601(object ts): elif ts == 'today': return Timestamp.now().normalize() + if not ts.isprintable(): + raise ValueError(f'Error parsing datetime string "{repr(ts)}"') + _string_to_dts(ts, &obj.dts, &out_local, &out_tzoffset, True) obj.value = dtstruct_to_dt64(&obj.dts) check_dts_bounds(&obj.dts) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index c5315219b8422..6923252690557 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -444,15 +444,15 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, bint dayfirst=False, bint yearfirst=False): """ - Convert a string-like (bytes or unicode) input `ts`, along with optional - timezone object `tz` to a _TSObject. + Convert a string input `ts`, along with optional timezone object `tz` + to a _TSObject. The optional arguments `dayfirst` and `yearfirst` are passed to the dateutil parser. 
Parameters ---------- - ts : bytes or unicode + ts : str Value to be converted to _TSObject tz : tzinfo or None timezone for the timezone-aware output diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 020bcdf0a7b15..ebedee79405e5 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -72,6 +72,6 @@ cdef npy_datetime get_datetime64_value(object obj) nogil cdef npy_timedelta get_timedelta64_value(object obj) nogil cdef NPY_DATETIMEUNIT get_datetime64_unit(object obj) nogil -cdef int _string_to_dts(object val, npy_datetimestruct* dts, +cdef int _string_to_dts(str val, npy_datetimestruct* dts, int* out_local, int* out_tzoffset, bint want_exc) except? -1 diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index b9406074bb130..b59a1101e0bf7 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -167,7 +167,7 @@ cdef inline int64_t pydate_to_dt64(date val, npy_datetimestruct *dts): return dtstruct_to_dt64(dts) -cdef inline int _string_to_dts(object val, npy_datetimestruct* dts, +cdef inline int _string_to_dts(str val, npy_datetimestruct* dts, int* out_local, int* out_tzoffset, bint want_exc) except? -1: cdef: diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index ecf3e35c86d76..fb5ebd6634666 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -86,16 +86,15 @@ cdef inline int _parse_4digit(const char* s): return result -cdef inline object _parse_delimited_date(object date_string, bint dayfirst): +cdef inline object _parse_delimited_date(str date_string, bint dayfirst): """ Parse special cases of dates: MM/DD/YYYY, DD/MM/YYYY, MM/YYYY. + At the beginning function tries to parse date in MM/DD/YYYY format, but if month > 12 - in DD/MM/YYYY (`dayfirst == False`). 
With `dayfirst == True` function makes an attempt to parse date in DD/MM/YYYY, if an attempt is wrong - in DD/MM/YYYY - Note - ---- For MM/DD/YYYY, DD/MM/YYYY: delimiter can be a space or one of /-. For MM/YYYY: delimiter can be a space or one of /- If `date_string` can't be converted to date, then function returns @@ -104,11 +103,16 @@ cdef inline object _parse_delimited_date(object date_string, bint dayfirst): Parameters ---------- date_string : str - dayfirst : bint + dayfirst : bool Returns: -------- datetime, resolution + + Notes + ----- + We assume that date_string.isprintable() has already been confirmed + at this point. """ cdef: const char* buf @@ -156,18 +160,24 @@ cdef inline object _parse_delimited_date(object date_string, bint dayfirst): raise DateParseError(f"Invalid date specified ({month}/{day})") -cdef inline bint does_string_look_like_time(object parse_string): +cdef inline bint does_string_look_like_time(str parse_string): """ Checks whether given string is a time: it has to start either from H:MM or from HH:MM, and hour and minute values must be valid. Parameters ---------- - date_string : str + parse_string : str Returns: -------- - whether given string is a time + bool + Whether given string is potentially a time. + + Notes + ----- + We assume that parse_string.isprintable() has already been confirmed + at this point. """ cdef: const char* buf @@ -188,7 +198,7 @@ cdef inline bint does_string_look_like_time(object parse_string): return 0 <= hour <= 23 and 0 <= minute <= 59 -def parse_datetime_string(date_string, freq=None, dayfirst=False, +def parse_datetime_string(date_string: str, freq=None, dayfirst=False, yearfirst=False, **kwargs): """parse datetime string, only returns datetime. Also cares special handling matching time patterns.
@@ -270,7 +280,7 @@ def parse_time_string(arg: str, freq=None, dayfirst=None, yearfirst=None): return res -cdef parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, +cdef parse_datetime_string_with_reso(str date_string, freq=None, dayfirst=False, yearfirst=False): """parse datetime string, only returns datetime @@ -315,18 +325,19 @@ cdef parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, return parsed, parsed, reso -cpdef bint _does_string_look_like_datetime(object py_string): +cpdef bint _does_string_look_like_datetime(str py_string): """ Checks whether given string is a datetime: it has to start with '0' or be greater than 1000. Parameters ---------- - py_string: object + py_string: str Returns ------- - whether given string is a datetime + bool + Whether given string is potentially a datetime. """ cdef: const char *buf @@ -336,6 +347,11 @@ cpdef bint _does_string_look_like_datetime(object py_string): char first int error = 0 + if not py_string.isprintable(): + # e.g. 
unicode surrogates, call to get_c_string_buf_and_size + # risks segfault; definitely not a datetime + return False + buf = get_c_string_buf_and_size(py_string, &length) if length >= 1: first = buf[0] diff --git a/pandas/tests/tslibs/test_parse_iso8601.py b/pandas/tests/tslibs/test_parse_iso8601.py index a6e7aee46b485..c262dcded5656 100644 --- a/pandas/tests/tslibs/test_parse_iso8601.py +++ b/pandas/tests/tslibs/test_parse_iso8601.py @@ -48,10 +48,16 @@ def test_parsers_iso8601(date_str, exp): "20010101 123", "20010101 12345", "20010101 12345Z", + # GH#30374 un-printable unicode, largely a test that we don't segfault + "\ud83d", ], ) def test_parsers_iso8601_invalid(date_str): + # Note: the non-printable case raises with repr(ts) in the message msg = 'Error parsing datetime string "{s}"'.format(s=date_str) + if not msg.isprintable(): + # no clear way to get regex to work + msg = "Error parsing datetime string" with pytest.raises(ValueError, match=msg): tslib._test_parse_iso8601(date_str) diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 0bc30347b3fa9..e636b458a8384 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -225,3 +225,19 @@ def test_parse_time_string_check_instance_type_raise_exception(): result = parse_time_string("2019") expected = (datetime(2019, 1, 1), datetime(2019, 1, 1), "year") assert result == expected + + +def test_non_printable_inputs_dont_raise(): + # Check that tslibs.parsing functions do not segfault when passed + # non-printable unicode strings; they return False or raise ValueError. + + val = "\ud83d" # unprintable "surrogate" + + assert not parsing._does_string_look_like_datetime(val) + + msg = "Given date string not likely a datetime" + with pytest.raises(ValueError, match=msg): + parsing.parse_datetime_string(val) + + with pytest.raises(ValueError, match=msg): + parse_time_string(val)