Skip to content

BUG: passing non-printable unicode to datetime parsing functions #30374

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def ints_to_pydatetime(const int64_t[:] arr, object tz=None, object freq=None,
return result


def _test_parse_iso8601(object ts):
def _test_parse_iso8601(ts: str):
"""
TESTING ONLY: Parse string into Timestamp using iso8601 parser. Used
only for testing, actual construction uses `convert_str_to_tsobject`
Expand All @@ -204,6 +204,9 @@ def _test_parse_iso8601(object ts):
elif ts == 'today':
return Timestamp.now().normalize()

if not ts.isprintable():
raise ValueError(f'Error parsing datetime string "{repr(ts)}"')

_string_to_dts(ts, &obj.dts, &out_local, &out_tzoffset, True)
obj.value = dtstruct_to_dt64(&obj.dts)
check_dts_bounds(&obj.dts)
Expand Down
6 changes: 3 additions & 3 deletions pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -444,15 +444,15 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit,
bint dayfirst=False,
bint yearfirst=False):
"""
Convert a string-like (bytes or unicode) input `ts`, along with optional
timezone object `tz` to a _TSObject.
Convert a string input `ts`, along with optional timezone object `tz`
to a _TSObject.

The optional arguments `dayfirst` and `yearfirst` are passed to the
dateutil parser.

Parameters
----------
ts : bytes or unicode
ts : str
Value to be converted to _TSObject
tz : tzinfo or None
timezone for the timezone-aware output
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/np_datetime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,6 @@ cdef npy_datetime get_datetime64_value(object obj) nogil
cdef npy_timedelta get_timedelta64_value(object obj) nogil
cdef NPY_DATETIMEUNIT get_datetime64_unit(object obj) nogil

cdef int _string_to_dts(object val, npy_datetimestruct* dts,
cdef int _string_to_dts(str val, npy_datetimestruct* dts,
int* out_local, int* out_tzoffset,
bint want_exc) except? -1
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/np_datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ cdef inline int64_t pydate_to_dt64(date val, npy_datetimestruct *dts):
return dtstruct_to_dt64(dts)


cdef inline int _string_to_dts(object val, npy_datetimestruct* dts,
cdef inline int _string_to_dts(str val, npy_datetimestruct* dts,
int* out_local, int* out_tzoffset,
bint want_exc) except? -1:
cdef:
Expand Down
40 changes: 28 additions & 12 deletions pandas/_libs/tslibs/parsing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -86,16 +86,15 @@ cdef inline int _parse_4digit(const char* s):
return result


cdef inline object _parse_delimited_date(object date_string, bint dayfirst):
cdef inline object _parse_delimited_date(str date_string, bint dayfirst):
"""
Parse special cases of dates: MM/DD/YYYY, DD/MM/YYYY, MM/YYYY.

At the beginning function tries to parse date in MM/DD/YYYY format, but
if month > 12 - in DD/MM/YYYY (`dayfirst == False`).
With `dayfirst == True` function makes an attempt to parse date in
DD/MM/YYYY, if an attempt is wrong - in MM/DD/YYYY

Note
----
For MM/DD/YYYY, DD/MM/YYYY: delimiter can be a space or one of /-.
For MM/YYYY: delimiter can be a space or one of /-
If `date_string` can't be converted to date, then function returns
Expand All @@ -104,11 +103,16 @@ cdef inline object _parse_delimited_date(object date_string, bint dayfirst):
Parameters
----------
date_string : str
dayfirst : bint
dayfirst : bool

Returns
-------
datetime, resolution

Notes
-----
We assume that date_string.isprintable() has already been confirmed
at this point.
"""
cdef:
const char* buf
Expand Down Expand Up @@ -156,18 +160,24 @@ cdef inline object _parse_delimited_date(object date_string, bint dayfirst):
raise DateParseError(f"Invalid date specified ({month}/{day})")


cdef inline bint does_string_look_like_time(object parse_string):
cdef inline bint does_string_look_like_time(str parse_string):
"""
Checks whether given string is a time: it has to start either from
H:MM or from HH:MM, and hour and minute values must be valid.

Parameters
----------
date_string : str
parse_string : str

Returns
-------
whether given string is a time
bool
Whether given string is potentially a time.

Notes
-----
We assume that parse_string.isprintable() has already been confirmed
at this point.
"""
cdef:
const char* buf
Expand All @@ -188,7 +198,7 @@ cdef inline bint does_string_look_like_time(object parse_string):
return 0 <= hour <= 23 and 0 <= minute <= 59


def parse_datetime_string(date_string, freq=None, dayfirst=False,
def parse_datetime_string(date_string: str, freq=None, dayfirst=False,
yearfirst=False, **kwargs):
"""parse datetime string, only returns datetime.
Also cares special handling matching time patterns.
Expand Down Expand Up @@ -270,7 +280,7 @@ def parse_time_string(arg: str, freq=None, dayfirst=None, yearfirst=None):
return res


cdef parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False,
cdef parse_datetime_string_with_reso(str date_string, freq=None, dayfirst=False,
yearfirst=False):
"""parse datetime string, only returns datetime

Expand Down Expand Up @@ -315,18 +325,19 @@ cdef parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False,
return parsed, parsed, reso


cpdef bint _does_string_look_like_datetime(object py_string):
cpdef bint _does_string_look_like_datetime(str py_string):
"""
Checks whether given string is a datetime: it has to start with '0' or
be greater than 1000.

Parameters
----------
py_string: object
py_string: str

Returns
-------
whether given string is a datetime
bool
Whether given string is potentially a datetime.
"""
cdef:
const char *buf
Expand All @@ -336,6 +347,11 @@ cpdef bint _does_string_look_like_datetime(object py_string):
char first
int error = 0

if not py_string.isprintable():
# e.g. unicode surrogates, call to get_c_string_buf_and_size
# risks segfault; definitely not a datetime
return False

buf = get_c_string_buf_and_size(py_string, &length)
if length >= 1:
first = buf[0]
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/tslibs/test_parse_iso8601.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,16 @@ def test_parsers_iso8601(date_str, exp):
"20010101 123",
"20010101 12345",
"20010101 12345Z",
# GH#30374 un-printable unicode, largely a test that we don't segfault
"\ud83d",
],
)
def test_parsers_iso8601_invalid(date_str):
# Note: repr is needed here for the non-printable case
msg = 'Error parsing datetime string "{s}"'.format(s=date_str)
if not msg.isprintable():
# no clear way to get regex to work
msg = "Error parsing datetime string"

with pytest.raises(ValueError, match=msg):
tslib._test_parse_iso8601(date_str)
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/tslibs/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,3 +225,19 @@ def test_parse_time_string_check_instance_type_raise_exception():
result = parse_time_string("2019")
expected = (datetime(2019, 1, 1), datetime(2019, 1, 1), "year")
assert result == expected


def test_non_printable_inputs_dont_raise():
    # Non-printable unicode handed to the tslibs.parsing helpers must be
    # rejected cleanly (a False result or a ValueError) rather than
    # crashing the interpreter.
    lone_surrogate = "\ud83d"  # unprintable "surrogate"
    expected_msg = "Given date string not likely a datetime"

    looks_like_datetime = parsing._does_string_look_like_datetime(lone_surrogate)
    assert not looks_like_datetime

    # Both public parsing entry points should refuse the input with the
    # same error message.
    for parse in (parsing.parse_datetime_string, parse_time_string):
        with pytest.raises(ValueError, match=expected_msg):
            parse(lone_surrogate)