diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 3644928d8dedc..8c2c560c062ac 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -9,6 +9,8 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # VectorData # ---------------------------------------------------------------------- +from pandas._libs.tslibs.util cimport get_c_string + {{py: # name, dtype, arg @@ -595,7 +597,7 @@ cdef class StringHashTable(HashTable): cdef: khiter_t k const char *v - v = util.get_c_string(val) + v = get_c_string(val) k = kh_get_str(self.table, v) if k != self.table.n_buckets: @@ -609,7 +611,7 @@ cdef class StringHashTable(HashTable): int ret = 0 const char *v - v = util.get_c_string(val) + v = get_c_string(val) k = kh_put_str(self.table, v, &ret) self.table.keys[k] = key @@ -632,7 +634,7 @@ cdef class StringHashTable(HashTable): vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] - v = util.get_c_string(val) + v = get_c_string(val) vecs[i] = v with nogil: @@ -662,9 +664,9 @@ cdef class StringHashTable(HashTable): val = values[i] if isinstance(val, (str, unicode)): - v = util.get_c_string(val) + v = get_c_string(val) else: - v = util.get_c_string(self.na_string_sentinel) + v = get_c_string(self.na_string_sentinel) vecs[i] = v with nogil: @@ -695,9 +697,9 @@ cdef class StringHashTable(HashTable): val = values[i] if isinstance(val, (str, unicode)): - v = util.get_c_string(val) + v = get_c_string(val) else: - v = util.get_c_string(self.na_string_sentinel) + v = get_c_string(self.na_string_sentinel) vecs[i] = v with nogil: @@ -776,7 +778,7 @@ cdef class StringHashTable(HashTable): labels[i] = na_sentinel else: # if ignore_na is False, we also stringify NaN/None/etc. 
- v = util.get_c_string(val) + v = get_c_string(val) vecs[i] = v # compute diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index dbbe9da381f0a..000ec94901457 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -from cpython cimport (Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE, - PyUnicode_AsASCIIString) +from cpython cimport Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE from cpython.datetime cimport (datetime, date, PyDateTime_IMPORT, @@ -13,6 +12,7 @@ from cpython.datetime cimport (datetime, date, PyDateTime_IMPORT from numpy cimport int64_t +from pandas._libs.tslibs.util cimport get_c_string_buf_and_size cdef extern from "src/datetime/np_datetime.h": int cmp_npy_datetimestruct(npy_datetimestruct *a, @@ -33,7 +33,7 @@ cdef extern from "src/datetime/np_datetime.h": npy_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS cdef extern from "src/datetime/np_datetime_strings.h": - int parse_iso_8601_datetime(char *str, int len, + int parse_iso_8601_datetime(const char *str, int len, npy_datetimestruct *out, int *out_local, int *out_tzoffset) @@ -174,30 +174,9 @@ cdef inline int64_t pydate_to_dt64(date val, npy_datetimestruct *dts): cdef inline int _string_to_dts(object val, npy_datetimestruct* dts, int* out_local, int* out_tzoffset) except? -1: cdef: - int result - char *tmp + Py_ssize_t length + const char* buf - if isinstance(val, unicode): - val = PyUnicode_AsASCIIString(val) - - tmp = val - result = _cstring_to_dts(tmp, len(val), dts, out_local, out_tzoffset) - - if result == -1: - raise ValueError('Unable to parse %s' % str(val)) - return result - - -cdef inline int _cstring_to_dts(char *val, int length, - npy_datetimestruct* dts, - int* out_local, int* out_tzoffset) except? 
-1: - # Note: without this "extra layer" between _string_to_dts - # and parse_iso_8601_datetime, calling _string_to_dts raises - # `SystemError: returned a result with an error set` - # in Python3 - cdef: - int result - - result = parse_iso_8601_datetime(val, length, - dts, out_local, out_tzoffset) - return result + buf = get_c_string_buf_and_size(val, &length) + return parse_iso_8601_datetime(buf, length, + dts, out_local, out_tzoffset) diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index 207da4b8f8340..abeeaba1d1198 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -66,12 +66,13 @@ This file implements string parsing and creation for NumPy datetime. * * Returns 0 on success, -1 on failure. */ -int parse_iso_8601_datetime(char *str, int len, +int parse_iso_8601_datetime(const char *str, int len, npy_datetimestruct *out, int *out_local, int *out_tzoffset) { int year_leap = 0; int i, numdigits; - char *substr, sublen; + const char *substr; + int sublen; /* If year-month-day are separated by a valid separator, * months/days without leading zeroes will be parsed @@ -586,7 +587,8 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) { */ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, NPY_DATETIMEUNIT base) { - char *substr = outstr, sublen = outlen; + char *substr = outstr; + int sublen = outlen; int tmplen; /* diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index 15d5dd357eaef..86ebe890810d6 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -54,7 +54,7 @@ This file implements string parsing and creation for NumPy datetime. * Returns 0 on success, -1 on failure. 
*/ int -parse_iso_8601_datetime(char *str, int len, +parse_iso_8601_datetime(const char *str, int len, npy_datetimestruct *out, int *out_local, int *out_tzoffset); diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index ef7065a44f18b..414a26c349452 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -24,6 +24,15 @@ cdef extern from "Python.h": bint PyComplex_Check(object obj) nogil bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil + # Note that the following functions can raise an exception, so they + # cannot be declared 'nogil'. Also, PyUnicode_AsUTF8AndSize() may allocate + # memory in the unlikely case that the underlying unicode object is not + # stored as UTF-8 and its UTF-8 form has not been requested before. + int PyBytes_AsStringAndSize(object obj, char** buf, + Py_ssize_t* length) except -1 + const char* PyUnicode_AsUTF8AndSize(object obj, + Py_ssize_t* length) except NULL + from numpy cimport int64_t cdef extern from "numpy/arrayobject.h": @@ -227,3 +236,37 @@ cdef inline bint is_nan(object val): is_nan : bool """ return (is_float_object(val) or is_complex_object(val)) and val != val + + +cdef inline const char* get_c_string_buf_and_size(object py_string, + Py_ssize_t *length) except NULL: + """ + Extract the char* buffer of the unicode or bytes object `py_string`, + storing its length in `length`; raises if `py_string` is neither type. + + Notes + ----- + The Python object owns the memory, so the returned char* must not be freed. + `length` can be NULL if the buffer length is not needed. 
+ + Parameters + ---------- + py_string : object + length : Py_ssize_t* + + Returns + ------- + buf : const char* + """ + cdef: + const char *buf + + if PyUnicode_Check(py_string): + buf = PyUnicode_AsUTF8AndSize(py_string, length) + else: + PyBytes_AsStringAndSize(py_string, <char**>&buf, length) + return buf + + +cdef inline const char* get_c_string(object py_string) except NULL: + return get_c_string_buf_and_size(py_string, NULL) diff --git a/pandas/_libs/util.pxd b/pandas/_libs/util.pxd index 05a013ec0d7c9..15fedbb20beec 100644 --- a/pandas/_libs/util.pxd +++ b/pandas/_libs/util.pxd @@ -15,21 +15,6 @@ cdef extern from "numpy/arrayobject.h": NPY_ARRAY_F_CONTIGUOUS -cdef extern from *: - """ - // returns ASCII or UTF8 (py3) view on python str - // python object owns memory, should not be freed - static const char* get_c_string(PyObject* obj) { - #if PY_VERSION_HEX >= 0x03000000 - return PyUnicode_AsUTF8(obj); - #else - return PyString_AsString(obj); - #endif - } - """ - const char *get_c_string(object) except NULL - - cdef extern from "src/headers/stdint.h": enum: UINT8_MAX enum: UINT16_MAX diff --git a/pandas/tests/tslibs/test_parse_iso8601.py b/pandas/tests/tslibs/test_parse_iso8601.py index d1b3dee948afe..3a4625d880bb3 100644 --- a/pandas/tests/tslibs/test_parse_iso8601.py +++ b/pandas/tests/tslibs/test_parse_iso8601.py @@ -60,3 +60,10 @@ def test_parsers_iso8601_invalid_offset_invalid(): with pytest.raises(ValueError, match=msg): tslib._test_parse_iso8601(date_str) + + +def test_parsers_iso8601_leading_space(): + # GH#25895 make sure isoparser doesn't overflow with long input + date_str, expected = ("2013-1-1 5:30:00", datetime(2013, 1, 1, 5, 30)) + actual = tslib._test_parse_iso8601(' ' * 200 + date_str) + assert actual == expected