From 3551121af10ddb12ee90a7586ad2d6d8f833a774 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 27 Mar 2019 09:34:22 +0300 Subject: [PATCH 01/11] removed extra layer; using get_string_data now --- pandas/_libs/tslibs/np_datetime.pyx | 31 ++++++----------------------- pandas/_libs/tslibs/util.pxd | 12 +++++++++++ 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index dbbe9da381f0a..f49bc197bc215 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -from cpython cimport (Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE, - PyUnicode_AsASCIIString) +from cpython cimport Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE from cpython.datetime cimport (datetime, date, PyDateTime_IMPORT, @@ -13,6 +12,7 @@ from cpython.datetime cimport (datetime, date, PyDateTime_IMPORT from numpy cimport int64_t +from pandas._libs.tslibs.util cimport get_string_data cdef extern from "src/datetime/np_datetime.h": int cmp_npy_datetimestruct(npy_datetimestruct *a, @@ -174,30 +174,11 @@ cdef inline int64_t pydate_to_dt64(date val, npy_datetimestruct *dts): cdef inline int _string_to_dts(object val, npy_datetimestruct* dts, int* out_local, int* out_tzoffset) except? -1: cdef: - int result + Py_ssize_t length char *tmp - if isinstance(val, unicode): - val = PyUnicode_AsASCIIString(val) - - tmp = val - result = _cstring_to_dts(tmp, len(val), dts, out_local, out_tzoffset) - - if result == -1: + if not get_string_data(val, &tmp, &length): raise ValueError('Unable to parse %s' % str(val)) - return result - - -cdef inline int _cstring_to_dts(char *val, int length, - npy_datetimestruct* dts, - int* out_local, int* out_tzoffset) except? -1: - # Note: without this "extra layer" between _string_to_dts - # and parse_iso_8601_datetime, calling _string_to_dts raises - # `SystemError: returned a result with an error set` - # in Python3 - cdef: - int result + return parse_iso_8601_datetime(tmp, length, + dts, out_local, out_tzoffset) - result = parse_iso_8601_datetime(val, length, - dts, out_local, out_tzoffset) - return result diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index ef7065a44f18b..1103d01ac7fce 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -18,11 +18,14 @@ cdef extern from "Python.h": # Note: importing extern-style allows us to declare these as nogil # functions, whereas `from cpython cimport` does not. bint PyUnicode_Check(object obj) nogil + bint PyBytes_Check(object obj) nogil bint PyString_Check(object obj) nogil bint PyBool_Check(object obj) nogil bint PyFloat_Check(object obj) nogil bint PyComplex_Check(object obj) nogil bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil + bint PyBytes_AsStringAndSize(object obj, char** buf, Py_ssize_t* length) nogil + char* PyUnicode_AsUTF8AndSize(object obj, Py_ssize_t* length) nogil from numpy cimport int64_t @@ -227,3 +230,12 @@ cdef inline bint is_nan(object val): is_nan : bool """ return (is_float_object(val) or is_complex_object(val)) and val != val + + +cdef inline bint get_string_data(object s, char **buf, Py_ssize_t *length): + if PyUnicode_Check(s): + buf[0] = PyUnicode_AsUTF8AndSize(s, length) + return buf[0] != NULL + if PyBytes_Check(s): + return PyBytes_AsStringAndSize(s, buf, length) == 0 + return False From d24e72841117e88cbb37c396a86aa6f28f999eaa Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 27 Mar 2019 11:17:37 +0300 Subject: [PATCH 02/11] fix problem with const char* value, that return PyUnicode_AsUTF8AndSize in Python3.7 case; added docstring to get_string_data func --- pandas/_libs/tslibs/np_datetime.pyx | 4 ++-- .../tslibs/src/datetime/np_datetime_strings.c | 5 +++-- .../tslibs/src/datetime/np_datetime_strings.h | 2 +- pandas/_libs/tslibs/util.pxd | 21 +++++++++++++++++-- 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index f49bc197bc215..a67dc592e6eb4 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -33,7 +33,7 @@ cdef extern from "src/datetime/np_datetime.h": npy_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS cdef extern from "src/datetime/np_datetime_strings.h": - int parse_iso_8601_datetime(char *str, int len, + int parse_iso_8601_datetime(const char *str, int len, npy_datetimestruct *out, int *out_local, int *out_tzoffset) @@ -175,7 +175,7 @@ cdef inline int _string_to_dts(object val, npy_datetimestruct* dts, int* out_local, int* out_tzoffset) except? -1: cdef: Py_ssize_t length - char *tmp + const char* tmp if not get_string_data(val, &tmp, &length): raise ValueError('Unable to parse %s' % str(val)) diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index 207da4b8f8340..aa925d160760f 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -66,12 +66,13 @@ This file implements string parsing and creation for NumPy datetime. * * Returns 0 on success, -1 on failure. */ -int parse_iso_8601_datetime(char *str, int len, +int parse_iso_8601_datetime(const char *str, int len, npy_datetimestruct *out, int *out_local, int *out_tzoffset) { int year_leap = 0; int i, numdigits; - char *substr, sublen; + const char *substr; + char sublen; /* If year-month-day are separated by a valid separator, * months/days without leading zeroes will be parsed diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index 15d5dd357eaef..86ebe890810d6 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -54,7 +54,7 @@ This file implements string parsing and creation for NumPy datetime. * Returns 0 on success, -1 on failure. */ int -parse_iso_8601_datetime(char *str, int len, +parse_iso_8601_datetime(const char *str, int len, npy_datetimestruct *out, int *out_local, int *out_tzoffset); diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index 1103d01ac7fce..f7e521aa61c94 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -232,10 +232,27 @@ cdef inline bint is_nan(object val): return (is_float_object(val) or is_complex_object(val)) and val != val -cdef inline bint get_string_data(object s, char **buf, Py_ssize_t *length): +cdef inline bint get_string_data(object s, const char **buf, + Py_ssize_t *length): + """ + Extract internal char * buffer of unicode or bytes object `s` to `buf` with + getting length of this internal buffer, that save in `length`. + Return `False` if it failed to extract such buffer for whatever reason + otherwise return `True` + + Parameters + ---------- + s : object + buf : const char** + length : Py_ssize_t* + + Returns + ------- + bint + """ if PyUnicode_Check(s): buf[0] = PyUnicode_AsUTF8AndSize(s, length) return buf[0] != NULL if PyBytes_Check(s): - return PyBytes_AsStringAndSize(s, buf, length) == 0 + return PyBytes_AsStringAndSize(s, buf, length) == 0 return False From 44efe95eb1b516f44878912821e8704ae7b9ef5c Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 27 Mar 2019 11:44:02 +0300 Subject: [PATCH 03/11] fix code style --- pandas/_libs/tslibs/util.pxd | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index f7e521aa61c94..e5c665e77ef11 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -24,7 +24,8 @@ cdef extern from "Python.h": bint PyFloat_Check(object obj) nogil bint PyComplex_Check(object obj) nogil bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil - bint PyBytes_AsStringAndSize(object obj, char** buf, Py_ssize_t* length) nogil + bint PyBytes_AsStringAndSize(object obj, char** buf, + Py_ssize_t* length) nogil char* PyUnicode_AsUTF8AndSize(object obj, Py_ssize_t* length) nogil from numpy cimport int64_t @@ -245,7 +246,7 @@ cdef inline bint get_string_data(object s, const char **buf, s : object buf : const char** length : Py_ssize_t* - + Returns ------- bint From c90635b74b8c0d19ad88623260f3bbeb05c13795 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 27 Mar 2019 12:49:49 +0300 Subject: [PATCH 04/11] replaced get_c_string to get_string_data, added 'note' paragraph in get_string_data docstring --- pandas/_libs/hashtable_class_helper.pxi.in | 18 ++++++++++-------- pandas/_libs/tslibs/util.pxd | 5 +++++ pandas/_libs/util.pxd | 15 --------------- 3 files changed, 15 insertions(+), 23 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 3644928d8dedc..6689869a19748 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -9,6 +9,8 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # VectorData # ---------------------------------------------------------------------- +from pandas._libs.tslibs.util cimport get_string_data + {{py: # name, dtype, arg @@ -595,7 +597,7 @@ cdef class StringHashTable(HashTable): cdef: khiter_t k const char *v - v = util.get_c_string(val) + get_string_data(val, &v, NULL) k = kh_get_str(self.table, v) if k != self.table.n_buckets: @@ -609,7 +611,7 @@ cdef class StringHashTable(HashTable): int ret = 0 const char *v - v = util.get_c_string(val) + get_string_data(val, &v, NULL) k = kh_put_str(self.table, v, &ret) self.table.keys[k] = key @@ -632,7 +634,7 @@ cdef class StringHashTable(HashTable): vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] - v = util.get_c_string(val) + get_string_data(val, &v, NULL) vecs[i] = v with nogil: @@ -662,9 +664,9 @@ cdef class StringHashTable(HashTable): val = values[i] if isinstance(val, (str, unicode)): - v = util.get_c_string(val) + get_string_data(val, &v, NULL) else: - v = util.get_c_string(self.na_string_sentinel) + get_string_data(self.na_string_sentinel, &v, NULL) vecs[i] = v with nogil: @@ -695,9 +697,9 @@ cdef class StringHashTable(HashTable): val = values[i] if isinstance(val, (str, unicode)): - v = util.get_c_string(val) + get_string_data(val, &v, NULL) else: - v = util.get_c_string(self.na_string_sentinel) + get_string_data(self.na_string_sentinel, &v, NULL) vecs[i] = v with nogil: @@ -776,7 +778,7 @@ cdef class StringHashTable(HashTable): labels[i] = na_sentinel else: # if ignore_na is False, we also stringify NaN/None/etc. - v = util.get_c_string(val) + get_string_data(val, &v, NULL) vecs[i] = v # compute diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index e5c665e77ef11..a22ac449a6e3d 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -241,6 +241,11 @@ cdef inline bint get_string_data(object s, const char **buf, Return `False` if it failed to extract such buffer for whatever reason otherwise return `True` + Note + ---- + python object owns memory, `buf` should not be freed + `length` can be NULL + Parameters ---------- s : object diff --git a/pandas/_libs/util.pxd b/pandas/_libs/util.pxd index 05a013ec0d7c9..15fedbb20beec 100644 --- a/pandas/_libs/util.pxd +++ b/pandas/_libs/util.pxd @@ -15,21 +15,6 @@ cdef extern from "numpy/arrayobject.h": NPY_ARRAY_F_CONTIGUOUS -cdef extern from *: - """ - // returns ASCII or UTF8 (py3) view on python str - // python object owns memory, should not be freed - static const char* get_c_string(PyObject* obj) { - #if PY_VERSION_HEX >= 0x03000000 - return PyUnicode_AsUTF8(obj); - #else - return PyString_AsString(obj); - #endif - } - """ - const char *get_c_string(object) except NULL - - cdef extern from "src/headers/stdint.h": enum: UINT8_MAX enum: UINT16_MAX From 4d1916c91027657590e46ddec8ac6e9e69897da5 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Wed, 27 Mar 2019 19:18:02 +0300 Subject: [PATCH 05/11] Re-instate raising TypeError when trying to get string data of non-string object --- pandas/_libs/hashtable_class_helper.pxi.in | 10 +++--- pandas/_libs/tslibs/np_datetime.pyx | 6 ++-- .../tslibs/src/datetime/np_datetime_strings.c | 5 +-- pandas/_libs/tslibs/util.pxd | 35 +++++++++++++------ 4 files changed, 35 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 6689869a19748..879f99734aa7b 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -9,7 +9,7 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # VectorData # ---------------------------------------------------------------------- -from pandas._libs.tslibs.util cimport get_string_data +from pandas._libs.tslibs.util cimport get_string_data, get_string_data_checked {{py: @@ -597,7 +597,7 @@ cdef class StringHashTable(HashTable): cdef: khiter_t k const char *v - get_string_data(val, &v, NULL) + get_string_data_checked(val, &v, NULL) k = kh_get_str(self.table, v) if k != self.table.n_buckets: @@ -611,7 +611,7 @@ cdef class StringHashTable(HashTable): int ret = 0 const char *v - get_string_data(val, &v, NULL) + get_string_data_checked(val, &v, NULL) k = kh_put_str(self.table, v, &ret) self.table.keys[k] = key @@ -634,7 +634,7 @@ cdef class StringHashTable(HashTable): vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] - get_string_data(val, &v, NULL) + get_string_data_checked(val, &v, NULL) vecs[i] = v with nogil: @@ -778,7 +778,7 @@ cdef class StringHashTable(HashTable): labels[i] = na_sentinel else: # if ignore_na is False, we also stringify NaN/None/etc. - get_string_data(val, &v, NULL) + get_string_data_checked(val, &v, NULL) vecs[i] = v # compute diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index a67dc592e6eb4..78bc5998c5335 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -12,7 +12,7 @@ from cpython.datetime cimport (datetime, date, PyDateTime_IMPORT from numpy cimport int64_t -from pandas._libs.tslibs.util cimport get_string_data +from pandas._libs.tslibs.util cimport get_string_data_checked cdef extern from "src/datetime/np_datetime.h": int cmp_npy_datetimestruct(npy_datetimestruct *a, @@ -177,8 +177,6 @@ cdef inline int _string_to_dts(object val, npy_datetimestruct* dts, Py_ssize_t length const char* tmp - if not get_string_data(val, &tmp, &length): - raise ValueError('Unable to parse %s' % str(val)) + get_string_data_checked(val, &tmp, &length) return parse_iso_8601_datetime(tmp, length, dts, out_local, out_tzoffset) - diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index aa925d160760f..abeeaba1d1198 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -72,7 +72,7 @@ int parse_iso_8601_datetime(const char *str, int len, int year_leap = 0; int i, numdigits; const char *substr; - char sublen; + int sublen; /* If year-month-day are separated by a valid separator, * months/days without leading zeroes will be parsed @@ -587,7 +587,8 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) { */ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, NPY_DATETIMEUNIT base) { - char *substr = outstr, sublen = outlen; + char *substr = outstr; + int sublen = outlen; int tmplen; /* diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index a22ac449a6e3d..e77c13e741ecd 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -1,5 +1,5 @@ -from cpython cimport PyTypeObject +from cpython cimport PyTypeObject, PyErr_BadArgument cdef extern from *: """ @@ -24,9 +24,14 @@ cdef extern from "Python.h": bint PyFloat_Check(object obj) nogil bint PyComplex_Check(object obj) nogil bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil + + # Note that following functions can potentially raise an exception, + # thus they cannot be declared 'nogil'. Also PyUnicode_AsUTF8AndSize() can + # potentially allocate memory inside in unlikely case of when underlying + # unicode object was stored as non-utf8 and utf8 wasn't requested before. bint PyBytes_AsStringAndSize(object obj, char** buf, - Py_ssize_t* length) nogil - char* PyUnicode_AsUTF8AndSize(object obj, Py_ssize_t* length) nogil + Py_ssize_t* length) + char* PyUnicode_AsUTF8AndSize(object obj, Py_ssize_t* length) from numpy cimport int64_t @@ -237,14 +242,15 @@ cdef inline bint get_string_data(object s, const char **buf, Py_ssize_t *length): """ Extract internal char * buffer of unicode or bytes object `s` to `buf` with - getting length of this internal buffer, that save in `length`. - Return `False` if it failed to extract such buffer for whatever reason - otherwise return `True` + getting length of this internal buffer saved in `length`. + Returns `False` if it failed to extract such buffer for whatever reason, + otherwise returns `True`. - Note - ---- - python object owns memory, `buf` should not be freed - `length` can be NULL + Notes + ----- + Python object owns memory, `buf` should not be freed. + `length` can be NULL if getting buffer length is not needed. + This function should only raise exceptions in out-of-memory cases. Parameters ---------- @@ -262,3 +268,12 @@ cdef inline bint get_string_data(object s, const char **buf, if PyBytes_Check(s): return PyBytes_AsStringAndSize(s, buf, length) == 0 return False + +cdef inline void get_string_data_checked(object s, const char **buf, + Py_ssize_t *length): + """ + This is a wrapper for get_string_data() that raises TypeError + when supplied with neither unicode nor bytes object + """ + if not get_string_data(s, buf, length): + PyErr_BadArgument() From 789614e54fa2d8b4caa93bff83e17b980e3bbcb8 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 28 Mar 2019 10:17:59 +0300 Subject: [PATCH 06/11] test case for overflow in parse_iso_8601_datetime --- pandas/tests/tslibs/test_parse_iso8601.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/tslibs/test_parse_iso8601.py b/pandas/tests/tslibs/test_parse_iso8601.py index d1b3dee948afe..e8efc169e2114 100644 --- a/pandas/tests/tslibs/test_parse_iso8601.py +++ b/pandas/tests/tslibs/test_parse_iso8601.py @@ -60,3 +60,15 @@ def test_parsers_iso8601_invalid_offset_invalid(): with pytest.raises(ValueError, match=msg): tslib._test_parse_iso8601(date_str) + + +def test_parsers_iso8601_overflow(): + # if use char sublen variable in parse_iso_8601_datetime, then overflow + # occurs and no exception is thrown + date_str = "2013-01-01 05:30:00aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\ + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\ + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\ + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\ + "aaaaaaaaaaaaaaaaaaaaaa" + with pytest.raises(ValueError): + tslib._test_parse_iso8601(date_str) From 2bdddc820f81eebde970f1d566d43f59d4d26ec4 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 28 Mar 2019 13:04:41 +0300 Subject: [PATCH 07/11] change get_string_data signature to more pythonic --- pandas/_libs/hashtable_class_helper.pxi.in | 16 ++++++------- pandas/_libs/tslibs/np_datetime.pyx | 2 +- pandas/_libs/tslibs/util.pxd | 27 +++++++++++++--------- 3 files changed, 25 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 879f99734aa7b..f538489407e14 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -597,7 +597,7 @@ cdef class StringHashTable(HashTable): cdef: khiter_t k const char *v - get_string_data_checked(val, &v, NULL) + v = get_string_data_checked(val, NULL) k = kh_get_str(self.table, v) if k != self.table.n_buckets: @@ -611,7 +611,7 @@ cdef class StringHashTable(HashTable): int ret = 0 const char *v - get_string_data_checked(val, &v, NULL) + v = get_string_data_checked(val, NULL) k = kh_put_str(self.table, v, &ret) self.table.keys[k] = key @@ -634,7 +634,7 @@ cdef class StringHashTable(HashTable): vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] - get_string_data_checked(val, &v, NULL) + v = get_string_data_checked(val, NULL) vecs[i] = v with nogil: @@ -664,9 +664,9 @@ cdef class StringHashTable(HashTable): val = values[i] if isinstance(val, (str, unicode)): - get_string_data(val, &v, NULL) + v = get_string_data(val, NULL) else: - get_string_data(self.na_string_sentinel, &v, NULL) + v = get_string_data(self.na_string_sentinel, NULL) vecs[i] = v with nogil: @@ -697,9 +697,9 @@ cdef class StringHashTable(HashTable): val = values[i] if isinstance(val, (str, unicode)): - get_string_data(val, &v, NULL) + v = get_string_data(val, NULL) else: - get_string_data(self.na_string_sentinel, &v, NULL) + v = get_string_data(self.na_string_sentinel, NULL) vecs[i] = v with nogil: @@ -778,7 +778,7 @@ cdef class StringHashTable(HashTable): labels[i] = na_sentinel else: # if ignore_na is False, we also stringify NaN/None/etc. - get_string_data_checked(val, &v, NULL) + v = get_string_data_checked(val, NULL) vecs[i] = v # compute diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 78bc5998c5335..72e2fac847267 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -177,6 +177,6 @@ cdef inline int _string_to_dts(object val, npy_datetimestruct* dts, Py_ssize_t length const char* tmp - get_string_data_checked(val, &tmp, &length) + tmp = get_string_data_checked(val, &length) return parse_iso_8601_datetime(tmp, length, dts, out_local, out_tzoffset) diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index e77c13e741ecd..195e452839c7a 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -238,8 +238,7 @@ cdef inline bint is_nan(object val): return (is_float_object(val) or is_complex_object(val)) and val != val -cdef inline bint get_string_data(object s, const char **buf, - Py_ssize_t *length): +cdef inline const char* get_string_data(object s, Py_ssize_t *length): """ Extract internal char * buffer of unicode or bytes object `s` to `buf` with getting length of this internal buffer saved in `length`. @@ -255,25 +254,31 @@ cdef inline bint get_string_data(object s, const char **buf, Parameters ---------- s : object - buf : const char** length : Py_ssize_t* Returns ------- - bint + buf : const char* """ + cdef: + const char *buf + if PyUnicode_Check(s): - buf[0] = PyUnicode_AsUTF8AndSize(s, length) - return buf[0] != NULL + buf = PyUnicode_AsUTF8AndSize(s, length) if PyBytes_Check(s): - return PyBytes_AsStringAndSize(s, buf, length) == 0 - return False + if PyBytes_AsStringAndSize(s, &buf, length) != 0: + return NULL + return buf + -cdef inline void get_string_data_checked(object s, const char **buf, - Py_ssize_t *length): +cdef inline const char* get_string_data_checked(object s, Py_ssize_t *length): """ This is a wrapper for get_string_data() that raises TypeError when supplied with neither unicode nor bytes object """ - if not get_string_data(s, buf, length): + cdef: + const char *buf = get_string_data(s, length) + + if not buf: PyErr_BadArgument() + return buf From 13c8e95ea621280027364cae424e6ee135e1209d Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 28 Mar 2019 13:58:05 +0300 Subject: [PATCH 08/11] Added test for parsing leading spaces --- pandas/tests/tslibs/test_parse_iso8601.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pandas/tests/tslibs/test_parse_iso8601.py b/pandas/tests/tslibs/test_parse_iso8601.py index e8efc169e2114..6fe57f9cc2539 100644 --- a/pandas/tests/tslibs/test_parse_iso8601.py +++ b/pandas/tests/tslibs/test_parse_iso8601.py @@ -62,13 +62,7 @@ def test_parsers_iso8601_invalid_offset_invalid(): tslib._test_parse_iso8601(date_str) -def test_parsers_iso8601_overflow(): - # if use char sublen variable in parse_iso_8601_datetime, then overflow - # occurs and no exception is thrown - date_str = "2013-01-01 05:30:00aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\ - "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\ - "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\ - "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\ - "aaaaaaaaaaaaaaaaaaaaaa" - with pytest.raises(ValueError): - tslib._test_parse_iso8601(date_str) +def test_parsers_iso8601_leading_space(): + date_str, expected = ("2013-1-1 5:30:00", datetime(2013, 1, 1, 5, 30)) + actual = tslib._test_parse_iso8601(' ' * 200 + date_str) + assert actual == expected From 1db613e6b43f39987acc9e72f29dfc8dd40938ec Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 28 Mar 2019 14:31:01 +0300 Subject: [PATCH 09/11] Rework get_string_data to cleaner get_c_string_buf_and_size --- pandas/_libs/hashtable_class_helper.pxi.in | 18 +++++------ pandas/_libs/tslibs/np_datetime.pyx | 8 ++--- pandas/_libs/tslibs/util.pxd | 35 +++++++--------------- 3 files changed, 24 insertions(+), 37 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index f538489407e14..8c2c560c062ac 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -9,7 +9,7 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # VectorData # ---------------------------------------------------------------------- -from pandas._libs.tslibs.util cimport get_string_data, get_string_data_checked +from pandas._libs.tslibs.util cimport get_c_string {{py: @@ -597,7 +597,7 @@ cdef class StringHashTable(HashTable): cdef: khiter_t k const char *v - v = get_string_data_checked(val, NULL) + v = get_c_string(val) k = kh_get_str(self.table, v) if k != self.table.n_buckets: @@ -611,7 +611,7 @@ cdef class StringHashTable(HashTable): int ret = 0 const char *v - v = get_string_data_checked(val, NULL) + v = get_c_string(val) k = kh_put_str(self.table, v, &ret) self.table.keys[k] = key @@ -634,7 +634,7 @@ cdef class StringHashTable(HashTable): vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] - v = get_string_data_checked(val, NULL) + v = get_c_string(val) vecs[i] = v with nogil: @@ -664,9 +664,9 @@ cdef class StringHashTable(HashTable): val = values[i] if isinstance(val, (str, unicode)): - v = get_string_data(val, NULL) + v = get_c_string(val) else: - v = get_string_data(self.na_string_sentinel, NULL) + v = get_c_string(self.na_string_sentinel) vecs[i] = v with nogil: @@ -697,9 +697,9 @@ cdef class StringHashTable(HashTable): val = values[i] if isinstance(val, (str, unicode)): - v = get_string_data(val, NULL) + v = get_c_string(val) else: - v = get_string_data(self.na_string_sentinel, NULL) + v = get_c_string(self.na_string_sentinel) vecs[i] = v with nogil: @@ -778,7 +778,7 @@ cdef class StringHashTable(HashTable): labels[i] = na_sentinel else: # if ignore_na is False, we also stringify NaN/None/etc. - v = get_string_data_checked(val, NULL) + v = get_c_string(val) vecs[i] = v # compute diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 72e2fac847267..000ec94901457 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -12,7 +12,7 @@ from cpython.datetime cimport (datetime, date, PyDateTime_IMPORT from numpy cimport int64_t -from pandas._libs.tslibs.util cimport get_string_data_checked +from pandas._libs.tslibs.util cimport get_c_string_buf_and_size cdef extern from "src/datetime/np_datetime.h": int cmp_npy_datetimestruct(npy_datetimestruct *a, @@ -175,8 +175,8 @@ cdef inline int _string_to_dts(object val, npy_datetimestruct* dts, int* out_local, int* out_tzoffset) except? -1: cdef: Py_ssize_t length - const char* tmp + const char* buf - tmp = get_string_data_checked(val, &length) - return parse_iso_8601_datetime(tmp, length, + buf = get_c_string_buf_and_size(val, &length) + return parse_iso_8601_datetime(buf, length, dts, out_local, out_tzoffset) diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index 195e452839c7a..84dc2869bbc41 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -1,5 +1,5 @@ -from cpython cimport PyTypeObject, PyErr_BadArgument +from cpython cimport PyTypeObject cdef extern from *: """ @@ -18,7 +18,6 @@ cdef extern from "Python.h": # Note: importing extern-style allows us to declare these as nogil # functions, whereas `from cpython cimport` does not. bint PyUnicode_Check(object obj) nogil - bint PyBytes_Check(object obj) nogil bint PyString_Check(object obj) nogil bint PyBool_Check(object obj) nogil bint PyFloat_Check(object obj) nogil @@ -30,8 +29,8 @@ cdef extern from "Python.h": # potentially allocate memory inside in unlikely case of when underlying # unicode object was stored as non-utf8 and utf8 wasn't requested before. bint PyBytes_AsStringAndSize(object obj, char** buf, - Py_ssize_t* length) - char* PyUnicode_AsUTF8AndSize(object obj, Py_ssize_t* length) + Py_ssize_t* length) except -1 + char* PyUnicode_AsUTF8AndSize(object obj, Py_ssize_t* length) except NULL from numpy cimport int64_t @@ -238,18 +237,16 @@ cdef inline bint is_nan(object val): return (is_float_object(val) or is_complex_object(val)) and val != val -cdef inline const char* get_string_data(object s, Py_ssize_t *length): +cdef inline const char* get_c_string_buf_and_size(object s, + Py_ssize_t *length): """ - Extract internal char * buffer of unicode or bytes object `s` to `buf` with + Extract internal char * buffer of unicode or bytes object `s` with getting length of this internal buffer saved in `length`. - Returns `False` if it failed to extract such buffer for whatever reason, - otherwise returns `True`. Notes ----- - Python object owns memory, `buf` should not be freed. + Python object owns memory, thus returned char* must not be freed. `length` can be NULL if getting buffer length is not needed. - This function should only raise exceptions in out-of-memory cases. Parameters ---------- @@ -265,20 +262,10 @@ cdef inline const char* get_string_data(object s, Py_ssize_t *length): if PyUnicode_Check(s): buf = PyUnicode_AsUTF8AndSize(s, length) - if PyBytes_Check(s): - if PyBytes_AsStringAndSize(s, &buf, length) != 0: - return NULL + else: + PyBytes_AsStringAndSize(s, &buf, length) return buf -cdef inline const char* get_string_data_checked(object s, Py_ssize_t *length): - """ - This is a wrapper for get_string_data() that raises TypeError - when supplied with neither unicode nor bytes object - """ - cdef: - const char *buf = get_string_data(s, length) - - if not buf: - PyErr_BadArgument() - return buf +cdef inline const char* get_c_string(object s): + return get_c_string_buf_and_size(s, NULL) From bbf37c619b18ebdbba56e2ba66ea85bf2c68a795 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 28 Mar 2019 07:13:06 -0500 Subject: [PATCH 10/11] Fix Python 3.7 compilation --- pandas/_libs/tslibs/util.pxd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index 84dc2869bbc41..b2641e2587f47 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -30,7 +30,8 @@ cdef extern from "Python.h": # unicode object was stored as non-utf8 and utf8 wasn't requested before. bint PyBytes_AsStringAndSize(object obj, char** buf, Py_ssize_t* length) except -1 - char* PyUnicode_AsUTF8AndSize(object obj, Py_ssize_t* length) except NULL + const char* PyUnicode_AsUTF8AndSize(object obj, + Py_ssize_t* length) except NULL from numpy cimport int64_t From f5f8e2959b6a4fa797c607298aeea14337d34d57 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 29 Mar 2019 08:54:30 +0300 Subject: [PATCH 11/11] added comment for test; changed name variable: s -> py_string --- pandas/_libs/tslibs/util.pxd | 16 ++++++++-------- pandas/tests/tslibs/test_parse_iso8601.py | 1 + 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index b2641e2587f47..414a26c349452 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -238,10 +238,10 @@ cdef inline bint is_nan(object val): return (is_float_object(val) or is_complex_object(val)) and val != val -cdef inline const char* get_c_string_buf_and_size(object s, +cdef inline const char* get_c_string_buf_and_size(object py_string, Py_ssize_t *length): """ - Extract internal char * buffer of unicode or bytes object `s` with + Extract internal char* buffer of unicode or bytes object `py_string` with getting length of this internal buffer saved in `length`. Notes @@ -251,7 +251,7 @@ cdef inline const char* get_c_string_buf_and_size(object s, Parameters ---------- - s : object + py_string : object length : Py_ssize_t* Returns @@ -261,12 +261,12 @@ cdef inline const char* get_c_string_buf_and_size(object s, cdef: const char *buf - if PyUnicode_Check(s): - buf = PyUnicode_AsUTF8AndSize(s, length) + if PyUnicode_Check(py_string): + buf = PyUnicode_AsUTF8AndSize(py_string, length) else: - PyBytes_AsStringAndSize(s, &buf, length) + PyBytes_AsStringAndSize(py_string, &buf, length) return buf -cdef inline const char* get_c_string(object s): - return get_c_string_buf_and_size(s, NULL) +cdef inline const char* get_c_string(object py_string): + return get_c_string_buf_and_size(py_string, NULL) diff --git a/pandas/tests/tslibs/test_parse_iso8601.py b/pandas/tests/tslibs/test_parse_iso8601.py index 6fe57f9cc2539..3a4625d880bb3 100644 --- a/pandas/tests/tslibs/test_parse_iso8601.py +++ b/pandas/tests/tslibs/test_parse_iso8601.py @@ -63,6 +63,7 @@ def test_parsers_iso8601_invalid_offset_invalid(): def test_parsers_iso8601_leading_space(): + # GH#25895 make sure isoparser doesn't overflow with long input date_str, expected = ("2013-1-1 5:30:00", datetime(2013, 1, 1, 5, 30)) actual = tslib._test_parse_iso8601(' ' * 200 + date_str) assert actual == expected