Skip to content

Commit 1d4c89f

Browse files
vnlitvinov authored and jbrockmendel committed
Implement helper method to get char* buffer from Python objects (pandas-dev#25895)
* removed extra layer; using get_string_data now
* fix problem with the const char* value that PyUnicode_AsUTF8AndSize returns in the Python 3.7 case; added docstring to get_string_data func
* fix code style
* replaced get_c_string with get_string_data, added 'note' paragraph in get_string_data docstring
* Re-instate raising TypeError when trying to get string data of non-string object
* test case for overflow in parse_iso_8601_datetime
* change get_string_data signature to be more pythonic
* Added test for parsing leading spaces
* Rework get_string_data into cleaner get_c_string_buf_and_size
* Fix Python 3.7 compilation
* added comment for test; changed variable name: s -> py_string
1 parent 19626d2 commit 1d4c89f

File tree

7 files changed

+74
-56
lines changed

7 files changed

+74
-56
lines changed

pandas/_libs/hashtable_class_helper.pxi.in

+10-8
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
99
# VectorData
1010
# ----------------------------------------------------------------------
1111

12+
from pandas._libs.tslibs.util cimport get_c_string
13+
1214
{{py:
1315

1416
# name, dtype, arg
@@ -595,7 +597,7 @@ cdef class StringHashTable(HashTable):
595597
cdef:
596598
khiter_t k
597599
const char *v
598-
v = util.get_c_string(val)
600+
v = get_c_string(val)
599601

600602
k = kh_get_str(self.table, v)
601603
if k != self.table.n_buckets:
@@ -609,7 +611,7 @@ cdef class StringHashTable(HashTable):
609611
int ret = 0
610612
const char *v
611613

612-
v = util.get_c_string(val)
614+
v = get_c_string(val)
613615

614616
k = kh_put_str(self.table, v, &ret)
615617
self.table.keys[k] = key
@@ -632,7 +634,7 @@ cdef class StringHashTable(HashTable):
632634
vecs = <const char **>malloc(n * sizeof(char *))
633635
for i in range(n):
634636
val = values[i]
635-
v = util.get_c_string(val)
637+
v = get_c_string(val)
636638
vecs[i] = v
637639

638640
with nogil:
@@ -662,9 +664,9 @@ cdef class StringHashTable(HashTable):
662664
val = values[i]
663665

664666
if isinstance(val, (str, unicode)):
665-
v = util.get_c_string(val)
667+
v = get_c_string(val)
666668
else:
667-
v = util.get_c_string(self.na_string_sentinel)
669+
v = get_c_string(self.na_string_sentinel)
668670
vecs[i] = v
669671

670672
with nogil:
@@ -695,9 +697,9 @@ cdef class StringHashTable(HashTable):
695697
val = values[i]
696698

697699
if isinstance(val, (str, unicode)):
698-
v = util.get_c_string(val)
700+
v = get_c_string(val)
699701
else:
700-
v = util.get_c_string(self.na_string_sentinel)
702+
v = get_c_string(self.na_string_sentinel)
701703
vecs[i] = v
702704

703705
with nogil:
@@ -776,7 +778,7 @@ cdef class StringHashTable(HashTable):
776778
labels[i] = na_sentinel
777779
else:
778780
# if ignore_na is False, we also stringify NaN/None/etc.
779-
v = util.get_c_string(val)
781+
v = get_c_string(val)
780782
vecs[i] = v
781783

782784
# compute

pandas/_libs/tslibs/np_datetime.pyx

+8-29
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22

3-
from cpython cimport (Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE,
4-
PyUnicode_AsASCIIString)
3+
from cpython cimport Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE
54

65
from cpython.datetime cimport (datetime, date,
76
PyDateTime_IMPORT,
@@ -13,6 +12,7 @@ from cpython.datetime cimport (datetime, date,
1312
PyDateTime_IMPORT
1413

1514
from numpy cimport int64_t
15+
from pandas._libs.tslibs.util cimport get_c_string_buf_and_size
1616

1717
cdef extern from "src/datetime/np_datetime.h":
1818
int cmp_npy_datetimestruct(npy_datetimestruct *a,
@@ -33,7 +33,7 @@ cdef extern from "src/datetime/np_datetime.h":
3333
npy_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS
3434

3535
cdef extern from "src/datetime/np_datetime_strings.h":
36-
int parse_iso_8601_datetime(char *str, int len,
36+
int parse_iso_8601_datetime(const char *str, int len,
3737
npy_datetimestruct *out,
3838
int *out_local, int *out_tzoffset)
3939

@@ -174,30 +174,9 @@ cdef inline int64_t pydate_to_dt64(date val, npy_datetimestruct *dts):
174174
cdef inline int _string_to_dts(object val, npy_datetimestruct* dts,
175175
int* out_local, int* out_tzoffset) except? -1:
176176
cdef:
177-
int result
178-
char *tmp
177+
Py_ssize_t length
178+
const char* buf
179179

180-
if isinstance(val, unicode):
181-
val = PyUnicode_AsASCIIString(val)
182-
183-
tmp = val
184-
result = _cstring_to_dts(tmp, len(val), dts, out_local, out_tzoffset)
185-
186-
if result == -1:
187-
raise ValueError('Unable to parse %s' % str(val))
188-
return result
189-
190-
191-
cdef inline int _cstring_to_dts(char *val, int length,
192-
npy_datetimestruct* dts,
193-
int* out_local, int* out_tzoffset) except? -1:
194-
# Note: without this "extra layer" between _string_to_dts
195-
# and parse_iso_8601_datetime, calling _string_to_dts raises
196-
# `SystemError: <class 'str'> returned a result with an error set`
197-
# in Python3
198-
cdef:
199-
int result
200-
201-
result = parse_iso_8601_datetime(val, length,
202-
dts, out_local, out_tzoffset)
203-
return result
180+
buf = get_c_string_buf_and_size(val, &length)
181+
return parse_iso_8601_datetime(buf, length,
182+
dts, out_local, out_tzoffset)

pandas/_libs/tslibs/src/datetime/np_datetime_strings.c

+5-3
Original file line numberDiff line numberDiff line change
@@ -66,12 +66,13 @@ This file implements string parsing and creation for NumPy datetime.
6666
*
6767
* Returns 0 on success, -1 on failure.
6868
*/
69-
int parse_iso_8601_datetime(char *str, int len,
69+
int parse_iso_8601_datetime(const char *str, int len,
7070
npy_datetimestruct *out,
7171
int *out_local, int *out_tzoffset) {
7272
int year_leap = 0;
7373
int i, numdigits;
74-
char *substr, sublen;
74+
const char *substr;
75+
int sublen;
7576

7677
/* If year-month-day are separated by a valid separator,
7778
* months/days without leading zeroes will be parsed
@@ -586,7 +587,8 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) {
586587
*/
587588
int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
588589
NPY_DATETIMEUNIT base) {
589-
char *substr = outstr, sublen = outlen;
590+
char *substr = outstr;
591+
int sublen = outlen;
590592
int tmplen;
591593

592594
/*

pandas/_libs/tslibs/src/datetime/np_datetime_strings.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ This file implements string parsing and creation for NumPy datetime.
5454
* Returns 0 on success, -1 on failure.
5555
*/
5656
int
57-
parse_iso_8601_datetime(char *str, int len,
57+
parse_iso_8601_datetime(const char *str, int len,
5858
npy_datetimestruct *out,
5959
int *out_local,
6060
int *out_tzoffset);

pandas/_libs/tslibs/util.pxd

+43
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,15 @@ cdef extern from "Python.h":
2424
bint PyComplex_Check(object obj) nogil
2525
bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil
2626

27+
# Note that the following functions can potentially raise an exception,
28+
# thus they cannot be declared 'nogil'. Also, PyUnicode_AsUTF8AndSize() can
29+
# allocate memory internally in the unlikely case that the underlying
30+
# unicode object was stored as non-UTF-8 and UTF-8 was not requested before.
31+
bint PyBytes_AsStringAndSize(object obj, char** buf,
32+
Py_ssize_t* length) except -1
33+
const char* PyUnicode_AsUTF8AndSize(object obj,
34+
Py_ssize_t* length) except NULL
35+
2736
from numpy cimport int64_t
2837

2938
cdef extern from "numpy/arrayobject.h":
@@ -227,3 +236,37 @@ cdef inline bint is_nan(object val):
227236
is_nan : bool
228237
"""
229238
return (is_float_object(val) or is_complex_object(val)) and val != val
239+
240+
241+
cdef inline const char* get_c_string_buf_and_size(object py_string,
242+
Py_ssize_t *length):
243+
"""
244+
Extract the internal char* buffer of the unicode or bytes object
245+
`py_string`, storing the length of that buffer in `length`.
246+
247+
Notes
248+
-----
249+
Python object owns memory, thus returned char* must not be freed.
250+
`length` can be NULL if getting buffer length is not needed.
251+
252+
Parameters
253+
----------
254+
py_string : object
255+
length : Py_ssize_t*
256+
257+
Returns
258+
-------
259+
buf : const char*
260+
"""
261+
cdef:
262+
const char *buf
263+
264+
if PyUnicode_Check(py_string):
265+
buf = PyUnicode_AsUTF8AndSize(py_string, length)
266+
else:
267+
PyBytes_AsStringAndSize(py_string, <char**>&buf, length)
268+
return buf
269+
270+
271+
cdef inline const char* get_c_string(object py_string):
272+
return get_c_string_buf_and_size(py_string, NULL)

pandas/_libs/util.pxd

-15
Original file line numberDiff line numberDiff line change
@@ -15,21 +15,6 @@ cdef extern from "numpy/arrayobject.h":
1515
NPY_ARRAY_F_CONTIGUOUS
1616

1717

18-
cdef extern from *:
19-
"""
20-
// returns ASCII or UTF8 (py3) view on python str
21-
// python object owns memory, should not be freed
22-
static const char* get_c_string(PyObject* obj) {
23-
#if PY_VERSION_HEX >= 0x03000000
24-
return PyUnicode_AsUTF8(obj);
25-
#else
26-
return PyString_AsString(obj);
27-
#endif
28-
}
29-
"""
30-
const char *get_c_string(object) except NULL
31-
32-
3318
cdef extern from "src/headers/stdint.h":
3419
enum: UINT8_MAX
3520
enum: UINT16_MAX

pandas/tests/tslibs/test_parse_iso8601.py

+7
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,10 @@ def test_parsers_iso8601_invalid_offset_invalid():
6060

6161
with pytest.raises(ValueError, match=msg):
6262
tslib._test_parse_iso8601(date_str)
63+
64+
65+
def test_parsers_iso8601_leading_space():
66+
# GH#25895 make sure isoparser doesn't overflow with long input
67+
date_str, expected = ("2013-1-1 5:30:00", datetime(2013, 1, 1, 5, 30))
68+
actual = tslib._test_parse_iso8601(' ' * 200 + date_str)
69+
assert actual == expected

0 commit comments

Comments
 (0)