Skip to content

Implement helper method to get char* buffer from Python objects #25895

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Mar 29, 2019
Merged
18 changes: 10 additions & 8 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
# VectorData
# ----------------------------------------------------------------------

from pandas._libs.tslibs.util cimport get_c_string

{{py:

# name, dtype, arg
Expand Down Expand Up @@ -595,7 +597,7 @@ cdef class StringHashTable(HashTable):
cdef:
khiter_t k
const char *v
v = util.get_c_string(val)
v = get_c_string(val)

k = kh_get_str(self.table, v)
if k != self.table.n_buckets:
Expand All @@ -609,7 +611,7 @@ cdef class StringHashTable(HashTable):
int ret = 0
const char *v

v = util.get_c_string(val)
v = get_c_string(val)

k = kh_put_str(self.table, v, &ret)
self.table.keys[k] = key
Expand All @@ -632,7 +634,7 @@ cdef class StringHashTable(HashTable):
vecs = <const char **>malloc(n * sizeof(char *))
for i in range(n):
val = values[i]
v = util.get_c_string(val)
v = get_c_string(val)
vecs[i] = v

with nogil:
Expand Down Expand Up @@ -662,9 +664,9 @@ cdef class StringHashTable(HashTable):
val = values[i]

if isinstance(val, (str, unicode)):
v = util.get_c_string(val)
v = get_c_string(val)
else:
v = util.get_c_string(self.na_string_sentinel)
v = get_c_string(self.na_string_sentinel)
vecs[i] = v

with nogil:
Expand Down Expand Up @@ -695,9 +697,9 @@ cdef class StringHashTable(HashTable):
val = values[i]

if isinstance(val, (str, unicode)):
v = util.get_c_string(val)
v = get_c_string(val)
else:
v = util.get_c_string(self.na_string_sentinel)
v = get_c_string(self.na_string_sentinel)
vecs[i] = v

with nogil:
Expand Down Expand Up @@ -776,7 +778,7 @@ cdef class StringHashTable(HashTable):
labels[i] = na_sentinel
else:
# if ignore_na is False, we also stringify NaN/None/etc.
v = util.get_c_string(val)
v = get_c_string(val)
vecs[i] = v

# compute
Expand Down
37 changes: 8 additions & 29 deletions pandas/_libs/tslibs/np_datetime.pyx
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-

from cpython cimport (Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE,
PyUnicode_AsASCIIString)
from cpython cimport Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE

from cpython.datetime cimport (datetime, date,
PyDateTime_IMPORT,
Expand All @@ -13,6 +12,7 @@ from cpython.datetime cimport (datetime, date,
PyDateTime_IMPORT

from numpy cimport int64_t
from pandas._libs.tslibs.util cimport get_c_string_buf_and_size

cdef extern from "src/datetime/np_datetime.h":
int cmp_npy_datetimestruct(npy_datetimestruct *a,
Expand All @@ -33,7 +33,7 @@ cdef extern from "src/datetime/np_datetime.h":
npy_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS

cdef extern from "src/datetime/np_datetime_strings.h":
int parse_iso_8601_datetime(char *str, int len,
int parse_iso_8601_datetime(const char *str, int len,
npy_datetimestruct *out,
int *out_local, int *out_tzoffset)

Expand Down Expand Up @@ -174,30 +174,9 @@ cdef inline int64_t pydate_to_dt64(date val, npy_datetimestruct *dts):
cdef inline int _string_to_dts(object val, npy_datetimestruct* dts,
int* out_local, int* out_tzoffset) except? -1:
cdef:
int result
char *tmp
Py_ssize_t length
const char* buf

if isinstance(val, unicode):
val = PyUnicode_AsASCIIString(val)

tmp = val
result = _cstring_to_dts(tmp, len(val), dts, out_local, out_tzoffset)

if result == -1:
raise ValueError('Unable to parse %s' % str(val))
return result


cdef inline int _cstring_to_dts(char *val, int length,
npy_datetimestruct* dts,
int* out_local, int* out_tzoffset) except? -1:
# Note: without this "extra layer" between _string_to_dts
# and parse_iso_8601_datetime, calling _string_to_dts raises
# `SystemError: <class 'str'> returned a result with an error set`
# in Python3
cdef:
int result

result = parse_iso_8601_datetime(val, length,
dts, out_local, out_tzoffset)
return result
buf = get_c_string_buf_and_size(val, &length)
return parse_iso_8601_datetime(buf, length,
dts, out_local, out_tzoffset)
8 changes: 5 additions & 3 deletions pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,13 @@ This file implements string parsing and creation for NumPy datetime.
*
* Returns 0 on success, -1 on failure.
*/
int parse_iso_8601_datetime(char *str, int len,
int parse_iso_8601_datetime(const char *str, int len,
npy_datetimestruct *out,
int *out_local, int *out_tzoffset) {
int year_leap = 0;
int i, numdigits;
char *substr, sublen;
const char *substr;
int sublen;

/* If year-month-day are separated by a valid separator,
* months/days without leading zeroes will be parsed
Expand Down Expand Up @@ -586,7 +587,8 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) {
*/
int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
NPY_DATETIMEUNIT base) {
char *substr = outstr, sublen = outlen;
char *substr = outstr;
int sublen = outlen;
int tmplen;

/*
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/src/datetime/np_datetime_strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ This file implements string parsing and creation for NumPy datetime.
* Returns 0 on success, -1 on failure.
*/
int
parse_iso_8601_datetime(char *str, int len,
parse_iso_8601_datetime(const char *str, int len,
npy_datetimestruct *out,
int *out_local,
int *out_tzoffset);
Expand Down
43 changes: 43 additions & 0 deletions pandas/_libs/tslibs/util.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@ cdef extern from "Python.h":
bint PyComplex_Check(object obj) nogil
bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil

# Note that the following functions can potentially raise an exception,
# thus they cannot be declared 'nogil'. Also, PyUnicode_AsUTF8AndSize() can
# allocate memory internally in the unlikely case that the underlying
# unicode object was not stored as UTF-8 and its UTF-8 representation has
# not been requested before.
bint PyBytes_AsStringAndSize(object obj, char** buf,
Py_ssize_t* length) except -1
const char* PyUnicode_AsUTF8AndSize(object obj,
Py_ssize_t* length) except NULL

from numpy cimport int64_t

cdef extern from "numpy/arrayobject.h":
Expand Down Expand Up @@ -227,3 +236,37 @@ cdef inline bint is_nan(object val):
is_nan : bool
"""
return (is_float_object(val) or is_complex_object(val)) and val != val


cdef inline const char* get_c_string_buf_and_size(
        object py_string, Py_ssize_t *length) except NULL:
    """
    Extract the internal char* buffer of the unicode or bytes object
    `py_string`, storing the length of that buffer in `length`.

    Notes
    -----
    The Python object owns the memory, thus the returned char* must not be
    freed.
    `length` can be NULL if the buffer length is not needed.

    Parameters
    ----------
    py_string : object
    length : Py_ssize_t*

    Returns
    -------
    buf : const char*

    Raises
    ------
    Propagates any Python exception set by PyUnicode_AsUTF8AndSize or
    PyBytes_AsStringAndSize. The `except NULL` clause is what lets the
    error reach the caller instead of being swallowed.
    """
    cdef:
        # Initialised so a failed lookup can never hand back garbage.
        const char *buf = NULL

    if PyUnicode_Check(py_string):
        buf = PyUnicode_AsUTF8AndSize(py_string, length)
    else:
        # Declared `except -1`; on failure the exception propagates from
        # here and `buf` is never returned.
        PyBytes_AsStringAndSize(py_string, <char**>&buf, length)
    return buf


cdef inline const char* get_c_string(object py_string) except NULL:
    """
    Return the internal char* buffer of the unicode or bytes object
    `py_string`, without reporting its length.

    The Python object owns the memory, thus the returned char* must not be
    freed. Exceptions raised while extracting the buffer are propagated.
    """
    return get_c_string_buf_and_size(py_string, NULL)
15 changes: 0 additions & 15 deletions pandas/_libs/util.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,6 @@ cdef extern from "numpy/arrayobject.h":
NPY_ARRAY_F_CONTIGUOUS


cdef extern from *:
"""
// returns ASCII or UTF8 (py3) view on python str
// python object owns memory, should not be freed
static const char* get_c_string(PyObject* obj) {
#if PY_VERSION_HEX >= 0x03000000
return PyUnicode_AsUTF8(obj);
#else
return PyString_AsString(obj);
#endif
}
"""
const char *get_c_string(object) except NULL


cdef extern from "src/headers/stdint.h":
enum: UINT8_MAX
enum: UINT16_MAX
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/tslibs/test_parse_iso8601.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,10 @@ def test_parsers_iso8601_invalid_offset_invalid():

with pytest.raises(ValueError, match=msg):
tslib._test_parse_iso8601(date_str)


def test_parsers_iso8601_leading_space():
    # GH#25895: a long run of leading spaces must not overflow the isoparser
    padding = ' ' * 200
    date_str = "2013-1-1 5:30:00"
    expected = datetime(2013, 1, 1, 5, 30)

    actual = tslib._test_parse_iso8601(padding + date_str)
    assert actual == expected