Skip to content

Implement helper method to get char* buffer from Python objects #25895

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Mar 29, 2019
Merged
18 changes: 10 additions & 8 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
# VectorData
# ----------------------------------------------------------------------

from pandas._libs.tslibs.util cimport get_string_data, get_string_data_checked

{{py:

# name, dtype, arg
Expand Down Expand Up @@ -595,7 +597,7 @@ cdef class StringHashTable(HashTable):
cdef:
khiter_t k
const char *v
v = util.get_c_string(val)
get_string_data_checked(val, &v, NULL)

k = kh_get_str(self.table, v)
if k != self.table.n_buckets:
Expand All @@ -609,7 +611,7 @@ cdef class StringHashTable(HashTable):
int ret = 0
const char *v

v = util.get_c_string(val)
get_string_data_checked(val, &v, NULL)

k = kh_put_str(self.table, v, &ret)
self.table.keys[k] = key
Expand All @@ -632,7 +634,7 @@ cdef class StringHashTable(HashTable):
vecs = <const char **>malloc(n * sizeof(char *))
for i in range(n):
val = values[i]
v = util.get_c_string(val)
get_string_data_checked(val, &v, NULL)
vecs[i] = v

with nogil:
Expand Down Expand Up @@ -662,9 +664,9 @@ cdef class StringHashTable(HashTable):
val = values[i]

if isinstance(val, (str, unicode)):
v = util.get_c_string(val)
get_string_data(val, &v, NULL)
else:
v = util.get_c_string(self.na_string_sentinel)
get_string_data(self.na_string_sentinel, &v, NULL)
vecs[i] = v

with nogil:
Expand Down Expand Up @@ -695,9 +697,9 @@ cdef class StringHashTable(HashTable):
val = values[i]

if isinstance(val, (str, unicode)):
v = util.get_c_string(val)
get_string_data(val, &v, NULL)
else:
v = util.get_c_string(self.na_string_sentinel)
get_string_data(self.na_string_sentinel, &v, NULL)
vecs[i] = v

with nogil:
Expand Down Expand Up @@ -776,7 +778,7 @@ cdef class StringHashTable(HashTable):
labels[i] = na_sentinel
else:
# if ignore_na is False, we also stringify NaN/None/etc.
v = util.get_c_string(val)
get_string_data_checked(val, &v, NULL)
vecs[i] = v

# compute
Expand Down
37 changes: 8 additions & 29 deletions pandas/_libs/tslibs/np_datetime.pyx
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-

from cpython cimport (Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE,
PyUnicode_AsASCIIString)
from cpython cimport Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE

from cpython.datetime cimport (datetime, date,
PyDateTime_IMPORT,
Expand All @@ -13,6 +12,7 @@ from cpython.datetime cimport (datetime, date,
PyDateTime_IMPORT

from numpy cimport int64_t
from pandas._libs.tslibs.util cimport get_string_data_checked

cdef extern from "src/datetime/np_datetime.h":
int cmp_npy_datetimestruct(npy_datetimestruct *a,
Expand All @@ -33,7 +33,7 @@ cdef extern from "src/datetime/np_datetime.h":
npy_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS

cdef extern from "src/datetime/np_datetime_strings.h":
int parse_iso_8601_datetime(char *str, int len,
int parse_iso_8601_datetime(const char *str, int len,
npy_datetimestruct *out,
int *out_local, int *out_tzoffset)

Expand Down Expand Up @@ -174,30 +174,9 @@ cdef inline int64_t pydate_to_dt64(date val, npy_datetimestruct *dts):
cdef inline int _string_to_dts(object val, npy_datetimestruct* dts,
int* out_local, int* out_tzoffset) except? -1:
cdef:
int result
char *tmp
Py_ssize_t length
const char* tmp

if isinstance(val, unicode):
val = PyUnicode_AsASCIIString(val)

tmp = val
result = _cstring_to_dts(tmp, len(val), dts, out_local, out_tzoffset)

if result == -1:
raise ValueError('Unable to parse %s' % str(val))
return result


cdef inline int _cstring_to_dts(char *val, int length,
npy_datetimestruct* dts,
int* out_local, int* out_tzoffset) except? -1:
# Note: without this "extra layer" between _string_to_dts
# and parse_iso_8601_datetime, calling _string_to_dts raises
# `SystemError: <class 'str'> returned a result with an error set`
# in Python3
cdef:
int result

result = parse_iso_8601_datetime(val, length,
dts, out_local, out_tzoffset)
return result
get_string_data_checked(val, &tmp, &length)
return parse_iso_8601_datetime(tmp, length,
dts, out_local, out_tzoffset)
8 changes: 5 additions & 3 deletions pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,13 @@ This file implements string parsing and creation for NumPy datetime.
*
* Returns 0 on success, -1 on failure.
*/
int parse_iso_8601_datetime(char *str, int len,
int parse_iso_8601_datetime(const char *str, int len,
npy_datetimestruct *out,
int *out_local, int *out_tzoffset) {
int year_leap = 0;
int i, numdigits;
char *substr, sublen;
const char *substr;
int sublen;

/* If year-month-day are separated by a valid separator,
* months/days without leading zeroes will be parsed
Expand Down Expand Up @@ -586,7 +587,8 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) {
*/
int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
NPY_DATETIMEUNIT base) {
char *substr = outstr, sublen = outlen;
char *substr = outstr;
int sublen = outlen;
int tmplen;

/*
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/src/datetime/np_datetime_strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ This file implements string parsing and creation for NumPy datetime.
* Returns 0 on success, -1 on failure.
*/
int
parse_iso_8601_datetime(char *str, int len,
parse_iso_8601_datetime(const char *str, int len,
npy_datetimestruct *out,
int *out_local,
int *out_tzoffset);
Expand Down
52 changes: 51 additions & 1 deletion pandas/_libs/tslibs/util.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

from cpython cimport PyTypeObject
from cpython cimport PyTypeObject, PyErr_BadArgument

cdef extern from *:
"""
Expand All @@ -18,12 +18,21 @@ cdef extern from "Python.h":
# Note: importing extern-style allows us to declare these as nogil
# functions, whereas `from cpython cimport` does not.
bint PyUnicode_Check(object obj) nogil
bint PyBytes_Check(object obj) nogil
bint PyString_Check(object obj) nogil
bint PyBool_Check(object obj) nogil
bint PyFloat_Check(object obj) nogil
bint PyComplex_Check(object obj) nogil
bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil

# Note that the following functions can potentially raise an exception,
# thus they cannot be declared 'nogil'. Also, PyUnicode_AsUTF8AndSize() can
# potentially allocate memory in the unlikely case where the underlying
# unicode object was stored as non-UTF-8 and UTF-8 wasn't requested before.
bint PyBytes_AsStringAndSize(object obj, char** buf,
Py_ssize_t* length)
char* PyUnicode_AsUTF8AndSize(object obj, Py_ssize_t* length)

from numpy cimport int64_t

cdef extern from "numpy/arrayobject.h":
Expand Down Expand Up @@ -227,3 +236,44 @@ cdef inline bint is_nan(object val):
is_nan : bool
"""
return (is_float_object(val) or is_complex_object(val)) and val != val


cdef inline bint get_string_data(object s, const char **buf,
                                 Py_ssize_t *length):
    """
    Expose the internal char * buffer of a unicode or bytes object.

    On success `buf[0]` points at the internal buffer of `s` and the size of
    that buffer is saved in `length`.

    Parameters
    ----------
    s : object
        Object to read from; only unicode and bytes objects are handled.
    buf : const char**
        Output location that receives the buffer pointer.
    length : Py_ssize_t*
        Output location that receives the buffer length. Can be NULL if the
        length is not needed.

    Returns
    -------
    bint
        `True` if the buffer was extracted, `False` if extraction failed for
        whatever reason (including `s` being neither unicode nor bytes).

    Notes
    -----
    The Python object owns the memory, `buf` should not be freed.
    This function should only raise exceptions in out-of-memory cases.
    """
    cdef:
        const char *data

    # NOTE(review): this function has no `except` clause, so an error set by
    # the C-API calls below is reported to the caller only through the
    # `False` return value -- confirm callers handle/clear it.
    if PyUnicode_Check(s):
        data = PyUnicode_AsUTF8AndSize(s, length)
        buf[0] = data
        return data != NULL
    elif PyBytes_Check(s):
        # PyBytes_AsStringAndSize() reports success with a 0 return value
        return 0 == PyBytes_AsStringAndSize(s, <char**>buf, length)
    else:
        return False

cdef inline void get_string_data_checked(object s, const char **buf,
                                         Py_ssize_t *length) except *:
    """
    Wrapper for get_string_data() that raises instead of returning a status.

    Parameters
    ----------
    s : object
        Object to read from; expected to be a unicode or bytes object.
    buf : const char**
        Output location that receives the buffer pointer.
    length : Py_ssize_t*
        Output location that receives the buffer length. Can be NULL if the
        length is not needed.

    Raises
    ------
    TypeError
        When get_string_data() fails, e.g. when `s` is neither a unicode
        nor a bytes object.

    Notes
    -----
    The Python object owns the memory, `buf` should not be freed.

    The function is declared `except *`: a `cdef` function returning `void`
    without an exception clause cannot propagate exceptions, so the TypeError
    raised below would be printed and ignored and the caller would continue
    with an unset `buf`.
    """
    if not get_string_data(s, buf, length):
        # Sets TypeError; propagated to the caller thanks to `except *`.
        PyErr_BadArgument()
15 changes: 0 additions & 15 deletions pandas/_libs/util.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,6 @@ cdef extern from "numpy/arrayobject.h":
NPY_ARRAY_F_CONTIGUOUS


cdef extern from *:
"""
// returns ASCII or UTF8 (py3) view on python str
// python object owns memory, should not be freed
static const char* get_c_string(PyObject* obj) {
#if PY_VERSION_HEX >= 0x03000000
return PyUnicode_AsUTF8(obj);
#else
return PyString_AsString(obj);
#endif
}
"""
const char *get_c_string(object) except NULL


cdef extern from "src/headers/stdint.h":
enum: UINT8_MAX
enum: UINT16_MAX
Expand Down