
Replaced void pointers with Types in JSON Datetime Conversions #30283


Merged
merged 16 commits on Dec 24, 2019
277 changes: 141 additions & 136 deletions pandas/_libs/src/ujson/python/objToJSON.c
@@ -59,8 +59,8 @@ PyObject *cls_timedelta;

npy_int64 get_nat(void) { return NPY_MIN_INT64; }

typedef void *(*PFN_PyTypeToJSON)(JSOBJ obj, JSONTypeContext *ti,
void *outValue, size_t *_outLen);
typedef char *(*PFN_PyTypeToUTF8)(JSOBJ obj, JSONTypeContext *ti,
size_t *_outLen);

typedef struct __NpyArrContext {
PyObject *array;
@@ -94,7 +94,7 @@ typedef struct __TypeContext {
JSPFN_ITERNEXT iterNext;
JSPFN_ITERGETNAME iterGetName;
JSPFN_ITERGETVALUE iterGetValue;
PFN_PyTypeToJSON PyTypeToJSON;
PFN_PyTypeToUTF8 PyTypeToUTF8;
PyObject *newObj;
PyObject *dictObj;
Py_ssize_t index;
@@ -396,96 +396,116 @@ static PyObject *get_item(PyObject *obj, Py_ssize_t i) {
return ret;
}

static void *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue,
size_t *_outLen) {
static char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *tc, size_t *_outLen) {
PyObject *obj = (PyObject *)_obj;
*_outLen = PyBytes_GET_SIZE(obj);
return PyBytes_AS_STRING(obj);
}

static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue,
size_t *_outLen) {
return PyUnicode_AsUTF8AndSize(_obj, _outLen);
static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, size_t *_outLen) {
return (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen);
Member:

PyUnicode_AsUTF8AndSize came up in the thread about Unicode surrogates. Do we need to worry about those here?

Member:

@WillAyd I think this is the same function where passing a surrogate Unicode character can cause a segfault. Can that be ruled out here?

}
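
A minimal sketch, assuming the JSOBJ and JSONTypeContext definitions used elsewhere in this file, of one way the surrogate concern above could be guarded: PyUnicode_AsUTF8AndSize returns NULL and sets a UnicodeEncodeError for strings it cannot encode (such as lone surrogates), so checking its result avoids handing the encoder a NULL pointer. The helper name is hypothetical and not part of this PR.

static char *PyUnicodeToUTF8Guarded(JSOBJ _obj, JSONTypeContext *tc,
                                    size_t *_outLen) {
    Py_ssize_t len;
    // NULL means encoding failed and an exception is already set
    const char *encoded = PyUnicode_AsUTF8AndSize((PyObject *)_obj, &len);
    if (encoded == NULL) {
        ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";  // tell ujson to abort
        return NULL;
    }
    *_outLen = (size_t)len;
    return (char *)encoded;
}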

static void *PandasDateTimeStructToJSON(npy_datetimestruct *dts,
Member Author:

The diff is a little difficult to read here, but this gets rid of the ToJSON functions and replaces them with explicit ToIso or ToEpoch functions with strong types (a condensed before/after sketch appears after NpyDateTimeToEpoch below).

JSONTypeContext *tc, void *outValue,
size_t *_outLen) {
NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
/* returns a char* and mutates the pointer to *len */
static char *NpyDateTimeToIso(JSOBJ unused, JSONTypeContext *tc, size_t *len) {
npy_datetimestruct dts;
int ret_code;
int64_t longVal = GET_TC(tc)->longValue;

if (((PyObjectEncoder *)tc->encoder)->datetimeIso) {
PRINTMARK();
*_outLen = (size_t)get_datetime_iso_8601_strlen(0, base);
GET_TC(tc)->cStr = PyObject_Malloc(sizeof(char) * (*_outLen));
if (!GET_TC(tc)->cStr) {
PyErr_NoMemory();
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
return NULL;
}
pandas_datetime_to_datetimestruct(longVal, NPY_FR_ns, &dts);

if (!make_iso_8601_datetime(dts, GET_TC(tc)->cStr, *_outLen, base)) {
PRINTMARK();
*_outLen = strlen(GET_TC(tc)->cStr);
return GET_TC(tc)->cStr;
} else {
PRINTMARK();
PyErr_SetString(PyExc_ValueError,
"Could not convert datetime value to string");
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
PyObject_Free(GET_TC(tc)->cStr);
return NULL;
}
} else {
PRINTMARK();
*((JSINT64 *)outValue) = npy_datetimestruct_to_datetime(base, dts);
NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
*len = (size_t)get_datetime_iso_8601_strlen(0, base);
char *result = PyObject_Malloc(*len);

if (result == NULL) {
Member:

How would we get here? If len == 0?

Member Author:

This would happen if malloc fails

https://stackoverflow.com/a/12434865/621736

Member:

So failing to address it would mean getting something like a segfault? And the PyErr_NoMemory just below raises back in py-space?

Member Author:

I think it would segfault if you tried to access the memory after malloc failed to allocate it. PyErr_NoMemory would just set the global error indicator, so when the extension exits that should raise back in Python space (assuming nothing else clears it out).

FWIW the error handling here is copy / paste from here:

There are similar checks elsewhere throughout the module (not always consistent), so it may be worth unifying them in a follow-up; a sketch of the pattern follows NpyDateTimeToIso below.

PyErr_NoMemory();
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
return NULL;
}
}

static void *NpyDateTimeScalarToJSON(JSOBJ _obj, JSONTypeContext *tc,
void *outValue, size_t *_outLen) {
npy_datetimestruct dts;
PyDatetimeScalarObject *obj = (PyDatetimeScalarObject *)_obj;
PRINTMARK();
// TODO(anyone): Does not appear to be reached in tests.
ret_code = make_iso_8601_datetime(&dts, result, *len, base);
if (ret_code != 0) {
PyErr_SetString(PyExc_ValueError,
"Could not convert datetime value to string");
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
PyObject_Free(result);
}

pandas_datetime_to_datetimestruct(obj->obval,
(NPY_DATETIMEUNIT)obj->obmeta.base, &dts);
return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen);
// Note that get_datetime_iso_8601_strlen just gives a generic size
// for ISO string conversion, not the actual size used
*len = strlen(result);
return result;
}
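
A minimal sketch, assuming the encoder conventions used in this file, of the allocation-failure pattern discussed in the thread above: PyObject_Malloc returns NULL when the allocator fails, PyErr_NoMemory sets a MemoryError on the thread state, and a non-NULL errorMsg tells ujson to stop encoding, so the exception surfaces once control returns to Python. The helper name is hypothetical and only illustrates the pattern.

static char *AllocOrSetMemoryError(JSONTypeContext *tc, size_t n) {
    char *buf = PyObject_Malloc(n);
    if (buf == NULL) {
        PyErr_NoMemory();                                   // set MemoryError on the error indicator
        ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";  // signal ujson to abort encoding
        return NULL;                                        // caller unwinds; Python raises MemoryError
    }
    return buf;
}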

static void *PyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue,
size_t *_outLen) {
static npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) {
scaleNanosecToUnit(&dt, base);
return dt;
}
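
As the author notes above, the interleaved diff is hard to read; here is a condensed before/after view of the conversion interface, summarized from the diff rather than additional code in the PR. The old untyped callback returned void * and wrote numeric results through an out-parameter, while the new scheme splits string and epoch conversions into separately typed functions.

// Before: one untyped callback covered both string and numeric output.
typedef void *(*PFN_PyTypeToJSON)(JSOBJ obj, JSONTypeContext *ti,
                                  void *outValue, size_t *_outLen);

// After: string conversions are typed to return char * ...
typedef char *(*PFN_PyTypeToUTF8)(JSOBJ obj, JSONTypeContext *ti,
                                  size_t *_outLen);

// ... and epoch conversions return the integer value directly.
static npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base);
static npy_datetime PyDateTimeToEpoch(PyObject *obj, NPY_DATETIMEUNIT base);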

static char *PyDateTimeToIso(JSOBJ obj, JSONTypeContext *tc, size_t *len) {
npy_datetimestruct dts;
PyDateTime_Date *obj = (PyDateTime_Date *)_obj;
int ret;

PRINTMARK();
if (!PyDateTime_Check(obj)) {
// TODO: raise TypeError
}

if (!convert_pydatetime_to_datetimestruct(obj, &dts)) {
PRINTMARK();
return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen);
} else {
ret = convert_pydatetime_to_datetimestruct(obj, &dts);
if (ret != 0) {
if (!PyErr_Occurred()) {
PyErr_SetString(PyExc_ValueError,
"Could not convert datetime value to string");
"Could not convert PyDateTime to numpy datetime");
}
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
return NULL;
}

NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
*len = (size_t)get_datetime_iso_8601_strlen(0, base);
char *result = PyObject_Malloc(*len);
ret = make_iso_8601_datetime(&dts, result, *len, base);

if (ret != 0) {
PRINTMARK();
PyErr_SetString(PyExc_ValueError,
"Could not convert datetime value to string");
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
PyObject_Free(result);
return NULL;
}

// Note that get_datetime_iso_8601_strlen just gives a generic size
// for ISO string conversion, not the actual size used
*len = strlen(result);
return result;
}

static void *NpyDatetime64ToJSON(JSOBJ _obj, JSONTypeContext *tc,
void *outValue, size_t *_outLen) {
static npy_datetime PyDateTimeToEpoch(PyObject *obj, NPY_DATETIMEUNIT base) {
npy_datetimestruct dts;
PRINTMARK();
int ret;

if (!PyDateTime_Check(obj)) {
// TODO: raise TypeError
}
PyDateTime_Date *dt = (PyDateTime_Date *)obj;

pandas_datetime_to_datetimestruct((npy_datetime)GET_TC(tc)->longValue,
NPY_FR_ns, &dts);
return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen);
ret = convert_pydatetime_to_datetimestruct(dt, &dts);
if (ret != 0) {
if (!PyErr_Occurred()) {
PyErr_SetString(PyExc_ValueError,
"Could not convert PyDateTime to numpy datetime");
}
// TODO: is setting errMsg required?
//((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
// return NULL;
}

npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts);
return NpyDateTimeToEpoch(npy_dt, base);
}

static void *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue,
size_t *outLen) {
static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) {
PyObject *obj = (PyObject *)_obj;
PyObject *str;
PyObject *tmp;
@@ -509,49 +529,10 @@ static void *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue,
GET_TC(tc)->newObj = str;

*outLen = PyBytes_GET_SIZE(str);
outValue = (void *)PyBytes_AS_STRING(str);
char *outValue = PyBytes_AS_STRING(str);
return outValue;
}

static int NpyTypeToJSONType(PyObject *obj, JSONTypeContext *tc, int npyType,
void *value) {
PyArray_VectorUnaryFunc *castfunc;
npy_int64 longVal;

if (PyTypeNum_ISDATETIME(npyType)) {
PRINTMARK();
castfunc =
PyArray_GetCastFunc(PyArray_DescrFromType(npyType), NPY_INT64);
if (!castfunc) {
PyErr_Format(PyExc_ValueError, "Cannot cast numpy dtype %d to long",
npyType);
}
castfunc(value, &longVal, 1, NULL, NULL);
if (longVal == get_nat()) {
PRINTMARK();
return JT_NULL;
}

if (((PyObjectEncoder *)tc->encoder)->datetimeIso) {
GET_TC(tc)->longValue = (JSINT64)longVal;
GET_TC(tc)->PyTypeToJSON = NpyDatetime64ToJSON;
return JT_UTF8;
} else {
NPY_DATETIMEUNIT unit =
((PyObjectEncoder *)tc->encoder)->datetimeUnit;
if (!scaleNanosecToUnit(&longVal, unit)) {
GET_TC(tc)->longValue = longVal;
return JT_LONG;
} else {
// TODO: some kind of error handling
}
}
}

PRINTMARK();
return JT_INVALID;
}

//=============================================================================
// Numpy array iteration functions
//=============================================================================
@@ -1705,29 +1686,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
obj = (PyObject *)_obj;
enc = (PyObjectEncoder *)tc->encoder;

if (enc->npyType >= 0) {
PRINTMARK();
tc->prv = &(enc->basicTypeContext);
tc->type = NpyTypeToJSONType(obj, tc, enc->npyType, enc->npyValue);

if (tc->type == JT_INVALID) {
if (enc->defaultHandler) {
enc->npyType = -1;
PRINTMARK();
Object_invokeDefaultHandler(
enc->npyCtxtPassthru->getitem(enc->npyValue,
enc->npyCtxtPassthru->array),
enc);
} else {
PyErr_Format(PyExc_RuntimeError, "Unhandled numpy dtype %d",
enc->npyType);
}
}
enc->npyCtxtPassthru = NULL;
enc->npyType = -1;
return;
}

if (PyBool_Check(obj)) {
PRINTMARK();
tc->type = (obj == Py_True) ? JT_TRUE : JT_FALSE;
@@ -1745,6 +1703,44 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
}
tc->prv = pc;

if (PyTypeNum_ISDATETIME(enc->npyType)) {
PRINTMARK();
int64_t longVal;
PyArray_VectorUnaryFunc *castfunc =
PyArray_GetCastFunc(PyArray_DescrFromType(enc->npyType), NPY_INT64);
if (!castfunc) {
PyErr_Format(PyExc_ValueError, "Cannot cast numpy dtype %d to long",
enc->npyType);
}
castfunc(enc->npyValue, &longVal, 1, NULL, NULL);
if (longVal == get_nat()) {
PRINTMARK();
tc->type = JT_NULL;
} else {

if (enc->datetimeIso) {
PRINTMARK();
pc->PyTypeToUTF8 = NpyDateTimeToIso;
// Currently no way to pass longVal to iso function, so use
// state management
GET_TC(tc)->longValue = longVal;
tc->type = JT_UTF8;
} else {
PRINTMARK();
NPY_DATETIMEUNIT base =
((PyObjectEncoder *)tc->encoder)->datetimeUnit;
GET_TC(tc)->longValue = NpyDateTimeToEpoch(longVal, base);
tc->type = JT_LONG;
}
}

// TODO: this prevents infinite loop with mixed-type DataFrames;
// refactor
enc->npyCtxtPassthru = NULL;
enc->npyType = -1;
return;
}

if (PyIter_Check(obj) ||
(PyArray_Check(obj) && !PyArray_CheckScalar(obj))) {
PRINTMARK();
@@ -1776,12 +1772,12 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
return;
} else if (PyBytes_Check(obj)) {
PRINTMARK();
pc->PyTypeToJSON = PyBytesToUTF8;
pc->PyTypeToUTF8 = PyBytesToUTF8;
tc->type = JT_UTF8;
return;
} else if (PyUnicode_Check(obj)) {
PRINTMARK();
pc->PyTypeToJSON = PyUnicodeToUTF8;
pc->PyTypeToUTF8 = PyUnicodeToUTF8;
tc->type = JT_UTF8;
return;
} else if (PyObject_TypeCheck(obj, type_decimal)) {
@@ -1799,19 +1795,19 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
PRINTMARK();
if (enc->datetimeIso) {
PRINTMARK();
pc->PyTypeToJSON = PyDateTimeToJSON;
pc->PyTypeToUTF8 = PyDateTimeToIso;
tc->type = JT_UTF8;
} else {
PRINTMARK();
// TODO: last argument here is unused; should decouple string
// from long datetimelike conversion routines
PyDateTimeToJSON(obj, tc, &(GET_TC(tc)->longValue), 0);
NPY_DATETIMEUNIT base =
((PyObjectEncoder *)tc->encoder)->datetimeUnit;
GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base);
tc->type = JT_LONG;
}
return;
} else if (PyTime_Check(obj)) {
PRINTMARK();
pc->PyTypeToJSON = PyTimeToJSON;
pc->PyTypeToUTF8 = PyTimeToJSON;
tc->type = JT_UTF8;
return;
} else if (PyArray_IsScalar(obj, Datetime)) {
@@ -1823,8 +1819,17 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
}

PRINTMARK();
pc->PyTypeToJSON = NpyDateTimeScalarToJSON;
tc->type = enc->datetimeIso ? JT_UTF8 : JT_LONG;
if (enc->datetimeIso) {
PRINTMARK();
pc->PyTypeToUTF8 = PyDateTimeToIso;
tc->type = JT_UTF8;
} else {
PRINTMARK();
NPY_DATETIMEUNIT base =
((PyObjectEncoder *)tc->encoder)->datetimeUnit;
GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base);
tc->type = JT_LONG;
}
return;
} else if (PyDelta_Check(obj)) {
if (PyObject_HasAttrString(obj, "value")) {
@@ -2226,7 +2231,7 @@ void Object_endTypeContext(JSOBJ obj, JSONTypeContext *tc) {

const char *Object_getStringValue(JSOBJ obj, JSONTypeContext *tc,
size_t *_outLen) {
return GET_TC(tc)->PyTypeToJSON(obj, tc, NULL, _outLen);
return GET_TC(tc)->PyTypeToUTF8(obj, tc, _outLen);
}

JSINT64 Object_getLongValue(JSOBJ obj, JSONTypeContext *tc) {