From 648d2b5d6118dbef94640bbe369f09b9fb460f1f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 25 Jul 2019 23:16:24 -0700 Subject: [PATCH 01/26] with segfaults --- pandas/_libs/src/ujson/python/objToJSON.c | 80 ++++++++++++----------- 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 926440218b5d9..33212d3b90fa0 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -144,7 +144,7 @@ typedef struct __PyObjectEncoder { enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES }; -#define PRINTMARK() +#define PRINTMARK() printf("%d\n", __LINE__) int PdBlock_iterNext(JSOBJ, JSONTypeContext *); @@ -787,30 +787,27 @@ JSOBJ NpyArr_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static void NpyArr_getLabel(JSOBJ obj, JSONTypeContext *tc, size_t *outLen, - npy_intp idx, char **labels) { - JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder; - PRINTMARK(); - *outLen = strlen(labels[idx]); - Buffer_Reserve(enc, *outLen); - memcpy(enc->offset, labels[idx], sizeof(char) * (*outLen)); - enc->offset += *outLen; - *outLen = 0; -} - char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { NpyArrContext *npyarr = GET_TC(tc)->npyarr; npy_intp idx; PRINTMARK(); + char *cStr; if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { idx = npyarr->index[npyarr->stridedim] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); + cStr = npyarr->columnLabels[idx]; } else { idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); + cStr = npyarr->rowLabels[idx]; } - return NULL; + + printf("giving back %s\n", cStr); + *outLen = strlen(cStr); + + // encoding will free whatever is returned here, so copy as the + // call to NpyArr_freeLabels at the end of iteration will try + // to free as well + return strdup(cStr); } //============================================================================= @@ -852,19 +849,25 @@ char *PdBlock_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; npy_intp idx; + char *cStr; PRINTMARK(); if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { idx = blkCtxt->colIdx - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); + cStr = npyarr->columnLabels[idx]; } else { idx = GET_TC(tc)->iterNext != PdBlock_iterNext ? npyarr->index[npyarr->stridedim - npyarr->inc] - 1 : npyarr->index[npyarr->stridedim]; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); + cStr = npyarr->rowLabels[idx]; } - return NULL; + + *outLen = strlen(cStr); + // encoding will free whatever is returned here, so copy as the + // call to NpyArr_freeLabels at the end of iteration will try + // to free as well + return strdup(cStr); } char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, @@ -872,16 +875,22 @@ char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; npy_intp idx; + char *cStr; PRINTMARK(); if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { idx = npyarr->index[npyarr->stridedim] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); + cStr = npyarr->columnLabels[idx]; } else { idx = blkCtxt->colIdx; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); + cStr = npyarr->rowLabels[idx]; } - return NULL; + + *outLen = strlen(cStr); + // encoding will free whatever is returned here, so copy as the + // call to NpyArr_freeLabels at the end of iteration will try + // to free as well + return strdup(cStr); } int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) { @@ -1583,7 +1592,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, // NOTE this function steals a reference to labels. PyObjectEncoder *pyenc = (PyObjectEncoder *)enc; PyObject *item = NULL; - npy_intp i, stride, len, need_quotes; + npy_intp i, stride; char **ret; char *dataptr, *cLabel, *origend, *origst, *origoffset; char labelBuffer[NPY_JSON_BUFSIZE]; @@ -1614,6 +1623,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, ret[i] = NULL; } + // Because we make calls to JSON encode with the shared encoder + // for the labels be sure to keep track of where we started origst = enc->start; origend = enc->end; origoffset = enc->offset; @@ -1624,7 +1635,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, type_num = PyArray_TYPE(labels); for (i = 0; i < num; i++) { - if (PyTypeNum_ISDATETIME(type_num) || PyTypeNum_ISNUMBER(type_num)) + if (PyTypeNum_ISDATETIME(type_num)) { item = (PyObject *)labels; pyenc->npyType = type_num; @@ -1638,7 +1649,10 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, } } - cLabel = JSON_EncodeObject(item, enc, labelBuffer, NPY_JSON_BUFSIZE); + PyObject *str = PyObject_Str(item); + Py_ssize_t *size; + cLabel = PyUnicode_AsUTF8AndSize(str, size); + Py_DECREF(str); if (item != (PyObject *)labels) { Py_DECREF(item); @@ -1650,9 +1664,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, break; } - need_quotes = ((*cLabel) != '"'); - len = enc->offset - cLabel + 1 + 2 * need_quotes; - ret[i] = PyObject_Malloc(sizeof(char) * len); + printf("label is %s\n", cLabel); + ret[i] = cLabel; if (!ret[i]) { PyErr_NoMemory(); @@ -1660,22 +1673,13 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, break; } - if (need_quotes) { - ret[i][0] = '"'; - memcpy(ret[i] + 1, cLabel, sizeof(char) * (len - 4)); - ret[i][len - 3] = '"'; - } else { - memcpy(ret[i], cLabel, sizeof(char) * (len - 2)); - } - ret[i][len - 2] = ':'; - ret[i][len - 1] = '\0'; dataptr += stride; } enc->start = origst; enc->end = origend; enc->offset = origoffset; - + Py_DECREF(labels); return ret; } @@ -2429,7 +2433,6 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { PRINTMARK(); ret = JSON_EncodeObject(oinput, encoder, buffer, sizeof(buffer)); PRINTMARK(); - if (PyErr_Occurred()) { PRINTMARK(); return NULL; @@ -2447,6 +2450,7 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { newobj = PyUnicode_FromString(ret); + printf("returning %s\n", ret); if (ret != buffer) { encoder->free(ret); } From e890578d3d693c84cda612139ab87a424486a13b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 26 Jul 2019 13:41:20 -0700 Subject: [PATCH 02/26] Works save datetimes --- pandas/_libs/src/ujson/python/objToJSON.c | 32 ++++++++--------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 33212d3b90fa0..f6b44e1ebc907 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -144,7 +144,7 @@ typedef struct __PyObjectEncoder { enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES }; -#define PRINTMARK() printf("%d\n", __LINE__) +#define PRINTMARK() int PdBlock_iterNext(JSOBJ, JSONTypeContext *); @@ -801,13 +801,9 @@ char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { cStr = npyarr->rowLabels[idx]; } - printf("giving back %s\n", cStr); *outLen = strlen(cStr); - // encoding will free whatever is returned here, so copy as the - // call to NpyArr_freeLabels at the end of iteration will try - // to free as well - return strdup(cStr); + return cStr; } //============================================================================= @@ -864,10 +860,7 @@ char *PdBlock_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { } *outLen = strlen(cStr); - // encoding will free whatever is returned here, so copy as the - // call to NpyArr_freeLabels at the end of iteration will try - // to free as well - return strdup(cStr); + return cStr; } char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, @@ -887,10 +880,7 @@ char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, } *outLen = strlen(cStr); - // encoding will free whatever is returned here, so copy as the - // call to NpyArr_freeLabels at the end of iteration will try - // to free as well - return strdup(cStr); + return cStr; } int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) { @@ -1592,7 +1582,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, // NOTE this function steals a reference to labels. PyObjectEncoder *pyenc = (PyObjectEncoder *)enc; PyObject *item = NULL; - npy_intp i, stride; + npy_intp i, stride, len; char **ret; char *dataptr, *cLabel, *origend, *origst, *origoffset; char labelBuffer[NPY_JSON_BUFSIZE]; @@ -1635,7 +1625,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, type_num = PyArray_TYPE(labels); for (i = 0; i < num; i++) { - if (PyTypeNum_ISDATETIME(type_num)) + if (PyTypeNum_ISDATETIME(type_num)) { item = (PyObject *)labels; pyenc->npyType = type_num; @@ -1650,8 +1640,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, } PyObject *str = PyObject_Str(item); - Py_ssize_t *size; - cLabel = PyUnicode_AsUTF8AndSize(str, size); + cLabel = PyUnicode_AsUTF8(str); Py_DECREF(str); if (item != (PyObject *)labels) { @@ -1664,8 +1653,10 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, break; } - printf("label is %s\n", cLabel); - ret[i] = cLabel; + len = strlen(cLabel); + // Add 1 to include NULL terminator + ret[i] = PyObject_Malloc(len + 1); + memcpy(ret[i], cLabel, len + 1); if (!ret[i]) { PyErr_NoMemory(); @@ -2450,7 +2441,6 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { newobj = PyUnicode_FromString(ret); - printf("returning %s\n", ret); if (ret != buffer) { encoder->free(ret); } From 24a585a5fa32d576217db368a9e4bcfd20f307a6 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 26 Jul 2019 15:02:32 -0700 Subject: [PATCH 03/26] Less failures, still not correct dt support --- pandas/_libs/src/ujson/python/objToJSON.c | 38 +++++++++-------------- 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index f6b44e1ebc907..66403c0c84f42 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1625,39 +1625,29 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, type_num = PyArray_TYPE(labels); for (i = 0; i < num; i++) { - if (PyTypeNum_ISDATETIME(type_num)) - { - item = (PyObject *)labels; - pyenc->npyType = type_num; - pyenc->npyValue = dataptr; - } else { - item = getitem(dataptr, labels); - if (!item) { - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - } - + item = getitem(dataptr, labels); + if (!item) { + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + PyObject *str = PyObject_Str(item); - cLabel = PyUnicode_AsUTF8(str); + cLabel = PyUnicode_AsUTF8(str); + len = strlen(cLabel); Py_DECREF(str); - if (item != (PyObject *)labels) { - Py_DECREF(item); - } + Py_DECREF(item); + // Add 1 to include NULL terminator + ret[i] = PyObject_Malloc(len + 1); + memcpy(ret[i], cLabel, len + 1); - if (PyErr_Occurred() || enc->errorMsg) { + if (PyErr_Occurred()) { NpyArr_freeLabels(ret, num); ret = 0; break; } - len = strlen(cLabel); - // Add 1 to include NULL terminator - ret[i] = PyObject_Malloc(len + 1); - memcpy(ret[i], cLabel, len + 1); - if (!ret[i]) { PyErr_NoMemory(); ret = 0; From 42cc1ec37fd8c9770d92518d48ad42588aa86c30 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 26 Jul 2019 15:03:48 -0700 Subject: [PATCH 04/26] Removed encoder attributes in encodeLabels --- pandas/_libs/src/ujson/python/objToJSON.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 66403c0c84f42..f7fb6285558ad 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1584,8 +1584,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, PyObject *item = NULL; npy_intp i, stride, len; char **ret; - char *dataptr, *cLabel, *origend, *origst, *origoffset; - char labelBuffer[NPY_JSON_BUFSIZE]; + char *dataptr, *cLabel; PyArray_GetItemFunc *getitem; int type_num; PRINTMARK(); @@ -1613,12 +1612,6 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, ret[i] = NULL; } - // Because we make calls to JSON encode with the shared encoder - // for the labels be sure to keep track of where we started - origst = enc->start; - origend = enc->end; - origoffset = enc->offset; - stride = PyArray_STRIDE(labels, 0); dataptr = PyArray_DATA(labels); getitem = (PyArray_GetItemFunc *)PyArray_DESCR(labels)->f->getitem; @@ -1656,10 +1649,6 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, dataptr += stride; } - - enc->start = origst; - enc->end = origend; - enc->offset = origoffset; Py_DECREF(labels); return ret; From 7385fd870af3256f652ca0933be399cb61a490a5 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 26 Jul 2019 15:28:57 -0700 Subject: [PATCH 05/26] Added docstring --- pandas/_libs/src/ujson/python/objToJSON.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index f7fb6285558ad..1fd84dddfddfd 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1577,6 +1577,23 @@ void NpyArr_freeLabels(char **labels, npy_intp len) { } } +/* + * Function: NpyArr_encodeLabels + * ----------------------------- + * + * Builds an array of "encoded" labels. + * + * labels: PyArrayObject pointer for labels to be "encoded" + * num : number of labels + * + * "encode" is quoted above because we aren't really doing encoding + * For historical reasons this function would actually encode the entire + * array into a separate buffer with a separate call to JSON_Encode + * and would leave it to complex pointer manipulation from there to + * unpack values as needed. To make things simpler and more idiomatic + * this has instead just stringified any input save for datetime values, + * which may need to be represented in various formats. + */ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, npy_intp num) { // NOTE this function steals a reference to labels. From 4aca7873f237d3d0a47ce4047d77d951a796c267 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 26 Jul 2019 16:03:31 -0700 Subject: [PATCH 06/26] whitespace cleanup --- pandas/_libs/src/ujson/python/objToJSON.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 1fd84dddfddfd..aafa0544d4b09 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -859,7 +859,7 @@ char *PdBlock_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { cStr = npyarr->rowLabels[idx]; } - *outLen = strlen(cStr); + *outLen = strlen(cStr); return cStr; } @@ -1641,7 +1641,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, ret = 0; break; } - + PyObject *str = PyObject_Str(item); cLabel = PyUnicode_AsUTF8(str); len = strlen(cLabel); @@ -1650,7 +1650,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, Py_DECREF(item); // Add 1 to include NULL terminator ret[i] = PyObject_Malloc(len + 1); - memcpy(ret[i], cLabel, len + 1); + memcpy(ret[i], cLabel, len + 1); if (PyErr_Occurred()) { NpyArr_freeLabels(ret, num); @@ -1666,7 +1666,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, dataptr += stride; } - + Py_DECREF(labels); return ret; } From bc5e69ac8de2703eb590e2ffebaa59cf62110244 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 26 Jul 2019 16:52:31 -0700 Subject: [PATCH 07/26] Removed unused headers --- pandas/_libs/src/ujson/lib/ultrajson.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 0470fef450dde..ee6e7081bf00e 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -307,11 +307,4 @@ EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer); EXPORTFUNCTION void encode(JSOBJ, JSONObjectEncoder *, const char *, size_t); -#define Buffer_Reserve(__enc, __len) \ - if ((size_t)((__enc)->end - (__enc)->offset) < (size_t)(__len)) { \ - Buffer_Realloc((__enc), (__len)); \ - } - -void Buffer_Realloc(JSONObjectEncoder *enc, size_t cbNeeded); - #endif // PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_ From d53527ffaf51204fba4af485dc40866271e778ff Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 26 Jul 2019 16:57:06 -0700 Subject: [PATCH 08/26] Moved macro back to vendor c file --- pandas/_libs/src/ujson/lib/ultrajsonenc.c | 6 ++++++ pandas/_libs/src/ujson/python/objToJSON.c | 3 --- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index 2d6c823a45515..d5b379bee585b 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -714,6 +714,12 @@ int Buffer_EscapeStringValidated(JSOBJ obj, JSONObjectEncoder *enc, } } +#define Buffer_Reserve(__enc, __len) \ + if ( (size_t) ((__enc)->end - (__enc)->offset) < (size_t) (__len)) \ + { \ + Buffer_Realloc((__enc), (__len));\ + } \ + #define Buffer_AppendCharUnchecked(__enc, __chr) *((__enc)->offset++) = __chr; FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin, diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index aafa0544d4b09..723bc5ec98fdd 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -48,8 +48,6 @@ Numeric decoder derived from from TCL library #include <../../../tslibs/src/datetime/np_datetime_strings.h> #include "datetime.h" -#define NPY_JSON_BUFSIZE 32768 - static PyTypeObject *type_decimal; static PyTypeObject *cls_dataframe; static PyTypeObject *cls_series; @@ -1597,7 +1595,6 @@ void NpyArr_freeLabels(char **labels, npy_intp len) { char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, npy_intp num) { // NOTE this function steals a reference to labels. - PyObjectEncoder *pyenc = (PyObjectEncoder *)enc; PyObject *item = NULL; npy_intp i, stride, len; char **ret; From 2e25bef7cdad76300d8197bd1e36164d263b9993 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 6 Aug 2019 20:42:47 -0700 Subject: [PATCH 09/26] Working TS conversions --- pandas/_libs/src/ujson/python/objToJSON.c | 55 +++++++++++++++++++---- 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 723bc5ec98fdd..61c45eb28aa87 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -53,6 +53,7 @@ static PyTypeObject *cls_dataframe; static PyTypeObject *cls_series; static PyTypeObject *cls_index; static PyTypeObject *cls_nat; +PyObject *cls_timestamp; npy_int64 get_nat(void) { return NPY_MIN_INT64; } @@ -164,6 +165,7 @@ void *initObjToJSON(void) cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index"); cls_series = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series"); + cls_timestamp = PyObject_GetAttrString(mod_pandas, "Timestamp"); Py_DECREF(mod_pandas); } @@ -1592,7 +1594,7 @@ void NpyArr_freeLabels(char **labels, npy_intp len) { * this has instead just stringified any input save for datetime values, * which may need to be represented in various formats. */ -char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, +char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, npy_intp num) { // NOTE this function steals a reference to labels. PyObject *item = NULL; @@ -1639,10 +1641,45 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, break; } - PyObject *str = PyObject_Str(item); - cLabel = PyUnicode_AsUTF8(str); - len = strlen(cLabel); - Py_DECREF(str); + // Using a date as a key we need to special case the formatting + if (enc->datetimeIso && (PyTypeNum_ISDATETIME(type_num) || + PyDateTime_Check(item) || PyDate_Check(item))) { + PyObject *argList = Py_BuildValue("(O)", item); + + if (argList == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + PyObject *ts = PyObject_CallObject(cls_timestamp, argList); + Py_DECREF(argList); + if (ts == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + PyObject *iso = PyObject_CallMethod(ts, "isoformat", NULL); + Py_DECREF(ts); + if (iso == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = PyUnicode_AsUTF8(iso); + Py_DECREF(iso); + len = strlen(cLabel); + } else { // Otherwise use the str representation as the key + PyObject *str = PyObject_Str(item); + cLabel = PyUnicode_AsUTF8(str); + Py_DECREF(str); + len = strlen(cLabel); + } Py_DECREF(item); // Add 1 to include NULL terminator @@ -1960,7 +1997,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, + enc, pc->columnLabelsLen); if (!pc->columnLabels) { goto INVALID; @@ -2063,7 +2100,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } pc->columnLabelsLen = PyObject_Size(tmpObj); pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, + enc, pc->columnLabelsLen); Py_DECREF(tmpObj); if (!pc->columnLabels) { @@ -2086,7 +2123,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->rowLabelsLen = PyObject_Size(tmpObj); pc->rowLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, pc->rowLabelsLen); + enc, pc->rowLabelsLen); Py_DECREF(tmpObj); tmpObj = (enc->outputFormat == INDEX ? PyObject_GetAttrString(obj, "columns") @@ -2105,7 +2142,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } pc->columnLabelsLen = PyObject_Size(tmpObj); pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, + enc, pc->columnLabelsLen); Py_DECREF(tmpObj); if (!pc->columnLabels) { From 0820f57e2de55b79eb517d6378941bba4f7bae36 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 7 Aug 2019 08:14:26 -0700 Subject: [PATCH 10/26] Parametrized test --- pandas/tests/io/json/test_pandas.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 9c687f036aa68..d10b7d9f9c0f3 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1065,7 +1065,8 @@ def test_w_date(date, date_unit=None): with pytest.raises(ValueError, match=msg): ts.to_json(date_format="iso", date_unit="foo") - def test_date_unit(self): + @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) + def test_date_unit(self, unit): df = self.tsframe.copy() df["date"] = Timestamp("20130101 20:43:42") dl = df.columns.get_loc("date") @@ -1073,16 +1074,15 @@ def test_date_unit(self): df.iloc[2, dl] = Timestamp("21460101 20:43:42") df.iloc[4, dl] = pd.NaT - for unit in ("s", "ms", "us", "ns"): - json = df.to_json(date_format="epoch", date_unit=unit) + json = df.to_json(date_format="epoch", date_unit=unit) - # force date unit - result = read_json(json, date_unit=unit) - assert_frame_equal(result, df) + # force date unit + result = read_json(json, date_unit=unit) + assert_frame_equal(result, df) - # detect date unit - result = read_json(json, date_unit=None) - assert_frame_equal(result, df) + # detect date unit + result = read_json(json, date_unit=None) + assert_frame_equal(result, df) def test_weird_nested_json(self): # this used to core dump the parser From 185aa61305bff3ec41aaa30c729ca16af3a12930 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 7 Aug 2019 09:26:32 -0700 Subject: [PATCH 11/26] Fixed epoch precision issue for labels --- pandas/_libs/src/ujson/python/objToJSON.c | 60 +++++++++++++++++------ 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 61c45eb28aa87..b646ddf26967f 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1642,10 +1642,9 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, } // Using a date as a key we need to special case the formatting - if (enc->datetimeIso && (PyTypeNum_ISDATETIME(type_num) || - PyDateTime_Check(item) || PyDate_Check(item))) { + if (PyTypeNum_ISDATETIME(type_num) || + PyDateTime_Check(item) || PyDate_Check(item)) { PyObject *argList = Py_BuildValue("(O)", item); - if (argList == NULL) { Py_DECREF(item); NpyArr_freeLabels(ret, num); @@ -1662,18 +1661,51 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, break; } - PyObject *iso = PyObject_CallMethod(ts, "isoformat", NULL); - Py_DECREF(ts); - if (iso == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; + if (enc->datetimeIso) { + PyObject *iso = PyObject_CallMethod(ts, "isoformat", NULL); + Py_DECREF(ts); + if (iso == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = PyUnicode_AsUTF8(iso); + Py_DECREF(iso); + len = strlen(cLabel); + } else { + npy_int64 value; + // TODO: refactor to not duplicate what goes on in beginTypeContext + if (PyObject_HasAttrString(ts, "value")) { + PRINTMARK(); + value = get_long_attr(ts, "value"); + } else { + PRINTMARK(); + value = + total_seconds(ts) * 1000000000LL; // nanoseconds per second + } + Py_DECREF(ts); + + switch (enc->datetimeUnit) { + case NPY_FR_ns: + break; + case NPY_FR_us: + value /= 1000LL; + break; + case NPY_FR_ms: + value /= 1000000LL; + break; + case NPY_FR_s: + value /= 1000000000LL; + break; + } + + char buf[21] = {0}; // 21 chars for 2**63 as string + cLabel = buf; + sprintf(buf, "%lld", value); + len = strlen(cLabel); } - - cLabel = PyUnicode_AsUTF8(iso); - Py_DECREF(iso); - len = strlen(cLabel); } else { // Otherwise use the str representation as the key PyObject *str = PyObject_Str(item); cLabel = PyUnicode_AsUTF8(str); From 221e43decfeaa89084d92f93c377f9155bf95b3f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 7 Aug 2019 09:32:27 -0700 Subject: [PATCH 12/26] Parametrized failing test --- pandas/tests/io/json/test_pandas.py | 41 +++++++++++++++-------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index d10b7d9f9c0f3..3c96467d3bc5b 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1039,27 +1039,28 @@ def test_w_date(date, date_unit=None): with pytest.raises(ValueError, match=msg): df.to_json(date_format="iso", date_unit="foo") - def test_date_format_series(self): - def test_w_date(date, date_unit=None): - ts = Series(Timestamp(date), index=self.ts.index) - ts.iloc[1] = pd.NaT - ts.iloc[5] = pd.NaT - if date_unit: - json = ts.to_json(date_format="iso", date_unit=date_unit) - else: - json = ts.to_json(date_format="iso") - result = read_json(json, typ="series") - expected = ts.copy() - expected.index = expected.index.tz_localize("UTC") - expected = expected.dt.tz_localize("UTC") - assert_series_equal(result, expected) - - test_w_date("20130101 20:43:42.123") - test_w_date("20130101 20:43:42", date_unit="s") - test_w_date("20130101 20:43:42.123", date_unit="ms") - test_w_date("20130101 20:43:42.123456", date_unit="us") - test_w_date("20130101 20:43:42.123456789", date_unit="ns") + @pytest.mark.parametrize("date,date_unit", [ + ("20130101 20:43:42.123", None), + ("20130101 20:43:42", "s"), + ("20130101 20:43:42.123", "ms"), + ("20130101 20:43:42.123456789", "us"), + ("20130101 20:43:42.123456789", "ns") + ]) + def test_date_format_series(self, date, date_unit): + ts = Series(Timestamp(date), index=self.ts.index) + ts.iloc[1] = pd.NaT + ts.iloc[5] = pd.NaT + if date_unit: + json = ts.to_json(date_format="iso", date_unit=date_unit) + else: + json = ts.to_json(date_format="iso") + result = read_json(json, typ="series") + expected = ts.copy() + expected.index = expected.index.tz_localize("UTC") + expected = expected.dt.tz_localize("UTC") + assert_series_equal(result, expected) + def test_date_format_series_raises(self): ts = Series(Timestamp("20130101 20:43:42.123"), index=self.ts.index) msg = "Invalid value 'foo' for option 'date_unit'" with pytest.raises(ValueError, match=msg): From 18d28c88b4046bd04c74c0550b607777e6af4642 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 7 Aug 2019 09:35:38 -0700 Subject: [PATCH 13/26] Parametrized frame test --- pandas/tests/io/json/test_pandas.py | 44 +++++++++++++++-------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 3c96467d3bc5b..fb982d340863d 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1012,29 +1012,31 @@ def test_convert_dates_infer(self): result = read_json(dumps(data))[["id", infer_word]] assert_frame_equal(result, expected) - def test_date_format_frame(self): + @pytest.mark.parametrize("date,date_unit", [ + ("20130101 20:43:42.123", None), + ("20130101 20:43:42", "s"), + ("20130101 20:43:42.123", "ms"), + ("20130101 20:43:42.123456", "us"), + ("20130101 20:43:42.123456789", "ns") + ]) + def test_date_format_frame(self, date, date_unit): df = self.tsframe.copy() - def test_w_date(date, date_unit=None): - df["date"] = Timestamp(date) - df.iloc[1, df.columns.get_loc("date")] = pd.NaT - df.iloc[5, df.columns.get_loc("date")] = pd.NaT - if date_unit: - json = df.to_json(date_format="iso", date_unit=date_unit) - else: - json = df.to_json(date_format="iso") - result = read_json(json) - expected = df.copy() - expected.index = expected.index.tz_localize("UTC") - expected["date"] = expected["date"].dt.tz_localize("UTC") - assert_frame_equal(result, expected) - - test_w_date("20130101 20:43:42.123") - test_w_date("20130101 20:43:42", date_unit="s") - test_w_date("20130101 20:43:42.123", date_unit="ms") - test_w_date("20130101 20:43:42.123456", date_unit="us") - test_w_date("20130101 20:43:42.123456789", date_unit="ns") + df["date"] = Timestamp(date) + df.iloc[1, df.columns.get_loc("date")] = pd.NaT + df.iloc[5, df.columns.get_loc("date")] = pd.NaT + if date_unit: + json = df.to_json(date_format="iso", date_unit=date_unit) + else: + json = df.to_json(date_format="iso") + result = read_json(json) + expected = df.copy() + expected.index = expected.index.tz_localize("UTC") + expected["date"] = expected["date"].dt.tz_localize("UTC") + assert_frame_equal(result, expected) + def test_date_format_frame_raises(self): + df = self.tsframe.copy() msg = "Invalid value 'foo' for option 'date_unit'" with pytest.raises(ValueError, match=msg): df.to_json(date_format="iso", date_unit="foo") @@ -1043,7 +1045,7 @@ def test_w_date(date, date_unit=None): ("20130101 20:43:42.123", None), ("20130101 20:43:42", "s"), ("20130101 20:43:42.123", "ms"), - ("20130101 20:43:42.123456789", "us"), + ("20130101 20:43:42.123456", "us"), ("20130101 20:43:42.123456789", "ns") ]) def test_date_format_series(self, date, date_unit): From 7a5fc9cc16db4ba499c6321c365de4d5f7828015 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 7 Aug 2019 12:12:35 -0700 Subject: [PATCH 14/26] Removed UTC cast for index --- pandas/tests/io/json/test_pandas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index fb982d340863d..d8091fb6001f8 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1031,7 +1031,7 @@ def test_date_format_frame(self, date, date_unit): json = df.to_json(date_format="iso") result = read_json(json) expected = df.copy() - expected.index = expected.index.tz_localize("UTC") + #expected.index = expected.index.tz_localize("UTC") expected["date"] = expected["date"].dt.tz_localize("UTC") assert_frame_equal(result, expected) @@ -1058,7 +1058,7 @@ def test_date_format_series(self, date, date_unit): json = ts.to_json(date_format="iso") result = read_json(json, typ="series") expected = ts.copy() - expected.index = expected.index.tz_localize("UTC") + #expected.index = expected.index.tz_localize("UTC") expected = expected.dt.tz_localize("UTC") assert_series_equal(result, expected) From 3e22820f36ff086a5b9b6b2144ba3015a68bf6d3 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 7 Aug 2019 12:13:51 -0700 Subject: [PATCH 15/26] blackify --- pandas/tests/io/json/test_pandas.py | 38 +++++++++++++++++------------ 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index d8091fb6001f8..1dc65d8ee5511 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1012,13 +1012,16 @@ def test_convert_dates_infer(self): result = read_json(dumps(data))[["id", infer_word]] assert_frame_equal(result, expected) - @pytest.mark.parametrize("date,date_unit", [ - ("20130101 20:43:42.123", None), - ("20130101 20:43:42", "s"), - ("20130101 20:43:42.123", "ms"), - ("20130101 20:43:42.123456", "us"), - ("20130101 20:43:42.123456789", "ns") - ]) + @pytest.mark.parametrize( + "date,date_unit", + [ + ("20130101 20:43:42.123", None), + ("20130101 20:43:42", "s"), + ("20130101 20:43:42.123", "ms"), + ("20130101 20:43:42.123456", "us"), + ("20130101 20:43:42.123456789", "ns"), + ], + ) def test_date_format_frame(self, date, date_unit): df = self.tsframe.copy() @@ -1031,7 +1034,7 @@ def test_date_format_frame(self, date, date_unit): json = df.to_json(date_format="iso") result = read_json(json) expected = df.copy() - #expected.index = expected.index.tz_localize("UTC") + # expected.index = expected.index.tz_localize("UTC") expected["date"] = expected["date"].dt.tz_localize("UTC") assert_frame_equal(result, expected) @@ -1041,13 +1044,16 @@ def test_date_format_frame_raises(self): with pytest.raises(ValueError, match=msg): df.to_json(date_format="iso", date_unit="foo") - @pytest.mark.parametrize("date,date_unit", [ - ("20130101 20:43:42.123", None), - ("20130101 20:43:42", "s"), - ("20130101 20:43:42.123", "ms"), - ("20130101 20:43:42.123456", "us"), - ("20130101 20:43:42.123456789", "ns") - ]) + @pytest.mark.parametrize( + "date,date_unit", + [ + ("20130101 20:43:42.123", None), + ("20130101 20:43:42", "s"), + ("20130101 20:43:42.123", "ms"), + ("20130101 20:43:42.123456", "us"), + ("20130101 20:43:42.123456789", "ns"), + ], + ) def test_date_format_series(self, date, date_unit): ts = Series(Timestamp(date), index=self.ts.index) ts.iloc[1] = pd.NaT @@ -1058,7 +1064,7 @@ def test_date_format_series(self, date, date_unit): json = ts.to_json(date_format="iso") result = read_json(json, typ="series") expected = ts.copy() - #expected.index = expected.index.tz_localize("UTC") + # expected.index = expected.index.tz_localize("UTC") expected = expected.dt.tz_localize("UTC") assert_series_equal(result, expected) From 2389a338aeb6b83eed0aa482fa6e7c232853bec6 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 7 Aug 2019 15:35:53 -0700 Subject: [PATCH 16/26] Added test for timedelta --- pandas/tests/io/json/test_pandas.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 1dc65d8ee5511..3357710994dda 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1620,3 +1620,12 @@ def test_read_timezone_information(self): ) expected = Series([88], index=DatetimeIndex(["2019-01-01 11:00:00"], tz="UTC")) assert_series_equal(result, expected) + + @pytest.mark.parametrize("date_format,key", [ + ("epoch", 86400000), ("iso", "P1DT0H0M0S")]) + def test_timedelta_as_label(self, date_format, key): + df = pd.DataFrame([[1]], columns=[pd.Timedelta('1D')]) + expected = '{{"{key}":{{"0":1}}}}'.format(key=key) + result = df.to_json(date_format=date_format) + + assert result == expected From 104d2f272c6d10bc093193d27be0c519ea2fbfbf Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 7 Aug 2019 16:20:18 -0700 Subject: [PATCH 17/26] Added fix for timedelta as label --- pandas/_libs/src/ujson/python/objToJSON.c | 40 ++++++++++++++++++++--- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index b646ddf26967f..f5c54442ff64a 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -54,6 +54,7 @@ static PyTypeObject *cls_series; static PyTypeObject *cls_index; static PyTypeObject *cls_nat; PyObject *cls_timestamp; +PyObject *cls_timedelta; npy_int64 get_nat(void) { return NPY_MIN_INT64; } @@ -166,6 +167,7 @@ void *initObjToJSON(void) cls_series = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series"); cls_timestamp = PyObject_GetAttrString(mod_pandas, "Timestamp"); + cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta"); Py_DECREF(mod_pandas); } @@ -1630,11 +1632,10 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, stride = PyArray_STRIDE(labels, 0); dataptr = PyArray_DATA(labels); - getitem = (PyArray_GetItemFunc *)PyArray_DESCR(labels)->f->getitem; type_num = PyArray_TYPE(labels); for (i = 0; i < num; i++) { - item = getitem(dataptr, labels); + item = PyArray_GETITEM(labels, dataptr); if (!item) { NpyArr_freeLabels(ret, num); ret = 0; @@ -1642,7 +1643,38 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, } // Using a date as a key we need to special case the formatting - if (PyTypeNum_ISDATETIME(type_num) || + if (enc->datetimeIso && (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { + PyObject *argList = Py_BuildValue("(O)", item); + if (argList == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + PyObject *td = PyObject_CallObject(cls_timedelta, argList); + Py_DECREF(argList); + if (td == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + PyObject *iso = PyObject_CallMethod(td, "isoformat", NULL); + Py_DECREF(td); + if (iso == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = PyUnicode_AsUTF8(iso); + Py_DECREF(iso); + len = strlen(cLabel); + } + else if (PyTypeNum_ISDATETIME(type_num) || PyDateTime_Check(item) || PyDate_Check(item)) { PyObject *argList = Py_BuildValue("(O)", item); if (argList == NULL) { @@ -1706,7 +1738,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, sprintf(buf, "%lld", value); len = strlen(cLabel); } - } else { // Otherwise use the str representation as the key + } else { // Fallack to string representation PyObject *str = PyObject_Str(item); cLabel = PyUnicode_AsUTF8(str); Py_DECREF(str); From dbdaed146121b9b21f70c2cd20b1561e4e0ba9d2 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 8 Aug 2019 08:32:09 -0700 Subject: [PATCH 18/26] Blackify --- pandas/tests/io/json/test_pandas.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 3357710994dda..05c3ada8d9241 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1621,10 +1621,11 @@ def test_read_timezone_information(self): expected = Series([88], index=DatetimeIndex(["2019-01-01 11:00:00"], tz="UTC")) assert_series_equal(result, expected) - @pytest.mark.parametrize("date_format,key", [ - ("epoch", 86400000), ("iso", "P1DT0H0M0S")]) + @pytest.mark.parametrize( + "date_format,key", [("epoch", 86400000), ("iso", "P1DT0H0M0S")] + ) def test_timedelta_as_label(self, date_format, key): - df = pd.DataFrame([[1]], columns=[pd.Timedelta('1D')]) + df = pd.DataFrame([[1]], columns=[pd.Timedelta("1D")]) expected = '{{"{key}":{{"0":1}}}}'.format(key=key) result = df.to_json(date_format=date_format) From 3ed79799a7ef33cb488688665ded214cc5fd3e62 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 8 Aug 2019 08:38:55 -0700 Subject: [PATCH 19/26] Comment cleanup and error handling --- pandas/_libs/src/ujson/python/objToJSON.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index f5c54442ff64a..39b3c64de444d 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1642,7 +1642,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, break; } - // Using a date as a key we need to special case the formatting + // TODO: for any matches on type_num (date and timedeltas) should use a + // vectorized solution to convert to epoch or iso formats if (enc->datetimeIso && (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { PyObject *argList = Py_BuildValue("(O)", item); if (argList == NULL) { @@ -1740,6 +1741,13 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, } } else { // Fallack to string representation PyObject *str = PyObject_Str(item); + if (str == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + cLabel = PyUnicode_AsUTF8(str); Py_DECREF(str); len = strlen(cLabel); From 01ae178bc8f17f71bc95713e2507a6bef75b2e39 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 8 Aug 2019 10:38:37 -0700 Subject: [PATCH 20/26] Added test for tuples --- pandas/tests/io/json/test_pandas.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 05c3ada8d9241..9842a706f43d7 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1630,3 +1630,20 @@ def test_timedelta_as_label(self, date_format, key): result = df.to_json(date_format=date_format) assert result == expected + + @pytest.mark.parametrize( + "orient,expected", + [ + ("index", "{\"('a', 'b')\":{\"('c', 'd')\":1}}"), + ("columns", "{\"('c', 'd')\":{\"('a', 'b')\":1}}"), + # TODO: the below have separate encoding procedures + # They produce JSON but not in a consistent manner + pytest.param("split", "", marks=pytest.mark.skip), + pytest.param("table", "", marks=pytest.mark.skip), + ], + ) + def test_tuple_labels(self, orient, expected): + # GH 20500 + df = pd.DataFrame([[1]], index=[("a", "b")], columns=[("c", "d")]) + result = df.to_json(orient=orient) + assert result == expected From 0372794013a987936336778fae4b2147d47c2907 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 8 Aug 2019 10:41:09 -0700 Subject: [PATCH 21/26] Whatsnew --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index f5ca843e1a6f7..471e95706e30f 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -157,7 +157,7 @@ MultiIndex I/O ^^^ -- +- Bug in :meth:`DataFrame.to_json` where using a Tuple as a column or index value and using ``orient="columns"`` or ``orient="index"`` would produce invalid JSON (:issue:`20500`) - Plotting From e15bede9a0fc2063b5fe58aa3c32e53768433727 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 8 Aug 2019 12:40:15 -0700 Subject: [PATCH 22/26] Warnings fixup --- pandas/_libs/src/ujson/python/objToJSON.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 39b3c64de444d..53d027f953e00 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1603,7 +1603,6 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, npy_intp i, stride, len; char **ret; char *dataptr, *cLabel; - PyArray_GetItemFunc *getitem; int type_num; PRINTMARK(); @@ -1671,7 +1670,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, break; } - cLabel = PyUnicode_AsUTF8(iso); + cLabel = (char *)PyUnicode_AsUTF8(iso); Py_DECREF(iso); len = strlen(cLabel); } @@ -1704,7 +1703,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, break; } - cLabel = PyUnicode_AsUTF8(iso); + cLabel = (char *)PyUnicode_AsUTF8(iso); Py_DECREF(iso); len = strlen(cLabel); } else { @@ -1732,11 +1731,16 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, case NPY_FR_s: value /= 1000000000LL; break; + default: + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; } char buf[21] = {0}; // 21 chars for 2**63 as string cLabel = buf; - sprintf(buf, "%lld", value); + sprintf(buf, "%ld", value); len = strlen(cLabel); } } else { // Fallack to string representation @@ -1748,7 +1752,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, break; } - cLabel = PyUnicode_AsUTF8(str); + cLabel = (char *)PyUnicode_AsUTF8(str); Py_DECREF(str); len = strlen(cLabel); } From d698255700cd80bbcec033944419e2555b417b81 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 8 Aug 2019 14:15:51 -0700 Subject: [PATCH 23/26] Simplified object construction --- pandas/_libs/src/ujson/python/objToJSON.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 53d027f953e00..5a9b2b7618f72 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1644,16 +1644,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, // TODO: for any matches on type_num (date and timedeltas) should use a // vectorized solution to convert to epoch or iso formats if (enc->datetimeIso && (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { - PyObject *argList = Py_BuildValue("(O)", item); - if (argList == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - PyObject *td = PyObject_CallObject(cls_timedelta, argList); - Py_DECREF(argList); + PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item); if (td == NULL) { Py_DECREF(item); NpyArr_freeLabels(ret, num); @@ -1676,16 +1667,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, } else if (PyTypeNum_ISDATETIME(type_num) || PyDateTime_Check(item) || PyDate_Check(item)) { - PyObject *argList = Py_BuildValue("(O)", item); - if (argList == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - PyObject *ts = PyObject_CallObject(cls_timestamp, argList); - Py_DECREF(argList); + PyObject *ts = PyObject_CallFunction(cls_timestamp, "(O)", item); if (ts == NULL) { Py_DECREF(item); NpyArr_freeLabels(ret, num); From d453ed016d5b72e7f0e8061812ee6e1acf57cfa4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 8 Aug 2019 14:32:54 -0700 Subject: [PATCH 24/26] Macro for int64 copying --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 5a9b2b7618f72..578f7931689d7 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1722,7 +1722,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, char buf[21] = {0}; // 21 chars for 2**63 as string cLabel = buf; - sprintf(buf, "%ld", value); + sprintf(buf, "%04" NPY_INT64_FMT, value); len = strlen(cLabel); } } else { // Fallack to string representation From 5ab7ecf41b62aaa26b4a7c8a16b321e9212b0a14 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 9 Aug 2019 11:08:44 +0100 Subject: [PATCH 25/26] Removed unnecessary padding in sprintf --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 578f7931689d7..de336fb3aa1dc 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1722,7 +1722,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, char buf[21] = {0}; // 21 chars for 2**63 as string cLabel = buf; - sprintf(buf, "%04" NPY_INT64_FMT, value); + sprintf(buf, "%" NPY_INT64_FMT, value); len = strlen(cLabel); } } else { // Fallack to string representation From 439b69539086fadaa8e4b3e90cc98121ee08e205 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 23 Aug 2019 15:09:26 +0200 Subject: [PATCH 26/26] whitespace fixup --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index da994d16a5025..8e25857e5ad69 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -159,7 +159,7 @@ I/O ^^^ - :meth:`read_csv` now accepts binary mode file buffers when using the Python csv engine (:issue:`23779`) -- Bug in :meth:`DataFrame.to_json` where using a Tuple as a column or index value and using ``orient="columns"`` or ``orient="index"`` would produce invalid JSON (:issue:`20500`) +- Bug in :meth:`DataFrame.to_json` where using a Tuple as a column or index value and using ``orient="columns"`` or ``orient="index"`` would produce invalid JSON (:issue:`20500`) - Plotting