diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 0ce42856fb14a..c8d5fcbb11282 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -63,10 +63,10 @@ def peakmem_read_json_lines_concat(self, index): class ToJSON(BaseIO): fname = "__test__.json" - params = ["split", "columns", "index"] + params = ["split", "columns", "index", "records", "values"] param_names = ["orient"] - def setup(self, lines_orient): + def setup(self, orient): N = 10 ** 5 ncols = 5 index = date_range("20000101", periods=N, freq="H") @@ -126,29 +126,84 @@ def time_float_int(self, orient): def time_float_int_str(self, orient): self.df_int_float_str.to_json(self.fname, orient=orient) - def time_floats_with_int_idex_lines(self, orient): + +class ToJSONLines(BaseIO): + + fname = "__test__.json" + + def setup(self): + N = 10 ** 5 + ncols = 5 + index = date_range("20000101", periods=N, freq="H") + timedeltas = timedelta_range(start=1, periods=N, freq="s") + datetimes = date_range(start=1, periods=N, freq="s") + ints = np.random.randint(100000000, size=N) + floats = np.random.randn(N) + strings = tm.makeStringIndex(N) + self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N)) + self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index) + self.df_td_int_ts = DataFrame( + { + "td_1": timedeltas, + "td_2": timedeltas, + "int_1": ints, + "int_2": ints, + "ts_1": datetimes, + "ts_2": datetimes, + }, + index=index, + ) + self.df_int_floats = DataFrame( + { + "int_1": ints, + "int_2": ints, + "int_3": ints, + "float_1": floats, + "float_2": floats, + "float_3": floats, + }, + index=index, + ) + self.df_int_float_str = DataFrame( + { + "int_1": ints, + "int_2": ints, + "float_1": floats, + "float_2": floats, + "str_1": strings, + "str_2": strings, + }, + index=index, + ) + + def time_floats_with_int_index_lines(self): self.df.to_json(self.fname, orient="records", lines=True) - def time_floats_with_dt_index_lines(self, orient): 
+ def time_floats_with_dt_index_lines(self): self.df_date_idx.to_json(self.fname, orient="records", lines=True) - def time_delta_int_tstamp_lines(self, orient): + def time_delta_int_tstamp_lines(self): self.df_td_int_ts.to_json(self.fname, orient="records", lines=True) - def time_float_int_lines(self, orient): + def time_float_int_lines(self): self.df_int_floats.to_json(self.fname, orient="records", lines=True) - def time_float_int_str_lines(self, orient): + def time_float_int_str_lines(self): self.df_int_float_str.to_json(self.fname, orient="records", lines=True) class ToJSONMem: def setup_cache(self): df = DataFrame([[1]]) - frames = {"int": df, "float": df.astype(float)} + wide = DataFrame(np.ones((10_000, 10_000))) + frames = {"int": df, "float": df.astype(float), "wide": wide} return frames + def mem_int(self, frames): + df = frames["wide"] + df.to_json() + def peakmem_int(self, frames): df = frames["int"] for _ in range(100_000): diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 926440218b5d9..a860969d759a5 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -78,14 +78,6 @@ typedef struct __NpyArrContext { char **columnLabels; } NpyArrContext; -typedef struct __PdBlockContext { - int colIdx; - int ncols; - int transpose; - - int *cindices; // frame column -> block column map - NpyArrContext **npyCtxts; // NpyArrContext for each column -} PdBlockContext; typedef struct __TypeContext { JSPFN_ITERBEGIN iterBegin; @@ -108,8 +100,6 @@ typedef struct __TypeContext { char *cStr; NpyArrContext *npyarr; - PdBlockContext *pdblock; - int transpose; char **rowLabels; char **columnLabels; npy_intp rowLabelsLen; @@ -122,9 +112,6 @@ typedef struct __PyObjectEncoder { // pass through the NpyArrContext when encoding multi-dimensional arrays NpyArrContext *npyCtxtPassthru; - // pass through the PdBlockContext when encoding blocks - PdBlockContext *blkCtxtPassthru; - // 
pass-through to encode numpy data directly int npyType; void *npyValue; @@ -140,14 +127,14 @@ typedef struct __PyObjectEncoder { PyObject *defaultHandler; } PyObjectEncoder; -#define GET_TC(__ptrtc) ((TypeContext *)((__ptrtc)->prv)) +inline TypeContext* GET_TC(JSONTypeContext * __ptrtc) { + return (TypeContext *)__ptrtc->prv; +} enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES }; #define PRINTMARK() -int PdBlock_iterNext(JSOBJ, JSONTypeContext *); - void *initObjToJSON(void) { PyObject *mod_pandas; @@ -200,10 +187,8 @@ static TypeContext *createTypeContext(void) { pc->doubleValue = 0.0; pc->cStr = NULL; pc->npyarr = NULL; - pc->pdblock = NULL; pc->rowLabels = NULL; pc->columnLabels = NULL; - pc->transpose = 0; pc->rowLabelsLen = 0; pc->columnLabelsLen = 0; @@ -269,6 +254,7 @@ static PyObject *get_values(PyObject *obj) { } } + // For Categorical et al if (!values && PyObject_HasAttrString(obj, "_internal_get_values")) { PRINTMARK(); values = PyObject_CallMethod(obj, "_internal_get_values", NULL); @@ -279,16 +265,6 @@ static PyObject *get_values(PyObject *obj) { } } - if (!values && PyObject_HasAttrString(obj, "get_block_values")) { - PRINTMARK(); - values = PyObject_CallMethod(obj, "get_block_values", NULL); - if (values && !PyArray_CheckExact(values)) { - PRINTMARK(); - Py_DECREF(values); - values = NULL; - } - } - if (!values) { PyObject *typeRepr = PyObject_Repr((PyObject *)Py_TYPE(obj)); PyObject *repr; @@ -312,48 +288,6 @@ static PyObject *get_values(PyObject *obj) { return values; } -static PyObject *get_sub_attr(PyObject *obj, char *attr, char *subAttr) { - PyObject *tmp = PyObject_GetAttrString(obj, attr); - PyObject *ret; - - if (tmp == 0) { - return 0; - } - ret = PyObject_GetAttrString(tmp, subAttr); - Py_DECREF(tmp); - - return ret; -} - -static int is_simple_frame(PyObject *obj) { - PyObject *check = get_sub_attr(obj, "_data", "is_mixed_type"); - int ret = (check == Py_False); - - if (!check) { - return 0; - } - - Py_DECREF(check); - return 
ret; -} - -static Py_ssize_t get_attr_length(PyObject *obj, char *attr) { - PyObject *tmp = PyObject_GetAttrString(obj, attr); - Py_ssize_t ret; - - if (tmp == 0) { - return 0; - } - ret = PyObject_Length(tmp); - Py_DECREF(tmp); - - if (ret == -1) { - return 0; - } - - return ret; -} - static npy_int64 get_long_attr(PyObject *o, const char *attr) { npy_int64 long_val; PyObject *value = PyObject_GetAttrString(o, attr); @@ -371,19 +305,6 @@ static npy_float64 total_seconds(PyObject *td) { return double_val; } -static PyObject *get_item(PyObject *obj, Py_ssize_t i) { - PyObject *tmp = PyLong_FromSsize_t(i); - PyObject *ret; - - if (tmp == 0) { - return 0; - } - ret = PyObject_GetItem(obj, tmp); - Py_DECREF(tmp); - - return ret; -} - static void *CDouble(JSOBJ obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) { PRINTMARK(); @@ -674,19 +595,11 @@ void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { npyarr->curdim = 0; npyarr->type_num = PyArray_DESCR(obj)->type_num; - if (GET_TC(tc)->transpose) { - npyarr->dim = PyArray_DIM(obj, npyarr->ndim); - npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); - npyarr->stridedim = npyarr->ndim; - npyarr->index[npyarr->ndim] = 0; - npyarr->inc = -1; - } else { - npyarr->dim = PyArray_DIM(obj, 0); - npyarr->stride = PyArray_STRIDE(obj, 0); - npyarr->stridedim = 0; - npyarr->index[0] = 0; - npyarr->inc = 1; - } + npyarr->dim = PyArray_DIM(obj, 0); + npyarr->stride = PyArray_STRIDE(obj, 0); + npyarr->stridedim = 0; + npyarr->index[0] = 0; + npyarr->inc = 1; npyarr->columnLabels = GET_TC(tc)->columnLabels; npyarr->rowLabels = GET_TC(tc)->rowLabels; @@ -813,311 +726,6 @@ char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { return NULL; } -//============================================================================= -// Pandas block iteration functions -// -// Serialises a DataFrame column by column to avoid unnecessary data copies and -// more representative serialisation when dealing with mixed 
dtypes. -// -// Uses a dedicated NpyArrContext for each column. -//============================================================================= - -void PdBlockPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - PRINTMARK(); - - if (blkCtxt->transpose) { - blkCtxt->colIdx++; - } else { - blkCtxt->colIdx = 0; - } - - NpyArr_freeItemValue(obj, tc); -} - -int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - PRINTMARK(); - - if (blkCtxt->colIdx >= blkCtxt->ncols) { - return 0; - } - - GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - blkCtxt->colIdx++; - return NpyArr_iterNextItem(obj, tc); -} - -char *PdBlock_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; - npy_intp idx; - PRINTMARK(); - - if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { - idx = blkCtxt->colIdx - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); - } else { - idx = GET_TC(tc)->iterNext != PdBlock_iterNext - ? 
npyarr->index[npyarr->stridedim - npyarr->inc] - 1 - : npyarr->index[npyarr->stridedim]; - - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); - } - return NULL; -} - -char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, - size_t *outLen) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - npy_intp idx; - PRINTMARK(); - - if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { - idx = npyarr->index[npyarr->stridedim] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); - } else { - idx = blkCtxt->colIdx; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); - } - return NULL; -} - -int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - NpyArrContext *npyarr; - PRINTMARK(); - - if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) { - return 0; - } - - if (blkCtxt->transpose) { - if (blkCtxt->colIdx >= blkCtxt->ncols) { - return 0; - } - } else { - npyarr = blkCtxt->npyCtxts[0]; - if (npyarr->index[npyarr->stridedim] >= npyarr->dim) { - return 0; - } - } - - ((PyObjectEncoder *)tc->encoder)->blkCtxtPassthru = blkCtxt; - GET_TC(tc)->itemValue = obj; - - return 1; -} - -void PdBlockPassThru_iterBegin(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - PRINTMARK(); - - if (blkCtxt->transpose) { - // if transposed we exhaust each column before moving to the next - GET_TC(tc)->iterNext = NpyArr_iterNextItem; - GET_TC(tc)->iterGetName = PdBlock_iterGetName_Transpose; - GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - } -} - -void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { - PyObject *obj, *blocks, *block, *values, *tmp; - PyArrayObject *locs; - PdBlockContext *blkCtxt; - NpyArrContext *npyarr; - Py_ssize_t i; - PyArray_Descr *dtype; - NpyIter *iter; - NpyIter_IterNextFunc *iternext; - npy_int64 **dataptr; - npy_int64 colIdx; - npy_intp idx; - - 
PRINTMARK(); - - i = 0; - blocks = NULL; - dtype = PyArray_DescrFromType(NPY_INT64); - obj = (PyObject *)_obj; - - GET_TC(tc) - ->iterGetName = GET_TC(tc)->transpose ? PdBlock_iterGetName_Transpose - : PdBlock_iterGetName; - - blkCtxt = PyObject_Malloc(sizeof(PdBlockContext)); - if (!blkCtxt) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - GET_TC(tc)->pdblock = blkCtxt; - - blkCtxt->colIdx = 0; - blkCtxt->transpose = GET_TC(tc)->transpose; - blkCtxt->ncols = get_attr_length(obj, "columns"); - - if (blkCtxt->ncols == 0) { - blkCtxt->npyCtxts = NULL; - blkCtxt->cindices = NULL; - - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - blkCtxt->npyCtxts = - PyObject_Malloc(sizeof(NpyArrContext *) * blkCtxt->ncols); - if (!blkCtxt->npyCtxts) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - for (i = 0; i < blkCtxt->ncols; i++) { - blkCtxt->npyCtxts[i] = NULL; - } - - blkCtxt->cindices = PyObject_Malloc(sizeof(int) * blkCtxt->ncols); - if (!blkCtxt->cindices) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - blocks = get_sub_attr(obj, "_data", "blocks"); - if (!blocks) { - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - // force transpose so each NpyArrContext strides down its column - GET_TC(tc)->transpose = 1; - - for (i = 0; i < PyObject_Length(blocks); i++) { - block = get_item(blocks, i); - if (!block) { - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - tmp = get_values(block); - if (!tmp) { - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - Py_DECREF(block); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - values = PyArray_Transpose((PyArrayObject *)tmp, NULL); - Py_DECREF(tmp); - if (!values) { - Py_DECREF(block); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - locs = (PyArrayObject *)get_sub_attr(block, "mgr_locs", "as_array"); - if (!locs) { - 
Py_DECREF(block); - Py_DECREF(values); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - iter = NpyIter_New(locs, NPY_ITER_READONLY, NPY_KEEPORDER, - NPY_NO_CASTING, dtype); - if (!iter) { - Py_DECREF(block); - Py_DECREF(values); - Py_DECREF(locs); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - iternext = NpyIter_GetIterNext(iter, NULL); - if (!iternext) { - NpyIter_Deallocate(iter); - Py_DECREF(block); - Py_DECREF(values); - Py_DECREF(locs); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - dataptr = (npy_int64 **)NpyIter_GetDataPtrArray(iter); - do { - colIdx = **dataptr; - idx = NpyIter_GetIterIndex(iter); - - blkCtxt->cindices[colIdx] = idx; - - // Reference freed in Pdblock_iterend - Py_INCREF(values); - GET_TC(tc)->newObj = values; - - // init a dedicated context for this column - NpyArr_iterBegin(obj, tc); - npyarr = GET_TC(tc)->npyarr; - - // set the dataptr to our desired column and initialise - if (npyarr != NULL) { - npyarr->dataptr += npyarr->stride * idx; - NpyArr_iterNext(obj, tc); - } - GET_TC(tc)->itemValue = NULL; - ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL; - - blkCtxt->npyCtxts[colIdx] = npyarr; - GET_TC(tc)->newObj = NULL; - } while (iternext(iter)); - - NpyIter_Deallocate(iter); - Py_DECREF(block); - Py_DECREF(values); - Py_DECREF(locs); - } - GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0]; - -BLKRET: - Py_XDECREF(dtype); - Py_XDECREF(blocks); -} - -void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt; - NpyArrContext *npyarr; - int i; - PRINTMARK(); - - GET_TC(tc)->itemValue = NULL; - npyarr = GET_TC(tc)->npyarr; - - blkCtxt = GET_TC(tc)->pdblock; - - if (blkCtxt) { - for (i = 0; i < blkCtxt->ncols; i++) { - npyarr = blkCtxt->npyCtxts[i]; - if (npyarr) { - if (npyarr->array) { - Py_DECREF(npyarr->array); - npyarr->array = NULL; - } - - GET_TC(tc)->npyarr = npyarr; - NpyArr_iterEnd(obj, tc); - - blkCtxt->npyCtxts[i] = NULL; - } - } - - if 
(blkCtxt->npyCtxts) { - PyObject_Free(blkCtxt->npyCtxts); - } - if (blkCtxt->cindices) { - PyObject_Free(blkCtxt->cindices); - } - PyObject_Free(blkCtxt); - } -} - //============================================================================= // Tuple iteration functions // itemValue is borrowed reference, no ref counting @@ -1450,45 +1058,126 @@ char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { //============================================================================= // pandas DataFrame iteration functions //============================================================================= + +/* + * Function: DataFrame_iterBegin + * ----------------------------- + * Sets iteration state for dealing with DataFrame objects + * + * obj: JSON object being serialized (should be a DataFrame at Python level) + * tc: shared TypeContext for serialization + * + * Because various orientations are handled by the JSON parser this method + * is responsible for setting the appropriate iterator. + * + * Supported orient formats are: + * + * SPLIT: {index -> [index], columns -> [columns], data -> [values]} + * RECORDS: [{column -> value}, … , {column -> value}] + * INDEX: {index -> {column -> value}} + * COLUMNS: {column -> {index -> value}} + * VALUES: [[value, value, ...], [value, value, ...], ...] + * + * The context of serialization here is dependent upon the orient. + * RECORDS, would make the context of serialization here + * JT_ARRAY (essentially a sequence we iterate over) whereas the other orients + * require a JT_OBJECT context (whereby we extract keys and values from the DataFrame). + * + * VALUES orients actually shouldn't pass through here at all and can be dispatched + * directly to the NumPy array serialization, since they don't encode labels. 
+ */ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { + PRINTMARK(); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - enc->outputFormat = VALUES; // for contained series & index - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } + enc->originalOutputFormat = enc->outputFormat; + + GET_TC(tc)->index = 0; + // For SPLIT format the index tracks columns->index->data progression + if (enc->outputFormat == SPLIT) { + PRINTMARK(); + GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); + + // TODO: changing the outputFormat was in place when this method only dealt with + // the SPLIT orient. Now that this handles quite a few we should probably use + // another method for changing the outputFormat of underlying objects (maybe tc->prv) + enc->outputFormat = VALUES; // for contained series & index + if (!GET_TC(tc)->cStr) { + PyErr_NoMemory(); + } + } else { + char *method; + if (enc->outputFormat == COLUMNS) + method = "items"; + else + method = "iterrows"; + + PyObject *iter = PyObject_CallMethod(obj, method, NULL); + + if (iter == 0) { + return; + } + + GET_TC(tc)->iterator = iter; + + // The RECORDS format essentially generates a JSON array of Series in the + // INDEX format, so set that context during serialization + + if (enc->outputFormat == RECORDS) { + enc->outputFormat = INDEX; + } + + } + PRINTMARK(); + return; } +/* + * Function: DataFrame_iterNext + * ----------------------------- + * Provides instructions how to appropriately iterate the object. 
+ * + * This is dependent on the orient as mentioned in DataFrame_iterBegin + */ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { - Py_ssize_t index; - if (!GET_TC(tc)->cStr) { - return 0; + PRINTMARK(); + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + + // free previous entry + if (GET_TC(tc)->itemValue) { + Py_DECREF(GET_TC(tc)->itemValue); } - index = GET_TC(tc)->index; - Py_XDECREF(GET_TC(tc)->itemValue); - if (index == 0) { + // Check the original output format as this may have been modified for + // underlying series' + if (enc->originalOutputFormat == SPLIT) { + Py_ssize_t index; + index = GET_TC(tc)->index; + if (index == 0) { memcpy(GET_TC(tc)->cStr, "columns", sizeof(char) * 8); + PRINTMARK(); GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); - } else if (index == 1) { + } else if (index == 1) { memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); - } else if (index == 2) { + } else if (index == 2) { memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); - if (is_simple_frame(obj)) { - GET_TC(tc)->itemValue = get_values(obj); - if (!GET_TC(tc)->itemValue) { - return 0; - } - } else { - Py_INCREF(obj); - GET_TC(tc)->itemValue = obj; - } - } else { + GET_TC(tc)->itemValue = get_values(obj); + if (!GET_TC(tc)->itemValue) { + return 0; + } + } else { PRINTMARK(); return 0; + } + + } else { + PyObject *tmp = PyIter_Next(GET_TC(tc)->iterator); + if (tmp == 0) + return 0; + + GET_TC(tc)->itemValue = PySequence_GetItem(tmp, 1); + Py_DECREF(tmp); } GET_TC(tc)->index++; @@ -1496,19 +1185,56 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 1; } +/* + * Function: DataFrame_iterEnd + * ----------------------------- + * Callback after DataFrame has been entirely iterated upon. 
+ * + */ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - enc->outputFormat = enc->originalOutputFormat; - PRINTMARK(); + PRINTMARK(); + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + + if (enc->originalOutputFormat != SPLIT) { + Py_DECREF(GET_TC(tc)->iterator); + } + + enc->outputFormat = enc->originalOutputFormat; + } +/* + * Function: DataFrame_iterGetValue + * ----------------------------- + * Provides the value(s) for a particular iteration. This is valid whether + * the type context is JT_OBJECT or JT_ARRAY. + */ JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; + PRINTMARK(); + return GET_TC(tc)->itemValue; } +/* + * Function: DataFrame_iterGetName + * ----------------------------- + * Provides the name for a particular iteration. This is only called if + * the type context is JT_OBJECT, which is dictated by the orient. + */ char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { + PRINTMARK(); + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + if (enc->originalOutputFormat == SPLIT) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; + } else if (enc->originalOutputFormat == COLUMNS || enc->originalOutputFormat == INDEX) { + // TODO: index is incremented before iteration completes which is unfortunate + // Need to align with SPLIT format and then can maybe increment here for clarity + int index = GET_TC(tc)->index - 1; + + // Also TODO: return a value here rather than having NpyArr_getLabel modify output buf + NpyArr_getLabel(obj, tc, outLen, index, GET_TC(tc)->columnLabels); + return NULL; + } } //============================================================================= @@ -1578,6 +1304,25 @@ void NpyArr_freeLabels(char **labels, npy_intp len) { } } + +/* function NpyArr_encodeLabels + * ---------------------------- + * + * labels: a list-like object of labels to encode + * enc: JSON encoder 
+ * num: number of labels + * + * This function takes care of encoding labels in one pass and is + * typically used for the columns or labels of a DataFrame or Series. + * It is particularly useful for items whose str repr is not what should + * be written out as a label (ex: Timestamp) + * + * However, its usage here is rather non-idiomatic as it would be better + * to simply define an iterGetName method for the appropriate objects which + * converts the labels into the appropriate string. + * + * TODO: refactor this to fit better into ujson iteration model + */ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, npy_intp num) { // NOTE this function steals a reference to labels. @@ -2012,128 +1757,59 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->iterGetName = NpyArr_iterGetName; return; } else if (PyObject_TypeCheck(obj, cls_dataframe)) { - if (enc->blkCtxtPassthru) { - PRINTMARK(); - pc->pdblock = enc->blkCtxtPassthru; - tc->type = - (pc->pdblock->npyCtxts[0]->columnLabels ? 
JT_OBJECT : JT_ARRAY); - - pc->iterBegin = PdBlockPassThru_iterBegin; - pc->iterEnd = PdBlockPassThru_iterEnd; - pc->iterNext = PdBlock_iterNextItem; - pc->iterGetName = PdBlock_iterGetName; - pc->iterGetValue = NpyArr_iterGetValue; - - enc->blkCtxtPassthru = NULL; - return; - } - - if (enc->outputFormat == SPLIT) { - PRINTMARK(); - tc->type = JT_OBJECT; - pc->iterBegin = DataFrame_iterBegin; - pc->iterEnd = DataFrame_iterEnd; - pc->iterNext = DataFrame_iterNext; - pc->iterGetValue = DataFrame_iterGetValue; - pc->iterGetName = DataFrame_iterGetName; - return; - } - - PRINTMARK(); - if (is_simple_frame(obj)) { - pc->iterBegin = NpyArr_iterBegin; - pc->iterEnd = NpyArr_iterEnd; - pc->iterNext = NpyArr_iterNext; - pc->iterGetName = NpyArr_iterGetName; - - pc->newObj = get_values(obj); - if (!pc->newObj) { - goto INVALID; - } - } else { - pc->iterBegin = PdBlock_iterBegin; - pc->iterEnd = PdBlock_iterEnd; - pc->iterNext = PdBlock_iterNext; - pc->iterGetName = PdBlock_iterGetName; + // VALUES doesn't encode labels, so can treat as numpy array + if (enc->outputFormat == VALUES) { + pc->newObj = get_values(obj); + if (!pc->newObj) { + goto INVALID; } + tc->type = JT_ARRAY; + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; pc->iterGetValue = NpyArr_iterGetValue; + return; + } + else if (enc->outputFormat == RECORDS) + tc->type = JT_ARRAY; + else + tc->type = JT_OBJECT; + + // TODO: calling these columnLabels with INDEX formatting is confusing, + // but there's not really a need to have both columnLabels and rowLabels + // anyway; subsequent refactor should just make these labels + if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { + char *attr; + if (enc->outputFormat == INDEX) + attr = "index"; + else + attr = "columns"; + + tmpObj = PyObject_GetAttrString(obj, attr); + if (!tmpObj) { + goto INVALID; + } + values = get_values(tmpObj); + Py_DECREF(tmpObj); + if (!values) { + goto INVALID; + } + 
pc->columnLabelsLen = PyArray_DIM(values, 0); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, + (JSONObjectEncoder *)enc, + pc->columnLabelsLen); + if (!pc->columnLabels) { + goto INVALID; + } + } + + pc->iterBegin = DataFrame_iterBegin; + pc->iterEnd = DataFrame_iterEnd; + pc->iterNext = DataFrame_iterNext; + pc->iterGetValue = DataFrame_iterGetValue; + pc->iterGetName = DataFrame_iterGetName; - if (enc->outputFormat == VALUES) { - PRINTMARK(); - tc->type = JT_ARRAY; - } else if (enc->outputFormat == RECORDS) { - PRINTMARK(); - tc->type = JT_ARRAY; - tmpObj = PyObject_GetAttrString(obj, "columns"); - if (!tmpObj) { - goto INVALID; - } - values = get_values(tmpObj); - if (!values) { - Py_DECREF(tmpObj); - goto INVALID; - } - pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, - pc->columnLabelsLen); - Py_DECREF(tmpObj); - if (!pc->columnLabels) { - goto INVALID; - } - } else if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { - PRINTMARK(); - tc->type = JT_OBJECT; - tmpObj = (enc->outputFormat == INDEX - ? PyObject_GetAttrString(obj, "index") - : PyObject_GetAttrString(obj, "columns")); - if (!tmpObj) { - goto INVALID; - } - values = get_values(tmpObj); - if (!values) { - Py_DECREF(tmpObj); - goto INVALID; - } - pc->rowLabelsLen = PyObject_Size(tmpObj); - pc->rowLabels = - NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, pc->rowLabelsLen); - Py_DECREF(tmpObj); - tmpObj = (enc->outputFormat == INDEX - ? 
PyObject_GetAttrString(obj, "columns") - : PyObject_GetAttrString(obj, "index")); - if (!tmpObj) { - NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); - pc->rowLabels = NULL; - goto INVALID; - } - values = get_values(tmpObj); - if (!values) { - Py_DECREF(tmpObj); - NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); - pc->rowLabels = NULL; - goto INVALID; - } - pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, - pc->columnLabelsLen); - Py_DECREF(tmpObj); - if (!pc->columnLabels) { - NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); - pc->rowLabels = NULL; - goto INVALID; - } - - if (enc->outputFormat == COLUMNS) { - PRINTMARK(); - pc->transpose = 1; - } - } else { - goto INVALID; - } - return; + return; } else if (PyDict_Check(obj)) { PRINTMARK(); tc->type = JT_OBJECT; @@ -2336,7 +2012,6 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { JSONObjectEncoder *encoder = (JSONObjectEncoder *)&pyEncoder; pyEncoder.npyCtxtPassthru = NULL; - pyEncoder.blkCtxtPassthru = NULL; pyEncoder.npyType = -1; pyEncoder.npyValue = NULL; pyEncoder.datetimeIso = 0;