diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index bbcee479aeb5a..31b43cdb28d9d 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -83,7 +83,6 @@ typedef struct __PdBlockContext { int ncols; int transpose; - int *cindices; // frame column -> block column map NpyArrContext **npyCtxts; // NpyArrContext for each column } PdBlockContext; @@ -294,7 +293,12 @@ static int is_simple_frame(PyObject *obj) { if (!mgr) { return 0; } - int ret = (get_attr_length(mgr, "blocks") <= 1); + int ret; + if (PyObject_HasAttrString(mgr, "blocks")) { + ret = (get_attr_length(mgr, "blocks") <= 1); + } else { + ret = 0; + } Py_DECREF(mgr); return ret; @@ -656,16 +660,10 @@ void PdBlockPassThru_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { } void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { - PyObject *obj, *blocks, *block, *values, *tmp; - PyArrayObject *locs; + PyObject *obj, *values, *arrays, *array; PdBlockContext *blkCtxt; NpyArrContext *npyarr; Py_ssize_t i; - NpyIter *iter; - NpyIter_IterNextFunc *iternext; - npy_int64 **dataptr; - npy_int64 colIdx; - npy_intp idx; obj = (PyObject *)_obj; @@ -687,7 +685,6 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { if (blkCtxt->ncols == 0) { blkCtxt->npyCtxts = NULL; - blkCtxt->cindices = NULL; GET_TC(tc)->iterNext = NpyArr_iterNextNone; return; @@ -701,104 +698,45 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { return; } - blkCtxt->cindices = PyObject_Malloc(sizeof(int) * blkCtxt->ncols); - if (!blkCtxt->cindices) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; - } - - blocks = get_sub_attr(obj, "_mgr", "blocks"); - if (!blocks) { + arrays = get_sub_attr(obj, "_mgr", "column_arrays"); + if (!arrays) { GET_TC(tc)->iterNext = NpyArr_iterNextNone; return; - } else if (!PyTuple_Check(blocks)) { - PyErr_SetString(PyExc_TypeError, "blocks must be a tuple!"); - goto BLKRET; } - // force transpose so each NpyArrContext strides down its column - GET_TC(tc)->transpose = 1; - - for (i = 0; i < PyObject_Length(blocks); i++) { - block = PyTuple_GET_ITEM(blocks, i); - if (!block) { + for (i = 0; i < PyObject_Length(arrays); i++) { + array = PyList_GET_ITEM(arrays, i); + if (!array) { GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; + goto ARR_RET; } - tmp = PyObject_CallMethod(block, "get_block_values_for_json", NULL); - if (!tmp) { + // ensure we have a numpy array (i.e. np.asarray) + values = PyObject_CallMethod(array, "__array__", NULL); + if ((!values) || (!PyArray_CheckExact(values))) { + // Didn't get a numpy array ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - values = PyArray_Transpose((PyArrayObject *)tmp, NULL); - Py_DECREF(tmp); - if (!values) { - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - locs = (PyArrayObject *)get_sub_attr(block, "mgr_locs", "as_array"); - if (!locs) { - Py_DECREF(values); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; + goto ARR_RET; } - iter = NpyIter_New(locs, NPY_ITER_READONLY, NPY_KEEPORDER, - NPY_NO_CASTING, NULL); - if (!iter) { - Py_DECREF(values); - Py_DECREF(locs); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - iternext = NpyIter_GetIterNext(iter, NULL); - if (!iternext) { - NpyIter_Deallocate(iter); - Py_DECREF(values); - Py_DECREF(locs); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - dataptr = (npy_int64 **)NpyIter_GetDataPtrArray(iter); - do { - colIdx = **dataptr; - idx = NpyIter_GetIterIndex(iter); + GET_TC(tc)->newObj = values; - blkCtxt->cindices[colIdx] = idx; + // init a dedicated context for this column + NpyArr_iterBegin(obj, tc); + npyarr = GET_TC(tc)->npyarr; - // Reference freed in Pdblock_iterend - Py_INCREF(values); - GET_TC(tc)->newObj = values; - - // init a dedicated context for this column - NpyArr_iterBegin(obj, tc); - npyarr = GET_TC(tc)->npyarr; - - // set the dataptr to our desired column and initialise - if (npyarr != NULL) { - npyarr->dataptr += npyarr->stride * idx; - NpyArr_iterNext(obj, tc); - } - GET_TC(tc)->itemValue = NULL; - ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL; - - blkCtxt->npyCtxts[colIdx] = npyarr; - GET_TC(tc)->newObj = NULL; - } while (iternext(iter)); + GET_TC(tc)->itemValue = NULL; + ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL; - NpyIter_Deallocate(iter); - Py_DECREF(values); - Py_DECREF(locs); + blkCtxt->npyCtxts[i] = npyarr; + GET_TC(tc)->newObj = NULL; } GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0]; + goto ARR_RET; -BLKRET: - Py_DECREF(blocks); +ARR_RET: + Py_DECREF(arrays); } void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) { @@ -830,9 +768,6 @@ void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) { if (blkCtxt->npyCtxts) { PyObject_Free(blkCtxt->npyCtxts); } - if (blkCtxt->cindices) { - PyObject_Free(blkCtxt->cindices); - } PyObject_Free(blkCtxt); } } diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 8c9902d330eee..a25750e7e1eab 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -919,6 +919,13 @@ def iget_values(self, i: int) -> ArrayLike: """ return self.arrays[i] + @property + def column_arrays(self) -> list[ArrayLike]: + """ + Used in the JSON C code to access column arrays. + """ + return self.arrays + def iset(self, loc: int | slice | np.ndarray, value: ArrayLike): """ Set new column(s). diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4276aadd8edd6..2d7d83d6a2bc3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -226,13 +226,6 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: # expected "ndarray") return self.values # type: ignore[return-value] - def get_block_values_for_json(self) -> np.ndarray: - """ - This is used in the JSON C code. - """ - # TODO(EA2D): reshape will be unnecessary with 2D EAs - return np.asarray(self.values).reshape(self.shape) - @final @cache_readonly def fill_value(self): @@ -1778,10 +1771,6 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock): is_numeric = False values: DatetimeArray | TimedeltaArray - def get_block_values_for_json(self): - # Not necessary to override, but helps perf - return self.values._ndarray - class DatetimeTZBlock(DatetimeLikeBlock): """ implement a datetime64 block with a tz attribute """ diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 5db6592ba77f9..487047f1a1dbb 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1153,6 +1153,30 @@ def iget_values(self, i: int) -> ArrayLike: values = block.iget(self.blklocs[i]) return values + @property + def column_arrays(self) -> list[np.ndarray]: + """ + Used in the JSON C code to access column arrays. + This optimizes compared to using `iget_values` by converting each + block.values to a np.ndarray only once up front + """ + # special casing datetimetz to avoid conversion through object dtype + arrays = [ + blk.values._ndarray + if isinstance(blk, DatetimeTZBlock) + else np.asarray(blk.values) + for blk in self.blocks + ] + result = [] + for i in range(len(self.items)): + arr = arrays[self.blknos[i]] + if arr.ndim == 2: + values = arr[self.blklocs[i]] + else: + values = arr + result.append(values) + return result + def iset(self, loc: int | slice | np.ndarray, value: ArrayLike): """ Set new item in-place. Does not consolidate. Adds new Block if not diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 9d955545aede3..71f1d03ea6d1f 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -6,8 +6,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, @@ -26,8 +24,6 @@ set_default_names, ) -pytestmark = td.skip_array_manager_not_yet_implemented - class TestBuildSchema: def setup_method(self, method): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 97d44aafef74b..dc94354728ef6 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -857,8 +857,6 @@ def test_convert_dates_infer(self, infer_word): result = read_json(dumps(data))[["id", infer_word]] tm.assert_frame_equal(result, expected) - # TODO(ArrayManager) JSON - @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize( "date,date_unit", [