Skip to content

Commit de9d9c4

Browse files
jorisvandenbossche authored and yeshsurya committed
REF: remove Block access in the JSON writing code (pandas-dev#41081)
1 parent b1f6601 commit de9d9c4

File tree

6 files changed

+60
-111
lines changed

6 files changed

+60
-111
lines changed

pandas/_libs/src/ujson/python/objToJSON.c

+29 -94
Original file line number | Diff line number | Diff line change
@@ -83,7 +83,6 @@ typedef struct __PdBlockContext {
8383
int ncols;
8484
int transpose;
8585

86-
int *cindices; // frame column -> block column map
8786
NpyArrContext **npyCtxts; // NpyArrContext for each column
8887
} PdBlockContext;
8988

@@ -294,7 +293,12 @@ static int is_simple_frame(PyObject *obj) {
294293
if (!mgr) {
295294
return 0;
296295
}
297-
int ret = (get_attr_length(mgr, "blocks") <= 1);
296+
int ret;
297+
if (PyObject_HasAttrString(mgr, "blocks")) {
298+
ret = (get_attr_length(mgr, "blocks") <= 1);
299+
} else {
300+
ret = 0;
301+
}
298302

299303
Py_DECREF(mgr);
300304
return ret;
@@ -656,16 +660,10 @@ void PdBlockPassThru_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
656660
}
657661

658662
void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
659-
PyObject *obj, *blocks, *block, *values, *tmp;
660-
PyArrayObject *locs;
663+
PyObject *obj, *values, *arrays, *array;
661664
PdBlockContext *blkCtxt;
662665
NpyArrContext *npyarr;
663666
Py_ssize_t i;
664-
NpyIter *iter;
665-
NpyIter_IterNextFunc *iternext;
666-
npy_int64 **dataptr;
667-
npy_int64 colIdx;
668-
npy_intp idx;
669667

670668
obj = (PyObject *)_obj;
671669

@@ -687,7 +685,6 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
687685

688686
if (blkCtxt->ncols == 0) {
689687
blkCtxt->npyCtxts = NULL;
690-
blkCtxt->cindices = NULL;
691688

692689
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
693690
return;
@@ -701,104 +698,45 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
701698
return;
702699
}
703700

704-
blkCtxt->cindices = PyObject_Malloc(sizeof(int) * blkCtxt->ncols);
705-
if (!blkCtxt->cindices) {
706-
PyErr_NoMemory();
707-
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
708-
return;
709-
}
710-
711-
blocks = get_sub_attr(obj, "_mgr", "blocks");
712-
if (!blocks) {
701+
arrays = get_sub_attr(obj, "_mgr", "column_arrays");
702+
if (!arrays) {
713703
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
714704
return;
715-
} else if (!PyTuple_Check(blocks)) {
716-
PyErr_SetString(PyExc_TypeError, "blocks must be a tuple!");
717-
goto BLKRET;
718705
}
719706

720-
// force transpose so each NpyArrContext strides down its column
721-
GET_TC(tc)->transpose = 1;
722-
723-
for (i = 0; i < PyObject_Length(blocks); i++) {
724-
block = PyTuple_GET_ITEM(blocks, i);
725-
if (!block) {
707+
for (i = 0; i < PyObject_Length(arrays); i++) {
708+
array = PyList_GET_ITEM(arrays, i);
709+
if (!array) {
726710
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
727-
goto BLKRET;
711+
goto ARR_RET;
728712
}
729713

730-
tmp = PyObject_CallMethod(block, "get_block_values_for_json", NULL);
731-
if (!tmp) {
714+
// ensure we have a numpy array (i.e. np.asarray)
715+
values = PyObject_CallMethod(array, "__array__", NULL);
716+
if ((!values) || (!PyArray_CheckExact(values))) {
717+
// Didn't get a numpy array
732718
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
733719
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
734-
goto BLKRET;
735-
}
736-
737-
values = PyArray_Transpose((PyArrayObject *)tmp, NULL);
738-
Py_DECREF(tmp);
739-
if (!values) {
740-
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
741-
goto BLKRET;
742-
}
743-
744-
locs = (PyArrayObject *)get_sub_attr(block, "mgr_locs", "as_array");
745-
if (!locs) {
746-
Py_DECREF(values);
747-
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
748-
goto BLKRET;
720+
goto ARR_RET;
749721
}
750722

751-
iter = NpyIter_New(locs, NPY_ITER_READONLY, NPY_KEEPORDER,
752-
NPY_NO_CASTING, NULL);
753-
if (!iter) {
754-
Py_DECREF(values);
755-
Py_DECREF(locs);
756-
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
757-
goto BLKRET;
758-
}
759-
iternext = NpyIter_GetIterNext(iter, NULL);
760-
if (!iternext) {
761-
NpyIter_Deallocate(iter);
762-
Py_DECREF(values);
763-
Py_DECREF(locs);
764-
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
765-
goto BLKRET;
766-
}
767-
dataptr = (npy_int64 **)NpyIter_GetDataPtrArray(iter);
768-
do {
769-
colIdx = **dataptr;
770-
idx = NpyIter_GetIterIndex(iter);
723+
GET_TC(tc)->newObj = values;
771724

772-
blkCtxt->cindices[colIdx] = idx;
725+
// init a dedicated context for this column
726+
NpyArr_iterBegin(obj, tc);
727+
npyarr = GET_TC(tc)->npyarr;
773728

774-
// Reference freed in Pdblock_iterend
775-
Py_INCREF(values);
776-
GET_TC(tc)->newObj = values;
777-
778-
// init a dedicated context for this column
779-
NpyArr_iterBegin(obj, tc);
780-
npyarr = GET_TC(tc)->npyarr;
781-
782-
// set the dataptr to our desired column and initialise
783-
if (npyarr != NULL) {
784-
npyarr->dataptr += npyarr->stride * idx;
785-
NpyArr_iterNext(obj, tc);
786-
}
787-
GET_TC(tc)->itemValue = NULL;
788-
((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL;
789-
790-
blkCtxt->npyCtxts[colIdx] = npyarr;
791-
GET_TC(tc)->newObj = NULL;
792-
} while (iternext(iter));
729+
GET_TC(tc)->itemValue = NULL;
730+
((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL;
793731

794-
NpyIter_Deallocate(iter);
795-
Py_DECREF(values);
796-
Py_DECREF(locs);
732+
blkCtxt->npyCtxts[i] = npyarr;
733+
GET_TC(tc)->newObj = NULL;
797734
}
798735
GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0];
736+
goto ARR_RET;
799737

800-
BLKRET:
801-
Py_DECREF(blocks);
738+
ARR_RET:
739+
Py_DECREF(arrays);
802740
}
803741

804742
void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
@@ -830,9 +768,6 @@ void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
830768
if (blkCtxt->npyCtxts) {
831769
PyObject_Free(blkCtxt->npyCtxts);
832770
}
833-
if (blkCtxt->cindices) {
834-
PyObject_Free(blkCtxt->cindices);
835-
}
836771
PyObject_Free(blkCtxt);
837772
}
838773
}

pandas/core/internals/array_manager.py

+7
Original file line number | Diff line number | Diff line change
@@ -919,6 +919,13 @@ def iget_values(self, i: int) -> ArrayLike:
919919
"""
920920
return self.arrays[i]
921921

922+
@property
923+
def column_arrays(self) -> list[ArrayLike]:
924+
"""
925+
Used in the JSON C code to access column arrays.
926+
"""
927+
return self.arrays
928+
922929
def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
923930
"""
924931
Set new column(s).

pandas/core/internals/blocks.py

-11
Original file line number | Diff line number | Diff line change
@@ -226,13 +226,6 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
226226
# expected "ndarray")
227227
return self.values # type: ignore[return-value]
228228

229-
def get_block_values_for_json(self) -> np.ndarray:
230-
"""
231-
This is used in the JSON C code.
232-
"""
233-
# TODO(EA2D): reshape will be unnecessary with 2D EAs
234-
return np.asarray(self.values).reshape(self.shape)
235-
236229
@final
237230
@cache_readonly
238231
def fill_value(self):
@@ -1778,10 +1771,6 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock):
17781771
is_numeric = False
17791772
values: DatetimeArray | TimedeltaArray
17801773

1781-
def get_block_values_for_json(self):
1782-
# Not necessary to override, but helps perf
1783-
return self.values._ndarray
1784-
17851774

17861775
class DatetimeTZBlock(DatetimeLikeBlock):
17871776
""" implement a datetime64 block with a tz attribute """

pandas/core/internals/managers.py

+24
Original file line number | Diff line number | Diff line change
@@ -1153,6 +1153,30 @@ def iget_values(self, i: int) -> ArrayLike:
11531153
values = block.iget(self.blklocs[i])
11541154
return values
11551155

1156+
@property
1157+
def column_arrays(self) -> list[np.ndarray]:
1158+
"""
1159+
Used in the JSON C code to access column arrays.
1160+
This optimizes compared to using `iget_values` by converting each
1161+
block.values to a np.ndarray only once up front
1162+
"""
1163+
# special casing datetimetz to avoid conversion through object dtype
1164+
arrays = [
1165+
blk.values._ndarray
1166+
if isinstance(blk, DatetimeTZBlock)
1167+
else np.asarray(blk.values)
1168+
for blk in self.blocks
1169+
]
1170+
result = []
1171+
for i in range(len(self.items)):
1172+
arr = arrays[self.blknos[i]]
1173+
if arr.ndim == 2:
1174+
values = arr[self.blklocs[i]]
1175+
else:
1176+
values = arr
1177+
result.append(values)
1178+
return result
1179+
11561180
def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
11571181
"""
11581182
Set new item in-place. Does not consolidate. Adds new Block if not

pandas/tests/io/json/test_json_table_schema.py

-4
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,6 @@
66
import numpy as np
77
import pytest
88

9-
import pandas.util._test_decorators as td
10-
119
from pandas.core.dtypes.dtypes import (
1210
CategoricalDtype,
1311
DatetimeTZDtype,
@@ -26,8 +24,6 @@
2624
set_default_names,
2725
)
2826

29-
pytestmark = td.skip_array_manager_not_yet_implemented
30-
3127

3228
class TestBuildSchema:
3329
def setup_method(self, method):

pandas/tests/io/json/test_pandas.py

-2
Original file line number | Diff line number | Diff line change
@@ -857,8 +857,6 @@ def test_convert_dates_infer(self, infer_word):
857857
result = read_json(dumps(data))[["id", infer_word]]
858858
tm.assert_frame_equal(result, expected)
859859

860-
# TODO(ArrayManager) JSON
861-
@td.skip_array_manager_not_yet_implemented
862860
@pytest.mark.parametrize(
863861
"date,date_unit",
864862
[

0 commit comments

Comments
 (0)