REF: remove Block access in the JSON writing code (pandas-dev#41081)

jorisvandenbossche · yeshsurya · commit de9d9c4475bd · 2021-05-06T14:24:46.000+05:30
diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c
@@ -83,7 +83,6 @@ typedef struct __PdBlockContext {
     int ncols;
     int transpose;
 
-    int *cindices;             // frame column -> block column map
     NpyArrContext **npyCtxts;  // NpyArrContext for each column
 } PdBlockContext;
 
@@ -294,7 +293,12 @@ static int is_simple_frame(PyObject *obj) {
     if (!mgr) {
         return 0;
     }
-    int ret = (get_attr_length(mgr, "blocks") <= 1);
+    int ret;
+    if (PyObject_HasAttrString(mgr, "blocks")) {
+        ret = (get_attr_length(mgr, "blocks") <= 1);
+    } else {
+        ret = 0;
+    }
 
     Py_DECREF(mgr);
     return ret;
@@ -656,16 +660,10 @@ void PdBlockPassThru_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
 }
 
 void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
-    PyObject *obj, *blocks, *block, *values, *tmp;
-    PyArrayObject *locs;
+    PyObject *obj, *values, *arrays, *array;
     PdBlockContext *blkCtxt;
     NpyArrContext *npyarr;
     Py_ssize_t i;
-    NpyIter *iter;
-    NpyIter_IterNextFunc *iternext;
-    npy_int64 **dataptr;
-    npy_int64 colIdx;
-    npy_intp idx;
 
     obj = (PyObject *)_obj;
 
@@ -687,7 +685,6 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
 
     if (blkCtxt->ncols == 0) {
         blkCtxt->npyCtxts = NULL;
-        blkCtxt->cindices = NULL;
 
         GET_TC(tc)->iterNext = NpyArr_iterNextNone;
         return;
@@ -701,104 +698,45 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
         return;
     }
 
-    blkCtxt->cindices = PyObject_Malloc(sizeof(int) * blkCtxt->ncols);
-    if (!blkCtxt->cindices) {
-        PyErr_NoMemory();
-        GET_TC(tc)->iterNext = NpyArr_iterNextNone;
-        return;
-    }
-
-    blocks = get_sub_attr(obj, "_mgr", "blocks");
-    if (!blocks) {
+    arrays = get_sub_attr(obj, "_mgr", "column_arrays");
+    if (!arrays) {
         GET_TC(tc)->iterNext = NpyArr_iterNextNone;
         return;
-    } else if (!PyTuple_Check(blocks)) {
-        PyErr_SetString(PyExc_TypeError, "blocks must be a tuple!");
-        goto BLKRET;
     }
 
-    // force transpose so each NpyArrContext strides down its column
-    GET_TC(tc)->transpose = 1;
-
-    for (i = 0; i < PyObject_Length(blocks); i++) {
-        block = PyTuple_GET_ITEM(blocks, i);
-        if (!block) {
+    for (i = 0; i < PyObject_Length(arrays); i++) {
+        array = PyList_GET_ITEM(arrays, i);
+        if (!array) {
             GET_TC(tc)->iterNext = NpyArr_iterNextNone;
-            goto BLKRET;
+            goto ARR_RET;
         }
 
-        tmp = PyObject_CallMethod(block, "get_block_values_for_json", NULL);
-        if (!tmp) {
+        // ensure we have a numpy array (i.e. np.asarray)
+        values = PyObject_CallMethod(array, "__array__", NULL);
+        if ((!values) || (!PyArray_CheckExact(values))) {
+            // Didn't get a numpy array
             ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
             GET_TC(tc)->iterNext = NpyArr_iterNextNone;
-            goto BLKRET;
-        }
-
-        values = PyArray_Transpose((PyArrayObject *)tmp, NULL);
-        Py_DECREF(tmp);
-        if (!values) {
-            GET_TC(tc)->iterNext = NpyArr_iterNextNone;
-            goto BLKRET;
-        }
-
-        locs = (PyArrayObject *)get_sub_attr(block, "mgr_locs", "as_array");
-        if (!locs) {
-            Py_DECREF(values);
-            GET_TC(tc)->iterNext = NpyArr_iterNextNone;
-            goto BLKRET;
+            goto ARR_RET;
         }
 
-        iter = NpyIter_New(locs, NPY_ITER_READONLY, NPY_KEEPORDER,
-                           NPY_NO_CASTING, NULL);
-        if (!iter) {
-            Py_DECREF(values);
-            Py_DECREF(locs);
-            GET_TC(tc)->iterNext = NpyArr_iterNextNone;
-            goto BLKRET;
-        }
-        iternext = NpyIter_GetIterNext(iter, NULL);
-        if (!iternext) {
-            NpyIter_Deallocate(iter);
-            Py_DECREF(values);
-            Py_DECREF(locs);
-            GET_TC(tc)->iterNext = NpyArr_iterNextNone;
-            goto BLKRET;
-        }
-        dataptr = (npy_int64 **)NpyIter_GetDataPtrArray(iter);
-        do {
-            colIdx = **dataptr;
-            idx = NpyIter_GetIterIndex(iter);
+        GET_TC(tc)->newObj = values;
 
-            blkCtxt->cindices[colIdx] = idx;
+        // init a dedicated context for this column
+        NpyArr_iterBegin(obj, tc);
+        npyarr = GET_TC(tc)->npyarr;
 
-            // Reference freed in Pdblock_iterend
-            Py_INCREF(values);
-            GET_TC(tc)->newObj = values;
-
-            // init a dedicated context for this column
-            NpyArr_iterBegin(obj, tc);
-            npyarr = GET_TC(tc)->npyarr;
-
-            // set the dataptr to our desired column and initialise
-            if (npyarr != NULL) {
-                npyarr->dataptr += npyarr->stride * idx;
-                NpyArr_iterNext(obj, tc);
-            }
-            GET_TC(tc)->itemValue = NULL;
-            ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL;
-
-            blkCtxt->npyCtxts[colIdx] = npyarr;
-            GET_TC(tc)->newObj = NULL;
-        } while (iternext(iter));
+        GET_TC(tc)->itemValue = NULL;
+        ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL;
 
-        NpyIter_Deallocate(iter);
-        Py_DECREF(values);
-        Py_DECREF(locs);
+        blkCtxt->npyCtxts[i] = npyarr;
+        GET_TC(tc)->newObj = NULL;
     }
     GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0];
+    goto ARR_RET;
 
-BLKRET:
-    Py_DECREF(blocks);
+ARR_RET:
+    Py_DECREF(arrays);
 }
 
 void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
@@ -830,9 +768,6 @@ void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
         if (blkCtxt->npyCtxts) {
             PyObject_Free(blkCtxt->npyCtxts);
         }
-        if (blkCtxt->cindices) {
-            PyObject_Free(blkCtxt->cindices);
-        }
         PyObject_Free(blkCtxt);
     }
 }
diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py
@@ -919,6 +919,13 @@ def iget_values(self, i: int) -> ArrayLike:
         """
         return self.arrays[i]
 
+    @property
+    def column_arrays(self) -> list[ArrayLike]:
+        """
+        Used in the JSON C code to access column arrays.
+        """
+        return self.arrays
+
     def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
         """
         Set new column(s).
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -226,13 +226,6 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
         # expected "ndarray")
         return self.values  # type: ignore[return-value]
 
-    def get_block_values_for_json(self) -> np.ndarray:
-        """
-        This is used in the JSON C code.
-        """
-        # TODO(EA2D): reshape will be unnecessary with 2D EAs
-        return np.asarray(self.values).reshape(self.shape)
-
     @final
     @cache_readonly
     def fill_value(self):
@@ -1778,10 +1771,6 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock):
     is_numeric = False
     values: DatetimeArray | TimedeltaArray
 
-    def get_block_values_for_json(self):
-        # Not necessary to override, but helps perf
-        return self.values._ndarray
-
 
 class DatetimeTZBlock(DatetimeLikeBlock):
     """ implement a datetime64 block with a tz attribute """
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -1153,6 +1153,30 @@ def iget_values(self, i: int) -> ArrayLike:
         values = block.iget(self.blklocs[i])
         return values
 
+    @property
+    def column_arrays(self) -> list[np.ndarray]:
+        """
+        Used in the JSON C code to access column arrays.
+        This optimizes over calling `iget_values` for each column by
+        converting each block.values to a np.ndarray only once, up front.
+        """
+        # special casing datetimetz to avoid conversion through object dtype
+        arrays = [
+            blk.values._ndarray
+            if isinstance(blk, DatetimeTZBlock)
+            else np.asarray(blk.values)
+            for blk in self.blocks
+        ]
+        result = []
+        for i in range(len(self.items)):
+            arr = arrays[self.blknos[i]]
+            if arr.ndim == 2:
+                values = arr[self.blklocs[i]]
+            else:
+                values = arr
+            result.append(values)
+        return result
+
     def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
         """
         Set new item in-place. Does not consolidate. Adds new Block if not
diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py
@@ -6,8 +6,6 @@
 import numpy as np
 import pytest
 
-import pandas.util._test_decorators as td
-
 from pandas.core.dtypes.dtypes import (
     CategoricalDtype,
     DatetimeTZDtype,
@@ -26,8 +24,6 @@
     set_default_names,
 )
 
-pytestmark = td.skip_array_manager_not_yet_implemented
-
 
 class TestBuildSchema:
     def setup_method(self, method):
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
@@ -857,8 +857,6 @@ def test_convert_dates_infer(self, infer_word):
         result = read_json(dumps(data))[["id", infer_word]]
         tm.assert_frame_equal(result, expected)
 
-    # TODO(ArrayManager) JSON
-    @td.skip_array_manager_not_yet_implemented
     @pytest.mark.parametrize(
         "date,date_unit",
         [

Original file line number	Diff line number	Diff line change
`@@ -857,8 +857,6 @@ def test_convert_dates_infer(self, infer_word):`
`857`	`857`	`result = read_json(dumps(data))[["id", infer_word]]`
`858`	`858`	`tm.assert_frame_equal(result, expected)`
`859`	`859`
`860`		`- # TODO(ArrayManager) JSON`
`861`		`- @td.skip_array_manager_not_yet_implemented`
`862`	`860`	`@pytest.mark.parametrize(`
`863`	`861`	`"date,date_unit",`
`864`	`862`	`[`