From e9d3e481f0bb4335f651230920303806599009ef Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 19 Jul 2023 13:45:12 -0700 Subject: [PATCH 1/8] POC: pass date_unit to values_for_json --- .../src/vendored/ujson/python/objToJSON.c | 17 +++++++++++++- pandas/core/internals/array_manager.py | 3 +-- pandas/core/internals/blocks.py | 23 +++++++++++++++---- pandas/core/internals/managers.py | 5 ++-- 4 files changed, 38 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 65b468f268d75..f2a35232b95f6 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -713,7 +713,22 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { return; } - arrays = get_sub_attr(obj, "_mgr", "column_arrays"); + NPY_DATETIMEUNIT dunit = (PyObjectEncoder *)tc->datetimeUnit; + PyObject *date_unit; + if (dunit == NPY_FR_s) { + date_unit = "s"; + } else if (dunit == NPY_FR_ms) { + date_unit = "ms"; + } else if (dunit == NPY_FR_us) { + date_unit = "us"; + } else if (dunit == NPY_FR_ns) { + date_unit = "ns"; + } + + PyObject *mgr = PyObject_GetAttrString(obj, "_mgr"); + PyObject *name = "column_arrays"; + arrays = PyObject_CallMethodOneArg(mgr, name, date_unit); + if (!arrays) { GET_TC(tc)->iterNext = NpyArr_iterNextNone; return; diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 3b77540efcdd2..4d92ef489823d 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -691,8 +691,7 @@ def iget_values(self, i: int) -> ArrayLike: """ return self.arrays[i] - @property - def column_arrays(self) -> list[ArrayLike]: + def column_arrays(self, date_unit) -> list[np.ndarray]: """ Used in the JSON C code to access column arrays. """ diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 2d102de879df0..2d5454c426366 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1638,7 +1638,7 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: """ raise AbstractMethodError(self) - def values_for_json(self) -> np.ndarray: + def values_for_json(self, date_unit) -> np.ndarray: raise AbstractMethodError(self) @@ -1885,7 +1885,7 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: # TODO(EA2D): reshape not needed with 2D EAs return np.asarray(values).reshape(self.shape) - def values_for_json(self) -> np.ndarray: + def values_for_json(self, date_unit) -> np.ndarray: return np.asarray(self.values) @final @@ -2174,7 +2174,7 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: return self.values.astype(_dtype_obj) return self.values - def values_for_json(self) -> np.ndarray: + def values_for_json(self, date_unit) -> np.ndarray: return self.values @cache_readonly @@ -2231,7 +2231,22 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock): is_numeric = False values: DatetimeArray | TimedeltaArray - def values_for_json(self) -> np.ndarray: + def values_for_json(self, date_unit) -> np.ndarray: + values = self.values + if values.dtype.kind == "M": + if date_unit == "s": + return self.values.strftime("%Y-%M-%D %h-%m-%s") + elif date_unit == "ms": + raise NotImplementedError + elif date_unit == "us": + return self.values.strftime("%Y-%M-%D %h-%m-%s.%f") + elif date_unit == "ns": + return self.values.astype(str) + else: + raise NotImplementedError + else: + raise NotImplementedError + return self.values._ndarray diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index ac2dd08d47427..03d0f56e32018 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -995,8 +995,7 @@ def iget_values(self, i: int) -> ArrayLike: values = block.iget(self.blklocs[i]) return values - @property - def column_arrays(self) -> list[np.ndarray]: + def column_arrays(self, date_unit) -> list[np.ndarray]: """ Used in the JSON C code to access column arrays. This optimizes compared to using `iget_values` by converting each @@ -1010,7 +1009,7 @@ def column_arrays(self) -> list[np.ndarray]: for blk in self.blocks: mgr_locs = blk._mgr_locs - values = blk.values_for_json() + values = blk.values_for_json(date_unit) if values.ndim == 1: # TODO(EA2D): special casing not needed with 2D EAs result[mgr_locs[0]] = values From 1c0c7105fa0f7cab674b30267ede954f68d4a834 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 20 Jul 2023 08:58:18 -0700 Subject: [PATCH 2/8] apply suggestions --- pandas/_libs/src/vendored/ujson/python/objToJSON.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index f2a35232b95f6..7d33c9a5eb07d 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -713,21 +713,21 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { return; } - NPY_DATETIMEUNIT dunit = (PyObjectEncoder *)tc->datetimeUnit; + NPY_DATETIMEUNIT dunit = ((PyObjectEncoder *)tc)->datetimeUnit; PyObject *date_unit; if (dunit == NPY_FR_s) { - date_unit = "s"; + date_unit = PyUnicode_FromString("s"); } else if (dunit == NPY_FR_ms) { - date_unit = "ms"; + date_unit = PyUnicode_FromString("ms"); } else if (dunit == NPY_FR_us) { - date_unit = "us"; + date_unit = PyUnicode_FromString("us"); } else if (dunit == NPY_FR_ns) { - date_unit = "ns"; + date_unit = PyUnicode_FromString("ns"); } PyObject *mgr = PyObject_GetAttrString(obj, "_mgr"); - PyObject *name = "column_arrays"; - arrays = PyObject_CallMethodOneArg(mgr, name, date_unit); + arrays = PyObject_CallMethod(mgr, "column_arrays", "%s", date_unit); + Py_DECREF(mgr); if (!arrays) { GET_TC(tc)->iterNext = NpyArr_iterNextNone; From 101c612ed5795d1214bb2689b922ce17450559d1 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 20 Jul 2023 18:08:17 -0700 Subject: [PATCH 3/8] suggested edit --- pandas/_libs/src/vendored/ujson/python/objToJSON.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 7d33c9a5eb07d..0e1186a0f96df 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -714,15 +714,15 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { } NPY_DATETIMEUNIT dunit = ((PyObjectEncoder *)tc)->datetimeUnit; - PyObject *date_unit; + char *date_unit; if (dunit == NPY_FR_s) { - date_unit = PyUnicode_FromString("s"); + date_unit = "s"; } else if (dunit == NPY_FR_ms) { - date_unit = PyUnicode_FromString("ms"); + date_unit = "ms"; } else if (dunit == NPY_FR_us) { - date_unit = PyUnicode_FromString("us"); + date_unit = "us"; } else if (dunit == NPY_FR_ns) { - date_unit = PyUnicode_FromString("ns"); + date_unit = "ns"; } PyObject *mgr = PyObject_GetAttrString(obj, "_mgr"); From f021ef93bb94d0604ebef00e840825e9bc7de3e1 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 21 Jul 2023 08:27:53 -0700 Subject: [PATCH 4/8] check for NULL --- pandas/_libs/src/vendored/ujson/python/objToJSON.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 0e1186a0f96df..d509615a4d955 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -726,6 +726,9 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { } PyObject *mgr = PyObject_GetAttrString(obj, "_mgr"); + if (mgr == NULL) { + return; + } arrays = PyObject_CallMethod(mgr, "column_arrays", "%s", date_unit); Py_DECREF(mgr); From ac727ca294bc27e44744a0484cd5a282e4e08e75 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 19 Jul 2023 13:45:12 -0700 Subject: [PATCH 5/8] POC: pass date_unit to values_for_json --- .../src/vendored/ujson/python/objToJSON.c | 17 +++++++++++++- pandas/core/internals/array_manager.py | 3 +-- pandas/core/internals/blocks.py | 23 +++++++++++++++---- pandas/core/internals/managers.py | 5 ++-- 4 files changed, 38 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 65b468f268d75..f2a35232b95f6 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -713,7 +713,22 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { return; } - arrays = get_sub_attr(obj, "_mgr", "column_arrays"); + NPY_DATETIMEUNIT dunit = (PyObjectEncoder *)tc->datetimeUnit; + PyObject *date_unit; + if (dunit == NPY_FR_s) { + date_unit = "s"; + } else if (dunit == NPY_FR_ms) { + date_unit = "ms"; + } else if (dunit == NPY_FR_us) { + date_unit = "us"; + } else if (dunit == NPY_FR_ns) { + date_unit = "ns"; + } + + PyObject *mgr = PyObject_GetAttrString(obj, "_mgr"); + PyObject *name = "column_arrays"; + arrays = PyObject_CallMethodOneArg(mgr, name, date_unit); + if (!arrays) { GET_TC(tc)->iterNext = NpyArr_iterNextNone; return; diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 3b77540efcdd2..4d92ef489823d 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -691,8 +691,7 @@ def iget_values(self, i: int) -> ArrayLike: """ return self.arrays[i] - @property - def column_arrays(self) -> list[ArrayLike]: + def column_arrays(self, date_unit) -> list[np.ndarray]: """ Used in the JSON C code to access column arrays. """ diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 2d102de879df0..2d5454c426366 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1638,7 +1638,7 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: """ raise AbstractMethodError(self) - def values_for_json(self) -> np.ndarray: + def values_for_json(self, date_unit) -> np.ndarray: raise AbstractMethodError(self) @@ -1885,7 +1885,7 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: # TODO(EA2D): reshape not needed with 2D EAs return np.asarray(values).reshape(self.shape) - def values_for_json(self) -> np.ndarray: + def values_for_json(self, date_unit) -> np.ndarray: return np.asarray(self.values) @final @@ -2174,7 +2174,7 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: return self.values.astype(_dtype_obj) return self.values - def values_for_json(self) -> np.ndarray: + def values_for_json(self, date_unit) -> np.ndarray: return self.values @cache_readonly @@ -2231,7 +2231,22 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock): is_numeric = False values: DatetimeArray | TimedeltaArray - def values_for_json(self) -> np.ndarray: + def values_for_json(self, date_unit) -> np.ndarray: + values = self.values + if values.dtype.kind == "M": + if date_unit == "s": + return self.values.strftime("%Y-%M-%D %h-%m-%s") + elif date_unit == "ms": + raise NotImplementedError + elif date_unit == "us": + return self.values.strftime("%Y-%M-%D %h-%m-%s.%f") + elif date_unit == "ns": + return self.values.astype(str) + else: + raise NotImplementedError + else: + raise NotImplementedError + return self.values._ndarray diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index ac2dd08d47427..03d0f56e32018 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -995,8 +995,7 @@ def iget_values(self, i: int) -> ArrayLike: values = block.iget(self.blklocs[i]) return values - @property - def column_arrays(self) -> list[np.ndarray]: + def column_arrays(self, date_unit) -> list[np.ndarray]: """ Used in the JSON C code to access column arrays. This optimizes compared to using `iget_values` by converting each @@ -1010,7 +1009,7 @@ def column_arrays(self) -> list[np.ndarray]: for blk in self.blocks: mgr_locs = blk._mgr_locs - values = blk.values_for_json() + values = blk.values_for_json(date_unit) if values.ndim == 1: # TODO(EA2D): special casing not needed with 2D EAs result[mgr_locs[0]] = values From 8c2e1933f34ce094049c8e7c8875d0c9138abafc Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 20 Jul 2023 08:58:18 -0700 Subject: [PATCH 6/8] apply suggestions --- pandas/_libs/src/vendored/ujson/python/objToJSON.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index f2a35232b95f6..7d33c9a5eb07d 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -713,21 +713,21 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { return; } - NPY_DATETIMEUNIT dunit = (PyObjectEncoder *)tc->datetimeUnit; + NPY_DATETIMEUNIT dunit = ((PyObjectEncoder *)tc)->datetimeUnit; PyObject *date_unit; if (dunit == NPY_FR_s) { - date_unit = "s"; + date_unit = PyUnicode_FromString("s"); } else if (dunit == NPY_FR_ms) { - date_unit = "ms"; + date_unit = PyUnicode_FromString("ms"); } else if (dunit == NPY_FR_us) { - date_unit = "us"; + date_unit = PyUnicode_FromString("us"); } else if (dunit == NPY_FR_ns) { - date_unit = "ns"; + date_unit = PyUnicode_FromString("ns"); } PyObject *mgr = PyObject_GetAttrString(obj, "_mgr"); - PyObject *name = "column_arrays"; - arrays = PyObject_CallMethodOneArg(mgr, name, date_unit); + arrays = PyObject_CallMethod(mgr, "column_arrays", "%s", date_unit); + Py_DECREF(mgr); if (!arrays) { GET_TC(tc)->iterNext = NpyArr_iterNextNone; From 51b443b3e1c5e33b1dfc53654fa088dbf92ec542 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 20 Jul 2023 18:08:17 -0700 Subject: [PATCH 7/8] suggested edit --- pandas/_libs/src/vendored/ujson/python/objToJSON.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 7d33c9a5eb07d..0e1186a0f96df 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -714,15 +714,15 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { } NPY_DATETIMEUNIT dunit = ((PyObjectEncoder *)tc)->datetimeUnit; - PyObject *date_unit; + char *date_unit; if (dunit == NPY_FR_s) { - date_unit = PyUnicode_FromString("s"); + date_unit = "s"; } else if (dunit == NPY_FR_ms) { - date_unit = PyUnicode_FromString("ms"); + date_unit = "ms"; } else if (dunit == NPY_FR_us) { - date_unit = PyUnicode_FromString("us"); + date_unit = "us"; } else if (dunit == NPY_FR_ns) { - date_unit = PyUnicode_FromString("ns"); + date_unit = "ns"; } PyObject *mgr = PyObject_GetAttrString(obj, "_mgr"); From 540deee4214e1db333a04dbafe622a903bef9344 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 21 Jul 2023 08:27:53 -0700 Subject: [PATCH 8/8] check for NULL --- pandas/_libs/src/vendored/ujson/python/objToJSON.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 0e1186a0f96df..d509615a4d955 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -726,6 +726,9 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { } PyObject *mgr = PyObject_GetAttrString(obj, "_mgr"); + if (mgr == NULL) { + return; + } arrays = PyObject_CallMethod(mgr, "column_arrays", "%s", date_unit); Py_DECREF(mgr);