Skip to content

POC: pass date_unit to values_for_json #54198

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
17 changes: 16 additions & 1 deletion pandas/_libs/src/vendored/ujson/python/objToJSON.c
Original file line number Diff line number Diff line change
Expand Up @@ -713,7 +713,22 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
return;
}

arrays = get_sub_attr(obj, "_mgr", "column_arrays");
NPY_DATETIMEUNIT dunit = ((PyObjectEncoder *)tc)->datetimeUnit;
PyObject *date_unit;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you should just declare this as a char *. By making it a PyObject you have to manage its lifecycle and DECREF it, which is currently missing. It also adds unnecessary overhead.

if (dunit == NPY_FR_s) {
date_unit = PyUnicode_FromString("s");
} else if (dunit == NPY_FR_ms) {
date_unit = PyUnicode_FromString("ms");
} else if (dunit == NPY_FR_us) {
date_unit = PyUnicode_FromString("us");
} else if (dunit == NPY_FR_ns) {
date_unit = PyUnicode_FromString("ns");
}

PyObject *mgr = PyObject_GetAttrString(obj, "_mgr");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There might be some code paths that hit this but don't have a _mgr attribute. In that case PyObject_GetAttrString would return NULL, which could explain the segfaults.

So you will want to check if the result is equal to NULL and return an appropriate error message

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will update, but we only get here with a DataFrame.

arrays = PyObject_CallMethod(mgr, "column_arrays", "%s", date_unit);
Py_DECREF(mgr);

if (!arrays) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After you are done with mgr, you will want to call Py_DECREF(mgr) to avoid memory leaks.

GET_TC(tc)->iterNext = NpyArr_iterNextNone;
return;
Expand Down
3 changes: 1 addition & 2 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -691,8 +691,7 @@ def iget_values(self, i: int) -> ArrayLike:
"""
return self.arrays[i]

@property
def column_arrays(self) -> list[ArrayLike]:
def column_arrays(self, date_unit) -> list[np.ndarray]:
"""
Used in the JSON C code to access column arrays.
"""
Expand Down
23 changes: 19 additions & 4 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1638,7 +1638,7 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
"""
raise AbstractMethodError(self)

def values_for_json(self) -> np.ndarray:
def values_for_json(self, date_unit) -> np.ndarray:
raise AbstractMethodError(self)


Expand Down Expand Up @@ -1885,7 +1885,7 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
# TODO(EA2D): reshape not needed with 2D EAs
return np.asarray(values).reshape(self.shape)

def values_for_json(self) -> np.ndarray:
def values_for_json(self, date_unit) -> np.ndarray:
return np.asarray(self.values)

@final
Expand Down Expand Up @@ -2174,7 +2174,7 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
return self.values.astype(_dtype_obj)
return self.values

def values_for_json(self) -> np.ndarray:
def values_for_json(self, date_unit) -> np.ndarray:
return self.values

@cache_readonly
Expand Down Expand Up @@ -2231,7 +2231,22 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock):
is_numeric = False
values: DatetimeArray | TimedeltaArray

def values_for_json(self) -> np.ndarray:
def values_for_json(self, date_unit) -> np.ndarray:
values = self.values
if values.dtype.kind == "M":
if date_unit == "s":
return self.values.strftime("%Y-%M-%D %h-%m-%s")
elif date_unit == "ms":
raise NotImplementedError
elif date_unit == "us":
return self.values.strftime("%Y-%M-%D %h-%m-%s.%f")
elif date_unit == "ns":
return self.values.astype(str)
else:
raise NotImplementedError
else:
raise NotImplementedError

return self.values._ndarray


Expand Down
5 changes: 2 additions & 3 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -995,8 +995,7 @@ def iget_values(self, i: int) -> ArrayLike:
values = block.iget(self.blklocs[i])
return values

@property
def column_arrays(self) -> list[np.ndarray]:
def column_arrays(self, date_unit) -> list[np.ndarray]:
"""
Used in the JSON C code to access column arrays.
This optimizes compared to using `iget_values` by converting each
Expand All @@ -1010,7 +1009,7 @@ def column_arrays(self) -> list[np.ndarray]:

for blk in self.blocks:
mgr_locs = blk._mgr_locs
values = blk.values_for_json()
values = blk.values_for_json(date_unit)
if values.ndim == 1:
# TODO(EA2D): special casing not needed with 2D EAs
result[mgr_locs[0]] = values
Expand Down