BUG: to_json not serializing non-nanosecond numpy dt64 correctly #53757

Merged: 13 commits, Jul 25, 2023 (diff shown is from 11 commits).

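For context, a minimal reproduction sketch of the bug (illustrative; the printed strings show the expected shapes, not captured output):

    import numpy as np
    import pandas as pd

    # A second-resolution datetime column. Before this fix, the raw int64
    # (seconds since the epoch) was reinterpreted as nanoseconds during
    # serialization, producing a timestamp just after 1970-01-01.
    ser = pd.Series(np.array(["2023-01-01T11:22:33"], dtype="datetime64[s]"))
    print(ser.to_frame("date").to_json(date_format="iso", date_unit="s"))
    # fixed: {"date":{"0":"2023-01-01T11:22:33Z"}}
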
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
@@ -528,6 +528,7 @@ I/O
- Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`)
- Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
- Bug in :func:`read_sql` when reading multiple timezone aware columns with the same column name (:issue:`44421`)
- Bug in :meth:`DataFrame.to_json` where :class:`DatetimeArray`/:class:`DatetimeIndex` with non-nanosecond precision could not be serialized correctly (:issue:`53686`)
- Bug in :func:`read_xml` stripping whitespace in string data (:issue:`53811`)
- Bug in :meth:`DataFrame.to_html` where ``colspace`` was incorrectly applied in case of multi index columns (:issue:`53885`)
- Bug when writing and reading empty Stata dta files where dtype information was lost (:issue:`46240`)
5 changes: 4 additions & 1 deletion pandas/_libs/include/pandas/datetime/date_conversions.h
@@ -18,7 +18,10 @@ int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit);
// up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z
// while base="ns" yields "2020-01-01T00:00:00.000000000Z"
// len is mutated to save the length of the returned string
char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len);
char *int64ToIso(int64_t value,
NPY_DATETIMEUNIT valueUnit,
NPY_DATETIMEUNIT base,
size_t *len);

// TODO(username): this function doesn't do a lot; should augment or
// replace with scaleNanosecToUnit
6 changes: 3 additions & 3 deletions pandas/_libs/include/pandas/datetime/pd_datetime.h
@@ -34,7 +34,7 @@ typedef struct {
npy_datetime (*npy_datetimestruct_to_datetime)(NPY_DATETIMEUNIT,
const npy_datetimestruct *);
int (*scaleNanosecToUnit)(npy_int64 *, NPY_DATETIMEUNIT);
char *(*int64ToIso)(int64_t, NPY_DATETIMEUNIT, size_t *);
char *(*int64ToIso)(int64_t, NPY_DATETIMEUNIT, NPY_DATETIMEUNIT, size_t *);
npy_datetime (*NpyDateTimeToEpoch)(npy_datetime, NPY_DATETIMEUNIT);
char *(*PyDateTimeToIso)(PyObject *, NPY_DATETIMEUNIT, size_t *);
npy_datetime (*PyDateTimeToEpoch)(PyObject *, NPY_DATETIMEUNIT);
@@ -73,8 +73,8 @@ static PandasDateTime_CAPI *PandasDateTimeAPI = NULL;
(npy_datetimestruct))
#define scaleNanosecToUnit(value, unit) \
PandasDateTimeAPI->scaleNanosecToUnit((value), (unit))
#define int64ToIso(value, base, len) \
PandasDateTimeAPI->int64ToIso((value), (base), (len))
#define int64ToIso(value, valueUnit, base, len) \
PandasDateTimeAPI->int64ToIso((value), (valueUnit), (base), (len))
#define NpyDateTimeToEpoch(dt, base) \
PandasDateTimeAPI->NpyDateTimeToEpoch((dt), (base))
#define PyDateTimeToIso(obj, base, len) \
7 changes: 5 additions & 2 deletions pandas/_libs/src/datetime/date_conversions.c
@@ -41,11 +41,14 @@ int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) {
}

/* Converts the int64_t representation of a datetime to ISO; mutates len */
char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
char *int64ToIso(int64_t value,
NPY_DATETIMEUNIT valueUnit,
NPY_DATETIMEUNIT base,
size_t *len) {
npy_datetimestruct dts;
int ret_code;

pandas_datetime_to_datetimestruct(value, NPY_FR_ns, &dts);
pandas_datetime_to_datetimestruct(value, valueUnit, &dts);

*len = (size_t)get_datetime_iso_8601_strlen(0, base);
char *result = PyObject_Malloc(*len);
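
To see the new `valueUnit`/`base` split from the Python side, a hedged sketch (the expected strings follow the header comment above and are illustrative):

    import numpy as np
    import pandas as pd

    # The stored resolution ("s", the valueUnit) and the requested output
    # precision (date_unit, the base) are now independent.
    ser = pd.Series(np.array(["2020-01-03"], dtype="datetime64[s]"))
    print(ser.to_frame("d").to_json(date_format="iso", date_unit="s"))
    # expected: {"d":{"0":"2020-01-03T00:00:00Z"}}
    print(ser.to_frame("d").to_json(date_format="iso", date_unit="ns"))
    # expected: {"d":{"0":"2020-01-03T00:00:00.000000000Z"}}
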
49 changes: 36 additions & 13 deletions pandas/_libs/src/vendored/ujson/python/objToJSON.c
@@ -131,6 +131,7 @@ typedef struct __PyObjectEncoder {

int datetimeIso;
NPY_DATETIMEUNIT datetimeUnit;
NPY_DATETIMEUNIT valueUnit;

// output format style for pandas data types
int outputFormat;
@@ -350,7 +351,8 @@ static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc,
static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused),
JSONTypeContext *tc, size_t *len) {
NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, base, len);
NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit;
GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len);
return GET_TC(tc)->cStr;
}

@@ -364,8 +366,9 @@ static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused),
/* JSON callback */
static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc,
size_t *len) {
if (!PyDate_Check(obj)) {
PyErr_SetString(PyExc_TypeError, "Expected date object");
if (!PyDate_Check(obj) && !PyDateTime_Check(obj)) {
PyErr_SetString(PyExc_TypeError, "Expected date or datetime object");
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
Member Author:

Off-topic, but we should be setting the errorMsg whenever we return NULL in a callback.

Otherwise, there is a possibility of a segfault, since ujson will still think it's in a valid state and try to continue encoding, instead of instantly returning (allowing the Python exception to propagate up).

return NULL;
}

@@ -502,6 +505,10 @@ int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) {
GET_TC(tc)->itemValue = obj;
Py_INCREF(obj);
((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array);
// Also write the resolution (unit) of the ndarray
PyArray_Descr *dtype = PyArray_DESCR(npyarr->array);
((PyObjectEncoder *)tc->encoder)->valueUnit =
get_datetime_metadata_from_dtype(dtype).base;
((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr;
((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr;
} else {
@@ -1255,6 +1262,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
char **ret;
char *dataptr, *cLabel;
int type_num;
PyArray_Descr *dtype;
NPY_DATETIMEUNIT base = enc->datetimeUnit;

if (!labels) {
@@ -1283,6 +1291,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
stride = PyArray_STRIDE(labels, 0);
dataptr = PyArray_DATA(labels);
type_num = PyArray_TYPE(labels);
dtype = PyArray_DESCR(labels);

for (i = 0; i < num; i++) {
item = PyArray_GETITEM(labels, dataptr);
@@ -1293,7 +1302,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
}

int is_datetimelike = 0;
npy_int64 nanosecVal;
npy_int64 i8date;
NPY_DATETIMEUNIT dateUnit = NPY_FR_ns;
Member:

Is the idea here that this should default to NS for things that are stored within object arrays?

Member Author:

It's for the other stuff like timedeltas and dates. We use nanosecond reso there.

I didn't try to fix those cases as well.

Member:

What do you think it would take to handle those consistently? I don't know how common this would be, but I guess it's strange to only do this for numpy-typed arrays and not object arrays that may contain datetimes.

Member:

Knee-jerk, I think that for object-dtype we should just call str(x) on everything and call it a day.

Member:

The object stuff definitely adds a lot of complexity that may not be worth it, but just passing everything as a str would be a breaking change. In particular, that would not work for nested containers within an object array.

Member Author:

I think it might work for object arrays already.

I haven't checked yet, I'm not on the right branch. Will update later in the day.

Member Author:

Well, it turns out that this didn't already work. Sorry for the confusion.

I've fixed this in the latest commit.
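
As a concrete illustration of the label path being discussed, a sketch (the expected output is illustrative):

    import numpy as np
    import pandas as pd

    # Datetime index labels go through NpyArr_encodeLabels; a non-nano
    # index now serializes with its own unit, while datetime.date and
    # timedelta labels keep the nanosecond default (dateUnit = NPY_FR_ns).
    idx = pd.DatetimeIndex(np.array(["2023-01-01T11:22:33"], dtype="datetime64[s]"))
    df = pd.DataFrame({"x": [1]}, index=idx)
    print(df.to_json(date_format="iso", date_unit="s"))
    # expected: {"x":{"2023-01-01T11:22:33Z":1}}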

if (PyTypeNum_ISDATETIME(type_num)) {
is_datetimelike = 1;
PyArray_VectorUnaryFunc *castfunc =
@@ -1303,35 +1313,37 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
"Cannot cast numpy dtype %d to long",
enc->npyType);
}
castfunc(dataptr, &nanosecVal, 1, NULL, NULL);
castfunc(dataptr, &i8date, 1, NULL, NULL);
dateUnit = get_datetime_metadata_from_dtype(dtype).base;
} else if (PyDate_Check(item) || PyDelta_Check(item)) {
is_datetimelike = 1;
if (PyObject_HasAttrString(item, "_value")) {
// see test_date_index_and_values for case with non-nano
nanosecVal = get_long_attr(item, "_value");
i8date = get_long_attr(item, "_value");
} else {
if (PyDelta_Check(item)) {
nanosecVal = total_seconds(item) *
i8date = total_seconds(item) *
1000000000LL; // nanoseconds per second
} else {
// datetime.* objects don't follow above rules
nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns);
i8date = PyDateTimeToEpoch(item, NPY_FR_ns);
}
}
}

if (is_datetimelike) {
if (nanosecVal == get_nat()) {
if (i8date == get_nat()) {
len = 4;
cLabel = PyObject_Malloc(len + 1);
strncpy(cLabel, "null", len + 1);
} else {
if (enc->datetimeIso) {
if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) {
cLabel = int64ToIsoDuration(nanosecVal, &len);
// TODO(username): non-nano timedelta support?
cLabel = int64ToIsoDuration(i8date, &len);
} else {
if (type_num == NPY_DATETIME) {
cLabel = int64ToIso(nanosecVal, base, &len);
cLabel = int64ToIso(i8date, dateUnit, base, &len);
} else {
cLabel = PyDateTimeToIso(item, base, &len);
}
@@ -1346,7 +1358,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
int size_of_cLabel = 21; // 21 chars for int 64
cLabel = PyObject_Malloc(size_of_cLabel);
snprintf(cLabel, size_of_cLabel, "%" NPY_DATETIME_FMT,
NpyDateTimeToEpoch(nanosecVal, base));
NpyDateTimeToEpoch(i8date, base));
len = strlen(cLabel);
}
}
@@ -1538,13 +1550,24 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
tc->type = JT_UTF8;
return;
} else if (PyArray_IsScalar(obj, Datetime)) {
npy_int64 longVal;
if (((PyDatetimeScalarObject *)obj)->obval == get_nat()) {
tc->type = JT_NULL;
return;
}
PyArray_Descr *dtype = PyArray_DescrFromScalar(obj);
if (dtype->type_num == NPY_OBJECT) {
Member Author:

Wondering if this can be removed. This shouldn't ever fail, but if it does, we'll end up with a nasty segfault.

Member:

https://numpy.org/doc/stable/reference/c-api/array.html#c.PyArray_DescrFromScalar

Reading through that makes me think we are supposed to check that we are dealing with an array scalar before even calling this function.

As far as the resulting comparison to NPY_OBJECT, I think it would be better to explicitly use the PyTypeNum_ISDATETIME macro and error when that is not true; that would be a better and more explicit safeguard against troublesome refactors.

Member Author:

Yeah, this whole block is in a PyArray_IsScalar if block.

PyErr_Format(PyExc_ValueError, "Could not get resolution of datetime");
Member:

Any reason we aren't returning here? It feels a bit off to let execution continue after setting the error.

Member Author:

My bad, updated.

}

PyArray_Descr *outcode = PyArray_DescrFromType(NPY_INT64);
PyArray_CastScalarToCtype(obj, &longVal, outcode);
Py_DECREF(outcode);

if (enc->datetimeIso) {
pc->PyTypeToUTF8 = PyDateTimeToIsoCallback;
GET_TC(tc)->longValue = longVal;
pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback;
enc->valueUnit = get_datetime_metadata_from_dtype(dtype).base;
tc->type = JT_UTF8;
} else {
NPY_DATETIMEUNIT base =
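
To exercise the scalar path changed above from Python, a hedged sketch (the expected output is illustrative):

    import numpy as np
    import pandas as pd

    # An object-dtype column holds np.datetime64 scalars; each one hits
    # Object_beginTypeContext, which now reads the unit from the scalar's
    # descr and routes through NpyDateTimeToIsoCallback.
    df = pd.DataFrame(
        {"when": pd.Series([np.datetime64("2023-06-01T12:00:00", "us")], dtype=object)}
    )
    print(df.to_json(date_format="iso", date_unit="us"))
    # expected: {"when":{"0":"2023-06-01T12:00:00.000000Z"}}
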
39 changes: 39 additions & 0 deletions pandas/tests/io/json/test_pandas.py
@@ -954,6 +954,45 @@ def test_date_unit(self, unit, datetime_frame):
        result = read_json(StringIO(json), date_unit=None)
        tm.assert_frame_equal(result, df)

    @pytest.mark.parametrize("unit", ["s", "ms", "us"])
    def test_iso_non_nano_datetimes(self, unit):
        # GH 53686: numpy datetimes in an Index or a column with non-nano
        # resolution can be serialized correctly
        index = DatetimeIndex(
            [np.datetime64("2023-01-01T11:22:33.123456", unit)],
            dtype=f"datetime64[{unit}]",
        )
        df = DataFrame(
            {
                "date": Series(
                    [np.datetime64("2022-01-01T11:22:33.123456", unit)],
                    dtype=f"datetime64[{unit}]",
                    index=index,
                ),
                "date_obj": Series(
                    [np.datetime64("2023-01-01T11:22:33.123456", unit)],
                    dtype=object,
                    index=index,
                ),
            },
        )

        buf = StringIO()
        df.to_json(buf, date_format="iso", date_unit=unit)
        buf.seek(0)

        # read_json always reads datetimes in nanosecond resolution
        # TODO: check_dtype/check_index_type should be removable
        # once read_json gets non-nano support
        tm.assert_frame_equal(
            read_json(buf, convert_dates=["date", "date_obj"]),
            df,
            check_index_type=False,
            check_dtype=False,
        )

    def test_weird_nested_json(self):
        # this used to core dump the parser
        s = r"""{