From e648188e53863461bcbb0a9407895dc722a8c768 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 20 Jun 2023 18:07:12 -0700 Subject: [PATCH 1/6] BUG: to_json not serializing non-nanosecond numpy dt64 correctly --- doc/source/whatsnew/v2.1.0.rst | 1 + .../pandas/datetime/date_conversions.h | 5 ++- .../include/pandas/datetime/pd_datetime.h | 6 ++-- pandas/_libs/src/datetime/date_conversions.c | 7 ++-- .../src/vendored/ujson/python/objToJSON.c | 35 +++++++++++++------ pandas/tests/io/json/test_pandas.py | 25 +++++++++++++ 6 files changed, 63 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index e2f0904a78cf9..cedf79ac7b2d7 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -431,6 +431,7 @@ I/O - Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`) - Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`) - Bug in :func:`read_sql` when reading multiple timezone aware columns with the same column name (:issue:`44421`) +- Bug in :meth:`DataFrame.to_json` where :class:`DateTimeArray`/:class:`DateTimeIndex` with non nanosecond precision could not be serialized correctly (:issue:`53686`) - Bug when writing and reading empty Stata dta files where dtype information was lost (:issue:`46240`) Period diff --git a/pandas/_libs/include/pandas/datetime/date_conversions.h b/pandas/_libs/include/pandas/datetime/date_conversions.h index 3f9dad918938e..a5ad926924dc5 100644 --- a/pandas/_libs/include/pandas/datetime/date_conversions.h +++ b/pandas/_libs/include/pandas/datetime/date_conversions.h @@ -18,7 +18,10 @@ int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit); // up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z // while base="ns" yields "2020-01-01T00:00:00.000000000Z" // len is mutated to save the length of the returned string -char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len); +char *int64ToIso(int64_t value, + NPY_DATETIMEUNIT valueUnit, + NPY_DATETIMEUNIT base, + size_t *len); // TODO(username): this function doesn't do a lot; should augment or // replace with scaleNanosecToUnit diff --git a/pandas/_libs/include/pandas/datetime/pd_datetime.h b/pandas/_libs/include/pandas/datetime/pd_datetime.h index 55aa046cf076b..3e362deb87807 100644 --- a/pandas/_libs/include/pandas/datetime/pd_datetime.h +++ b/pandas/_libs/include/pandas/datetime/pd_datetime.h @@ -34,7 +34,7 @@ typedef struct { npy_datetime (*npy_datetimestruct_to_datetime)(NPY_DATETIMEUNIT, const npy_datetimestruct *); int (*scaleNanosecToUnit)(npy_int64 *, NPY_DATETIMEUNIT); - char *(*int64ToIso)(int64_t, NPY_DATETIMEUNIT, size_t *); + char *(*int64ToIso)(int64_t, NPY_DATETIMEUNIT, NPY_DATETIMEUNIT, size_t *); npy_datetime (*NpyDateTimeToEpoch)(npy_datetime, NPY_DATETIMEUNIT); char *(*PyDateTimeToIso)(PyObject *, NPY_DATETIMEUNIT, size_t *); npy_datetime (*PyDateTimeToEpoch)(PyObject *, NPY_DATETIMEUNIT); @@ -73,8 +73,8 @@ static PandasDateTime_CAPI *PandasDateTimeAPI = NULL; (npy_datetimestruct)) #define scaleNanosecToUnit(value, unit) \ PandasDateTimeAPI->scaleNanosecToUnit((value), (unit)) -#define int64ToIso(value, base, len) \ - PandasDateTimeAPI->int64ToIso((value), (base), (len)) +#define int64ToIso(value, valueUnit, base, len) \ + PandasDateTimeAPI->int64ToIso((value), (valueUnit), (base), (len)) #define NpyDateTimeToEpoch(dt, base) \ PandasDateTimeAPI->NpyDateTimeToEpoch((dt), (base)) #define PyDateTimeToIso(obj, base, len) \ diff --git a/pandas/_libs/src/datetime/date_conversions.c b/pandas/_libs/src/datetime/date_conversions.c index 84fc5507010ed..3bc3275be1cfe 100644 --- a/pandas/_libs/src/datetime/date_conversions.c +++ b/pandas/_libs/src/datetime/date_conversions.c @@ -41,11 +41,14 @@ int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) { } /* Converts the int64_t representation of a datetime to ISO; mutates len */ -char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) { +char *int64ToIso(int64_t value, + NPY_DATETIMEUNIT valueUnit, + NPY_DATETIMEUNIT base, + size_t *len) { npy_datetimestruct dts; int ret_code; - pandas_datetime_to_datetimestruct(value, NPY_FR_ns, &dts); + pandas_datetime_to_datetimestruct(value, valueUnit, &dts); *len = (size_t)get_datetime_iso_8601_strlen(0, base); char *result = PyObject_Malloc(*len); diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 65b468f268d75..c397a24206b6f 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -131,6 +131,7 @@ typedef struct __PyObjectEncoder { int datetimeIso; NPY_DATETIMEUNIT datetimeUnit; + NPY_DATETIMEUNIT valueUnit; // output format style for pandas data types int outputFormat; @@ -350,7 +351,8 @@ static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), JSONTypeContext *tc, size_t *len) { NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, base, len); + NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit; + GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len); return GET_TC(tc)->cStr; } @@ -502,6 +504,12 @@ int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { GET_TC(tc)->itemValue = obj; Py_INCREF(obj); ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array); + // Also write the resolution (unit) of the ndarray + PyArray_Descr *dtype = PyArray_DESCR(npyarr->array); + // copied from + // https://github.com/numpy/numpy/blob/c8fe278a754a271af57eaf6c7ffb2382e5a954f9/numpy/core/src/multiarray/datetime.c#L692-L701 + ((PyObjectEncoder *)tc->encoder)->valueUnit = + ((PyArray_DatetimeDTypeMetaData *)dtype->c_metadata)->meta.base; ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr; ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; } else { @@ -1255,6 +1263,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, char **ret; char *dataptr, *cLabel; int type_num; + PyArray_Descr *dtype; NPY_DATETIMEUNIT base = enc->datetimeUnit; if (!labels) { @@ -1283,6 +1292,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, stride = PyArray_STRIDE(labels, 0); dataptr = PyArray_DATA(labels); type_num = PyArray_TYPE(labels); + dtype = PyArray_DESCR(labels); for (i = 0; i < num; i++) { item = PyArray_GETITEM(labels, dataptr); @@ -1293,7 +1303,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, } int is_datetimelike = 0; - npy_int64 nanosecVal; + npy_int64 i8date; + NPY_DATETIMEUNIT dateUnit = NPY_FR_ns; if (PyTypeNum_ISDATETIME(type_num)) { is_datetimelike = 1; PyArray_VectorUnaryFunc *castfunc = @@ -1303,35 +1314,39 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, "Cannot cast numpy dtype %d to long", enc->npyType); } - castfunc(dataptr, &nanosecVal, 1, NULL, NULL); + castfunc(dataptr, &i8date, 1, NULL, NULL); + // copied from + // https://github.com/numpy/numpy/blob/c8fe278a754a271af57eaf6c7ffb2382e5a954f9/numpy/core/src/multiarray/datetime.c#L692-L701 + dateUnit = ((PyArray_DatetimeDTypeMetaData *)dtype->c_metadata)->meta.base; } else if (PyDate_Check(item) || PyDelta_Check(item)) { is_datetimelike = 1; if (PyObject_HasAttrString(item, "_value")) { // see test_date_index_and_values for case with non-nano - nanosecVal = get_long_attr(item, "_value"); + i8date = get_long_attr(item, "_value"); } else { if (PyDelta_Check(item)) { - nanosecVal = total_seconds(item) * + i8date = total_seconds(item) * 1000000000LL; // nanoseconds per second } else { // datetime.* objects don't follow above rules - nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns); + i8date = PyDateTimeToEpoch(item, NPY_FR_ns); } } } if (is_datetimelike) { - if (nanosecVal == get_nat()) { + if (i8date == get_nat()) { len = 4; cLabel = PyObject_Malloc(len + 1); strncpy(cLabel, "null", len + 1); } else { if (enc->datetimeIso) { if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) { - cLabel = int64ToIsoDuration(nanosecVal, &len); + // TODO(username): non-nano timedelta support? + cLabel = int64ToIsoDuration(i8date, &len); } else { if (type_num == NPY_DATETIME) { - cLabel = int64ToIso(nanosecVal, base, &len); + cLabel = int64ToIso(i8date, dateUnit, base, &len); } else { cLabel = PyDateTimeToIso(item, base, &len); } @@ -1346,7 +1361,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, int size_of_cLabel = 21; // 21 chars for int 64 cLabel = PyObject_Malloc(size_of_cLabel); snprintf(cLabel, size_of_cLabel, "%" NPY_DATETIME_FMT, - NpyDateTimeToEpoch(nanosecVal, base)); + NpyDateTimeToEpoch(i8date, base)); len = strlen(cLabel); } } diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index a966ad1dabcaa..80e1fac0e4dfb 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -895,6 +895,31 @@ def test_date_unit(self, unit, datetime_frame): result = read_json(json, date_unit=None) tm.assert_frame_equal(result, df) + @pytest.mark.parametrize("unit", ["s", "ms", "us"]) + def test_iso_non_nano_datetimes(self, unit): + # Test that numpy datetimes + # in an Index or a column with non-nano resolution can be serialized + # correctly + # GH53686 + df = DataFrame( + { + "date": Series( + [np.datetime64("2022-01-01T11:22:33.123456", unit)], + index=DatetimeIndex( + [np.datetime64("2023-01-01T11:22:33.123456", unit)], + dtype=f"datetime64[{unit}]", + ), + dtype=f"datetime64[{unit}]", + ) + } + ) + + json = df.to_json(date_format="iso", date_unit=unit) + # read_json always reads datetimes in nanosecond resolution + tm.assert_frame_equal( + read_json(json), df, check_index_type=False, check_dtype=False + ) + def test_weird_nested_json(self): # this used to core dump the parser s = r"""{ From 4136ce19806eace82cb6715054ff2066f084f889 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 20 Jun 2023 20:31:07 -0700 Subject: [PATCH 2/6] fix tests --- pandas/tests/io/json/test_pandas.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 80e1fac0e4dfb..dac0733d63253 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -914,10 +914,12 @@ def test_iso_non_nano_datetimes(self, unit): } ) - json = df.to_json(date_format="iso", date_unit=unit) + buf = StringIO() + df.to_json(buf, date_format="iso", date_unit=unit) + buf.seek(0) # read_json always reads datetimes in nanosecond resolution tm.assert_frame_equal( - read_json(json), df, check_index_type=False, check_dtype=False + read_json(buf), df, check_index_type=False, check_dtype=False ) def test_weird_nested_json(self): From 4759340e7ea7fd2be2053bcd27e33d09177ad58c Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 21 Jun 2023 09:59:25 -0700 Subject: [PATCH 3/6] change extraction mech --- pandas/_libs/src/vendored/ujson/python/objToJSON.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index c397a24206b6f..b8897bafb304f 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -506,10 +506,8 @@ int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array); // Also write the resolution (unit) of the ndarray PyArray_Descr *dtype = PyArray_DESCR(npyarr->array); - // copied from - // https://github.com/numpy/numpy/blob/c8fe278a754a271af57eaf6c7ffb2382e5a954f9/numpy/core/src/multiarray/datetime.c#L692-L701 ((PyObjectEncoder *)tc->encoder)->valueUnit = - ((PyArray_DatetimeDTypeMetaData *)dtype->c_metadata)->meta.base; + get_datetime_metadata_from_dtype(dtype).base; ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr; ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; } else { @@ -1315,9 +1313,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, enc->npyType); } castfunc(dataptr, &i8date, 1, NULL, NULL); - // copied from - // https://github.com/numpy/numpy/blob/c8fe278a754a271af57eaf6c7ffb2382e5a954f9/numpy/core/src/multiarray/datetime.c#L692-L701 - dateUnit = ((PyArray_DatetimeDTypeMetaData *)dtype->c_metadata)->meta.base; + dateUnit = get_datetime_metadata_from_dtype(dtype).base; } else if (PyDate_Check(item) || PyDelta_Check(item)) { is_datetimelike = 1; if (PyObject_HasAttrString(item, "_value")) { From f861b6a280233df4aef0827e784e04597dc6a2e3 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 22 Jul 2023 11:58:23 -0700 Subject: [PATCH 4/6] fix object array case --- .../src/vendored/ujson/python/objToJSON.c | 18 ++++++++++--- pandas/tests/io/json/test_pandas.py | 26 ++++++++++++++----- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index b8897bafb304f..b8e68b9f95a23 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -366,8 +366,9 @@ static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), /* JSON callback */ static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, size_t *len) { - if (!PyDate_Check(obj)) { - PyErr_SetString(PyExc_TypeError, "Expected date object"); + if (!PyDate_Check(obj) && !PyDateTime_Check(obj)) { + PyErr_SetString(PyExc_TypeError, "Expected date or datetime object"); + ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; return NULL; } @@ -1549,13 +1550,24 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { tc->type = JT_UTF8; return; } else if (PyArray_IsScalar(obj, Datetime)) { + npy_int64 longVal; if (((PyDatetimeScalarObject *)obj)->obval == get_nat()) { tc->type = JT_NULL; return; } + PyArray_Descr *dtype = PyArray_DescrFromScalar(obj); + if (dtype->type_num == NPY_OBJECT) { + PyErr_Format(PyExc_ValueError, "Could not get resolution of datetime"); + } + + PyArray_Descr *outcode = PyArray_DescrFromType(NPY_INT64); + PyArray_CastScalarToCtype(obj, &longVal, outcode); + Py_DECREF(outcode); if (enc->datetimeIso) { - pc->PyTypeToUTF8 = PyDateTimeToIsoCallback; + GET_TC(tc)->longValue = longVal; + pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; + enc->valueUnit = get_datetime_metadata_from_dtype(dtype).base; tc->type = JT_UTF8; } else { NPY_DATETIMEUNIT base = diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 65c54af242ace..3596e8128f883 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -960,25 +960,37 @@ def test_iso_non_nano_datetimes(self, unit): # in an Index or a column with non-nano resolution can be serialized # correctly # GH53686 + index = DatetimeIndex( + [np.datetime64("2023-01-01T11:22:33.123456", unit)], + dtype=f"datetime64[{unit}]", + ) df = DataFrame( { "date": Series( [np.datetime64("2022-01-01T11:22:33.123456", unit)], - index=DatetimeIndex( - [np.datetime64("2023-01-01T11:22:33.123456", unit)], - dtype=f"datetime64[{unit}]", - ), dtype=f"datetime64[{unit}]", - ) - } + index=index, + ), + "date_obj": Series( + [np.datetime64("2023-01-01T11:22:33.123456", unit)], + dtype=object, + index=index, + ), + }, ) buf = StringIO() df.to_json(buf, date_format="iso", date_unit=unit) buf.seek(0) + # read_json always reads datetimes in nanosecond resolution + # TODO: check_dtype/check_index_type should be removable + # once read_json gets non-nano support tm.assert_frame_equal( - read_json(buf), df, check_index_type=False, check_dtype=False + read_json(buf, convert_dates=["date", "date_obj"]), + df, + check_index_type=False, + check_dtype=False, ) def test_weird_nested_json(self): From c3299ded5e0d17b5b8df31d2b385f6378106d7f6 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 23 Jul 2023 08:58:01 -0700 Subject: [PATCH 5/6] pre-commit --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 0f82e783db2ac..d38f4991759a1 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -528,9 +528,9 @@ I/O - Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`) - Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`) - Bug in :func:`read_sql` when reading multiple timezone aware columns with the same column name (:issue:`44421`) -- Bug in :meth:`DataFrame.to_json` where :class:`DateTimeArray`/:class:`DateTimeIndex` with non nanosecond precision could not be serialized correctly (:issue:`53686`) - Bug in :func:`read_xml` stripping whitespace in string data (:issue:`53811`) - Bug in :meth:`DataFrame.to_html` where ``colspace`` was incorrectly applied in case of multi index columns (:issue:`53885`) +- Bug in :meth:`DataFrame.to_json` where :class:`DateTimeArray`/:class:`DateTimeIndex` with non nanosecond precision could not be serialized correctly (:issue:`53686`) - Bug when writing and reading empty Stata dta files where dtype information was lost (:issue:`46240`) - Bug where ``bz2`` was treated as a hard requirement (:issue:`53857`) From c9da07030aedc22c983d7c9e31c97a8f5fc2dea1 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 24 Jul 2023 09:28:17 -0700 Subject: [PATCH 6/6] address comments --- pandas/_libs/src/vendored/ujson/python/objToJSON.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index b8e68b9f95a23..1fa82215179a8 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -1556,8 +1556,9 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { return; } PyArray_Descr *dtype = PyArray_DescrFromScalar(obj); - if (dtype->type_num == NPY_OBJECT) { + if (!PyTypeNum_ISDATETIME(dtype->type_num)) { PyErr_Format(PyExc_ValueError, "Could not get resolution of datetime"); + return; } PyArray_Descr *outcode = PyArray_DescrFromType(NPY_INT64);