
Commit ee5cf2c

BUG: to_json not serializing non-nanosecond numpy dt64 correctly (#53757)
* BUG: to_json not serializing non-nanosecond numpy dt64 correctly
* fix tests
* change extraction mech
* fix object array case
* pre-commit
* address comments
1 parent d2f05c2 commit ee5cf2c

File tree

6 files changed (+89, -19 lines)

doc/source/whatsnew/v2.1.0.rst

+1
@@ -538,6 +538,7 @@ I/O
 - Bug in :func:`read_sql` when reading multiple timezone aware columns with the same column name (:issue:`44421`)
 - Bug in :func:`read_xml` stripping whitespace in string data (:issue:`53811`)
 - Bug in :meth:`DataFrame.to_html` where ``colspace`` was incorrectly applied in case of multi index columns (:issue:`53885`)
+- Bug in :meth:`DataFrame.to_json` where :class:`DateTimeArray`/:class:`DateTimeIndex` with non nanosecond precision could not be serialized correctly (:issue:`53686`)
 - Bug when writing and reading empty Stata dta files where dtype information was lost (:issue:`46240`)
 - Bug where ``bz2`` was treated as a hard requirement (:issue:`53857`)
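
The entry above corresponds to the C change below, where int64ToIso previously hard-coded nanosecond resolution. A minimal Python sketch of the user-facing behaviour being fixed (illustrative code, not taken from the commit; the exact JSON text may vary by pandas version):

import numpy as np
import pandas as pd

# A column stored at second resolution rather than the default nanoseconds.
df = pd.DataFrame(
    {"ts": pd.Series([np.datetime64("2023-01-01T11:22:33")], dtype="datetime64[s]")}
)

# With the fix, the stored int64 (seconds since the epoch) is interpreted using
# the array's own unit, so the ISO output names 2023-01-01T11:22:33 instead of
# a timestamp roughly 1.67 seconds after 1970-01-01.
print(df.to_json(date_format="iso"))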

pandas/_libs/include/pandas/datetime/date_conversions.h

+4-1
@@ -18,7 +18,10 @@ int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit);
 // up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z
 // while base="ns" yields "2020-01-01T00:00:00.000000000Z"
 // len is mutated to save the length of the returned string
-char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len);
+char *int64ToIso(int64_t value,
+                 NPY_DATETIMEUNIT valueUnit,
+                 NPY_DATETIMEUNIT base,
+                 size_t *len);
 
 // TODO(username): this function doesn't do a lot; should augment or
 // replace with scaleNanosecToUnit

pandas/_libs/include/pandas/datetime/pd_datetime.h

+3-3
@@ -34,7 +34,7 @@ typedef struct {
   npy_datetime (*npy_datetimestruct_to_datetime)(NPY_DATETIMEUNIT,
                                                  const npy_datetimestruct *);
   int (*scaleNanosecToUnit)(npy_int64 *, NPY_DATETIMEUNIT);
-  char *(*int64ToIso)(int64_t, NPY_DATETIMEUNIT, size_t *);
+  char *(*int64ToIso)(int64_t, NPY_DATETIMEUNIT, NPY_DATETIMEUNIT, size_t *);
   npy_datetime (*NpyDateTimeToEpoch)(npy_datetime, NPY_DATETIMEUNIT);
   char *(*PyDateTimeToIso)(PyObject *, NPY_DATETIMEUNIT, size_t *);
   npy_datetime (*PyDateTimeToEpoch)(PyObject *, NPY_DATETIMEUNIT);
@@ -73,8 +73,8 @@ static PandasDateTime_CAPI *PandasDateTimeAPI = NULL;
   (npy_datetimestruct))
 #define scaleNanosecToUnit(value, unit) \
   PandasDateTimeAPI->scaleNanosecToUnit((value), (unit))
-#define int64ToIso(value, base, len) \
-  PandasDateTimeAPI->int64ToIso((value), (base), (len))
+#define int64ToIso(value, valueUnit, base, len) \
+  PandasDateTimeAPI->int64ToIso((value), (valueUnit), (base), (len))
 #define NpyDateTimeToEpoch(dt, base) \
   PandasDateTimeAPI->NpyDateTimeToEpoch((dt), (base))
 #define PyDateTimeToIso(obj, base, len) \

pandas/_libs/src/datetime/date_conversions.c

+5-2
@@ -41,11 +41,14 @@ int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) {
 }
 
 /* Converts the int64_t representation of a datetime to ISO; mutates len */
-char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
+char *int64ToIso(int64_t value,
+                 NPY_DATETIMEUNIT valueUnit,
+                 NPY_DATETIMEUNIT base,
+                 size_t *len) {
     npy_datetimestruct dts;
     int ret_code;
 
-    pandas_datetime_to_datetimestruct(value, NPY_FR_ns, &dts);
+    pandas_datetime_to_datetimestruct(value, valueUnit, &dts);
 
     *len = (size_t)get_datetime_iso_8601_strlen(0, base);
     char *result = PyObject_Malloc(*len);
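
A numpy-level illustration of the one-line change above, where pandas_datetime_to_datetimestruct now receives the value's own unit instead of a hard-coded NPY_FR_ns (the numbers are made up for the example, not taken from the commit):

import numpy as np

raw = np.int64(1672572153)          # epoch value as stored by a datetime64[s] array

as_ns = np.datetime64(raw, "ns")    # old behaviour: unit assumed to be nanoseconds
as_s = np.datetime64(raw, "s")      # new behaviour: the value's own unit is honoured

print(as_ns)  # 1970-01-01T00:00:01.672572153 -- a moment after the epoch
print(as_s)   # 2023-01-01T11:22:33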

pandas/_libs/src/vendored/ujson/python/objToJSON.c

+37-13
@@ -131,6 +131,7 @@ typedef struct __PyObjectEncoder {
 
     int datetimeIso;
     NPY_DATETIMEUNIT datetimeUnit;
+    NPY_DATETIMEUNIT valueUnit;
 
     // output format style for pandas data types
     int outputFormat;
@@ -350,7 +351,8 @@ static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc,
 static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused),
                                       JSONTypeContext *tc, size_t *len) {
     NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
-    GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, base, len);
+    NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit;
+    GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len);
     return GET_TC(tc)->cStr;
 }
 
@@ -364,8 +366,9 @@ static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused),
 /* JSON callback */
 static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc,
                                      size_t *len) {
-    if (!PyDate_Check(obj)) {
-        PyErr_SetString(PyExc_TypeError, "Expected date object");
+    if (!PyDate_Check(obj) && !PyDateTime_Check(obj)) {
+        PyErr_SetString(PyExc_TypeError, "Expected date or datetime object");
+        ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
         return NULL;
     }
 
@@ -502,6 +505,10 @@ int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) {
         GET_TC(tc)->itemValue = obj;
         Py_INCREF(obj);
         ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array);
+        // Also write the resolution (unit) of the ndarray
+        PyArray_Descr *dtype = PyArray_DESCR(npyarr->array);
+        ((PyObjectEncoder *)tc->encoder)->valueUnit =
+            get_datetime_metadata_from_dtype(dtype).base;
         ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr;
         ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr;
     } else {
@@ -1255,6 +1262,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
     char **ret;
     char *dataptr, *cLabel;
     int type_num;
+    PyArray_Descr *dtype;
     NPY_DATETIMEUNIT base = enc->datetimeUnit;
 
     if (!labels) {
@@ -1283,6 +1291,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
     stride = PyArray_STRIDE(labels, 0);
     dataptr = PyArray_DATA(labels);
     type_num = PyArray_TYPE(labels);
+    dtype = PyArray_DESCR(labels);
 
     for (i = 0; i < num; i++) {
         item = PyArray_GETITEM(labels, dataptr);
@@ -1293,7 +1302,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
         }
 
         int is_datetimelike = 0;
-        npy_int64 nanosecVal;
+        npy_int64 i8date;
+        NPY_DATETIMEUNIT dateUnit = NPY_FR_ns;
         if (PyTypeNum_ISDATETIME(type_num)) {
             is_datetimelike = 1;
             PyArray_VectorUnaryFunc *castfunc =
@@ -1303,35 +1313,37 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
                          "Cannot cast numpy dtype %d to long",
                          enc->npyType);
             }
-            castfunc(dataptr, &nanosecVal, 1, NULL, NULL);
+            castfunc(dataptr, &i8date, 1, NULL, NULL);
+            dateUnit = get_datetime_metadata_from_dtype(dtype).base;
         } else if (PyDate_Check(item) || PyDelta_Check(item)) {
             is_datetimelike = 1;
             if (PyObject_HasAttrString(item, "_value")) {
                 // see test_date_index_and_values for case with non-nano
-                nanosecVal = get_long_attr(item, "_value");
+                i8date = get_long_attr(item, "_value");
             } else {
                 if (PyDelta_Check(item)) {
-                    nanosecVal = total_seconds(item) *
+                    i8date = total_seconds(item) *
                         1000000000LL; // nanoseconds per second
                 } else {
                     // datetime.* objects don't follow above rules
-                    nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns);
+                    i8date = PyDateTimeToEpoch(item, NPY_FR_ns);
                 }
             }
         }
 
         if (is_datetimelike) {
-            if (nanosecVal == get_nat()) {
+            if (i8date == get_nat()) {
                 len = 4;
                 cLabel = PyObject_Malloc(len + 1);
                 strncpy(cLabel, "null", len + 1);
             } else {
                 if (enc->datetimeIso) {
                     if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) {
-                        cLabel = int64ToIsoDuration(nanosecVal, &len);
+                        // TODO(username): non-nano timedelta support?
+                        cLabel = int64ToIsoDuration(i8date, &len);
                     } else {
                         if (type_num == NPY_DATETIME) {
-                            cLabel = int64ToIso(nanosecVal, base, &len);
+                            cLabel = int64ToIso(i8date, dateUnit, base, &len);
                         } else {
                             cLabel = PyDateTimeToIso(item, base, &len);
                         }
@@ -1346,7 +1358,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
                     int size_of_cLabel = 21; // 21 chars for int 64
                     cLabel = PyObject_Malloc(size_of_cLabel);
                     snprintf(cLabel, size_of_cLabel, "%" NPY_DATETIME_FMT,
-                             NpyDateTimeToEpoch(nanosecVal, base));
+                             NpyDateTimeToEpoch(i8date, base));
                     len = strlen(cLabel);
                 }
             }
@@ -1538,13 +1550,25 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
         tc->type = JT_UTF8;
         return;
     } else if (PyArray_IsScalar(obj, Datetime)) {
+        npy_int64 longVal;
         if (((PyDatetimeScalarObject *)obj)->obval == get_nat()) {
            tc->type = JT_NULL;
            return;
        }
+        PyArray_Descr *dtype = PyArray_DescrFromScalar(obj);
+        if (!PyTypeNum_ISDATETIME(dtype->type_num)) {
+            PyErr_Format(PyExc_ValueError, "Could not get resolution of datetime");
+            return;
+        }
+
+        PyArray_Descr *outcode = PyArray_DescrFromType(NPY_INT64);
+        PyArray_CastScalarToCtype(obj, &longVal, outcode);
+        Py_DECREF(outcode);
 
        if (enc->datetimeIso) {
-           pc->PyTypeToUTF8 = PyDateTimeToIsoCallback;
+            GET_TC(tc)->longValue = longVal;
+            pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback;
+            enc->valueUnit = get_datetime_metadata_from_dtype(dtype).base;
            tc->type = JT_UTF8;
        } else {
            NPY_DATETIMEUNIT base =
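
The NpyArr_encodeLabels changes above thread the label array's own unit (dateUnit) into int64ToIso, which matters when datetimes appear as index labels rather than column values. A hedged Python sketch of that path (the resulting JSON is described in the comments, not copied from the commit):

import numpy as np
import pandas as pd

# A second-resolution DatetimeIndex exercises the NpyArr_encodeLabels path,
# since index labels are encoded separately from column values.
idx = pd.DatetimeIndex([np.datetime64("2023-01-01T11:22:33")], dtype="datetime64[s]")
ser = pd.Series([1], index=idx)

# orient="index" forces the datetime labels through the label encoder; with the
# fix the key should be an ISO string for 2023-01-01T11:22:33.
print(ser.to_json(orient="index", date_format="iso"))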

pandas/tests/io/json/test_pandas.py

+39
@@ -954,6 +954,45 @@ def test_date_unit(self, unit, datetime_frame):
         result = read_json(StringIO(json), date_unit=None)
         tm.assert_frame_equal(result, df)
 
+    @pytest.mark.parametrize("unit", ["s", "ms", "us"])
+    def test_iso_non_nano_datetimes(self, unit):
+        # Test that numpy datetimes
+        # in an Index or a column with non-nano resolution can be serialized
+        # correctly
+        # GH53686
+        index = DatetimeIndex(
+            [np.datetime64("2023-01-01T11:22:33.123456", unit)],
+            dtype=f"datetime64[{unit}]",
+        )
+        df = DataFrame(
+            {
+                "date": Series(
+                    [np.datetime64("2022-01-01T11:22:33.123456", unit)],
+                    dtype=f"datetime64[{unit}]",
+                    index=index,
+                ),
+                "date_obj": Series(
+                    [np.datetime64("2023-01-01T11:22:33.123456", unit)],
+                    dtype=object,
+                    index=index,
+                ),
+            },
+        )
+
+        buf = StringIO()
+        df.to_json(buf, date_format="iso", date_unit=unit)
+        buf.seek(0)
+
+        # read_json always reads datetimes in nanosecond resolution
+        # TODO: check_dtype/check_index_type should be removable
+        # once read_json gets non-nano support
+        tm.assert_frame_equal(
+            read_json(buf, convert_dates=["date", "date_obj"]),
+            df,
+            check_index_type=False,
+            check_dtype=False,
+        )
+
     def test_weird_nested_json(self):
         # this used to core dump the parser
         s = r"""{
