BUG: Timedelta not formatted correctly in to_json #28595

Closed · wants to merge 2 commits

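For context, a minimal sketch of the behavior this PR targets. The expected strings come from the tests added further down in this PR; the snippet assumes a pandas build with this patch applied (before the fix, the epoch integer was written even when date_format="iso" was requested).

    import pandas as pd

    s = pd.Series(pd.timedelta_range(start="1D", periods=2))

    # With this patch, ISO 8601 durations are written for date_format="iso",
    # while the epoch output (milliseconds by default) is unchanged.
    print(s.to_json(date_format="iso"))    # {"0":"P1DT0H0M0S","1":"P2DT0H0M0S"}
    print(s.to_json(date_format="epoch"))  # {"0":86400000,"1":172800000}
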
14 changes: 13 additions & 1 deletion asv_bench/benchmarks/io/json.py
@@ -66,7 +66,8 @@ class ToJSON(BaseIO):
fname = "__test__.json"
params = [
["split", "columns", "index", "values", "records"],
["df", "df_date_idx", "df_td_int_ts", "df_int_floats", "df_int_float_str"],
["df", "df_date_idx", "df_td", "df_td_int_ts", "df_int_floats",
"df_int_float_str"],
]
param_names = ["orient", "frame"]

@@ -81,6 +82,13 @@ def setup(self, orient, frame):
strings = tm.makeStringIndex(N)
self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index)
self.df_td = DataFrame(
{
"td_1": timedeltas,
"td_2": timedeltas
},
index=index,
)
self.df_td_int_ts = DataFrame(
{
"td_1": timedeltas,
@@ -118,6 +126,10 @@ def setup(self, orient, frame):
def time_to_json(self, orient, frame):
getattr(self, frame).to_json(self.fname, orient=orient)

def time_to_json_iso(self, orient, frame):
getattr(self, frame).to_json(self.fname, orient=orient,
date_format="iso")

def peakmem_to_json(self, orient, frame):
getattr(self, frame).to_json(self.fname, orient=orient)

3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.25.2.rst
@@ -63,8 +63,9 @@ I/O

- Fix regression in notebook display where <th> tags not used for :attr:`DataFrame.index` (:issue:`28204`).
- Regression in :meth:`~DataFrame.to_csv` where writing a :class:`Series` or :class:`DataFrame` indexed by an :class:`IntervalIndex` would incorrectly raise a ``TypeError`` (:issue:`28210`)
- Bug in :meth:`DataFrame.to_json` and :meth:`Series.to_json` where :class:`Timedelta` was not correctly formatted when ``date_format="iso"`` (:issue:`28256`).
-
-


Plotting
^^^^^^^^
93 changes: 54 additions & 39 deletions pandas/_libs/src/ujson/python/objToJSON.c
@@ -735,12 +735,20 @@ int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) {
NpyArr_freeItemValue(obj, tc);

if (PyArray_ISDATETIME(npyarr->array)) {
PRINTMARK();
GET_TC(tc)->itemValue = obj;
Py_INCREF(obj);
((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array);
((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr;
((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr;
if (PyArray_TYPE(npyarr->array) == NPY_TIMEDELTA) {
PRINTMARK();
PyObject *item = npyarr->getitem(npyarr->dataptr, npyarr->array);
PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item);
GET_TC(tc)->itemValue = td;
Py_DECREF(item);
} else {
PRINTMARK();
GET_TC(tc)->itemValue = obj;
Py_INCREF(obj);
((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array);
((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr;
((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr;
}
} else {
PRINTMARK();
GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array);
@@ -1917,47 +1925,54 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
tc->type = enc->datetimeIso ? JT_UTF8 : JT_LONG;
return;
} else if (PyDelta_Check(obj)) {
if (PyObject_HasAttrString(obj, "value")) {
if (enc->datetimeIso) {
PRINTMARK();
value = get_long_attr(obj, "value");
pc->PyTypeToJSON = PyTimeToJSON;
tc->type = JT_UTF8;

} else {
PRINTMARK();
value = total_seconds(obj) * 1000000000LL; // nanoseconds per second
}
if (PyObject_HasAttrString(obj, "value")) {
PRINTMARK();
value = get_long_attr(obj, "value");
} else {
PRINTMARK();
value = total_seconds(obj) * 1000000000LL; // nanoseconds per second
}

base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
switch (base) {
case NPY_FR_ns:
break;
case NPY_FR_us:
value /= 1000LL;
break;
case NPY_FR_ms:
value /= 1000000LL;
break;
case NPY_FR_s:
value /= 1000000000LL;
break;
}
base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
switch (base) {
case NPY_FR_ns:
break;
case NPY_FR_us:
value /= 1000LL;
break;
case NPY_FR_ms:
value /= 1000000LL;
break;
case NPY_FR_s:
value /= 1000000000LL;
break;
}

exc = PyErr_Occurred();
exc = PyErr_Occurred();

if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) {
PRINTMARK();
goto INVALID;
}
if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) {
PRINTMARK();
goto INVALID;
}

if (value == get_nat()) {
PRINTMARK();
tc->type = JT_NULL;
return;
}
if (value == get_nat()) {
PRINTMARK();
tc->type = JT_NULL;
return;
}

GET_TC(tc)->longValue = value;
GET_TC(tc)->longValue = value;

PRINTMARK();
pc->PyTypeToJSON = PyLongToINT64;
tc->type = JT_LONG;
PRINTMARK();
pc->PyTypeToJSON = PyLongToINT64;
tc->type = JT_LONG;
}
return;
} else if (PyArray_IsScalar(obj, Integer)) {
PRINTMARK();
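As a rough Python sketch of what the epoch branch above computes: the Timedelta's nanosecond value is divided down according to the encoder's date_unit. The function and divisor map below are illustrative only; the real work is the C switch over NPY_FR_ns/us/ms/s shown in the diff.

    def scale_timedelta_value(value_ns: int, date_unit: str) -> int:
        # Mirrors the NPY_FR_ns/us/ms/s cases in Object_beginTypeContext.
        divisors = {"ns": 1, "us": 1_000, "ms": 1_000_000, "s": 1_000_000_000}
        return value_ns // divisors[date_unit]

    # One day in nanoseconds, written with to_json's default "ms" date_unit:
    assert scale_timedelta_value(86_400_000_000_000, "ms") == 86_400_000
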
61 changes: 60 additions & 1 deletion pandas/io/json/_json.py
@@ -10,7 +10,7 @@
from pandas._libs.tslibs import iNaT
from pandas.errors import AbstractMethodError

from pandas.core.dtypes.common import ensure_str, is_period_dtype
from pandas.core.dtypes.common import ensure_str, is_period_dtype, is_timedelta64_dtype

from pandas import DataFrame, MultiIndex, Series, isna, to_datetime
from pandas._typing import Scalar
@@ -170,6 +170,34 @@ def _write(

class SeriesWriter(Writer):
_default_orient = "index"
#
# def __init__(
# self,
# obj,
# orient: Optional[str],
# date_format: str,
# double_precision: int,
# ensure_ascii: bool,
# date_unit: str,
# index: bool,
# default_handler: Optional[Callable[[Any], Serializable]] = None,
# indent: int = 0,
# ):
# super().__init__(
# obj,
# orient,
# date_format,
# double_precision,
# ensure_ascii,
# date_unit,
# index,
# default_handler=default_handler,
# indent=indent,
# )
#
# if is_timedelta64_dtype(obj.dtype) and self.date_format == "iso":
# obj = obj.copy()
# self.obj = obj.apply(lambda x: x.isoformat())

def _format_axes(self):
if not self.obj.index.is_unique and self.orient == "index":
@@ -206,6 +234,37 @@ def _write(
class FrameWriter(Writer):
_default_orient = "columns"

# def __init__(
# self,
# obj,
# orient: Optional[str],
# date_format: str,
# double_precision: int,
# ensure_ascii: bool,
# date_unit: str,
# index: bool,
# default_handler: Optional[Callable[[Any], Serializable]] = None,
# indent: int = 0,
# ):
# super().__init__(
# obj,
# orient,
# date_format,
# double_precision,
# ensure_ascii,
# date_unit,
# index,
# default_handler=default_handler,
# indent=indent,
# )
#
# obj = obj.copy()
# timedeltas = obj.select_dtypes(include=["timedelta"]).columns
#
# if len(timedeltas) and self.date_format == "iso":
# obj[timedeltas] = obj[timedeltas].applymap(lambda x: x.isoformat())
# self.obj = obj

def _format_axes(self):
"""
Try to format axes if they are datelike.
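For reference, the commented-out writer code above sketches a pure-Python fallback: converting timedelta values to ISO 8601 strings before serialization. A standalone illustration of that idea (not the approach this PR ultimately takes, which handles the conversion in the C serializer):

    import pandas as pd

    df = pd.DataFrame({"td": pd.timedelta_range(start="1D", periods=2)})

    # Timedelta.isoformat() yields the same ISO 8601 duration strings the
    # C-level change emits, e.g. "P1DT0H0M0S" for one day.
    print(df["td"].apply(lambda x: x.isoformat()).tolist())
    # ['P1DT0H0M0S', 'P2DT0H0M0S']
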
3 changes: 1 addition & 2 deletions pandas/tests/io/json/test_json_table_schema.py
@@ -613,8 +613,7 @@ def test_timestamp_in_columns(self):
result = df.to_json(orient="table")
js = json.loads(result)
assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000Z"
# TODO - below expectation is not correct; see GH 28256
assert js["schema"]["fields"][2]["name"] == 10000
assert js["schema"]["fields"][2]["name"] == "P0DT0H0M10S"

@pytest.mark.parametrize(
"case",
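The updated expectation matches the ISO 8601 duration form of a 10-second Timedelta, which can be checked independently of to_json:

    import pandas as pd

    # A 10-second Timedelta rendered as an ISO 8601 duration string.
    print(pd.Timedelta(10, unit="s").isoformat())  # P0DT0H0M10S
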
34 changes: 34 additions & 0 deletions pandas/tests/io/json/test_pandas.py
@@ -813,6 +813,40 @@ def test_reconstruction_index(self):
result = read_json(df.to_json())
assert_frame_equal(result, df)

@pytest.mark.parametrize(
"date_format,expected",
[
("iso", '{"0":"P1DT0H0M0S","1":"P2DT0H0M0S"}'),
("epoch", '{"0":86400000,"1":172800000}'),
],
)
def test_series_timedelta_to_json(self, date_format, expected):
# GH28256: to_json not correctly formatting Timedelta
s = Series(pd.timedelta_range(start="1D", periods=2))

result = s.to_json(date_format=date_format)
assert result == expected

result = s.astype(object).to_json(date_format=date_format)
assert result == expected

Member:

Hmm, can you make the test compare against the actual expected output? Right now I'm not sure this actually tests the date format, i.e. it could still always write out in epoch and just infer that back on the way in, and the test would still pass.

Contributor Author:

Fixed.


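To make the reviewer's point concrete, a sketch of the difference (illustrative only, reusing the same Series construction as the test above): a round-trip check only verifies that read_json can parse whatever to_json produced, whereas comparing against the literal JSON string pins down the representation that was written.

    import pandas as pd
    from pandas import read_json

    s = pd.Series(pd.timedelta_range(start="1D", periods=2))

    # Round-trip style check: says nothing about which representation was
    # actually written out.
    roundtripped = read_json(s.to_json(date_format="iso"), typ="series")

    # Literal comparison: fails unless ISO 8601 duration strings were written.
    assert s.to_json(date_format="iso") == '{"0":"P1DT0H0M0S","1":"P2DT0H0M0S"}'
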
@pytest.mark.parametrize(
"date_format,expected",
[
("iso", '{"0":{"0":"P1DT0H0M0S","1":"P2DT0H0M0S"}}'),
("epoch", '{"0":{"0":86400000,"1":172800000}}'),
],
)
def test_dataframe_timedelta_to_json(self, date_format, expected):
# GH28256: to_json not correctly formatting Timedelta
df = DataFrame(pd.timedelta_range(start="1D", periods=2))

result = df.to_json(date_format=date_format)
assert result == expected

result = df.astype(object).to_json(date_format=date_format)
assert result == expected

def test_path(self):
with ensure_clean("test.json") as path:
for df in [