From acdcb05c58533923e6636fc80494ca66aebbc8a2 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 10 Apr 2022 09:42:14 -0700 Subject: [PATCH 01/10] BUG: to_json incorrectly localizes tz-naive datetimes to UTC --- doc/source/whatsnew/v1.5.0.rst | 1 + .../_libs/src/ujson/python/date_conversions.c | 31 +++++++++++++++++-- pandas/_libs/src/ujson/python/objToJSON.c | 29 ++++++++++++++++- .../tslibs/src/datetime/np_datetime_strings.c | 15 ++++----- .../tests/io/json/test_json_table_schema.py | 14 ++++----- .../json/test_json_table_schema_ext_dtype.py | 4 +-- pandas/tests/io/json/test_pandas.py | 31 ++++++++++++++++--- 7 files changed, 100 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 73dc832e2007b..bd891e5a84d4f 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -563,6 +563,7 @@ I/O - Bug in Parquet roundtrip for Interval dtype with ``datetime64[ns]`` subtype (:issue:`45881`) - Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`) - Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`) +- Bug in :meth:`DataFrame.to_json`, :meth:`Series.to_json`, and :meth:`Index.to_json` where tz-aware datetimes were being incorrectly localized to UTC (:issue:`38760`) Period ^^^^^^ diff --git a/pandas/_libs/src/ujson/python/date_conversions.c b/pandas/_libs/src/ujson/python/date_conversions.c index 0744c6af74480..c11e4ed60541c 100644 --- a/pandas/_libs/src/ujson/python/date_conversions.c +++ b/pandas/_libs/src/ujson/python/date_conversions.c @@ -54,8 +54,8 @@ char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) { PyErr_NoMemory(); return NULL; } - - ret_code = make_iso_8601_datetime(&dts, result, *len, base); + ret_code = make_iso_8601_datetime(&dts, result, *len, + 0 /* datetime64 is always naive */, base); if (ret_code != 0) { PyErr_SetString(PyExc_ValueError, "Could not convert datetime value to string"); @@ -90,7 +90,32 @@ char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, *len = (size_t)get_datetime_iso_8601_strlen(0, base); char *result = PyObject_Malloc(*len); - ret = make_iso_8601_datetime(&dts, result, *len, base); + // Check to see if PyDateTime has a timezone. + // Don't convert to UTC if it doesn't. + int is_tz_aware = 0; + PyObject *tmp; + PyObject *offset; + if (PyObject_HasAttrString((PyObject*)obj, "tzinfo")) { + tmp = PyObject_GetAttrString(obj, "tzinfo"); + if (tmp == NULL) { + return -1; + } + if (tmp == Py_None) { + Py_DECREF(tmp); + } else { + offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj); + if (offset == NULL) { + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp); + if (offset != Py_None) { + is_tz_aware = 1; + } + Py_DECREF(offset); + } + } + ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base); if (ret != 0) { PyErr_SetString(PyExc_ValueError, diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index c4609992342c3..8913da477ea92 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -221,8 +221,19 @@ static PyObject *get_values(PyObject *obj) { // The special cases to worry about are dt64tz and category[dt64tz]. // In both cases we want the UTC-localized datetime64 ndarray, // without going through and object array of Timestamps. 
+ if (PyObject_HasAttrString(obj, "tz")) { + PyObject *tz = PyObject_GetAttrString(obj, "tz"); + if (tz != Py_None) { + // Go through object array if we have dt64tz, since tz info will + // be lost if values is used directly. + Py_DECREF(tz); + values = PyObject_CallMethod(obj, "__array__", NULL); + PyObject_Print(values, stdout, NULL); + return values; + } + Py_DECREF(tz); + } values = PyObject_GetAttrString(obj, "values"); - if (values == NULL) { // Clear so we can subsequently try another method PyErr_Clear(); @@ -707,6 +718,22 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { for (i = 0; i < PyObject_Length(arrays); i++) { array = PyList_GET_ITEM(arrays, i); + // tz information is lost when dt64tz is converted + // to numpy arrays. + // TODO(lithomas1): Patch column_arrays(actually values_for_json) + // to return EAs instead of casting to object + if (PyArray_TYPE((PyArrayObject *)array) == NPY_DATETIME) { + PyObject *mgr = PyObject_GetAttrString(obj, "_mgr"); + PyObject *dtarr = PyObject_CallMethod(mgr, "iget_values", "n", i); + PyObject *tz = PyObject_GetAttrString(dtarr, "tz"); + if (tz != Py_None) { + // we have a timezone, use an object array of Timestamp + array = PyObject_CallMethod(dtarr, "__array__", NULL); + } + Py_DECREF(mgr); + Py_DECREF(dtarr); + Py_DECREF(tz); + } if (!array) { GET_TC(tc)->iterNext = NpyArr_iterNextNone; goto ARR_RET; diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index 847e84b21c06c..8e05c5c9b98e3 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -605,7 +605,7 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) { * string was too short). 
*/ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, - NPY_DATETIMEUNIT base) { + int utc, NPY_DATETIMEUNIT base) { char *substr = outstr; int sublen = outlen; int tmplen; @@ -884,13 +884,14 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, add_time_zone: /* UTC "Zulu" time */ - if (sublen < 1) { - goto string_too_short; + if (utc) { + if (sublen < 1) { + goto string_too_short; + } + substr[0] = 'Z'; + substr += 1; + sublen -= 1; } - substr[0] = 'Z'; - substr += 1; - sublen -= 1; - /* Add a NULL terminator, and return */ if (sublen > 0) { substr[0] = '\0'; diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index a2fd8d2cbbf04..c90ac2fb3b813 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -301,7 +301,7 @@ def test_to_json(self, df_table): ("idx", 0), ("A", 1), ("B", "a"), - ("C", "2016-01-01T00:00:00.000Z"), + ("C", "2016-01-01T00:00:00.000"), ("D", "P0DT1H0M0S"), ("E", "a"), ("F", "a"), @@ -314,7 +314,7 @@ def test_to_json(self, df_table): ("idx", 1), ("A", 2), ("B", "b"), - ("C", "2016-01-02T00:00:00.000Z"), + ("C", "2016-01-02T00:00:00.000"), ("D", "P0DT1H1M0S"), ("E", "b"), ("F", "b"), @@ -327,7 +327,7 @@ def test_to_json(self, df_table): ("idx", 2), ("A", 3), ("B", "c"), - ("C", "2016-01-03T00:00:00.000Z"), + ("C", "2016-01-03T00:00:00.000"), ("D", "P0DT1H2M0S"), ("E", "c"), ("F", "c"), @@ -340,7 +340,7 @@ def test_to_json(self, df_table): ("idx", 3), ("A", 4), ("B", "c"), - ("C", "2016-01-04T00:00:00.000Z"), + ("C", "2016-01-04T00:00:00.000"), ("D", "P0DT1H3M0S"), ("E", "c"), ("F", "c"), @@ -397,8 +397,8 @@ def test_to_json_period_index(self): schema = {"fields": fields, "primaryKey": ["index"]} data = [ - OrderedDict([("index", "2015-11-01T00:00:00.000Z"), ("values", 1)]), - OrderedDict([("index", "2016-02-01T00:00:00.000Z"), ("values", 1)]), + OrderedDict([("index", "2015-11-01T00:00:00.000"), ("values", 1)]), + OrderedDict([("index", "2016-02-01T00:00:00.000"), ("values", 1)]), ] expected = OrderedDict([("schema", schema), ("data", data)]) @@ -635,7 +635,7 @@ def test_timestamp_in_columns(self): ) result = df.to_json(orient="table") js = json.loads(result) - assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000Z" + assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000" assert js["schema"]["fields"][2]["name"] == "P0DT0H0M10S" @pytest.mark.parametrize( diff --git a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py index f6aa16ff0ce38..08b182ced4915 100644 --- a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py +++ b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py @@ -151,7 +151,7 @@ def test_build_date_series(self): expected = OrderedDict( [ ("schema", schema), - ("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000Z")])]), + ("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000")])]), ] ) @@ -256,7 +256,7 @@ def test_to_json(self): OrderedDict( [ ("idx", 0), - ("A", "2021-10-10T00:00:00.000Z"), + ("A", "2021-10-10T00:00:00.000"), ("B", 10.0), ("C", "pandas"), ("D", 10), diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 985d9e47ea7bd..76f711dc058d3 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -822,7 +822,7 @@ def test_date_index_and_values(self, date_format, as_object, date_typ): 
expected = '{"1577836800000":1577836800000,"null":null}' else: expected = ( - '{"2020-01-01T00:00:00.000Z":"2020-01-01T00:00:00.000Z","null":null}' + '{"2020-01-01T00:00:00.000":"2020-01-01T00:00:00.000","null":null}' ) if as_object: @@ -875,8 +875,6 @@ def test_date_format_frame(self, date, date_unit, datetime_frame): json = df.to_json(date_format="iso") result = read_json(json) expected = df.copy() - expected.index = expected.index.tz_localize("UTC") - expected["date"] = expected["date"].dt.tz_localize("UTC") tm.assert_frame_equal(result, expected) def test_date_format_frame_raises(self, datetime_frame): @@ -905,8 +903,6 @@ def test_date_format_series(self, date, date_unit, datetime_series): json = ts.to_json(date_format="iso") result = read_json(json, typ="series") expected = ts.copy() - expected.index = expected.index.tz_localize("UTC") - expected = expected.dt.tz_localize("UTC") tm.assert_series_equal(result, expected) def test_date_format_series_raises(self, datetime_series): @@ -1192,6 +1188,16 @@ def test_tz_is_utc(self, ts): dt = ts.to_pydatetime() assert dumps(dt, iso_dates=True) == exp + def test_tz_is_naive(self): + from pandas.io.json import dumps + + ts = Timestamp("2013-01-10 05:00:00") + exp = '"2013-01-10T05:00:00.000"' + + assert dumps(ts, iso_dates=True) == exp + dt = ts.to_pydatetime() + assert dumps(dt, iso_dates=True) == exp + @pytest.mark.parametrize( "tz_range", [ @@ -1217,6 +1223,21 @@ def test_tz_range_is_utc(self, tz_range): result = dumps(df, iso_dates=True) assert result == dfexp + def test_tz_range_is_naive(self): + from pandas.io.json import dumps + + tz_range = pd.date_range("2013-01-01 05:00:00", periods=2) + + exp = '["2013-01-01T05:00:00.000","2013-01-02T05:00:00.000"]' + dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000", "1":"2013-01-02T05:00:00.000"}}' + + assert dumps(tz_range, iso_dates=True) == exp + dti = DatetimeIndex(tz_range) + assert dumps(dti, iso_dates=True) == exp + df = DataFrame({"DT": dti}) + result = dumps(df, iso_dates=True) + assert result == dfexp + def test_read_inline_jsonl(self): # GH9180 result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) From 9652002184613ac116ca71dc2b1ba503467dcff0 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 10 Apr 2022 11:54:38 -0700 Subject: [PATCH 02/10] fix warnings? 
--- pandas/_libs/src/ujson/python/date_conversions.c | 2 +- pandas/_libs/tslibs/src/datetime/np_datetime_strings.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/ujson/python/date_conversions.c b/pandas/_libs/src/ujson/python/date_conversions.c index c11e4ed60541c..791dc71dc6d67 100644 --- a/pandas/_libs/src/ujson/python/date_conversions.c +++ b/pandas/_libs/src/ujson/python/date_conversions.c @@ -98,7 +98,7 @@ char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, if (PyObject_HasAttrString((PyObject*)obj, "tzinfo")) { tmp = PyObject_GetAttrString(obj, "tzinfo"); if (tmp == NULL) { - return -1; + return NULL; } if (tmp == Py_None) { Py_DECREF(tmp); diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index 200a71ff0c2b7..e0b3e8c8b9d91 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -78,7 +78,7 @@ get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base); */ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, - NPY_DATETIMEUNIT base); + int utc, NPY_DATETIMEUNIT base); /* * Converts an pandas_timedeltastruct to an ISO 8601 string. From 16332fce4663aacebf37bf23456bc0c247c240df Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 10 Apr 2022 12:33:12 -0700 Subject: [PATCH 03/10] actually fix warnings --- pandas/_libs/src/ujson/python/date_conversions.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/date_conversions.c b/pandas/_libs/src/ujson/python/date_conversions.c index 791dc71dc6d67..f1e4a2fddb81d 100644 --- a/pandas/_libs/src/ujson/python/date_conversions.c +++ b/pandas/_libs/src/ujson/python/date_conversions.c @@ -106,7 +106,7 @@ char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj); if (offset == NULL) { Py_DECREF(tmp); - return -1; + return NULL; } Py_DECREF(tmp); if (offset != Py_None) { From aa6444c9fc361c1d7f2ed45627c7c8f73e5ec79c Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 10 Apr 2022 15:20:52 -0700 Subject: [PATCH 04/10] Update objToJSON.c --- pandas/_libs/src/ujson/python/objToJSON.c | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 8913da477ea92..c027c386527fe 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -228,7 +228,6 @@ static PyObject *get_values(PyObject *obj) { // be lost if values is used directly. 
Py_DECREF(tz); values = PyObject_CallMethod(obj, "__array__", NULL); - PyObject_Print(values, stdout, NULL); return values; } Py_DECREF(tz); From 94ad92a69e11305e40fafd416d4529fa5900d165 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 11 Apr 2022 07:03:36 -0700 Subject: [PATCH 05/10] fix formatting(and test) --- pandas/tests/io/json/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 76f711dc058d3..51bcf85af8821 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1229,7 +1229,7 @@ def test_tz_range_is_naive(self): tz_range = pd.date_range("2013-01-01 05:00:00", periods=2) exp = '["2013-01-01T05:00:00.000","2013-01-02T05:00:00.000"]' - dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000", "1":"2013-01-02T05:00:00.000"}}' + dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000","1":"2013-01-02T05:00:00.000"}}' assert dumps(tz_range, iso_dates=True) == exp dti = DatetimeIndex(tz_range) From ab895dc0781bda95083aa6cea33cd72c88d48477 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 16 Apr 2022 08:07:47 -0700 Subject: [PATCH 06/10] Address code comments --- .../_libs/src/ujson/python/date_conversions.c | 25 +----- pandas/_libs/src/ujson/python/objToJSON.c | 16 ---- .../_libs/tslibs/src/datetime/np_datetime.c | 90 +++++++++++-------- .../_libs/tslibs/src/datetime/np_datetime.h | 2 + pandas/core/internals/blocks.py | 8 +- 5 files changed, 63 insertions(+), 78 deletions(-) diff --git a/pandas/_libs/src/ujson/python/date_conversions.c b/pandas/_libs/src/ujson/python/date_conversions.c index f1e4a2fddb81d..993e0e89562cc 100644 --- a/pandas/_libs/src/ujson/python/date_conversions.c +++ b/pandas/_libs/src/ujson/python/date_conversions.c @@ -93,27 +93,10 @@ char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, // Check to see if PyDateTime has a timezone. // Don't convert to UTC if it doesn't. int is_tz_aware = 0; - PyObject *tmp; - PyObject *offset; - if (PyObject_HasAttrString((PyObject*)obj, "tzinfo")) { - tmp = PyObject_GetAttrString(obj, "tzinfo"); - if (tmp == NULL) { - return NULL; - } - if (tmp == Py_None) { - Py_DECREF(tmp); - } else { - offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj); - if (offset == NULL) { - Py_DECREF(tmp); - return NULL; - } - Py_DECREF(tmp); - if (offset != Py_None) { - is_tz_aware = 1; - } - Py_DECREF(offset); - } + PyObject *offset = extract_utc_offset(obj); + if (offset != NULL) { + is_tz_aware = 1; + Py_DECREF(offset); } ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base); diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index c027c386527fe..7de47749e500c 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -717,22 +717,6 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { for (i = 0; i < PyObject_Length(arrays); i++) { array = PyList_GET_ITEM(arrays, i); - // tz information is lost when dt64tz is converted - // to numpy arrays. 
- // TODO(lithomas1): Patch column_arrays(actually values_for_json) - // to return EAs instead of casting to object - if (PyArray_TYPE((PyArrayObject *)array) == NPY_DATETIME) { - PyObject *mgr = PyObject_GetAttrString(obj, "_mgr"); - PyObject *dtarr = PyObject_CallMethod(mgr, "iget_values", "n", i); - PyObject *tz = PyObject_GetAttrString(dtarr, "tz"); - if (tz != Py_None) { - // we have a timezone, use an object array of Timestamp - array = PyObject_CallMethod(dtarr, "__array__", NULL); - } - Py_DECREF(mgr); - Py_DECREF(dtarr); - Py_DECREF(tz); - } if (!array) { GET_TC(tc)->iterNext = NpyArr_iterNextNone; goto ARR_RET; diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.c b/pandas/_libs/tslibs/src/datetime/np_datetime.c index f5400cf8da4df..4ce0b57ea985a 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.c @@ -331,6 +331,34 @@ int cmp_npy_datetimestruct(const npy_datetimestruct *a, return 0; } +/* +* Returns the offset from utc of the timezone. +* If the passed object is timezone naive, or if extraction +* of the offset fails, NULL is returned. +* +* NOTE: This function is not vendored from numpy. +*/ +PyObject *extract_utc_offset(PyObject *obj) { + if (PyObject_HasAttrString(obj, "tzinfo")) { + PyObject *tmp = PyObject_GetAttrString(obj, "tzinfo"); + if (tmp == NULL) { + return NULL; + } + if (tmp != Py_None) { + PyObject *offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj); + if (offset == NULL) { + Py_DECREF(tmp); + return NULL; + } + if (offset != Py_None) { + return offset; + } + Py_DECREF(offset); + } + Py_DECREF(tmp); + } + return NULL; +} /* * @@ -376,54 +404,38 @@ int convert_pydatetime_to_datetimestruct(PyObject *dtobj, out->sec = PyLong_AsLong(PyObject_GetAttrString(obj, "second")); out->us = PyLong_AsLong(PyObject_GetAttrString(obj, "microsecond")); + PyObject *offset = extract_utc_offset(obj); /* Apply the time zone offset if datetime obj is tz-aware */ - if (PyObject_HasAttrString((PyObject*)obj, "tzinfo")) { - tmp = PyObject_GetAttrString(obj, "tzinfo"); + if (offset != NULL) { + PyObject *tmp_int; + int seconds_offset, minutes_offset; + /* + * The timedelta should have a function "total_seconds" + * which contains the value we want. + */ + tmp = PyObject_CallMethod(offset, "total_seconds", ""); + Py_DECREF(offset); if (tmp == NULL) { return -1; } - if (tmp == Py_None) { - Py_DECREF(tmp); - } else { - PyObject *offset; - PyObject *tmp_int; - int seconds_offset, minutes_offset; - - /* The utcoffset function should return a timedelta */ - offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj); - if (offset == NULL) { - Py_DECREF(tmp); - return -1; - } + tmp_int = PyNumber_Long(tmp); + if (tmp_int == NULL) { Py_DECREF(tmp); - - /* - * The timedelta should have a function "total_seconds" - * which contains the value we want. 
- */ - tmp = PyObject_CallMethod(offset, "total_seconds", ""); - if (tmp == NULL) { - return -1; - } - tmp_int = PyNumber_Long(tmp); - if (tmp_int == NULL) { - Py_DECREF(tmp); - return -1; - } - seconds_offset = PyLong_AsLong(tmp_int); - if (seconds_offset == -1 && PyErr_Occurred()) { - Py_DECREF(tmp_int); - Py_DECREF(tmp); - return -1; - } + return -1; + } + seconds_offset = PyLong_AsLong(tmp_int); + if (seconds_offset == -1 && PyErr_Occurred()) { Py_DECREF(tmp_int); Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp_int); + Py_DECREF(tmp); - /* Convert to a minutes offset and apply it */ - minutes_offset = seconds_offset / 60; + /* Convert to a minutes offset and apply it */ + minutes_offset = seconds_offset / 60; - add_minutes_to_datetimestruct(out, -minutes_offset); - } + add_minutes_to_datetimestruct(out, -minutes_offset); } return 0; diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.h b/pandas/_libs/tslibs/src/datetime/np_datetime.h index 065f09a6d93b5..6ab915e517cfb 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.h @@ -48,6 +48,8 @@ extern const npy_datetimestruct _M_MAX_DTS; // stuff pandas needs // ---------------------------------------------------------------------------- +PyObject *extract_utc_offset(PyObject *obj); + int convert_pydatetime_to_datetimestruct(PyObject *dtobj, npy_datetimestruct *out); diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 26569b571724d..c8430a9266ea5 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1974,8 +1974,6 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock): values: DatetimeArray | TimedeltaArray def values_for_json(self) -> np.ndarray: - # special casing datetimetz to avoid conversion through - # object dtype return self.values._ndarray @@ -1989,6 +1987,12 @@ class DatetimeTZBlock(DatetimeLikeBlock): _validate_ndim = True _can_consolidate = False + def values_for_json(self) -> np.ndarray: + # force dt64tz to go through object dtype + # tz info will be lost when converting to + # dt64 which is naive + return self.values.astype(object) + class ObjectBlock(NumpyBlock): __slots__ = () From 075e7ca45ae0467faaf0f7ebad5959e051122dbc Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 19 Apr 2022 20:10:26 -0700 Subject: [PATCH 07/10] address comments --- doc/source/whatsnew/v1.5.0.rst | 15 +++++++++++---- pandas/_libs/src/ujson/python/date_conversions.c | 4 ++-- pandas/tests/io/json/test_pandas.py | 12 +++++++++--- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index bd891e5a84d4f..da32396fcbb6e 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -175,10 +175,17 @@ Styler - Fix showing "None" as ylabel in :meth:`Series.plot` when not setting ylabel (:issue:`46129`) -.. _whatsnew_150.notable_bug_fixes.notable_bug_fix2: +.. _whatsnew_150.notable_bug_fixes.to_json_incorrectly_localizing_naive_timestamps: -notable_bug_fix2 -^^^^^^^^^^^^^^^^ +Serializing tz-naive Timestamps with to_json() with ``iso_dates=True`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`DataFrame.to_json`, :meth:`Series.to_json`, and :meth:`Index.to_json` +would incorrectly localize DatetimeArrays/DatetimeIndexes with tz-naive Timestamps +to UTC. 
(:issue:`38760`) + +Note that this patch does not fix the localization of tz-aware Timestamps to UTC +upon serialization. (Related issue :issue:`12997`) .. --------------------------------------------------------------------------- .. _whatsnew_150.api_breaking: @@ -563,7 +570,7 @@ I/O - Bug in Parquet roundtrip for Interval dtype with ``datetime64[ns]`` subtype (:issue:`45881`) - Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`) - Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`) -- Bug in :meth:`DataFrame.to_json`, :meth:`Series.to_json`, and :meth:`Index.to_json` where tz-aware datetimes were being incorrectly localized to UTC (:issue:`38760`) +- Period ^^^^^^ diff --git a/pandas/_libs/src/ujson/python/date_conversions.c b/pandas/_libs/src/ujson/python/date_conversions.c index 993e0e89562cc..53673da6aa93c 100644 --- a/pandas/_libs/src/ujson/python/date_conversions.c +++ b/pandas/_libs/src/ujson/python/date_conversions.c @@ -54,8 +54,8 @@ char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) { PyErr_NoMemory(); return NULL; } - ret_code = make_iso_8601_datetime(&dts, result, *len, - 0 /* datetime64 is always naive */, base); + // datetime64 is always naive + ret_code = make_iso_8601_datetime(&dts, result, *len, 0, base); if (ret_code != 0) { PyErr_SetString(PyExc_ValueError, "Could not convert datetime value to string"); diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 51bcf85af8821..576d99f25e25c 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1218,25 +1218,31 @@ def test_tz_range_is_utc(self, tz_range): assert dumps(tz_range, iso_dates=True) == exp dti = DatetimeIndex(tz_range) + # Ensure datetimes in object array are serialized correctly + # in addition to the normal DTI case assert dumps(dti, iso_dates=True) == exp + assert dumps(dti.astype(object), iso_dates=True) == exp df = DataFrame({"DT": dti}) result = dumps(df, iso_dates=True) assert result == dfexp + assert dumps(df.astype({"DT": object}), iso_dates=True) def test_tz_range_is_naive(self): from pandas.io.json import dumps - tz_range = pd.date_range("2013-01-01 05:00:00", periods=2) + dti = pd.date_range("2013-01-01 05:00:00", periods=2) exp = '["2013-01-01T05:00:00.000","2013-01-02T05:00:00.000"]' dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000","1":"2013-01-02T05:00:00.000"}}' - assert dumps(tz_range, iso_dates=True) == exp - dti = DatetimeIndex(tz_range) + # Ensure datetimes in object array are serialized correctly + # in addition to the normal DTI case assert dumps(dti, iso_dates=True) == exp + assert dumps(dti.astype(object), iso_dates=True) == exp df = DataFrame({"DT": dti}) result = dumps(df, iso_dates=True) assert result == dfexp + assert dumps(df.astype({"DT": object}), iso_dates=True) def test_read_inline_jsonl(self): # GH9180 From ca3316d3c263873a9524d510f430b8272234ffb0 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 26 Apr 2022 20:14:36 -0700 Subject: [PATCH 08/10] address more comments --- doc/source/whatsnew/v1.5.0.rst | 30 ++++++ .../_libs/src/ujson/python/date_conversions.c | 12 ++- .../_libs/tslibs/src/datetime/np_datetime.c | 97 ++++++++++--------- 3 files changed, 89 insertions(+), 50 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index da32396fcbb6e..8eb73cd3f40c0 100644 --- 
a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -187,6 +187,36 @@ to UTC. (:issue:`38760`) Note that this patch does not fix the localization of tz-aware Timestamps to UTC upon serialization. (Related issue :issue:`12997`) +*Old Behavior* + +.. ipython:: python + + index = pd.date_range( + start='2020-12-28 00:00:00', + end='2020-12-28 02:00:00', + freq='1H', + ) + a = pd.Series( + data=range(3), + index=index, + ) + +.. code-block:: ipython + + In [4]: a.to_json(date_format='iso') + Out[4]: '{"2020-12-28T00:00:00.000Z":0,"2020-12-28T01:00:00.000Z":1,"2020-12-28T02:00:00.000Z":2}' + + In [5]: pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index + Out[5]: array([False, False, False]) + +*New Behavior* + +.. ipython:: python + + a.to_json(date_format='iso') + # Roundtripping now works + pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index + .. --------------------------------------------------------------------------- .. _whatsnew_150.api_breaking: diff --git a/pandas/_libs/src/ujson/python/date_conversions.c b/pandas/_libs/src/ujson/python/date_conversions.c index 53673da6aa93c..6bc75590c1feb 100644 --- a/pandas/_libs/src/ujson/python/date_conversions.c +++ b/pandas/_libs/src/ujson/python/date_conversions.c @@ -93,10 +93,14 @@ char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, // Check to see if PyDateTime has a timezone. // Don't convert to UTC if it doesn't. int is_tz_aware = 0; - PyObject *offset = extract_utc_offset(obj); - if (offset != NULL) { - is_tz_aware = 1; - Py_DECREF(offset); + if (PyObject_HasAttrString(obj, "tzinfo")) { + PyObject *offset = extract_utc_offset(obj); + if (offset != NULL) { + if (offset != Py_None) { + is_tz_aware = 1; + } + Py_DECREF(offset); + } } ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base); diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.c b/pandas/_libs/tslibs/src/datetime/np_datetime.c index 4ce0b57ea985a..c5ff7219264e2 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.c @@ -332,32 +332,31 @@ int cmp_npy_datetimestruct(const npy_datetimestruct *a, return 0; } /* -* Returns the offset from utc of the timezone. -* If the passed object is timezone naive, or if extraction -* of the offset fails, NULL is returned. +* Returns the offset from utc of the timezone as a timedelta. +* The caller is responsible for ensuring that the tzinfo +* attribute exists on the datetime object. +* +* If the passed object is timezone naive, Py_None is returned. +* If extraction of the offset fails, NULL is returned. * * NOTE: This function is not vendored from numpy. 
*/ PyObject *extract_utc_offset(PyObject *obj) { - if (PyObject_HasAttrString(obj, "tzinfo")) { - PyObject *tmp = PyObject_GetAttrString(obj, "tzinfo"); - if (tmp == NULL) { + PyObject *tmp = PyObject_GetAttrString(obj, "tzinfo"); + if (tmp == NULL) { + return NULL; + } + if (tmp != Py_None) { + PyObject *offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj); + if (offset == NULL) { + Py_DECREF(tmp); return NULL; } - if (tmp != Py_None) { - PyObject *offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj); - if (offset == NULL) { - Py_DECREF(tmp); - return NULL; - } - if (offset != Py_None) { - return offset; - } - Py_DECREF(offset); + if (offset != Py_None) { + return offset; } - Py_DECREF(tmp); } - return NULL; + return tmp; } /* @@ -404,38 +403,44 @@ int convert_pydatetime_to_datetimestruct(PyObject *dtobj, out->sec = PyLong_AsLong(PyObject_GetAttrString(obj, "second")); out->us = PyLong_AsLong(PyObject_GetAttrString(obj, "microsecond")); - PyObject *offset = extract_utc_offset(obj); - /* Apply the time zone offset if datetime obj is tz-aware */ - if (offset != NULL) { - PyObject *tmp_int; - int seconds_offset, minutes_offset; - /* - * The timedelta should have a function "total_seconds" - * which contains the value we want. - */ - tmp = PyObject_CallMethod(offset, "total_seconds", ""); - Py_DECREF(offset); - if (tmp == NULL) { - return -1; - } - tmp_int = PyNumber_Long(tmp); - if (tmp_int == NULL) { - Py_DECREF(tmp); - return -1; - } - seconds_offset = PyLong_AsLong(tmp_int); - if (seconds_offset == -1 && PyErr_Occurred()) { + if (PyObject_HasAttrString(obj, "tzinfo")) { + PyObject *offset = extract_utc_offset(obj); + /* Apply the time zone offset if datetime obj is tz-aware */ + if (offset != NULL) { + if (offset == Py_None) { + Py_DECREF(offset); + return 0; + } + PyObject *tmp_int; + int seconds_offset, minutes_offset; + /* + * The timedelta should have a function "total_seconds" + * which contains the value we want. 
+ */ + tmp = PyObject_CallMethod(offset, "total_seconds", ""); + Py_DECREF(offset); + if (tmp == NULL) { + return -1; + } + tmp_int = PyNumber_Long(tmp); + if (tmp_int == NULL) { + Py_DECREF(tmp); + return -1; + } + seconds_offset = PyLong_AsLong(tmp_int); + if (seconds_offset == -1 && PyErr_Occurred()) { + Py_DECREF(tmp_int); + Py_DECREF(tmp); + return -1; + } Py_DECREF(tmp_int); Py_DECREF(tmp); - return -1; - } - Py_DECREF(tmp_int); - Py_DECREF(tmp); - /* Convert to a minutes offset and apply it */ - minutes_offset = seconds_offset / 60; + /* Convert to a minutes offset and apply it */ + minutes_offset = seconds_offset / 60; - add_minutes_to_datetimestruct(out, -minutes_offset); + add_minutes_to_datetimestruct(out, -minutes_offset); + } } return 0; From fe90bb3ed19f49b969c7cddb3b6ccd6ea41dc6b9 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 29 Apr 2022 20:47:42 -0700 Subject: [PATCH 09/10] address comments --- pandas/_libs/src/ujson/python/date_conversions.c | 9 ++++----- pandas/_libs/tslibs/src/datetime/np_datetime.c | 4 +--- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/src/ujson/python/date_conversions.c b/pandas/_libs/src/ujson/python/date_conversions.c index 6bc75590c1feb..7aee0b0195712 100644 --- a/pandas/_libs/src/ujson/python/date_conversions.c +++ b/pandas/_libs/src/ujson/python/date_conversions.c @@ -95,12 +95,11 @@ char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, int is_tz_aware = 0; if (PyObject_HasAttrString(obj, "tzinfo")) { PyObject *offset = extract_utc_offset(obj); - if (offset != NULL) { - if (offset != Py_None) { - is_tz_aware = 1; - } - Py_DECREF(offset); + if (offset == NULL) { + return NULL; } + is_tz_aware = offset != Py_None; + Py_DECREF(offset); } ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base); diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.c b/pandas/_libs/tslibs/src/datetime/np_datetime.c index c5ff7219264e2..9029f9ae14b34 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.c @@ -352,9 +352,7 @@ PyObject *extract_utc_offset(PyObject *obj) { Py_DECREF(tmp); return NULL; } - if (offset != Py_None) { - return offset; - } + return offset; } return tmp; } From 65f321b07cf66e865790172a7ab3ddf9396e9d3e Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 3 May 2022 08:07:12 -0700 Subject: [PATCH 10/10] fix memleak --- pandas/_libs/src/ujson/python/date_conversions.c | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/src/ujson/python/date_conversions.c b/pandas/_libs/src/ujson/python/date_conversions.c index 7aee0b0195712..86cb68f869cb0 100644 --- a/pandas/_libs/src/ujson/python/date_conversions.c +++ b/pandas/_libs/src/ujson/python/date_conversions.c @@ -96,6 +96,7 @@ char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, if (PyObject_HasAttrString(obj, "tzinfo")) { PyObject *offset = extract_utc_offset(obj); if (offset == NULL) { + PyObject_Free(result); return NULL; } is_tz_aware = offset != Py_None;
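
Illustrative note: a minimal sketch of the behavior this series targets, assuming a pandas build with all ten patches applied. The names pd, Timestamp, and pandas.io.json.dumps come straight from the tests added in PATCH 01; the tz-aware line and its exact output string are an assumption extrapolated from the existing test_tz_is_utc expectations, not output captured from a build.

    >>> import pandas as pd
    >>> from pandas.io.json import dumps
    >>> naive = pd.Timestamp("2013-01-10 05:00:00")
    >>> aware = pd.Timestamp("2013-01-10 05:00:00", tz="US/Eastern")
    >>> dumps(naive, iso_dates=True)   # tz-naive: no spurious "Z" / UTC localization
    '"2013-01-10T05:00:00.000"'
    >>> dumps(aware, iso_dates=True)   # tz-aware: still converted to UTC (see GH 12997)
    '"2013-01-10T10:00:00.000Z"'

As the whatsnew example in PATCH 08 shows, this also makes to_json(date_format="iso") / read_json round-trips of tz-naive indexes lossless; the unchanged UTC conversion of tz-aware values is tracked separately in GH 12997.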