diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 931d18dc349f3..f180b01f17bff 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -194,10 +194,47 @@ did not have the same index as the input.
     df.groupby('a', dropna=True).transform('ffill')
     df.groupby('a', dropna=True).transform(lambda x: x)
 
-.. _whatsnew_150.notable_bug_fixes.notable_bug_fix2:
+.. _whatsnew_150.notable_bug_fixes.to_json_incorrectly_localizing_naive_timestamps:
 
-notable_bug_fix2
-^^^^^^^^^^^^^^^^
+Serializing tz-naive Timestamps using ``to_json()`` with ``iso_dates=True``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:meth:`DataFrame.to_json`, :meth:`Series.to_json`, and :meth:`Index.to_json`
+would incorrectly localize DatetimeArrays/DatetimeIndexes with tz-naive
+Timestamps to UTC (:issue:`38760`).
+
+Note that this patch does not fix the localization of tz-aware Timestamps to
+UTC upon serialization (related issue :issue:`12997`).
+
+*Old Behavior*
+
+.. ipython:: python
+
+    index = pd.date_range(
+        start='2020-12-28 00:00:00',
+        end='2020-12-28 02:00:00',
+        freq='1H',
+    )
+    a = pd.Series(
+        data=range(3),
+        index=index,
+    )
+
+.. code-block:: ipython
+
+    In [4]: a.to_json(date_format='iso')
+    Out[4]: '{"2020-12-28T00:00:00.000Z":0,"2020-12-28T01:00:00.000Z":1,"2020-12-28T02:00:00.000Z":2}'
+
+    In [5]: pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index
+    Out[5]: array([False, False, False])
+
+*New Behavior*
+
+.. ipython:: python
+
+    a.to_json(date_format='iso')
+    # Roundtripping now works
+    pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index
 
 .. ---------------------------------------------------------------------------
 
 .. _whatsnew_150.api_breaking:
diff --git a/pandas/_libs/src/ujson/python/date_conversions.c b/pandas/_libs/src/ujson/python/date_conversions.c
index 0744c6af74480..86cb68f869cb0 100644
--- a/pandas/_libs/src/ujson/python/date_conversions.c
+++ b/pandas/_libs/src/ujson/python/date_conversions.c
@@ -54,8 +54,8 @@ char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
         PyErr_NoMemory();
         return NULL;
     }
-
-    ret_code = make_iso_8601_datetime(&dts, result, *len, base);
+    // datetime64 is always naive
+    ret_code = make_iso_8601_datetime(&dts, result, *len, 0, base);
     if (ret_code != 0) {
         PyErr_SetString(PyExc_ValueError,
                         "Could not convert datetime value to string");
@@ -90,7 +90,19 @@ char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base,
     *len = (size_t)get_datetime_iso_8601_strlen(0, base);
     char *result = PyObject_Malloc(*len);
 
-    ret = make_iso_8601_datetime(&dts, result, *len, base);
+    // Check to see if PyDateTime has a timezone.
+    // Don't convert to UTC if it doesn't.
+    int is_tz_aware = 0;
+    if (PyObject_HasAttrString(obj, "tzinfo")) {
+        PyObject *offset = extract_utc_offset(obj);
+        if (offset == NULL) {
+            PyObject_Free(result);
+            return NULL;
+        }
+        is_tz_aware = offset != Py_None;
+        Py_DECREF(offset);
+    }
+    ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base);
 
     if (ret != 0) {
         PyErr_SetString(PyExc_ValueError,
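With the signature change above, ``PyDateTimeToIso`` now appends the ISO 8601
"Z" suffix only when the datetime actually carries a UTC offset. A minimal
Python sketch of that decision logic, standard library only (the helper name
``iso_for_json`` is made up for illustration):

.. code-block:: python

    from datetime import datetime, timezone

    def iso_for_json(dt: datetime) -> str:
        # utcoffset() is None for naive datetimes and a timedelta for
        # aware ones -- the same check extract_utc_offset() makes in C.
        if dt.utcoffset() is not None:
            # tz-aware: normalize to UTC and mark the string with "Z"
            dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
            return dt.isoformat(timespec="milliseconds") + "Z"
        # tz-naive: serialize verbatim, with no "Z" suffix (the fix here)
        return dt.isoformat(timespec="milliseconds")

    print(iso_for_json(datetime(2013, 1, 10, 5)))
    # 2013-01-10T05:00:00.000
    print(iso_for_json(datetime(2013, 1, 10, 5, tzinfo=timezone.utc)))
    # 2013-01-10T05:00:00.000Z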
diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c
index c4609992342c3..7de47749e500c 100644
--- a/pandas/_libs/src/ujson/python/objToJSON.c
+++ b/pandas/_libs/src/ujson/python/objToJSON.c
@@ -221,8 +221,18 @@ static PyObject *get_values(PyObject *obj) {
     // The special cases to worry about are dt64tz and category[dt64tz].
     // In both cases we want the UTC-localized datetime64 ndarray,
     // without going through an object array of Timestamps.
+    if (PyObject_HasAttrString(obj, "tz")) {
+        PyObject *tz = PyObject_GetAttrString(obj, "tz");
+        if (tz != Py_None) {
+            // Go through object array if we have dt64tz, since tz info will
+            // be lost if values is used directly.
+            Py_DECREF(tz);
+            values = PyObject_CallMethod(obj, "__array__", NULL);
+            return values;
+        }
+        Py_DECREF(tz);
+    }
     values = PyObject_GetAttrString(obj, "values");
-
     if (values == NULL) {
         // Clear so we can subsequently try another method
         PyErr_Clear();
diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.c b/pandas/_libs/tslibs/src/datetime/np_datetime.c
index f5400cf8da4df..9029f9ae14b34 100644
--- a/pandas/_libs/tslibs/src/datetime/np_datetime.c
+++ b/pandas/_libs/tslibs/src/datetime/np_datetime.c
@@ -331,6 +331,31 @@ int cmp_npy_datetimestruct(const npy_datetimestruct *a,
     return 0;
 }
 
+/*
+ * Returns the offset from utc of the timezone as a timedelta.
+ * The caller is responsible for ensuring that the tzinfo
+ * attribute exists on the datetime object.
+ *
+ * If the passed object is timezone naive, Py_None is returned.
+ * If extraction of the offset fails, NULL is returned.
+ *
+ * NOTE: This function is not vendored from numpy.
+ */
+PyObject *extract_utc_offset(PyObject *obj) {
+    PyObject *tmp = PyObject_GetAttrString(obj, "tzinfo");
+    if (tmp == NULL) {
+        return NULL;
+    }
+    if (tmp != Py_None) {
+        PyObject *offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj);
+        if (offset == NULL) {
+            Py_DECREF(tmp);
+            return NULL;
+        }
+        return offset;
+    }
+    return tmp;
+}
 
 /*
  *
@@ -376,32 +401,22 @@ int convert_pydatetime_to_datetimestruct(PyObject *dtobj,
     out->sec = PyLong_AsLong(PyObject_GetAttrString(obj, "second"));
     out->us = PyLong_AsLong(PyObject_GetAttrString(obj, "microsecond"));
 
-    /* Apply the time zone offset if datetime obj is tz-aware */
-    if (PyObject_HasAttrString((PyObject*)obj, "tzinfo")) {
-        tmp = PyObject_GetAttrString(obj, "tzinfo");
-        if (tmp == NULL) {
-            return -1;
-        }
-        if (tmp == Py_None) {
-            Py_DECREF(tmp);
-        } else {
-            PyObject *offset;
+    if (PyObject_HasAttrString(obj, "tzinfo")) {
+        PyObject *offset = extract_utc_offset(obj);
+        /* Apply the time zone offset if datetime obj is tz-aware */
+        if (offset != NULL) {
+            if (offset == Py_None) {
+                Py_DECREF(offset);
+                return 0;
+            }
             PyObject *tmp_int;
             int seconds_offset, minutes_offset;
-
-            /* The utcoffset function should return a timedelta */
-            offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj);
-            if (offset == NULL) {
-                Py_DECREF(tmp);
-                return -1;
-            }
-            Py_DECREF(tmp);
-
             /*
              * The timedelta should have a function "total_seconds"
             * which contains the value we want.
             */
            tmp = PyObject_CallMethod(offset, "total_seconds", "");
+            Py_DECREF(offset);
            if (tmp == NULL) {
                return -1;
            }
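The new ``extract_utc_offset`` helper centralizes the ``tzinfo``/``utcoffset()``
protocol that ``convert_pydatetime_to_datetimestruct`` used to open-code. A
rough Python analogue of its contract (the function name
``extract_utc_offset_py`` is hypothetical); note the caller must still verify
that the attribute exists, since ``datetime.date`` has no ``tzinfo``:

.. code-block:: python

    from datetime import date, datetime, timedelta, timezone

    def extract_utc_offset_py(obj: datetime):
        # None for tz-naive input (the C version returns Py_None); a
        # timedelta for tz-aware input (the C version returns the object).
        if obj.tzinfo is None:
            return None
        return obj.tzinfo.utcoffset(obj)

    print(extract_utc_offset_py(datetime(2020, 12, 28)))  # None
    est = timezone(timedelta(hours=-5))
    print(extract_utc_offset_py(datetime(2020, 12, 28, tzinfo=est)))
    # -1 day, 19:00:00
    print(hasattr(date(2020, 12, 28), "tzinfo"))  # False -> why callers check first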
diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.h b/pandas/_libs/tslibs/src/datetime/np_datetime.h
index 065f09a6d93b5..6ab915e517cfb 100644
--- a/pandas/_libs/tslibs/src/datetime/np_datetime.h
+++ b/pandas/_libs/tslibs/src/datetime/np_datetime.h
@@ -48,6 +48,8 @@ extern const npy_datetimestruct _M_MAX_DTS;
 // stuff pandas needs
 // ----------------------------------------------------------------------------
 
+PyObject *extract_utc_offset(PyObject *obj);
+
 int convert_pydatetime_to_datetimestruct(PyObject *dtobj,
                                          npy_datetimestruct *out);
diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
index f787a26ab51fb..cfbaed01b57c9 100644
--- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
+++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
@@ -632,7 +632,7 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) {
  * string was too short).
  */
 int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
-                           NPY_DATETIMEUNIT base) {
+                           int utc, NPY_DATETIMEUNIT base) {
     char *substr = outstr;
     int sublen = outlen;
     int tmplen;
@@ -911,13 +911,14 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
 
 add_time_zone:
     /* UTC "Zulu" time */
-    if (sublen < 1) {
-        goto string_too_short;
+    if (utc) {
+        if (sublen < 1) {
+            goto string_too_short;
+        }
+        substr[0] = 'Z';
+        substr += 1;
+        sublen -= 1;
     }
-    substr[0] = 'Z';
-    substr += 1;
-    sublen -= 1;
-
     /* Add a NULL terminator, and return */
     if (sublen > 0) {
         substr[0] = '\0';
diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h
index 2cc032c0e278c..511d9a401fed2 100644
--- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h
+++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h
@@ -79,7 +79,7 @@ get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base);
 */
 int
 make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
-                       NPY_DATETIMEUNIT base);
+                       int utc, NPY_DATETIMEUNIT base);
 
 /*
 * Converts a pandas_timedeltastruct to an ISO 8601 string.
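The block-level change below is the Python half of the fix: ``DatetimeTZBlock``
now serializes through object dtype so every value stays a tz-aware
``Timestamp``, while naive datetime64 data keeps the fast ndarray path. A short
sketch of why the object route is needed, using only public API:

.. code-block:: python

    import pandas as pd

    ser = pd.Series(pd.date_range("2013-01-01", periods=2, tz="US/Eastern"))

    # The underlying datetime64 storage is tz-naive and UTC-based, so
    # serializing the raw ndarray directly would emit UTC wall-clock
    # values with no way to recover the offset:
    print(ser.dt.tz_convert("UTC").dt.tz_localize(None).to_numpy())
    # ['2013-01-01T05:00:00.000000000' '2013-01-02T05:00:00.000000000']

    # Casting to object yields tz-aware Timestamp scalars instead, which
    # is what the new DatetimeTZBlock.values_for_json() returns:
    print(ser.astype(object).to_numpy())
    # [Timestamp('2013-01-01 00:00:00-0500', tz='US/Eastern')
    #  Timestamp('2013-01-02 00:00:00-0500', tz='US/Eastern')]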
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 26569b571724d..c8430a9266ea5 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -1974,8 +1974,6 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock):
     values: DatetimeArray | TimedeltaArray
 
     def values_for_json(self) -> np.ndarray:
-        # special casing datetimetz to avoid conversion through
-        # object dtype
         return self.values._ndarray
 
 
@@ -1989,6 +1987,12 @@ class DatetimeTZBlock(DatetimeLikeBlock):
     _validate_ndim = True
     _can_consolidate = False
 
+    def values_for_json(self) -> np.ndarray:
+        # force dt64tz to go through object dtype
+        # tz info will be lost when converting to
+        # dt64 which is naive
+        return self.values.astype(object)
+
 
 class ObjectBlock(NumpyBlock):
     __slots__ = ()
diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py
index a2fd8d2cbbf04..c90ac2fb3b813 100644
--- a/pandas/tests/io/json/test_json_table_schema.py
+++ b/pandas/tests/io/json/test_json_table_schema.py
@@ -301,7 +301,7 @@ def test_to_json(self, df_table):
                 ("idx", 0),
                 ("A", 1),
                 ("B", "a"),
-                ("C", "2016-01-01T00:00:00.000Z"),
+                ("C", "2016-01-01T00:00:00.000"),
                 ("D", "P0DT1H0M0S"),
                 ("E", "a"),
                 ("F", "a"),
@@ -314,7 +314,7 @@
                 ("idx", 1),
                 ("A", 2),
                 ("B", "b"),
-                ("C", "2016-01-02T00:00:00.000Z"),
+                ("C", "2016-01-02T00:00:00.000"),
                 ("D", "P0DT1H1M0S"),
                 ("E", "b"),
                 ("F", "b"),
@@ -327,7 +327,7 @@
                 ("idx", 2),
                 ("A", 3),
                 ("B", "c"),
-                ("C", "2016-01-03T00:00:00.000Z"),
+                ("C", "2016-01-03T00:00:00.000"),
                 ("D", "P0DT1H2M0S"),
                 ("E", "c"),
                 ("F", "c"),
@@ -340,7 +340,7 @@
                 ("idx", 3),
                 ("A", 4),
                 ("B", "c"),
-                ("C", "2016-01-04T00:00:00.000Z"),
+                ("C", "2016-01-04T00:00:00.000"),
                 ("D", "P0DT1H3M0S"),
                 ("E", "c"),
                 ("F", "c"),
@@ -397,8 +397,8 @@ def test_to_json_period_index(self):
         schema = {"fields": fields, "primaryKey": ["index"]}
         data = [
-            OrderedDict([("index", "2015-11-01T00:00:00.000Z"), ("values", 1)]),
-            OrderedDict([("index", "2016-02-01T00:00:00.000Z"), ("values", 1)]),
+            OrderedDict([("index", "2015-11-01T00:00:00.000"), ("values", 1)]),
+            OrderedDict([("index", "2016-02-01T00:00:00.000"), ("values", 1)]),
         ]
         expected = OrderedDict([("schema", schema), ("data", data)])
 
@@ -635,7 +635,7 @@ def test_timestamp_in_columns(self):
         )
         result = df.to_json(orient="table")
         js = json.loads(result)
-        assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000Z"
+        assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000"
         assert js["schema"]["fields"][2]["name"] == "P0DT0H0M10S"
 
     @pytest.mark.parametrize(
diff --git a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py
index f31262a1eacfd..fbf4006066f6b 100644
--- a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py
+++ b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py
@@ -145,7 +145,7 @@ def test_build_date_series(self):
         expected = OrderedDict(
             [
                 ("schema", schema),
-                ("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000Z")])]),
+                ("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000")])]),
             ]
         )
 
@@ -250,7 +250,7 @@ def test_to_json(self):
             OrderedDict(
                 [
                     ("idx", 0),
-                    ("A", "2021-10-10T00:00:00.000Z"),
+                    ("A", "2021-10-10T00:00:00.000"),
                     ("B", 10.0),
                     ("C", "pandas"),
                     ("D", 10),
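The ``test_pandas.py`` updates below drop the ``tz_localize("UTC")`` fix-ups
that the old, incorrect behaviour forced on the roundtrip tests. A minimal
pytest-style sketch of the property those tests now assert (the test name and
frame contents here are illustrative, not taken from the patch):

.. code-block:: python

    import pandas as pd
    import pandas._testing as tm

    def test_naive_roundtrip():
        # After the fix, a tz-naive frame survives an iso
        # to_json/read_json cycle unchanged; previously the result
        # came back localized to UTC.
        index = pd.DatetimeIndex(["2020-01-01", "2020-01-02", "2020-01-03"])
        df = pd.DataFrame({"date": index}, index=index)
        result = pd.read_json(df.to_json(date_format="iso"))
        tm.assert_frame_equal(result, df)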
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 985d9e47ea7bd..576d99f25e25c 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -822,7 +822,7 @@ def test_date_index_and_values(self, date_format, as_object, date_typ):
             expected = '{"1577836800000":1577836800000,"null":null}'
         else:
             expected = (
-                '{"2020-01-01T00:00:00.000Z":"2020-01-01T00:00:00.000Z","null":null}'
+                '{"2020-01-01T00:00:00.000":"2020-01-01T00:00:00.000","null":null}'
             )
 
         if as_object:
@@ -875,8 +875,6 @@ def test_date_format_frame(self, date, date_unit, datetime_frame):
             json = df.to_json(date_format="iso")
         result = read_json(json)
         expected = df.copy()
-        expected.index = expected.index.tz_localize("UTC")
-        expected["date"] = expected["date"].dt.tz_localize("UTC")
         tm.assert_frame_equal(result, expected)
 
     def test_date_format_frame_raises(self, datetime_frame):
@@ -905,8 +903,6 @@ def test_date_format_series(self, date, date_unit, datetime_series):
             json = ts.to_json(date_format="iso")
         result = read_json(json, typ="series")
         expected = ts.copy()
-        expected.index = expected.index.tz_localize("UTC")
-        expected = expected.dt.tz_localize("UTC")
         tm.assert_series_equal(result, expected)
 
     def test_date_format_series_raises(self, datetime_series):
@@ -1192,6 +1188,16 @@ def test_tz_is_utc(self, ts):
         dt = ts.to_pydatetime()
         assert dumps(dt, iso_dates=True) == exp
 
+    def test_tz_is_naive(self):
+        from pandas.io.json import dumps
+
+        ts = Timestamp("2013-01-10 05:00:00")
+        exp = '"2013-01-10T05:00:00.000"'
+
+        assert dumps(ts, iso_dates=True) == exp
+        dt = ts.to_pydatetime()
+        assert dumps(dt, iso_dates=True) == exp
+
     @pytest.mark.parametrize(
         "tz_range",
         [
@@ -1212,10 +1218,31 @@ def test_tz_range_is_utc(self, tz_range):
 
         assert dumps(tz_range, iso_dates=True) == exp
         dti = DatetimeIndex(tz_range)
+        # Ensure datetimes in object array are serialized correctly
+        # in addition to the normal DTI case
+        assert dumps(dti, iso_dates=True) == exp
+        assert dumps(dti.astype(object), iso_dates=True) == exp
+        df = DataFrame({"DT": dti})
+        result = dumps(df, iso_dates=True)
+        assert result == dfexp
+        assert dumps(df.astype({"DT": object}), iso_dates=True) == dfexp
+
+    def test_tz_range_is_naive(self):
+        from pandas.io.json import dumps
+
+        dti = pd.date_range("2013-01-01 05:00:00", periods=2)
+
+        exp = '["2013-01-01T05:00:00.000","2013-01-02T05:00:00.000"]'
+        dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000","1":"2013-01-02T05:00:00.000"}}'
+
+        # Ensure datetimes in object array are serialized correctly
+        # in addition to the normal DTI case
         assert dumps(dti, iso_dates=True) == exp
+        assert dumps(dti.astype(object), iso_dates=True) == exp
         df = DataFrame({"DT": dti})
         result = dumps(df, iso_dates=True)
         assert result == dfexp
+        assert dumps(df.astype({"DT": object}), iso_dates=True) == dfexp
 
     def test_read_inline_jsonl(self):
         # GH9180
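Taken together, the patch leaves serialization in this state: tz-naive
timestamps roundtrip verbatim, while tz-aware timestamps are still converted
to UTC and suffixed with "Z" (the separate, still-open GH 12997). A quick
demonstration:

.. code-block:: python

    import pandas as pd

    naive = pd.Series([0], index=pd.date_range("2020-01-01", periods=1))
    print(naive.to_json(date_format="iso"))
    # {"2020-01-01T00:00:00.000":0}

    # tz-aware values still get localized to UTC on output (GH 12997)
    aware = naive.tz_localize("US/Eastern")
    print(aware.to_json(date_format="iso"))
    # {"2020-01-01T05:00:00.000Z":0}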