
Commit 76da657

lithomas1 authored and yehoshuadimarsky committed
BUG: to_json incorrectly localizes tz-naive datetimes to UTC (pandas-dev#46730)
1 parent c6c809d commit 76da657

File tree: 11 files changed, +158 -50 lines

doc/source/whatsnew/v1.5.0.rst (+40 -3)

@@ -194,10 +194,47 @@ did not have the same index as the input.
     df.groupby('a', dropna=True).transform('ffill')
     df.groupby('a', dropna=True).transform(lambda x: x)
 
-.. _whatsnew_150.notable_bug_fixes.notable_bug_fix2:
+.. _whatsnew_150.notable_bug_fixes.to_json_incorrectly_localizing_naive_timestamps:
 
-notable_bug_fix2
-^^^^^^^^^^^^^^^^
+Serializing tz-naive Timestamps with to_json() with ``iso_dates=True``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:meth:`DataFrame.to_json`, :meth:`Series.to_json`, and :meth:`Index.to_json`
+would incorrectly localize DatetimeArrays/DatetimeIndexes with tz-naive Timestamps
+to UTC. (:issue:`38760`)
+
+Note that this patch does not fix the localization of tz-aware Timestamps to UTC
+upon serialization. (Related issue :issue:`12997`)
+
+*Old Behavior*
+
+.. ipython:: python
+
+    index = pd.date_range(
+        start='2020-12-28 00:00:00',
+        end='2020-12-28 02:00:00',
+        freq='1H',
+    )
+    a = pd.Series(
+        data=range(3),
+        index=index,
+    )
+
+.. code-block:: ipython
+
+    In [4]: a.to_json(date_format='iso')
+    Out[4]: '{"2020-12-28T00:00:00.000Z":0,"2020-12-28T01:00:00.000Z":1,"2020-12-28T02:00:00.000Z":2}'
+
+    In [5]: pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index
+    Out[5]: array([False, False, False])
+
+*New Behavior*
+
+.. ipython:: python
+
+    a.to_json(date_format='iso')
+    # Roundtripping now works
+    pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.api_breaking:
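
To make the scope of the note above concrete, here is a minimal sketch (not part of the commit) contrasting the fixed tz-naive case with the tz-aware case that is still localized to UTC (GH 12997). The data mirrors the whatsnew snippet; the comments describe the expected shape of the output rather than exact strings.

    import pandas as pd

    naive = pd.Series(range(3), index=pd.date_range("2020-12-28", periods=3, freq="1H"))
    aware = naive.tz_localize("US/Eastern")

    # tz-naive index: no trailing "Z" after this change, so read_json roundtrips the index
    print(naive.to_json(date_format="iso"))
    # tz-aware index: still converted to UTC and rendered with a trailing "Z"
    print(aware.to_json(date_format="iso"))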

pandas/_libs/src/ujson/python/date_conversions.c (+15 -3)

@@ -54,8 +54,8 @@ char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
         PyErr_NoMemory();
         return NULL;
     }
-
-    ret_code = make_iso_8601_datetime(&dts, result, *len, base);
+    // datetime64 is always naive
+    ret_code = make_iso_8601_datetime(&dts, result, *len, 0, base);
     if (ret_code != 0) {
         PyErr_SetString(PyExc_ValueError,
                         "Could not convert datetime value to string");

@@ -90,7 +90,19 @@ char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base,
 
     *len = (size_t)get_datetime_iso_8601_strlen(0, base);
     char *result = PyObject_Malloc(*len);
-    ret = make_iso_8601_datetime(&dts, result, *len, base);
+    // Check to see if PyDateTime has a timezone.
+    // Don't convert to UTC if it doesn't.
+    int is_tz_aware = 0;
+    if (PyObject_HasAttrString(obj, "tzinfo")) {
+        PyObject *offset = extract_utc_offset(obj);
+        if (offset == NULL) {
+            PyObject_Free(result);
+            return NULL;
+        }
+        is_tz_aware = offset != Py_None;
+        Py_DECREF(offset);
+    }
+    ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base);
 
     if (ret != 0) {
         PyErr_SetString(PyExc_ValueError,
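
A small usage sketch of the effect of the PyDateTimeToIso change, grounded in the tests later in this commit: the tz-naive expected string comes from the new test_tz_is_naive, while the tz-aware string mirrors the pre-existing test_tz_is_utc and is an assumption here.

    from pandas import Timestamp
    from pandas.io.json import dumps

    naive = Timestamp("2013-01-10 05:00:00")
    aware = Timestamp("2013-01-10 05:00:00", tz="UTC")

    # naive: serialized as-is, with no "Z" designator
    assert dumps(naive, iso_dates=True) == '"2013-01-10T05:00:00.000"'
    # aware: converted to UTC (a no-op here) and suffixed with "Z"
    assert dumps(aware, iso_dates=True) == '"2013-01-10T05:00:00.000Z"'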

pandas/_libs/src/ujson/python/objToJSON.c (+11 -1)

@@ -221,8 +221,18 @@ static PyObject *get_values(PyObject *obj) {
     // The special cases to worry about are dt64tz and category[dt64tz].
     // In both cases we want the UTC-localized datetime64 ndarray,
     // without going through and object array of Timestamps.
+    if (PyObject_HasAttrString(obj, "tz")) {
+        PyObject *tz = PyObject_GetAttrString(obj, "tz");
+        if (tz != Py_None) {
+            // Go through object array if we have dt64tz, since tz info will
+            // be lost if values is used directly.
+            Py_DECREF(tz);
+            values = PyObject_CallMethod(obj, "__array__", NULL);
+            return values;
+        }
+        Py_DECREF(tz);
+    }
     values = PyObject_GetAttrString(obj, "values");
-
     if (values == NULL) {
         // Clear so we can subsequently try another method
         PyErr_Clear();
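
A rough Python analogue (illustration only, not the C implementation; the helper name is made up) of the new branch in get_values: tz-aware datetime data is routed through an object array of Timestamps so the tzinfo reaches the per-value converter, while tz-naive data keeps using the underlying datetime64 values.

    import pandas as pd

    def values_for_serialization(index: pd.DatetimeIndex):
        if index.tz is not None:
            # dt64tz: hand the writer an object array of Timestamps so tzinfo survives
            return index.astype(object).values
        # tz-naive: the plain datetime64[ns] ndarray is already what we want
        return index.values

    naive = pd.date_range("2013-01-01 05:00:00", periods=2)
    aware = naive.tz_localize("UTC")
    print(values_for_serialization(naive).dtype)  # datetime64[ns]
    print(values_for_serialization(aware).dtype)  # object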

pandas/_libs/tslibs/src/datetime/np_datetime.c (+34 -19)

@@ -331,6 +331,31 @@ int cmp_npy_datetimestruct(const npy_datetimestruct *a,
 
     return 0;
 }
+/*
+ * Returns the offset from utc of the timezone as a timedelta.
+ * The caller is responsible for ensuring that the tzinfo
+ * attribute exists on the datetime object.
+ *
+ * If the passed object is timezone naive, Py_None is returned.
+ * If extraction of the offset fails, NULL is returned.
+ *
+ * NOTE: This function is not vendored from numpy.
+ */
+PyObject *extract_utc_offset(PyObject *obj) {
+    PyObject *tmp = PyObject_GetAttrString(obj, "tzinfo");
+    if (tmp == NULL) {
+        return NULL;
+    }
+    if (tmp != Py_None) {
+        PyObject *offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj);
+        if (offset == NULL) {
+            Py_DECREF(tmp);
+            return NULL;
+        }
+        return offset;
+    }
+    return tmp;
+}
 
 /*
  *

@@ -376,32 +401,22 @@ int convert_pydatetime_to_datetimestruct(PyObject *dtobj,
     out->sec = PyLong_AsLong(PyObject_GetAttrString(obj, "second"));
     out->us = PyLong_AsLong(PyObject_GetAttrString(obj, "microsecond"));
 
-    /* Apply the time zone offset if datetime obj is tz-aware */
-    if (PyObject_HasAttrString((PyObject*)obj, "tzinfo")) {
-        tmp = PyObject_GetAttrString(obj, "tzinfo");
-        if (tmp == NULL) {
-            return -1;
-        }
-        if (tmp == Py_None) {
-            Py_DECREF(tmp);
-        } else {
-            PyObject *offset;
+    if (PyObject_HasAttrString(obj, "tzinfo")) {
+        PyObject *offset = extract_utc_offset(obj);
+        /* Apply the time zone offset if datetime obj is tz-aware */
+        if (offset != NULL) {
+            if (offset == Py_None) {
+                Py_DECREF(offset);
+                return 0;
+            }
             PyObject *tmp_int;
             int seconds_offset, minutes_offset;
-
-            /* The utcoffset function should return a timedelta */
-            offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj);
-            if (offset == NULL) {
-                Py_DECREF(tmp);
-                return -1;
-            }
-            Py_DECREF(tmp);
-
             /*
              * The timedelta should have a function "total_seconds"
              * which contains the value we want.
              */
             tmp = PyObject_CallMethod(offset, "total_seconds", "");
+            Py_DECREF(offset);
             if (tmp == NULL) {
                 return -1;
             }
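
For intuition, a Python rendering of the contract documented on extract_utc_offset; the function below is illustrative only. The C version additionally returns NULL when the utcoffset() call itself fails.

    from datetime import datetime, timedelta, timezone

    def extract_utc_offset_py(dt: datetime):
        # tz-naive -> None (Py_None in C): the caller skips the UTC conversion
        if dt.tzinfo is None:
            return None
        # tz-aware -> a timedelta; the datetimestruct is shifted by this offset to get UTC
        return dt.utcoffset()

    print(extract_utc_offset_py(datetime(2013, 1, 10, 5)))  # None
    print(extract_utc_offset_py(datetime(2013, 1, 10, 5, tzinfo=timezone(timedelta(hours=-5)))))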

pandas/_libs/tslibs/src/datetime/np_datetime.h (+2)

@@ -48,6 +48,8 @@ extern const npy_datetimestruct _M_MAX_DTS;
 // stuff pandas needs
 // ----------------------------------------------------------------------------
 
+PyObject *extract_utc_offset(PyObject *obj);
+
 int convert_pydatetime_to_datetimestruct(PyObject *dtobj,
                                          npy_datetimestruct *out);
 

pandas/_libs/tslibs/src/datetime/np_datetime_strings.c (+8 -7)

@@ -632,7 +632,7 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) {
 * string was too short).
 */
 int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
-                           NPY_DATETIMEUNIT base) {
+                           int utc, NPY_DATETIMEUNIT base) {
     char *substr = outstr;
     int sublen = outlen;
     int tmplen;

@@ -911,13 +911,14 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
 
 add_time_zone:
     /* UTC "Zulu" time */
-    if (sublen < 1) {
-        goto string_too_short;
+    if (utc) {
+        if (sublen < 1) {
+            goto string_too_short;
+        }
+        substr[0] = 'Z';
+        substr += 1;
+        sublen -= 1;
     }
-    substr[0] = 'Z';
-    substr += 1;
-    sublen -= 1;
-
     /* Add a NULL terminator, and return */
     if (sublen > 0) {
         substr[0] = '\0';
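
The net effect of the new utc flag, sketched in Python (the helper is illustrative, not pandas API): the ISO 8601 "Z" designator is only appended when the value has been normalized to UTC.

    def append_zulu(formatted: str, utc: bool) -> str:
        # mirrors the guarded block above: only UTC values get the "Z" suffix
        return formatted + "Z" if utc else formatted

    print(append_zulu("2013-01-10T05:00:00.000", utc=False))  # tz-naive: no suffix
    print(append_zulu("2013-01-10T05:00:00.000", utc=True))   # UTC: trailing "Z"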

pandas/_libs/tslibs/src/datetime/np_datetime_strings.h (+1 -1)

@@ -79,7 +79,7 @@ get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base);
 */
 int
 make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
-                       NPY_DATETIMEUNIT base);
+                       int utc, NPY_DATETIMEUNIT base);
 
 /*
 * Converts an pandas_timedeltastruct to an ISO 8601 string.

pandas/core/internals/blocks.py (+6 -2)

@@ -1974,8 +1974,6 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock):
     values: DatetimeArray | TimedeltaArray
 
     def values_for_json(self) -> np.ndarray:
-        # special casing datetimetz to avoid conversion through
-        # object dtype
         return self.values._ndarray
 
 

@@ -1989,6 +1987,12 @@ class DatetimeTZBlock(DatetimeLikeBlock):
     _validate_ndim = True
     _can_consolidate = False
 
+    def values_for_json(self) -> np.ndarray:
+        # force dt64tz to go through object dtype
+        # tz info will be lost when converting to
+        # dt64 which is naive
+        return self.values.astype(object)
+
 
 class ObjectBlock(NumpyBlock):
     __slots__ = ()
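
A sketch of what the two values_for_json implementations now hand to the JSON writer, expressed with DatetimeArray rather than Block internals; the attribute access mirrors the diff above.

    import pandas as pd

    naive = pd.array(pd.date_range("2020-12-28", periods=2))            # DatetimeArray, tz-naive
    aware = pd.array(pd.date_range("2020-12-28", periods=2, tz="UTC"))  # DatetimeArray, tz-aware

    print(naive._ndarray.dtype)        # datetime64[ns]: serialized without "Z"
    print(aware.astype(object).dtype)  # object (Timestamps): tzinfo reaches the writer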

pandas/tests/io/json/test_json_table_schema.py (+7 -7)

@@ -301,7 +301,7 @@ def test_to_json(self, df_table):
                 ("idx", 0),
                 ("A", 1),
                 ("B", "a"),
-                ("C", "2016-01-01T00:00:00.000Z"),
+                ("C", "2016-01-01T00:00:00.000"),
                 ("D", "P0DT1H0M0S"),
                 ("E", "a"),
                 ("F", "a"),

@@ -314,7 +314,7 @@ def test_to_json(self, df_table):
                 ("idx", 1),
                 ("A", 2),
                 ("B", "b"),
-                ("C", "2016-01-02T00:00:00.000Z"),
+                ("C", "2016-01-02T00:00:00.000"),
                 ("D", "P0DT1H1M0S"),
                 ("E", "b"),
                 ("F", "b"),

@@ -327,7 +327,7 @@ def test_to_json(self, df_table):
                 ("idx", 2),
                 ("A", 3),
                 ("B", "c"),
-                ("C", "2016-01-03T00:00:00.000Z"),
+                ("C", "2016-01-03T00:00:00.000"),
                 ("D", "P0DT1H2M0S"),
                 ("E", "c"),
                 ("F", "c"),

@@ -340,7 +340,7 @@ def test_to_json(self, df_table):
                 ("idx", 3),
                 ("A", 4),
                 ("B", "c"),
-                ("C", "2016-01-04T00:00:00.000Z"),
+                ("C", "2016-01-04T00:00:00.000"),
                 ("D", "P0DT1H3M0S"),
                 ("E", "c"),
                 ("F", "c"),

@@ -397,8 +397,8 @@ def test_to_json_period_index(self):
 
         schema = {"fields": fields, "primaryKey": ["index"]}
         data = [
-            OrderedDict([("index", "2015-11-01T00:00:00.000Z"), ("values", 1)]),
-            OrderedDict([("index", "2016-02-01T00:00:00.000Z"), ("values", 1)]),
+            OrderedDict([("index", "2015-11-01T00:00:00.000"), ("values", 1)]),
+            OrderedDict([("index", "2016-02-01T00:00:00.000"), ("values", 1)]),
         ]
         expected = OrderedDict([("schema", schema), ("data", data)])
 

@@ -635,7 +635,7 @@ def test_timestamp_in_columns(self):
         )
         result = df.to_json(orient="table")
         js = json.loads(result)
-        assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000Z"
+        assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000"
         assert js["schema"]["fields"][2]["name"] == "P0DT0H0M10S"
 
     @pytest.mark.parametrize(

pandas/tests/io/json/test_json_table_schema_ext_dtype.py (+2 -2)

@@ -145,7 +145,7 @@ def test_build_date_series(self):
         expected = OrderedDict(
             [
                 ("schema", schema),
-                ("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000Z")])]),
+                ("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000")])]),
             ]
         )
 

@@ -250,7 +250,7 @@ def test_to_json(self):
             OrderedDict(
                 [
                     ("idx", 0),
-                    ("A", "2021-10-10T00:00:00.000Z"),
+                    ("A", "2021-10-10T00:00:00.000"),
                     ("B", 10.0),
                     ("C", "pandas"),
                     ("D", 10),
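
A quick check (a sketch, using a plain datetime64 column rather than the extension dtype exercised in this test file) of the table-schema expectation changes: tz-naive values no longer carry a "Z" suffix in orient="table" output.

    import json
    import pandas as pd

    df = pd.DataFrame({"A": pd.to_datetime(["2021-10-10"])})
    payload = json.loads(df.to_json(orient="table"))
    print(payload["data"][0]["A"])  # "2021-10-10T00:00:00.000" (previously ended in "Z")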

pandas/tests/io/json/test_pandas.py (+32 -5)

@@ -822,7 +822,7 @@ def test_date_index_and_values(self, date_format, as_object, date_typ):
             expected = '{"1577836800000":1577836800000,"null":null}'
         else:
             expected = (
-                '{"2020-01-01T00:00:00.000Z":"2020-01-01T00:00:00.000Z","null":null}'
+                '{"2020-01-01T00:00:00.000":"2020-01-01T00:00:00.000","null":null}'
             )
 
         if as_object:

@@ -875,8 +875,6 @@ def test_date_format_frame(self, date, date_unit, datetime_frame):
         json = df.to_json(date_format="iso")
         result = read_json(json)
         expected = df.copy()
-        expected.index = expected.index.tz_localize("UTC")
-        expected["date"] = expected["date"].dt.tz_localize("UTC")
         tm.assert_frame_equal(result, expected)
 
     def test_date_format_frame_raises(self, datetime_frame):

@@ -905,8 +903,6 @@ def test_date_format_series(self, date, date_unit, datetime_series):
         json = ts.to_json(date_format="iso")
         result = read_json(json, typ="series")
         expected = ts.copy()
-        expected.index = expected.index.tz_localize("UTC")
-        expected = expected.dt.tz_localize("UTC")
         tm.assert_series_equal(result, expected)
 
     def test_date_format_series_raises(self, datetime_series):

@@ -1192,6 +1188,16 @@ def test_tz_is_utc(self, ts):
         dt = ts.to_pydatetime()
         assert dumps(dt, iso_dates=True) == exp
 
+    def test_tz_is_naive(self):
+        from pandas.io.json import dumps
+
+        ts = Timestamp("2013-01-10 05:00:00")
+        exp = '"2013-01-10T05:00:00.000"'
+
+        assert dumps(ts, iso_dates=True) == exp
+        dt = ts.to_pydatetime()
+        assert dumps(dt, iso_dates=True) == exp
+
     @pytest.mark.parametrize(
         "tz_range",
         [

@@ -1212,10 +1218,31 @@ def test_tz_range_is_utc(self, tz_range):
 
         assert dumps(tz_range, iso_dates=True) == exp
         dti = DatetimeIndex(tz_range)
+        # Ensure datetimes in object array are serialized correctly
+        # in addition to the normal DTI case
+        assert dumps(dti, iso_dates=True) == exp
+        assert dumps(dti.astype(object), iso_dates=True) == exp
+        df = DataFrame({"DT": dti})
+        result = dumps(df, iso_dates=True)
+        assert result == dfexp
+        assert dumps(df.astype({"DT": object}), iso_dates=True)
+
+    def test_tz_range_is_naive(self):
+        from pandas.io.json import dumps
+
+        dti = pd.date_range("2013-01-01 05:00:00", periods=2)
+
+        exp = '["2013-01-01T05:00:00.000","2013-01-02T05:00:00.000"]'
+        dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000","1":"2013-01-02T05:00:00.000"}}'
+
+        # Ensure datetimes in object array are serialized correctly
+        # in addition to the normal DTI case
         assert dumps(dti, iso_dates=True) == exp
+        assert dumps(dti.astype(object), iso_dates=True) == exp
         df = DataFrame({"DT": dti})
         result = dumps(df, iso_dates=True)
         assert result == dfexp
+        assert dumps(df.astype({"DT": object}), iso_dates=True)
 
     def test_read_inline_jsonl(self):
         # GH9180
