Skip to content

Commit e788c25

Browse files
Merge remote-tracking branch 'upstream/main' into td64-tests
2 parents d53d1ce + d5cf2b8 commit e788c25

File tree

13 files changed

+169
-51
lines changed

13 files changed

+169
-51
lines changed

doc/source/whatsnew/v1.5.0.rst

+40-3
Original file line numberDiff line numberDiff line change
@@ -194,10 +194,47 @@ did not have the same index as the input.
194194
df.groupby('a', dropna=True).transform('ffill')
195195
df.groupby('a', dropna=True).transform(lambda x: x)
196196
197-
.. _whatsnew_150.notable_bug_fixes.notable_bug_fix2:
197+
.. _whatsnew_150.notable_bug_fixes.to_json_incorrectly_localizing_naive_timestamps:
198198

199-
notable_bug_fix2
200-
^^^^^^^^^^^^^^^^
199+
Serializing tz-naive Timestamps with to_json() with ``iso_dates=True``
200+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
201+
202+
:meth:`DataFrame.to_json`, :meth:`Series.to_json`, and :meth:`Index.to_json`
203+
would incorrectly localize DatetimeArrays/DatetimeIndexes with tz-naive Timestamps
204+
to UTC. (:issue:`38760`)
205+
206+
Note that this patch does not fix the localization of tz-aware Timestamps to UTC
207+
upon serialization. (Related issue :issue:`12997`)
208+
209+
*Old Behavior*
210+
211+
.. ipython:: python
212+
213+
index = pd.date_range(
214+
start='2020-12-28 00:00:00',
215+
end='2020-12-28 02:00:00',
216+
freq='1H',
217+
)
218+
a = pd.Series(
219+
data=range(3),
220+
index=index,
221+
)
222+
223+
.. code-block:: ipython
224+
225+
In [4]: a.to_json(date_format='iso')
226+
Out[4]: '{"2020-12-28T00:00:00.000Z":0,"2020-12-28T01:00:00.000Z":1,"2020-12-28T02:00:00.000Z":2}'
227+
228+
In [5]: pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index
229+
Out[5]: array([False, False, False])
230+
231+
*New Behavior*
232+
233+
.. ipython:: python
234+
235+
a.to_json(date_format='iso')
236+
# Roundtripping now works
237+
pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index
201238
202239
.. ---------------------------------------------------------------------------
203240
.. _whatsnew_150.api_breaking:

pandas/_libs/src/ujson/python/date_conversions.c

+15-3
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,8 @@ char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
5454
PyErr_NoMemory();
5555
return NULL;
5656
}
57-
58-
ret_code = make_iso_8601_datetime(&dts, result, *len, base);
57+
// datetime64 is always naive
58+
ret_code = make_iso_8601_datetime(&dts, result, *len, 0, base);
5959
if (ret_code != 0) {
6060
PyErr_SetString(PyExc_ValueError,
6161
"Could not convert datetime value to string");
@@ -90,7 +90,19 @@ char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base,
9090

9191
*len = (size_t)get_datetime_iso_8601_strlen(0, base);
9292
char *result = PyObject_Malloc(*len);
93-
ret = make_iso_8601_datetime(&dts, result, *len, base);
93+
// Check to see if PyDateTime has a timezone.
94+
// Don't convert to UTC if it doesn't.
95+
int is_tz_aware = 0;
96+
if (PyObject_HasAttrString(obj, "tzinfo")) {
97+
PyObject *offset = extract_utc_offset(obj);
98+
if (offset == NULL) {
99+
PyObject_Free(result);
100+
return NULL;
101+
}
102+
is_tz_aware = offset != Py_None;
103+
Py_DECREF(offset);
104+
}
105+
ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base);
94106

95107
if (ret != 0) {
96108
PyErr_SetString(PyExc_ValueError,

pandas/_libs/src/ujson/python/objToJSON.c

+11-1
Original file line numberDiff line numberDiff line change
@@ -221,8 +221,18 @@ static PyObject *get_values(PyObject *obj) {
221221
// The special cases to worry about are dt64tz and category[dt64tz].
222222
// In both cases we want the UTC-localized datetime64 ndarray,
223223
// without going through an object array of Timestamps.
224+
if (PyObject_HasAttrString(obj, "tz")) {
225+
PyObject *tz = PyObject_GetAttrString(obj, "tz");
226+
if (tz != Py_None) {
227+
// Go through object array if we have dt64tz, since tz info will
228+
// be lost if values is used directly.
229+
Py_DECREF(tz);
230+
values = PyObject_CallMethod(obj, "__array__", NULL);
231+
return values;
232+
}
233+
Py_DECREF(tz);
234+
}
224235
values = PyObject_GetAttrString(obj, "values");
225-
226236
if (values == NULL) {
227237
// Clear so we can subsequently try another method
228238
PyErr_Clear();

pandas/_libs/tslibs/src/datetime/np_datetime.c

+34-19
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,31 @@ int cmp_npy_datetimestruct(const npy_datetimestruct *a,
331331

332332
return 0;
333333
}
334+
/*
335+
* Returns the offset from utc of the timezone as a timedelta.
336+
* The caller is responsible for ensuring that the tzinfo
337+
* attribute exists on the datetime object.
338+
*
339+
* If the passed object is timezone naive, Py_None is returned.
340+
* If extraction of the offset fails, NULL is returned.
341+
*
342+
* NOTE: This function is not vendored from numpy.
343+
*/
344+
PyObject *extract_utc_offset(PyObject *obj) {
345+
PyObject *tmp = PyObject_GetAttrString(obj, "tzinfo");
346+
if (tmp == NULL) {
347+
return NULL;
348+
}
349+
if (tmp != Py_None) {
350+
PyObject *offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj);
351+
if (offset == NULL) {
352+
Py_DECREF(tmp);
353+
return NULL;
354+
}
355+
return offset;
356+
}
357+
return tmp;
358+
}
334359

335360
/*
336361
*
@@ -376,32 +401,22 @@ int convert_pydatetime_to_datetimestruct(PyObject *dtobj,
376401
out->sec = PyLong_AsLong(PyObject_GetAttrString(obj, "second"));
377402
out->us = PyLong_AsLong(PyObject_GetAttrString(obj, "microsecond"));
378403

379-
/* Apply the time zone offset if datetime obj is tz-aware */
380-
if (PyObject_HasAttrString((PyObject*)obj, "tzinfo")) {
381-
tmp = PyObject_GetAttrString(obj, "tzinfo");
382-
if (tmp == NULL) {
383-
return -1;
384-
}
385-
if (tmp == Py_None) {
386-
Py_DECREF(tmp);
387-
} else {
388-
PyObject *offset;
404+
if (PyObject_HasAttrString(obj, "tzinfo")) {
405+
PyObject *offset = extract_utc_offset(obj);
406+
/* Apply the time zone offset if datetime obj is tz-aware */
407+
if (offset != NULL) {
408+
if (offset == Py_None) {
409+
Py_DECREF(offset);
410+
return 0;
411+
}
389412
PyObject *tmp_int;
390413
int seconds_offset, minutes_offset;
391-
392-
/* The utcoffset function should return a timedelta */
393-
offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj);
394-
if (offset == NULL) {
395-
Py_DECREF(tmp);
396-
return -1;
397-
}
398-
Py_DECREF(tmp);
399-
400414
/*
401415
* The timedelta should have a function "total_seconds"
402416
* which contains the value we want.
403417
*/
404418
tmp = PyObject_CallMethod(offset, "total_seconds", "");
419+
Py_DECREF(offset);
405420
if (tmp == NULL) {
406421
return -1;
407422
}

pandas/_libs/tslibs/src/datetime/np_datetime.h

+2
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ extern const npy_datetimestruct _M_MAX_DTS;
4848
// stuff pandas needs
4949
// ----------------------------------------------------------------------------
5050

51+
PyObject *extract_utc_offset(PyObject *obj);
52+
5153
int convert_pydatetime_to_datetimestruct(PyObject *dtobj,
5254
npy_datetimestruct *out);
5355

pandas/_libs/tslibs/src/datetime/np_datetime_strings.c

+8-7
Original file line numberDiff line numberDiff line change
@@ -632,7 +632,7 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) {
632632
* string was too short).
633633
*/
634634
int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
635-
NPY_DATETIMEUNIT base) {
635+
int utc, NPY_DATETIMEUNIT base) {
636636
char *substr = outstr;
637637
int sublen = outlen;
638638
int tmplen;
@@ -911,13 +911,14 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
911911

912912
add_time_zone:
913913
/* UTC "Zulu" time */
914-
if (sublen < 1) {
915-
goto string_too_short;
914+
if (utc) {
915+
if (sublen < 1) {
916+
goto string_too_short;
917+
}
918+
substr[0] = 'Z';
919+
substr += 1;
920+
sublen -= 1;
916921
}
917-
substr[0] = 'Z';
918-
substr += 1;
919-
sublen -= 1;
920-
921922
/* Add a NULL terminator, and return */
922923
if (sublen > 0) {
923924
substr[0] = '\0';

pandas/_libs/tslibs/src/datetime/np_datetime_strings.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base);
7979
*/
8080
int
8181
make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
82-
NPY_DATETIMEUNIT base);
82+
int utc, NPY_DATETIMEUNIT base);
8383

8484
/*
8585
* Converts a pandas_timedeltastruct to an ISO 8601 string.

pandas/core/generic.py

+4
Original file line numberDiff line numberDiff line change
@@ -2599,6 +2599,8 @@ def to_hdf(
25992599
like searching / selecting subsets of the data.
26002600
- If None, pd.get_option('io.hdf.default_format') is checked,
26012601
followed by fallback to "fixed".
2602+
index : bool, default True
2603+
Write DataFrame index as a column.
26022604
errors : str, default 'strict'
26032605
Specifies how encoding and decoding errors are to be handled.
26042606
See the errors argument for :func:`open` for a full list
@@ -2609,6 +2611,8 @@ def to_hdf(
26092611
nan_rep : Any, optional
26102612
How to represent null values as str.
26112613
Not allowed with append=True.
2614+
dropna : bool, default False, optional
2615+
Remove missing values.
26122616
data_columns : list of columns or True, optional
26132617
List of columns to create as indexed data columns for on-disk
26142618
queries, or True to use all columns. By default only the axes

pandas/core/internals/blocks.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -1974,8 +1974,6 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock):
19741974
values: DatetimeArray | TimedeltaArray
19751975

19761976
def values_for_json(self) -> np.ndarray:
1977-
# special casing datetimetz to avoid conversion through
1978-
# object dtype
19791977
return self.values._ndarray
19801978

19811979

@@ -1989,6 +1987,12 @@ class DatetimeTZBlock(DatetimeLikeBlock):
19891987
_validate_ndim = True
19901988
_can_consolidate = False
19911989

1990+
def values_for_json(self) -> np.ndarray:
1991+
# force dt64tz to go through object dtype
1992+
# tz info will be lost when converting to
1993+
# dt64 which is naive
1994+
return self.values.astype(object)
1995+
19921996

19931997
class ObjectBlock(NumpyBlock):
19941998
__slots__ = ()

pandas/io/pytables.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -1112,6 +1112,8 @@ def put(
11121112
Table format. Write as a PyTables Table structure which may perform
11131113
worse but allow more flexible operations like searching / selecting
11141114
subsets of the data.
1115+
index : bool, default True
1116+
Write DataFrame index as a column.
11151117
append : bool, default False
11161118
This will force Table format, append the input data to the existing.
11171119
data_columns : list of columns or True, default None
@@ -1124,6 +1126,8 @@ def put(
11241126
Parameter is propagated to 'create_table' method of 'PyTables'.
11251127
If set to False it enables to have the same h5 files (same hashes)
11261128
independent on creation time.
1129+
dropna : bool, default False, optional
1130+
Remove missing values.
11271131
11281132
.. versionadded:: 1.1.0
11291133
"""
@@ -1239,6 +1243,8 @@ def append(
12391243
Table format. Write as a PyTables Table structure which may perform
12401244
worse but allow more flexible operations like searching / selecting
12411245
subsets of the data.
1246+
index : bool, default True
1247+
Write DataFrame index as a column.
12421248
append : bool, default True
12431249
Append the input data to the existing.
12441250
data_columns : list of columns, or True, default None
@@ -1251,7 +1257,7 @@ def append(
12511257
chunksize : size to chunk the writing
12521258
expectedrows : expected TOTAL row size of this table
12531259
encoding : default None, provide an encoding for str
1254-
dropna : bool, default False
1260+
dropna : bool, default False, optional
12551261
Do not write an ALL nan row to the store settable
12561262
by the option 'io.hdf.dropna_table'.
12571263

pandas/tests/io/json/test_json_table_schema.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,7 @@ def test_to_json(self, df_table):
301301
("idx", 0),
302302
("A", 1),
303303
("B", "a"),
304-
("C", "2016-01-01T00:00:00.000Z"),
304+
("C", "2016-01-01T00:00:00.000"),
305305
("D", "P0DT1H0M0S"),
306306
("E", "a"),
307307
("F", "a"),
@@ -314,7 +314,7 @@ def test_to_json(self, df_table):
314314
("idx", 1),
315315
("A", 2),
316316
("B", "b"),
317-
("C", "2016-01-02T00:00:00.000Z"),
317+
("C", "2016-01-02T00:00:00.000"),
318318
("D", "P0DT1H1M0S"),
319319
("E", "b"),
320320
("F", "b"),
@@ -327,7 +327,7 @@ def test_to_json(self, df_table):
327327
("idx", 2),
328328
("A", 3),
329329
("B", "c"),
330-
("C", "2016-01-03T00:00:00.000Z"),
330+
("C", "2016-01-03T00:00:00.000"),
331331
("D", "P0DT1H2M0S"),
332332
("E", "c"),
333333
("F", "c"),
@@ -340,7 +340,7 @@ def test_to_json(self, df_table):
340340
("idx", 3),
341341
("A", 4),
342342
("B", "c"),
343-
("C", "2016-01-04T00:00:00.000Z"),
343+
("C", "2016-01-04T00:00:00.000"),
344344
("D", "P0DT1H3M0S"),
345345
("E", "c"),
346346
("F", "c"),
@@ -397,8 +397,8 @@ def test_to_json_period_index(self):
397397

398398
schema = {"fields": fields, "primaryKey": ["index"]}
399399
data = [
400-
OrderedDict([("index", "2015-11-01T00:00:00.000Z"), ("values", 1)]),
401-
OrderedDict([("index", "2016-02-01T00:00:00.000Z"), ("values", 1)]),
400+
OrderedDict([("index", "2015-11-01T00:00:00.000"), ("values", 1)]),
401+
OrderedDict([("index", "2016-02-01T00:00:00.000"), ("values", 1)]),
402402
]
403403
expected = OrderedDict([("schema", schema), ("data", data)])
404404

@@ -635,7 +635,7 @@ def test_timestamp_in_columns(self):
635635
)
636636
result = df.to_json(orient="table")
637637
js = json.loads(result)
638-
assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000Z"
638+
assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000"
639639
assert js["schema"]["fields"][2]["name"] == "P0DT0H0M10S"
640640

641641
@pytest.mark.parametrize(

pandas/tests/io/json/test_json_table_schema_ext_dtype.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ def test_build_date_series(self):
145145
expected = OrderedDict(
146146
[
147147
("schema", schema),
148-
("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000Z")])]),
148+
("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000")])]),
149149
]
150150
)
151151

@@ -250,7 +250,7 @@ def test_to_json(self):
250250
OrderedDict(
251251
[
252252
("idx", 0),
253-
("A", "2021-10-10T00:00:00.000Z"),
253+
("A", "2021-10-10T00:00:00.000"),
254254
("B", 10.0),
255255
("C", "pandas"),
256256
("D", 10),

0 commit comments

Comments
 (0)