BUG: to_json incorrectly localizes tz-naive datetimes to UTC #46730

Merged: 13 commits, May 4, 2022
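In short: tz-naive datetimes were being serialized with a trailing "Z" (UTC) suffix. A minimal sketch of the before/after behavior this PR targets (expected outputs inferred from the tests below, not re-run here):

```python
import pandas as pd

# tz-naive input: before this fix, serialized as "...T05:00:00.000Z"
naive = pd.Series(pd.to_datetime(["2013-01-10 05:00:00"]))
print(naive.to_json(date_format="iso"))  # {"0":"2013-01-10T05:00:00.000"}

# tz-aware input: still converted to UTC and suffixed with "Z"
aware = naive.dt.tz_localize("US/Eastern")
print(aware.to_json(date_format="iso"))  # {"0":"2013-01-10T10:00:00.000Z"}
```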
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
@@ -563,6 +563,7 @@ I/O
- Bug in Parquet roundtrip for Interval dtype with ``datetime64[ns]`` subtype (:issue:`45881`)
- Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`)
- Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`)
+- Bug in :meth:`DataFrame.to_json`, :meth:`Series.to_json`, and :meth:`Index.to_json` where tz-naive datetimes were being incorrectly localized to UTC (:issue:`38760`)

Period
^^^^^^
14 changes: 11 additions & 3 deletions pandas/_libs/src/ujson/python/date_conversions.c
@@ -54,8 +54,8 @@ char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
PyErr_NoMemory();
return NULL;
}

-    ret_code = make_iso_8601_datetime(&dts, result, *len, base);
+    ret_code = make_iso_8601_datetime(&dts, result, *len,
+                                      0 /* datetime64 is always naive */, base);
if (ret_code != 0) {
PyErr_SetString(PyExc_ValueError,
"Could not convert datetime value to string");
@@ -90,7 +90,15 @@ char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base,

*len = (size_t)get_datetime_iso_8601_strlen(0, base);
char *result = PyObject_Malloc(*len);
-    ret = make_iso_8601_datetime(&dts, result, *len, base);
+    // Check to see if PyDateTime has a timezone.
+    // Don't convert to UTC if it doesn't.
+    int is_tz_aware = 0;
+    PyObject *offset = extract_utc_offset(obj);
+    if (offset != NULL) {
+        is_tz_aware = 1;
+        Py_DECREF(offset);
+    }
+    ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base);

if (ret != 0) {
PyErr_SetString(PyExc_ValueError,
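The awareness check above follows the standard Python datetime rule: an object is aware only if `tzinfo` is set and `utcoffset()` returns a non-None value. A rough Python analogue of the C logic (the helper name is illustrative):

```python
from datetime import datetime, timezone

def is_tz_aware(dt: datetime) -> bool:
    # Same rule the C code applies via extract_utc_offset():
    # tzinfo must exist and utcoffset() must return a value.
    return dt.tzinfo is not None and dt.tzinfo.utcoffset(dt) is not None

print(is_tz_aware(datetime(2013, 1, 10, 5)))                       # False
print(is_tz_aware(datetime(2013, 1, 10, 5, tzinfo=timezone.utc)))  # True
```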
12 changes: 11 additions & 1 deletion pandas/_libs/src/ujson/python/objToJSON.c
@@ -221,8 +221,18 @@ static PyObject *get_values(PyObject *obj) {
// The special cases to worry about are dt64tz and category[dt64tz].
// In both cases we want the UTC-localized datetime64 ndarray,
// without going through and object array of Timestamps.
Member Author:

@WillAyd In my current (very rough) implementation of this, I've decided to force an array of Timestamps (not sure what the perf/behavior implications of this would be).

Alternatively, I could try to just use the dt64 ndarray and grab the tz of the DTA (as the tz should be the same). Then I can probably store this inside the JSONTypeContext and pass it through to the int64ToIso function. This muddles up the code a little, since numpy dt64 objects are supposed to be tz-naive and PyDateTimeToIso should really be handling this case.

Do you have a preference for one way or the other?
(also cc @jbrockmendel, who may have an opinion)

Member:

I would avoid adding anything to JSONTypeContext, as it's already way too overloaded. Is this not something we can just manage in the serializer?

Working with an ndarray should be much faster. However, there are also cases where we need to handle datetimes inside of an object array, so we need to be robust in how we manage that (you might want to add a test for the latter).

Member Author:

I added the test. Can you elaborate on how to handle this in the serializer? I don't see anywhere else I could store the tz info other than in the TypeContext.

I guess the way it is handled now is fine, but it would be nice to avoid the performance hit from casting to object.

Member:

You might get away with storing this information in the NpyArrContext, since that already has the type number from NumPy. My hesitation with TypeContext is that it is already heavily overloaded for different types, so it's difficult to debug and figure out the intent of it within various contexts.

+        if (PyObject_HasAttrString(obj, "tz")) {
+            PyObject *tz = PyObject_GetAttrString(obj, "tz");
+            if (tz != Py_None) {
+                // Go through object array if we have dt64tz, since tz info will
+                // be lost if values is used directly.
+                Py_DECREF(tz);
+                values = PyObject_CallMethod(obj, "__array__", NULL);
+                return values;
+            }
+            Py_DECREF(tz);
+        }
values = PyObject_GetAttrString(obj, "values");

if (values == NULL) {
// Clear so we can subsequently try another method
PyErr_Clear();
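The reason for the detour through `__array__`: on tz-aware data, `.values` hands back a naive `datetime64[ns]` array already converted to UTC, while `__array__()` produces an object array of tz-aware `Timestamp` objects. A sketch of the difference (outputs are what recent pandas versions produce, not re-verified here):

```python
import pandas as pd

ser = pd.Series(pd.date_range("2013-01-01 05:00", periods=1, tz="US/Eastern"))

# .values drops the tz: naive datetime64[ns], already converted to UTC
print(ser.values)       # ['2013-01-01T10:00:00.000000000']

# __array__() keeps tz-aware Timestamps in an object-dtype array
print(ser.__array__())  # [Timestamp('2013-01-01 05:00:00-0500', tz='US/Eastern')]
```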
90 changes: 51 additions & 39 deletions pandas/_libs/tslibs/src/datetime/np_datetime.c
@@ -331,6 +331,34 @@ int cmp_npy_datetimestruct(const npy_datetimestruct *a,

return 0;
}
+/*
+ * Returns the offset from utc of the timezone.
+ * If the passed object is timezone naive, or if extraction
+ * of the offset fails, NULL is returned.
+ *
+ * NOTE: This function is not vendored from numpy.
+ */
+PyObject *extract_utc_offset(PyObject *obj) {
+    if (PyObject_HasAttrString(obj, "tzinfo")) {
+        PyObject *tmp = PyObject_GetAttrString(obj, "tzinfo");
+        if (tmp == NULL) {
+            return NULL;
+        }
+        if (tmp != Py_None) {
+            PyObject *offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj);
+            if (offset == NULL) {
+                Py_DECREF(tmp);
+                return NULL;
+            }
+            if (offset != Py_None) {
+                Py_DECREF(tmp);
+                return offset;
+            }
+            Py_DECREF(offset);
+        }
+        Py_DECREF(tmp);
+    }
+    return NULL;
+}

/*
*
@@ -376,54 +404,38 @@ int convert_pydatetime_to_datetimestruct(PyObject *dtobj,
out->sec = PyLong_AsLong(PyObject_GetAttrString(obj, "second"));
out->us = PyLong_AsLong(PyObject_GetAttrString(obj, "microsecond"));

+    PyObject *offset = extract_utc_offset(obj);
/* Apply the time zone offset if datetime obj is tz-aware */
-    if (PyObject_HasAttrString((PyObject*)obj, "tzinfo")) {
-        tmp = PyObject_GetAttrString(obj, "tzinfo");
-        if (tmp == NULL) {
-            return -1;
-        }
-        if (tmp == Py_None) {
-            Py_DECREF(tmp);
-        } else {
-            PyObject *offset;
-            PyObject *tmp_int;
-            int seconds_offset, minutes_offset;
-
-            /* The utcoffset function should return a timedelta */
-            offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj);
-            if (offset == NULL) {
-                Py_DECREF(tmp);
-                return -1;
-            }
-
-            /*
-             * The timedelta should have a function "total_seconds"
-             * which contains the value we want.
-             */
-            tmp = PyObject_CallMethod(offset, "total_seconds", "");
-            if (tmp == NULL) {
-                return -1;
-            }
-            tmp_int = PyNumber_Long(tmp);
-            if (tmp_int == NULL) {
-                Py_DECREF(tmp);
-                return -1;
-            }
-            seconds_offset = PyLong_AsLong(tmp_int);
-            if (seconds_offset == -1 && PyErr_Occurred()) {
-                Py_DECREF(tmp_int);
-                Py_DECREF(tmp);
-                return -1;
-            }
-            Py_DECREF(tmp_int);
-            Py_DECREF(tmp);
-
-            /* Convert to a minutes offset and apply it */
-            minutes_offset = seconds_offset / 60;
-
-            add_minutes_to_datetimestruct(out, -minutes_offset);
-        }
-    }
+    if (offset != NULL) {
+        PyObject *tmp_int;
+        int seconds_offset, minutes_offset;
+        /*
+         * The timedelta should have a function "total_seconds"
+         * which contains the value we want.
+         */
+        tmp = PyObject_CallMethod(offset, "total_seconds", "");
+        Py_DECREF(offset);
+        if (tmp == NULL) {
+            return -1;
+        }
+        tmp_int = PyNumber_Long(tmp);
+        if (tmp_int == NULL) {
+            Py_DECREF(tmp);
+            return -1;
+        }
+        seconds_offset = PyLong_AsLong(tmp_int);
+        if (seconds_offset == -1 && PyErr_Occurred()) {
+            Py_DECREF(tmp_int);
+            Py_DECREF(tmp);
+            return -1;
+        }
+        Py_DECREF(tmp_int);
+        Py_DECREF(tmp);
+
+        /* Convert to a minutes offset and apply it */
+        minutes_offset = seconds_offset / 60;
+
+        add_minutes_to_datetimestruct(out, -minutes_offset);
+    }

return 0;
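For readers more comfortable in Python, extract_utc_offset is roughly equivalent to the following sketch (the C version returns NULL where this returns None):

```python
from datetime import datetime, timedelta, timezone
from typing import Optional

def extract_utc_offset(obj: datetime) -> Optional[timedelta]:
    # None for naive datetimes, for a missing tzinfo, or when
    # tzinfo.utcoffset() itself returns None
    tzinfo = getattr(obj, "tzinfo", None)
    if tzinfo is None:
        return None
    return tzinfo.utcoffset(obj)

print(extract_utc_offset(datetime(2022, 5, 4)))                       # None
print(extract_utc_offset(datetime(2022, 5, 4, tzinfo=timezone.utc)))  # 0:00:00
```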
2 changes: 2 additions & 0 deletions pandas/_libs/tslibs/src/datetime/np_datetime.h
@@ -48,6 +48,8 @@ extern const npy_datetimestruct _M_MAX_DTS;
// stuff pandas needs
// ----------------------------------------------------------------------------

+PyObject *extract_utc_offset(PyObject *obj);
+
int convert_pydatetime_to_datetimestruct(PyObject *dtobj,
npy_datetimestruct *out);

15 changes: 8 additions & 7 deletions pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
@@ -605,7 +605,7 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) {
* string was too short).
*/
int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
-                           NPY_DATETIMEUNIT base) {
+                           int utc, NPY_DATETIMEUNIT base) {
char *substr = outstr;
int sublen = outlen;
int tmplen;
@@ -884,13 +884,14 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,

add_time_zone:
/* UTC "Zulu" time */
-    if (sublen < 1) {
-        goto string_too_short;
+    if (utc) {
+        if (sublen < 1) {
+            goto string_too_short;
+        }
+        substr[0] = 'Z';
+        substr += 1;
+        sublen -= 1;
     }
-    substr[0] = 'Z';
-    substr += 1;
-    sublen -= 1;

/* Add a NULL terminator, and return */
if (sublen > 0) {
substr[0] = '\0';
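The net effect of the new utc flag, sketched in Python (hypothetical helper; fixed millisecond precision assumed, matching to_json's default iso output):

```python
from datetime import datetime

def iso_8601(dt: datetime, utc: bool) -> str:
    # Emit the "Zulu" suffix only when the value is known to be UTC,
    # mirroring the gated block above
    out = f"{dt:%Y-%m-%dT%H:%M:%S}.{dt.microsecond // 1000:03d}"
    return (out + "Z") if utc else out

print(iso_8601(datetime(2013, 1, 10, 5), utc=False))  # 2013-01-10T05:00:00.000
print(iso_8601(datetime(2013, 1, 10, 5), utc=True))   # 2013-01-10T05:00:00.000Z
```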
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/src/datetime/np_datetime_strings.h
@@ -78,7 +78,7 @@ get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base);
*/
int
make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
-                       NPY_DATETIMEUNIT base);
+                       int utc, NPY_DATETIMEUNIT base);

/*
* Converts an pandas_timedeltastruct to an ISO 8601 string.
8 changes: 6 additions & 2 deletions pandas/core/internals/blocks.py
@@ -1974,8 +1974,6 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock):
values: DatetimeArray | TimedeltaArray

    def values_for_json(self) -> np.ndarray:
-        # special casing datetimetz to avoid conversion through
-        # object dtype
        return self.values._ndarray


@@ -1989,6 +1987,12 @@ class DatetimeTZBlock(DatetimeLikeBlock):
_validate_ndim = True
_can_consolidate = False

+    def values_for_json(self) -> np.ndarray:
+        # force dt64tz to go through object dtype
+        # tz info will be lost when converting to
+        # dt64 which is naive
+        return self.values.astype(object)
Member:

With this change, behavior is equivalent to just removing both this and DatetimeLikeBlock.values_for_json. Might mean a perf hit for dt64naive and td64, though.

Member Author:

Sorry, I'm not sure I follow here about dt64naive and td64. Don't those go through DatetimeLikeBlock instead of DatetimeTZBlock? (I didn't change any behavior there.)

Member:

Right. What I'm saying is that I think NDArrayBackedBlock.values_for_json works, so it no longer needs to be overridden.

Member Author:

I'll take care of this in a follow-up.


class ObjectBlock(NumpyBlock):
__slots__ = ()
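The DatetimeTZBlock override above mirrors the get_values change on the C side: the block's backing ndarray is tz-naive (values are stored in UTC), so only the object path preserves the zone. A quick sketch (DatetimeArray._ndarray is internal API; outputs not re-run here):

```python
import pandas as pd

arr = pd.array(pd.date_range("2013-01-01 05:00", periods=1, tz="US/Eastern"))

# the backing ndarray is naive datetime64[ns] in UTC; serializing it
# directly is what produced the spurious "Z" suffix
print(arr._ndarray)        # ['2013-01-01T10:00:00.000000000']

# astype(object) yields tz-aware Timestamps instead
print(arr.astype(object))  # [Timestamp('2013-01-01 05:00:00-0500', tz='US/Eastern')]
```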
14 changes: 7 additions & 7 deletions pandas/tests/io/json/test_json_table_schema.py
@@ -301,7 +301,7 @@ def test_to_json(self, df_table):
("idx", 0),
("A", 1),
("B", "a"),
("C", "2016-01-01T00:00:00.000Z"),
("C", "2016-01-01T00:00:00.000"),
("D", "P0DT1H0M0S"),
("E", "a"),
("F", "a"),
@@ -314,7 +314,7 @@
("idx", 1),
("A", 2),
("B", "b"),
("C", "2016-01-02T00:00:00.000Z"),
("C", "2016-01-02T00:00:00.000"),
("D", "P0DT1H1M0S"),
("E", "b"),
("F", "b"),
@@ -327,7 +327,7 @@
("idx", 2),
("A", 3),
("B", "c"),
("C", "2016-01-03T00:00:00.000Z"),
("C", "2016-01-03T00:00:00.000"),
("D", "P0DT1H2M0S"),
("E", "c"),
("F", "c"),
@@ -340,7 +340,7 @@
("idx", 3),
("A", 4),
("B", "c"),
("C", "2016-01-04T00:00:00.000Z"),
("C", "2016-01-04T00:00:00.000"),
("D", "P0DT1H3M0S"),
("E", "c"),
("F", "c"),
@@ -397,8 +397,8 @@ def test_to_json_period_index(self):

schema = {"fields": fields, "primaryKey": ["index"]}
data = [
OrderedDict([("index", "2015-11-01T00:00:00.000Z"), ("values", 1)]),
OrderedDict([("index", "2016-02-01T00:00:00.000Z"), ("values", 1)]),
OrderedDict([("index", "2015-11-01T00:00:00.000"), ("values", 1)]),
OrderedDict([("index", "2016-02-01T00:00:00.000"), ("values", 1)]),
]
expected = OrderedDict([("schema", schema), ("data", data)])

@@ -635,7 +635,7 @@ def test_timestamp_in_columns(self):
)
result = df.to_json(orient="table")
js = json.loads(result)
-assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000Z"
+assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000"
assert js["schema"]["fields"][2]["name"] == "P0DT0H0M10S"

@pytest.mark.parametrize(
4 changes: 2 additions & 2 deletions pandas/tests/io/json/test_json_table_schema_ext_dtype.py
@@ -151,7 +151,7 @@ def test_build_date_series(self):
expected = OrderedDict(
[
("schema", schema),
("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000Z")])]),
("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000")])]),
]
)

@@ -256,7 +256,7 @@ def test_to_json(self):
OrderedDict(
[
("idx", 0),
("A", "2021-10-10T00:00:00.000Z"),
("A", "2021-10-10T00:00:00.000"),
("B", 10.0),
("C", "pandas"),
("D", 10),
31 changes: 26 additions & 5 deletions pandas/tests/io/json/test_pandas.py
@@ -822,7 +822,7 @@ def test_date_index_and_values(self, date_format, as_object, date_typ):
expected = '{"1577836800000":1577836800000,"null":null}'
else:
expected = (
'{"2020-01-01T00:00:00.000Z":"2020-01-01T00:00:00.000Z","null":null}'
'{"2020-01-01T00:00:00.000":"2020-01-01T00:00:00.000","null":null}'
)

if as_object:
@@ -875,8 +875,6 @@ def test_date_format_frame(self, date, date_unit, datetime_frame):
json = df.to_json(date_format="iso")
result = read_json(json)
expected = df.copy()
-expected.index = expected.index.tz_localize("UTC")
-expected["date"] = expected["date"].dt.tz_localize("UTC")
tm.assert_frame_equal(result, expected)

def test_date_format_frame_raises(self, datetime_frame):
Expand Down Expand Up @@ -905,8 +903,6 @@ def test_date_format_series(self, date, date_unit, datetime_series):
json = ts.to_json(date_format="iso")
result = read_json(json, typ="series")
expected = ts.copy()
-expected.index = expected.index.tz_localize("UTC")
-expected = expected.dt.tz_localize("UTC")
tm.assert_series_equal(result, expected)

def test_date_format_series_raises(self, datetime_series):
@@ -1192,6 +1188,16 @@ def test_tz_is_utc(self, ts):
dt = ts.to_pydatetime()
assert dumps(dt, iso_dates=True) == exp

+    def test_tz_is_naive(self):
+        from pandas.io.json import dumps
+
+        ts = Timestamp("2013-01-10 05:00:00")
+        exp = '"2013-01-10T05:00:00.000"'
+
+        assert dumps(ts, iso_dates=True) == exp
+        dt = ts.to_pydatetime()
+        assert dumps(dt, iso_dates=True) == exp

@pytest.mark.parametrize(
"tz_range",
[
@@ -1217,6 +1223,21 @@ def test_tz_range_is_utc(self, tz_range):
result = dumps(df, iso_dates=True)
assert result == dfexp

+    def test_tz_range_is_naive(self):
+        from pandas.io.json import dumps
+
+        tz_range = pd.date_range("2013-01-01 05:00:00", periods=2)
+
+        exp = '["2013-01-01T05:00:00.000","2013-01-02T05:00:00.000"]'
+        dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000","1":"2013-01-02T05:00:00.000"}}'
+
+        assert dumps(tz_range, iso_dates=True) == exp
+        dti = DatetimeIndex(tz_range)
+        assert dumps(dti, iso_dates=True) == exp
+        df = DataFrame({"DT": dti})
+        result = dumps(df, iso_dates=True)
+        assert result == dfexp

def test_read_inline_jsonl(self):
# GH9180
result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)