Skip to content

Commit e788c25

Browse files
Merge remote-tracking branch 'upstream/main' into td64-tests
2 parents d53d1ce + d5cf2b8 commit e788c25

File tree

13 files changed

+169
-51
lines changed

13 files changed

+169
-51
lines changed

doc/source/whatsnew/v1.5.0.rst

+40-3
Original file line numberDiff line numberDiff line change
@@ -194,10 +194,47 @@ did not have the same index as the input.
194194
df.groupby('a', dropna=True).transform('ffill')
195195
df.groupby('a', dropna=True).transform(lambda x: x)
196196
197-
.. _whatsnew_150.notable_bug_fixes.notable_bug_fix2:
197+
.. _whatsnew_150.notable_bug_fixes.to_json_incorrectly_localizing_naive_timestamps:
198198

199-
notable_bug_fix2
200-
^^^^^^^^^^^^^^^^
199+
Serializing tz-naive Timestamps with to_json() with ``iso_dates=True``
200+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
201+
202+
:meth:`DataFrame.to_json`, :meth:`Series.to_json`, and :meth:`Index.to_json`
203+
would incorrectly localize DatetimeArrays/DatetimeIndexes with tz-naive Timestamps
204+
to UTC. (:issue:`38760`)
205+
206+
Note that this patch does not fix the localization of tz-aware Timestamps to UTC
207+
upon serialization. (Related issue :issue:`12997`)
208+
209+
*Old Behavior*
210+
211+
.. ipython:: python
212+
213+
index = pd.date_range(
214+
start='2020-12-28 00:00:00',
215+
end='2020-12-28 02:00:00',
216+
freq='1H',
217+
)
218+
a = pd.Series(
219+
data=range(3),
220+
index=index,
221+
)
222+
223+
.. code-block:: ipython
224+
225+
In [4]: a.to_json(date_format='iso')
226+
Out[4]: '{"2020-12-28T00:00:00.000Z":0,"2020-12-28T01:00:00.000Z":1,"2020-12-28T02:00:00.000Z":2}'
227+
228+
In [5]: pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index
229+
Out[5]: array([False, False, False])
230+
231+
*New Behavior*
232+
233+
.. ipython:: python
234+
235+
a.to_json(date_format='iso')
236+
# Roundtripping now works
237+
pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index
201238
202239
.. ---------------------------------------------------------------------------
203240
.. _whatsnew_150.api_breaking:

pandas/_libs/src/ujson/python/date_conversions.c

+15-3
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,8 @@ char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
5454
PyErr_NoMemory();
5555
return NULL;
5656
}
57-
58-
ret_code = make_iso_8601_datetime(&dts, result, *len, base);
57+
// datetime64 is always naive
58+
ret_code = make_iso_8601_datetime(&dts, result, *len, 0, base);
5959
if (ret_code != 0) {
6060
PyErr_SetString(PyExc_ValueError,
6161
"Could not convert datetime value to string");
@@ -90,7 +90,19 @@ char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base,
9090

9191
*len = (size_t)get_datetime_iso_8601_strlen(0, base);
9292
char *result = PyObject_Malloc(*len);
93-
ret = make_iso_8601_datetime(&dts, result, *len, base);
93+
// Check to see if PyDateTime has a timezone.
94+
// Don't convert to UTC if it doesn't.
95+
int is_tz_aware = 0;
96+
if (PyObject_HasAttrString(obj, "tzinfo")) {
97+
PyObject *offset = extract_utc_offset(obj);
98+
if (offset == NULL) {
99+
PyObject_Free(result);
100+
return NULL;
101+
}
102+
is_tz_aware = offset != Py_None;
103+
Py_DECREF(offset);
104+
}
105+
ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base);
94106

95107
if (ret != 0) {
96108
PyErr_SetString(PyExc_ValueError,

pandas/_libs/src/ujson/python/objToJSON.c

+11-1
Original file line numberDiff line numberDiff line change
@@ -221,8 +221,18 @@ static PyObject *get_values(PyObject *obj) {
221221
// The special cases to worry about are dt64tz and category[dt64tz].
222222
// In both cases we want the UTC-localized datetime64 ndarray,
223223
// without going through an object array of Timestamps.
224+
if (PyObject_HasAttrString(obj, "tz")) {
225+
PyObject *tz = PyObject_GetAttrString(obj, "tz");
226+
if (tz != Py_None) {
227+
// Go through object array if we have dt64tz, since tz info will
228+
// be lost if values is used directly.
229+
Py_DECREF(tz);
230+
values = PyObject_CallMethod(obj, "__array__", NULL);
231+
return values;
232+
}
233+
Py_DECREF(tz);
234+
}
224235
values = PyObject_GetAttrString(obj, "values");
225-
226236
if (values == NULL) {
227237
// Clear so we can subsequently try another method
228238
PyErr_Clear();

pandas/_libs/tslibs/src/datetime/np_datetime.c

+34-19
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,31 @@ int cmp_npy_datetimestruct(const npy_datetimestruct *a,
331331

332332
return 0;
333333
}
334+
/*
335+
* Returns the offset from utc of the timezone as a timedelta.
336+
* The caller is responsible for ensuring that the tzinfo
337+
* attribute exists on the datetime object.
338+
*
339+
* If the passed object is timezone naive, Py_None is returned.
340+
* If extraction of the offset fails, NULL is returned.
341+
*
342+
* NOTE: This function is not vendored from numpy.
343+
*/
344+
PyObject *extract_utc_offset(PyObject *obj) {
345+
PyObject *tmp = PyObject_GetAttrString(obj, "tzinfo");
346+
if (tmp == NULL) {
347+
return NULL;
348+
}
349+
if (tmp != Py_None) {
350+
PyObject *offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj);
351+
if (offset == NULL) {
352+
Py_DECREF(tmp);
353+
return NULL;
354+
}
355+
return offset;
356+
}
357+
return tmp;
358+
}
334359

335360
/*
336361
*
@@ -376,32 +401,22 @@ int convert_pydatetime_to_datetimestruct(PyObject *dtobj,
376401
out->sec = PyLong_AsLong(PyObject_GetAttrString(obj, "second"));
377402
out->us = PyLong_AsLong(PyObject_GetAttrString(obj, "microsecond"));
378403

379-
/* Apply the time zone offset if datetime obj is tz-aware */
380-
if (PyObject_HasAttrString((PyObject*)obj, "tzinfo")) {
381-
tmp = PyObject_GetAttrString(obj, "tzinfo");
382-
if (tmp == NULL) {
383-
return -1;
384-
}
385-
if (tmp == Py_None) {
386-
Py_DECREF(tmp);
387-
} else {
388-
PyObject *offset;
404+
if (PyObject_HasAttrString(obj, "tzinfo")) {
405+
PyObject *offset = extract_utc_offset(obj);
406+
/* Apply the time zone offset if datetime obj is tz-aware */
407+
if (offset != NULL) {
408+
if (offset == Py_None) {
409+
Py_DECREF(offset);
410+
return 0;
411+
}
389412
PyObject *tmp_int;
390413
int seconds_offset, minutes_offset;
391-
392-
/* The utcoffset function should return a timedelta */
393-
offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj);
394-
if (offset == NULL) {
395-
Py_DECREF(tmp);
396-
return -1;
397-
}
398-
Py_DECREF(tmp);
399-
400414
/*
401415
* The timedelta should have a function "total_seconds"
402416
* which contains the value we want.
403417
*/
404418
tmp = PyObject_CallMethod(offset, "total_seconds", "");
419+
Py_DECREF(offset);
405420
if (tmp == NULL) {
406421
return -1;
407422
}

pandas/_libs/tslibs/src/datetime/np_datetime.h

+2
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ extern const npy_datetimestruct _M_MAX_DTS;
4848
// stuff pandas needs
4949
// ----------------------------------------------------------------------------
5050

51+
PyObject *extract_utc_offset(PyObject *obj);
52+
5153
int convert_pydatetime_to_datetimestruct(PyObject *dtobj,
5254
npy_datetimestruct *out);
5355

pandas/_libs/tslibs/src/datetime/np_datetime_strings.c

+8-7
Original file line numberDiff line numberDiff line change
@@ -632,7 +632,7 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) {
632632
* string was too short).
633633
*/
634634
int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
635-
NPY_DATETIMEUNIT base) {
635+
int utc, NPY_DATETIMEUNIT base) {
636636
char *substr = outstr;
637637
int sublen = outlen;
638638
int tmplen;
@@ -911,13 +911,14 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
911911

912912
add_time_zone:
913913
/* UTC "Zulu" time */
914-
if (sublen < 1) {
915-
goto string_too_short;
914+
if (utc) {
915+
if (sublen < 1) {
916+
goto string_too_short;
917+
}
918+
substr[0] = 'Z';
919+
substr += 1;
920+
sublen -= 1;
916921
}
917-
substr[0] = 'Z';
918-
substr += 1;
919-
sublen -= 1;
920-
921922
/* Add a NULL terminator, and return */
922923
if (sublen > 0) {
923924
substr[0] = '\0';

pandas/_libs/tslibs/src/datetime/np_datetime_strings.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base);
7979
*/
8080
int
8181
make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
82-
NPY_DATETIMEUNIT base);
82+
int utc, NPY_DATETIMEUNIT base);
8383

8484
/*
8585
* Converts a pandas_timedeltastruct to an ISO 8601 string.

pandas/core/generic.py

+4
Original file line numberDiff line numberDiff line change
@@ -2599,6 +2599,8 @@ def to_hdf(
25992599
like searching / selecting subsets of the data.
26002600
- If None, pd.get_option('io.hdf.default_format') is checked,
26012601
followed by fallback to "fixed".
2602+
index : bool, default True
2603+
Write DataFrame index as a column.
26022604
errors : str, default 'strict'
26032605
Specifies how encoding and decoding errors are to be handled.
26042606
See the errors argument for :func:`open` for a full list
@@ -2609,6 +2611,8 @@ def to_hdf(
26092611
nan_rep : Any, optional
26102612
How to represent null values as str.
26112613
Not allowed with append=True.
2614+
dropna : bool, default False, optional
2615+
Remove missing values.
26122616
data_columns : list of columns or True, optional
26132617
List of columns to create as indexed data columns for on-disk
26142618
queries, or True to use all columns. By default only the axes

pandas/core/internals/blocks.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -1974,8 +1974,6 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock):
19741974
values: DatetimeArray | TimedeltaArray
19751975

19761976
def values_for_json(self) -> np.ndarray:
1977-
# special casing datetimetz to avoid conversion through
1978-
# object dtype
19791977
return self.values._ndarray
19801978

19811979

@@ -1989,6 +1987,12 @@ class DatetimeTZBlock(DatetimeLikeBlock):
19891987
_validate_ndim = True
19901988
_can_consolidate = False
19911989

1990+
def values_for_json(self) -> np.ndarray:
1991+
# force dt64tz to go through object dtype
1992+
# tz info will be lost when converting to
1993+
# dt64 which is naive
1994+
return self.values.astype(object)
1995+
19921996

19931997
class ObjectBlock(NumpyBlock):
19941998
__slots__ = ()

pandas/io/pytables.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -1112,6 +1112,8 @@ def put(
11121112
Table format. Write as a PyTables Table structure which may perform
11131113
worse but allow more flexible operations like searching / selecting
11141114
subsets of the data.
1115+
index : bool, default True
1116+
Write DataFrame index as a column.
11151117
append : bool, default False
11161118
This will force Table format, append the input data to the existing.
11171119
data_columns : list of columns or True, default None
@@ -1124,6 +1126,8 @@ def put(
11241126
Parameter is propagated to 'create_table' method of 'PyTables'.
11251127
If set to False it enables to have the same h5 files (same hashes)
11261128
independent on creation time.
1129+
dropna : bool, default False, optional
1130+
Remove missing values.
11271131
11281132
.. versionadded:: 1.1.0
11291133
"""
@@ -1239,6 +1243,8 @@ def append(
12391243
Table format. Write as a PyTables Table structure which may perform
12401244
worse but allow more flexible operations like searching / selecting
12411245
subsets of the data.
1246+
index : bool, default True
1247+
Write DataFrame index as a column.
12421248
append : bool, default True
12431249
Append the input data to the existing.
12441250
data_columns : list of columns, or True, default None
@@ -1251,7 +1257,7 @@ def append(
12511257
chunksize : size to chunk the writing
12521258
expectedrows : expected TOTAL row size of this table
12531259
encoding : default None, provide an encoding for str
1254-
dropna : bool, default False
1260+
dropna : bool, default False, optional
12551261
Do not write an ALL nan row to the store settable
12561262
by the option 'io.hdf.dropna_table'.
12571263

pandas/tests/io/json/test_json_table_schema.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,7 @@ def test_to_json(self, df_table):
301301
("idx", 0),
302302
("A", 1),
303303
("B", "a"),
304-
("C", "2016-01-01T00:00:00.000Z"),
304+
("C", "2016-01-01T00:00:00.000"),
305305
("D", "P0DT1H0M0S"),
306306
("E", "a"),
307307
("F", "a"),
@@ -314,7 +314,7 @@ def test_to_json(self, df_table):
314314
("idx", 1),
315315
("A", 2),
316316
("B", "b"),
317-
("C", "2016-01-02T00:00:00.000Z"),
317+
("C", "2016-01-02T00:00:00.000"),
318318
("D", "P0DT1H1M0S"),
319319
("E", "b"),
320320
("F", "b"),
@@ -327,7 +327,7 @@ def test_to_json(self, df_table):
327327
("idx", 2),
328328
("A", 3),
329329
("B", "c"),
330-
("C", "2016-01-03T00:00:00.000Z"),
330+
("C", "2016-01-03T00:00:00.000"),
331331
("D", "P0DT1H2M0S"),
332332
("E", "c"),
333333
("F", "c"),
@@ -340,7 +340,7 @@ def test_to_json(self, df_table):
340340
("idx", 3),
341341
("A", 4),
342342
("B", "c"),
343-
("C", "2016-01-04T00:00:00.000Z"),
343+
("C", "2016-01-04T00:00:00.000"),
344344
("D", "P0DT1H3M0S"),
345345
("E", "c"),
346346
("F", "c"),
@@ -397,8 +397,8 @@ def test_to_json_period_index(self):
397397

398398
schema = {"fields": fields, "primaryKey": ["index"]}
399399
data = [
400-
OrderedDict([("index", "2015-11-01T00:00:00.000Z"), ("values", 1)]),
401-
OrderedDict([("index", "2016-02-01T00:00:00.000Z"), ("values", 1)]),
400+
OrderedDict([("index", "2015-11-01T00:00:00.000"), ("values", 1)]),
401+
OrderedDict([("index", "2016-02-01T00:00:00.000"), ("values", 1)]),
402402
]
403403
expected = OrderedDict([("schema", schema), ("data", data)])
404404

@@ -635,7 +635,7 @@ def test_timestamp_in_columns(self):
635635
)
636636
result = df.to_json(orient="table")
637637
js = json.loads(result)
638-
assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000Z"
638+
assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000"
639639
assert js["schema"]["fields"][2]["name"] == "P0DT0H0M10S"
640640

641641
@pytest.mark.parametrize(

pandas/tests/io/json/test_json_table_schema_ext_dtype.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ def test_build_date_series(self):
145145
expected = OrderedDict(
146146
[
147147
("schema", schema),
148-
("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000Z")])]),
148+
("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000")])]),
149149
]
150150
)
151151

@@ -250,7 +250,7 @@ def test_to_json(self):
250250
OrderedDict(
251251
[
252252
("idx", 0),
253-
("A", "2021-10-10T00:00:00.000Z"),
253+
("A", "2021-10-10T00:00:00.000"),
254254
("B", 10.0),
255255
("C", "pandas"),
256256
("D", 10),

0 commit comments

Comments
 (0)