BUG: pytables with non-nano dt64 (#55622)

jbrockmendel · web-flow · commit ea65f90ec60b · 2023-10-23T09:51:15.000-07:00
* BUG: pytables with non-nano dt64

* GH ref

* fix whatsnew
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -365,8 +365,10 @@ I/O
 - Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`)
 - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`)
 - Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`)
+- Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`)
 - Bug in :meth:`pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`)
 - Bug where :meth:`DataFrame.to_json` would raise an ``OverflowError`` instead of a ``TypeError`` with unsupported NumPy types (:issue:`55403`)
+-
 
 Period
 ^^^^^^
diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py
@@ -217,7 +217,7 @@ def stringify(value):
 
         kind = ensure_decoded(self.kind)
         meta = ensure_decoded(self.meta)
-        if kind in ("datetime64", "datetime"):
+        if kind == "datetime" or (kind and kind.startswith("datetime64")):
             if isinstance(v, (int, float)):
                 v = stringify(v)
             v = ensure_decoded(v)
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -2152,7 +2152,6 @@ def convert(
 
         val_kind = _ensure_decoded(self.kind)
         values = _maybe_convert(values, val_kind, encoding, errors)
-
         kwargs = {}
         kwargs["name"] = _ensure_decoded(self.index_name)
 
@@ -2577,7 +2576,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
         dtype = _ensure_decoded(dtype_name)
 
         # reverse converts
-        if dtype == "datetime64":
+        if dtype.startswith("datetime64"):
             # recreate with tz if indicated
             converted = _set_tz(converted, tz, coerce=True)
 
@@ -2870,7 +2869,9 @@ def _get_index_factory(self, attrs):
 
             def f(values, freq=None, tz=None):
                 # data are already in UTC, localize and convert if tz present
-                dta = DatetimeArray._simple_new(values.values, freq=freq)
+                dta = DatetimeArray._simple_new(
+                    values.values, dtype=values.dtype, freq=freq
+                )
                 result = DatetimeIndex._simple_new(dta, name=None)
                 if tz is not None:
                     result = result.tz_localize("UTC").tz_convert(tz)
@@ -2961,7 +2962,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None
             else:
                 ret = node[start:stop]
 
-            if dtype == "datetime64":
+            if dtype and dtype.startswith("datetime64"):
                 # reconstruct a timezone if indicated
                 tz = getattr(attrs, "tz", None)
                 ret = _set_tz(ret, tz, coerce=True)
@@ -3170,7 +3171,7 @@ def write_array(
 
         elif lib.is_np_dtype(value.dtype, "M"):
             self._handle.create_array(self.group, key, value.view("i8"))
-            getattr(self.group, key)._v_attrs.value_type = "datetime64"
+            getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
         elif isinstance(value.dtype, DatetimeTZDtype):
             # store as UTC
             # with a zone
@@ -3185,7 +3186,7 @@ def write_array(
             # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
             # attribute "tz"
             node._v_attrs.tz = _get_tz(value.tz)  # type: ignore[union-attr]
-            node._v_attrs.value_type = "datetime64"
+            node._v_attrs.value_type = f"datetime64[{value.dtype.unit}]"
         elif lib.is_np_dtype(value.dtype, "m"):
             self._handle.create_array(self.group, key, value.view("i8"))
             getattr(self.group, key)._v_attrs.value_type = "timedelta64"
@@ -4689,7 +4690,6 @@ def read(
         selection = Selection(self, where=where, start=start, stop=stop)
         # apply the selection filters & axis orderings
         df = self.process_axes(df, selection=selection, columns=columns)
-
         return df
 
 
@@ -4932,11 +4932,12 @@ def _set_tz(
         #  call below (which returns an ndarray).  So we are only non-lossy
         #  if `tz` matches `values.tz`.
         assert values.tz is None or values.tz == tz
+        if values.tz is not None:
+            return values
 
     if tz is not None:
         if isinstance(values, DatetimeIndex):
             name = values.name
-            values = values.asi8
         else:
             name = None
             values = values.ravel()
@@ -5019,8 +5020,12 @@ def _convert_index(name: str, index: Index, encoding: str, errors: str) -> Index
 def _unconvert_index(data, kind: str, encoding: str, errors: str) -> np.ndarray | Index:
     index: Index | np.ndarray
 
-    if kind == "datetime64":
-        index = DatetimeIndex(data)
+    if kind.startswith("datetime64"):
+        if kind == "datetime64":
+            # created before we stored resolution information
+            index = DatetimeIndex(data)
+        else:
+            index = DatetimeIndex(data.view(kind))
     elif kind == "timedelta64":
         index = TimedeltaIndex(data)
     elif kind == "date":
@@ -5194,6 +5199,8 @@ def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str
 def _get_converter(kind: str, encoding: str, errors: str):
     if kind == "datetime64":
         return lambda x: np.asarray(x, dtype="M8[ns]")
+    elif "datetime64" in kind:
+        return lambda x: np.asarray(x, dtype=kind)
     elif kind == "string":
         return lambda x: _unconvert_string_array(
             x, nan_rep=None, encoding=encoding, errors=errors
@@ -5203,7 +5210,7 @@ def _get_converter(kind: str, encoding: str, errors: str):
 
 
 def _need_convert(kind: str) -> bool:
-    if kind in ("datetime64", "string"):
+    if kind in ("datetime64", "string") or "datetime64" in kind:
         return True
     return False
 
@@ -5248,7 +5255,7 @@ def _dtype_to_kind(dtype_str: str) -> str:
     elif dtype_str.startswith(("int", "uint")):
         kind = "integer"
     elif dtype_str.startswith("datetime64"):
-        kind = "datetime64"
+        kind = dtype_str
     elif dtype_str.startswith("timedelta"):
         kind = "timedelta64"
     elif dtype_str.startswith("bool"):
@@ -5273,8 +5280,11 @@ def _get_data_and_dtype_name(data: ArrayLike):
     if isinstance(data, Categorical):
         data = data.codes
 
-    # For datetime64tz we need to drop the TZ in tests TODO: why?
-    dtype_name = data.dtype.name.split("[")[0]
+    if isinstance(data.dtype, DatetimeTZDtype):
+        # For datetime64tz we need to drop the TZ in tests TODO: why?
+        dtype_name = f"datetime64[{data.dtype.unit}]"
+    else:
+        dtype_name = data.dtype.name
 
     if data.dtype.kind in "mM":
         data = np.asarray(data.view("i8"))
diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py
@@ -772,7 +772,7 @@ def test_append_raise(setup_path):
             "dtype->bytes24,kind->string,shape->(1, 30)] "
             "vs current table "
             "[name->values_block_1,cname->values_block_1,"
-            "dtype->datetime64,kind->datetime64,shape->None]"
+            "dtype->datetime64[s],kind->datetime64[s],shape->None]"
         )
         with pytest.raises(ValueError, match=msg):
             store.append("df", df)
diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py
@@ -49,7 +49,7 @@ def test_table_index_incompatible_dtypes(setup_path):
 
     with ensure_clean_store(setup_path) as store:
         store.put("frame", df1, format="table")
-        msg = re.escape("incompatible kind in col [integer - datetime64]")
+        msg = re.escape("incompatible kind in col [integer - datetime64[ns]]")
         with pytest.raises(TypeError, match=msg):
             store.put("frame", df2, format="table", append=True)
 
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
@@ -541,16 +541,22 @@ def test_store_index_name(setup_path):
         tm.assert_frame_equal(recons, df)
 
 
+@pytest.mark.parametrize("tz", [None, "US/Pacific"])
+@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
 @pytest.mark.parametrize("table_format", ["table", "fixed"])
-def test_store_index_name_numpy_str(tmp_path, table_format, setup_path):
+def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz):
     # GH #13492
     idx = Index(
         pd.to_datetime([dt.date(2000, 1, 1), dt.date(2000, 1, 2)]),
         name="cols\u05d2",
-    )
-    idx1 = Index(
-        pd.to_datetime([dt.date(2010, 1, 1), dt.date(2010, 1, 2)]),
-        name="rows\u05d0",
+    ).tz_localize(tz)
+    idx1 = (
+        Index(
+            pd.to_datetime([dt.date(2010, 1, 1), dt.date(2010, 1, 2)]),
+            name="rows\u05d0",
+        )
+        .as_unit(unit)
+        .tz_localize(tz)
     )
     df = DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1)
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -772,7 +772,7 @@ def test_append_raise(setup_path):` |
| `772` | `772` | `"dtype->bytes24,kind->string,shape->(1, 30)] "` |
| `773` | `773` | `"vs current table "` |
| `774` | `774` | `"[name->values_block_1,cname->values_block_1,"` |
| `775` |  | `- "dtype->datetime64,kind->datetime64,shape->None]"` |
|  | `775` | `+ "dtype->datetime64[s],kind->datetime64[s],shape->None]"` |
| `776` | `776` | `)` |
| `777` | `777` | `with pytest.raises(ValueError, match=msg):` |
| `778` | `778` | `store.append("df", df)` |