From 52a45be100ffb9ad7f58a47c0414d26094b860c2 Mon Sep 17 00:00:00 2001 From: Chaoyi Hu Date: Fri, 14 Jun 2024 14:22:29 -0700 Subject: [PATCH 1/8] Fix wrong save of datetime64[s] in HDFStore --- pandas/io/pytables.py | 12 +++++++++--- pandas/tests/io/pytables/test_read.py | 11 +++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 4fce338ccad6f..30b0055fe6fb8 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3036,7 +3036,10 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None if dtype and dtype.startswith("datetime64"): # reconstruct a timezone if indicated tz = getattr(attrs, "tz", None) - ret = _set_tz(ret, tz) + if dtype == "datetime64[s]": + ret = _set_tz(ret, tz, unit="s") + else: + ret = _set_tz(ret, tz) # default unit is ns elif dtype == "timedelta64": ret = np.asarray(ret, dtype="m8[ns]") @@ -4964,7 +4967,9 @@ def _get_tz(tz: tzinfo) -> str | tzinfo: return zone -def _set_tz(values: npt.NDArray[np.int64], tz: str | tzinfo | None) -> DatetimeArray: +def _set_tz( + values: npt.NDArray[np.int64], tz: str | tzinfo | None, unit: str = "ns" +) -> DatetimeArray: """ Coerce the values to a DatetimeArray with appropriate tz. @@ -4972,11 +4977,12 @@ def _set_tz(values: npt.NDArray[np.int64], tz: str | tzinfo | None) -> DatetimeA ---------- values : ndarray[int64] tz : str, tzinfo, or None + unit : str. The default unit is ns. Needs to be specified otherwise. 
""" assert values.dtype == "i8", values.dtype # Argument "tz" to "tz_to_dtype" has incompatible type "str | tzinfo | None"; # expected "tzinfo" - dtype = tz_to_dtype(tz=tz, unit="ns") # type: ignore[arg-type] + dtype = tz_to_dtype(tz=tz, unit=unit) # type: ignore[arg-type] dta = DatetimeArray._from_sequence(values, dtype=dtype) return dta diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index e33ddaf3b81f0..5db6cc48a7f35 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -317,3 +317,14 @@ def test_read_infer_string(tmp_path, setup_path): columns=Index(["a"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) + + +def test_hdfstore_read_datetime64_unit_s(): + # Fix issue 59004: HDFStore doesn't save datetime64[s] right + df_s = DataFrame(["2001-01-01", "2002-02-02"], dtype="datetime64[s]") + with HDFStore("deleteme.h5", mode="w") as store: + store.put("df_s", df_s) + with HDFStore("deleteme.h5", mode="r") as store: + df_fromstore = store.get("df_s") + tm.assert_frame_equal(df_s, df_fromstore) + Path("deleteme.h5").unlink() # Delete created file From d235341116d119e67a165a87fd11d08b4ef753c7 Mon Sep 17 00:00:00 2001 From: Chaoyi Hu Date: Fri, 14 Jun 2024 15:03:46 -0700 Subject: [PATCH 2/8] generic datetime unit parsing --- pandas/io/pytables.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 30b0055fe6fb8..35d48cc70e156 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3036,10 +3036,8 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None if dtype and dtype.startswith("datetime64"): # reconstruct a timezone if indicated tz = getattr(attrs, "tz", None) - if dtype == "datetime64[s]": - ret = _set_tz(ret, tz, unit="s") - else: - ret = _set_tz(ret, tz) # default unit is ns + # set time zone with parsed datetime64 unit + ret = _set_tz(ret, tz, 
unit=dtype.split("[")[1].strip("]")) elif dtype == "timedelta64": ret = np.asarray(ret, dtype="m8[ns]") From 135f2ded47f2f129e84cce42e26849fcc11e5457 Mon Sep 17 00:00:00 2001 From: Chaoyi Hu Date: Fri, 14 Jun 2024 15:04:27 -0700 Subject: [PATCH 3/8] use tmp_path --- pandas/tests/io/pytables/test_read.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index 5db6cc48a7f35..3f60ce93400a8 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -319,12 +319,12 @@ def test_read_infer_string(tmp_path, setup_path): tm.assert_frame_equal(result, expected) -def test_hdfstore_read_datetime64_unit_s(): +def test_hdfstore_read_datetime64_unit_s(tmp_path, setup_path): # Fix issue 59004: HDFStore doesn't save datetime64[s] right df_s = DataFrame(["2001-01-01", "2002-02-02"], dtype="datetime64[s]") - with HDFStore("deleteme.h5", mode="w") as store: + path = tmp_path / setup_path + with HDFStore(path, mode="w") as store: store.put("df_s", df_s) - with HDFStore("deleteme.h5", mode="r") as store: + with HDFStore(path, mode="r") as store: df_fromstore = store.get("df_s") tm.assert_frame_equal(df_s, df_fromstore) - Path("deleteme.h5").unlink() # Delete created file From b8422453e8494e70cfe411043336290e30c21557 Mon Sep 17 00:00:00 2001 From: Chaoyi Hu Date: Fri, 14 Jun 2024 15:05:09 -0700 Subject: [PATCH 4/8] Adding entry to whatsnew --- doc/source/whatsnew/v2.2.2.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 72a2f84c4aaee..16ed675836d91 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -42,6 +42,7 @@ Bug fixes - :meth:`DataFrame.__dataframe__` was showing bytemask instead of bitmask for ``'string[pyarrow]'`` validity buffer (:issue:`57762`) - :meth:`DataFrame.__dataframe__` was showing non-null validity buffer (instead of ``None``) 
``'string[pyarrow]'`` without missing values (:issue:`57761`) - :meth:`DataFrame.to_sql` was failing to find the right table when using the schema argument (:issue:`57539`) +- :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) .. --------------------------------------------------------------------------- .. _whatsnew_222.other: From afd65264671716d98031a95510a5af6a35392b85 Mon Sep 17 00:00:00 2001 From: Chaoyi Hu Date: Fri, 14 Jun 2024 18:32:10 -0700 Subject: [PATCH 5/8] datetime64 dtype parsing using numpy api --- pandas/io/pytables.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 35d48cc70e156..d98c51159eb63 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2655,7 +2655,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): # reverse converts if dtype.startswith("datetime64"): # recreate with tz if indicated - converted = _set_tz(converted, tz) + converted = _set_tz(converted, tz, dtype) elif dtype == "timedelta64": converted = np.asarray(converted, dtype="m8[ns]") @@ -3036,8 +3036,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None if dtype and dtype.startswith("datetime64"): # reconstruct a timezone if indicated tz = getattr(attrs, "tz", None) - # set time zone with parsed datetime64 unit - ret = _set_tz(ret, tz, unit=dtype.split("[")[1].strip("]")) + ret = _set_tz(ret, tz, dtype) elif dtype == "timedelta64": ret = np.asarray(ret, dtype="m8[ns]") @@ -4966,7 +4965,7 @@ def _get_tz(tz: tzinfo) -> str | tzinfo: def _set_tz( - values: npt.NDArray[np.int64], tz: str | tzinfo | None, unit: str = "ns" + values: npt.NDArray[np.int64], tz: str | tzinfo | None, datetime64_dtype: str ) -> DatetimeArray: """ Coerce the values to a DatetimeArray with appropriate tz. 
@@ -4975,11 +4974,12 @@ def _set_tz( ---------- values : ndarray[int64] tz : str, tzinfo, or None - unit : str. The default unit is ns. Needs to be specified otherwise. + datetime64_dtype : str, e.g. "datetime64[ns]", "datetime64[25s]" """ assert values.dtype == "i8", values.dtype # Argument "tz" to "tz_to_dtype" has incompatible type "str | tzinfo | None"; # expected "tzinfo" + unit, _ = np.datetime_data(datetime64_dtype) # parsing dtype: unit, count dtype = tz_to_dtype(tz=tz, unit=unit) # type: ignore[arg-type] dta = DatetimeArray._from_sequence(values, dtype=dtype) return dta From bcf65723a5b1d8ae2fd9857013ef82b781e98455 Mon Sep 17 00:00:00 2001 From: Chaoyi Hu Date: Sat, 15 Jun 2024 13:12:30 -0700 Subject: [PATCH 6/8] move whatsnew entry --- doc/source/whatsnew/v2.2.2.rst | 1 - doc/source/whatsnew/v3.0.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 16ed675836d91..72a2f84c4aaee 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -42,7 +42,6 @@ Bug fixes - :meth:`DataFrame.__dataframe__` was showing bytemask instead of bitmask for ``'string[pyarrow]'`` validity buffer (:issue:`57762`) - :meth:`DataFrame.__dataframe__` was showing non-null validity buffer (instead of ``None``) ``'string[pyarrow]'`` without missing values (:issue:`57761`) - :meth:`DataFrame.to_sql` was failing to find the right table when using the schema argument (:issue:`57539`) -- :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) .. --------------------------------------------------------------------------- .. 
_whatsnew_222.other: diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 865996bdf8892..27174c75fec0f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -512,6 +512,7 @@ I/O - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`) - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) +- Bug in :meth:`HDFStore.get` failing to read back data of dtype ``datetime64[s]`` correctly (:issue:`59004`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) Period From b8ee8f1bd24e28de3a97e62cfbb6736176c0c8b6 Mon Sep 17 00:00:00 2001 From: Chaoyi Hu Date: Sat, 15 Jun 2024 13:13:05 -0700 Subject: [PATCH 7/8] update test comment --- pandas/tests/io/pytables/test_read.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index 3f60ce93400a8..ba108370a4a92 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -320,7 +320,7 @@ def test_read_infer_string(tmp_path, setup_path): def test_hdfstore_read_datetime64_unit_s(tmp_path, setup_path): - # Fix issue 59004: HDFStore doesn't save datetime64[s] right + # GH 59004 df_s = DataFrame(["2001-01-01", "2002-02-02"], dtype="datetime64[s]") path = tmp_path / setup_path with HDFStore(path, mode="w") as store: From bf3ee6260aa953865bd5a8de20d01bbbce9d3972 Mon Sep 17 00:00:00 2001 From: Chaoyi Hu Date: Sat, 15 Jun 2024 14:49:02 -0700 Subject: [PATCH 8/8] update hdfstore dtypes test case --- pandas/tests/io/pytables/test_round_trip.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3
deletions(-) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 51ee289c8e27a..3ad05cec3bca3 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -236,8 +236,10 @@ def test_table_values_dtypes_roundtrip(setup_path): df1["float322"] = 1.0 df1["float322"] = df1["float322"].astype("float32") df1["bool"] = df1["float32"] > 0 - df1["time1"] = Timestamp("20130101") - df1["time2"] = Timestamp("20130102") + df1["time_s_1"] = Timestamp("20130101") + df1["time_s_2"] = Timestamp("20130101 00:00:00") + df1["time_ms"] = Timestamp("20130101 00:00:00.000") + df1["time_ns"] = Timestamp("20130102 00:00:00.000000000") store.append("df_mixed_dtypes1", df1) result = store.select("df_mixed_dtypes1").dtypes.value_counts() @@ -252,7 +254,9 @@ def test_table_values_dtypes_roundtrip(setup_path): "int8": 1, "int64": 1, "object": 1, - "datetime64[ns]": 2, + "datetime64[s]": 2, + "datetime64[ms]": 1, + "datetime64[ns]": 1, }, name="count", )