From 56ec9468ad142461a0063b102afe1b1f6621dccd Mon Sep 17 00:00:00 2001 From: Pedro Reys Date: Thu, 6 Feb 2020 16:40:18 -0300 Subject: [PATCH] BUG: Decode as UTF-8 the dtype string read from an HDF file Fixes GH31750 The dtype value wasn't being decoded as `UTF-8` when reading a DataFrame from an HDF file. This was a problem when reading an HDF file that was created in Python 2 with the fixed format, because the dtype was being read as `b'datetime'` instead of `datetime`, which caused `HDFStore` to read the data as `int64` instead of coercing it to the correct `datetime64` dtype. Move doc to the right file. --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/pytables.py | 2 +- .../legacy_table_fixed_datetime_py2.h5 | Bin 0 -> 7104 bytes pandas/tests/io/pytables/test_store.py | 15 +++++++++++++++ 4 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/io/data/legacy_hdf/legacy_table_fixed_datetime_py2.h5 diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 64f0cb3f2e26d..0ac26ca65f468 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -184,6 +184,7 @@ I/O - Bug in :meth:`DataFrame.to_parquet` overwriting pyarrow's default for ``coerce_timestamps``; following pyarrow's default allows writing nanosecond timestamps with ``version="2.0"`` (:issue:`31652`). 
+- Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`) Plotting ^^^^^^^^ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index c1e12887b0150..0e2b909d5cdc7 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2722,7 +2722,7 @@ def read_array( if isinstance(node, tables.VLArray): ret = node[0][start:stop] else: - dtype = getattr(attrs, "value_type", None) + dtype = _ensure_decoded(getattr(attrs, "value_type", None)) shape = getattr(attrs, "shape", None) if shape is not None: diff --git a/pandas/tests/io/data/legacy_hdf/legacy_table_fixed_datetime_py2.h5 b/pandas/tests/io/data/legacy_hdf/legacy_table_fixed_datetime_py2.h5 new file mode 100644 index 0000000000000000000000000000000000000000..18cfae15a3a78c2f4bd78c7fd23ec05e996c077b GIT binary patch literal 7104 zcmeHLPfrs;6rXJiEKsGW#D5n~hEqd{8V?>=rC5?mo3@g2(7MuvY@pj_w-EUTdh=r9 z)teuKBcH<)N4|m1`!k)*(ggz^s_ZtM*_nCs-pt$Id-L9WF0HKIn0hcJ;4w{+5jlKS z#V>x29#W5LtMT{zVsQP5>mMn4F~qolv3uN~WB&7L>Nw&4My*&Bl2Y+kH4ls9ae6aV zM^axa4U~#j6*k=DT`QCxGtQJU+S~M|@K{mY3 z2iZzK63HS;|FUe=>#`r{$K+qpzp7Q`j-oH*7iA0RpJ8oC zO9T4qX2W_>F4kMCmD+2o(TbbDlrQkSS>zeD(;OSzDGR*FPIJ@9ZeZ`!O3Le^fvBAl zDuw8=|5kdiKe_=R7ivJ@hrRCB?qch}-rIKq-FEv<2bu%C&qHI#Q(v#F7?O(ND$EQK zDLYx@b~Un&A*`--mpY$IbEF3FMet~UG%Eg%{GK6K2L7&Rdu=;t^$&Xvt~csew|%<< zBUz#5&4lM2IDX)EJ@Ur@kpXVr9^_4g z=K0Rf{+{hiJ~PCN7p}g4^Nu8z&nZ=dGA+3SDe!}OjN7SH3OShU`O`)-OOu!}HN-96 z1@u00p1;%jAza0si&m{>ZOS>C{`_-sDgPKHX2m?#bWUf^oYPW&N`**Aek8-IlwA}V zfK6es%sH-94Xf)JLth-FL+D_pg&bPv@*rUv09!L<5Ni z5)C98NHmaW;BROE?jhhY42u7#)N!kh<0vj(En6Fv8pW02`w(xx(}Va`-)~G~T-^6< zFX(jxr;TC+gsUWjqU3ma5V4wK*Y$Abus9r|dc8iP_b7dcjq_F9+~c^dO3RXbRKB6( z^1>g*mA|eTL(5NOj}m@=cD*IU(kCj%r-4zr+89lP{bHG|%9~M0|UAAri#q zeH!1>h?AoMPAJnbC>>*)P_}WlPRsjEm>SJ58AX4bmL`+%lRpF-yFTK(tPL8fAM`8-j9Xc$vJuc<^2ilH69Pn QKb7ZS1rKo3G-~J0Z%r>E&Hw-a literal 0 HcmV?d00001 diff --git 
a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index f56d042093886..547de39eec5e0 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -4074,6 +4074,21 @@ def test_legacy_table_fixed_format_read_py2(self, datapath, setup_path): ) tm.assert_frame_equal(expected, result) + def test_legacy_table_fixed_format_read_datetime_py2(self, datapath, setup_path): + # GH 31750 + # legacy table with fixed format and datetime64 column written in Python 2 + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "legacy_table_fixed_datetime_py2.h5"), + mode="r", + ) as store: + result = store.select("df") + expected = pd.DataFrame( + [[pd.Timestamp("2020-02-06T18:00")]], + columns=["A"], + index=pd.Index(["date"]), + ) + tm.assert_frame_equal(expected, result) + def test_legacy_table_read_py2(self, datapath, setup_path): # issue: 24925 # legacy table written in Python 2