diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5c74965bffdd7..1437006ee3fb8 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -234,6 +234,7 @@ Other enhancements compression library. Compression was also added to the low-level Stata-file writers :class:`~pandas.io.stata.StataWriter`, :class:`~pandas.io.stata.StataWriter117`, and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`26599`). +- :meth:`HDFStore.put` now accepts a ``track_times`` parameter, which is passed to the ``create_table`` method of ``PyTables`` (:issue:`32682`). .. --------------------------------------------------------------------------- diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 345402e619ff2..85fcfd107b121 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -984,6 +984,7 @@ def put( data_columns: Optional[List[str]] = None, encoding=None, errors: str = "strict", + track_times: bool = True, ): """ Store object in HDFStore. @@ -1010,6 +1011,12 @@ def put( Provide an encoding for strings. dropna : bool, default False, do not write an ALL nan row to The store settable by the option 'io.hdf.dropna_table'. + track_times : bool, default True + Parameter is propagated to 'create_table' method of 'PyTables'. + If set to False, the resulting h5 files can be identical (same hashes) + regardless of creation time. + + ..
versionadded:: 1.1.0 """ if format is None: format = get_option("io.hdf.default_format") or "fixed" @@ -1027,6 +1034,7 @@ def put( data_columns=data_columns, encoding=encoding, errors=errors, + track_times=track_times, ) def remove(self, key: str, where=None, start=None, stop=None): @@ -1626,6 +1634,7 @@ def _write_to_group( data_columns=None, encoding=None, errors: str = "strict", + track_times: bool = True, ): group = self.get_node(key) @@ -1688,6 +1697,7 @@ def _write_to_group( dropna=dropna, nan_rep=nan_rep, data_columns=data_columns, + track_times=track_times, ) if isinstance(s, Table) and index: @@ -4106,8 +4116,8 @@ def write( dropna=False, nan_rep=None, data_columns=None, + track_times=True, ): - if not append and self.is_exists: self._handle.remove_node(self.group, "table") @@ -4137,6 +4147,8 @@ def write( # set the table attributes table.set_attrs() + options["track_times"] = track_times + # create the table table._handle.create_table(table.group, **options) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index c937f9ac42818..fe59b989bab7e 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -1,10 +1,12 @@ import datetime from datetime import timedelta from distutils.version import LooseVersion +import hashlib from io import BytesIO import os from pathlib import Path import re +import time from warnings import catch_warnings, simplefilter import numpy as np @@ -296,6 +298,49 @@ def test_keys(self, setup_path): assert set(store.keys()) == expected assert set(store) == expected + def test_no_track_times(self, setup_path): + + # GH 32682 + # allows setting track_times (see `pytables` `create_table` documentation) + + def checksum(filename, hash_factory=hashlib.md5, chunk_num_blocks=128): + h = hash_factory() + with open(filename, "rb") as f: + for chunk in iter(lambda: f.read(chunk_num_blocks * h.block_size), b""): + h.update(chunk) + return h.digest() + + def
create_h5_and_return_checksum(track_times): + with ensure_clean_path(setup_path) as path: + df = pd.DataFrame({"a": [1]}) + + with pd.HDFStore(path, mode="w") as hdf: + hdf.put( + "table", + df, + format="table", + data_columns=True, + index=None, + track_times=track_times, + ) + + return checksum(path) + + checksum_0_tt_false = create_h5_and_return_checksum(track_times=False) + checksum_0_tt_true = create_h5_and_return_checksum(track_times=True) + + # sleep is necessary so the h5 files get different creation times + time.sleep(1) + + checksum_1_tt_false = create_h5_and_return_checksum(track_times=False) + checksum_1_tt_true = create_h5_and_return_checksum(track_times=True) + + # checksums are the same if track_times=False + assert checksum_0_tt_false == checksum_1_tt_false + + # checksums are NOT the same if track_times=True + assert checksum_0_tt_true != checksum_1_tt_true + def test_keys_ignore_hdf_softlink(self, setup_path): # GH 20523