
Track times #32700


Merged: 15 commits, May 14, 2020
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
@@ -234,6 +234,7 @@ Other enhancements
compression library. Compression was also added to the low-level Stata-file writers
:class:`~pandas.io.stata.StataWriter`, :class:`~pandas.io.stata.StataWriter117`,
and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`26599`).
- :meth:`HDFStore.put` now accepts a ``track_times`` parameter. The parameter is passed to the ``create_table`` method of ``PyTables`` (:issue:`32682`).

.. ---------------------------------------------------------------------------

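For context, a minimal usage sketch of the new parameter (the file name and example frame below are illustrative, not part of the change):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})

# With track_times=False, PyTables does not record creation/modification
# timestamps on the node, so rewriting the same data yields byte-identical
# files (useful for reproducible builds and checksum-based caching).
with pd.HDFStore("reproducible.h5", mode="w") as store:
    store.put("df", df, format="table", track_times=False)
```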
14 changes: 13 additions & 1 deletion pandas/io/pytables.py
@@ -984,6 +984,7 @@ def put(
data_columns: Optional[List[str]] = None,
encoding=None,
errors: str = "strict",
track_times: bool = True,
Copy link
Contributor

can you update the doc-string, make sure to add a 1.1 versionadded tag

Copy link
Contributor Author

doc string added

):
"""
Store object in HDFStore.
@@ -1010,6 +1011,12 @@ def put(
Provide an encoding for strings.
dropna : bool, default False
Do not write an ALL nan row to the store; settable by the option
'io.hdf.dropna_table'.
track_times : bool, default True
Parameter is propagated to the ``create_table`` method of ``PyTables``.
If set to False, it enables identical h5 files (same hashes) to be
created regardless of creation time.

.. versionadded:: 1.1.0
"""
if format is None:
format = get_option("io.hdf.default_format") or "fixed"
@@ -1027,6 +1034,7 @@
data_columns=data_columns,
encoding=encoding,
errors=errors,
track_times=track_times,
)

def remove(self, key: str, where=None, start=None, stop=None):
@@ -1626,6 +1634,7 @@ def _write_to_group(
data_columns=None,
encoding=None,
errors: str = "strict",
track_times: bool = True,
):
group = self.get_node(key)

@@ -1688,6 +1697,7 @@
dropna=dropna,
nan_rep=nan_rep,
data_columns=data_columns,
track_times=track_times,
)

if isinstance(s, Table) and index:
@@ -4106,8 +4116,8 @@ def write(
dropna=False,
nan_rep=None,
data_columns=None,
track_times=True,
):

if not append and self.is_exists:
self._handle.remove_node(self.group, "table")

@@ -4137,6 +4147,8 @@
# set the table attributes
table.set_attrs()

options["track_times"] = track_times

# create the table
table._handle.create_table(table.group, **options)

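Under the hood, the flag is simply forwarded to PyTables. A rough equivalent at the PyTables level (assuming a ``tables`` version whose ``create_table`` supports ``track_times``; the node name and row description are illustrative):

```python
import tables

class Row(tables.IsDescription):
    a = tables.Int64Col()

# track_times=False asks HDF5 not to store ctime/mtime metadata on the
# node, which is what keeps the resulting file content stable across runs.
with tables.open_file("raw_pytables.h5", mode="w") as h5:
    h5.create_table("/", "table", description=Row, track_times=False)
```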
45 changes: 45 additions & 0 deletions pandas/tests/io/pytables/test_store.py
@@ -1,10 +1,12 @@
import datetime
from datetime import timedelta
from distutils.version import LooseVersion
import hashlib
from io import BytesIO
import os
from pathlib import Path
import re
import time
from warnings import catch_warnings, simplefilter

import numpy as np
@@ -296,6 +298,49 @@ def test_keys(self, setup_path):
assert set(store.keys()) == expected
assert set(store) == expected

def test_no_track_times(self, setup_path):

# GH 32682
# allows setting track_times (see `pytables` `create_table` documentation)

def checksum(filename, hash_factory=hashlib.md5, chunk_num_blocks=128):
h = hash_factory()
with open(filename, "rb") as f:
for chunk in iter(lambda: f.read(chunk_num_blocks * h.block_size), b""):
h.update(chunk)
return h.digest()

def create_h5_and_return_checksum(track_times):
with ensure_clean_path(setup_path) as path:
df = pd.DataFrame({"a": [1]})

with pd.HDFStore(path, mode="w") as hdf:
hdf.put(
"table",
df,
format="table",
data_columns=True,
index=None,
track_times=track_times,
)

return checksum(path)

checksum_0_tt_false = create_h5_and_return_checksum(track_times=False)
checksum_0_tt_true = create_h5_and_return_checksum(track_times=True)

# sleep is necessary to create h5 with different creation time
time.sleep(1)

checksum_1_tt_false = create_h5_and_return_checksum(track_times=False)
checksum_1_tt_true = create_h5_and_return_checksum(track_times=True)

# checksums are the same if track_times = False
assert checksum_0_tt_false == checksum_1_tt_false

# checksums are NOT the same if track_times = True
assert checksum_0_tt_true != checksum_1_tt_true

def test_keys_ignore_hdf_softlink(self, setup_path):

# GH 20523