Skip to content

Unblock supported compression libs in pytables #16196

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 11, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.20.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ Highlights include:
Enhancements
~~~~~~~~~~~~


- Unblocked access to additional compression types supported in pytables: 'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`)

.. _whatsnew_0202.performance:

Expand Down
17 changes: 11 additions & 6 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1266,12 +1266,17 @@ def to_hdf(self, path_or_buf, key, **kwargs):
<http://pandas.pydata.org/pandas-docs/stable/io.html#query-via-data-columns>`__.

Applicable only to format='table'.
complevel : int, 1-9, default 0
If a complib is specified compression will be applied
where possible
complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None
If complevel is > 0 apply compression to objects written
in the store wherever possible
complevel : int, 0-9, default 0
Specifies a compression level for data.
A value of 0 disables compression.
complib : {'zlib', 'lzo', 'bzip2', 'blosc', None}, default None
Specifies the compression library to be used.
As of v0.20.2 these additional compressors for Blosc are supported
(default if no compressor specified: 'blosc:blosclz'):
{'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
'blosc:zlib', 'blosc:zstd'}.
Specifying a compression library which is not available issues
a ValueError.
fletcher32 : bool, default False
If applying compression use the fletcher32 checksum
dropna : boolean, default False.
Expand Down
24 changes: 15 additions & 9 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,12 +402,17 @@ class HDFStore(StringMixin):
and if the file does not exist it is created.
``'r+'``
It is similar to ``'a'``, but the file must already exist.
complevel : int, 1-9, default 0
If a complib is specified compression will be applied
where possible
complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None
If complevel is > 0 apply compression to objects written
in the store wherever possible
complevel : int, 0-9, default 0
Specifies a compression level for data.
A value of 0 disables compression.
complib : {'zlib', 'lzo', 'bzip2', 'blosc', None}, default None
Specifies the compression library to be used.
As of v0.20.2 these additional compressors for Blosc are supported
(default if no compressor specified: 'blosc:blosclz'):
{'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
'blosc:zlib', 'blosc:zstd'}.
Specifying a compression library which is not available issues
a ValueError.
fletcher32 : bool, default False
If applying compression use the fletcher32 checksum

Expand All @@ -430,9 +435,10 @@ def __init__(self, path, mode=None, complevel=None, complib=None,
raise ImportError('HDFStore requires PyTables, "{ex}" problem '
'importing'.format(ex=str(ex)))

if complib not in (None, 'blosc', 'bzip2', 'lzo', 'zlib'):
raise ValueError("complib only supports 'blosc', 'bzip2', lzo' "
"or 'zlib' compression.")
if complib is not None and complib not in tables.filters.all_complibs:
raise ValueError(
"complib only supports {libs} compression.".format(
libs=tables.filters.all_complibs))

self._path = path
if mode is None:
Expand Down
37 changes: 35 additions & 2 deletions pandas/tests/io/test_pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -734,6 +734,39 @@ def test_put_compression_blosc(self):
store.put('c', df, format='table', complib='blosc')
tm.assert_frame_equal(store['c'], df)

def test_complibs(self):
    """Round-trip a frame through every compression library and level.

    For each (complib, complevel) pair the frame is written with
    ``to_hdf``, read back with ``read_hdf`` and compared to the original.
    The stored leaf nodes are then inspected to confirm that the filters
    recorded in the file match what was requested; a complevel of 0 is
    expected to be stored with ``complib`` set to None.
    """
    # GH14478
    df = tm.makeDataFrame()

    # Building list of all complibs and complevels tuples.
    # Copy the list first: ``tables.filters.all_complibs`` is shared
    # module state, and removing an entry in place would change what every
    # later caller in this process (including HDFStore's own complib
    # validation) sees as a supported library.
    all_complibs = list(tables.filters.all_complibs)
    # Remove lzo if it's not available on this platform
    if not tables.which_lib_version('lzo'):
        all_complibs.remove('lzo')
    all_levels = range(0, 10)
    all_tests = [(lib, lvl) for lib in all_complibs for lvl in all_levels]

    for (lib, lvl) in all_tests:
        with ensure_clean_path(self.path) as tmpfile:
            gname = 'foo'

            # Write and read file to see if data is consistent
            df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl)
            result = pd.read_hdf(tmpfile, gname)
            tm.assert_frame_equal(result, df)

            # Open file and check metadata
            # for correct amount of compression
            h5table = tables.open_file(tmpfile, mode='r')
            try:
                for node in h5table.walk_nodes(where='/' + gname,
                                               classname='Leaf'):
                    assert node.filters.complevel == lvl
                    if lvl == 0:
                        assert node.filters.complib is None
                    else:
                        assert node.filters.complib == lib
            finally:
                # Close even when an assertion fails so the handle does
                # not outlive the temporary file's cleanup.
                h5table.close()

def test_put_integer(self):
# non-date, non-string index
df = DataFrame(np.random.randn(50, 100))
Expand Down Expand Up @@ -4939,8 +4972,8 @@ def test_invalid_complib(self):
index=list('abcd'),
columns=list('ABCDE'))
with ensure_clean_path(self.path) as path:
pytest.raises(ValueError, df.to_hdf, path,
'df', complib='blosc:zlib')
with pytest.raises(ValueError):
df.to_hdf(path, 'df', complib='foolib')
# GH10443

def test_read_nokey(self):
Expand Down