diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 983f3edfa2f46..95e88f610004f 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -19,7 +19,7 @@ Highlights include: Enhancements ~~~~~~~~~~~~ - +- Unblocked access to additional compression types supported in PyTables: 'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`) .. _whatsnew_0202.performance: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b72f83ce723cc..777cfcae7a326 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1266,12 +1266,17 @@ def to_hdf(self, path_or_buf, key, **kwargs): `__. Applicable only to format='table'. - complevel : int, 1-9, default 0 - If a complib is specified compression will be applied - where possible - complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None - If complevel is > 0 apply compression to objects written - in the store wherever possible + complevel : int, 0-9, default 0 + Specifies a compression level for data. + A value of 0 disables compression. + complib : {'zlib', 'lzo', 'bzip2', 'blosc', None}, default None + Specifies the compression library to be used. + As of v0.20.2 these additional compressors for Blosc are supported + (default if no compressor specified: 'blosc:blosclz'): + {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', + 'blosc:zlib', 'blosc:zstd'}. + Specifying a compression library which is not available issues + a ValueError. fletcher32 : bool, default False If applying compression use the fletcher32 checksum dropna : boolean, default False. diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 17bedd016f617..f017421c1f83a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -402,12 +402,17 @@ class HDFStore(StringMixin): and if the file does not exist it is created. ``'r+'`` It is similar to ``'a'``, but the file must already exist. 
- complevel : int, 1-9, default 0 - If a complib is specified compression will be applied - where possible - complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None - If complevel is > 0 apply compression to objects written - in the store wherever possible + complevel : int, 0-9, default 0 + Specifies a compression level for data. + A value of 0 disables compression. + complib : {'zlib', 'lzo', 'bzip2', 'blosc', None}, default None + Specifies the compression library to be used. + As of v0.20.2 these additional compressors for Blosc are supported + (default if no compressor specified: 'blosc:blosclz'): + {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', + 'blosc:zlib', 'blosc:zstd'}. + Specifying a compression library which is not available issues + a ValueError. fletcher32 : bool, default False If applying compression use the fletcher32 checksum @@ -430,9 +435,10 @@ def __init__(self, path, mode=None, complevel=None, complib=None, raise ImportError('HDFStore requires PyTables, "{ex}" problem ' 'importing'.format(ex=str(ex))) - if complib not in (None, 'blosc', 'bzip2', 'lzo', 'zlib'): - raise ValueError("complib only supports 'blosc', 'bzip2', lzo' " - "or 'zlib' compression.") + if complib is not None and complib not in tables.filters.all_complibs: + raise ValueError( + "complib only supports {libs} compression.".format( + libs=tables.filters.all_complibs)) self._path = path if mode is None: diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 873bb20b3bba9..abfd88a6f13e1 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -734,6 +734,39 @@ def test_put_compression_blosc(self): store.put('c', df, format='table', complib='blosc') tm.assert_frame_equal(store['c'], df) + def test_complibs(self): + # GH14478 + df = tm.makeDataFrame() + + # Building list of all complibs and complevels tuples + # Work on a copy: HDFStore validates complib against the original list + all_complibs = list(tables.filters.all_complibs) + # Remove lzo if it's not available on this 
platform + if not tables.which_lib_version('lzo'): + all_complibs.remove('lzo') + all_levels = range(0, 10) + all_tests = [(lib, lvl) for lib in all_complibs for lvl in all_levels] + + for (lib, lvl) in all_tests: + with ensure_clean_path(self.path) as tmpfile: + gname = 'foo' + + # Write and read file to see if data is consistent + df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl) + result = pd.read_hdf(tmpfile, gname) + tm.assert_frame_equal(result, df) + + # Open file and check metadata + # for correct amount of compression + with tables.open_file(tmpfile, mode='r') as h5table: + for node in h5table.walk_nodes(where='/' + gname, + classname='Leaf'): + assert node.filters.complevel == lvl + if lvl == 0: + assert node.filters.complib is None + else: + assert node.filters.complib == lib + def test_put_integer(self): # non-date, non-string index df = DataFrame(np.random.randn(50, 100)) @@ -4939,8 +4972,8 @@ def test_invalid_complib(self): index=list('abcd'), columns=list('ABCDE')) with ensure_clean_path(self.path) as path: - pytest.raises(ValueError, df.to_hdf, path, - 'df', complib='blosc:zlib') + with pytest.raises(ValueError): + df.to_hdf(path, 'df', complib='foolib') # GH10443 def test_read_nokey(self):