Unblock supported compression libs in pytables (pandas-dev#16196)

linebp · pcluo · commit bee30f36d00b · 2017-05-22T15:21:59.000-04:00
diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
@@ -19,7 +19,7 @@ Highlights include:
 Enhancements
 ~~~~~~~~~~~~
 
-
+- Unblocked access to additional compression types supported in pytables: 'blosc:blosclz, 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`) 
 
 .. _whatsnew_0202.performance:
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -1266,12 +1266,17 @@ def to_hdf(self, path_or_buf, key, **kwargs):
             <http://pandas.pydata.org/pandas-docs/stable/io.html#query-via-data-columns>`__.
 
             Applicable only to format='table'.
-        complevel : int, 1-9, default 0
-            If a complib is specified compression will be applied
-            where possible
-        complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None
-            If complevel is > 0 apply compression to objects written
-            in the store wherever possible
+        complevel : int, 0-9, default 0
+            Specifies a compression level for data.
+            A value of 0 disables compression.
+        complib : {'zlib', 'lzo', 'bzip2', 'blosc', None}, default None
+            Specifies the compression library to be used.
+            As of v0.20.2 these additional compressors for Blosc are supported
+            (default if no compressor specified: 'blosc:blosclz'):
+            {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
+             'blosc:zlib', 'blosc:zstd'}.
+            Specifying a compression library which is not available issues
+            a ValueError.
         fletcher32 : bool, default False
             If applying compression use the fletcher32 checksum
         dropna : boolean, default False.
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -402,12 +402,17 @@ class HDFStore(StringMixin):
             and if the file does not exist it is created.
         ``'r+'``
             It is similar to ``'a'``, but the file must already exist.
-    complevel : int, 1-9, default 0
-            If a complib is specified compression will be applied
-            where possible
-    complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None
-            If complevel is > 0 apply compression to objects written
-            in the store wherever possible
+    complevel : int, 0-9, default 0
+            Specifies a compression level for data.
+            A value of 0 disables compression.
+    complib : {'zlib', 'lzo', 'bzip2', 'blosc', None}, default None
+            Specifies the compression library to be used.
+            As of v0.20.2 these additional compressors for Blosc are supported
+            (default if no compressor specified: 'blosc:blosclz'):
+            {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
+             'blosc:zlib', 'blosc:zstd'}.
+            Specifying a compression library which is not available issues
+            a ValueError.
     fletcher32 : bool, default False
             If applying compression use the fletcher32 checksum
 
@@ -430,9 +435,10 @@ def __init__(self, path, mode=None, complevel=None, complib=None,
             raise ImportError('HDFStore requires PyTables, "{ex}" problem '
                               'importing'.format(ex=str(ex)))
 
-        if complib not in (None, 'blosc', 'bzip2', 'lzo', 'zlib'):
-            raise ValueError("complib only supports 'blosc', 'bzip2', lzo' "
-                             "or 'zlib' compression.")
+        if complib is not None and complib not in tables.filters.all_complibs:
+            raise ValueError(
+                "complib only supports {libs} compression.".format(
+                    libs=tables.filters.all_complibs))
 
         self._path = path
         if mode is None:
diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py
@@ -734,6 +734,39 @@ def test_put_compression_blosc(self):
             store.put('c', df, format='table', complib='blosc')
             tm.assert_frame_equal(store['c'], df)
 
+    def test_complibs(self):
+        # GH14478
+        df = tm.makeDataFrame()
+
+        # Building list of all complibs and complevels tuples
+        all_complibs = tables.filters.all_complibs
+        # Remove lzo if its not available on this platform
+        if not tables.which_lib_version('lzo'):
+            all_complibs.remove('lzo')
+        all_levels = range(0, 10)
+        all_tests = [(lib, lvl) for lib in all_complibs for lvl in all_levels]
+
+        for (lib, lvl) in all_tests:
+            with ensure_clean_path(self.path) as tmpfile:
+                gname = 'foo'
+
+                # Write and read file to see if data is consistent
+                df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl)
+                result = pd.read_hdf(tmpfile, gname)
+                tm.assert_frame_equal(result, df)
+
+                # Open file and check metadata
+                # for correct amount of compression
+                h5table = tables.open_file(tmpfile, mode='r')
+                for node in h5table.walk_nodes(where='/' + gname,
+                                               classname='Leaf'):
+                    assert node.filters.complevel == lvl
+                    if lvl == 0:
+                        assert node.filters.complib is None
+                    else:
+                        assert node.filters.complib == lib
+                h5table.close()
+
     def test_put_integer(self):
         # non-date, non-string index
         df = DataFrame(np.random.randn(50, 100))
@@ -4939,8 +4972,8 @@ def test_invalid_complib(self):
                        index=list('abcd'),
                        columns=list('ABCDE'))
         with ensure_clean_path(self.path) as path:
-            pytest.raises(ValueError, df.to_hdf, path,
-                          'df', complib='blosc:zlib')
+            with pytest.raises(ValueError):
+                df.to_hdf(path, 'df', complib='foolib')
     # GH10443
 
     def test_read_nokey(self):