ENH: Support passing compression args to gzip and bz2 (pandas-dev#33398)

jessefarnham · web-flow · commit 7456fc52e154 · 2020-04-10T18:21:09.000-04:00
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -285,14 +285,18 @@ chunksize : int, default ``None``
 Quoting, compression, and file format
 +++++++++++++++++++++++++++++++++++++
 
-compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``}, default ``'infer'``
+compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``, ``dict``}, default ``'infer'``
   For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
   bz2, zip, or xz if filepath_or_buffer is a string ending in '.gz', '.bz2',
   '.zip', or '.xz', respectively, and no decompression otherwise. If using 'zip',
   the ZIP file must contain only one data file to be read in.
-  Set to ``None`` for no decompression.
+  Set to ``None`` for no decompression. Can also be a dict with key ``'method'``
+  set to one of {``'zip'``, ``'gzip'``, ``'bz2'``}, and other keys set to
+  compression settings. As an example, the following could be passed for
+  faster compression: ``compression={'method': 'gzip', 'compresslevel': 1}``.
 
   .. versionchanged:: 0.24.0 'infer' option added and set to default.
+  .. versionchanged:: 1.1.0 dict option extended to support ``gzip`` and ``bz2``.
 thousands : str, default ``None``
   Thousands separator.
 decimal : str, default ``'.'``
@@ -3347,6 +3351,12 @@ The compression type can be an explicit parameter or be inferred from the file e
 If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``, or
 ``'.xz'``, respectively.
 
+The compression parameter can also be a ``dict`` in order to pass options to the
+compression protocol. It must have a ``'method'`` key set to the name
+of the compression protocol, which must be one of
+{``'zip'``, ``'gzip'``, ``'bz2'``}. All other key-value pairs are passed to
+the underlying compression library.
+
 .. ipython:: python
 
    df = pd.DataFrame({
@@ -3383,6 +3393,15 @@ The default is to 'infer':
    rt = pd.read_pickle("s1.pkl.bz2")
    rt
 
+Passing options to the compression protocol in order to speed up compression:
+
+.. ipython:: python
+
+   df.to_pickle(
+       "data.pkl.gz",
+       compression={"method": "gzip", 'compresslevel': 1}
+   )
+
 .. ipython:: python
    :suppress:
 
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -91,6 +91,12 @@ Other enhancements
 - The :meth:`DataFrame.to_feather` method now supports additional keyword
   arguments (e.g. to set the compression) that are added in pyarrow 0.17
   (:issue:`33422`).
+- :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_pickle`,
+  and :meth:`DataFrame.to_json` now support passing a dict of
+  compression arguments when using the ``gzip`` and ``bz2`` protocols.
+  This can be used to set a custom compression level, e.g.,
+  ``df.to_csv(path, compression={'method': 'gzip', 'compresslevel': 1}``
+  (:issue:`33196`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -3096,7 +3096,8 @@ def to_csv(
             compression mode is 'infer' and `path_or_buf` is path-like, then
             detect compression mode from the following extensions: '.gz',
             '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given
-            and mode is 'zip' or inferred as 'zip', other entries passed as
+            and mode is one of {'zip', 'gzip', 'bz2'}, or inferred as
+            one of the above, other entries passed as
             additional compression options.
 
             .. versionchanged:: 1.0.0
@@ -3105,6 +3106,12 @@ def to_csv(
                and other entries as additional compression options if
                compression mode is 'zip'.
 
+            .. versionchanged:: 1.1.0
+
+               Passing compression options as keys in dict is
+               supported for compression modes 'gzip' and 'bz2'
+               as well as 'zip'.
+
         quoting : optional constant from csv module
             Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
             then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -351,15 +351,21 @@ def get_handle(
         'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
         and `filepath_or_buffer` is path-like, then detect compression from
         the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
-        no compression). If dict and compression mode is 'zip' or inferred as
-        'zip', other entries passed as additional compression options.
+        no compression). If dict and compression mode is one of
+        {'zip', 'gzip', 'bz2'}, or inferred as one of the above,
+        other entries passed as additional compression options.
 
         .. versionchanged:: 1.0.0
 
            May now be a dict with key 'method' as compression mode
            and other keys as compression options if compression
            mode is 'zip'.
 
+        .. versionchanged:: 1.1.0
+
+           Passing compression options as keys in dict is now
+           supported for compression modes 'gzip' and 'bz2' as well as 'zip'.
+
     memory_map : boolean, default False
         See parsers._parser_params for more information.
     is_text : boolean, default True
@@ -394,19 +400,28 @@ def get_handle(
 
     if compression:
 
+        # GH33398 the type ignores here seem related to mypy issue #5382;
+        # it may be possible to remove them once that is resolved.
+
         # GZ Compression
         if compression == "gzip":
             if is_path:
-                f = gzip.open(path_or_buf, mode)
+                f = gzip.open(
+                    path_or_buf, mode, **compression_args  # type: ignore
+                )
             else:
-                f = gzip.GzipFile(fileobj=path_or_buf)
+                f = gzip.GzipFile(
+                    fileobj=path_or_buf, **compression_args  # type: ignore
+                )
 
         # BZ Compression
         elif compression == "bz2":
             if is_path:
-                f = bz2.BZ2File(path_or_buf, mode)
+                f = bz2.BZ2File(
+                    path_or_buf, mode, **compression_args  # type: ignore
+                )
             else:
-                f = bz2.BZ2File(path_or_buf)
+                f = bz2.BZ2File(path_or_buf, **compression_args)  # type: ignore
 
         # ZIP Compression
         elif compression == "zip":
diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py
@@ -143,3 +143,44 @@ def test_with_missing_lzma_runtime():
         """
     )
     subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE)
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        pd.DataFrame(
+            100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+            columns=["X", "Y", "Z"],
+        ),
+        pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
+    ],
+)
+@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
+def test_gzip_compression_level(obj, method):
+    # GH33196
+    with tm.ensure_clean() as path:
+        getattr(obj, method)(path, compression="gzip")
+        compressed_size_default = os.path.getsize(path)
+        getattr(obj, method)(path, compression={"method": "gzip", "compresslevel": 1})
+        compressed_size_fast = os.path.getsize(path)
+        assert compressed_size_default < compressed_size_fast
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        pd.DataFrame(
+            100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+            columns=["X", "Y", "Z"],
+        ),
+        pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
+    ],
+)
+@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
+def test_bzip_compression_level(obj, method):
+    """GH33196 bzip needs file size > 100k to show a size difference between
+    compression levels, so here we just check if the call works when
+    compression is passed as a dict.
+    """
+    with tm.ensure_clean() as path:
+        getattr(obj, method)(path, compression={"method": "bz2", "compresslevel": 1})