diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index f2152c43ceaba..df6b44ac654ce 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -285,14 +285,18 @@ chunksize : int, default ``None``
 Quoting, compression, and file format
 +++++++++++++++++++++++++++++++++++++
 
-compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``}, default ``'infer'``
+compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``, ``dict``}, default ``'infer'``
   For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
   bz2, zip, or xz if filepath_or_buffer is a string ending in '.gz', '.bz2',
   '.zip', or '.xz', respectively, and no decompression otherwise. If using 'zip',
   the ZIP file must contain only one data file to be read in.
-  Set to ``None`` for no decompression.
+  Set to ``None`` for no decompression. Can also be a dict with key ``'method'``
+  set to one of {``'zip'``, ``'gzip'``, ``'bz2'``}, and other keys set to
+  compression settings. As an example, the following could be passed for
+  faster compression: ``compression={'method': 'gzip', 'compresslevel': 1}``.
 
   .. versionchanged:: 0.24.0 'infer' option added and set to default.
+  .. versionchanged:: 1.1.0 dict option extended to support ``gzip`` and ``bz2``.
 thousands : str, default ``None``
   Thousands separator.
 decimal : str, default ``'.'``
@@ -3347,6 +3351,12 @@ The compression type can be an explicit parameter or be inferred from the file e
 If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ``'.gz'``,
 ``'.bz2'``, ``'.zip'``, or ``'.xz'``, respectively.
 
+The compression parameter can also be a ``dict`` in order to pass options to the
+compression protocol. It must have a ``'method'`` key set to the name
+of the compression protocol, which must be one of
+{``'zip'``, ``'gzip'``, ``'bz2'``}. All other key-value pairs are passed to
+the underlying compression library.
+
 .. ipython:: python
 
    df = pd.DataFrame({
@@ -3383,6 +3393,15 @@ The default is to 'infer':
    rt = pd.read_pickle("s1.pkl.bz2")
    rt
 
+Passing options to the compression protocol in order to speed up compression:
+
+.. ipython:: python
+
+   df.to_pickle(
+       "data.pkl.gz",
+       compression={"method": "gzip", "compresslevel": 1}
+   )
+
 .. ipython:: python
    :suppress:
 
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index fc5893e401836..f7ed07848bb84 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -91,6 +91,12 @@ Other enhancements
 - The :meth:`DataFrame.to_feather` method now supports additional keyword
   arguments (e.g. to set the compression) that are added in pyarrow 0.17
   (:issue:`33422`).
+- :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_pickle`,
+  and :meth:`DataFrame.to_json` now support passing a dict of
+  compression arguments when using the ``gzip`` and ``bz2`` protocols.
+  This can be used to set a custom compression level, e.g.,
+  ``df.to_csv(path, compression={'method': 'gzip', 'compresslevel': 1})``
+  (:issue:`33196`).
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index eb6554bf2260c..2adfd2bb9a7b3 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3096,7 +3096,8 @@ def to_csv(
             compression mode is 'infer' and `path_or_buf` is path-like, then
             detect compression mode from the following extensions: '.gz',
             '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given
-            and mode is 'zip' or inferred as 'zip', other entries passed as
+            and mode is one of {'zip', 'gzip', 'bz2'}, or inferred as
+            one of the above, other entries passed as
             additional compression options.
 
             .. versionchanged:: 1.0.0
@@ -3105,6 +3106,12 @@ def to_csv(
                and other entries as additional compression options if
                compression mode is 'zip'.
 
+            .. versionchanged:: 1.1.0
+
+               Passing compression options as keys in dict is
+               supported for compression modes 'gzip' and 'bz2'
+               as well as 'zip'.
+
         quoting : optional constant from csv module
             Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
             then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 0fce8f5382686..ff527de79c387 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -351,8 +351,9 @@ def get_handle(
         'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
         and `filepath_or_buffer` is path-like, then detect compression from the
         following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
-        no compression). If dict and compression mode is 'zip' or inferred as
-        'zip', other entries passed as additional compression options.
+        no compression). If dict and compression mode is one of
+        {'zip', 'gzip', 'bz2'}, or inferred as one of the above,
+        other entries passed as additional compression options.
 
         .. versionchanged:: 1.0.0
 
@@ -359,6 +360,11 @@ def get_handle(
            May now be a dict with key 'method' as compression mode
            and other keys as compression options if compression mode is 'zip'.
 
+        .. versionchanged:: 1.1.0
+
+           Passing compression options as keys in dict is now
+           supported for compression modes 'gzip' and 'bz2' as well as 'zip'.
+
     memory_map : boolean, default False
         See parsers._parser_params for more information.
     is_text : boolean, default True
@@ -394,19 +400,28 @@ def get_handle(
 
     if compression:
 
+        # GH33398 the type ignores here seem related to mypy issue #5382;
+        # it may be possible to remove them once that is resolved.
+
         # GZ Compression
         if compression == "gzip":
             if is_path:
-                f = gzip.open(path_or_buf, mode)
+                f = gzip.open(
+                    path_or_buf, mode, **compression_args  # type: ignore
+                )
             else:
-                f = gzip.GzipFile(fileobj=path_or_buf)
+                f = gzip.GzipFile(
+                    fileobj=path_or_buf, **compression_args  # type: ignore
+                )
 
         # BZ Compression
         elif compression == "bz2":
             if is_path:
-                f = bz2.BZ2File(path_or_buf, mode)
+                f = bz2.BZ2File(
+                    path_or_buf, mode, **compression_args  # type: ignore
+                )
             else:
-                f = bz2.BZ2File(path_or_buf)
+                f = bz2.BZ2File(path_or_buf, **compression_args)  # type: ignore
 
         # ZIP Compression
         elif compression == "zip":
diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py
index 841241d5124e0..59c9bd0a36d3d 100644
--- a/pandas/tests/io/test_compression.py
+++ b/pandas/tests/io/test_compression.py
@@ -143,3 +143,44 @@ def test_with_missing_lzma_runtime():
         """
     )
     subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE)
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        pd.DataFrame(
+            100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+            columns=["X", "Y", "Z"],
+        ),
+        pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
+    ],
+)
+@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
+def test_gzip_compression_level(obj, method):
+    # GH33196
+    with tm.ensure_clean() as path:
+        getattr(obj, method)(path, compression="gzip")
+        compressed_size_default = os.path.getsize(path)
+        getattr(obj, method)(path, compression={"method": "gzip", "compresslevel": 1})
+        compressed_size_fast = os.path.getsize(path)
+        assert compressed_size_default < compressed_size_fast
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        pd.DataFrame(
+            100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+            columns=["X", "Y", "Z"],
+        ),
+        pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
+    ],
+)
+@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
+def test_bzip_compression_level(obj, method):
+    """GH33196 bzip needs file size > 100k to show a size difference between
+    compression levels, so here we just check if the call works when
+    compression is passed as a dict.
+    """
+    with tm.ensure_clean() as path:
+        getattr(obj, method)(path, compression={"method": "bz2", "compresslevel": 1})
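
To make the new option concrete, here is a short end-to-end sketch of what the patch enables. It is illustrative only and not part of the diff: the temporary directory, file names, and the 100,000-row frame are made up for the example, and it assumes a pandas build that already includes this change:

    import os
    import tempfile

    import pandas as pd

    with tempfile.TemporaryDirectory() as tmp:
        default_path = os.path.join(tmp, "default.csv.gz")
        fast_path = os.path.join(tmp, "fast.csv.gz")
        df = pd.DataFrame({"A": range(100_000)})

        # Plain string: default gzip settings (compresslevel=9 in CPython's gzip).
        df.to_csv(default_path, compression="gzip")

        # Dict form enabled by this patch: keys other than 'method' are forwarded
        # to the underlying library, here effectively gzip.open(..., compresslevel=1).
        df.to_csv(fast_path, compression={"method": "gzip", "compresslevel": 1})

        # A lower compression level usually means a larger but faster-to-write file.
        print(os.path.getsize(default_path), os.path.getsize(fast_path))

        # Reading back needs no extra options; inference from the ".gz" suffix applies.
        roundtrip = pd.read_csv(fast_path, index_col=0)
        assert roundtrip["A"].tolist() == df["A"].tolist()

        # The same dict form works for bz2 (and for zip, which already supported it).
        df.to_pickle(
            os.path.join(tmp, "frame.pkl.bz2"),
            compression={"method": "bz2", "compresslevel": 1},
        )

Because the extra keys are passed straight through to the compression library, the same mechanism covers whatever keyword arguments the underlying open call accepts, not just ``compresslevel``.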