Skip to content

Commit 7456fc5

Browse files
authored
ENH: Support passing compression args to gzip and bz2 (pandas-dev#33398)
1 parent b9cf9be commit 7456fc5

File tree

5 files changed

+97
-9
lines changed

5 files changed

+97
-9
lines changed

doc/source/user_guide/io.rst

+21-2
Original file line numberDiff line numberDiff line change
@@ -285,14 +285,18 @@ chunksize : int, default ``None``
285285
Quoting, compression, and file format
286286
+++++++++++++++++++++++++++++++++++++
287287

288-
compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``}, default ``'infer'``
288+
compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``, ``dict``}, default ``'infer'``
289289
For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
290290
bz2, zip, or xz if filepath_or_buffer is a string ending in '.gz', '.bz2',
291291
'.zip', or '.xz', respectively, and no decompression otherwise. If using 'zip',
292292
the ZIP file must contain only one data file to be read in.
293-
Set to ``None`` for no decompression.
293+
Set to ``None`` for no decompression. Can also be a dict with key ``'method'``
294+
set to one of {``'zip'``, ``'gzip'``, ``'bz2'``}, and other keys set to
295+
compression settings. As an example, the following could be passed for
296+
faster compression: ``compression={'method': 'gzip', 'compresslevel': 1}``.
294297

295298
.. versionchanged:: 0.24.0 'infer' option added and set to default.
299+
.. versionchanged:: 1.1.0 dict option extended to support ``gzip`` and ``bz2``.
296300
thousands : str, default ``None``
297301
Thousands separator.
298302
decimal : str, default ``'.'``
@@ -3347,6 +3351,12 @@ The compression type can be an explicit parameter or be inferred from the file e
33473351
If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``, or
33483352
``'.xz'``, respectively.
33493353

3354+
The compression parameter can also be a ``dict`` in order to pass options to the
3355+
compression protocol. It must have a ``'method'`` key set to the name
3356+
of the compression protocol, which must be one of
3357+
{``'zip'``, ``'gzip'``, ``'bz2'``}. All other key-value pairs are passed to
3358+
the underlying compression library.
3359+
33503360
.. ipython:: python
33513361
33523362
df = pd.DataFrame({
@@ -3383,6 +3393,15 @@ The default is to 'infer':
33833393
rt = pd.read_pickle("s1.pkl.bz2")
33843394
rt
33853395
3396+
Passing options to the compression protocol in order to speed up compression:
3397+
3398+
.. ipython:: python
3399+
3400+
df.to_pickle(
3401+
"data.pkl.gz",
3402+
compression={"method": "gzip", "compresslevel": 1}
3403+
)
3404+
33863405
.. ipython:: python
33873406
:suppress:
33883407

doc/source/whatsnew/v1.1.0.rst

+6
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,12 @@ Other enhancements
9191
- The :meth:`DataFrame.to_feather` method now supports additional keyword
9292
arguments (e.g. to set the compression) that are added in pyarrow 0.17
9393
(:issue:`33422`).
94+
- :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_pickle`,
95+
and :meth:`DataFrame.to_json` now support passing a dict of
96+
compression arguments when using the ``gzip`` and ``bz2`` protocols.
97+
This can be used to set a custom compression level, e.g.,
98+
``df.to_csv(path, compression={'method': 'gzip', 'compresslevel': 1})``
99+
(:issue:`33196`).
94100

95101
.. ---------------------------------------------------------------------------
96102

pandas/core/generic.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -3096,7 +3096,8 @@ def to_csv(
30963096
compression mode is 'infer' and `path_or_buf` is path-like, then
30973097
detect compression mode from the following extensions: '.gz',
30983098
'.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given
3099-
and mode is 'zip' or inferred as 'zip', other entries passed as
3099+
and mode is one of {'zip', 'gzip', 'bz2'}, or inferred as
3100+
one of the above, other entries passed as
31003101
additional compression options.
31013102
31023103
.. versionchanged:: 1.0.0
@@ -3105,6 +3106,12 @@ def to_csv(
31053106
and other entries as additional compression options if
31063107
compression mode is 'zip'.
31073108
3109+
.. versionchanged:: 1.1.0
3110+
3111+
Passing compression options as keys in dict is
3112+
supported for compression modes 'gzip' and 'bz2'
3113+
as well as 'zip'.
3114+
31083115
quoting : optional constant from csv module
31093116
Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
31103117
then floats are converted to strings and thus csv.QUOTE_NONNUMERIC

pandas/io/common.py

+21-6
Original file line numberDiff line numberDiff line change
@@ -351,15 +351,21 @@ def get_handle(
351351
'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
352352
and `filepath_or_buffer` is path-like, then detect compression from
353353
the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
354-
no compression). If dict and compression mode is 'zip' or inferred as
355-
'zip', other entries passed as additional compression options.
354+
no compression). If dict and compression mode is one of
355+
{'zip', 'gzip', 'bz2'}, or inferred as one of the above,
356+
other entries passed as additional compression options.
356357
357358
.. versionchanged:: 1.0.0
358359
359360
May now be a dict with key 'method' as compression mode
360361
and other keys as compression options if compression
361362
mode is 'zip'.
362363
364+
.. versionchanged:: 1.1.0
365+
366+
Passing compression options as keys in dict is now
367+
supported for compression modes 'gzip' and 'bz2' as well as 'zip'.
368+
363369
memory_map : boolean, default False
364370
See parsers._parser_params for more information.
365371
is_text : boolean, default True
@@ -394,19 +400,28 @@ def get_handle(
394400

395401
if compression:
396402

403+
# GH33398 the type ignores here seem related to mypy issue #5382;
404+
# it may be possible to remove them once that is resolved.
405+
397406
# GZ Compression
398407
if compression == "gzip":
399408
if is_path:
400-
f = gzip.open(path_or_buf, mode)
409+
f = gzip.open(
410+
path_or_buf, mode, **compression_args # type: ignore
411+
)
401412
else:
402-
f = gzip.GzipFile(fileobj=path_or_buf)
413+
f = gzip.GzipFile(
414+
fileobj=path_or_buf, **compression_args # type: ignore
415+
)
403416

404417
# BZ Compression
405418
elif compression == "bz2":
406419
if is_path:
407-
f = bz2.BZ2File(path_or_buf, mode)
420+
f = bz2.BZ2File(
421+
path_or_buf, mode, **compression_args # type: ignore
422+
)
408423
else:
409-
f = bz2.BZ2File(path_or_buf)
424+
f = bz2.BZ2File(path_or_buf, **compression_args) # type: ignore
410425

411426
# ZIP Compression
412427
elif compression == "zip":

pandas/tests/io/test_compression.py

+41
Original file line numberDiff line numberDiff line change
@@ -143,3 +143,44 @@ def test_with_missing_lzma_runtime():
143143
"""
144144
)
145145
subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE)
146+
147+
148+
@pytest.mark.parametrize(
149+
"obj",
150+
[
151+
pd.DataFrame(
152+
100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
153+
columns=["X", "Y", "Z"],
154+
),
155+
pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
156+
],
157+
)
158+
@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
159+
def test_gzip_compression_level(obj, method):
160+
# GH33196
161+
with tm.ensure_clean() as path:
162+
getattr(obj, method)(path, compression="gzip")
163+
compressed_size_default = os.path.getsize(path)
164+
getattr(obj, method)(path, compression={"method": "gzip", "compresslevel": 1})
165+
compressed_size_fast = os.path.getsize(path)
166+
assert compressed_size_default < compressed_size_fast
167+
168+
169+
@pytest.mark.parametrize(
170+
"obj",
171+
[
172+
pd.DataFrame(
173+
100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
174+
columns=["X", "Y", "Z"],
175+
),
176+
pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
177+
],
178+
)
179+
@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
180+
def test_bzip_compression_level(obj, method):
181+
"""GH33196 bzip needs file size > 100k to show a size difference between
182+
compression levels, so here we just check if the call works when
183+
compression is passed as a dict.
184+
"""
185+
with tm.ensure_clean() as path:
186+
getattr(obj, method)(path, compression={"method": "bz2", "compresslevel": 1})

0 commit comments

Comments
 (0)