Skip to content

Commit 59febbd

Browse files
authored
BUG/ENH: consistent gzip compression arguments (#35645)
1 parent ed23eb8 commit 59febbd

File tree

10 files changed

+118
-50
lines changed

10 files changed

+118
-50
lines changed

doc/source/user_guide/io.rst

+7-4
Original file line numberDiff line numberDiff line change
@@ -287,16 +287,19 @@ Quoting, compression, and file format
287287

288288
compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``, ``dict``}, default ``'infer'``
289289
For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
290-
bz2, zip, or xz if filepath_or_buffer is a string ending in '.gz', '.bz2',
290+
bz2, zip, or xz if ``filepath_or_buffer`` is path-like ending in '.gz', '.bz2',
291291
'.zip', or '.xz', respectively, and no decompression otherwise. If using 'zip',
292292
the ZIP file must contain only one data file to be read in.
293293
Set to ``None`` for no decompression. Can also be a dict with key ``'method'``
294-
set to one of {``'zip'``, ``'gzip'``, ``'bz2'``}, and other keys set to
295-
compression settings. As an example, the following could be passed for
296-
faster compression: ``compression={'method': 'gzip', 'compresslevel': 1}``.
294+
set to one of {``'zip'``, ``'gzip'``, ``'bz2'``} and other key-value pairs are
295+
forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, or ``bz2.BZ2File``.
296+
As an example, the following could be passed for faster compression and to
297+
create a reproducible gzip archive:
298+
``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
297299

298300
.. versionchanged:: 0.24.0 'infer' option added and set to default.
299301
.. versionchanged:: 1.1.0 dict option extended to support ``gzip`` and ``bz2``.
302+
.. versionchanged:: 1.2.0 Previous versions forwarded dict entries for 'gzip' to `gzip.open`.
300303
thousands : str, default ``None``
301304
Thousands separator.
302305
decimal : str, default ``'.'``

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,7 @@ I/O
235235
- Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`)
236236
- In :meth:`read_csv` `float_precision='round_trip'` now handles `decimal` and `thousands` parameters (:issue:`35365`)
237237
- :meth:`to_pickle` and :meth:`read_pickle` were closing user-provided file objects (:issue:`35679`)
238+
- :meth:`to_csv` now always passes compression arguments for `'gzip'` to `gzip.GzipFile` (:issue:`28103`)
238239

239240
Plotting
240241
^^^^^^^^

pandas/_typing.py

+5
Original file line numberDiff line numberDiff line change
@@ -109,3 +109,8 @@
109109

110110
# for arbitrary kwargs passed during reading/writing files
111111
StorageOptions = Optional[Dict[str, Any]]
112+
113+
114+
# compression keywords and compression
115+
CompressionDict = Mapping[str, Optional[Union[str, int, bool]]]
116+
CompressionOptions = Optional[Union[str, CompressionDict]]

pandas/core/generic.py

+10-3
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
from pandas._libs.tslibs import Tick, Timestamp, to_offset
3636
from pandas._typing import (
3737
Axis,
38+
CompressionOptions,
3839
FilePathOrBuffer,
3940
FrameOrSeries,
4041
JSONSerializable,
@@ -2058,7 +2059,7 @@ def to_json(
20582059
date_unit: str = "ms",
20592060
default_handler: Optional[Callable[[Any], JSONSerializable]] = None,
20602061
lines: bool_t = False,
2061-
compression: Optional[str] = "infer",
2062+
compression: CompressionOptions = "infer",
20622063
index: bool_t = True,
20632064
indent: Optional[int] = None,
20642065
storage_options: StorageOptions = None,
@@ -2646,7 +2647,7 @@ def to_sql(
26462647
def to_pickle(
26472648
self,
26482649
path,
2649-
compression: Optional[str] = "infer",
2650+
compression: CompressionOptions = "infer",
26502651
protocol: int = pickle.HIGHEST_PROTOCOL,
26512652
storage_options: StorageOptions = None,
26522653
) -> None:
@@ -3053,7 +3054,7 @@ def to_csv(
30533054
index_label: Optional[Union[bool_t, str, Sequence[Label]]] = None,
30543055
mode: str = "w",
30553056
encoding: Optional[str] = None,
3056-
compression: Optional[Union[str, Mapping[str, str]]] = "infer",
3057+
compression: CompressionOptions = "infer",
30573058
quoting: Optional[int] = None,
30583059
quotechar: str = '"',
30593060
line_terminator: Optional[str] = None,
@@ -3144,6 +3145,12 @@ def to_csv(
31443145
31453146
Compression is supported for binary file objects.
31463147
3148+
.. versionchanged:: 1.2.0
3149+
3150+
Previous versions forwarded dict entries for 'gzip' to
3151+
`gzip.open` instead of `gzip.GzipFile` which prevented
3152+
setting `mtime`.
3153+
31473154
quoting : optional constant from csv module
31483155
Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
31493156
then floats are converted to strings and thus csv.QUOTE_NONNUMERIC

pandas/io/common.py

+17-14
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
Optional,
1919
Tuple,
2020
Type,
21-
Union,
2221
)
2322
from urllib.parse import (
2423
urljoin,
@@ -29,7 +28,12 @@
2928
)
3029
import zipfile
3130

32-
from pandas._typing import FilePathOrBuffer, StorageOptions
31+
from pandas._typing import (
32+
CompressionDict,
33+
CompressionOptions,
34+
FilePathOrBuffer,
35+
StorageOptions,
36+
)
3337
from pandas.compat import _get_lzma_file, _import_lzma
3438
from pandas.compat._optional import import_optional_dependency
3539

@@ -160,7 +164,7 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool:
160164
def get_filepath_or_buffer(
161165
filepath_or_buffer: FilePathOrBuffer,
162166
encoding: Optional[str] = None,
163-
compression: Optional[str] = None,
167+
compression: CompressionOptions = None,
164168
mode: Optional[str] = None,
165169
storage_options: StorageOptions = None,
166170
):
@@ -188,7 +192,7 @@ def get_filepath_or_buffer(
188192
189193
Returns
190194
-------
191-
Tuple[FilePathOrBuffer, str, str, bool]
195+
Tuple[FilePathOrBuffer, str, CompressionOptions, bool]
192196
Tuple containing the filepath or buffer, the encoding, the compression
193197
and should_close.
194198
"""
@@ -291,8 +295,8 @@ def file_path_to_url(path: str) -> str:
291295

292296

293297
def get_compression_method(
294-
compression: Optional[Union[str, Mapping[str, Any]]]
295-
) -> Tuple[Optional[str], Dict[str, Any]]:
298+
compression: CompressionOptions,
299+
) -> Tuple[Optional[str], CompressionDict]:
296300
"""
297301
Simplifies a compression argument to a compression method string and
298302
a mapping containing additional arguments.
@@ -316,7 +320,7 @@ def get_compression_method(
316320
if isinstance(compression, Mapping):
317321
compression_args = dict(compression)
318322
try:
319-
compression_method = compression_args.pop("method")
323+
compression_method = compression_args.pop("method") # type: ignore
320324
except KeyError as err:
321325
raise ValueError("If mapping, compression must have key 'method'") from err
322326
else:
@@ -383,7 +387,7 @@ def get_handle(
383387
path_or_buf,
384388
mode: str,
385389
encoding=None,
386-
compression: Optional[Union[str, Mapping[str, Any]]] = None,
390+
compression: CompressionOptions = None,
387391
memory_map: bool = False,
388392
is_text: bool = True,
389393
errors=None,
@@ -464,16 +468,13 @@ def get_handle(
464468
# GZ Compression
465469
if compression == "gzip":
466470
if is_path:
467-
f = gzip.open(path_or_buf, mode, **compression_args)
471+
f = gzip.GzipFile(filename=path_or_buf, mode=mode, **compression_args)
468472
else:
469473
f = gzip.GzipFile(fileobj=path_or_buf, mode=mode, **compression_args)
470474

471475
# BZ Compression
472476
elif compression == "bz2":
473-
if is_path:
474-
f = bz2.BZ2File(path_or_buf, mode, **compression_args)
475-
else:
476-
f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args)
477+
f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args)
477478

478479
# ZIP Compression
479480
elif compression == "zip":
@@ -577,7 +578,9 @@ def __init__(
577578
if mode in ["wb", "rb"]:
578579
mode = mode.replace("b", "")
579580
self.archive_name = archive_name
580-
super().__init__(file, mode, zipfile.ZIP_DEFLATED, **kwargs)
581+
kwargs_zip: Dict[str, Any] = {"compression": zipfile.ZIP_DEFLATED}
582+
kwargs_zip.update(kwargs)
583+
super().__init__(file, mode, **kwargs_zip)
581584

582585
def write(self, data):
583586
archive_name = self.filename

pandas/io/formats/csvs.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@
55
import csv as csvlib
66
from io import StringIO, TextIOWrapper
77
import os
8-
from typing import Hashable, List, Mapping, Optional, Sequence, Union
8+
from typing import Hashable, List, Optional, Sequence, Union
99
import warnings
1010

1111
import numpy as np
1212

1313
from pandas._libs import writers as libwriters
14-
from pandas._typing import FilePathOrBuffer, StorageOptions
14+
from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions
1515

1616
from pandas.core.dtypes.generic import (
1717
ABCDatetimeIndex,
@@ -44,7 +44,7 @@ def __init__(
4444
mode: str = "w",
4545
encoding: Optional[str] = None,
4646
errors: str = "strict",
47-
compression: Union[str, Mapping[str, str], None] = "infer",
47+
compression: CompressionOptions = "infer",
4848
quoting: Optional[int] = None,
4949
line_terminator="\n",
5050
chunksize: Optional[int] = None,

pandas/io/json/_json.py

+22-9
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,13 @@
33
from io import BytesIO, StringIO
44
from itertools import islice
55
import os
6-
from typing import Any, Callable, Optional, Type
6+
from typing import IO, Any, Callable, List, Optional, Type
77

88
import numpy as np
99

1010
import pandas._libs.json as json
1111
from pandas._libs.tslibs import iNaT
12-
from pandas._typing import JSONSerializable, StorageOptions
12+
from pandas._typing import CompressionOptions, JSONSerializable, StorageOptions
1313
from pandas.errors import AbstractMethodError
1414
from pandas.util._decorators import deprecate_kwarg, deprecate_nonkeyword_arguments
1515

@@ -19,7 +19,12 @@
1919
from pandas.core.construction import create_series_with_explicit_dtype
2020
from pandas.core.reshape.concat import concat
2121

22-
from pandas.io.common import get_filepath_or_buffer, get_handle, infer_compression
22+
from pandas.io.common import (
23+
get_compression_method,
24+
get_filepath_or_buffer,
25+
get_handle,
26+
infer_compression,
27+
)
2328
from pandas.io.json._normalize import convert_to_line_delimits
2429
from pandas.io.json._table_schema import build_table_schema, parse_table_schema
2530
from pandas.io.parsers import _validate_integer
@@ -41,7 +46,7 @@ def to_json(
4146
date_unit: str = "ms",
4247
default_handler: Optional[Callable[[Any], JSONSerializable]] = None,
4348
lines: bool = False,
44-
compression: Optional[str] = "infer",
49+
compression: CompressionOptions = "infer",
4550
index: bool = True,
4651
indent: int = 0,
4752
storage_options: StorageOptions = None,
@@ -369,7 +374,7 @@ def read_json(
369374
encoding=None,
370375
lines: bool = False,
371376
chunksize: Optional[int] = None,
372-
compression="infer",
377+
compression: CompressionOptions = "infer",
373378
nrows: Optional[int] = None,
374379
storage_options: StorageOptions = None,
375380
):
@@ -607,7 +612,9 @@ def read_json(
607612
if encoding is None:
608613
encoding = "utf-8"
609614

610-
compression = infer_compression(path_or_buf, compression)
615+
compression_method, compression = get_compression_method(compression)
616+
compression_method = infer_compression(path_or_buf, compression_method)
617+
compression = dict(compression, method=compression_method)
611618
filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
612619
path_or_buf,
613620
encoding=encoding,
@@ -667,10 +674,13 @@ def __init__(
667674
encoding,
668675
lines: bool,
669676
chunksize: Optional[int],
670-
compression,
677+
compression: CompressionOptions,
671678
nrows: Optional[int],
672679
):
673680

681+
compression_method, compression = get_compression_method(compression)
682+
compression = dict(compression, method=compression_method)
683+
674684
self.orient = orient
675685
self.typ = typ
676686
self.dtype = dtype
@@ -687,6 +697,7 @@ def __init__(
687697
self.nrows_seen = 0
688698
self.should_close = False
689699
self.nrows = nrows
700+
self.file_handles: List[IO] = []
690701

691702
if self.chunksize is not None:
692703
self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
@@ -735,8 +746,8 @@ def _get_data_from_filepath(self, filepath_or_buffer):
735746
except (TypeError, ValueError):
736747
pass
737748

738-
if exists or self.compression is not None:
739-
data, _ = get_handle(
749+
if exists or self.compression["method"] is not None:
750+
data, self.file_handles = get_handle(
740751
filepath_or_buffer,
741752
"r",
742753
encoding=self.encoding,
@@ -816,6 +827,8 @@ def close(self):
816827
self.open_stream.close()
817828
except (IOError, AttributeError):
818829
pass
830+
for file_handle in self.file_handles:
831+
file_handle.close()
819832

820833
def __next__(self):
821834
if self.nrows:

pandas/io/pickle.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
""" pickle compat """
22
import pickle
3-
from typing import Any, Optional
3+
from typing import Any
44
import warnings
55

6-
from pandas._typing import FilePathOrBuffer, StorageOptions
6+
from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions
77
from pandas.compat import pickle_compat as pc
88

99
from pandas.io.common import get_filepath_or_buffer, get_handle
@@ -12,7 +12,7 @@
1212
def to_pickle(
1313
obj: Any,
1414
filepath_or_buffer: FilePathOrBuffer,
15-
compression: Optional[str] = "infer",
15+
compression: CompressionOptions = "infer",
1616
protocol: int = pickle.HIGHEST_PROTOCOL,
1717
storage_options: StorageOptions = None,
1818
):
@@ -114,7 +114,7 @@ def to_pickle(
114114

115115
def read_pickle(
116116
filepath_or_buffer: FilePathOrBuffer,
117-
compression: Optional[str] = "infer",
117+
compression: CompressionOptions = "infer",
118118
storage_options: StorageOptions = None,
119119
):
120120
"""

pandas/io/stata.py

+6-13
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535

3636
from pandas._libs.lib import infer_dtype
3737
from pandas._libs.writers import max_len_string_array
38-
from pandas._typing import FilePathOrBuffer, Label, StorageOptions
38+
from pandas._typing import CompressionOptions, FilePathOrBuffer, Label, StorageOptions
3939
from pandas.util._decorators import Appender
4040

4141
from pandas.core.dtypes.common import (
@@ -1938,9 +1938,9 @@ def read_stata(
19381938

19391939
def _open_file_binary_write(
19401940
fname: FilePathOrBuffer,
1941-
compression: Union[str, Mapping[str, str], None],
1941+
compression: CompressionOptions,
19421942
storage_options: StorageOptions = None,
1943-
) -> Tuple[BinaryIO, bool, Optional[Union[str, Mapping[str, str]]]]:
1943+
) -> Tuple[BinaryIO, bool, CompressionOptions]:
19441944
"""
19451945
Open a binary file or no-op if file-like.
19461946
@@ -1978,17 +1978,10 @@ def _open_file_binary_write(
19781978
# Extract compression mode as given, if dict
19791979
compression_typ, compression_args = get_compression_method(compression)
19801980
compression_typ = infer_compression(fname, compression_typ)
1981-
path_or_buf, _, compression_typ, _ = get_filepath_or_buffer(
1982-
fname,
1983-
mode="wb",
1984-
compression=compression_typ,
1985-
storage_options=storage_options,
1981+
compression = dict(compression_args, method=compression_typ)
1982+
path_or_buf, _, compression, _ = get_filepath_or_buffer(
1983+
fname, mode="wb", compression=compression, storage_options=storage_options,
19861984
)
1987-
if compression_typ is not None:
1988-
compression = compression_args
1989-
compression["method"] = compression_typ
1990-
else:
1991-
compression = None
19921985
f, _ = get_handle(path_or_buf, "wb", compression=compression, is_text=False)
19931986
return f, True, compression
19941987
else:

0 commit comments

Comments
 (0)