Skip to content

BUG/ENH: to_pickle/read_pickle support compression for file objects #35736

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 5, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,7 @@ I/O
- :meth:`to_csv` passes compression arguments for `'gzip'` always to `gzip.GzipFile` (:issue:`28103`)
- :meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue:`35058`)
- :meth:`to_csv` and :meth:`read_csv` did not honor `compression` and `encoding` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`)
- :meth:`to_pickle` and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, and :issue:`29570`)

Plotting
^^^^^^^^
Expand Down
4 changes: 2 additions & 2 deletions pandas/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@


# compression keywords and compression
CompressionDict = Mapping[str, Optional[Union[str, int, bool]]]
CompressionDict = Dict[str, Any]
CompressionOptions = Optional[Union[str, CompressionDict]]


Expand All @@ -138,6 +138,6 @@ class IOargs(Generic[ModeVar, EncodingVar]):

filepath_or_buffer: FileOrBuffer
encoding: EncodingVar
compression: CompressionOptions
compression: CompressionDict
should_close: bool
mode: Union[ModeVar, str]
4 changes: 2 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
Iterable,
Iterator,
List,
Mapping,
Optional,
Sequence,
Set,
Expand All @@ -49,6 +48,7 @@
ArrayLike,
Axes,
Axis,
CompressionOptions,
Dtype,
FilePathOrBuffer,
FrameOrSeriesUnion,
Expand Down Expand Up @@ -2062,7 +2062,7 @@ def to_stata(
variable_labels: Optional[Dict[Label, str]] = None,
version: Optional[int] = 114,
convert_strl: Optional[Sequence[Label]] = None,
compression: Union[str, Mapping[str, str], None] = "infer",
compression: CompressionOptions = "infer",
storage_options: StorageOptions = None,
) -> None:
"""
Expand Down
24 changes: 9 additions & 15 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,11 +205,13 @@ def get_filepath_or_buffer(
"""
filepath_or_buffer = stringify_path(filepath_or_buffer)

# handle compression dict
compression_method, compression = get_compression_method(compression)
compression_method = infer_compression(filepath_or_buffer, compression_method)
compression = dict(compression, method=compression_method)

# bz2 and xz do not write the byte order mark for utf-16 and utf-32
# print a warning when writing such files
compression_method = infer_compression(
filepath_or_buffer, get_compression_method(compression)[0]
)
if (
mode
and "w" in mode
Expand Down Expand Up @@ -238,7 +240,7 @@ def get_filepath_or_buffer(
content_encoding = req.headers.get("Content-Encoding", None)
if content_encoding == "gzip":
# Override compression based on Content-Encoding header
compression = "gzip"
compression = {"method": "gzip"}
reader = BytesIO(req.read())
req.close()
return IOargs(
Expand Down Expand Up @@ -374,11 +376,7 @@ def get_compression_method(
if isinstance(compression, Mapping):
compression_args = dict(compression)
try:
# error: Incompatible types in assignment (expression has type
# "Union[str, int, None]", variable has type "Optional[str]")
compression_method = compression_args.pop( # type: ignore[assignment]
"method"
)
compression_method = compression_args.pop("method")
except KeyError as err:
raise ValueError("If mapping, compression must have key 'method'") from err
else:
Expand Down Expand Up @@ -652,12 +650,8 @@ def __init__(
super().__init__(file, mode, **kwargs_zip) # type: ignore[arg-type]

def write(self, data):
archive_name = self.filename
if self.archive_name is not None:
archive_name = self.archive_name
if archive_name is None:
# ZipFile needs a non-empty string
archive_name = "zip"
# ZipFile needs a non-empty string
archive_name = self.archive_name or self.filename or "zip"
super().writestr(archive_name, data)

@property
Expand Down
15 changes: 4 additions & 11 deletions pandas/io/formats/csvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,7 @@
)
from pandas.core.dtypes.missing import notna

from pandas.io.common import (
get_compression_method,
get_filepath_or_buffer,
get_handle,
infer_compression,
)
from pandas.io.common import get_filepath_or_buffer, get_handle


class CSVFormatter:
Expand Down Expand Up @@ -60,17 +55,15 @@ def __init__(
if path_or_buf is None:
path_or_buf = StringIO()

# Extract compression mode as given, if dict
compression, self.compression_args = get_compression_method(compression)
self.compression = infer_compression(path_or_buf, compression)

ioargs = get_filepath_or_buffer(
path_or_buf,
encoding=encoding,
compression=self.compression,
compression=compression,
mode=mode,
storage_options=storage_options,
)
self.compression = ioargs.compression.pop("method")
self.compression_args = ioargs.compression
self.path_or_buf = ioargs.filepath_or_buffer
self.should_close = ioargs.should_close
self.mode = ioargs.mode
Expand Down
11 changes: 2 additions & 9 deletions pandas/io/json/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,7 @@
from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.reshape.concat import concat

from pandas.io.common import (
get_compression_method,
get_filepath_or_buffer,
get_handle,
infer_compression,
)
from pandas.io.common import get_compression_method, get_filepath_or_buffer, get_handle
from pandas.io.json._normalize import convert_to_line_delimits
from pandas.io.json._table_schema import build_table_schema, parse_table_schema
from pandas.io.parsers import _validate_integer
Expand Down Expand Up @@ -66,6 +61,7 @@ def to_json(
)
path_or_buf = ioargs.filepath_or_buffer
should_close = ioargs.should_close
compression = ioargs.compression

if lines and orient != "records":
raise ValueError("'lines' keyword only valid when 'orient' is records")
Expand Down Expand Up @@ -616,9 +612,6 @@ def read_json(
if encoding is None:
encoding = "utf-8"

compression_method, compression = get_compression_method(compression)
compression_method = infer_compression(path_or_buf, compression_method)
compression = dict(compression, method=compression_method)
ioargs = get_filepath_or_buffer(
path_or_buf,
encoding=encoding,
Expand Down
13 changes: 5 additions & 8 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,7 @@
from pandas.core.series import Series
from pandas.core.tools import datetimes as tools

from pandas.io.common import (
get_filepath_or_buffer,
get_handle,
infer_compression,
validate_header_arg,
)
from pandas.io.common import get_filepath_or_buffer, get_handle, validate_header_arg
from pandas.io.date_converters import generic_parser

# BOM character (byte order mark)
Expand Down Expand Up @@ -424,9 +419,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
if encoding is not None:
encoding = re.sub("_", "-", encoding).lower()
kwds["encoding"] = encoding

compression = kwds.get("compression", "infer")
compression = infer_compression(filepath_or_buffer, compression)

# TODO: get_filepath_or_buffer could return
# Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile]
Expand Down Expand Up @@ -1976,6 +1969,10 @@ def __init__(self, src, **kwds):

encoding = kwds.get("encoding")

# parsers.TextReader doesn't support compression dicts
if isinstance(kwds.get("compression"), dict):
kwds["compression"] = kwds["compression"]["method"]

if kwds.get("compression") is None and encoding:
if isinstance(src, str):
src = open(src, "rb")
Expand Down
10 changes: 2 additions & 8 deletions pandas/io/pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,8 @@ def to_pickle(
mode="wb",
storage_options=storage_options,
)
compression = ioargs.compression
if not isinstance(ioargs.filepath_or_buffer, str) and compression == "infer":
compression = None
f, fh = get_handle(
ioargs.filepath_or_buffer, "wb", compression=compression, is_text=False
ioargs.filepath_or_buffer, "wb", compression=ioargs.compression, is_text=False
)
if protocol < 0:
protocol = pickle.HIGHEST_PROTOCOL
Expand Down Expand Up @@ -196,11 +193,8 @@ def read_pickle(
ioargs = get_filepath_or_buffer(
filepath_or_buffer, compression=compression, storage_options=storage_options
)
compression = ioargs.compression
if not isinstance(ioargs.filepath_or_buffer, str) and compression == "infer":
compression = None
f, fh = get_handle(
ioargs.filepath_or_buffer, "rb", compression=compression, is_text=False
ioargs.filepath_or_buffer, "rb", compression=ioargs.compression, is_text=False
)

# 1) try standard library Pickle
Expand Down
30 changes: 5 additions & 25 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,7 @@
from pathlib import Path
import struct
import sys
from typing import (
Any,
AnyStr,
BinaryIO,
Dict,
List,
Mapping,
Optional,
Sequence,
Tuple,
Union,
)
from typing import Any, AnyStr, BinaryIO, Dict, List, Optional, Sequence, Tuple, Union
import warnings

from dateutil.relativedelta import relativedelta
Expand Down Expand Up @@ -58,13 +47,7 @@
from pandas.core.indexes.base import Index
from pandas.core.series import Series

from pandas.io.common import (
get_compression_method,
get_filepath_or_buffer,
get_handle,
infer_compression,
stringify_path,
)
from pandas.io.common import get_filepath_or_buffer, get_handle, stringify_path

_version_error = (
"Version of given Stata file is {version}. pandas supports importing "
Expand Down Expand Up @@ -1976,9 +1959,6 @@ def _open_file_binary_write(
return fname, False, None # type: ignore[return-value]
elif isinstance(fname, (str, Path)):
# Extract compression mode as given, if dict
compression_typ, compression_args = get_compression_method(compression)
compression_typ = infer_compression(fname, compression_typ)
compression = dict(compression_args, method=compression_typ)
ioargs = get_filepath_or_buffer(
fname, mode="wb", compression=compression, storage_options=storage_options
)
Expand Down Expand Up @@ -2235,7 +2215,7 @@ def __init__(
time_stamp: Optional[datetime.datetime] = None,
data_label: Optional[str] = None,
variable_labels: Optional[Dict[Label, str]] = None,
compression: Union[str, Mapping[str, str], None] = "infer",
compression: CompressionOptions = "infer",
storage_options: StorageOptions = None,
):
super().__init__()
Expand Down Expand Up @@ -3118,7 +3098,7 @@ def __init__(
data_label: Optional[str] = None,
variable_labels: Optional[Dict[Label, str]] = None,
convert_strl: Optional[Sequence[Label]] = None,
compression: Union[str, Mapping[str, str], None] = "infer",
compression: CompressionOptions = "infer",
storage_options: StorageOptions = None,
):
# Copy to new list since convert_strl might be modified later
Expand Down Expand Up @@ -3523,7 +3503,7 @@ def __init__(
variable_labels: Optional[Dict[Label, str]] = None,
convert_strl: Optional[Sequence[Label]] = None,
version: Optional[int] = None,
compression: Union[str, Mapping[str, str], None] = "infer",
compression: CompressionOptions = "infer",
storage_options: StorageOptions = None,
):
if version is None:
Expand Down
29 changes: 29 additions & 0 deletions pandas/tests/io/test_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
import datetime
import glob
import gzip
import io
import os
from pathlib import Path
import pickle
import shutil
from warnings import catch_warnings, simplefilter
Expand Down Expand Up @@ -486,3 +488,30 @@ def test_read_pickle_with_subclass():

tm.assert_series_equal(result[0], expected[0])
assert isinstance(result[1], MyTz)


def test_pickle_binary_object_compression(compression):
"""
Read/write from binary file-objects w/wo compression.

GH 26237, GH 29054, and GH 29570
"""
df = tm.makeDataFrame()

# reference for compression
with tm.ensure_clean() as path:
df.to_pickle(path, compression=compression)
reference = Path(path).read_bytes()

# write
buffer = io.BytesIO()
df.to_pickle(buffer, compression=compression)
buffer.seek(0)

# gzip and zip safe the filename: cannot compare the compressed content
assert buffer.getvalue() == reference or compression in ("gzip", "zip")

# read
read_df = pd.read_pickle(buffer, compression=compression)
buffer.seek(0)
tm.assert_frame_equal(df, read_df)