add compression support for 'read_pickle' and 'to_pickle' #13317
File 1 of 3

```diff
@@ -14,6 +14,7 @@
 try:
     from s3fs import S3File
     need_text_wrapping = (BytesIO, S3File)
 except ImportError:
     need_text_wrapping = (BytesIO,)
```

> **Review comment:** pls turn off whitespace changes, they will not pass linting
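For context, `need_text_wrapping` lists the byte-stream types that `_get_handle` later wraps in `io.TextIOWrapper`, so that text consumers (csv, json) receive `str` rather than bytes. A minimal sketch of that wrapping, independent of pandas:

```python
import io

raw = io.BytesIO(b"a,b\n1,2\n")                  # a byte stream, e.g. an S3 object or a decompressed file
text = io.TextIOWrapper(raw, encoding="utf-8")   # decodes bytes to str on the fly
print(text.read())                               # "a,b\n1,2\n" as str, usable by text parsers
```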
```diff
@@ -28,20 +29,21 @@
 try:
     import pathlib
     _PATHLIB_INSTALLED = True
 except ImportError:
     _PATHLIB_INSTALLED = False


 try:
     from py.path import local as LocalPath
     _PY_PATH_INSTALLED = True
 except:
     _PY_PATH_INSTALLED = False


 if compat.PY3:
     from urllib.request import urlopen, pathname2url
     _urlopen = urlopen
     from urllib.parse import urlparse as parse_url
     from urllib.parse import (uses_relative, uses_netloc, uses_params,
@@ -58,13 +60,13 @@
     from contextlib import contextmanager, closing  # noqa
     from functools import wraps  # noqa

    # @wraps(_urlopen)
    @contextmanager
    def urlopen(*args, **kwargs):
        with closing(_urlopen(*args, **kwargs)) as f:
            yield f


 _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
 _VALID_URLS.discard('')
@@ -75,6 +77,7 @@ class ParserError(ValueError):
     """
     pass


 # gh-12665: Alias for now and remove later.
 CParserError = ParserError
@@ -109,12 +112,14 @@ class BaseIterator(object):
     """Subclass this and provide a "__next__()" method to obtain an iterator.
     Useful only when the object being iterated is non-reusable (e.g. OK for a
     parser, not for an in-memory table, yes for its iterator)."""

     def __iter__(self):
         return self

     def __next__(self):
         raise AbstractMethodError(self)


 if not compat.PY3:
     BaseIterator.next = lambda self: self.__next__()
```
```diff
@@ -302,7 +307,7 @@ def _infer_compression(filepath_or_buffer, compression):


 def _get_handle(path_or_buf, mode, encoding=None, compression=None,
-                memory_map=False):
+                memory_map=False, is_text=True):
     """
     Get file handle for given path/buffer and mode.
@@ -317,7 +322,9 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
         Supported compression protocols are gzip, bz2, zip, and xz
     memory_map : boolean, default False
         See parsers._parser_params for more information.
+    is_text : boolean, default True
+        whether file/buffer is in text format (csv, json, etc.), or in binary
+        mode (pickle, etc.)

     Returns
     -------
     f : file-like
```
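A sketch of how a caller would use the new flag, reusing the internal names shown in this diff (per the PR, `_get_handle` returns the handle plus the list of everything it opened):

```python
from pandas.io.common import _get_handle, _infer_compression

compression = _infer_compression('out.pkl.gz', 'infer')   # -> 'gzip'
f, handles = _get_handle('out.pkl.gz', 'wb',
                         compression=compression,
                         is_text=False)    # binary: skip the TextIOWrapper step
try:
    f.write(b'payload')                    # bytes in, bytes out
finally:
    for h in handles:                      # close everything _get_handle opened
        h.close()
```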
```diff
@@ -391,13 +398,17 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
         elif encoding:
             # Python 3 and encoding
             f = open(path_or_buf, mode, encoding=encoding)
-        else:
+        elif is_text:
             # Python 3 and no explicit encoding
             f = open(path_or_buf, mode, errors='replace')
+        else:
+            # Python 3 and binary mode
+            f = open(path_or_buf, mode)
         handles.append(f)

     # in Python 3, convert BytesIO or fileobjects passed with an encoding
-    if compat.PY3 and (compression or isinstance(f, need_text_wrapping)):
+    if compat.PY3 and is_text and\
+            (compression or isinstance(f, need_text_wrapping)):
         from io import TextIOWrapper
         f = TextIOWrapper(f, encoding=encoding)
         handles.append(f)
```

> **Review comment:** Is there an opportunity to simplify things? What is the relationship between [...]
>
> **Reply:** maybe we can split [...]
>
> **Reply:** I opened #15008: would love if you could migrate your above comment to that issue. This way we can keep this pull request focussed and minimalist.
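The thread above floats splitting `_get_handle` into separate text and binary entry points (tracked in #15008). One hypothetical shape for such a split; these helper names do not exist in pandas and the bodies are deliberately simplified:

```python
import bz2
import gzip
import io

def open_binary(path, mode, compression=None):
    # open the raw (possibly compressed) byte stream
    if compression == 'gzip':
        return gzip.open(path, mode)
    if compression == 'bz2':
        return bz2.BZ2File(path, mode)
    return open(path, mode)

def open_text(path, mode='r', encoding=None, compression=None):
    # open bytes first, then decode them for text consumers
    f = open_binary(path, mode + 'b', compression)
    return io.TextIOWrapper(f, encoding=encoding)
```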
```diff
@@ -454,7 +465,6 @@ def __next__(self):


 class UTF8Recoder(BaseIterator):

     """
     Iterator that reads an encoded stream and reencodes the input to UTF-8
     """
@@ -477,6 +487,7 @@ def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
         # ignore encoding
         return csv.reader(f, dialect=dialect, **kwds)


     def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
         return csv.writer(f, dialect=dialect, **kwds)
 else:
@@ -498,6 +509,7 @@ def __next__(self):
             row = next(self.reader)
             return [compat.text_type(s, "utf-8") for s in row]


     class UnicodeWriter:

         """
```
File 2 of 3

```diff
@@ -4,9 +4,10 @@
 from numpy.lib.format import read_array, write_array
 from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3
 from pandas.types.common import is_datetime64_dtype, _NS_DTYPE
+from pandas.io.common import _get_handle, _infer_compression


-def to_pickle(obj, path):
+def to_pickle(obj, path, compression='infer'):
     """
     Pickle (serialize) object to input file path
```
```diff
@@ -15,12 +16,23 @@ def to_pickle(obj, path):
     obj : any object
     path : string
         File path
+    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
+        a string representing the compression to use in the output file
+
+        .. versionadded:: 0.20.0
     """
-    with open(path, 'wb') as f:
+    inferred_compression = _infer_compression(path, compression)
+    f, fh = _get_handle(path, 'wb',
+                        compression=inferred_compression,
+                        is_text=False)
+    try:
         pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
+    finally:
+        for _f in fh:
+            _f.close()
```

> **Review comment** (on the `compression` docstring)**:** add a versionadded
>
> **Review comment:** I think we should close the file handles in [...]
>
> **Reply:** and the same below in the read function
>
> **Reply:** do you mean [...]? I don't know if it is safe to close all handles in this way
>
> **Reply:** yes, something like that. In principle, all handles in [...]
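The closing-handles thread above suggests centralizing cleanup rather than repeating the `try/finally` at each call site. A hypothetical wrapper illustrating that idea (not part of this PR):

```python
from contextlib import contextmanager

from pandas.io.common import _get_handle

@contextmanager
def handle_ctx(path, mode, **kwargs):
    # hypothetical: yield the outermost handle, then close everything
    # _get_handle opened, mirroring the finally blocks in this diff
    f, handles = _get_handle(path, mode, **kwargs)
    try:
        yield f
    finally:
        for h in handles:
            h.close()

# usage sketch:
# with handle_ctx(path, 'wb', compression='gzip', is_text=False) as f:
#     pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
```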
```diff
-def read_pickle(path):
+def read_pickle(path, compression='infer'):
     """
     Load pickled pandas object (or any other pickled object) from the specified
     file path
@@ -32,12 +44,21 @@ def read_pickle(path):
     ----------
     path : string
         File path
+    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer', then use
+        gzip, bz2 or xz if path is a string ending in '.gz', '.bz2', or 'xz',
+        respectively, and no decompression otherwise.
+        Set to None for no decompression.
+
+        .. versionadded:: 0.20.0

     Returns
     -------
     unpickled : type of object stored in file
     """

+    inferred_compression = _infer_compression(path, compression)

     def try_read(path, encoding=None):
         # try with cPickle
         # try with current pickle, if we have a Type Error then
```

> **Review comment** (on `path : string`)**:** It shouldn't be too hard to get [...]
>
> **Reply:** the original API doesn't support URL reading; this feature can be added in the future.
>
> **Reply:** Would be a nice enhancement (and welcome to work on this!), but let's leave that for another PR
>
> **Reply:** It would fit well with a pull request to accomplish #15008.
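Taken together, the public API then works like this (a usage sketch assuming the PR as written; the tests further down call `to_pickle` with the same keyword):

```python
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})

df.to_pickle('frame.pkl.gz')              # 'infer' picks gzip from the extension
same = pd.read_pickle('frame.pkl.gz')     # decompressed transparently on read
assert same.equals(df)

df.to_pickle('frame.pkl', compression='bz2')           # explicit, extension ignored
same = pd.read_pickle('frame.pkl', compression='bz2')
```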
```diff
@@ -48,26 +69,43 @@ def try_read(path, encoding=None):
         # cpickle
         # GH 6899
```

> **Review comment:** so these routines need to be changed to read in the file once, e.g. [...] then the operations are all [...] etc. IOW, all we do is seek to the beginning of the buffer each time, rather than read the file in (potentially) 4 times.
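The code blocks inside that comment did not survive extraction; below is a sketch of the pattern being described, reusing the names from this diff: read and decompress the file exactly once into an in-memory buffer, then rewind between fallback attempts.

```python
from io import BytesIO

from pandas.compat import cPickle as pkl, pickle_compat as pc
from pandas.io.common import _get_handle

def try_read(path, inferred_compression, encoding=None):
    f, fh = _get_handle(path, 'rb',
                        compression=inferred_compression,
                        is_text=False)
    try:
        buf = BytesIO(f.read())          # read/decompress exactly once
    finally:
        for _f in fh:
            _f.close()
    try:
        return pkl.load(buf)
    except Exception:
        try:
            buf.seek(0)                  # rewind instead of reopening
            return pc.load(buf, encoding=encoding, compat=False)
        except Exception:
            buf.seek(0)
            return pc.load(buf, encoding=encoding, compat=True)
```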
```diff
         try:
-            with open(path, 'rb') as fh:
-                return pkl.load(fh)
+            f, fh = _get_handle(path, 'rb',
+                                compression=inferred_compression,
+                                is_text=False)
+            try:
+                return pkl.load(f)
+            finally:
+                for _f in fh:
+                    _f.close()
         except Exception:
             # reg/patched pickle
             try:
-                with open(path, 'rb') as fh:
-                    return pc.load(fh, encoding=encoding, compat=False)
+                f, fh = _get_handle(path, 'rb',
+                                    compression=inferred_compression,
+                                    is_text=False)
+                try:
+                    return pc.load(f, encoding=encoding, compat=False)
+                finally:
+                    for _f in fh:
+                        _f.close()
             # compat pickle
             except:
-                with open(path, 'rb') as fh:
-                    return pc.load(fh, encoding=encoding, compat=True)
+                f, fh = _get_handle(path, 'rb',
+                                    compression=inferred_compression,
+                                    is_text=False)
+                try:
+                    return pc.load(f, encoding=encoding, compat=True)
+                finally:
+                    for _f in fh:
+                        _f.close()
     try:
         return try_read(path)
     except:
         if PY3:
             return try_read(path, encoding='latin1')
         raise


 # compat with sparse pickle / unpickle
```
File 3 of 3

```diff
@@ -284,6 +284,46 @@ def test_pickle_v0_15_2(self):
         #
         tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path))

+    def compression_explicit(self, compression):
+        # issue 11666
+        if compression == 'xz':
+            tm._skip_if_no_lzma()
+        with tm.ensure_clean(self.path) as path:
+            df = tm.makeDataFrame()
+            df.to_pickle(path, compression=compression)
+            df2 = pd.read_pickle(path, compression=compression)
+            tm.assert_frame_equal(df, df2)
```

> **Review comment** (on `compression_explicit`)**:** add these in another class, TestPickleCompression
>
> **Review comment:** can you add the issue reference as a comment
>
> **Review comment:** I like these roundtrip pickle tests, but I'm worried about a failure scenario where both [...]
>
> **Reply:** Yes, that will be more persuasive and reliable. One way is to add some prepared .pickle.xz file into pandas/io/tests/data, but reading these files may fail as the pickle version varies. Using the command line may be better but involves more environment configuration. I don't see a clear and clean way to do the test yet. Any suggestions?
>
> **Reply:** I was only suggesting using your local command line to "add some prepared .pickle.xz file into pandas/io/tests/data". What about using a pickle with protocol version 2?
>
> **Reply:** 🆗
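One way to address the symmetric-failure worry above (a sketch, not part of the PR; the file name is made up): generate a fixture once with the stdlib, check it in, and then test `read_pickle` without going through `to_pickle` at all.

```python
import gzip
import pickle

import pandas as pd

# one-off script run locally to create a checked-in fixture; protocol 2
# keeps the file readable across Python versions, as suggested above
df = pd.DataFrame({'a': [1, 2, 3]})
with gzip.open('pandas/io/tests/data/frame.pkl.gz', 'wb') as f:
    pickle.dump(df, f, protocol=2)

# the test then only exercises the read path
result = pd.read_pickle('pandas/io/tests/data/frame.pkl.gz')
```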
```diff
+    def test_compression_explicit(self):
+        compressions = [None, 'gzip', 'bz2', 'xz']
+        for c in compressions:
+            yield self.compression_explicit, c
```

> **Review comment** (on the compression list)**:** Do you also want to test/support single-file [...]
>
> **Reply:** Not this version, maybe the next. Actually, in most cases I don't think it's a good design to compress one pickle file into [...]
>
> **Reply:** Agreed, but `_get_handle` supports reading the single-file zip... so you're going to have this feature anyway. I personally would not use zip to compress a single file, but have received datasets that use this method, and so it's handy to be able to read them. [...] So I'm not terribly concerned if you don't test for reading the zipped pickle, but since it will work, I think people may use it. I will open an issue and tag you about moving to a more uniform compression API in the longer term.
>
> **Reply:** agreed; generous on input, strict on output
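For reference, the single-member zip case discussed above can be read with the stdlib alone (an illustrative sketch, independent of this PR):

```python
import pickle
import zipfile

# a zip archive assumed to hold exactly one pickled payload
with zipfile.ZipFile('frame.zip') as zf:
    member = zf.namelist()[0]             # the single member
    obj = pickle.loads(zf.read(member))
```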
```diff
+    def compression_explicit_bad(self, compression):
+        with tm.assertRaisesRegexp(ValueError,
+                                   "Unrecognized compression type"):
+            with tm.ensure_clean(self.path) as path:
+                df = tm.makeDataFrame()
+                df.to_pickle(path, compression=compression)
+
+    def test_compression_explicit_bad(self):
+        compressions = ['', 'None', 'bad', '7z']
+        for c in compressions:
+            yield self.compression_explicit_bad, c
```

> **Review comment** (on `compression_explicit_bad`)**:** I'm not sure it makes sense to test for invalid [...]
```diff
+    def compression_infer(self, ext):
+        if ext == '.xz':
+            tm._skip_if_no_lzma()
+        with tm.ensure_clean(self.path + ext) as path:
+            df = tm.makeDataFrame()
+            df.to_pickle(path)
+            tm.assert_frame_equal(df, pd.read_pickle(path))
+
+    def test_compression_infer(self):
+        extensions = ['', '.gz', '.bz2', '.xz', '.who_am_i']
+        for ext in extensions:
+            yield self.compression_infer, ext
```
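The inference these tests exercise follows the docstring rule above ('.gz' → gzip, '.bz2' → bz2, '.xz' → xz, anything else → no compression). A sketch of the expected mapping, using the internal helper named in this diff:

```python
from pandas.io.common import _infer_compression

_infer_compression('df.pkl.gz', 'infer')        # -> 'gzip'
_infer_compression('df.pkl.bz2', 'infer')       # -> 'bz2'
_infer_compression('df.pkl.xz', 'infer')        # -> 'xz'
_infer_compression('df.pkl.who_am_i', 'infer')  # -> None: unknown extension
_infer_compression('df.pkl.gz', None)           # -> None: explicitly disabled
```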
```diff


 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
```

> **Review comment:** need a versionadded tag