
add compression support for 'read_pickle' and 'to_pickle' #13317


Closed
wants to merge 11 commits
8 changes: 6 additions & 2 deletions pandas/core/generic.py
@@ -1256,17 +1256,21 @@ def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail',
if_exists=if_exists, index=index, index_label=index_label,
chunksize=chunksize, dtype=dtype)

def to_pickle(self, path):
def to_pickle(self, path, compression='infer'):
"""
Pickle (serialize) object to input file path.

Parameters
----------
path : string
File path
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
Contributor

need a versionadded tag

a string representing the compression to use in the output file

.. versionadded:: 0.20.0
"""
from pandas.io.pickle import to_pickle
return to_pickle(self, path)
return to_pickle(self, path, compression=compression)

def to_clipboard(self, excel=None, sep=None, **kwargs):
"""
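For context, a minimal usage sketch of the API this diff adds (file names are illustrative; behaviour follows the docstrings in this PR, with compression='infer' as the default):

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})

# explicit compression
df.to_pickle('frame.pkl.gz', compression='gzip')
df2 = pd.read_pickle('frame.pkl.gz', compression='gzip')

# compression='infer' (the default) picks gzip/bz2/xz from the file extension
df.to_pickle('frame.pkl.bz2')
df3 = pd.read_pickle('frame.pkl.bz2')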
28 changes: 20 additions & 8 deletions pandas/io/common.py
@@ -14,6 +14,7 @@

try:
from s3fs import S3File

Contributor

pls turn off whitespace changes, they will not pass linting

need_text_wrapping = (BytesIO, S3File)
except ImportError:
need_text_wrapping = (BytesIO,)
@@ -28,20 +29,21 @@

try:
import pathlib

_PATHLIB_INSTALLED = True
except ImportError:
_PATHLIB_INSTALLED = False


try:
from py.path import local as LocalPath

_PY_PATH_INSTALLED = True
except:
_PY_PATH_INSTALLED = False


if compat.PY3:
from urllib.request import urlopen, pathname2url

_urlopen = urlopen
from urllib.parse import urlparse as parse_url
from urllib.parse import (uses_relative, uses_netloc, uses_params,
@@ -58,13 +60,13 @@
from contextlib import contextmanager, closing # noqa
from functools import wraps # noqa


# @wraps(_urlopen)
@contextmanager
def urlopen(*args, **kwargs):
with closing(_urlopen(*args, **kwargs)) as f:
yield f


_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard('')

@@ -75,6 +77,7 @@ class ParserError(ValueError):
"""
pass


# gh-12665: Alias for now and remove later.
CParserError = ParserError

@@ -109,12 +112,14 @@ class BaseIterator(object):
"""Subclass this and provide a "__next__()" method to obtain an iterator.
Useful only when the object being iterated is non-reusable (e.g. OK for a
parser, not for an in-memory table, yes for its iterator)."""

def __iter__(self):
return self

def __next__(self):
raise AbstractMethodError(self)


if not compat.PY3:
BaseIterator.next = lambda self: self.__next__()

@@ -302,7 +307,7 @@ def _infer_compression(filepath_or_buffer, compression):


def _get_handle(path_or_buf, mode, encoding=None, compression=None,
memory_map=False):
memory_map=False, is_text=True):
"""
Get file handle for given path/buffer and mode.

@@ -317,7 +322,9 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
Supported compression protocols are gzip, bz2, zip, and xz
memory_map : boolean, default False
See parsers._parser_params for more information.

is_text : boolean, default True
whether file/buffer is in text format (csv, json, etc.), or in binary
mode (pickle, etc.)
Returns
-------
f : file-like
@@ -391,13 +398,17 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
elif encoding:
# Python 3 and encoding
f = open(path_or_buf, mode, encoding=encoding)
else:
elif is_text:
# Python 3 and no explicit encoding
f = open(path_or_buf, mode, errors='replace')
else:
# Python 3 and binary mode
f = open(path_or_buf, mode)
handles.append(f)

# in Python 3, convert BytesIO or fileobjects passed with an encoding
if compat.PY3 and (compression or isinstance(f, need_text_wrapping)):
if compat.PY3 and is_text and\
Contributor

Is there an opportunity to simplify things? What is the relationship between need_text_wrapping, compression, and is_text? I will think about this, but currently the logic for when TextIOWrapper gets applied is confusing.

Contributor Author

_get_handle needs to deal with various situations:

  • py2 or py3
  • binary (pickle, msgpack) or text (csv)
  • if text, what's the encoding
  • compression
  • memory map
  • open for read or write

maybe we can split _get_handle into two or more functions to make each single function simpler?

Contributor

I opened #15008: would love it if you could migrate your above comment to that issue. This way we can keep this pull request focussed and minimalist.

(compression or isinstance(f, need_text_wrapping)):
from io import TextIOWrapper
f = TextIOWrapper(f, encoding=encoding)
handles.append(f)
@@ -454,7 +465,6 @@ def __next__(self):


class UTF8Recoder(BaseIterator):

"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
@@ -477,6 +487,7 @@ def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
# ignore encoding
return csv.reader(f, dialect=dialect, **kwds)


def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
return csv.writer(f, dialect=dialect, **kwds)
else:
@@ -498,6 +509,7 @@ def __next__(self):
row = next(self.reader)
return [compat.text_type(s, "utf-8") for s in row]


class UnicodeWriter:

"""
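To make the thread above about need_text_wrapping, compression, and is_text easier to follow, here is a rough sketch of the wrapping condition this diff introduces (my reading of the change; needs_text_wrapper is a hypothetical name, not part of the PR):

def needs_text_wrapper(is_py3, is_text, compression, handle, need_text_wrapping):
    # On Python 3, wrap the raw byte handle in TextIOWrapper only when the caller
    # asked for text and either a compression wrapper is involved or the handle is
    # a bytes-only object (BytesIO, S3File).
    return bool(is_py3 and is_text and
                (compression or isinstance(handle, need_text_wrapping)))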
60 changes: 49 additions & 11 deletions pandas/io/pickle.py
@@ -4,9 +4,10 @@
from numpy.lib.format import read_array, write_array
from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3
from pandas.types.common import is_datetime64_dtype, _NS_DTYPE
from pandas.io.common import _get_handle, _infer_compression


def to_pickle(obj, path):
def to_pickle(obj, path, compression='infer'):
"""
Pickle (serialize) object to input file path

@@ -15,12 +16,23 @@ def to_pickle(obj, path):
obj : any object
path : string
File path
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
Contributor

add a versionadded

a string representing the compression to use in the output file

.. versionadded:: 0.20.0
"""
with open(path, 'wb') as f:
inferred_compression = _infer_compression(path, compression)
f, fh = _get_handle(path, 'wb',
compression=inferred_compression,
is_text=False)
try:
pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
Member

I think we should close the file handles in fh explicitly, instead of closing f by using it in a with statement

Member

and the same below in the read function

Contributor Author

do you mean

for h in fh:
    h.close()

I don't know if it is safe to close all handles in this way

Member

yes, something like that. In principle, all handles in fh are opened by pandas itself (not passed by the user) and should be able to be closed.

finally:
for _f in fh:
_f.close()


def read_pickle(path):
def read_pickle(path, compression='infer'):
"""
Load pickled pandas object (or any other pickled object) from the specified
file path
@@ -32,12 +44,21 @@ def read_pickle(path):
----------
path : string
Contributor

It shouldn't be too hard to get read_pickle to also support reading from URLs. @goldenbull -- not sure if you are also interested in adding this feature. Ultimately all read methods should support compression and URL reading.

Contributor Author

the original API doesn't support URL reading; this feature can be added in the future.

Member

Would be a nice enhancement (and you're welcome to work on this!), but let's leave that for another PR

Contributor

> let's leave that for another PR

It would fit well with a pull request to accomplish #15008.

File path
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use
gzip, bz2 or xz if path is a string ending in '.gz', '.bz2', or '.xz',
respectively, and no decompression otherwise.
Set to None for no decompression.

.. versionadded:: 0.20.0

Returns
-------
unpickled : type of object stored in file
"""

inferred_compression = _infer_compression(path, compression)

def try_read(path, encoding=None):
# try with cPickle
# try with current pickle, if we have a Type Error then
@@ -48,26 +69,43 @@ def try_read(path, encoding=None):
# cpickle
# GH 6899
Contributor

so these routines need to be changed to read in the file once e.g.

f, fh = _get_handle(....)
try:
    buffer = BytesIO(f.read())
finally:
    for _f in fh:
        _f.close()

then the operations are all

try:
    buffer.seek(0)
    pc.load(buffer, ...)
except:
    ...

etc. IOW, all we do is seek to the beginning of the buffer each time, rather than read the file in (potentially) 4 times.

try:
with open(path, 'rb') as fh:
return pkl.load(fh)
f, fh = _get_handle(path, 'rb',
compression=inferred_compression,
is_text=False)
try:
return pkl.load(f)
finally:
for _f in fh:
_f.close()
except Exception:
# reg/patched pickle
try:
with open(path, 'rb') as fh:
return pc.load(fh, encoding=encoding, compat=False)

f, fh = _get_handle(path, 'rb',
compression=inferred_compression,
is_text=False)
try:
return pc.load(f, encoding=encoding, compat=False)
finally:
for _f in fh:
_f.close()
# compat pickle
except:
with open(path, 'rb') as fh:
return pc.load(fh, encoding=encoding, compat=True)

f, fh = _get_handle(path, 'rb',
compression=inferred_compression,
is_text=False)
try:
return pc.load(f, encoding=encoding, compat=True)
finally:
for _f in fh:
_f.close()
try:
return try_read(path)
except:
if PY3:
return try_read(path, encoding='latin1')
raise


# compat with sparse pickle / unpickle


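As a follow-up to the review thread above, a minimal sketch of the suggested read-once pattern (a hypothetical helper, not part of this diff; it assumes the BytesIO and _get_handle imports already used in this module): the file is read and decompressed a single time, and each fallback unpickling attempt just rewinds the in-memory buffer.

from pandas.compat import BytesIO
from pandas.io.common import _get_handle

def _read_bytes_once(path, inferred_compression):
    # open, read, and close immediately so later fallbacks never re-open the file
    f, fh = _get_handle(path, 'rb',
                        compression=inferred_compression,
                        is_text=False)
    try:
        return BytesIO(f.read())
    finally:
        for _f in fh:
            _f.close()

# each attempt then rewinds the same buffer instead of re-reading the file, e.g.
#     buf = _read_bytes_once(path, inferred_compression)
#     buf.seek(0); pkl.load(buf)
#     buf.seek(0); pc.load(buf, encoding=encoding, compat=False)
#     buf.seek(0); pc.load(buf, encoding=encoding, compat=True)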
40 changes: 40 additions & 0 deletions pandas/io/tests/test_pickle.py
@@ -284,6 +284,46 @@ def test_pickle_v0_15_2(self):
#
tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path))

def compression_explicit(self, compression):
Contributor

add these in another class, TestPickleCompression

# issue 11666
if compression == 'xz':
tm._skip_if_no_lzma()
with tm.ensure_clean(self.path) as path:
Contributor

can you add the issue reference as a comment

df = tm.makeDataFrame()
df.to_pickle(path, compression=compression)
df2 = pd.read_pickle(path, compression=compression)
Contributor

I like these roundtrip pickle tests, but I'm worried about a failure scenario where both to_pickle and read_pickle are broken in the same way. Therefore, I think it makes sense to also test reading of pickles which were compressed externally. I.e. use the command line to compress a pickled file, track in repo, and use these files to test read_pickle.

Contributor Author

Yes, that will be more persuasive and reliable. One way is to add some prepared .pickle.xz file into pandas/io/tests/data, but reading these files may fail as the pickle version varies. Using the command line may be better but involves more environment configurations. I don't see a clear and clean way to do the test yet. Any suggestions?

Contributor

> Using the command line may be better but involves more environment configurations.

I was only suggesting using your local command line "to add some prepared .pickle.xz file into pandas/io/tests/data".

> reading these files may fail as the pickle version varies.

What about using a pickle with protocol version 2? pickle.load should automatically detect the version 2, which is supported in 2.7 and 3.

Contributor Author

🆗

tm.assert_frame_equal(df, df2)

def test_compression_explicit(self):
compressions = [None, 'gzip', 'bz2', 'xz']
Contributor

Do you also want to test/support single-file zip compression?

Contributor Author

Not this version, maybe next version. Actually, in most cases I don't think it's a good design to compress one pickle file into a zip file and then enforce that there is only one file in that zip. Maybe it's useful to compress multiple pickle files into one zip archive, but that's another story.

Contributor

> Actually, in most cases I don't think it's a good design to compress one pickle file into a zip file and then enforce that there is only one file in that zip.

Agreed, but _get_handle supports reading the single-file zip... so you're going to have this feature anyway. I personally would not use zip to compress a single file, but I have received datasets that use this method, so it's handy to be able to read them.

Currently, to_csv supports only ‘gzip’, ‘bz2’, ‘xz’ compression. I like this approach -- support reading, but do not make it easy to write.

So I'm not terribly concerned if you don't test for reading the zipped pickle, but since it will work, I think people may use it.

I will open an issue and tag you about moving to a more uniform compression API in the longer term.

Contributor Author

agreed, generous on input, strict on output

for c in compressions:
yield self.compression_explicit, c

def compression_explicit_bad(self, compression):
Contributor

I'm not sure it makes sense to test for invalid compression values or for compression inference. This is because you use _infer_compression, which gets tested for read_table. It seems repetitive to have every read method test compression inference now that we've consolidated things. Interested whether others agree?

with tm.assertRaisesRegexp(ValueError,
"Unrecognized compression type"):
with tm.ensure_clean(self.path) as path:
df = tm.makeDataFrame()
df.to_pickle(path, compression=compression)

def test_compression_explicit_bad(self):
compressions = ['', 'None', 'bad', '7z']
for c in compressions:
yield self.compression_explicit_bad, c

def compression_infer(self, ext):
if ext == '.xz':
tm._skip_if_no_lzma()
with tm.ensure_clean(self.path + ext) as path:
df = tm.makeDataFrame()
df.to_pickle(path)
tm.assert_frame_equal(df, pd.read_pickle(path))

def test_compression_infer(self):
extensions = ['', '.gz', '.bz2', '.xz', '.who_am_i']
for ext in extensions:
yield self.compression_infer, ext


if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
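Following up on the fixture discussion above, a minimal sketch (hypothetical, not part of this PR) of preparing an externally compressed test file for pandas/io/tests/data; protocol 2 is used because, as noted in the thread, it is readable on both Python 2.7 and 3:

import gzip
import pickle

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})
raw = pickle.dumps(df, protocol=2)  # protocol 2 loads on Python 2.7 and 3
with gzip.open('data.pickle.gz', 'wb') as fh:
    fh.write(raw)  # read back later with pd.read_pickle('data.pickle.gz')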