Skip to content

ENH: Adding 'protocol' parameter to 'to_pickle'. #16252

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 11 commits into from
22 changes: 22 additions & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,28 @@ New features
Other Enhancements
^^^^^^^^^^^^^^^^^^

Pickle file I/O protocol parameter
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need this section. Can just add a single line with the issue reference.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I changed the 'whatsnew' file accordingly.

^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

:func:`to_pickle` now allows specifying the protocol used by the Pickler.
The 'protocol' parameter defaults to HIGHEST_PROTOCOL. For Python 2.x, HIGHEST_PROTOCOL is 2.
Since Python 3.0 (respectively 3.4), HIGHEST_PROTOCOL is 3 (respectively 4).
A negative value for the protocol parameter is equivalent to setting its value to HIGHEST_PROTOCOL.

.. ipython:: python

df = pd.DataFrame({
'A': np.random.randn(1000),
'B': 'foo',
'C': pd.date_range('20130101', periods=1000, freq='s')})

Using an explicit protocol parameter

.. ipython:: python

df.to_pickle("data.pkl", protocol=2)
rt = pd.read_pickle("data.pkl")
rt


.. _whatsnew_0210.api_breaking:
Expand Down
19 changes: 16 additions & 3 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
from pandas import compat
from pandas.compat.numpy import function as nv
from pandas.compat import (map, zip, lzip, lrange, string_types,
isidentifier, set_function_name)
isidentifier, set_function_name, cPickle as pkl)
import pandas.core.nanops as nanops
from pandas.util.decorators import Appender, Substitution, deprecate_kwarg
from pandas.util.validators import validate_bool_kwarg
Expand Down Expand Up @@ -1344,7 +1344,8 @@ def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail',
if_exists=if_exists, index=index, index_label=index_label,
chunksize=chunksize, dtype=dtype)

def to_pickle(self, path, compression='infer'):
def to_pickle(self, path, compression='infer',
protocol=pkl.HIGHEST_PROTOCOL):
"""
Pickle (serialize) object to input file path.

Expand All @@ -1356,9 +1357,21 @@ def to_pickle(self, path, compression='infer'):
a string representing the compression to use in the output file

.. versionadded:: 0.20.0
protocol : int
Int which indicates which protocol should be used by the pickler,
default HIGHEST_PROTOCOL (Pickle module). The possible values for
this parameter depend on the version of Python. For Python 2.x,
possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
For Python >= 3.4, 4 is a valid value. A negative value for the
protocol parameter is equivalent to setting its value to
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a link to the pickle docs where this is defined.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done, I added the url.

HIGHEST_PROTOCOL.

.. versionadded:: 0.21.0

"""
from pandas.io.pickle import to_pickle
return to_pickle(self, path, compression=compression)
return to_pickle(self, path, compression=compression,
protocol=protocol)

def to_clipboard(self, excel=None, sep=None, **kwargs):
"""
Expand Down
18 changes: 16 additions & 2 deletions pandas/io/pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pandas.io.common import _get_handle, _infer_compression


def to_pickle(obj, path, compression='infer'):
def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
"""
Pickle (serialize) object to input file path

Expand All @@ -20,13 +20,27 @@ def to_pickle(obj, path, compression='infer'):
a string representing the compression to use in the output file

.. versionadded:: 0.20.0
protocol : int
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same

Int which indicates which protocol should be used by the pickler,
default HIGHEST_PROTOCOL (Pickle module). The possible values for
this parameter depend on the version of Python. For Python 2.x,
possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same

For Python >= 3.4, 4 is a valid value. A negative value for the
protocol parameter is equivalent to setting its value to
HIGHEST_PROTOCOL.

.. versionadded:: 0.21.0


"""
inferred_compression = _infer_compression(path, compression)
f, fh = _get_handle(path, 'wb',
compression=inferred_compression,
is_text=False)
if protocol < 0:
protocol = pkl.HIGHEST_PROTOCOL
try:
pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
pkl.dump(obj, f, protocol=protocol)
finally:
for _f in fh:
_f.close()
Expand Down
38 changes: 37 additions & 1 deletion pandas/tests/io/test_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,12 @@
from distutils.version import LooseVersion
import pandas as pd
from pandas import Index
from pandas.compat import is_platform_little_endian
from pandas.compat import is_platform_little_endian, cPickle as pkl
import pandas
import pandas.util.testing as tm
from pandas.tseries.offsets import Day, MonthEnd
import shutil
import sys


@pytest.fixture(scope='module')
Expand Down Expand Up @@ -489,3 +490,38 @@ def test_read_infer(self, ext, get_random_path):
df2 = pd.read_pickle(p2)

tm.assert_frame_equal(df, df2)


# ---------------------
# test pickle protocol
# ---------------------

class TestProtocol(object):
    """Tests for the ``protocol`` argument of ``DataFrame.to_pickle``."""

    @pytest.mark.parametrize('protocol', [-1, 0, 1, 2])
    def test_read(self, protocol, get_random_path):
        """Write a frame with ``protocol`` and check it reads back equal."""
        with tm.ensure_clean(get_random_path) as path:
            expected = tm.makeDataFrame()
            expected.to_pickle(path, protocol=protocol)
            result = pd.read_pickle(path)
            tm.assert_frame_equal(expected, result)

    @pytest.mark.parametrize('protocol', [3, 4])
    @pytest.mark.skipif(sys.version_info[:2] >= (3, 4),
                        reason="Testing invalid parameters for "
                               "Python 2.x and 3.y (y < 4).")
    def test_read_bad_versions(self, protocol, get_random_path):
        """Check that ``to_pickle`` raises ValueError for these protocols.

        Only runs on interpreters older than Python 3.4 (see ``skipif``).
        """
        # HIGHEST_PROTOCOL is expected to be 2 on Python 2.x and 3 on
        # Python 3.y with y < 4, so the protocol parameter should not
        # exceed that value.
        highest = 2 if sys.version_info[:2] < (3, 0) else 3
        pattern = ("pickle protocol %d asked for; the highest"
                   " available protocol is %d" % (protocol, highest))
        with tm.assert_raises_regex(ValueError, pattern):
            with tm.ensure_clean(get_random_path) as path:
                frame = tm.makeDataFrame()
                frame.to_pickle(path, protocol=protocol)