Skip to content

Commit 539de79

Browse files
jbschiratti and jreback
authored and committed
ENH: Adding 'protocol' parameter to 'to_pickle'.
This PR aims at adding an optional `protocol` parameter to the function `to_pickle`. Closes #14488. If needed, I can update the corresponding test (`pandas/tests/io/test_pickle.py`). Author: Jean-Baptiste Schiratti <[email protected]> Closes #16252 from jbschiratti/pickle_protocol and squashes the following commits: 8eb660d [Jean-Baptiste Schiratti] Minor change on whatsnew. 20a854d [Jean-Baptiste Schiratti] Added ref for protocol parameter + edited whatsnew. 14bc485 [Jean-Baptiste Schiratti] Fix : removed unused import. 7631146 [Jean-Baptiste Schiratti] Fix : added issue number. 460ca0c [Jean-Baptiste Schiratti] Shortened paragraph addded in 'whatsnew'. 352220b [Jean-Baptiste Schiratti] Fix : Fixed error message in 'test_read_bad_versions'. 9c9d38f [Jean-Baptiste Schiratti] Added enhancement to 'whatsnew' file. 35f8d18 [Jean-Baptiste Schiratti] Added tests for new 'protocol' parameter in 'to_pickle'. 4bf0386 [Jean-Baptiste Schiratti] Added docstring for negative protocol parameter. 04bc5c2 [Jean-Baptiste Schiratti] Added 'versionadded' tag, improved docstring + fixed import. 66a35e8 [Jean-Baptiste Schiratti] Added 'protocol' parameter to 'to_pickle'.
1 parent 34ebad8 commit 539de79

File tree

4 files changed

+72
-6
lines changed

4 files changed

+72
-6
lines changed

doc/source/whatsnew/v0.21.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@ Other Enhancements
2929
- ``Series.to_dict()`` and ``DataFrame.to_dict()`` now support an ``into`` keyword which allows you to specify the ``collections.Mapping`` subclass that you would like returned. The default is ``dict``, which is backwards compatible. (:issue:`16122`)
3030
- ``RangeIndex.append`` now returns a ``RangeIndex`` object when possible (:issue:`16212`)
3131

32-
32+
- :func:`to_pickle` has gained a protocol parameter (:issue:`16252`). By default,
33+
this parameter is set to `HIGHEST_PROTOCOL <https://docs.python.org/3/library/pickle.html#data-stream-format>`__.
3334

3435
.. _whatsnew_0210.api_breaking:
3536

pandas/core/generic.py

+17-3
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
from pandas import compat
5151
from pandas.compat.numpy import function as nv
5252
from pandas.compat import (map, zip, lzip, lrange, string_types,
53-
isidentifier, set_function_name)
53+
isidentifier, set_function_name, cPickle as pkl)
5454
import pandas.core.nanops as nanops
5555
from pandas.util._decorators import Appender, Substitution, deprecate_kwarg
5656
from pandas.util._validators import validate_bool_kwarg
@@ -1350,7 +1350,8 @@ def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail',
13501350
if_exists=if_exists, index=index, index_label=index_label,
13511351
chunksize=chunksize, dtype=dtype)
13521352

1353-
def to_pickle(self, path, compression='infer'):
1353+
def to_pickle(self, path, compression='infer',
1354+
protocol=pkl.HIGHEST_PROTOCOL):
13541355
"""
13551356
Pickle (serialize) object to input file path.
13561357
@@ -1362,9 +1363,22 @@ def to_pickle(self, path, compression='infer'):
13621363
a string representing the compression to use in the output file
13631364
13641365
.. versionadded:: 0.20.0
1366+
protocol : int
1367+
Int which indicates which protocol should be used by the pickler,
1368+
default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
1369+
values for this parameter depend on the version of Python. For
1370+
Python 2.x, possible values are 0, 1, 2. For Python >= 3.0, 3 is a
1371+
valid value. For Python >= 3.4, 4 is a valid value. A negative value
1372+
for the protocol parameter is equivalent to setting its value to
1373+
HIGHEST_PROTOCOL.
1374+
1375+
.. [1] https://docs.python.org/3/library/pickle.html
1376+
.. versionadded:: 0.21.0
1377+
13651378
"""
13661379
from pandas.io.pickle import to_pickle
1367-
return to_pickle(self, path, compression=compression)
1380+
return to_pickle(self, path, compression=compression,
1381+
protocol=protocol)
13681382

13691383
def to_clipboard(self, excel=None, sep=None, **kwargs):
13701384
"""

pandas/io/pickle.py

+17-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from pandas.io.common import _get_handle, _infer_compression
88

99

10-
def to_pickle(obj, path, compression='infer'):
10+
def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
1111
"""
1212
Pickle (serialize) object to input file path
1313
@@ -20,13 +20,28 @@ def to_pickle(obj, path, compression='infer'):
2020
a string representing the compression to use in the output file
2121
2222
.. versionadded:: 0.20.0
23+
protocol : int
24+
Int which indicates which protocol should be used by the pickler,
25+
default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
26+
values for this parameter depend on the version of Python. For Python
27+
2.x, possible values are 0, 1, 2. For Python >= 3.0, 3 is a valid value.
28+
For Python >= 3.4, 4 is a valid value. A negative value for the
29+
protocol parameter is equivalent to setting its value to
30+
HIGHEST_PROTOCOL.
31+
32+
.. [1] https://docs.python.org/3/library/pickle.html
33+
.. versionadded:: 0.21.0
34+
35+
2336
"""
2437
inferred_compression = _infer_compression(path, compression)
2538
f, fh = _get_handle(path, 'wb',
2639
compression=inferred_compression,
2740
is_text=False)
41+
if protocol < 0:
42+
protocol = pkl.HIGHEST_PROTOCOL
2843
try:
29-
pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
44+
pkl.dump(obj, f, protocol=protocol)
3045
finally:
3146
for _f in fh:
3247
_f.close()

pandas/tests/io/test_pickle.py

+36
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import pandas.util.testing as tm
2626
from pandas.tseries.offsets import Day, MonthEnd
2727
import shutil
28+
import sys
2829

2930

3031
@pytest.fixture(scope='module')
@@ -501,3 +502,38 @@ def test_read_infer(self, ext, get_random_path):
501502
df2 = pd.read_pickle(p2)
502503

503504
tm.assert_frame_equal(df, df2)
505+
506+
507+
# ---------------------
508+
# test pickle protocol
509+
# ---------------------
510+
511+
class TestProtocol(object):
512+
513+
@pytest.mark.parametrize('protocol', [-1, 0, 1, 2])
514+
def test_read(self, protocol, get_random_path):
515+
with tm.ensure_clean(get_random_path) as path:
516+
df = tm.makeDataFrame()
517+
df.to_pickle(path, protocol=protocol)
518+
df2 = pd.read_pickle(path)
519+
tm.assert_frame_equal(df, df2)
520+
521+
@pytest.mark.parametrize('protocol', [3, 4])
522+
@pytest.mark.skipif(sys.version_info[:2] >= (3, 4),
523+
reason="Testing invalid parameters for "
524+
"Python 2.x and 3.y (y < 4).")
525+
def test_read_bad_versions(self, protocol, get_random_path):
526+
# For Python 2.x (respectively 3.y with y < 4), [expected]
527+
# HIGHEST_PROTOCOL should be 2 (respectively 3). Hence, the protocol
528+
# parameter should not exceed 2 (respectively 3).
529+
if sys.version_info[:2] < (3, 0):
530+
expect_hp = 2
531+
else:
532+
expect_hp = 3
533+
with tm.assert_raises_regex(ValueError,
534+
"pickle protocol %d asked for; the highest"
535+
" available protocol is %d" % (protocol,
536+
expect_hp)):
537+
with tm.ensure_clean(get_random_path) as path:
538+
df = tm.makeDataFrame()
539+
df.to_pickle(path, protocol=protocol)

0 commit comments

Comments
 (0)