From 66a35e8a209afda982984899250a41900c9f2186 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Schiratti Date: Thu, 4 May 2017 15:44:52 +0200 Subject: [PATCH 01/11] Added 'protocol' parameter to 'to_pickle'. --- pandas/core/generic.py | 10 ++++++++-- pandas/io/pickle.py | 7 +++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2bc64795b5f20..42ca310619f1b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -50,6 +50,7 @@ from pandas.compat.numpy import function as nv from pandas.compat import (map, zip, lzip, lrange, string_types, isidentifier, set_function_name) +from pandas.compat import cPickle as pkl import pandas.core.nanops as nanops from pandas.util.decorators import Appender, Substitution, deprecate_kwarg from pandas.util.validators import validate_bool_kwarg @@ -1344,7 +1345,8 @@ def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail', if_exists=if_exists, index=index, index_label=index_label, chunksize=chunksize, dtype=dtype) - def to_pickle(self, path, compression='infer'): + def to_pickle(self, path, compression='infer', + protocol=pkl.HIGHEST_PROTOCOL): """ Pickle (serialize) object to input file path. @@ -1354,11 +1356,15 @@ def to_pickle(self, path, compression='infer'): File path compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' a string representing the compression to use in the output file + protocol : int + Int which indicates which protocol should be used by the pickler, + default HIGHEST_PROTOCOL (Pickle module). .. 
versionadded:: 0.20.0 """ from pandas.io.pickle import to_pickle - return to_pickle(self, path, compression=compression) + return to_pickle(self, path, compression=compression, + protocol=protocol) def to_clipboard(self, excel=None, sep=None, **kwargs): """ diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 0f91c407766fb..af90e30c126b8 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -7,7 +7,7 @@ from pandas.io.common import _get_handle, _infer_compression -def to_pickle(obj, path, compression='infer'): +def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): """ Pickle (serialize) object to input file path @@ -18,6 +18,9 @@ def to_pickle(obj, path, compression='infer'): File path compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' a string representing the compression to use in the output file + protocol : int + Int which indicates which protocol should be used by the pickler, + default HIGHEST_PROTOCOL (Pickle module). .. versionadded:: 0.20.0 """ @@ -26,7 +29,7 @@ def to_pickle(obj, path, compression='infer'): compression=inferred_compression, is_text=False) try: - pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL) + pkl.dump(obj, f, protocol=protocol) finally: for _f in fh: _f.close() From 04bc5c280370f8b4b8b35dcc79d759d9950a2c59 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Schiratti Date: Thu, 11 May 2017 19:22:37 +0200 Subject: [PATCH 02/11] Added 'versionadded' tag, improved docstring + fixed import. 
--- pandas/core/generic.py | 13 +++++++++---- pandas/io/pickle.py | 11 +++++++++-- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 42ca310619f1b..cf27f1a64751a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -49,8 +49,7 @@ from pandas import compat from pandas.compat.numpy import function as nv from pandas.compat import (map, zip, lzip, lrange, string_types, - isidentifier, set_function_name) -from pandas.compat import cPickle as pkl + isidentifier, set_function_name, cPickle as pkl) import pandas.core.nanops as nanops from pandas.util.decorators import Appender, Substitution, deprecate_kwarg from pandas.util.validators import validate_bool_kwarg @@ -1356,11 +1355,17 @@ def to_pickle(self, path, compression='infer', File path compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' a string representing the compression to use in the output file + + .. versionadded:: 0.20.0 protocol : int Int which indicates which protocol should be used by the pickler, - default HIGHEST_PROTOCOL (Pickle module). + default HIGHEST_PROTOCOL (Pickle module). The possible values for + this parameter depend on the version of Python. For Python 2.x, + possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value. + For Python >= 3.4, 4 is a valid value. + + .. versionadded:: 0.21.0 - .. versionadded:: 0.20.0 """ from pandas.io.pickle import to_pickle return to_pickle(self, path, compression=compression, diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index af90e30c126b8..65e3c0f68f378 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -18,11 +18,18 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): File path compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' a string representing the compression to use in the output file + + .. 
versionadded:: 0.20.0 protocol : int Int which indicates which protocol should be used by the pickler, - default HIGHEST_PROTOCOL (Pickle module). + default HIGHEST_PROTOCOL (Pickle module). The possible values for + this parameter depend on the version of Python. For Python 2.x, + possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value. + For Python >= 3.4, 4 is a valid value. + + .. versionadded:: 0.21.0 + - .. versionadded:: 0.20.0 """ inferred_compression = _infer_compression(path, compression) f, fh = _get_handle(path, 'wb', From 4bf038670267e9ba71bd5683cbfd970745d83149 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Schiratti Date: Fri, 12 May 2017 15:51:30 +0200 Subject: [PATCH 03/11] Added docstring for negative protocol parameter. --- pandas/core/generic.py | 4 +++- pandas/io/pickle.py | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index cf27f1a64751a..1364f1ffb5a22 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1362,7 +1362,9 @@ def to_pickle(self, path, compression='infer', default HIGHEST_PROTOCOL (Pickle module). The possible values for this parameter depend on the version of Python. For Python 2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value. - For Python >= 3.4, 4 is a valid value. + For Python >= 3.4, 4 is a valid value.A negative value for the + protocol parameter is equivalent to setting its value to + HIGHEST_PROTOCOL. .. versionadded:: 0.21.0 diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 65e3c0f68f378..115368f9f67f8 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -25,7 +25,9 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): default HIGHEST_PROTOCOL (Pickle module). The possible values for this parameter depend on the version of Python. For Python 2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value. - For Python >= 3.4, 4 is a valid value. 
+ For Python >= 3.4, 4 is a valid value. A negative value for the + protocol parameter is equivalent to setting its value to + HIGHEST_PROTOCOL. .. versionadded:: 0.21.0 @@ -35,6 +37,8 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): f, fh = _get_handle(path, 'wb', compression=inferred_compression, is_text=False) + if protocol < 0: + protocol = pkl.HIGHEST_PROTOCOL try: pkl.dump(obj, f, protocol=protocol) finally: From 35f8d1863569ed08da83c53e0e504f2a04b715f0 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Schiratti Date: Fri, 12 May 2017 15:52:07 +0200 Subject: [PATCH 04/11] Added tests for new 'protocol' parameter in 'to_pickle'. --- pandas/tests/io/test_pickle.py | 37 +++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 875b5bd3055b9..e621c4fef6fae 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -20,11 +20,12 @@ from distutils.version import LooseVersion import pandas as pd from pandas import Index -from pandas.compat import is_platform_little_endian +from pandas.compat import is_platform_little_endian, cPickle as pkl import pandas import pandas.util.testing as tm from pandas.tseries.offsets import Day, MonthEnd import shutil +import sys @pytest.fixture(scope='module') @@ -489,3 +490,37 @@ def test_read_infer(self, ext, get_random_path): df2 = pd.read_pickle(p2) tm.assert_frame_equal(df, df2) + + +# --------------------- +# test pickle protocol +# --------------------- + +class TestProtocol(object): + + @pytest.mark.parametrize('protocol', [-1, 0, 1, 2]) + def test_read(self, protocol, get_random_path): + with tm.ensure_clean(get_random_path) as path: + df = tm.makeDataFrame() + df.to_pickle(path, protocol=protocol) + df2 = pd.read_pickle(path) + tm.assert_frame_equal(df, df2) + + @pytest.mark.parametrize('protocol', [3, 4]) + @pytest.mark.skipif(sys.version_info[:2] >= (3, 4), + 
reason="Testing invalid parameters for " + "Python 2.x and 3.y (y < 4).") + def test_read_bad_versions(self, protocol, get_random_path): + # For Python 2.x (respectively 3.y with y < 4), [expected] + # HIGHEST_PROTOCOL should be 2 (respectively 3). Hence, the protocol + # parameter should not exceed 2 (respectively 3). + if sys.version_info[:2] < (3, 0): + expect_hp = 2 + else: + expect_hp = 3 + with tm.assert_raises_regex(ValueError, + "pickle protocol must be <= %d" % + expect_hp): + with tm.ensure_clean(get_random_path) as path: + df = tm.makeDataFrame() + df.to_pickle(path, protocol=protocol) From 9c9d38fcb0bbdebc848450ea9f4396a4949b4c5d Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Schiratti Date: Fri, 12 May 2017 16:16:19 +0200 Subject: [PATCH 05/11] Added enhancement to 'whatsnew' file. --- doc/source/whatsnew/v0.21.0.txt | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 36dffc3d3378b..7743a59ad6f74 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -27,6 +27,28 @@ New features Other Enhancements ^^^^^^^^^^^^^^^^^^ +Pickle file I/O protocol parameter +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`to_pickle` now allows to specify the protocol used by the Pickler. +The 'protocol' parameter defaults to HIGHEST_PROTOCOL. For Python 2.x, HIGHEST_PROTOCOL is 2. +Since Python 3.0 (respectively 3.4), HIGHEST_PROTOCOL is 3 (respectively 4). +A negative value for the protocol parameter is equivalent to setting its value to HIGHEST_PROTOCOL. + +.. ipython:: python + + df = pd.DataFrame({ + 'A': np.random.randn(1000), + 'B': 'foo', + 'C': pd.date_range('20130101', periods=1000, freq='s')}) + +Using an explicit protocol parameter + +.. ipython:: python + + df.to_pickle("data.pkl", protocol=2) + rt = pd.read_pickle("data.pkl") + rt .. 
_whatsnew_0210.api_breaking: From 352220b151d7113eadf1f56a75a15ee19648f199 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Schiratti Date: Fri, 12 May 2017 17:15:51 +0200 Subject: [PATCH 06/11] Fix : Fixed error message in 'test_read_bad_versions'. --- pandas/tests/io/test_pickle.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index e621c4fef6fae..3b1e98b9bcbf1 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -519,8 +519,9 @@ def test_read_bad_versions(self, protocol, get_random_path): else: expect_hp = 3 with tm.assert_raises_regex(ValueError, - "pickle protocol must be <= %d" % - expect_hp): + "pickle protocol %d asked for; the highest" + " available protocol is %d" % (protocol, + expect_hp)): with tm.ensure_clean(get_random_path) as path: df = tm.makeDataFrame() df.to_pickle(path, protocol=protocol) From 460ca0ca98af3c97913e9d2020c186e03e98126e Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Schiratti Date: Wed, 17 May 2017 14:32:06 +0200 Subject: [PATCH 07/11] Shortened paragraph added in 'whatsnew'. --- doc/source/whatsnew/v0.21.0.txt | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 7743a59ad6f74..afda8773b12e4 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -30,26 +30,7 @@ Other Enhancements Pickle file I/O protocol parameter ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:func:`to_pickle` now allows to specify the protocol used by the Pickler. -The 'protocol' parameter defaults to HIGHEST_PROTOCOL. For Python 2.x, HIGHEST_PROTOCOL is 2. -Since Python 3.0 (respectively 3.4), HIGHEST_PROTOCOL is 3 (respectively 4). -A negative value for the protocol parameter is equivalent to setting its value to HIGHEST_PROTOCOL. - -.. 
ipython:: python - - df = pd.DataFrame({ - 'A': np.random.randn(1000), - 'B': 'foo', - 'C': pd.date_range('20130101', periods=1000, freq='s')}) - -Using an explicit protocol parameter - -.. ipython:: python - - df.to_pickle("data.pkl", protocol=2) - rt = pd.read_pickle("data.pkl") - rt - +- Added protocol parameter to :func:`to_pickle`. The 'protocol' parameter defaults to HIGHEST_PROTOCOL. .. _whatsnew_0210.api_breaking: From 763114657bb5bea432f202667105b35ab58421c5 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Schiratti Date: Wed, 17 May 2017 14:35:03 +0200 Subject: [PATCH 08/11] Fix : added issue number. --- doc/source/whatsnew/v0.21.0.txt | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index afda8773b12e4..9fcc7b462e287 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -27,10 +27,7 @@ New features Other Enhancements ^^^^^^^^^^^^^^^^^^ -Pickle file I/O protocol parameter -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -- Added protocol parameter to :func:`to_pickle`. The 'protocol' parameter defaults to HIGHEST_PROTOCOL. +- Added protocol parameter to :func:`to_pickle`. The 'protocol' parameter defaults to HIGHEST_PROTOCOL. (:issue:`16252`) .. _whatsnew_0210.api_breaking: From 14bc4859764349f8f14d648750b848b0a589373f Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Schiratti Date: Wed, 17 May 2017 15:25:01 +0200 Subject: [PATCH 09/11] Fix : removed unused import. 
--- pandas/tests/io/test_pickle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 3b1e98b9bcbf1..b290a6f943d91 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -20,7 +20,7 @@ from distutils.version import LooseVersion import pandas as pd from pandas import Index -from pandas.compat import is_platform_little_endian, cPickle as pkl +from pandas.compat import is_platform_little_endian import pandas import pandas.util.testing as tm from pandas.tseries.offsets import Day, MonthEnd From 20a854dc51a3f887fdbc5d1adbaf3a6276379878 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Schiratti Date: Thu, 18 May 2017 10:07:52 +0200 Subject: [PATCH 10/11] Added ref for protocol parameter + edited whatsnew. --- doc/source/whatsnew/v0.21.0.txt | 3 ++- pandas/core/generic.py | 11 ++++++----- pandas/io/pickle.py | 7 ++++--- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 9fcc7b462e287..355ef60eb0d39 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -27,7 +27,8 @@ New features Other Enhancements ^^^^^^^^^^^^^^^^^^ -- Added protocol parameter to :func:`to_pickle`. The 'protocol' parameter defaults to HIGHEST_PROTOCOL. (:issue:`16252`) +- :func:`to_pickle` has gained a protocol parameter (:issue:`16252`). By default, +this parameter is set to HIGHEST_PROTOCOL (see https://docs.python.org/3/library/pickle.html, 12.1.2). .. _whatsnew_0210.api_breaking: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1364f1ffb5a22..175fd55e31fb3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1359,13 +1359,14 @@ def to_pickle(self, path, compression='infer', .. versionadded:: 0.20.0 protocol : int Int which indicates which protocol should be used by the pickler, - default HIGHEST_PROTOCOL (Pickle module). 
The possible values for - this parameter depend on the version of Python. For Python 2.x, - possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value. - For Python >= 3.4, 4 is a valid value.A negative value for the - protocol parameter is equivalent to setting its value to + default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible + values for this parameter depend on the version of Python. For + Python 2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a + valid value. For Python >= 3.4, 4 is a valid value.A negative value + for the protocol parameter is equivalent to setting its value to HIGHEST_PROTOCOL. + .. [1] https://docs.python.org/3/library/pickle.html .. versionadded:: 0.21.0 """ diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 115368f9f67f8..6f4c714931fc8 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -22,13 +22,14 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): .. versionadded:: 0.20.0 protocol : int Int which indicates which protocol should be used by the pickler, - default HIGHEST_PROTOCOL (Pickle module). The possible values for - this parameter depend on the version of Python. For Python 2.x, - possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value. + default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible + values for this parameter depend on the version of Python. For Python + 2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value. For Python >= 3.4, 4 is a valid value. A negative value for the protocol parameter is equivalent to setting its value to HIGHEST_PROTOCOL. + .. [1] https://docs.python.org/3/library/pickle.html .. versionadded:: 0.21.0 From 8eb660d3b06333ee446698b199007a979fa7d44b Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Schiratti Date: Thu, 18 May 2017 12:15:22 +0200 Subject: [PATCH 11/11] Minor change on whatsnew. 
--- doc/source/whatsnew/v0.21.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 355ef60eb0d39..593e96960ed34 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -28,7 +28,7 @@ Other Enhancements ^^^^^^^^^^^^^^^^^^ - :func:`to_pickle` has gained a protocol parameter (:issue:`16252`). By default, -this parameter is set to HIGHEST_PROTOCOL (see https://docs.python.org/3/library/pickle.html, 12.1.2). +this parameter is set to HIGHEST_PROTOCOL (see <https://docs.python.org/3/library/pickle.html>, 12.1.2). .. _whatsnew_0210.api_breaking: