From 54a73c5a94bbe27fc04132a357c72c669cc9eddf Mon Sep 17 00:00:00 2001 From: Igor Date: Sun, 11 Oct 2020 17:13:17 +0100 Subject: [PATCH 1/8] Write pickle to file-like without intermediate storage Before this change, calling pickle.dumps() created an in-memory byte buffer, negating the advantage of zero-copy pickle protocol 5. After this change, pickle.dump writes directly to open file(-like), cutting peak memory in half in most cases. --- pandas/io/pickle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 80baa6f78ddd7..426a40a65b522 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -98,7 +98,7 @@ def to_pickle( if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL try: - f.write(pickle.dumps(obj, protocol=protocol)) + pickle.dump(obj, f, protocol=protocol) finally: if f != filepath_or_buffer: # do not close user-provided file objects GH 35679 From e35772484a240b24fa830a5f55478db39382c38b Mon Sep 17 00:00:00 2001 From: Igor Date: Sun, 11 Oct 2020 17:52:17 +0100 Subject: [PATCH 2/8] Added note in whatsnew for v1.2.0 --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2b4b10c39602a..09dafd7d4e2f9 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -313,6 +313,7 @@ Performance improvements avoiding creating these again, if created on either. This can speed up operations that depend on creating copies of existing indexes (:issue:`36840`) - Performance improvement in :meth:`RollingGroupby.count` (:issue:`35625`) - Small performance decrease to :meth:`Rolling.min` and :meth:`Rolling.max` for fixed windows (:issue:`36567`) +- Reduced peak memory usage in DataFrame.to_pickle() when using protocol=5 in python 3.8+ .. 
--------------------------------------------------------------------------- From 82b046fc3faa1680516954b0b2254d382643351b Mon Sep 17 00:00:00 2001 From: Igor Date: Sun, 11 Oct 2020 18:17:21 +0100 Subject: [PATCH 3/8] Add explicit tests for default and specific pickling protocols --- pandas/tests/io/test_pickle.py | 44 ++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 2241fe7013568..8f086ea6acff3 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -12,6 +12,7 @@ """ import bz2 import datetime +import functools import glob import gzip import io @@ -19,6 +20,7 @@ from pathlib import Path import pickle import shutil +import sys from warnings import catch_warnings, simplefilter import zipfile @@ -155,28 +157,44 @@ def test_pickles(current_pickle_data, legacy_pickle): compare(current_pickle_data, legacy_pickle, version) -def test_round_trip_current(current_pickle_data): - def python_pickler(obj, path): - with open(path, "wb") as fh: - pickle.dump(obj, fh, protocol=-1) +def python_pickler(obj, path): + with open(path, "wb") as fh: + pickle.dump(obj, fh, protocol=-1) - def python_unpickler(path): - with open(path, "rb") as fh: - fh.seek(0) - return pickle.load(fh) +def python_unpickler(path): + with open(path, "rb") as fh: + fh.seek(0) + return pickle.load(fh) + + +named_pickle_writers = { + "python": python_pickler, + "pandas_proto_default": pd.to_pickle, + "pandas_proto_highest": functools.partial( + pd.to_pickle, protocol=pickle.HIGHEST_PROTOCOL + ), + "pandas_proto_4": functools.partial(pd.to_pickle, protocol=4), +} +if sys.version_info >= (3, 8): + named_pickle_writers.update( + { + "pandas_proto_5": functools.partial(pd.to_pickle, protocol=5), + } + ) +pickle_writer_ids, pickle_writers = zip(*named_pickle_writers.items()) + + +@pytest.mark.parametrize("pickle_writer", pickle_writers, ids=pickle_writer_ids) +def 
test_round_trip_current(current_pickle_data, pickle_writer): data = current_pickle_data for typ, dv in data.items(): for dt, expected in dv.items(): for writer in [pd.to_pickle, python_pickler]: - if writer is None: - continue - with tm.ensure_clean() as path: - # test writing with each pickler - writer(expected, path) + pickle_writer(expected, path) # test reading with each unpickler result = pd.read_pickle(path) From 3c1ee2700d64d8018f59dd288f0b35139c154a14 Mon Sep 17 00:00:00 2001 From: Igor Date: Sun, 11 Oct 2020 18:29:41 +0100 Subject: [PATCH 4/8] Add asv peakmem benchmark for to_pickle/from_pickle --- asv_bench/benchmarks/io/pickle.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py index 4ca9a82ae4827..656fe2197bc8a 100644 --- a/asv_bench/benchmarks/io/pickle.py +++ b/asv_bench/benchmarks/io/pickle.py @@ -24,5 +24,11 @@ def time_read_pickle(self): def time_write_pickle(self): self.df.to_pickle(self.fname) + def peakmem_read_pickle(self): + read_pickle(self.fname) + + def peakmem_write_pickle(self): + self.df.to_pickle(self.fname) + from ..pandas_vb_common import setup # noqa: F401 isort:skip From f35c59d904e9e9b67f5096b993215c5ac0a17bb6 Mon Sep 17 00:00:00 2001 From: Igor Date: Sun, 11 Oct 2020 19:31:34 +0100 Subject: [PATCH 5/8] Clean up test parametrization over supported protocols --- pandas/tests/io/test_pickle.py | 37 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 8f086ea6acff3..63566665d6b15 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -26,7 +26,7 @@ import pytest -from pandas.compat import get_lzma_file, import_lzma, is_platform_little_endian +from pandas.compat import get_lzma_file, import_lzma, is_platform_little_endian, PY38 import pandas.util._test_decorators as td import pandas as pd @@ -168,24 +168,23 @@ def 
python_unpickler(path): return pickle.load(fh) -named_pickle_writers = { - "python": python_pickler, - "pandas_proto_default": pd.to_pickle, - "pandas_proto_highest": functools.partial( - pd.to_pickle, protocol=pickle.HIGHEST_PROTOCOL - ), - "pandas_proto_4": functools.partial(pd.to_pickle, protocol=4), -} -if sys.version_info >= (3, 8): - named_pickle_writers.update( - { - "pandas_proto_5": functools.partial(pd.to_pickle, protocol=5), - } - ) -pickle_writer_ids, pickle_writers = zip(*named_pickle_writers.items()) - - -@pytest.mark.parametrize("pickle_writer", pickle_writers, ids=pickle_writer_ids) +@pytest.mark.parametrize( + "pickle_writer", + [ + pytest.param(python_pickler, id="python"), + pytest.param(pd.to_pickle, id="pandas_proto_default"), + pytest.param( + functools.partial(pd.to_pickle, protocol=pickle.HIGHEST_PROTOCOL), + id="pandas_proto_highest", + ), + pytest.param(functools.partial(pd.to_pickle, protocol=4), id="pandas_proto_4"), + pytest.param( + functools.partial(pd.to_pickle, protocol=5), + id="pandas_proto_5", + marks=pytest.mark.skipif(not PY38, reason="pickle protocol 5 not supported"), + ), + ], +) def test_round_trip_current(current_pickle_data, pickle_writer): data = current_pickle_data for typ, dv in data.items(): From 7db56971cbeb4a52a4ff1e90b4eec9916f6abcdd Mon Sep 17 00:00:00 2001 From: Igor Date: Sun, 11 Oct 2020 19:34:14 +0100 Subject: [PATCH 6/8] Fix linter errors --- pandas/tests/io/test_pickle.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 63566665d6b15..79371cd84c10b 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -20,7 +20,6 @@ from pathlib import Path import pickle import shutil -import sys from warnings import catch_warnings, simplefilter import zipfile @@ -181,7 +180,7 @@ def python_unpickler(path): pytest.param( functools.partial(pd.to_pickle, protocol=5), id="pandas_proto_5", - 
marks=pytest.mark.skipif(not PY38, reason="pickle protocol 5 not supported"), + marks=pytest.mark.skipif(not PY38, reason="protocol 5 not supported"), ), ], ) From cccfb6e756f2e985457de0df4f92715a3d8737f9 Mon Sep 17 00:00:00 2001 From: Igor Date: Sun, 11 Oct 2020 19:44:41 +0100 Subject: [PATCH 7/8] isort imports --- pandas/tests/io/test_pickle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 79371cd84c10b..a37349654b120 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -25,7 +25,7 @@ import pytest -from pandas.compat import get_lzma_file, import_lzma, is_platform_little_endian, PY38 +from pandas.compat import PY38, get_lzma_file, import_lzma, is_platform_little_endian import pandas.util._test_decorators as td import pandas as pd From 19557b4a31166eba75925bb2c96f210f63ec822c Mon Sep 17 00:00:00 2001 From: Igor Date: Mon, 12 Oct 2020 18:23:13 +0100 Subject: [PATCH 8/8] Format whatsnew entry --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 09dafd7d4e2f9..7d33e785d52c4 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -313,7 +313,7 @@ Performance improvements avoiding creating these again, if created on either. This can speed up operations that depend on creating copies of existing indexes (:issue:`36840`) - Performance improvement in :meth:`RollingGroupby.count` (:issue:`35625`) - Small performance decrease to :meth:`Rolling.min` and :meth:`Rolling.max` for fixed windows (:issue:`36567`) -- Reduced peak memory usage in DataFrame.to_pickle() when using protocol=5 in python 3.8+ +- Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in Python 3.8+ (:issue:`34244`) .. ---------------------------------------------------------------------------