diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py index 4ca9a82ae4827..656fe2197bc8a 100644 --- a/asv_bench/benchmarks/io/pickle.py +++ b/asv_bench/benchmarks/io/pickle.py @@ -24,5 +24,11 @@ def time_read_pickle(self): def time_write_pickle(self): self.df.to_pickle(self.fname) + def peakmem_read_pickle(self): + read_pickle(self.fname) + + def peakmem_write_pickle(self): + self.df.to_pickle(self.fname) + from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index da7dcc6ab29b9..d84b7532060af 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -314,6 +314,7 @@ Performance improvements avoiding creating these again, if created on either. This can speed up operations that depend on creating copies of existing indexes (:issue:`36840`) - Performance improvement in :meth:`RollingGroupby.count` (:issue:`35625`) - Small performance decrease to :meth:`Rolling.min` and :meth:`Rolling.max` for fixed windows (:issue:`36567`) +- Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in Python 3.8+ (:issue:`34244`) - Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 80baa6f78ddd7..426a40a65b522 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -98,7 +98,7 @@ def to_pickle( if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL try: - f.write(pickle.dumps(obj, protocol=protocol)) + pickle.dump(obj, f, protocol=protocol) finally: if f != filepath_or_buffer: # do not close user-provided file objects GH 35679 diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 2241fe7013568..a37349654b120 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -12,6 +12,7 @@ """ import bz2 import datetime +import functools import glob import gzip import io @@ -24,7 +25,7 @@ import pytest -from pandas.compat import get_lzma_file, import_lzma, is_platform_little_endian +from pandas.compat import PY38, get_lzma_file, import_lzma, is_platform_little_endian import pandas.util._test_decorators as td import pandas as pd @@ -155,28 +156,43 @@ def test_pickles(current_pickle_data, legacy_pickle): compare(current_pickle_data, legacy_pickle, version) -def test_round_trip_current(current_pickle_data): - def python_pickler(obj, path): - with open(path, "wb") as fh: - pickle.dump(obj, fh, protocol=-1) +def python_pickler(obj, path): + with open(path, "wb") as fh: + pickle.dump(obj, fh, protocol=-1) - def python_unpickler(path): - with open(path, "rb") as fh: - fh.seek(0) - return pickle.load(fh) +def python_unpickler(path): + with open(path, "rb") as fh: + fh.seek(0) + return pickle.load(fh) + + +@pytest.mark.parametrize( + "pickle_writer", + [ + pytest.param(python_pickler, id="python"), + pytest.param(pd.to_pickle, id="pandas_proto_default"), + pytest.param( + functools.partial(pd.to_pickle, protocol=pickle.HIGHEST_PROTOCOL), + id="pandas_proto_highest", + ), + pytest.param(functools.partial(pd.to_pickle, protocol=4), id="pandas_proto_4"), + pytest.param( + 
functools.partial(pd.to_pickle, protocol=5), + id="pandas_proto_5", + marks=pytest.mark.skipif(not PY38, reason="protocol 5 not supported"), + ), + ], +) +def test_round_trip_current(current_pickle_data, pickle_writer): data = current_pickle_data for typ, dv in data.items(): for dt, expected in dv.items(): for writer in [pd.to_pickle, python_pickler]: - if writer is None: - continue - with tm.ensure_clean() as path: - # test writing with each pickler - writer(expected, path) + pickle_writer(expected, path) # test reading with each unpickler result = pd.read_pickle(path)