Write pickle to file-like without intermediate in-memory buffer (#37056)

ig248 · web-flow · commit 0fa47b67d66a · 2020-10-14T08:27:37.000-04:00
diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py
@@ -24,5 +24,11 @@ def time_read_pickle(self):
     def time_write_pickle(self):
         self.df.to_pickle(self.fname)
 
+    def peakmem_read_pickle(self):
+        read_pickle(self.fname)
+
+    def peakmem_write_pickle(self):
+        self.df.to_pickle(self.fname)
+
 
 from ..pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -315,6 +315,7 @@ Performance improvements
   avoiding creating these again, if created on either. This can speed up operations that depend on creating copies of existing indexes (:issue:`36840`)
 - Performance improvement in :meth:`RollingGroupby.count` (:issue:`35625`)
 - Small performance decrease to :meth:`Rolling.min` and :meth:`Rolling.max` for fixed windows (:issue:`36567`)
+- Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in Python 3.8+ (:issue:`34244`)
 - Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
@@ -98,7 +98,7 @@ def to_pickle(
     if protocol < 0:
         protocol = pickle.HIGHEST_PROTOCOL
     try:
-        f.write(pickle.dumps(obj, protocol=protocol))
+        pickle.dump(obj, f, protocol=protocol)
     finally:
         if f != filepath_or_buffer:
             # do not close user-provided file objects GH 35679
diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py
@@ -12,6 +12,7 @@
 """
 import bz2
 import datetime
+import functools
 import glob
 import gzip
 import io
@@ -24,7 +25,7 @@
 
 import pytest
 
-from pandas.compat import get_lzma_file, import_lzma, is_platform_little_endian
+from pandas.compat import PY38, get_lzma_file, import_lzma, is_platform_little_endian
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -155,28 +156,43 @@ def test_pickles(current_pickle_data, legacy_pickle):
         compare(current_pickle_data, legacy_pickle, version)
 
 
-def test_round_trip_current(current_pickle_data):
-    def python_pickler(obj, path):
-        with open(path, "wb") as fh:
-            pickle.dump(obj, fh, protocol=-1)
+def python_pickler(obj, path):
+    with open(path, "wb") as fh:
+        pickle.dump(obj, fh, protocol=-1)
 
-    def python_unpickler(path):
-        with open(path, "rb") as fh:
-            fh.seek(0)
-            return pickle.load(fh)
 
+def python_unpickler(path):
+    with open(path, "rb") as fh:
+        fh.seek(0)
+        return pickle.load(fh)
+
+
+@pytest.mark.parametrize(
+    "pickle_writer",
+    [
+        pytest.param(python_pickler, id="python"),
+        pytest.param(pd.to_pickle, id="pandas_proto_default"),
+        pytest.param(
+            functools.partial(pd.to_pickle, protocol=pickle.HIGHEST_PROTOCOL),
+            id="pandas_proto_highest",
+        ),
+        pytest.param(functools.partial(pd.to_pickle, protocol=4), id="pandas_proto_4"),
+        pytest.param(
+            functools.partial(pd.to_pickle, protocol=5),
+            id="pandas_proto_5",
+            marks=pytest.mark.skipif(not PY38, reason="protocol 5 not supported"),
+        ),
+    ],
+)
+def test_round_trip_current(current_pickle_data, pickle_writer):
     data = current_pickle_data
     for typ, dv in data.items():
         for dt, expected in dv.items():
 
             for writer in [pd.to_pickle, python_pickler]:
-                if writer is None:
-                    continue
-
                 with tm.ensure_clean() as path:
-
                     # test writing with each pickler
-                    writer(expected, path)
+                    pickle_writer(expected, path)
 
                     # test reading with each unpickler
                     result = pd.read_pickle(path)