Skip to content

Commit 0fa47b6

Browse files
authored
Write pickle to file-like without intermediate in-memory buffer (#37056)
1 parent 3d29aee commit 0fa47b6

File tree

4 files changed

+38
-15
lines changed

4 files changed

+38
-15
lines changed

asv_bench/benchmarks/io/pickle.py

+6
Original file line number | Diff line number | Diff line change
@@ -24,5 +24,11 @@ def time_read_pickle(self):
2424
def time_write_pickle(self):
2525
self.df.to_pickle(self.fname)
2626

27+
def peakmem_read_pickle(self):
28+
read_pickle(self.fname)
29+
30+
def peakmem_write_pickle(self):
31+
self.df.to_pickle(self.fname)
32+
2733

2834
from ..pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v1.2.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -315,6 +315,7 @@ Performance improvements
315315
avoiding creating these again, if created on either. This can speed up operations that depend on creating copies of existing indexes (:issue:`36840`)
316316
- Performance improvement in :meth:`RollingGroupby.count` (:issue:`35625`)
317317
- Small performance decrease to :meth:`Rolling.min` and :meth:`Rolling.max` for fixed windows (:issue:`36567`)
318+
- Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in Python 3.8+ (:issue:`34244`)
318319
- Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)
319320

320321
.. ---------------------------------------------------------------------------

pandas/io/pickle.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -98,7 +98,7 @@ def to_pickle(
9898
if protocol < 0:
9999
protocol = pickle.HIGHEST_PROTOCOL
100100
try:
101-
f.write(pickle.dumps(obj, protocol=protocol))
101+
pickle.dump(obj, f, protocol=protocol)
102102
finally:
103103
if f != filepath_or_buffer:
104104
# do not close user-provided file objects GH 35679

pandas/tests/io/test_pickle.py

+30-14
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
"""
1313
import bz2
1414
import datetime
15+
import functools
1516
import glob
1617
import gzip
1718
import io
@@ -24,7 +25,7 @@
2425

2526
import pytest
2627

27-
from pandas.compat import get_lzma_file, import_lzma, is_platform_little_endian
28+
from pandas.compat import PY38, get_lzma_file, import_lzma, is_platform_little_endian
2829
import pandas.util._test_decorators as td
2930

3031
import pandas as pd
@@ -155,28 +156,43 @@ def test_pickles(current_pickle_data, legacy_pickle):
155156
compare(current_pickle_data, legacy_pickle, version)
156157

157158

158-
def test_round_trip_current(current_pickle_data):
159-
def python_pickler(obj, path):
160-
with open(path, "wb") as fh:
161-
pickle.dump(obj, fh, protocol=-1)
159+
def python_pickler(obj, path):
160+
with open(path, "wb") as fh:
161+
pickle.dump(obj, fh, protocol=-1)
162162

163-
def python_unpickler(path):
164-
with open(path, "rb") as fh:
165-
fh.seek(0)
166-
return pickle.load(fh)
167163

164+
def python_unpickler(path):
165+
with open(path, "rb") as fh:
166+
fh.seek(0)
167+
return pickle.load(fh)
168+
169+
170+
@pytest.mark.parametrize(
171+
"pickle_writer",
172+
[
173+
pytest.param(python_pickler, id="python"),
174+
pytest.param(pd.to_pickle, id="pandas_proto_default"),
175+
pytest.param(
176+
functools.partial(pd.to_pickle, protocol=pickle.HIGHEST_PROTOCOL),
177+
id="pandas_proto_highest",
178+
),
179+
pytest.param(functools.partial(pd.to_pickle, protocol=4), id="pandas_proto_4"),
180+
pytest.param(
181+
functools.partial(pd.to_pickle, protocol=5),
182+
id="pandas_proto_5",
183+
marks=pytest.mark.skipif(not PY38, reason="protocol 5 not supported"),
184+
),
185+
],
186+
)
187+
def test_round_trip_current(current_pickle_data, pickle_writer):
168188
data = current_pickle_data
169189
for typ, dv in data.items():
170190
for dt, expected in dv.items():
171191

172192
for writer in [pd.to_pickle, python_pickler]:
173-
if writer is None:
174-
continue
175-
176193
with tm.ensure_clean() as path:
177-
178194
# test writing with each pickler
179-
writer(expected, path)
195+
pickle_writer(expected, path)
180196

181197
# test reading with each unpickler
182198
result = pd.read_pickle(path)

0 commit comments

Comments
 (0)