Skip to content

Commit 5c2fb96

Browse files
authored
REGR: write compressed pickle files with protocol=5 (#39376)
1 parent f164cb9 commit 5c2fb96

File tree

3 files changed

+26
-2
lines changed

3 files changed

+26
-2
lines changed

doc/source/whatsnew/v1.2.2.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ including other versions of pandas.
1414

1515
Fixed regressions
1616
~~~~~~~~~~~~~~~~~
17-
-
17+
- Fixed regression in :meth:`~DataFrame.to_pickle` failing to create bz2/xz compressed pickle files with ``protocol=5`` (:issue:`39002`)
1818
-
1919

2020
.. ---------------------------------------------------------------------------

pandas/io/pickle.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,19 @@ def to_pickle(
9494
is_text=False,
9595
storage_options=storage_options,
9696
) as handles:
97-
pickle.dump(obj, handles.handle, protocol=protocol) # type: ignore[arg-type]
97+
if handles.compression["method"] in ("bz2", "xz") and protocol >= 5:
98+
# pickle protocol 5 raises a TypeError with bz2/xz handles (GH#39002): fall back to letting
99+
# pickle create the entire object and then write it to the buffer.
100+
# "zip" would also be here if pandas.io.common._BytesZipFile
101+
# didn't buffer write calls
102+
handles.handle.write(
103+
pickle.dumps(obj, protocol=protocol) # type: ignore[arg-type]
104+
)
105+
else:
106+
# letting pickle write directly to the buffer is more memory-efficient
107+
pickle.dump(
108+
obj, handles.handle, protocol=protocol # type: ignore[arg-type]
109+
)
98110

99111

100112
@doc(storage_options=generic._shared_docs["storage_options"])

pandas/tests/io/test_pickle.py

+12
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import bz2
1414
import datetime
1515
import functools
16+
from functools import partial
1617
import glob
1718
import gzip
1819
import io
@@ -594,3 +595,14 @@ def test_pickle_preserves_block_ndim():
594595

595596
# GH#37631 OP issue was about indexing, underlying problem was pickle
596597
tm.assert_series_equal(res[[True]], ser)
598+
599+
600+
@pytest.mark.parametrize("protocol", [pickle.DEFAULT_PROTOCOL, pickle.HIGHEST_PROTOCOL])
601+
def test_pickle_big_dataframe_compression(protocol, compression):
602+
# GH#39002
603+
df = pd.DataFrame(range(100000))
604+
result = tm.round_trip_pathlib(
605+
partial(df.to_pickle, protocol=protocol, compression=compression),
606+
partial(pd.read_pickle, compression=compression),
607+
)
608+
tm.assert_frame_equal(df, result)

0 commit comments

Comments
 (0)