Skip to content

Commit 4a2b068

Browse files
authored
PERF: Improve pickle support with BZ2 & LZMA (#49068)
* Add `BZ2File` wrapper for pickle protocol 5 * Add `LZMAFile` wrapper for pickle protocol 5 * Use BZ2 & LZMA wrappers for full pickle support * Workaround linter issue `PickleBuffer` isn't currently included in `SupportsBytes`, which causes issues with pyright when passing `PickleBuffer` instances to `bytes`. Though it appears ok passing `PickleBuffer` instances to `memoryview`s. So do that instead. This is functionally equivalent. There is a slight performance cost to making a `memoryview`, but this is likely negligible compared to copying to `bytes`. * Refactor out `flatten_buffer` * Refactor `BZ2File` into separate module * Test `flatten_buffer` This provides a reasonable proxy for testing patched `BZ2File` and `LZMAFile` objects. * Move `flatten_buffer` to `_utils` This ran into cyclic import issues in `pickle_compat`. So move `flatten_buffer` to its own module free of these issues. * Import `annotations` to fix `|` usage * Sort `import`s to fix lint * Patch `BZ2File` & `LZMAFile` on Python pre-3.10 This should limit the effects of this patch. Also should make it easier to remove this backport later once all supported Python versions have the fix. * Test C & F contiguous NumPy arrays Also test another non-contiguous array. * Test `memoryview` is 1-D `uint8` contiguous data If a `memoryview` is returned, make sure it is as close to `bytes` | `bytearray` as possible. This ensures if other functions assume something like `bytes` (for example assuming `len(b)` is the number of bytes contained), things will continue to work even though this is a `memoryview`. * Run `black` on `bz2` and `lzma` compat files * One more lint fix * Drop unused `PickleBuffer` `import`s * Simplify change to `pandas.compat.__init__` Now that the LZMA changes are in a separate file, clean up the changes to `pandas.compat.__init__`. 
* Type `flatten_buffer` result * Use `order="A"` in `memoryview.tobytes(...)` In the function `flatten_buffer`, the order is already effectively enforced when copying can be avoided by using `PickleBuffer.raw(...)`. However some test comparisons failed (when they shouldn't have) as this wasn't specified. So add the `order` in both the function and the test. This should fix that test failure. * Move all compat compressors into a single file * Fix `BZ2File` `import` * Refactor out common compat constants * Fix `import` sorting * Drop unused `import` * Ignore `flake8` errors on wildcard `import` * Revert "Ignore `flake8` errors on wildcard `import`" This reverts commit f1f1a2e. * Explicitly `import` all constants * Assign `IS64` first * Try `noqa` on wildcard `import` again * Declare `BZ2File` & `LZMAFile` once Fixes a linter issue from pyright. * `import PickleBuffer` for simplicity * Add `bytearray` to return type * Test `bytes` & `bytearray` are returned unaltered * Explicit list all constants * Trick linter into thinking constants are used ;) * Add new entry to 2.0.0 * Assign constants to themselves Should work around linter issues. * Update changelog entry [skip ci] * Add constants to `__all__` * Update changelog entry [ci skip] * Use Sphinx method annotation
1 parent 93bd1a8 commit 4a2b068

File tree

7 files changed

+151
-27
lines changed

7 files changed

+151
-27
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ Performance improvements
198198
- Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`)
199199
- Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`)
200200
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
201+
- Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)
201202
- Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)
202203
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`)
203204
- Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`)

pandas/compat/__init__.py

+16-15
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,16 @@
1212
import os
1313
import platform
1414
import sys
15-
from typing import TYPE_CHECKING
1615

1716
from pandas._typing import F
17+
import pandas.compat._compressors
18+
from pandas.compat._constants import (
19+
IS64,
20+
PY39,
21+
PY310,
22+
PY311,
23+
PYPY,
24+
)
1825
from pandas.compat.numpy import (
1926
is_numpy_dev,
2027
np_version_under1p21,
@@ -26,15 +33,6 @@
2633
pa_version_under9p0,
2734
)
2835

29-
if TYPE_CHECKING:
30-
import lzma
31-
32-
PY39 = sys.version_info >= (3, 9)
33-
PY310 = sys.version_info >= (3, 10)
34-
PY311 = sys.version_info >= (3, 11)
35-
PYPY = platform.python_implementation() == "PyPy"
36-
IS64 = sys.maxsize > 2**32
37-
3836

3937
def set_function_name(f: F, name: str, cls) -> F:
4038
"""
@@ -121,7 +119,7 @@ def is_ci_environment() -> bool:
121119
return os.environ.get("PANDAS_CI", "0") == "1"
122120

123121

124-
def get_lzma_file() -> type[lzma.LZMAFile]:
122+
def get_lzma_file() -> type[pandas.compat._compressors.LZMAFile]:
125123
"""
126124
Importing the `LZMAFile` class from the `lzma` module.
127125
@@ -135,15 +133,13 @@ def get_lzma_file() -> type[lzma.LZMAFile]:
135133
RuntimeError
136134
If the `lzma` module was not imported correctly, or didn't exist.
137135
"""
138-
try:
139-
import lzma
140-
except ImportError:
136+
if not pandas.compat._compressors.has_lzma:
141137
raise RuntimeError(
142138
"lzma module not available. "
143139
"A Python re-install with the proper dependencies, "
144140
"might be required to solve this issue."
145141
)
146-
return lzma.LZMAFile
142+
return pandas.compat._compressors.LZMAFile
147143

148144

149145
__all__ = [
@@ -153,4 +149,9 @@ def get_lzma_file() -> type[lzma.LZMAFile]:
153149
"pa_version_under7p0",
154150
"pa_version_under8p0",
155151
"pa_version_under9p0",
152+
"IS64",
153+
"PY39",
154+
"PY310",
155+
"PY311",
156+
"PYPY",
156157
]

pandas/compat/_compressors.py

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
"""
2+
Patched ``BZ2File`` and ``LZMAFile`` to handle pickle protocol 5.
3+
"""
4+
5+
from __future__ import annotations
6+
7+
import bz2
8+
from pickle import PickleBuffer
9+
10+
from pandas.compat._constants import PY310
11+
12+
try:
13+
import lzma
14+
15+
has_lzma = True
16+
except ImportError:
17+
has_lzma = False
18+
19+
20+
def flatten_buffer(
    b: bytes | bytearray | memoryview | PickleBuffer,
) -> bytes | bytearray | memoryview:
    """
    Return a flat, byte-oriented buffer holding the contents of ``b``.

    Parameters
    ----------
    b : bytes, bytearray, memoryview or PickleBuffer
        The buffer to flatten. Anything that is not already a
        ``PickleBuffer`` is wrapped in one before being flattened.

    Returns
    -------
    bytes, bytearray or memoryview
        ``b`` itself when it is ``bytes`` or ``bytearray``; otherwise a
        1-D ``uint8`` ``memoryview`` over the same memory when that can be
        obtained without copying, or a ``bytes`` copy when it cannot.
    """
    # `bytes` and `bytearray` already satisfy the contract: hand them back.
    if isinstance(b, (bytes, bytearray)):
        return b

    wrapped = b if isinstance(b, PickleBuffer) else PickleBuffer(b)

    try:
        # zero-copy path: 1-D `uint8` view of the underlying memory
        flat = wrapped.raw()
    except BufferError:
        # the buffer is not contiguous, so an in-memory copy is unavoidable
        flat = memoryview(wrapped).tobytes("A")
    return flat
42+
43+
44+
class BZ2File(bz2.BZ2File):
    """
    ``bz2.BZ2File`` whose ``write`` is patched on Python versions before 3.10.
    """

    if not PY310:

        def write(self, b) -> int:
            # `bz2.BZ2File` expects `len(b)` to be the number of bytes in
            # `b`. Flatten `b` first so that holds, copying as little as
            # possible.
            #
            # Note: This is fixed in Python 3.10.
            flat = flatten_buffer(b)
            return super().write(flat)
55+
56+
57+
if has_lzma:

    class LZMAFile(lzma.LZMAFile):
        """
        ``lzma.LZMAFile`` whose ``write`` is patched on Python versions
        before 3.10.
        """

        if not PY310:

            def write(self, b) -> int:
                # `lzma.LZMAFile` expects `len(b)` to be the number of bytes
                # in `b`. Flatten `b` first so that holds, copying as little
                # as possible.
                #
                # Note: This is fixed in Python 3.10.
                flat = flatten_buffer(b)
                return super().write(flat)

pandas/compat/_constants.py

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
"""
2+
_constants
3+
======
4+
5+
Constants relevant for the Python implementation.
6+
"""
7+
8+
from __future__ import annotations
9+
10+
import platform
11+
import sys
12+
13+
IS64 = sys.maxsize > 2**32
14+
15+
PY39 = sys.version_info >= (3, 9)
16+
PY310 = sys.version_info >= (3, 10)
17+
PY311 = sys.version_info >= (3, 11)
18+
PYPY = platform.python_implementation() == "PyPy"
19+
20+
21+
__all__ = [
22+
"IS64",
23+
"PY39",
24+
"PY310",
25+
"PY311",
26+
"PYPY",
27+
]

pandas/io/common.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
ABC,
66
abstractmethod,
77
)
8-
import bz2
98
import codecs
109
import dataclasses
1110
import functools
@@ -55,6 +54,7 @@
5554
WriteBuffer,
5655
)
5756
from pandas.compat import get_lzma_file
57+
from pandas.compat._compressors import BZ2File as _BZ2File
5858
from pandas.compat._optional import import_optional_dependency
5959
from pandas.util._decorators import doc
6060
from pandas.util._exceptions import find_stack_level
@@ -761,9 +761,9 @@ def get_handle(
761761

762762
# BZ Compression
763763
elif compression == "bz2":
764-
# No overload variant of "BZ2File" matches argument types
764+
# Overload of "BZ2File" to handle pickle protocol 5
765765
# "Union[str, BaseBuffer]", "str", "Dict[str, Any]"
766-
handle = bz2.BZ2File( # type: ignore[call-overload]
766+
handle = _BZ2File( # type: ignore[call-overload]
767767
handle,
768768
mode=ioargs.mode,
769769
**compression_args,

pandas/io/pickle.py

+2-9
Original file line numberDiff line numberDiff line change
@@ -101,15 +101,8 @@ def to_pickle(
101101
is_text=False,
102102
storage_options=storage_options,
103103
) as handles:
104-
if handles.compression["method"] in ("bz2", "xz") and protocol >= 5:
105-
# some weird TypeError GH#39002 with pickle 5: fallback to letting
106-
# pickle create the entire object and then write it to the buffer.
107-
# "zip" would also be here if pandas.io.common._BytesZipFile
108-
# wouldn't buffer write calls
109-
handles.handle.write(pickle.dumps(obj, protocol=protocol))
110-
else:
111-
# letting pickle write directly to the buffer is more memory-efficient
112-
pickle.dump(obj, handles.handle, protocol=protocol)
104+
# letting pickle write directly to the buffer is more memory-efficient
105+
pickle.dump(obj, handles.handle, protocol=protocol)
113106

114107

115108
@doc(

pandas/tests/io/test_pickle.py

+33
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
1111
3. Move the created pickle to "data/legacy_pickle/<version>" directory.
1212
"""
13+
from array import array
1314
import bz2
1415
import datetime
1516
import functools
@@ -37,6 +38,7 @@
3738
get_lzma_file,
3839
is_platform_little_endian,
3940
)
41+
from pandas.compat._compressors import flatten_buffer
4042
from pandas.compat._optional import import_optional_dependency
4143
import pandas.util._test_decorators as td
4244

@@ -105,6 +107,37 @@ def legacy_pickle(request, datapath):
105107
# ---------------------
106108
# tests
107109
# ---------------------
110+
111+
112+
@pytest.mark.parametrize(
    "data",
    [
        b"123",
        b"123456",
        bytearray(b"123"),
        memoryview(b"123"),
        pickle.PickleBuffer(b"123"),
        array("I", [1, 2, 3]),
        memoryview(b"123456").cast("B", (3, 2)),
        memoryview(b"123456").cast("B", (3, 2))[::2],
        np.arange(12).reshape((3, 4), order="C"),
        np.arange(12).reshape((3, 4), order="F"),
        np.arange(12).reshape((3, 4), order="C")[:, ::2],
    ],
)
def test_flatten_buffer(data):
    # Reference bytes: the input's contents dumped with order "A".
    expected = memoryview(data).tobytes("A")
    result = flatten_buffer(data)

    assert result == expected

    if isinstance(data, (bytes, bytearray)):
        # `bytes` / `bytearray` inputs must come back as the same object.
        assert result is data
    elif isinstance(result, memoryview):
        # A returned `memoryview` must be 1-D, `uint8` and contiguous,
        # with one element per byte.
        assert result.ndim == 1
        assert result.format == "B"
        assert result.contiguous
        assert result.shape == (result.nbytes,)
139+
140+
108141
def test_pickles(legacy_pickle):
109142
if not is_platform_little_endian():
110143
pytest.skip("known failure on non-little endian")

0 commit comments

Comments
 (0)