Skip to content

PERF: Improve pickle support with BZ2 & LZMA #49068

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 49 commits into from
Oct 21, 2022
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
b3e1bc5
Add `BZ2File` wrapper for pickle protocol 5
jakirkham Oct 13, 2022
17f725b
Add `LZMAFile` wrapper for pickle protocol 5
jakirkham Oct 13, 2022
280731e
Use BZ2 & LZMA wrappers for full pickle support
jakirkham Oct 13, 2022
ccda94e
Workaround linter issue
jakirkham Oct 13, 2022
08c37e5
Refactor out `flatten_buffer`
jakirkham Oct 18, 2022
8109338
Refactor `BZ2File` into separate module
jakirkham Oct 18, 2022
3c498bd
Merge pandas-dev/main into jakirkham/fix_pickle5
jakirkham Oct 18, 2022
691eba7
Test `flatten_buffer`
jakirkham Oct 18, 2022
7a93b70
Move `flatten_buffer` to `_utils`
jakirkham Oct 18, 2022
8f5b0a1
Import `annotations` to fix `|` usage
jakirkham Oct 18, 2022
b5ce67c
Merge pandas-dev/main into jakirkham/fix_pickle5
jakirkham Oct 18, 2022
c54529a
Sort `import`s to fix lint
jakirkham Oct 18, 2022
7604d48
Patch `BZ2File` & `LZMAFile` on Python pre-3.10
jakirkham Oct 18, 2022
9f3d387
Test C & F contiguous NumPy arrays
jakirkham Oct 18, 2022
16f21a5
Test `memoryview` is 1-D `uint8` contiguous data
jakirkham Oct 18, 2022
6df7e08
Run `black` on `bz2` and `lzma` compat files
jakirkham Oct 18, 2022
39ffab0
One more lint fix
jakirkham Oct 18, 2022
f134dee
Drop unused `PickleBuffer` `import`s
jakirkham Oct 18, 2022
a7126a2
Simplify change to `pandas.compat.__init__`
jakirkham Oct 18, 2022
df7b0ce
Type `flatten_buffer` result
jakirkham Oct 18, 2022
742788f
Use `order="A"` in `memoryview.tobytes(...)`
jakirkham Oct 18, 2022
ffc58d3
Move all compat compressors into a single file
jakirkham Oct 19, 2022
0b1be16
Fix `BZ2File` `import`
jakirkham Oct 19, 2022
5a6ea45
Merge pandas-dev/main into jakirkham/fix_pickle5
jakirkham Oct 19, 2022
f00740c
Refactor out common compat constants
jakirkham Oct 19, 2022
06e5387
Fix `import` sorting
jakirkham Oct 19, 2022
269dc0f
Drop unused `import`
jakirkham Oct 19, 2022
f1f1a2e
Ignore `flake8` errors on wildcard `import`
jakirkham Oct 19, 2022
f73f0a5
Revert "Ignore `flake8` errors on wildcard `import`"
jakirkham Oct 19, 2022
b8a724b
Explicitly `import` all constants
jakirkham Oct 19, 2022
1b11188
Assign `IS64` first
jakirkham Oct 19, 2022
d2a39db
Try `noqa` on wildcard `import` again
jakirkham Oct 19, 2022
01e8604
Declare `BZ2File` & `LZMAFile` once
jakirkham Oct 19, 2022
523e20c
`import PickleBuffer` for simplicity
jakirkham Oct 19, 2022
6f7e293
Add `bytearray` to return type
jakirkham Oct 19, 2022
4614bd7
Test `bytes` & `bytearray` are returned unaltered
jakirkham Oct 19, 2022
818e08d
Merge branch 'main' into fix_pickle5
jakirkham Oct 19, 2022
f33ed7a
Merge pandas-dev/main into jakirkham/fix_pickle5
jakirkham Oct 19, 2022
cf4f926
Explicit list all constants
jakirkham Oct 20, 2022
4a2efad
Trick linter into thinking constants are used ;)
jakirkham Oct 20, 2022
b18a3f0
Merge pandas-dev/main into jakirkham/fix_pickle5
jakirkham Oct 20, 2022
0dae476
Add new entry to 2.0.0
jakirkham Oct 20, 2022
366f645
Assign constants to themselves
jakirkham Oct 20, 2022
092e726
Update changelog entry [skip ci]
jakirkham Oct 20, 2022
03b8eac
Merge pandas-dev/main into jakirkham/fix_pickle5
jakirkham Oct 20, 2022
e49ba4f
Add constants to `__all__`
jakirkham Oct 20, 2022
453b4e3
Update changelog entry [ci skip]
jakirkham Oct 21, 2022
30124dd
Use Sphinx method annotation
jakirkham Oct 21, 2022
72aeff2
Merge pandas-dev/main into jakirkham/fix_pickle5
jakirkham Oct 21, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 11 additions & 9 deletions pandas/compat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,16 @@
from __future__ import annotations

import os
from pickle import PickleBuffer
import platform
import sys
from typing import TYPE_CHECKING

try:
import pandas.compat.lzma

has_lzma = True
except ImportError:
has_lzma = False

from pandas._typing import F
from pandas.compat.numpy import (
Expand All @@ -26,9 +33,6 @@
pa_version_under9p0,
)

if TYPE_CHECKING:
import lzma

PY39 = sys.version_info >= (3, 9)
PY310 = sys.version_info >= (3, 10)
PY311 = sys.version_info >= (3, 11)
Expand Down Expand Up @@ -121,7 +125,7 @@ def is_ci_environment() -> bool:
return os.environ.get("PANDAS_CI", "0") == "1"


def get_lzma_file() -> type[lzma.LZMAFile]:
def get_lzma_file() -> type[pandas.compat.lzma.LZMAFile]:
"""
Importing the `LZMAFile` class from the `lzma` module.

Expand All @@ -135,15 +139,13 @@ def get_lzma_file() -> type[lzma.LZMAFile]:
RuntimeError
If the `lzma` module was not imported correctly, or didn't exist.
"""
try:
import lzma
except ImportError:
if not has_lzma:
raise RuntimeError(
"lzma module not available. "
"A Python re-install with the proper dependencies, "
"might be required to solve this issue."
)
return lzma.LZMAFile
return pandas.compat.lzma.LZMAFile


__all__ = [
Expand Down
29 changes: 29 additions & 0 deletions pandas/compat/_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""
Utilities used in various `compat` components.
"""

from __future__ import annotations

import pickle


def flatten_buffer(b: bytes | bytearray | memoryview | pickle.PickleBuffer):
    """
    Coerce ``b`` into a flat, byte-oriented buffer.

    ``bytes`` and ``bytearray`` inputs are handed back untouched. Any other
    buffer-exporting object is wrapped in a ``pickle.PickleBuffer`` (unless
    it already is one) and viewed as a 1-D `uint8` ``memoryview`` without
    copying. When that zero-copy view is refused with ``BufferError``, the
    data is copied into a new ``bytes`` object instead.
    """
    # Already flat and byte-typed: nothing to do.
    if isinstance(b, (bytes, bytearray)):
        return b

    wrapped = b if isinstance(b, pickle.PickleBuffer) else pickle.PickleBuffer(b)

    try:
        # zero-copy 1-D `uint8` view over the underlying memory
        flat = wrapped.raw()
    except BufferError:
        # buffer is not contiguous, so an in-memory copy is unavoidable
        flat = memoryview(wrapped).tobytes()
    return flat
25 changes: 25 additions & 0 deletions pandas/compat/bz2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""
Patched ``BZ2File`` to handle pickle protocol 5.
"""

from __future__ import annotations

import bz2
import sys

from pandas.compat._utils import flatten_buffer


class BZ2File(bz2.BZ2File):
    """
    ``bz2.BZ2File`` subclass that copes with pickle protocol 5 buffers.

    The class is declared exactly once so that importers and type checkers
    always see a single definition. Only on Python older than 3.10 is
    ``write`` overridden; on newer versions this is a plain subclass.
    """

    if sys.version_info < (3, 10):

        def write(self, b) -> int:
            """
            Write ``b`` to the compressed stream.

            Returns whatever ``bz2.BZ2File.write`` returns for the
            flattened buffer.
            """
            # Workaround issue where `bz2.BZ2File` expects `len`
            # to return the number of bytes in `b` by converting
            # `b` into something that meets that constraint with
            # minimal copying.
            #
            # Note: This is fixed in Python 3.10.
            return super().write(flatten_buffer(b))
25 changes: 25 additions & 0 deletions pandas/compat/lzma.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""
Patched ``LZMAFile`` to handle pickle protocol 5.
"""

from __future__ import annotations

import lzma
import sys

from pandas.compat._utils import flatten_buffer


class LZMAFile(lzma.LZMAFile):
    """
    ``lzma.LZMAFile`` subclass that copes with pickle protocol 5 buffers.

    The class is declared exactly once so that importers and type checkers
    always see a single definition. Only on Python older than 3.10 is
    ``write`` overridden; on newer versions this is a plain subclass.
    """

    if sys.version_info < (3, 10):

        def write(self, b) -> int:
            """
            Write ``b`` to the compressed stream.

            Returns whatever ``lzma.LZMAFile.write`` returns for the
            flattened buffer.
            """
            # Workaround issue where `lzma.LZMAFile` expects `len`
            # to return the number of bytes in `b` by converting
            # `b` into something that meets that constraint with
            # minimal copying.
            #
            # Note: This is fixed in Python 3.10.
            return super().write(flatten_buffer(b))
7 changes: 4 additions & 3 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
ABC,
abstractmethod,
)
import bz2
import codecs
import dataclasses
import functools
Expand All @@ -21,6 +20,7 @@
import mmap
import os
from pathlib import Path
from pickle import PickleBuffer
import re
import tarfile
from typing import (
Expand Down Expand Up @@ -56,6 +56,7 @@
)
from pandas.compat import get_lzma_file
from pandas.compat._optional import import_optional_dependency
from pandas.compat.bz2 import BZ2File as _BZ2File
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level

Expand Down Expand Up @@ -761,9 +762,9 @@ def get_handle(

# BZ Compression
elif compression == "bz2":
# No overload variant of "BZ2File" matches argument types
# Overload of "BZ2File" to handle pickle protocol 5
# "Union[str, BaseBuffer]", "str", "Dict[str, Any]"
handle = bz2.BZ2File( # type: ignore[call-overload]
handle = _BZ2File( # type: ignore[call-overload]
handle,
mode=ioargs.mode,
**compression_args,
Expand Down
11 changes: 2 additions & 9 deletions pandas/io/pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,15 +101,8 @@ def to_pickle(
is_text=False,
storage_options=storage_options,
) as handles:
if handles.compression["method"] in ("bz2", "xz") and protocol >= 5:
# some weird TypeError GH#39002 with pickle 5: fallback to letting
# pickle create the entire object and then write it to the buffer.
# "zip" would also be here if pandas.io.common._BytesZipFile
# wouldn't buffer write calls
handles.handle.write(pickle.dumps(obj, protocol=protocol))
else:
# letting pickle write directly to the buffer is more memory-efficient
pickle.dump(obj, handles.handle, protocol=protocol)
# letting pickle write directly to the buffer is more memory-efficient
pickle.dump(obj, handles.handle, protocol=protocol)


@doc(
Expand Down
30 changes: 30 additions & 0 deletions pandas/tests/io/test_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

3. Move the created pickle to "data/legacy_pickle/<version>" directory.
"""
from array import array
import bz2
import datetime
import functools
Expand Down Expand Up @@ -38,6 +39,7 @@
is_platform_little_endian,
)
from pandas.compat._optional import import_optional_dependency
from pandas.compat._utils import flatten_buffer
import pandas.util._test_decorators as td

import pandas as pd
Expand Down Expand Up @@ -105,6 +107,34 @@ def legacy_pickle(request, datapath):
# ---------------------
# tests
# ---------------------


@pytest.mark.parametrize(
    "data",
    [
        b"123",
        b"123456",
        bytearray(b"123"),
        memoryview(b"123"),
        pickle.PickleBuffer(b"123"),
        array("I", [1, 2, 3]),
        memoryview(b"123456").cast("B", (3, 2)),
        memoryview(b"123456").cast("B", (3, 2))[::2],
        np.arange(12).reshape((3, 4), order="C"),
        np.arange(12).reshape((3, 4), order="F"),
        np.arange(12).reshape((3, 4), order="C")[:, ::2],
    ],
)
def test_flatten_buffer(data):
    """
    Check that `flatten_buffer` yields the expected bytes for each input.

    Also checks that `bytes`/`bytearray` inputs come back as the very same
    object, and that any `memoryview` result is 1-D, `uint8` and contiguous.
    """
    result = flatten_buffer(data)
    # `bytes(data)` would serialise in logical C order, which differs from the
    # physical layout of Fortran-contiguous buffers. Order "A" copies the
    # physical memory of contiguous buffers and falls back to C order
    # otherwise, which is what `flatten_buffer` produces.
    expected = memoryview(data).tobytes("A")
    assert result == expected
    if isinstance(data, (bytes, bytearray)):
        # these types are returned as-is, without wrapping or copying
        assert result is data
    elif isinstance(result, memoryview):
        assert result.ndim == 1
        assert result.format == "B"
        assert result.contiguous
        assert result.shape == (result.nbytes,)


def test_pickles(legacy_pickle):
if not is_platform_little_endian():
pytest.skip("known failure on non-little endian")
Expand Down