PERF: Improve pickle support with BZ2 & LZMA (#49068)

jakirkham · web-flow · commit 4a2b06862c9e · 2022-10-21T10:29:35.000-07:00
* Add `BZ2File` wrapper for pickle protocol 5 * Add `LZMAFile` wrapper for pickle protocol 5 * Use BZ2 & LZMA wrappers for full pickle support * Workaround linter issue `PickleBuffer` isn't currently included in `SupportsBytes`, which causes issues with pyright when passing `PickleBuffer` instances to `bytes`. Though it appears ok passing `PickleBuffer` instances to `memoryview`s. So do that instead. This is functionally equivalent. There is a slight performance cost to making a `memoryview`, but this is likely negligible compared to copying to `bytes`. * Refactor out `flatten_buffer` * Refactor `BZ2File` into separate module * Test `flatten_buffer` This provides a reasonable proxy for testing patched `BZ2File` and `LZMAFile` objects. * Move `flatten_buffer` to `_utils` This ran into cyclic import issues in `pickle_compat`. So move `flatten_buffer` to its own module free of these issues. * Import `annotations` to fix `|` usage * Sort `import`s to fix lint * Patch `BZ2File` & `LZMAFile` on Python pre-3.10 This should limit the effects of this patch. Also should make it easier to remove this backport later once all supported Python versions have the fix. * Test C & F contiguous NumPy arrays Also test another non-contiguous array. * Test `memoryview` is 1-D `uint8` contiguous data If a `memoryview` is returned, make sure it is as close to `bytes` | `bytearray` as possible. This ensures if other functions assume something like `bytes` (for example assuming `len(b)` is the number of bytes contained), things will continue to work even though this is a `memoryview`. * Run `black` on `bz2` and `lzma` compat files * One more lint fix * Drop unused `PickleBuffer` `import`s * Simplify change to `pandas.compat.__init__` Now that the LZMA changes are in a separate file, clean up the changes to `pandas.compat.__init__`. 
* Type `flatten_buffer` result * Use `order="A"` in `memoryview.tobytes(...)` In the function `flatten_buffer`, the order is already effectively enforced when copying can be avoided by using `PickleBuffer.raw(...)`. However some test comparisons failed (when they shouldn't have) as this wasn't specified. So add the `order` in both the function and the test. This should fix that test failure. * Move all compat compressors into a single file * Fix `BZ2File` `import` * Refactor out common compat constants * Fix `import` sorting * Drop unused `import` * Ignore `flake8` errors on wildcard `import` * Revert "Ignore `flake8` errors on wildcard `import`" This reverts commit f1f1a2e. * Explicitly `import` all constants * Assign `IS64` first * Try `noqa` on wildcard `import` again * Declare `BZ2File` & `LZMAFile` once Fixes a linter issue from pyright. * `import PickleBuffer` for simplicity * Add `bytearray` to return type * Test `bytes` & `bytearray` are returned unaltered * Explicitly list all constants * Trick linter into thinking constants are used ;) * Add new entry to 2.0.0 * Assign constants to themselves Should work around linter issues. * Update changelog entry [skip ci] * Add constants to `__all__` * Update changelog entry [ci skip] * Use Sphinx method annotation
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -198,6 +198,7 @@ Performance improvements
 - Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`)
 - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`)
 - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
+- Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)
 - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)
 - Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`)
 - Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`)
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
@@ -12,9 +12,16 @@
 import os
 import platform
 import sys
-from typing import TYPE_CHECKING
 
 from pandas._typing import F
+import pandas.compat._compressors
+from pandas.compat._constants import (
+    IS64,
+    PY39,
+    PY310,
+    PY311,
+    PYPY,
+)
 from pandas.compat.numpy import (
     is_numpy_dev,
     np_version_under1p21,
@@ -26,15 +33,6 @@
     pa_version_under9p0,
 )
 
-if TYPE_CHECKING:
-    import lzma
-
-PY39 = sys.version_info >= (3, 9)
-PY310 = sys.version_info >= (3, 10)
-PY311 = sys.version_info >= (3, 11)
-PYPY = platform.python_implementation() == "PyPy"
-IS64 = sys.maxsize > 2**32
-
 
 def set_function_name(f: F, name: str, cls) -> F:
     """
@@ -121,7 +119,7 @@ def is_ci_environment() -> bool:
     return os.environ.get("PANDAS_CI", "0") == "1"
 
 
-def get_lzma_file() -> type[lzma.LZMAFile]:
+def get_lzma_file() -> type[pandas.compat._compressors.LZMAFile]:
     """
     Importing the `LZMAFile` class from the `lzma` module.
 
@@ -135,15 +133,13 @@ def get_lzma_file() -> type[lzma.LZMAFile]:
     RuntimeError
         If the `lzma` module was not imported correctly, or didn't exist.
     """
-    try:
-        import lzma
-    except ImportError:
+    if not pandas.compat._compressors.has_lzma:
         raise RuntimeError(
             "lzma module not available. "
             "A Python re-install with the proper dependencies, "
             "might be required to solve this issue."
         )
-    return lzma.LZMAFile
+    return pandas.compat._compressors.LZMAFile
 
 
 __all__ = [
@@ -153,4 +149,9 @@ def get_lzma_file() -> type[lzma.LZMAFile]:
     "pa_version_under7p0",
     "pa_version_under8p0",
     "pa_version_under9p0",
+    "IS64",
+    "PY39",
+    "PY310",
+    "PY311",
+    "PYPY",
 ]
diff --git a/pandas/compat/_compressors.py b/pandas/compat/_compressors.py
@@ -0,0 +1,69 @@
+"""
+Patched ``BZ2File`` and ``LZMAFile`` to handle pickle protocol 5.
+"""
+
+from __future__ import annotations
+
+import bz2
+from pickle import PickleBuffer
+
+from pandas.compat._constants import PY310
+
+try:
+    import lzma
+
+    has_lzma = True
+except ImportError:
+    has_lzma = False
+
+
+def flatten_buffer(
+    b: bytes | bytearray | memoryview | PickleBuffer,
+) -> bytes | bytearray | memoryview:
+    """
+    Return some 1-D `uint8` typed buffer.
+
+    Coerces anything that does not match that description to one that does
+    without copying if possible (otherwise will copy).
+    """
+
+    if isinstance(b, (bytes, bytearray)):
+        return b
+
+    if not isinstance(b, PickleBuffer):
+        b = PickleBuffer(b)
+
+    try:
+        # coerce to 1-D `uint8` C-contiguous `memoryview` zero-copy
+        return b.raw()
+    except BufferError:
+        # perform in-memory copy if buffer is not contiguous
+        return memoryview(b).tobytes("A")
+
+
+class BZ2File(bz2.BZ2File):
+    if not PY310:
+
+        def write(self, b) -> int:
+            # Workaround issue where `bz2.BZ2File` expects `len`
+            # to return the number of bytes in `b` by converting
+            # `b` into something that meets that constraint with
+            # minimal copying.
+            #
+            # Note: This is fixed in Python 3.10.
+            return super().write(flatten_buffer(b))
+
+
+if has_lzma:
+
+    class LZMAFile(lzma.LZMAFile):
+        if not PY310:
+
+            def write(self, b) -> int:
+                # Workaround issue where `lzma.LZMAFile` expects `len`
+                # to return the number of bytes in `b` by converting
+                # `b` into something that meets that constraint with
+                # minimal copying.
+                #
+                # Note: This is fixed in Python 3.10.
+                return super().write(flatten_buffer(b))
diff --git a/pandas/compat/_constants.py b/pandas/compat/_constants.py
@@ -0,0 +1,27 @@
+"""
+_constants
+==========
+
+Constants relevant for the Python implementation.
+"""
+
+from __future__ import annotations
+
+import platform
+import sys
+
+IS64 = sys.maxsize > 2**32
+
+PY39 = sys.version_info >= (3, 9)
+PY310 = sys.version_info >= (3, 10)
+PY311 = sys.version_info >= (3, 11)
+PYPY = platform.python_implementation() == "PyPy"
+
+
+__all__ = [
+    "IS64",
+    "PY39",
+    "PY310",
+    "PY311",
+    "PYPY",
+]
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -5,7 +5,6 @@
     ABC,
     abstractmethod,
 )
-import bz2
 import codecs
 import dataclasses
 import functools
@@ -55,6 +54,7 @@
     WriteBuffer,
 )
 from pandas.compat import get_lzma_file
+from pandas.compat._compressors import BZ2File as _BZ2File
 from pandas.compat._optional import import_optional_dependency
 from pandas.util._decorators import doc
 from pandas.util._exceptions import find_stack_level
@@ -761,9 +761,9 @@ def get_handle(
 
         # BZ Compression
         elif compression == "bz2":
-            # No overload variant of "BZ2File" matches argument types
+            # Overload of "BZ2File" to handle pickle protocol 5
             # "Union[str, BaseBuffer]", "str", "Dict[str, Any]"
-            handle = bz2.BZ2File(  # type: ignore[call-overload]
+            handle = _BZ2File(  # type: ignore[call-overload]
                 handle,
                 mode=ioargs.mode,
                 **compression_args,
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
@@ -101,15 +101,8 @@ def to_pickle(
         is_text=False,
         storage_options=storage_options,
     ) as handles:
-        if handles.compression["method"] in ("bz2", "xz") and protocol >= 5:
-            # some weird TypeError GH#39002 with pickle 5: fallback to letting
-            # pickle create the entire object and then write it to the buffer.
-            # "zip" would also be here if pandas.io.common._BytesZipFile
-            # wouldn't buffer write calls
-            handles.handle.write(pickle.dumps(obj, protocol=protocol))
-        else:
-            # letting pickle write directly to the buffer is more memory-efficient
-            pickle.dump(obj, handles.handle, protocol=protocol)
+        # letting pickle write directly to the buffer is more memory-efficient
+        pickle.dump(obj, handles.handle, protocol=protocol)
 
 
 @doc(
diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py
@@ -10,6 +10,7 @@
 
 3. Move the created pickle to "data/legacy_pickle/<version>" directory.
 """
+from array import array
 import bz2
 import datetime
 import functools
@@ -37,6 +38,7 @@
     get_lzma_file,
     is_platform_little_endian,
 )
+from pandas.compat._compressors import flatten_buffer
 from pandas.compat._optional import import_optional_dependency
 import pandas.util._test_decorators as td
 
@@ -105,6 +107,37 @@ def legacy_pickle(request, datapath):
 # ---------------------
 # tests
 # ---------------------
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        b"123",
+        b"123456",
+        bytearray(b"123"),
+        memoryview(b"123"),
+        pickle.PickleBuffer(b"123"),
+        array("I", [1, 2, 3]),
+        memoryview(b"123456").cast("B", (3, 2)),
+        memoryview(b"123456").cast("B", (3, 2))[::2],
+        np.arange(12).reshape((3, 4), order="C"),
+        np.arange(12).reshape((3, 4), order="F"),
+        np.arange(12).reshape((3, 4), order="C")[:, ::2],
+    ],
+)
+def test_flatten_buffer(data):
+    result = flatten_buffer(data)
+    expected = memoryview(data).tobytes("A")
+    assert result == expected
+    if isinstance(data, (bytes, bytearray)):
+        assert result is data
+    elif isinstance(result, memoryview):
+        assert result.ndim == 1
+        assert result.format == "B"
+        assert result.contiguous
+        assert result.shape == (result.nbytes,)
+
+
 def test_pickles(legacy_pickle):
     if not is_platform_little_endian():
         pytest.skip("known failure on non-little endian")