diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 4ac737bb6b29a..7bf3ae8be6952 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -188,6 +188,7 @@ Performance improvements - Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`) - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) +- Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`) - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`) - Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 52eb0cf147296..f097b0c1ae630 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -12,9 +12,16 @@ import os import platform import sys -from typing import TYPE_CHECKING from pandas._typing import F +import pandas.compat._compressors +from pandas.compat._constants import ( + IS64, + PY39, + PY310, + PY311, + PYPY, +) from pandas.compat.numpy import ( is_numpy_dev, np_version_under1p21, @@ -26,15 +33,6 @@ pa_version_under9p0, ) -if TYPE_CHECKING: - import lzma - -PY39 = sys.version_info >= (3, 9) -PY310 = sys.version_info >= (3, 10) -PY311 = sys.version_info >= (3, 11) -PYPY = platform.python_implementation() == "PyPy" -IS64 = sys.maxsize > 2**32 - def set_function_name(f: F, name: str, cls) -> F: """ @@ -121,7 +119,7 @@ def is_ci_environment() -> bool: return os.environ.get("PANDAS_CI", "0") == "1" -def get_lzma_file() -> type[lzma.LZMAFile]: +def get_lzma_file() -> 
type[pandas.compat._compressors.LZMAFile]: """ Importing the `LZMAFile` class from the `lzma` module. @@ -135,15 +133,13 @@ def get_lzma_file() -> type[lzma.LZMAFile]: RuntimeError If the `lzma` module was not imported correctly, or didn't exist. """ - try: - import lzma - except ImportError: + if not pandas.compat._compressors.has_lzma: raise RuntimeError( "lzma module not available. " "A Python re-install with the proper dependencies, " "might be required to solve this issue." ) - return lzma.LZMAFile + return pandas.compat._compressors.LZMAFile __all__ = [ @@ -153,4 +149,9 @@ def get_lzma_file() -> type[lzma.LZMAFile]: "pa_version_under7p0", "pa_version_under8p0", "pa_version_under9p0", + "IS64", + "PY39", + "PY310", + "PY311", + "PYPY", ] diff --git a/pandas/compat/_compressors.py b/pandas/compat/_compressors.py new file mode 100644 index 0000000000000..a4f39c4e34bd4 --- /dev/null +++ b/pandas/compat/_compressors.py @@ -0,0 +1,69 @@ +""" +Patched ``BZ2File`` and ``LZMAFile`` to handle pickle protocol 5. +""" + +from __future__ import annotations + +import bz2 +from pickle import PickleBuffer + +from pandas.compat._constants import PY310 + +try: + import lzma + + has_lzma = True +except ImportError: + has_lzma = False + + +def flatten_buffer( + b: bytes | bytearray | memoryview | PickleBuffer, +) -> bytes | bytearray | memoryview: + """ + Return some 1-D `uint8` typed buffer. + + Coerces anything that does not match that description to one that does + without copying if possible (otherwise will copy). 
+ """ + + if isinstance(b, (bytes, bytearray)): + return b + + if not isinstance(b, PickleBuffer): + b = PickleBuffer(b) + + try: + # coerce to 1-D `uint8` C-contiguous `memoryview` zero-copy + return b.raw() + except BufferError: + # perform in-memory copy if buffer is not contiguous + return memoryview(b).tobytes("A") + + +class BZ2File(bz2.BZ2File): + if not PY310: + + def write(self, b) -> int: + # Workaround issue where `bz2.BZ2File` expects `len` + # to return the number of bytes in `b` by converting + # `b` into something that meets that constraint with + # minimal copying. + # + # Note: This is fixed in Python 3.10. + return super().write(flatten_buffer(b)) + + +if has_lzma: + + class LZMAFile(lzma.LZMAFile): + if not PY310: + + def write(self, b) -> int: + # Workaround issue where `lzma.LZMAFile` expects `len` + # to return the number of bytes in `b` by converting + # `b` into something that meets that constraint with + # minimal copying. + # + # Note: This is fixed in Python 3.10. + return super().write(flatten_buffer(b)) diff --git a/pandas/compat/_constants.py b/pandas/compat/_constants.py new file mode 100644 index 0000000000000..75d99f5ae51fb --- /dev/null +++ b/pandas/compat/_constants.py @@ -0,0 +1,27 @@ +""" +_constants +========== + +Constants relevant for the Python implementation.
+""" + +from __future__ import annotations + +import platform +import sys + +IS64 = sys.maxsize > 2**32 + +PY39 = sys.version_info >= (3, 9) +PY310 = sys.version_info >= (3, 10) +PY311 = sys.version_info >= (3, 11) +PYPY = platform.python_implementation() == "PyPy" + + +__all__ = [ + "IS64", + "PY39", + "PY310", + "PY311", + "PYPY", +] diff --git a/pandas/io/common.py b/pandas/io/common.py index 88311c64f5761..64e703572f2bf 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -5,7 +5,6 @@ ABC, abstractmethod, ) -import bz2 import codecs import dataclasses import functools @@ -55,6 +54,7 @@ WriteBuffer, ) from pandas.compat import get_lzma_file +from pandas.compat._compressors import BZ2File as _BZ2File from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level @@ -761,9 +761,9 @@ def get_handle( # BZ Compression elif compression == "bz2": - # No overload variant of "BZ2File" matches argument types + # Patched "BZ2File" handles pickle protocol 5; mypy finds no overload for # "Union[str, BaseBuffer]", "str", "Dict[str, Any]" - handle = bz2.BZ2File( # type: ignore[call-overload] + handle = _BZ2File( # type: ignore[call-overload] handle, mode=ioargs.mode, **compression_args, diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 373d608876c3e..1d720b881525b 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -101,15 +101,8 @@ def to_pickle( is_text=False, storage_options=storage_options, ) as handles: - if handles.compression["method"] in ("bz2", "xz") and protocol >= 5: - # some weird TypeError GH#39002 with pickle 5: fallback to letting - # pickle create the entire object and then write it to the buffer.
- # "zip" would also be here if pandas.io.common._BytesZipFile - # wouldn't buffer write calls - handles.handle.write(pickle.dumps(obj, protocol=protocol)) - else: - # letting pickle write directly to the buffer is more memory-efficient - pickle.dump(obj, handles.handle, protocol=protocol) + # letting pickle write directly to the buffer is more memory-efficient + pickle.dump(obj, handles.handle, protocol=protocol) @doc( diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 98f02e14f4f13..d78cb9e46cd1a 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -10,6 +10,7 @@ 3. Move the created pickle to "data/legacy_pickle/" directory. """ +from array import array import bz2 import datetime import functools @@ -37,6 +38,7 @@ get_lzma_file, is_platform_little_endian, ) +from pandas.compat._compressors import flatten_buffer from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -105,6 +107,37 @@ def legacy_pickle(request, datapath): # --------------------- # tests # --------------------- + + +@pytest.mark.parametrize( + "data", + [ + b"123", + b"123456", + bytearray(b"123"), + memoryview(b"123"), + pickle.PickleBuffer(b"123"), + array("I", [1, 2, 3]), + memoryview(b"123456").cast("B", (3, 2)), + memoryview(b"123456").cast("B", (3, 2))[::2], + np.arange(12).reshape((3, 4), order="C"), + np.arange(12).reshape((3, 4), order="F"), + np.arange(12).reshape((3, 4), order="C")[:, ::2], + ], +) +def test_flatten_buffer(data): + result = flatten_buffer(data) + expected = memoryview(data).tobytes("A") + assert result == expected + if isinstance(data, (bytes, bytearray)): + assert result is data + elif isinstance(result, memoryview): + assert result.ndim == 1 + assert result.format == "B" + assert result.contiguous + assert result.shape == (result.nbytes,) + + def test_pickles(legacy_pickle): if not is_platform_little_endian(): pytest.skip("known failure on non-little 
endian")