From b3e1bc5100d7620375b37334ac1c87d9a74473ea Mon Sep 17 00:00:00 2001 From: jakirkham Date: Thu, 13 Oct 2022 02:53:01 -0700 Subject: [PATCH 01/41] Add `BZ2File` wrapper for pickle protocol 5 --- pandas/io/common.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 4d324b009d1e3..ff63fb2327c59 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -22,6 +22,7 @@ import mmap import os from pathlib import Path +from pickle import PickleBuffer import re import tarfile from typing import ( @@ -762,9 +763,9 @@ def get_handle( # BZ Compression elif compression == "bz2": - # No overload variant of "BZ2File" matches argument types + # Overload of "BZ2File" to handle pickle protocol 5 # "Union[str, BaseBuffer]", "str", "Dict[str, Any]" - handle = bz2.BZ2File( # type: ignore[call-overload] + handle = _BZ2File( # type: ignore[call-overload] handle, mode=ioargs.mode, **compression_args, @@ -1002,6 +1003,22 @@ def write_to_buffer(self) -> None: self.buffer.addfile(tarinfo, self) +class _BZ2File(bz2.BZ2File): + def write(self, b) -> int: + if isinstance(b, PickleBuffer): + # Workaround issue where `bz2.BZ2File` expects `len` + # to return the number of bytes in `b` by converting + # `b` into something that meets that constraint with + # minimal copying. 
+ try: + # coerce to 1-D `uint8` C-contiguous `memoryview` zero-copy + b = b.raw() + except BufferError: + # perform in-memory copy if buffer is not contiguous + b = bytes(b) + return super().write(b) + + class _BytesZipFile(_BufferedWriter): def __init__( self, From 17f725b6cc15bf5a773201e57742ceeca772de46 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Thu, 13 Oct 2022 02:53:01 -0700 Subject: [PATCH 02/41] Add `LZMAFile` wrapper for pickle protocol 5 --- pandas/compat/__init__.py | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 80f66c945ba27..86551516521d8 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -10,9 +10,16 @@ from __future__ import annotations import os +from pickle import PickleBuffer import platform import sys -from typing import TYPE_CHECKING + +try: + import lzma + + has_lzma = True +except ImportError: + has_lzma = False from pandas._typing import F from pandas.compat.numpy import ( @@ -31,9 +38,6 @@ pa_version_under9p0, ) -if TYPE_CHECKING: - import lzma - PY39 = sys.version_info >= (3, 9) PY310 = sys.version_info >= (3, 10) PY311 = sys.version_info >= (3, 11) @@ -41,6 +45,24 @@ IS64 = sys.maxsize > 2**32 +if has_lzma: + + class _LZMAFile(lzma.LZMAFile): + def write(self, b) -> int: + if isinstance(b, PickleBuffer): + # Workaround issue where `lzma.LZMAFile` expects `len` + # to return the number of bytes in `b` by converting + # `b` into something that meets that constraint with + # minimal copying. + try: + # coerce to 1-D `uint8` C-contiguous `memoryview` zero-copy + b = b.raw() + except BufferError: + # perform in-memory copy if buffer is not contiguous + b = bytes(b) + return super().write(b) + + def set_function_name(f: F, name: str, cls) -> F: """ Bind the name/qualname attributes of the function. 
@@ -126,7 +148,7 @@ def is_ci_environment() -> bool: return os.environ.get("PANDAS_CI", "0") == "1" -def get_lzma_file() -> type[lzma.LZMAFile]: +def get_lzma_file() -> type[_LZMAFile]: """ Importing the `LZMAFile` class from the `lzma` module. @@ -140,15 +162,13 @@ def get_lzma_file() -> type[lzma.LZMAFile]: RuntimeError If the `lzma` module was not imported correctly, or didn't exist. """ - try: - import lzma - except ImportError: + if not has_lzma: raise RuntimeError( "lzma module not available. " "A Python re-install with the proper dependencies, " "might be required to solve this issue." ) - return lzma.LZMAFile + return _LZMAFile __all__ = [ From 280731e76a8224cff89cfed19755b3b50b1172b6 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Thu, 13 Oct 2022 02:53:02 -0700 Subject: [PATCH 03/41] Use BZ2 & LZMA wrappers for full pickle support --- pandas/io/pickle.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 373d608876c3e..1d720b881525b 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -101,15 +101,8 @@ def to_pickle( is_text=False, storage_options=storage_options, ) as handles: - if handles.compression["method"] in ("bz2", "xz") and protocol >= 5: - # some weird TypeError GH#39002 with pickle 5: fallback to letting - # pickle create the entire object and then write it to the buffer. 
- # "zip" would also be here if pandas.io.common._BytesZipFile - # wouldn't buffer write calls - handles.handle.write(pickle.dumps(obj, protocol=protocol)) - else: - # letting pickle write directly to the buffer is more memory-efficient - pickle.dump(obj, handles.handle, protocol=protocol) + # letting pickle write directly to the buffer is more memory-efficient + pickle.dump(obj, handles.handle, protocol=protocol) @doc( From ccda94ea37b0fbd8c311ba17b28b007d01892062 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Thu, 13 Oct 2022 03:58:05 -0700 Subject: [PATCH 04/41] Workaround linter issue `PickleBuffer` isn't currently included in `SupportsBytes`, which causes issues with pyright when passing `PickleBuffer` instances to `bytes`. Though it appears ok passing `PickleBuffer` instances to `memoryview`s. So do that instead. This is functionally equivalent. There is a slight performance cost to making a `memoryview`, but this is likely negligible compared to copying to `bytes`. --- pandas/compat/__init__.py | 2 +- pandas/io/common.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 86551516521d8..494453b5f8ff2 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -59,7 +59,7 @@ def write(self, b) -> int: b = b.raw() except BufferError: # perform in-memory copy if buffer is not contiguous - b = bytes(b) + b = memoryview(b).tobytes() return super().write(b) diff --git a/pandas/io/common.py b/pandas/io/common.py index ff63fb2327c59..4b0fbe1f9b1f9 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -1015,7 +1015,7 @@ def write(self, b) -> int: b = b.raw() except BufferError: # perform in-memory copy if buffer is not contiguous - b = bytes(b) + b = memoryview(b).tobytes() return super().write(b) From 08c37e5a0ac648ed6477a1ad8c3082330ccef6e7 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 03:15:49 -0700 Subject: [PATCH 05/41] Refactor out 
`flatten_buffer` --- pandas/compat/__init__.py | 24 +++--------------------- pandas/compat/lzma.py | 18 ++++++++++++++++++ pandas/compat/pickle_compat.py | 22 ++++++++++++++++++++++ pandas/io/common.py | 18 ++++++------------ 4 files changed, 49 insertions(+), 33 deletions(-) create mode 100644 pandas/compat/lzma.py diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 494453b5f8ff2..be38ed125eb6a 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -15,7 +15,7 @@ import sys try: - import lzma + import pandas.compat.lzma has_lzma = True except ImportError: @@ -45,24 +45,6 @@ IS64 = sys.maxsize > 2**32 -if has_lzma: - - class _LZMAFile(lzma.LZMAFile): - def write(self, b) -> int: - if isinstance(b, PickleBuffer): - # Workaround issue where `lzma.LZMAFile` expects `len` - # to return the number of bytes in `b` by converting - # `b` into something that meets that constraint with - # minimal copying. - try: - # coerce to 1-D `uint8` C-contiguous `memoryview` zero-copy - b = b.raw() - except BufferError: - # perform in-memory copy if buffer is not contiguous - b = memoryview(b).tobytes() - return super().write(b) - - def set_function_name(f: F, name: str, cls) -> F: """ Bind the name/qualname attributes of the function. @@ -148,7 +130,7 @@ def is_ci_environment() -> bool: return os.environ.get("PANDAS_CI", "0") == "1" -def get_lzma_file() -> type[_LZMAFile]: +def get_lzma_file() -> type[pandas.compat.lzma.LZMAFile]: """ Importing the `LZMAFile` class from the `lzma` module. @@ -168,7 +150,7 @@ def get_lzma_file() -> type[_LZMAFile]: "A Python re-install with the proper dependencies, " "might be required to solve this issue." ) - return _LZMAFile + return pandas.compat.lzma.LZMAFile __all__ = [ diff --git a/pandas/compat/lzma.py b/pandas/compat/lzma.py new file mode 100644 index 0000000000000..7105c7da6191f --- /dev/null +++ b/pandas/compat/lzma.py @@ -0,0 +1,18 @@ +""" +Patched ``LZMAFile`` to handle pickle protocol 5. 
+""" + +from __future__ import annotations + +import lzma + +from pandas.compat.pickle_compat import flatten_buffer + + +class LZMAFile(lzma.LZMAFile): + def write(self, b) -> int: + # Workaround issue where `lzma.LZMAFile` expects `len` + # to return the number of bytes in `b` by converting + # `b` into something that meets that constraint with + # minimal copying. + return super().write(flatten_buffer(b)) diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index c233e3d8a4892..f9a52d5dd5e47 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -33,6 +33,28 @@ ) +def flatten_buffer(b: bytes | bytearray | memoryview | pkl.PickleBuffer): + """ + Return some 1-D `uint8` typed buffer. + + Coerces anything that does not match that description to one that does + without copying if possible (otherwise will copy). + """ + + if isinstance(b, (bytes, bytearray)): + return b + + if not isinstance(b, pkl.PickleBuffer): + b = pkl.PickleBuffer(b) + + try: + # coerce to 1-D `uint8` C-contiguous `memoryview` zero-copy + return b.raw() + except BufferError: + # perform in-memory copy if buffer is not contiguous + return memoryview(b).tobytes() + + def load_reduce(self): stack = self.stack args = stack.pop() diff --git a/pandas/io/common.py b/pandas/io/common.py index 4b0fbe1f9b1f9..107971523725e 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -57,6 +57,7 @@ WriteBuffer, ) from pandas.compat import get_lzma_file +from pandas.compat.pickle_compat import flatten_buffer from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level @@ -1005,18 +1006,11 @@ def write_to_buffer(self) -> None: class _BZ2File(bz2.BZ2File): def write(self, b) -> int: - if isinstance(b, PickleBuffer): - # Workaround issue where `bz2.BZ2File` expects `len` - # to return the number of bytes in `b` by converting - # `b` into something that meets that 
constraint with - # minimal copying. - try: - # coerce to 1-D `uint8` C-contiguous `memoryview` zero-copy - b = b.raw() - except BufferError: - # perform in-memory copy if buffer is not contiguous - b = memoryview(b).tobytes() - return super().write(b) + # Workaround issue where `bz2.BZ2File` expects `len` + # to return the number of bytes in `b` by converting + # `b` into something that meets that constraint with + # minimal copying. + return super().write(flatten_buffer(b)) class _BytesZipFile(_BufferedWriter): From 8109338ba1e6fd3d2d222adf39e764e4b1dc6b3a Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 03:16:24 -0700 Subject: [PATCH 06/41] Refactor `BZ2File` into separate module --- pandas/compat/bz2.py | 18 ++++++++++++++++++ pandas/io/common.py | 12 +----------- 2 files changed, 19 insertions(+), 11 deletions(-) create mode 100644 pandas/compat/bz2.py diff --git a/pandas/compat/bz2.py b/pandas/compat/bz2.py new file mode 100644 index 0000000000000..1ce0986c1176d --- /dev/null +++ b/pandas/compat/bz2.py @@ -0,0 +1,18 @@ +""" +Patched ``BZ2File`` to handle pickle protocol 5. +""" + +from __future__ import annotations + +import bz2 + +from pandas.compat.pickle_compat import flatten_buffer + + +class BZ2File(bz2.BZ2File): + def write(self, b) -> int: + # Workaround issue where `bz2.BZ2File` expects `len` + # to return the number of bytes in `b` by converting + # `b` into something that meets that constraint with + # minimal copying. 
+ return super().write(flatten_buffer(b)) diff --git a/pandas/io/common.py b/pandas/io/common.py index 107971523725e..3cf46d6c2f99a 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -5,7 +5,6 @@ ABC, abstractmethod, ) -import bz2 import codecs import dataclasses import functools @@ -57,7 +56,7 @@ WriteBuffer, ) from pandas.compat import get_lzma_file -from pandas.compat.pickle_compat import flatten_buffer +from pandas.compat.bz2 import BZ2File as _BZ2File from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level @@ -1004,15 +1003,6 @@ def write_to_buffer(self) -> None: self.buffer.addfile(tarinfo, self) -class _BZ2File(bz2.BZ2File): - def write(self, b) -> int: - # Workaround issue where `bz2.BZ2File` expects `len` - # to return the number of bytes in `b` by converting - # `b` into something that meets that constraint with - # minimal copying. - return super().write(flatten_buffer(b)) - - class _BytesZipFile(_BufferedWriter): def __init__( self, From 691eba789667a3992f57efe0fe440a5227c639e7 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 03:31:54 -0700 Subject: [PATCH 07/41] Test `flatten_buffer` This provides a reasonable proxy for testing patched `BZ2File` and `LZMAFile` objects. --- pandas/tests/io/test_pickle.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 98f02e14f4f13..167224025bd0c 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -10,6 +10,7 @@ 3. Move the created pickle to "data/legacy_pickle/" directory. 
""" +from array import array import bz2 import datetime import functools @@ -38,6 +39,7 @@ is_platform_little_endian, ) from pandas.compat._optional import import_optional_dependency +from pandas.compat.pickle import flatten_buffer import pandas.util._test_decorators as td import pandas as pd @@ -105,6 +107,26 @@ def legacy_pickle(request, datapath): # --------------------- # tests # --------------------- + + +@pytest.mark.parametrize( + "data", + [ + b"123", + b"123456", + bytearray(b"123"), + memoryview(b"123"), + pickle.PickleBuffer(b"123"), + array("I", [1, 2, 3]), + memoryview(b"123456").cast("B", (3, 2)), + memoryview(b"123456").cast("B", (3, 2))[::2], + ], +) +def test_flatten_buffer(data): + result = flatten_buffer(data) + assert result == bytes(data) + + def test_pickles(legacy_pickle): if not is_platform_little_endian(): pytest.skip("known failure on non-little endian") From 7a93b704157fcb8f730b89d3d77a653b3f65ab0a Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 03:39:37 -0700 Subject: [PATCH 08/41] Move `flatten_buffer` to `_utils` This ran into cyclic import issues in `pickle_compat`. So move `flatten_buffer` to its own module free of these issues. --- pandas/compat/_utils.py | 27 +++++++++++++++++++++++++++ pandas/compat/bz2.py | 2 +- pandas/compat/lzma.py | 2 +- pandas/compat/pickle_compat.py | 22 ---------------------- pandas/tests/io/test_pickle.py | 2 +- 5 files changed, 30 insertions(+), 25 deletions(-) create mode 100644 pandas/compat/_utils.py diff --git a/pandas/compat/_utils.py b/pandas/compat/_utils.py new file mode 100644 index 0000000000000..4ed2a47f98869 --- /dev/null +++ b/pandas/compat/_utils.py @@ -0,0 +1,27 @@ +""" +Utilities used in various `compat` components. +""" + +import pickle + + +def flatten_buffer(b: bytes | bytearray | memoryview | pickle.PickleBuffer): + """ + Return some 1-D `uint8` typed buffer. 
+ + Coerces anything that does not match that description to one that does + without copying if possible (otherwise will copy). + """ + + if isinstance(b, (bytes, bytearray)): + return b + + if not isinstance(b, pickle.PickleBuffer): + b = pickle.PickleBuffer(b) + + try: + # coerce to 1-D `uint8` C-contiguous `memoryview` zero-copy + return b.raw() + except BufferError: + # perform in-memory copy if buffer is not contiguous + return memoryview(b).tobytes() diff --git a/pandas/compat/bz2.py b/pandas/compat/bz2.py index 1ce0986c1176d..dcf2915342cd0 100644 --- a/pandas/compat/bz2.py +++ b/pandas/compat/bz2.py @@ -6,7 +6,7 @@ import bz2 -from pandas.compat.pickle_compat import flatten_buffer +from pandas.compat._utils import flatten_buffer class BZ2File(bz2.BZ2File): diff --git a/pandas/compat/lzma.py b/pandas/compat/lzma.py index 7105c7da6191f..49d80b754c970 100644 --- a/pandas/compat/lzma.py +++ b/pandas/compat/lzma.py @@ -6,7 +6,7 @@ import lzma -from pandas.compat.pickle_compat import flatten_buffer +from pandas.compat._utils import flatten_buffer class LZMAFile(lzma.LZMAFile): diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index f9a52d5dd5e47..c233e3d8a4892 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -33,28 +33,6 @@ ) -def flatten_buffer(b: bytes | bytearray | memoryview | pkl.PickleBuffer): - """ - Return some 1-D `uint8` typed buffer. - - Coerces anything that does not match that description to one that does - without copying if possible (otherwise will copy). 
- """ - - if isinstance(b, (bytes, bytearray)): - return b - - if not isinstance(b, pkl.PickleBuffer): - b = pkl.PickleBuffer(b) - - try: - # coerce to 1-D `uint8` C-contiguous `memoryview` zero-copy - return b.raw() - except BufferError: - # perform in-memory copy if buffer is not contiguous - return memoryview(b).tobytes() - - def load_reduce(self): stack = self.stack args = stack.pop() diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 167224025bd0c..73f8fec1a68cf 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -39,7 +39,7 @@ is_platform_little_endian, ) from pandas.compat._optional import import_optional_dependency -from pandas.compat.pickle import flatten_buffer +from pandas.compat._utils import flatten_buffer import pandas.util._test_decorators as td import pandas as pd From 8f5b0a1ed9647d4ba7cbb79545c0e6c7be77faf2 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 03:53:16 -0700 Subject: [PATCH 09/41] Import `annotations` to fix `|` usage --- pandas/compat/_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/compat/_utils.py b/pandas/compat/_utils.py index 4ed2a47f98869..83f4c7f8ee382 100644 --- a/pandas/compat/_utils.py +++ b/pandas/compat/_utils.py @@ -2,6 +2,8 @@ Utilities used in various `compat` components. 
""" +from __future__ import annotations + import pickle From c54529a9a91ada16744c955817ad936c8d0faad8 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 11:54:42 -0700 Subject: [PATCH 10/41] Sort `import`s to fix lint --- pandas/io/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index e9158794f9f33..43a9a1e7448a3 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -55,8 +55,8 @@ WriteBuffer, ) from pandas.compat import get_lzma_file -from pandas.compat.bz2 import BZ2File as _BZ2File from pandas.compat._optional import import_optional_dependency +from pandas.compat.bz2 import BZ2File as _BZ2File from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level From 7604d48da0c72985ab4a6d98e4c77368ecca5f14 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 12:00:24 -0700 Subject: [PATCH 11/41] Patch `BZ2File` & `LZMAFile` on Python pre-3.10 This should limit the effects of this patch. Also should make it easier to remove this backport later once all supported Python versions have the fix. --- pandas/compat/bz2.py | 21 ++++++++++++++------- pandas/compat/lzma.py | 21 ++++++++++++++------- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/pandas/compat/bz2.py b/pandas/compat/bz2.py index dcf2915342cd0..db0767c361801 100644 --- a/pandas/compat/bz2.py +++ b/pandas/compat/bz2.py @@ -5,14 +5,21 @@ from __future__ import annotations import bz2 +import sys from pandas.compat._utils import flatten_buffer -class BZ2File(bz2.BZ2File): - def write(self, b) -> int: - # Workaround issue where `bz2.BZ2File` expects `len` - # to return the number of bytes in `b` by converting - # `b` into something that meets that constraint with - # minimal copying. 
- return super().write(flatten_buffer(b)) +if sys.version_info < (3, 10): + class BZ2File(bz2.BZ2File): + def write(self, b) -> int: + # Workaround issue where `bz2.BZ2File` expects `len` + # to return the number of bytes in `b` by converting + # `b` into something that meets that constraint with + # minimal copying. + # + # Note: This is fixed in Python 3.10. + return super().write(flatten_buffer(b)) +else: + class BZ2File(bz2.BZ2File): + pass diff --git a/pandas/compat/lzma.py b/pandas/compat/lzma.py index 49d80b754c970..4ce27c5ab845c 100644 --- a/pandas/compat/lzma.py +++ b/pandas/compat/lzma.py @@ -5,14 +5,21 @@ from __future__ import annotations import lzma +import sys from pandas.compat._utils import flatten_buffer -class LZMAFile(lzma.LZMAFile): - def write(self, b) -> int: - # Workaround issue where `lzma.LZMAFile` expects `len` - # to return the number of bytes in `b` by converting - # `b` into something that meets that constraint with - # minimal copying. - return super().write(flatten_buffer(b)) +if sys.version_info < (3, 10): + class LZMAFile(lzma.LZMAFile): + def write(self, b) -> int: + # Workaround issue where `lzma.LZMAFile` expects `len` + # to return the number of bytes in `b` by converting + # `b` into something that meets that constraint with + # minimal copying. + # + # Note: This is fixed in Python 3.10. + return super().write(flatten_buffer(b)) +else: + class LZMAFile(lzma.LZMAFile): + pass From 9f3d38712d565a14fd27ac92696264036a353c22 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 12:05:12 -0700 Subject: [PATCH 12/41] Test C & F contiguous NumPy arrays Also test another non-contiguous array. 
--- pandas/tests/io/test_pickle.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 73f8fec1a68cf..f7fbdd8234c94 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -120,6 +120,9 @@ def legacy_pickle(request, datapath): array("I", [1, 2, 3]), memoryview(b"123456").cast("B", (3, 2)), memoryview(b"123456").cast("B", (3, 2))[::2], + np.arange(12).reshape((3, 4), order="C"), + np.arange(12).reshape((3, 4), order="F"), + np.arange(12).reshape((3, 4), order="C")[:, ::2], ], ) def test_flatten_buffer(data): From 16f21a5078650c8a80828951794ed6a78282bfae Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 12:11:22 -0700 Subject: [PATCH 13/41] Test `memoryview` is 1-D `uint8` contiguous data If a `memoryview` is returned, make sure it is as close to `bytes` | `bytearray` as possible. This ensures if other functions assume something like `bytes` (for example assuming `len(b)` is the number of bytes contained), things will continue to work even though this is a `memoryview`. 
--- pandas/tests/io/test_pickle.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index f7fbdd8234c94..51f78c893a4d1 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -128,6 +128,11 @@ def legacy_pickle(request, datapath): def test_flatten_buffer(data): result = flatten_buffer(data) assert result == bytes(data) + if isinstance(result, memoryview): + assert result.ndim == 1 + assert result.format == "B" + assert result.contiguous + assert result.shape == (result.nbytes,) def test_pickles(legacy_pickle): From 6df7e081270142afbfb738f26ab7763017a4cbfe Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 13:10:03 -0700 Subject: [PATCH 14/41] Run `black` on `bz2` and `lzma` compat files --- pandas/compat/bz2.py | 3 +++ pandas/compat/lzma.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/pandas/compat/bz2.py b/pandas/compat/bz2.py index db0767c361801..7f1d4e8247341 100644 --- a/pandas/compat/bz2.py +++ b/pandas/compat/bz2.py @@ -11,6 +11,7 @@ if sys.version_info < (3, 10): + class BZ2File(bz2.BZ2File): def write(self, b) -> int: # Workaround issue where `bz2.BZ2File` expects `len` @@ -20,6 +21,8 @@ def write(self, b) -> int: # # Note: This is fixed in Python 3.10. return super().write(flatten_buffer(b)) + else: + class BZ2File(bz2.BZ2File): pass diff --git a/pandas/compat/lzma.py b/pandas/compat/lzma.py index 4ce27c5ab845c..922e9dc255936 100644 --- a/pandas/compat/lzma.py +++ b/pandas/compat/lzma.py @@ -11,6 +11,7 @@ if sys.version_info < (3, 10): + class LZMAFile(lzma.LZMAFile): def write(self, b) -> int: # Workaround issue where `lzma.LZMAFile` expects `len` @@ -20,6 +21,8 @@ def write(self, b) -> int: # # Note: This is fixed in Python 3.10. 
return super().write(flatten_buffer(b)) + else: + class LZMAFile(lzma.LZMAFile): pass From 39ffab0c8416ca542a03a38ad5a665bce63012fc Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 13:41:42 -0700 Subject: [PATCH 15/41] One more lint fix --- pandas/compat/bz2.py | 1 - pandas/compat/lzma.py | 1 - 2 files changed, 2 deletions(-) diff --git a/pandas/compat/bz2.py b/pandas/compat/bz2.py index 7f1d4e8247341..f6dae95fed2a2 100644 --- a/pandas/compat/bz2.py +++ b/pandas/compat/bz2.py @@ -9,7 +9,6 @@ from pandas.compat._utils import flatten_buffer - if sys.version_info < (3, 10): class BZ2File(bz2.BZ2File): diff --git a/pandas/compat/lzma.py b/pandas/compat/lzma.py index 922e9dc255936..d5762790cbd89 100644 --- a/pandas/compat/lzma.py +++ b/pandas/compat/lzma.py @@ -9,7 +9,6 @@ from pandas.compat._utils import flatten_buffer - if sys.version_info < (3, 10): class LZMAFile(lzma.LZMAFile): From f134dee7c1e6dff939489fbf324bdb8c88547032 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 14:06:42 -0700 Subject: [PATCH 16/41] Drop unused `PickleBuffer` `import`s --- pandas/compat/__init__.py | 1 - pandas/io/common.py | 1 - 2 files changed, 2 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 1c4dcb6004fa0..451ba9b9b00bd 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -10,7 +10,6 @@ from __future__ import annotations import os -from pickle import PickleBuffer import platform import sys diff --git a/pandas/io/common.py b/pandas/io/common.py index 43a9a1e7448a3..549daf97cea79 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -20,7 +20,6 @@ import mmap import os from pathlib import Path -from pickle import PickleBuffer import re import tarfile from typing import ( From a7126a28d3502caa5e26dde37b5b970233d0471e Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 14:34:01 -0700 Subject: [PATCH 17/41] Simplify change to `pandas.compat.__init__` Now that the LZMA changes are in a 
separate file, cleanup the changes to `pandas.compat.__init__`. --- pandas/compat/__init__.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 451ba9b9b00bd..3e6e4874e6b49 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -12,13 +12,7 @@ import os import platform import sys - -try: - import pandas.compat.lzma - - has_lzma = True -except ImportError: - has_lzma = False +from typing import TYPE_CHECKING from pandas._typing import F from pandas.compat.numpy import ( @@ -32,6 +26,9 @@ pa_version_under9p0, ) +if TYPE_CHECKING: + import pandas.compat.lzma + PY39 = sys.version_info >= (3, 9) PY310 = sys.version_info >= (3, 10) PY311 = sys.version_info >= (3, 11) @@ -138,7 +135,9 @@ def get_lzma_file() -> type[pandas.compat.lzma.LZMAFile]: RuntimeError If the `lzma` module was not imported correctly, or didn't exist. """ - if not has_lzma: + try: + import pandas.compat.lzma + except ImportError: raise RuntimeError( "lzma module not available. " "A Python re-install with the proper dependencies, " From df7b0cef478d4aaaf47511e6788d949fc3e6ff4c Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 14:56:38 -0700 Subject: [PATCH 18/41] Type `flatten_buffer` result --- pandas/compat/_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/compat/_utils.py b/pandas/compat/_utils.py index 83f4c7f8ee382..f30573eb7a7e4 100644 --- a/pandas/compat/_utils.py +++ b/pandas/compat/_utils.py @@ -7,7 +7,9 @@ import pickle -def flatten_buffer(b: bytes | bytearray | memoryview | pickle.PickleBuffer): +def flatten_buffer( + b: bytes | bytearray | memoryview | pickle.PickleBuffer, +) -> bytes | memoryview: """ Return some 1-D `uint8` typed buffer. 
From 742788fcb5c6ae659086dc2046d0bb1fffc686e3 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 16:57:23 -0700 Subject: [PATCH 19/41] Use `order="A"` in `memoryview.tobytes(...)` In the function `flatten_buffer`, the order is already effectively enforced when copying can be avoided by using `PickleBuffer.raw(...)`. However some test comparisons failed (when they shouldn't have) as this wasn't specified. So add the `order` in both the function and the test. This should fix that test failure. --- pandas/compat/_utils.py | 2 +- pandas/tests/io/test_pickle.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/compat/_utils.py b/pandas/compat/_utils.py index f30573eb7a7e4..9b686718f20d4 100644 --- a/pandas/compat/_utils.py +++ b/pandas/compat/_utils.py @@ -28,4 +28,4 @@ def flatten_buffer( return b.raw() except BufferError: # perform in-memory copy if buffer is not contiguous - return memoryview(b).tobytes() + return memoryview(b).tobytes("A") diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 51f78c893a4d1..e8a3c1c972833 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -127,7 +127,8 @@ def legacy_pickle(request, datapath): ) def test_flatten_buffer(data): result = flatten_buffer(data) - assert result == bytes(data) + expected = memoryview(data).tobytes("A") + assert result == expected if isinstance(result, memoryview): assert result.ndim == 1 assert result.format == "B" From ffc58d3c9081066850c403262fbd0a808fe74cff Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 17:08:21 -0700 Subject: [PATCH 20/41] Move all compat compressors into a single file --- pandas/compat/__init__.py | 13 ++---- pandas/compat/_compressors.py | 78 ++++++++++++++++++++++++++++++++++ pandas/compat/_utils.py | 31 -------------- pandas/compat/bz2.py | 27 ------------ pandas/compat/lzma.py | 27 ------------ pandas/tests/io/test_pickle.py | 2 +- 6 files changed, 83 insertions(+), 
95 deletions(-) create mode 100644 pandas/compat/_compressors.py delete mode 100644 pandas/compat/_utils.py delete mode 100644 pandas/compat/bz2.py delete mode 100644 pandas/compat/lzma.py diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 3e6e4874e6b49..09100cfc388d6 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -12,9 +12,9 @@ import os import platform import sys -from typing import TYPE_CHECKING from pandas._typing import F +import pandas.compat._compressors from pandas.compat.numpy import ( is_numpy_dev, np_version_under1p21, @@ -26,9 +26,6 @@ pa_version_under9p0, ) -if TYPE_CHECKING: - import pandas.compat.lzma - PY39 = sys.version_info >= (3, 9) PY310 = sys.version_info >= (3, 10) PY311 = sys.version_info >= (3, 11) @@ -121,7 +118,7 @@ def is_ci_environment() -> bool: return os.environ.get("PANDAS_CI", "0") == "1" -def get_lzma_file() -> type[pandas.compat.lzma.LZMAFile]: +def get_lzma_file() -> type[pandas.compat._compressors.LZMAFile]: """ Importing the `LZMAFile` class from the `lzma` module. @@ -135,15 +132,13 @@ def get_lzma_file() -> type[pandas.compat.lzma.LZMAFile]: RuntimeError If the `lzma` module was not imported correctly, or didn't exist. """ - try: - import pandas.compat.lzma - except ImportError: + if not pandas.compat._compressors.has_lzma: raise RuntimeError( "lzma module not available. " "A Python re-install with the proper dependencies, " "might be required to solve this issue." ) - return pandas.compat.lzma.LZMAFile + return pandas.compat._compressors.LZMAFile __all__ = [ diff --git a/pandas/compat/_compressors.py b/pandas/compat/_compressors.py new file mode 100644 index 0000000000000..94cd20750bb1b --- /dev/null +++ b/pandas/compat/_compressors.py @@ -0,0 +1,78 @@ +""" +Patched ``BZ2File`` and ``LZMAFile`` to handle pickle protocol 5. 
+""" + +from __future__ import annotations + +import bz2 +import pickle +import sys + +try: + import lzma + + has_lzma = True +except ImportError: + has_lzma = False + + +PY310 = sys.version_info >= (3, 10) + + +def flatten_buffer( + b: bytes | bytearray | memoryview | pickle.PickleBuffer, +) -> bytes | memoryview: + """ + Return some 1-D `uint8` typed buffer. + + Coerces anything that does not match that description to one that does + without copying if possible (otherwise will copy). + """ + + if isinstance(b, (bytes, bytearray)): + return b + + if not isinstance(b, pickle.PickleBuffer): + b = pickle.PickleBuffer(b) + + try: + # coerce to 1-D `uint8` C-contiguous `memoryview` zero-copy + return b.raw() + except BufferError: + # perform in-memory copy if buffer is not contiguous + return memoryview(b).tobytes("A") + + +if not PY310: + + class BZ2File(bz2.BZ2File): + def write(self, b) -> int: + # Workaround issue where `bz2.BZ2File` expects `len` + # to return the number of bytes in `b` by converting + # `b` into something that meets that constraint with + # minimal copying. + # + # Note: This is fixed in Python 3.10. + return super().write(flatten_buffer(b)) + + if has_lzma: + + class LZMAFile(lzma.LZMAFile): + def write(self, b) -> int: + # Workaround issue where `lzma.LZMAFile` expects `len` + # to return the number of bytes in `b` by converting + # `b` into something that meets that constraint with + # minimal copying. + # + # Note: This is fixed in Python 3.10. + return super().write(flatten_buffer(b)) + +else: + + class BZ2File(bz2.BZ2File): + pass + + if has_lzma: + + class LZMAFile(lzma.LZMAFile): + pass diff --git a/pandas/compat/_utils.py b/pandas/compat/_utils.py deleted file mode 100644 index 9b686718f20d4..0000000000000 --- a/pandas/compat/_utils.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -Utilities used in various `compat` components. 
-""" - -from __future__ import annotations - -import pickle - - -def flatten_buffer( - b: bytes | bytearray | memoryview | pickle.PickleBuffer, -) -> bytes | memoryview: - """ - Return some 1-D `uint8` typed buffer. - - Coerces anything that does not match that description to one that does - without copying if possible (otherwise will copy). - """ - - if isinstance(b, (bytes, bytearray)): - return b - - if not isinstance(b, pickle.PickleBuffer): - b = pickle.PickleBuffer(b) - - try: - # coerce to 1-D `uint8` C-contiguous `memoryview` zero-copy - return b.raw() - except BufferError: - # perform in-memory copy if buffer is not contiguous - return memoryview(b).tobytes("A") diff --git a/pandas/compat/bz2.py b/pandas/compat/bz2.py deleted file mode 100644 index f6dae95fed2a2..0000000000000 --- a/pandas/compat/bz2.py +++ /dev/null @@ -1,27 +0,0 @@ -""" -Patched ``BZ2File`` to handle pickle protocol 5. -""" - -from __future__ import annotations - -import bz2 -import sys - -from pandas.compat._utils import flatten_buffer - -if sys.version_info < (3, 10): - - class BZ2File(bz2.BZ2File): - def write(self, b) -> int: - # Workaround issue where `bz2.BZ2File` expects `len` - # to return the number of bytes in `b` by converting - # `b` into something that meets that constraint with - # minimal copying. - # - # Note: This is fixed in Python 3.10. - return super().write(flatten_buffer(b)) - -else: - - class BZ2File(bz2.BZ2File): - pass diff --git a/pandas/compat/lzma.py b/pandas/compat/lzma.py deleted file mode 100644 index d5762790cbd89..0000000000000 --- a/pandas/compat/lzma.py +++ /dev/null @@ -1,27 +0,0 @@ -""" -Patched ``LZMAFile`` to handle pickle protocol 5. 
-""" - -from __future__ import annotations - -import lzma -import sys - -from pandas.compat._utils import flatten_buffer - -if sys.version_info < (3, 10): - - class LZMAFile(lzma.LZMAFile): - def write(self, b) -> int: - # Workaround issue where `lzma.LZMAFile` expects `len` - # to return the number of bytes in `b` by converting - # `b` into something that meets that constraint with - # minimal copying. - # - # Note: This is fixed in Python 3.10. - return super().write(flatten_buffer(b)) - -else: - - class LZMAFile(lzma.LZMAFile): - pass diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index e8a3c1c972833..da688a74d77cf 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -38,8 +38,8 @@ get_lzma_file, is_platform_little_endian, ) +from pandas.compat._compressors import flatten_buffer from pandas.compat._optional import import_optional_dependency -from pandas.compat._utils import flatten_buffer import pandas.util._test_decorators as td import pandas as pd From 0b1be1634eacd42ec666c1a0a741f53b4cad4741 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 17:23:58 -0700 Subject: [PATCH 21/41] Fix `BZ2File` `import` --- pandas/io/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 549daf97cea79..64e703572f2bf 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -54,8 +54,8 @@ WriteBuffer, ) from pandas.compat import get_lzma_file +from pandas.compat._compressors import BZ2File as _BZ2File from pandas.compat._optional import import_optional_dependency -from pandas.compat.bz2 import BZ2File as _BZ2File from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level From f00740ce89aa5efe75b348b6b9bacaa81909253b Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 17:40:58 -0700 Subject: [PATCH 22/41] Refactor out common compat constants --- pandas/compat/__init__.py | 6 +----- 
pandas/compat/_compressors.py | 5 ++--- pandas/compat/_constants.py | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 8 deletions(-) create mode 100644 pandas/compat/_constants.py diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 09100cfc388d6..56baf4b81efe5 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -26,11 +26,7 @@ pa_version_under9p0, ) -PY39 = sys.version_info >= (3, 9) -PY310 = sys.version_info >= (3, 10) -PY311 = sys.version_info >= (3, 11) -PYPY = platform.python_implementation() == "PyPy" -IS64 = sys.maxsize > 2**32 +from pandas.compat._constants import * def set_function_name(f: F, name: str, cls) -> F: diff --git a/pandas/compat/_compressors.py b/pandas/compat/_compressors.py index 94cd20750bb1b..6df7b8bb66041 100644 --- a/pandas/compat/_compressors.py +++ b/pandas/compat/_compressors.py @@ -8,6 +8,8 @@ import pickle import sys +from pandas.compat._constants import PY310 + try: import lzma @@ -16,9 +18,6 @@ has_lzma = False -PY310 = sys.version_info >= (3, 10) - - def flatten_buffer( b: bytes | bytearray | memoryview | pickle.PickleBuffer, ) -> bytes | memoryview: diff --git a/pandas/compat/_constants.py b/pandas/compat/_constants.py new file mode 100644 index 0000000000000..02b636106db99 --- /dev/null +++ b/pandas/compat/_constants.py @@ -0,0 +1,17 @@ +""" +_constants +====== + +Constants relevant for the Python implementation. 
+""" + +from __future__ import annotations + +import sys +import platform + +PY39 = sys.version_info >= (3, 9) +PY310 = sys.version_info >= (3, 10) +PY311 = sys.version_info >= (3, 11) +PYPY = platform.python_implementation() == "PyPy" +IS64 = sys.maxsize > 2**32 From 06e5387398426a6dffc827f4eb587625d33b82b3 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 18:04:50 -0700 Subject: [PATCH 23/41] Fix `import` sorting --- pandas/compat/__init__.py | 3 +-- pandas/compat/_constants.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 56baf4b81efe5..d5affb12075ff 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -15,6 +15,7 @@ from pandas._typing import F import pandas.compat._compressors +from pandas.compat._constants import * from pandas.compat.numpy import ( is_numpy_dev, np_version_under1p21, @@ -26,8 +27,6 @@ pa_version_under9p0, ) -from pandas.compat._constants import * - def set_function_name(f: F, name: str, cls) -> F: """ diff --git a/pandas/compat/_constants.py b/pandas/compat/_constants.py index 02b636106db99..15bb3109e1971 100644 --- a/pandas/compat/_constants.py +++ b/pandas/compat/_constants.py @@ -7,8 +7,8 @@ from __future__ import annotations -import sys import platform +import sys PY39 = sys.version_info >= (3, 9) PY310 = sys.version_info >= (3, 10) From 269dc0f53f0022a8db055f3cd858c7251bce66d6 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 18:07:37 -0700 Subject: [PATCH 24/41] Drop unused `import` --- pandas/compat/_compressors.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/compat/_compressors.py b/pandas/compat/_compressors.py index 6df7b8bb66041..66ac17178e030 100644 --- a/pandas/compat/_compressors.py +++ b/pandas/compat/_compressors.py @@ -6,7 +6,6 @@ import bz2 import pickle -import sys from pandas.compat._constants import PY310 From f1f1a2e56861abbc0b2c04adbb98880794c47ca0 Mon Sep 17 00:00:00 2001 From: 
jakirkham Date: Tue, 18 Oct 2022 18:09:03 -0700 Subject: [PATCH 25/41] Ignore `flake8` errors on wildcard `import` --- pandas/compat/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index d5affb12075ff..0b2462dac7095 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -15,7 +15,7 @@ from pandas._typing import F import pandas.compat._compressors -from pandas.compat._constants import * +from pandas.compat._constants import * # noqa: F401,F403 from pandas.compat.numpy import ( is_numpy_dev, np_version_under1p21, From f73f0a590ce94f112bcc5af587197e1ea79a6e63 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 18:27:19 -0700 Subject: [PATCH 26/41] Revert "Ignore `flake8` errors on wildcard `import`" This reverts commit f1f1a2e56861abbc0b2c04adbb98880794c47ca0. --- pandas/compat/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 0b2462dac7095..d5affb12075ff 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -15,7 +15,7 @@ from pandas._typing import F import pandas.compat._compressors -from pandas.compat._constants import * # noqa: F401,F403 +from pandas.compat._constants import * from pandas.compat.numpy import ( is_numpy_dev, np_version_under1p21, From b8a724b5abd60384077531e2ef654b938f7a76a4 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 18:48:54 -0700 Subject: [PATCH 27/41] Explicitly `import` all constants --- pandas/compat/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index d5affb12075ff..7f4be8dae5452 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -15,7 +15,13 @@ from pandas._typing import F import pandas.compat._compressors -from pandas.compat._constants import * +from pandas.compat._constants import ( + 
IS64, + PY39, + PY310, + PY311, + PYPY, +) from pandas.compat.numpy import ( is_numpy_dev, np_version_under1p21, From 1b111887cff577f22074c3327cf08087bdc3195c Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 18:49:24 -0700 Subject: [PATCH 28/41] Assign `IS64` first --- pandas/compat/_constants.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/compat/_constants.py b/pandas/compat/_constants.py index 15bb3109e1971..cad5b8fc9b861 100644 --- a/pandas/compat/_constants.py +++ b/pandas/compat/_constants.py @@ -10,8 +10,9 @@ import platform import sys +IS64 = sys.maxsize > 2**32 + PY39 = sys.version_info >= (3, 9) PY310 = sys.version_info >= (3, 10) PY311 = sys.version_info >= (3, 11) PYPY = platform.python_implementation() == "PyPy" -IS64 = sys.maxsize > 2**32 From d2a39db631bd29af1343eeb0c44765cb96917ae1 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 18:52:12 -0700 Subject: [PATCH 29/41] Try `noqa` on wildcard `import` again --- pandas/compat/__init__.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 7f4be8dae5452..b019eb634541f 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -15,13 +15,7 @@ from pandas._typing import F import pandas.compat._compressors -from pandas.compat._constants import ( - IS64, - PY39, - PY310, - PY311, - PYPY, -) +from pandas.compat._constants import * # noqa: F401, F403 from pandas.compat.numpy import ( is_numpy_dev, np_version_under1p21, From 01e86046534d5194fa38f0c8747128a260f65d32 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 19:36:28 -0700 Subject: [PATCH 30/41] Declare `BZ2File` & `LZMAFile` once Fixes a linter issue from pyright. 
--- pandas/compat/_compressors.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/pandas/compat/_compressors.py b/pandas/compat/_compressors.py index 66ac17178e030..a8316aea6d28e 100644 --- a/pandas/compat/_compressors.py +++ b/pandas/compat/_compressors.py @@ -41,9 +41,9 @@ def flatten_buffer( return memoryview(b).tobytes("A") -if not PY310: +class BZ2File(bz2.BZ2File): + if not PY310: - class BZ2File(bz2.BZ2File): def write(self, b) -> int: # Workaround issue where `bz2.BZ2File` expects `len` # to return the number of bytes in `b` by converting @@ -53,9 +53,12 @@ def write(self, b) -> int: # Note: This is fixed in Python 3.10. return super().write(flatten_buffer(b)) - if has_lzma: - class LZMAFile(lzma.LZMAFile): +if has_lzma: + + class LZMAFile(lzma.LZMAFile): + if not PY310: + def write(self, b) -> int: # Workaround issue where `lzma.LZMAFile` expects `len` # to return the number of bytes in `b` by converting @@ -64,13 +67,3 @@ def write(self, b) -> int: # # Note: This is fixed in Python 3.10. 
return super().write(flatten_buffer(b)) - -else: - - class BZ2File(bz2.BZ2File): - pass - - if has_lzma: - - class LZMAFile(lzma.LZMAFile): - pass From 523e20c9efafff39a048939b0c1131341a92ccde Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 19:52:16 -0700 Subject: [PATCH 31/41] `import PickleBuffer` for simplicity --- pandas/compat/_compressors.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/compat/_compressors.py b/pandas/compat/_compressors.py index a8316aea6d28e..3ae473d830823 100644 --- a/pandas/compat/_compressors.py +++ b/pandas/compat/_compressors.py @@ -5,7 +5,7 @@ from __future__ import annotations import bz2 -import pickle +from pickle import PickleBuffer from pandas.compat._constants import PY310 @@ -18,7 +18,7 @@ def flatten_buffer( - b: bytes | bytearray | memoryview | pickle.PickleBuffer, + b: bytes | bytearray | memoryview | PickleBuffer, ) -> bytes | memoryview: """ Return some 1-D `uint8` typed buffer. @@ -30,8 +30,8 @@ def flatten_buffer( if isinstance(b, (bytes, bytearray)): return b - if not isinstance(b, pickle.PickleBuffer): - b = pickle.PickleBuffer(b) + if not isinstance(b, PickleBuffer): + b = PickleBuffer(b) try: # coerce to 1-D `uint8` C-contiguous `memoryview` zero-copy From 6f7e29380cf21ad8a0adb86879b7c3c3124dfd09 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 19:53:26 -0700 Subject: [PATCH 32/41] Add `bytearray` to return type --- pandas/compat/_compressors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/compat/_compressors.py b/pandas/compat/_compressors.py index 3ae473d830823..a4f39c4e34bd4 100644 --- a/pandas/compat/_compressors.py +++ b/pandas/compat/_compressors.py @@ -19,7 +19,7 @@ def flatten_buffer( b: bytes | bytearray | memoryview | PickleBuffer, -) -> bytes | memoryview: +) -> bytes | bytearray | memoryview: """ Return some 1-D `uint8` typed buffer. 
From 4614bd7f7d87d054cef5edc0382e2c0701453fd9 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 18 Oct 2022 19:55:11 -0700 Subject: [PATCH 33/41] Test `bytes` & `bytearray` are returned unaltered --- pandas/tests/io/test_pickle.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index da688a74d77cf..d78cb9e46cd1a 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -129,7 +129,9 @@ def test_flatten_buffer(data): result = flatten_buffer(data) expected = memoryview(data).tobytes("A") assert result == expected - if isinstance(result, memoryview): + if isinstance(data, (bytes, bytearray)): + assert result is data + elif isinstance(result, memoryview): assert result.ndim == 1 assert result.format == "B" assert result.contiguous From cf4f9262f78dd5bdfd655cecdb450aa7d816e209 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 19 Oct 2022 17:04:39 -0700 Subject: [PATCH 34/41] Explicitly list all constants --- pandas/compat/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index b019eb634541f..2b545a4ba034e 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -15,7 +15,13 @@ from pandas._typing import F import pandas.compat._compressors -from pandas.compat._constants import * # noqa: F401, F403 +from pandas.compat._constants import ( # noqa: F401 + IS64, + PY39, + PY310, + PY311, + PYPY, +) from pandas.compat.numpy import ( is_numpy_dev, np_version_under1p21, From 4a2efad9b04450005145f689c3f0d839334bff65 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 19 Oct 2022 18:32:55 -0700 Subject: [PATCH 35/41] Trick linter into thinking constants are used ;) --- pandas/compat/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 2b545a4ba034e..8524654002e22 100644
--- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -15,12 +15,12 @@ from pandas._typing import F import pandas.compat._compressors -from pandas.compat._constants import ( # noqa: F401 - IS64, - PY39, - PY310, - PY311, - PYPY, +from pandas.compat._constants import ( + IS64 as IS64, + PY39 as PY39, + PY310 as PY310, + PY311 as PY311, + PYPY as PYPY, ) from pandas.compat.numpy import ( is_numpy_dev, From 0dae47659fbc017683acfb695b3b6a4bfb9b1a6a Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 19 Oct 2022 18:35:57 -0700 Subject: [PATCH 36/41] Add new entry to 2.0.0 --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 30acc117c237a..6d145004e380b 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -170,6 +170,7 @@ Performance improvements - Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`) - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) +- Performance use out-of-band pickling with BZ2 & LZMA files (:issue:`49068`) - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`) - Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`) From 366f645e6f0f71401619d432d15582393169d1ca Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 19 Oct 2022 22:34:36 -0700 Subject: [PATCH 37/41] Assign constants to themselves Should work around linter issues. 
--- pandas/compat/__init__.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 8524654002e22..83a75ff4a6d67 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -16,11 +16,11 @@ from pandas._typing import F import pandas.compat._compressors from pandas.compat._constants import ( - IS64 as IS64, - PY39 as PY39, - PY310 as PY310, - PY311 as PY311, - PYPY as PYPY, + IS64, + PY39, + PY310, + PY311, + PYPY, ) from pandas.compat.numpy import ( is_numpy_dev, @@ -33,6 +33,12 @@ pa_version_under9p0, ) +IS64 = IS64 +PY39 = PY39 +PY310 = PY310 +PY311 = PY311 +PYPY = PYPY + def set_function_name(f: F, name: str, cls) -> F: """ From 092e726594384b58cc3ddeed6d9ace43d9a726f2 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Thu, 20 Oct 2022 12:50:34 -0700 Subject: [PATCH 38/41] Update changelog entry [skip ci] --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 6d145004e380b..a3dd79546d439 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -170,7 +170,7 @@ Performance improvements - Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`) - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) -- Performance use out-of-band pickling with BZ2 & LZMA files (:issue:`49068`) +- Minimize memory usage when writing BZ2 & LZMA files by using out-of-band pickling (:issue:`49068`) - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`) - 
Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`) From e49ba4f69e9669255787e3d4b3de5f3bb10ae2b3 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Thu, 20 Oct 2022 12:55:38 -0700 Subject: [PATCH 39/41] Add constants to `__all__` --- pandas/compat/__init__.py | 11 +++++------ pandas/compat/_constants.py | 9 +++++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 83a75ff4a6d67..f097b0c1ae630 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -33,12 +33,6 @@ pa_version_under9p0, ) -IS64 = IS64 -PY39 = PY39 -PY310 = PY310 -PY311 = PY311 -PYPY = PYPY - def set_function_name(f: F, name: str, cls) -> F: """ @@ -155,4 +149,9 @@ def get_lzma_file() -> type[pandas.compat._compressors.LZMAFile]: "pa_version_under7p0", "pa_version_under8p0", "pa_version_under9p0", + "IS64", + "PY39", + "PY310", + "PY311", + "PYPY", ] diff --git a/pandas/compat/_constants.py b/pandas/compat/_constants.py index cad5b8fc9b861..75d99f5ae51fb 100644 --- a/pandas/compat/_constants.py +++ b/pandas/compat/_constants.py @@ -16,3 +16,12 @@ PY310 = sys.version_info >= (3, 10) PY311 = sys.version_info >= (3, 11) PYPY = platform.python_implementation() == "PyPy" + + +__all__ = [ + "IS64", + "PY39", + "PY310", + "PY311", + "PYPY", +] From 453b4e3bcb85370bb82c24175c2c6d8a682614b1 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Thu, 20 Oct 2022 20:46:45 -0700 Subject: [PATCH 40/41] Update changelog entry [ci skip] --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 3a70acf5c3eab..e6b5bc62af62a 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -185,7 +185,7 @@ Performance improvements - Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`) - 
Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) -- Minimize memory usage when writing BZ2 & LZMA files by using out-of-band pickling (:issue:`49068`) +- Reduce memory usage of DataFrame/Series `to_pickle` when using BZ2 or LZMA (:issue:`49068`) - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`) - Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`) From 30124dda4a582fc20adea9dc53d3fa65d1edc8d6 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Fri, 21 Oct 2022 08:22:50 -0700 Subject: [PATCH 41/41] Use Sphinx method annotation --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index e6b5bc62af62a..4475c58880fa1 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -185,7 +185,7 @@ Performance improvements - Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`) - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) -- Reduce memory usage of DataFrame/Series `to_pickle` when using BZ2 or LZMA (:issue:`49068`) +- Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`) - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`) - Performance improvement in 
:meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`) - Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`)