From f5cc8a8e629e9e7ca096b761acf01b7d7e03b94b Mon Sep 17 00:00:00 2001 From: Thomas Blauth Date: Mon, 19 Apr 2021 07:30:04 +0000 Subject: [PATCH 01/12] BUG: Slice Arrow buffer before passing it to numpy (#40896) Add Arrow buffer slicing before handing it over to numpy which is needed in case the Arrow buffer contains padding or offset. --- pandas/core/arrays/_arrow_utils.py | 11 ++++- pandas/tests/arrays/test_arrow_utils.py | 58 +++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/arrays/test_arrow_utils.py diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py index 31f6896b12f98..3583e5f6a5e11 100644 --- a/pandas/core/arrays/_arrow_utils.py +++ b/pandas/core/arrays/_arrow_utils.py @@ -14,6 +14,8 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype): Convert a primitive pyarrow.Array to a numpy array and boolean mask based on the buffers of the Array. + At the moment pyarrow.BooleanArray is not supported. + Parameters ---------- arr : pyarrow.Array @@ -25,8 +27,15 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype): Tuple of two numpy arrays with the raw data (with specified dtype) and a boolean mask (validity mask, so False means missing) """ + buflist = arr.buffers() - data = np.frombuffer(buflist[1], dtype=dtype)[arr.offset : arr.offset + len(arr)] + # Since Arrow buffers might contain padding and the data might be offset, + # the buffer gets sliced here before handing it to numpy. + # See also https://github.com/pandas-dev/pandas/issues/40896 + offset = arr.offset * arr.type.bit_width // 8 + length = len(arr) * arr.type.bit_width // 8 + data_buf = buflist[1][offset : offset + length] + data = np.frombuffer(data_buf, dtype=dtype) bitmask = buflist[0] if bitmask is not None: mask = pyarrow.BooleanArray.from_buffers( diff --git a/pandas/tests/arrays/test_arrow_utils.py b/pandas/tests/arrays/test_arrow_utils.py new file mode 100644 index 0000000000000..9aa2e6ccfbf30 --- /dev/null +++ b/pandas/tests/arrays/test_arrow_utils.py @@ -0,0 +1,58 @@ +import numpy as np + +import pandas.util._test_decorators as td + +import pandas._testing as tm + + +@td.skip_if_no("pyarrow") +def test_pyarrow_array_to_numpy_and_mask(dtype, pa_array, np_expected, mask_expected): + """ + Test conversion from pyarrow array to numpy array. + + Also modifies the pyarrow buffer to contain padding and offset, which are + considered valid buffers by pyarrow. + See also https://github.com/pandas-dev/pandas/issues/40896 + """ + import pyarrow as pa + + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask + + dtype = np.int32 + pa_array = pa.array([0, 1, 2], type=pa.int32()) + np_expected = np.array([0, 1, 2], dtype=np.int32) + mask_expected = np.array([True, True, True]) + + data, mask = pyarrow_array_to_numpy_and_mask(pa_array, dtype) + tm.assert_numpy_array_equal(data, np_expected) + assert (mask == mask_expected).all() + + mask_buffer = pa_array.buffers()[0] + data_buffer = pa_array.buffers()[1].to_pybytes() + + # Add trailing padding to the buffer. + data_buffer_trail = pa.py_buffer(data_buffer + b"\x00") + pa_array_trail = pa.Array.from_buffers( + type=pa_array.type, + length=len(pa_array), + buffers=[mask_buffer, data_buffer_trail], + offset=pa_array.offset, + ) + pa_array_trail.validate() + data, mask = pyarrow_array_to_numpy_and_mask(pa_array_trail, dtype) + tm.assert_numpy_array_equal(data, np_expected) + assert (mask == mask_expected).all() + + # Add offset to the buffer. + offset = b"\x00" * (pa_array.type.bit_width // 8) + data_buffer_offset = pa.py_buffer(offset + data_buffer) + pa_array_offset = pa.Array.from_buffers( + type=pa_array.type, + length=len(pa_array), + buffers=[mask_buffer, data_buffer_offset], + offset=pa_array.offset + 1, + ) + pa_array_offset.validate() + data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, dtype) + tm.assert_numpy_array_equal(data, np_expected) + assert (mask == mask_expected).all() From 2a8042c3cdb7060177d26422058c731461ff5564 Mon Sep 17 00:00:00 2001 From: Thomas Blauth Date: Mon, 19 Apr 2021 21:00:59 +0000 Subject: [PATCH 02/12] BUG: Slice Arrow buffer before passing it to numpy (#40896) Use numpy dtype for bitwidth information. Fix test signature. --- pandas/core/arrays/_arrow_utils.py | 7 +++++-- pandas/tests/arrays/test_arrow_utils.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py index 3583e5f6a5e11..1d7c25db9ea4a 100644 --- a/pandas/core/arrays/_arrow_utils.py +++ b/pandas/core/arrays/_arrow_utils.py @@ -27,13 +27,16 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype): Tuple of two numpy arrays with the raw data (with specified dtype) and a boolean mask (validity mask, so False means missing) """ + dtype = np.dtype(dtype) buflist = arr.buffers() # Since Arrow buffers might contain padding and the data might be offset, # the buffer gets sliced here before handing it to numpy. # See also https://github.com/pandas-dev/pandas/issues/40896 - offset = arr.offset * arr.type.bit_width // 8 - length = len(arr) * arr.type.bit_width // 8 + # offset = arr.offset * arr.type.bit_width // 8 + # length = len(arr) * arr.type.bit_width // 8 + offset = arr.offset * dtype.itemsize + length = len(arr) * dtype.itemsize data_buf = buflist[1][offset : offset + length] data = np.frombuffer(data_buf, dtype=dtype) bitmask = buflist[0] diff --git a/pandas/tests/arrays/test_arrow_utils.py b/pandas/tests/arrays/test_arrow_utils.py index 9aa2e6ccfbf30..073b252db0f05 100644 --- a/pandas/tests/arrays/test_arrow_utils.py +++ b/pandas/tests/arrays/test_arrow_utils.py @@ -6,7 +6,7 @@ @td.skip_if_no("pyarrow") -def test_pyarrow_array_to_numpy_and_mask(dtype, pa_array, np_expected, mask_expected): +def test_pyarrow_array_to_numpy_and_mask(): """ Test conversion from pyarrow array to numpy array. From fd94972446404cb5cfc056d3dcd52bd9ace95874 Mon Sep 17 00:00:00 2001 From: Thomas Blauth Date: Tue, 20 Apr 2021 08:28:47 +0000 Subject: [PATCH 03/12] BUG: Slice Arrow buffer before passing it to numpy (#40896) Fix mask buffer in test. --- pandas/core/arrays/_arrow_utils.py | 2 -- pandas/tests/arrays/test_arrow_utils.py | 9 +++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py index 1d7c25db9ea4a..51e5f36b88c79 100644 --- a/pandas/core/arrays/_arrow_utils.py +++ b/pandas/core/arrays/_arrow_utils.py @@ -33,8 +33,6 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype): # Since Arrow buffers might contain padding and the data might be offset, # the buffer gets sliced here before handing it to numpy. # See also https://github.com/pandas-dev/pandas/issues/40896 - # offset = arr.offset * arr.type.bit_width // 8 - # length = len(arr) * arr.type.bit_width // 8 offset = arr.offset * dtype.itemsize length = len(arr) * dtype.itemsize data_buf = buflist[1][offset : offset + length] diff --git a/pandas/tests/arrays/test_arrow_utils.py b/pandas/tests/arrays/test_arrow_utils.py index 073b252db0f05..3e1692e358da9 100644 --- a/pandas/tests/arrays/test_arrow_utils.py +++ b/pandas/tests/arrays/test_arrow_utils.py @@ -25,7 +25,7 @@ def test_pyarrow_array_to_numpy_and_mask(): data, mask = pyarrow_array_to_numpy_and_mask(pa_array, dtype) tm.assert_numpy_array_equal(data, np_expected) - assert (mask == mask_expected).all() + tm.assert_numpy_array_equal(mask, mask_expected) mask_buffer = pa_array.buffers()[0] data_buffer = pa_array.buffers()[1].to_pybytes() @@ -41,18 +41,19 @@ def test_pyarrow_array_to_numpy_and_mask(): pa_array_trail.validate() data, mask = pyarrow_array_to_numpy_and_mask(pa_array_trail, dtype) tm.assert_numpy_array_equal(data, np_expected) - assert (mask == mask_expected).all() + tm.assert_numpy_array_equal(mask, mask_expected) # Add offset to the buffer. offset = b"\x00" * (pa_array.type.bit_width // 8) data_buffer_offset = pa.py_buffer(offset + data_buffer) + mask_buffer_offset = pa.py_buffer(b"\x0F") pa_array_offset = pa.Array.from_buffers( type=pa_array.type, length=len(pa_array), - buffers=[mask_buffer, data_buffer_offset], + buffers=[mask_buffer_offset, data_buffer_offset], offset=pa_array.offset + 1, ) pa_array_offset.validate() data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, dtype) tm.assert_numpy_array_equal(data, np_expected) - assert (mask == mask_expected).all() + tm.assert_numpy_array_equal(mask, mask_expected) From 1586d50357a3982a21f44007382a3a35b1ec9da5 Mon Sep 17 00:00:00 2001 From: Thomas Blauth Date: Tue, 20 Apr 2021 15:43:49 +0000 Subject: [PATCH 04/12] BUG: Slice Arrow buffer before passing it to numpy (#40896) Move tests to pandas/tests/arrays/masked/test_arrow_compat.py Add more dtypes to test. --- .../tests/arrays/masked/test_arrow_compat.py | 100 ++++++++++++++++++ pandas/tests/arrays/test_arrow_utils.py | 59 ----------- 2 files changed, 100 insertions(+), 59 deletions(-) delete mode 100644 pandas/tests/arrays/test_arrow_utils.py diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 8bb32dec2cc0e..d70794a82dcb0 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -1,3 +1,4 @@ +import numpy as np import pytest import pandas.util._test_decorators as td @@ -64,3 +65,102 @@ def test_arrow_sliced(): result = table.slice(2, None).to_pandas() expected = df.iloc[2:].reset_index(drop=True) tm.assert_frame_equal(result, expected) + + +@pytest.fixture +def np_dtype_to_arrays(request): + import pyarrow as pa + + np_dtype = request.param + pa_type = pa.from_numpy_dtype(np_dtype) + + pa_array = pa.array([0, 1, 2], type=pa_type) + np_expected = np.array([0, 1, 2], dtype=np_dtype) + mask_expected = np.array([True, True, True]) + return np_dtype, pa_array, np_expected, mask_expected + + +@td.skip_if_no("pyarrow") +@pytest.mark.parametrize( + "np_dtype_to_arrays", + ( + [ + np.int8(), + np.int16(), + np.int32(), + np.int64(), + np.uint8(), + np.uint16(), + np.uint32(), + np.uint64(), + np.float32(), + np.float64(), + ] + ), + indirect=True, +) +def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays): + """ + Test conversion from pyarrow array to numpy array. + + Modifies the pyarrow buffer to contain padding and offset, which are + considered valid buffers by pyarrow. + + Also tests empty pyarrow arrays with non empty buffers. + See https://github.com/pandas-dev/pandas/issues/40896 + """ + import pyarrow as pa + + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask + + np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays + data, mask = pyarrow_array_to_numpy_and_mask(pa_array, np_dtype) + tm.assert_numpy_array_equal(data, np_expected) + tm.assert_numpy_array_equal(mask, mask_expected) + + mask_buffer = pa_array.buffers()[0] + data_buffer = pa_array.buffers()[1] + data_buffer_bytes = pa_array.buffers()[1].to_pybytes() + + # Add trailing padding to the buffer. + data_buffer_trail = pa.py_buffer(data_buffer_bytes + b"\x00") + pa_array_trail = pa.Array.from_buffers( + type=pa_array.type, + length=len(pa_array), + buffers=[mask_buffer, data_buffer_trail], + offset=pa_array.offset, + ) + pa_array_trail.validate() + data, mask = pyarrow_array_to_numpy_and_mask(pa_array_trail, np_dtype) + tm.assert_numpy_array_equal(data, np_expected) + tm.assert_numpy_array_equal(mask, mask_expected) + + # Add offset to the buffer. + offset = b"\x00" * (pa_array.type.bit_width // 8) + data_buffer_offset = pa.py_buffer(offset + data_buffer_bytes) + mask_buffer_offset = pa.py_buffer(b"\x0F") + pa_array_offset = pa.Array.from_buffers( + type=pa_array.type, + length=len(pa_array), + buffers=[mask_buffer_offset, data_buffer_offset], + offset=pa_array.offset + 1, + ) + pa_array_offset.validate() + data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype) + tm.assert_numpy_array_equal(data, np_expected) + tm.assert_numpy_array_equal(mask, mask_expected) + + # Empty array + np_expected_empty = np.array([], dtype=np_dtype) + mask_expected_empty = np.array([], dtype=np.bool_) + + pa_array_offset = pa.Array.from_buffers( + type=pa_array.type, + length=0, + buffers=[mask_buffer, data_buffer], + offset=pa_array.offset, + ) + pa_array_offset.validate() + data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype) + tm.assert_numpy_array_equal(data, np_expected_empty) + tm.assert_numpy_array_equal(mask, mask_expected_empty) diff --git a/pandas/tests/arrays/test_arrow_utils.py b/pandas/tests/arrays/test_arrow_utils.py deleted file mode 100644 index 3e1692e358da9..0000000000000 --- a/pandas/tests/arrays/test_arrow_utils.py +++ /dev/null @@ -1,59 +0,0 @@ -import numpy as np - -import pandas.util._test_decorators as td - -import pandas._testing as tm - - -@td.skip_if_no("pyarrow") -def test_pyarrow_array_to_numpy_and_mask(): - """ - Test conversion from pyarrow array to numpy array. - - Also modifies the pyarrow buffer to contain padding and offset, which are - considered valid buffers by pyarrow. - See also https://github.com/pandas-dev/pandas/issues/40896 - """ - import pyarrow as pa - - from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask - - dtype = np.int32 - pa_array = pa.array([0, 1, 2], type=pa.int32()) - np_expected = np.array([0, 1, 2], dtype=np.int32) - mask_expected = np.array([True, True, True]) - - data, mask = pyarrow_array_to_numpy_and_mask(pa_array, dtype) - tm.assert_numpy_array_equal(data, np_expected) - tm.assert_numpy_array_equal(mask, mask_expected) - - mask_buffer = pa_array.buffers()[0] - data_buffer = pa_array.buffers()[1].to_pybytes() - - # Add trailing padding to the buffer. - data_buffer_trail = pa.py_buffer(data_buffer + b"\x00") - pa_array_trail = pa.Array.from_buffers( - type=pa_array.type, - length=len(pa_array), - buffers=[mask_buffer, data_buffer_trail], - offset=pa_array.offset, - ) - pa_array_trail.validate() - data, mask = pyarrow_array_to_numpy_and_mask(pa_array_trail, dtype) - tm.assert_numpy_array_equal(data, np_expected) - tm.assert_numpy_array_equal(mask, mask_expected) - - # Add offset to the buffer. - offset = b"\x00" * (pa_array.type.bit_width // 8) - data_buffer_offset = pa.py_buffer(offset + data_buffer) - mask_buffer_offset = pa.py_buffer(b"\x0F") - pa_array_offset = pa.Array.from_buffers( - type=pa_array.type, - length=len(pa_array), - buffers=[mask_buffer_offset, data_buffer_offset], - offset=pa_array.offset + 1, - ) - pa_array_offset.validate() - data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, dtype) - tm.assert_numpy_array_equal(data, np_expected) - tm.assert_numpy_array_equal(mask, mask_expected) From 56528699bfee092f712d965992fa52260aee85fd Mon Sep 17 00:00:00 2001 From: Thomas Blauth Date: Wed, 21 Apr 2021 14:36:08 +0000 Subject: [PATCH 05/12] BUG: Slice Arrow buffer before passing it to numpy (#40896) Modify test_arrow_compat.py: - Use import_optional_dependency to skip the whole module if pyarrow is not available. - Use any_real_dtype fixture. --- .../tests/arrays/masked/test_arrow_compat.py | 42 ++++--------------- 1 file changed, 8 insertions(+), 34 deletions(-) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index d70794a82dcb0..15571f03ca69a 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -1,11 +1,16 @@ import numpy as np import pytest -import pandas.util._test_decorators as td +from pandas.compat._optional import import_optional_dependency import pandas as pd import pandas._testing as tm +try: + pa = import_optional_dependency("pyarrow", min_version="0.16.0") +except ImportError: + pytestmark = pytest.mark.skip(reason="Pyarrow not available") + arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_EA_INT_DTYPES] arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES] arrays += [pd.array([True, False, True, None], dtype="boolean")] @@ -16,10 +21,8 @@ def data(request): return request.param -@td.skip_if_no("pyarrow", min_version="0.15.0") def test_arrow_array(data): # protocol added in 0.15.0 - import pyarrow as pa arr = pa.array(data) expected = pa.array( @@ -29,10 +32,8 @@ def test_arrow_array(data): assert arr.equals(expected) -@td.skip_if_no("pyarrow", min_version="0.16.0") def test_arrow_roundtrip(data): # roundtrip possible from arrow 0.16.0 - import pyarrow as pa df = pd.DataFrame({"a": data}) table = pa.table(df) @@ -42,11 +43,9 @@ def test_arrow_roundtrip(data): tm.assert_frame_equal(result, df) -@td.skip_if_no("pyarrow", min_version="0.16.0") def test_arrow_from_arrow_uint(): # https://github.com/pandas-dev/pandas/issues/31896 # possible mismatch in types - import pyarrow as pa dtype = pd.UInt32Dtype() result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64")) @@ -55,10 +54,8 @@ def test_arrow_from_arrow_uint(): tm.assert_extension_array_equal(result, expected) -@td.skip_if_no("pyarrow", min_version="0.16.0") def test_arrow_sliced(): # https://github.com/pandas-dev/pandas/issues/38525 - import pyarrow as pa df = pd.DataFrame({"a": pd.array([0, None, 2, 3, None], dtype="Int64")}) table = pa.table(df) @@ -68,10 +65,8 @@ def test_arrow_sliced(): @pytest.fixture -def np_dtype_to_arrays(request): - import pyarrow as pa - - np_dtype = request.param +def np_dtype_to_arrays(any_real_dtype): + np_dtype = np.dtype(any_real_dtype) pa_type = pa.from_numpy_dtype(np_dtype) pa_array = pa.array([0, 1, 2], type=pa_type) @@ -80,25 +75,6 @@ def np_dtype_to_arrays(request): return np_dtype, pa_array, np_expected, mask_expected -@td.skip_if_no("pyarrow") -@pytest.mark.parametrize( - "np_dtype_to_arrays", - ( - [ - np.int8(), - np.int16(), - np.int32(), - np.int64(), - np.uint8(), - np.uint16(), - np.uint32(), - np.uint64(), - np.float32(), - np.float64(), - ] - ), - indirect=True, -) def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays): """ Test conversion from pyarrow array to numpy array. @@ -109,8 +85,6 @@ def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays): Also tests empty pyarrow arrays with non empty buffers. See https://github.com/pandas-dev/pandas/issues/40896 """ - import pyarrow as pa - from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays From bd70705605d761cbfaffc21e5b2a5db92e9f7927 Mon Sep 17 00:00:00 2001 From: Thomas Blauth Date: Thu, 22 Apr 2021 06:58:24 +0000 Subject: [PATCH 06/12] BUG: Slice Arrow buffer before passing it to numpy (#40896) Add whatsnew entry. --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 85d9acff353be..b7249545ee503 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -866,6 +866,7 @@ ExtensionArray - Fixed bug where :meth:`Series.idxmax`, :meth:`Series.idxmin` and ``argmax/min`` fail when the underlying data is :class:`ExtensionArray` (:issue:`32749`, :issue:`33719`, :issue:`36566`) - Fixed a bug where some properties of subclasses of :class:`PandasExtensionDtype` where improperly cached (:issue:`40329`) - Bug in :meth:`DataFrame.mask` where masking a :class:`Dataframe` with an :class:`ExtensionArray` dtype raises ``ValueError`` (:issue:`40941`) +- Bug in :func:`pandas.core.arrays._arrow_utils.pyarrow_array_to_numpy_and_mask` where Numpy raises ``ValueError`` if buffer size does not match multiple of dtype size (:issue:`40896`) Styler ^^^^^^ From ff85a8081c165f60bd0411876dab2b46793e4605 Mon Sep 17 00:00:00 2001 From: Thomas Blauth Date: Thu, 22 Apr 2021 07:33:54 +0000 Subject: [PATCH 07/12] BUG: Slice Arrow buffer before passing it to numpy (#40896) Enforce creation of Arrow bitmask buffer. --- pandas/tests/arrays/masked/test_arrow_compat.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 15571f03ca69a..3c3cda0ab6c67 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -69,9 +69,12 @@ def np_dtype_to_arrays(any_real_dtype): np_dtype = np.dtype(any_real_dtype) pa_type = pa.from_numpy_dtype(np_dtype) - pa_array = pa.array([0, 1, 2], type=pa_type) + # None ensures the creation of a bitmask buffer. + pa_array = pa.array([0, 1, 2, None], type=pa_type) + # Since masked Arrow buffer slots are not required to contain a specific + # value, assert only the first three values of the created np.array np_expected = np.array([0, 1, 2], dtype=np_dtype) - mask_expected = np.array([True, True, True]) + mask_expected = np.array([True, True, True, False]) return np_dtype, pa_array, np_expected, mask_expected @@ -89,7 +92,7 @@ def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays): np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays data, mask = pyarrow_array_to_numpy_and_mask(pa_array, np_dtype) - tm.assert_numpy_array_equal(data, np_expected) + tm.assert_numpy_array_equal(data[:3], np_expected) tm.assert_numpy_array_equal(mask, mask_expected) mask_buffer = pa_array.buffers()[0] @@ -106,13 +109,13 @@ def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays): ) pa_array_trail.validate() data, mask = pyarrow_array_to_numpy_and_mask(pa_array_trail, np_dtype) - tm.assert_numpy_array_equal(data, np_expected) + tm.assert_numpy_array_equal(data[:3], np_expected) tm.assert_numpy_array_equal(mask, mask_expected) # Add offset to the buffer. offset = b"\x00" * (pa_array.type.bit_width // 8) data_buffer_offset = pa.py_buffer(offset + data_buffer_bytes) - mask_buffer_offset = pa.py_buffer(b"\x0F") + mask_buffer_offset = pa.py_buffer(b"\x0E") pa_array_offset = pa.Array.from_buffers( type=pa_array.type, length=len(pa_array), @@ -121,7 +124,7 @@ def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays): ) pa_array_offset.validate() data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype) - tm.assert_numpy_array_equal(data, np_expected) + tm.assert_numpy_array_equal(data[:3], np_expected) tm.assert_numpy_array_equal(mask, mask_expected) # Empty array @@ -136,5 +139,5 @@ def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays): ) pa_array_offset.validate() data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype) - tm.assert_numpy_array_equal(data, np_expected_empty) + tm.assert_numpy_array_equal(data[:3], np_expected_empty) tm.assert_numpy_array_equal(mask, mask_expected_empty) From c195089a28f932b2805663ce9981cffd435b0279 Mon Sep 17 00:00:00 2001 From: Thomas Blauth Date: Thu, 22 Apr 2021 09:39:20 +0000 Subject: [PATCH 08/12] BUG: Slice Arrow buffer before passing it to numpy (#40896) Fix whatsnew.rst --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 84a2900b30651..c70d4aede12b9 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -882,7 +882,7 @@ ExtensionArray - Fixed bug where :meth:`Series.idxmax`, :meth:`Series.idxmin` and ``argmax/min`` fail when the underlying data is :class:`ExtensionArray` (:issue:`32749`, :issue:`33719`, :issue:`36566`) - Fixed a bug where some properties of subclasses of :class:`PandasExtensionDtype` where improperly cached (:issue:`40329`) - Bug in :meth:`DataFrame.mask` where masking a :class:`Dataframe` with an :class:`ExtensionArray` dtype raises ``ValueError`` (:issue:`40941`) -- Bug in :func:`pandas.core.arrays._arrow_utils.pyarrow_array_to_numpy_and_mask` where Numpy raises ``ValueError`` if buffer size does not match multiple of dtype size (:issue:`40896`) +- Bug in :func:`pyarrow_array_to_numpy_and_mask` where Numpy raises ``ValueError`` if buffer size does not match multiple of dtype size (:issue:`40896`) Styler ^^^^^^ From f6555df26b64912a62e5a0ceb4a6115e9817f5b3 Mon Sep 17 00:00:00 2001 From: Thomas Blauth Date: Fri, 23 Apr 2021 08:33:03 +0000 Subject: [PATCH 09/12] Fix whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index c70d4aede12b9..9e2e5a59c7f16 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -882,7 +882,7 @@ ExtensionArray - Fixed bug where :meth:`Series.idxmax`, :meth:`Series.idxmin` and ``argmax/min`` fail when the underlying data is :class:`ExtensionArray` (:issue:`32749`, :issue:`33719`, :issue:`36566`) - Fixed a bug where some properties of subclasses of :class:`PandasExtensionDtype` where improperly cached (:issue:`40329`) - Bug in :meth:`DataFrame.mask` where masking a :class:`Dataframe` with an :class:`ExtensionArray` dtype raises ``ValueError`` (:issue:`40941`) -- Bug in :func:`pyarrow_array_to_numpy_and_mask` where Numpy raises ``ValueError`` if buffer size does not match multiple of dtype size (:issue:`40896`) +- Bug in :func:`pyarrow_array_to_numpy_and_mask` when converting a pyarrow array who's data buffer size is not a multiple of dtype size (:issue:`40896`) Styler ^^^^^^ From 03648ebf02be438a7be890f7bd21b2edb4c39e94 Mon Sep 17 00:00:00 2001 From: Thomas Blauth Date: Fri, 23 Apr 2021 08:48:09 +0000 Subject: [PATCH 10/12] Use pytest.importorskip --- pandas/tests/arrays/masked/test_arrow_compat.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index e1c1e48b65a93..fabfdb9236aad 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -6,10 +6,7 @@ import pandas as pd import pandas._testing as tm -try: - import pyarrow as pa -except ImportError: - pa = None +pa = pytest.importorskip("pyarrow", minversion="0.15.0") arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_EA_INT_DTYPES] arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES] @@ -21,7 +18,6 @@ def data(request): return request.param -@td.skip_if_no("pyarrow", min_version="0.15.0") def test_arrow_array(data): # protocol added in 0.15.0 @@ -104,7 +100,6 @@ def np_dtype_to_arrays(any_real_dtype): return np_dtype, pa_array, np_expected, mask_expected -@td.skip_if_no("pyarrow") def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays): """ Test conversion from pyarrow array to numpy array. From 9aa5df6bd83cb63ac6eee4a2ebb1f568b3dde7d9 Mon Sep 17 00:00:00 2001 From: Thomas Blauth Date: Fri, 23 Apr 2021 09:23:26 +0000 Subject: [PATCH 11/12] Fix whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 9e2e5a59c7f16..02d64e96e380b 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -796,6 +796,7 @@ I/O - Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`) - Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`) - Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`) +- Bug in the conversion from pyarrow to pandas (e.g. for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`) Period ^^^^^^ @@ -882,7 +883,6 @@ ExtensionArray - Fixed bug where :meth:`Series.idxmax`, :meth:`Series.idxmin` and ``argmax/min`` fail when the underlying data is :class:`ExtensionArray` (:issue:`32749`, :issue:`33719`, :issue:`36566`) - Fixed a bug where some properties of subclasses of :class:`PandasExtensionDtype` where improperly cached (:issue:`40329`) - Bug in :meth:`DataFrame.mask` where masking a :class:`Dataframe` with an :class:`ExtensionArray` dtype raises ``ValueError`` (:issue:`40941`) -- Bug in :func:`pyarrow_array_to_numpy_and_mask` when converting a pyarrow array who's data buffer size is not a multiple of dtype size (:issue:`40896`) Styler ^^^^^^ From b86f9fc4b1e346a5dc66d7957e8c8b5aea39f070 Mon Sep 17 00:00:00 2001 From: Thomas Blauth Date: Mon, 26 Apr 2021 13:04:17 +0000 Subject: [PATCH 12/12] Move import to top --- pandas/tests/arrays/masked/test_arrow_compat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index fabfdb9236aad..d64dd6fa24d2c 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -8,6 +8,8 @@ pa = pytest.importorskip("pyarrow", minversion="0.15.0") +from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask + arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_EA_INT_DTYPES] arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES] arrays += [pd.array([True, False, True, None], dtype="boolean")] @@ -110,8 +112,6 @@ def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays): Also tests empty pyarrow arrays with non empty buffers. See https://github.com/pandas-dev/pandas/issues/40896 """ - from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask - np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays data, mask = pyarrow_array_to_numpy_and_mask(pa_array, np_dtype) tm.assert_numpy_array_equal(data[:3], np_expected)