diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 0e567972e7823..02d64e96e380b 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -796,6 +796,7 @@ I/O
 - Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`)
 - Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`)
 - Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`)
+- Bug in the conversion from pyarrow to pandas (e.g. for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`)
 
 Period
 ^^^^^^
diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py
index 31f6896b12f98..51e5f36b88c79 100644
--- a/pandas/core/arrays/_arrow_utils.py
+++ b/pandas/core/arrays/_arrow_utils.py
@@ -14,6 +14,8 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype):
     Convert a primitive pyarrow.Array to a numpy array and boolean mask based
     on the buffers of the Array.
 
+    At the moment pyarrow.BooleanArray is not supported.
+
     Parameters
     ----------
     arr : pyarrow.Array
@@ -25,8 +27,16 @@
         Tuple of two numpy arrays with the raw data (with specified dtype) and
         a boolean mask (validity mask, so False means missing)
     """
+    dtype = np.dtype(dtype)
+
     buflist = arr.buffers()
-    data = np.frombuffer(buflist[1], dtype=dtype)[arr.offset : arr.offset + len(arr)]
+    # Since Arrow buffers might contain padding and the data might be offset,
+    # the buffer gets sliced here before handing it to numpy.
+    # See also https://github.com/pandas-dev/pandas/issues/40896
+    offset = arr.offset * dtype.itemsize
+    length = len(arr) * dtype.itemsize
+    data_buf = buflist[1][offset : offset + length]
+    data = np.frombuffer(data_buf, dtype=dtype)
     bitmask = buflist[0]
     if bitmask is not None:
         mask = pyarrow.BooleanArray.from_buffers(
diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py
index 07a02c7152bce..d64dd6fa24d2c 100644
--- a/pandas/tests/arrays/masked/test_arrow_compat.py
+++ b/pandas/tests/arrays/masked/test_arrow_compat.py
@@ -1,3 +1,4 @@
+import numpy as np
 import pytest
 
 import pandas.util._test_decorators as td
@@ -5,6 +6,10 @@
 import pandas as pd
 import pandas._testing as tm
 
+pa = pytest.importorskip("pyarrow", minversion="0.15.0")
+
+from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask
+
 arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_EA_INT_DTYPES]
 arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES]
 arrays += [pd.array([True, False, True, None], dtype="boolean")]
@@ -15,10 +20,8 @@ def data(request):
     return request.param
 
 
-@td.skip_if_no("pyarrow", min_version="0.15.0")
 def test_arrow_array(data):
     # protocol added in 0.15.0
-    import pyarrow as pa
 
     arr = pa.array(data)
     expected = pa.array(
@@ -31,7 +34,6 @@ def test_arrow_array(data):
 @td.skip_if_no("pyarrow", min_version="0.16.0")
 def test_arrow_roundtrip(data):
     # roundtrip possible from arrow 0.16.0
-    import pyarrow as pa
 
     df = pd.DataFrame({"a": data})
     table = pa.table(df)
@@ -44,7 +46,6 @@ def test_arrow_roundtrip(data):
 @td.skip_if_no("pyarrow", min_version="0.15.1.dev")
 def test_arrow_load_from_zero_chunks(data):
     # GH-41040
-    import pyarrow as pa
 
     df = pd.DataFrame({"a": data[0:0]})
     table = pa.table(df)
@@ -61,7 +62,6 @@
 def test_arrow_from_arrow_uint():
     # https://github.com/pandas-dev/pandas/issues/31896
     # possible mismatch in types
-    import pyarrow as pa
 
     dtype = pd.UInt32Dtype()
     result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64"))
@@ -73,7 +73,6 @@ def test_arrow_from_arrow_uint():
 @td.skip_if_no("pyarrow", min_version="0.16.0")
 def test_arrow_sliced(data):
     # https://github.com/pandas-dev/pandas/issues/38525
-    import pyarrow as pa
 
     df = pd.DataFrame({"a": data})
     table = pa.table(df)
@@ -89,12 +88,87 @@
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.fixture
+def np_dtype_to_arrays(any_real_dtype):
+    np_dtype = np.dtype(any_real_dtype)
+    pa_type = pa.from_numpy_dtype(np_dtype)
+
+    # None ensures the creation of a bitmask buffer.
+    pa_array = pa.array([0, 1, 2, None], type=pa_type)
+    # Since masked Arrow buffer slots are not required to contain a specific
+    # value, assert only the first three values of the created np.array
+    np_expected = np.array([0, 1, 2], dtype=np_dtype)
+    mask_expected = np.array([True, True, True, False])
+    return np_dtype, pa_array, np_expected, mask_expected
+
+
+def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays):
+    """
+    Test conversion from pyarrow array to numpy array.
+
+    Modifies the pyarrow buffer to contain padding and offset, which are
+    considered valid buffers by pyarrow.
+
+    Also tests empty pyarrow arrays with non empty buffers.
+    See https://github.com/pandas-dev/pandas/issues/40896
+    """
+    np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays
+    data, mask = pyarrow_array_to_numpy_and_mask(pa_array, np_dtype)
+    tm.assert_numpy_array_equal(data[:3], np_expected)
+    tm.assert_numpy_array_equal(mask, mask_expected)
+
+    mask_buffer = pa_array.buffers()[0]
+    data_buffer = pa_array.buffers()[1]
+    data_buffer_bytes = pa_array.buffers()[1].to_pybytes()
+
+    # Add trailing padding to the buffer.
+    data_buffer_trail = pa.py_buffer(data_buffer_bytes + b"\x00")
+    pa_array_trail = pa.Array.from_buffers(
+        type=pa_array.type,
+        length=len(pa_array),
+        buffers=[mask_buffer, data_buffer_trail],
+        offset=pa_array.offset,
+    )
+    pa_array_trail.validate()
+    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_trail, np_dtype)
+    tm.assert_numpy_array_equal(data[:3], np_expected)
+    tm.assert_numpy_array_equal(mask, mask_expected)
+
+    # Add offset to the buffer.
+    offset = b"\x00" * (pa_array.type.bit_width // 8)
+    data_buffer_offset = pa.py_buffer(offset + data_buffer_bytes)
+    mask_buffer_offset = pa.py_buffer(b"\x0E")
+    pa_array_offset = pa.Array.from_buffers(
+        type=pa_array.type,
+        length=len(pa_array),
+        buffers=[mask_buffer_offset, data_buffer_offset],
+        offset=pa_array.offset + 1,
+    )
+    pa_array_offset.validate()
+    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype)
+    tm.assert_numpy_array_equal(data[:3], np_expected)
+    tm.assert_numpy_array_equal(mask, mask_expected)
+
+    # Empty array
+    np_expected_empty = np.array([], dtype=np_dtype)
+    mask_expected_empty = np.array([], dtype=np.bool_)
+
+    pa_array_offset = pa.Array.from_buffers(
+        type=pa_array.type,
+        length=0,
+        buffers=[mask_buffer, data_buffer],
+        offset=pa_array.offset,
+    )
+    pa_array_offset.validate()
+    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype)
+    tm.assert_numpy_array_equal(data[:3], np_expected_empty)
+    tm.assert_numpy_array_equal(mask, mask_expected_empty)
+
+
 @td.skip_if_no("pyarrow", min_version="0.16.0")
 def test_from_arrow_type_error(request, data):
     # ensure that __from_arrow__ returns a TypeError when getting a wrong
     # array type
-    import pyarrow as pa
-
     if data.dtype != "boolean":
         # TODO numeric dtypes cast any incoming array to the correct dtype
         # instead of erroring
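For reference, a minimal standalone sketch (not part of the patch) of the buffer slicing that the fixed pyarrow_array_to_numpy_and_mask performs, assuming numpy and pyarrow are installed; the example array and the int64 dtype here are illustrative only:

import numpy as np
import pyarrow as pa

# Build an offset (sliced) pyarrow array; slicing is zero-copy, so the data
# buffer still holds all five values and arr.offset points past the first one.
arr = pa.array([1, 2, 3, 4, None], type=pa.int64()).slice(1)
dtype = np.dtype("int64")

# Trim the raw data buffer to exactly the bytes backing this array before
# handing it to numpy, as the fix does; this tolerates trailing padding and
# buffers whose size is not a multiple of the dtype size.
offset = arr.offset * dtype.itemsize
length = len(arr) * dtype.itemsize
data_buf = arr.buffers()[1][offset : offset + length]

data = np.frombuffer(data_buf, dtype=dtype)
assert len(data) == len(arr)  # 4 slots; the masked slot's value is unspecified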