Skip to content

BUG: Slice Arrow buffer before passing it to numpy (#40896) #41046

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Apr 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -796,6 +796,7 @@ I/O
- Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`)
- Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`)
- Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`)
- Bug in the conversion from pyarrow to pandas (e.g. for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would not object to actually showing read_parquet reference here

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that's needed (the original bug report didn't actually involve a plain reading of a Parquet file (not sure this can happen with that), but rather a dataset scan+filter operation (from any source format)).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I was creating the arrow table using only the arrow api. The bug then occurred on the conversion to pandas.

I did not actually try to reproduce the bug using read_parquet, so I'm also not sure if it can happen this way.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose it could happen when reading a parquet file using the new dataset API by passing a certain filter. But anyway, I don't think we need to make that explicit, IMO the above error message is clear enough.


Period
^^^^^^
Expand Down
12 changes: 11 additions & 1 deletion pandas/core/arrays/_arrow_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype):
Convert a primitive pyarrow.Array to a numpy array and boolean mask based
on the buffers of the Array.

At the moment pyarrow.BooleanArray is not supported.

Parameters
----------
arr : pyarrow.Array
Expand All @@ -25,8 +27,16 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype):
Tuple of two numpy arrays with the raw data (with specified dtype) and
a boolean mask (validity mask, so False means missing)
"""
dtype = np.dtype(dtype)

buflist = arr.buffers()
data = np.frombuffer(buflist[1], dtype=dtype)[arr.offset : arr.offset + len(arr)]
# Since Arrow buffers might contain padding and the data might be offset,
# the buffer gets sliced here before handing it to numpy.
# See also https://github.com/pandas-dev/pandas/issues/40896
offset = arr.offset * dtype.itemsize
length = len(arr) * dtype.itemsize
data_buf = buflist[1][offset : offset + length]
data = np.frombuffer(data_buf, dtype=dtype)
bitmask = buflist[0]
if bitmask is not None:
mask = pyarrow.BooleanArray.from_buffers(
Expand Down
90 changes: 82 additions & 8 deletions pandas/tests/arrays/masked/test_arrow_compat.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm

pa = pytest.importorskip("pyarrow", minversion="0.15.0")

from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_EA_INT_DTYPES]
arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES]
arrays += [pd.array([True, False, True, None], dtype="boolean")]
Expand All @@ -15,10 +20,8 @@ def data(request):
return request.param


@td.skip_if_no("pyarrow", min_version="0.15.0")
def test_arrow_array(data):
# protocol added in 0.15.0
import pyarrow as pa

arr = pa.array(data)
expected = pa.array(
Expand All @@ -31,7 +34,6 @@ def test_arrow_array(data):
@td.skip_if_no("pyarrow", min_version="0.16.0")
def test_arrow_roundtrip(data):
# roundtrip possible from arrow 0.16.0
import pyarrow as pa

df = pd.DataFrame({"a": data})
table = pa.table(df)
Expand All @@ -44,7 +46,6 @@ def test_arrow_roundtrip(data):
@td.skip_if_no("pyarrow", min_version="0.15.1.dev")
def test_arrow_load_from_zero_chunks(data):
# GH-41040
import pyarrow as pa

df = pd.DataFrame({"a": data[0:0]})
table = pa.table(df)
Expand All @@ -61,7 +62,6 @@ def test_arrow_load_from_zero_chunks(data):
def test_arrow_from_arrow_uint():
# https://github.com/pandas-dev/pandas/issues/31896
# possible mismatch in types
import pyarrow as pa

dtype = pd.UInt32Dtype()
result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64"))
Expand All @@ -73,7 +73,6 @@ def test_arrow_from_arrow_uint():
@td.skip_if_no("pyarrow", min_version="0.16.0")
def test_arrow_sliced(data):
# https://github.com/pandas-dev/pandas/issues/38525
import pyarrow as pa

df = pd.DataFrame({"a": data})
table = pa.table(df)
Expand All @@ -89,12 +88,87 @@ def test_arrow_sliced(data):
tm.assert_frame_equal(result, expected)


@pytest.fixture
def np_dtype_to_arrays(any_real_dtype):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you type this (mainly the output for better readability) also adding a doc-string explaining the purpose

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a fixture in the tests, I don't think we generally ask for annotations there? (eg in conftest.py there is no typing)

np_dtype = np.dtype(any_real_dtype)
pa_type = pa.from_numpy_dtype(np_dtype)

# None ensures the creation of a bitmask buffer.
pa_array = pa.array([0, 1, 2, None], type=pa_type)
# Since masked Arrow buffer slots are not required to contain a specific
# value, assert only the first three values of the created np.array
np_expected = np.array([0, 1, 2], dtype=np_dtype)
mask_expected = np.array([True, True, True, False])
return np_dtype, pa_array, np_expected, mask_expected


def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays):
    """
    Test conversion from pyarrow array to numpy array.

    Exercises buffers that pyarrow considers valid but that are tricky for
    the conversion: trailing padding bytes, a non-zero array offset, and a
    zero-length array backed by non-empty buffers.
    See https://github.com/pandas-dev/pandas/issues/40896
    """
    np_dtype, arrow_arr, expected_values, expected_mask = np_dtype_to_arrays

    def check(arr):
        # The masked (fourth) slot may hold arbitrary data, so only the
        # first three values are compared.
        got_values, got_mask = pyarrow_array_to_numpy_and_mask(arr, np_dtype)
        tm.assert_numpy_array_equal(got_values[:3], expected_values)
        tm.assert_numpy_array_equal(got_mask, expected_mask)

    # Array straight from the fixture.
    check(arrow_arr)

    validity_buf, values_buf = arrow_arr.buffers()
    values_bytes = values_buf.to_pybytes()

    # Data buffer with a trailing padding byte: its size is no longer a
    # multiple of the dtype itemsize, yet it is still a valid Arrow buffer.
    padded = pa.Array.from_buffers(
        type=arrow_arr.type,
        length=len(arrow_arr),
        buffers=[validity_buf, pa.py_buffer(values_bytes + b"\x00")],
        offset=arrow_arr.offset,
    )
    padded.validate()
    check(padded)

    # Data buffer with one leading dummy element, consumed via a non-zero
    # array offset; the validity bitmap 0x0E sets bits 1-3, matching the
    # shifted positions of the three valid values.
    itemsize = arrow_arr.type.bit_width // 8
    shifted = pa.Array.from_buffers(
        type=arrow_arr.type,
        length=len(arrow_arr),
        buffers=[
            pa.py_buffer(b"\x0E"),
            pa.py_buffer(b"\x00" * itemsize + values_bytes),
        ],
        offset=arrow_arr.offset + 1,
    )
    shifted.validate()
    check(shifted)

    # Zero-length array backed by the original (non-empty) buffers.
    empty = pa.Array.from_buffers(
        type=arrow_arr.type,
        length=0,
        buffers=[validity_buf, values_buf],
        offset=arrow_arr.offset,
    )
    empty.validate()
    got_values, got_mask = pyarrow_array_to_numpy_and_mask(empty, np_dtype)
    tm.assert_numpy_array_equal(got_values[:3], np.array([], dtype=np_dtype))
    tm.assert_numpy_array_equal(got_mask, np.array([], dtype=np.bool_))


@td.skip_if_no("pyarrow", min_version="0.16.0")
def test_from_arrow_type_error(request, data):
# ensure that __from_arrow__ returns a TypeError when getting a wrong
# array type
import pyarrow as pa

if data.dtype != "boolean":
# TODO numeric dtypes cast any incoming array to the correct dtype
# instead of erroring
Expand Down