
BUG: Slice Arrow buffer before passing it to numpy (#40896) #41046


Merged · 13 commits · Apr 28, 2021
Changes from 4 commits
12 changes: 11 additions & 1 deletion pandas/core/arrays/_arrow_utils.py
@@ -14,6 +14,8 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype):
     Convert a primitive pyarrow.Array to a numpy array and boolean mask based
     on the buffers of the Array.

+    At the moment pyarrow.BooleanArray is not supported.
+
     Parameters
     ----------
     arr : pyarrow.Array
@@ -25,8 +27,16 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype):
     Tuple of two numpy arrays with the raw data (with specified dtype) and
     a boolean mask (validity mask, so False means missing)
     """
+    dtype = np.dtype(dtype)
+
     buflist = arr.buffers()
-    data = np.frombuffer(buflist[1], dtype=dtype)[arr.offset : arr.offset + len(arr)]
+    # Since Arrow buffers might contain padding and the data might be offset,
+    # the buffer gets sliced here before handing it to numpy.
+    # See also https://github.com/pandas-dev/pandas/issues/40896
+    offset = arr.offset * dtype.itemsize
+    length = len(arr) * dtype.itemsize
+    data_buf = buflist[1][offset : offset + length]
+    data = np.frombuffer(data_buf, dtype=dtype)
     bitmask = buflist[0]
     if bitmask is not None:
         mask = pyarrow.BooleanArray.from_buffers(
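For context, a minimal standalone sketch (not part of the PR, example values assumed) of why the data buffer has to be sliced by byte offset and length before np.frombuffer: a sliced Arrow array shares its parent's buffers, so buffers()[1] still holds every value of the parent, and only arr.offset and len(arr) say which part is logically visible.

import numpy as np
import pyarrow as pa

# Zero-copy slice: logical values are [20, 30], but the data buffer still
# contains all four parent values plus any padding pyarrow allocated.
arr = pa.array([10, 20, 30, 40], type=pa.int64()).slice(1, 2)

dtype = np.dtype("int64")
offset = arr.offset * dtype.itemsize            # 1 element * 8 bytes
length = len(arr) * dtype.itemsize              # 2 elements * 8 bytes
data_buf = arr.buffers()[1][offset : offset + length]
data = np.frombuffer(data_buf, dtype=dtype)     # array([20, 30])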
100 changes: 100 additions & 0 deletions pandas/tests/arrays/masked/test_arrow_compat.py
@@ -1,3 +1,4 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td
@@ -64,3 +65,102 @@ def test_arrow_sliced():
    result = table.slice(2, None).to_pandas()
    expected = df.iloc[2:].reset_index(drop=True)
    tm.assert_frame_equal(result, expected)


@pytest.fixture
def np_dtype_to_arrays(request):
    import pyarrow as pa
Contributor (review comment): can we just add an import_optional_dependency at the top of this file, rather than having skips on every test.

Contributor Author (reply): I used pyarrow 0.16.0 as the minimal version so test_arrow_array no longer gets run with pyarrow 0.15.0. Hope this is appropriate.
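A minimal sketch of the kind of module-level guard being discussed; the reviewer mentions import_optional_dependency, and pytest.importorskip with a 0.16.0 floor is shown here only as one possible illustration, not necessarily what the PR finally adopted.

import pytest

# Skip the entire test module when pyarrow is missing or older than the
# minimum version the author mentions above.
pa = pytest.importorskip("pyarrow", minversion="0.16.0")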


    np_dtype = request.param
    pa_type = pa.from_numpy_dtype(np_dtype)

    pa_array = pa.array([0, 1, 2], type=pa_type)
Member (review comment):

Suggested change:
-    pa_array = pa.array([0, 1, 2], type=pa_type)
+    pa_array = pa.array([0, 1, 2, None], type=pa_type)

Maybe good to include a missing value, to ensure there is a bitmask buffer?

Contributor Author (reply): Yes, good point!
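To illustrate the reviewer's point (assuming default pyarrow behaviour): an array built without nulls typically has no validity buffer at all, so buffers()[0] is None, whereas including a None guarantees there is a bitmask buffer to exercise.

import pyarrow as pa

no_nulls = pa.array([0, 1, 2], type=pa.int64())
with_null = pa.array([0, 1, 2, None], type=pa.int64())

print(no_nulls.buffers()[0])    # None -- no validity bitmask allocated
print(with_null.buffers()[0])   # a pyarrow.Buffer holding the validity bitmask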

    np_expected = np.array([0, 1, 2], dtype=np_dtype)
    mask_expected = np.array([True, True, True])
    return np_dtype, pa_array, np_expected, mask_expected


@td.skip_if_no("pyarrow")
@pytest.mark.parametrize(
"np_dtype_to_arrays",
Contributor (review comment): can you use the fixture: any_real_dtype or any_numpy_dtype instead

    (
        [
            np.int8(),
            np.int16(),
            np.int32(),
            np.int64(),
            np.uint8(),
            np.uint16(),
            np.uint32(),
            np.uint64(),
            np.float32(),
            np.float64(),
        ]
    ),
    indirect=True,
)
def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays):
"""
Test conversion from pyarrow array to numpy array.

Modifies the pyarrow buffer to contain padding and offset, which are
considered valid buffers by pyarrow.

Also tests empty pyarrow arrays with non empty buffers.
See https://github.com/pandas-dev/pandas/issues/40896
"""
import pyarrow as pa

from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask
Contributor (review comment): can you import at the top (after the pyarrow check is fine)

Contributor Author (reply): Should I move the rest of the pandas imports below the pyarrow check as well? Looks a bit weird having them separated by the check.


    np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays
    data, mask = pyarrow_array_to_numpy_and_mask(pa_array, np_dtype)
    tm.assert_numpy_array_equal(data, np_expected)
    tm.assert_numpy_array_equal(mask, mask_expected)

    mask_buffer = pa_array.buffers()[0]
    data_buffer = pa_array.buffers()[1]
    data_buffer_bytes = pa_array.buffers()[1].to_pybytes()

    # Add trailing padding to the buffer.
    data_buffer_trail = pa.py_buffer(data_buffer_bytes + b"\x00")
    pa_array_trail = pa.Array.from_buffers(
        type=pa_array.type,
        length=len(pa_array),
        buffers=[mask_buffer, data_buffer_trail],
        offset=pa_array.offset,
    )
    pa_array_trail.validate()
    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_trail, np_dtype)
    tm.assert_numpy_array_equal(data, np_expected)
    tm.assert_numpy_array_equal(mask, mask_expected)

    # Add offset to the buffer.
    offset = b"\x00" * (pa_array.type.bit_width // 8)
    data_buffer_offset = pa.py_buffer(offset + data_buffer_bytes)
    mask_buffer_offset = pa.py_buffer(b"\x0F")
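    # b"\x0F" is 0b00001111: validity bits 0-3 are set, so with the array
    # offset bumped by one the three logical elements map to set bits 1-3.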
    pa_array_offset = pa.Array.from_buffers(
        type=pa_array.type,
        length=len(pa_array),
        buffers=[mask_buffer_offset, data_buffer_offset],
        offset=pa_array.offset + 1,
    )
    pa_array_offset.validate()
    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype)
    tm.assert_numpy_array_equal(data, np_expected)
    tm.assert_numpy_array_equal(mask, mask_expected)

    # Empty array
    np_expected_empty = np.array([], dtype=np_dtype)
    mask_expected_empty = np.array([], dtype=np.bool_)

    pa_array_offset = pa.Array.from_buffers(
        type=pa_array.type,
        length=0,
        buffers=[mask_buffer, data_buffer],
        offset=pa_array.offset,
    )
    pa_array_offset.validate()
    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype)
    tm.assert_numpy_array_equal(data, np_expected_empty)
    tm.assert_numpy_array_equal(mask, mask_expected_empty)