|
| 1 | +import numpy as np |
1 | 2 | import pytest
|
2 | 3 |
|
3 | 4 | import pandas.util._test_decorators as td
|
@@ -64,3 +65,102 @@ def test_arrow_sliced():
|
64 | 65 | result = table.slice(2, None).to_pandas()
|
65 | 66 | expected = df.iloc[2:].reset_index(drop=True)
|
66 | 67 | tm.assert_frame_equal(result, expected)
|
| 68 | + |
| 69 | + |
@pytest.fixture
def np_dtype_to_arrays(request):
    """
    Build matching pyarrow and numpy arrays for the parametrized numpy dtype.

    The dtype is read from ``request.param``.  Returns a 4-tuple of
    ``(np_dtype, pa_array, np_expected, mask_expected)`` where the pyarrow
    array and the expected numpy array are both built from the values
    ``[0, 1, 2]`` and the expected mask is all ``True``.
    """
    import pyarrow as pa

    values = [0, 1, 2]
    dtype = request.param

    # Build both arrays from the same values so they cannot drift apart.
    arrow_array = pa.array(values, type=pa.from_numpy_dtype(dtype))
    expected_data = np.array(values, dtype=dtype)
    expected_mask = np.array([True, True, True])
    return dtype, arrow_array, expected_data, expected_mask
| 81 | + |
| 82 | + |
@td.skip_if_no("pyarrow")
@pytest.mark.parametrize(
    "np_dtype_to_arrays",
    [
        np.int8(),
        np.int16(),
        np.int32(),
        np.int64(),
        np.uint8(),
        np.uint16(),
        np.uint32(),
        np.uint64(),
        np.float32(),
        np.float64(),
    ],
    indirect=True,
)
def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays):
    """
    Check pyarrow_array_to_numpy_and_mask against hand-built pyarrow arrays.

    Besides the plain array supplied by the fixture, the array is rebuilt
    from modified buffers (trailing padding, leading offset), which pyarrow
    still considers valid, and as a zero-length array over non-empty buffers.

    See https://github.com/pandas-dev/pandas/issues/40896
    """
    import pyarrow as pa

    from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

    np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays

    def assert_converts_to(arrow_array, expected_data, expected_mask):
        # Convert and compare both outputs against the expectations.
        data, mask = pyarrow_array_to_numpy_and_mask(arrow_array, np_dtype)
        tm.assert_numpy_array_equal(data, expected_data)
        tm.assert_numpy_array_equal(mask, expected_mask)

    def rebuild(buffers, length, offset):
        # Re-create the array from raw buffers and let pyarrow validate it.
        rebuilt = pa.Array.from_buffers(
            type=pa_array.type,
            length=length,
            buffers=buffers,
            offset=offset,
        )
        rebuilt.validate()
        return rebuilt

    # Baseline: the unmodified array from the fixture.
    assert_converts_to(pa_array, np_expected, mask_expected)

    validity_buffer, values_buffer = pa_array.buffers()
    values_bytes = values_buffer.to_pybytes()

    # Trailing padding: one extra byte appended to the data buffer.
    padded_values = pa.py_buffer(values_bytes + b"\x00")
    padded_array = rebuild(
        [validity_buffer, padded_values],
        length=len(pa_array),
        offset=pa_array.offset,
    )
    assert_converts_to(padded_array, np_expected, mask_expected)

    # Leading offset: prepend one element's worth of zero bytes and start
    # the array one slot later; the mask byte has its low four bits set.
    item_size = pa_array.type.bit_width // 8
    shifted_values = pa.py_buffer(b"\x00" * item_size + values_bytes)
    shifted_validity = pa.py_buffer(b"\x0F")
    shifted_array = rebuild(
        [shifted_validity, shifted_values],
        length=len(pa_array),
        offset=pa_array.offset + 1,
    )
    assert_converts_to(shifted_array, np_expected, mask_expected)

    # Empty array: zero length over the original, non-empty buffers.
    empty_array = rebuild(
        [validity_buffer, values_buffer],
        length=0,
        offset=pa_array.offset,
    )
    assert_converts_to(
        empty_array,
        np.array([], dtype=np_dtype),
        np.array([], dtype=np.bool_),
    )
0 commit comments