1
+ import numpy as np
1
2
import pytest
2
3
3
4
import pandas .util ._test_decorators as td
4
5
5
6
import pandas as pd
6
7
import pandas ._testing as tm
7
8
9
+ pa = pytest .importorskip ("pyarrow" , minversion = "0.15.0" )
10
+
11
+ from pandas .core .arrays ._arrow_utils import pyarrow_array_to_numpy_and_mask
12
+
8
13
# Arrays exercised by the ``data`` fixture: one per integer and float
# extension dtype listed in ``pandas._testing`` plus a single boolean array.
# Every array ends with a missing value.
arrays = [
    *(
        pd.array([1, 2, 3, None], dtype=int_dtype)
        for int_dtype in tm.ALL_EA_INT_DTYPES
    ),
    *(
        pd.array([0.1, 0.2, 0.3, None], dtype=float_dtype)
        for float_dtype in tm.FLOAT_EA_DTYPES
    ),
    pd.array([True, False, True, None], dtype="boolean"),
]
@@ -15,10 +20,8 @@ def data(request):
15
20
return request .param
16
21
17
22
18
- @td .skip_if_no ("pyarrow" , min_version = "0.15.0" )
19
23
def test_arrow_array (data ):
20
24
# protocol added in 0.15.0
21
- import pyarrow as pa
22
25
23
26
arr = pa .array (data )
24
27
expected = pa .array (
@@ -31,7 +34,6 @@ def test_arrow_array(data):
31
34
@td .skip_if_no ("pyarrow" , min_version = "0.16.0" )
32
35
def test_arrow_roundtrip (data ):
33
36
# roundtrip possible from arrow 0.16.0
34
- import pyarrow as pa
35
37
36
38
df = pd .DataFrame ({"a" : data })
37
39
table = pa .table (df )
@@ -44,7 +46,6 @@ def test_arrow_roundtrip(data):
44
46
@td .skip_if_no ("pyarrow" , min_version = "0.15.1.dev" )
45
47
def test_arrow_load_from_zero_chunks (data ):
46
48
# GH-41040
47
- import pyarrow as pa
48
49
49
50
df = pd .DataFrame ({"a" : data [0 :0 ]})
50
51
table = pa .table (df )
@@ -61,7 +62,6 @@ def test_arrow_load_from_zero_chunks(data):
61
62
def test_arrow_from_arrow_uint ():
62
63
# https://github.com/pandas-dev/pandas/issues/31896
63
64
# possible mismatch in types
64
- import pyarrow as pa
65
65
66
66
dtype = pd .UInt32Dtype ()
67
67
result = dtype .__from_arrow__ (pa .array ([1 , 2 , 3 , 4 , None ], type = "int64" ))
@@ -73,7 +73,6 @@ def test_arrow_from_arrow_uint():
73
73
@td .skip_if_no ("pyarrow" , min_version = "0.16.0" )
74
74
def test_arrow_sliced (data ):
75
75
# https://github.com/pandas-dev/pandas/issues/38525
76
- import pyarrow as pa
77
76
78
77
df = pd .DataFrame ({"a" : data })
79
78
table = pa .table (df )
@@ -89,12 +88,87 @@ def test_arrow_sliced(data):
89
88
tm .assert_frame_equal (result , expected )
90
89
91
90
91
@pytest.fixture
def np_dtype_to_arrays(any_real_dtype):
    """
    Build a pyarrow array for ``any_real_dtype`` together with expected results.

    Returns
    -------
    tuple
        ``(np_dtype, pa_array, np_expected, mask_expected)`` where ``pa_array``
        holds ``[0, 1, 2, None]`` in the Arrow type matching ``np_dtype``.
    """
    np_dtype = np.dtype(any_real_dtype)

    # The trailing None ensures the creation of a bitmask buffer.
    pa_array = pa.array([0, 1, 2, None], type=pa.from_numpy_dtype(np_dtype))

    # Masked Arrow buffer slots are not required to contain a specific value,
    # so only the first three values of the converted np.array are expected.
    expected_values = np.array([0, 1, 2], dtype=np_dtype)
    expected_mask = np.array([True, True, True, False])
    return np_dtype, pa_array, expected_values, expected_mask
103
+
104
+
105
def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays):
    """
    Test conversion from pyarrow array to numpy array.

    Modifies the pyarrow buffer to contain padding and offset, which are
    considered valid buffers by pyarrow.

    Also tests empty pyarrow arrays with non empty buffers.
    See https://github.com/pandas-dev/pandas/issues/40896
    """
    np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays

    # Baseline: the untouched array produced by the fixture.
    data, mask = pyarrow_array_to_numpy_and_mask(pa_array, np_dtype)
    tm.assert_numpy_array_equal(data[:3], np_expected)
    tm.assert_numpy_array_equal(mask, mask_expected)

    # Fetch the buffers once: index 0 is the validity bitmask, index 1 the data.
    buffers = pa_array.buffers()
    mask_buffer = buffers[0]
    data_buffer = buffers[1]
    data_buffer_bytes = data_buffer.to_pybytes()

    # Add trailing padding to the buffer: a single extra zero byte.
    data_buffer_trail = pa.py_buffer(data_buffer_bytes + b"\x00")
    pa_array_trail = pa.Array.from_buffers(
        type=pa_array.type,
        length=len(pa_array),
        buffers=[mask_buffer, data_buffer_trail],
        offset=pa_array.offset,
    )
    pa_array_trail.validate()
    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_trail, np_dtype)
    tm.assert_numpy_array_equal(data[:3], np_expected)
    tm.assert_numpy_array_equal(mask, mask_expected)

    # Add offset to the buffer: prepend exactly one element's worth of zero
    # bytes so that skipping one slot lands on the original values.
    offset = b"\x00" * (pa_array.type.bit_width // 8)
    data_buffer_offset = pa.py_buffer(offset + data_buffer_bytes)
    # 0x0E == 0b00001110 (Arrow bitmaps are least-significant-bit first): bit 0
    # belongs to the skipped slot, bits 1-3 mark the three valid values and
    # bit 4 marks the trailing null.
    mask_buffer_offset = pa.py_buffer(b"\x0E")
    pa_array_offset = pa.Array.from_buffers(
        type=pa_array.type,
        length=len(pa_array),
        buffers=[mask_buffer_offset, data_buffer_offset],
        offset=pa_array.offset + 1,
    )
    pa_array_offset.validate()
    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype)
    tm.assert_numpy_array_equal(data[:3], np_expected)
    tm.assert_numpy_array_equal(mask, mask_expected)

    # Empty array: length 0 while the underlying buffers are non-empty.
    np_expected_empty = np.array([], dtype=np_dtype)
    mask_expected_empty = np.array([], dtype=np.bool_)

    pa_array_empty = pa.Array.from_buffers(
        type=pa_array.type,
        length=0,
        buffers=[mask_buffer, data_buffer],
        offset=pa_array.offset,
    )
    pa_array_empty.validate()
    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_empty, np_dtype)
    # Compare the whole result: nothing may leak out of the non-empty buffers.
    tm.assert_numpy_array_equal(data, np_expected_empty)
    tm.assert_numpy_array_equal(mask, mask_expected_empty)
166
+
167
+
92
168
@td .skip_if_no ("pyarrow" , min_version = "0.16.0" )
93
169
def test_from_arrow_type_error (request , data ):
94
170
# ensure that __from_arrow__ returns a TypeError when getting a wrong
95
171
# array type
96
- import pyarrow as pa
97
-
98
172
if data .dtype != "boolean" :
99
173
# TODO numeric dtypes cast any incoming array to the correct dtype
100
174
# instead of erroring
0 commit comments