Skip to content

Commit 83e4332

Browse files
PERF: optimize conversion from boolean Arrow array to masked BooleanArray (#41051)
1 parent 9be0b82 commit 83e4332

File tree

3 files changed

+46
-4
lines changed

3 files changed

+46
-4
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,7 @@ Performance improvements
631631
- Performance improvement in :class:`Styler` where render times are more than 50% reduced (:issue:`39972` :issue:`39952`)
632632
- Performance improvement in :meth:`core.window.ewm.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`)
633633
- Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`)
634+
- Performance improvement in the conversion of a pyarrow boolean array to a pandas nullable boolean array (:issue:`41051`)
634635
- Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`)
635636
- Performance improvement in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable data types (:issue:`37493`)
636637
-

pandas/core/arrays/boolean.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,9 @@ def __from_arrow__(
114114
"""
115115
import pyarrow
116116

117+
if array.type != pyarrow.bool_():
118+
raise TypeError(f"Expected array of boolean type, got {array.type} instead")
119+
117120
if isinstance(array, pyarrow.Array):
118121
chunks = [array]
119122
else:
@@ -122,8 +125,19 @@ def __from_arrow__(
122125

123126
results = []
124127
for arr in chunks:
125-
# TODO should optimize this without going through object array
126-
bool_arr = BooleanArray._from_sequence(np.array(arr))
128+
buflist = arr.buffers()
129+
data = pyarrow.BooleanArray.from_buffers(
130+
arr.type, len(arr), [None, buflist[1]], offset=arr.offset
131+
).to_numpy(zero_copy_only=False)
132+
if arr.null_count != 0:
133+
mask = pyarrow.BooleanArray.from_buffers(
134+
arr.type, len(arr), [None, buflist[0]], offset=arr.offset
135+
).to_numpy(zero_copy_only=False)
136+
mask = ~mask
137+
else:
138+
mask = np.zeros(len(arr), dtype=bool)
139+
140+
bool_arr = BooleanArray(data, mask)
127141
results.append(bool_arr)
128142

129143
return BooleanArray._concat_same_type(results)

pandas/tests/arrays/masked/test_arrow_compat.py

+29-2
Original file line numberDiff line numberDiff line change
@@ -55,12 +55,39 @@ def test_arrow_from_arrow_uint():
5555

5656

5757
@td.skip_if_no("pyarrow", min_version="0.16.0")
58-
def test_arrow_sliced():
58+
def test_arrow_sliced(data):
5959
# https://github.com/pandas-dev/pandas/issues/38525
6060
import pyarrow as pa
6161

62-
df = pd.DataFrame({"a": pd.array([0, None, 2, 3, None], dtype="Int64")})
62+
df = pd.DataFrame({"a": data})
6363
table = pa.table(df)
6464
result = table.slice(2, None).to_pandas()
6565
expected = df.iloc[2:].reset_index(drop=True)
6666
tm.assert_frame_equal(result, expected)
67+
68+
# no missing values
69+
df2 = df.fillna(data[0])
70+
table = pa.table(df2)
71+
result = table.slice(2, None).to_pandas()
72+
expected = df2.iloc[2:].reset_index(drop=True)
73+
tm.assert_frame_equal(result, expected)
74+
75+
76+
@td.skip_if_no("pyarrow", min_version="0.16.0")
77+
def test_from_arrow_type_error(request, data):
78+
# ensure that __from_arrow__ raises a TypeError when getting a wrong
79+
# array type
80+
import pyarrow as pa
81+
82+
if data.dtype != "boolean":
83+
# TODO numeric dtypes cast any incoming array to the correct dtype
84+
# instead of erroring
85+
request.node.add_marker(
86+
pytest.mark.xfail(reason="numeric dtypes don't error but cast")
87+
)
88+
89+
arr = pa.array(data).cast("string")
90+
with pytest.raises(TypeError, match=None):
91+
# we don't test the exact error message, only the fact that it raises
92+
# a TypeError is relevant
93+
data.dtype.__from_arrow__(arr)

0 commit comments

Comments
 (0)