Skip to content

PERF: optimize conversion from boolean Arrow array to masked BooleanArray #41051

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -624,6 +624,7 @@ Performance improvements
- Performance improvement in :class:`Styler` where render times are more than 50% reduced (:issue:`39972` :issue:`39952`)
- Performance improvement in :meth:`core.window.ewm.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`)
- Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`)
- Performance improvement in the conversion of pyarrow boolean array to a pandas nullable boolean array (:issue:`41051`)
- Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`)

.. ---------------------------------------------------------------------------
Expand Down
18 changes: 16 additions & 2 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@ def __from_arrow__(
"""
import pyarrow

if array.type != pyarrow.bool_():
raise TypeError(f"Expected array of boolean type, got {array.type} instead")

if isinstance(array, pyarrow.Array):
chunks = [array]
else:
Expand All @@ -122,8 +125,19 @@ def __from_arrow__(

results = []
for arr in chunks:
# TODO should optimize this without going through object array
bool_arr = BooleanArray._from_sequence(np.array(arr))
buflist = arr.buffers()
data = pyarrow.BooleanArray.from_buffers(
arr.type, len(arr), [None, buflist[1]], offset=arr.offset
).to_numpy(zero_copy_only=False)
Copy link
Member

@simonjayhawkins simonjayhawkins Apr 20, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

related to #41018 (comment)

if we do pass an incompatible type we now get a less helpful message

>>> import pandas as pd
>>> from pandas.core.arrays.string_arrow import ArrowStringDtype
>>> 
>>> s2 = pd.Series(["a", None, "1"], dtype="arrow_string")
>>> 
>>> arr = s2.values._data
>>> 
>>> arr
<pyarrow.lib.ChunkedArray object at 0x7f50a262ba90>
[
  [
    "a",
    null,
    "1"
  ]
]
>>> 
>>> pd.BooleanDtype().__from_arrow__(arr)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/home/simon/pandas/pandas/core/arrays/boolean.py", line 126, in __from_arrow__
    data = pyarrow.BooleanArray.from_buffers(
  File "pyarrow/array.pxi", line 947, in pyarrow.lib.Array.from_buffers
ValueError: Type's expected number of buffers (3) did not match the passed number (2).

on master

TypeError: Need to pass bool-like values

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if we do pass an incompatible type we now get a less helpful message

Generally we should never do that ourselves, though.
It's possible to get that in a conversion of arrow -> pandas if you have outdated metadata:

>>> df = pd.DataFrame({'a': pd.array([True, False])})
>>> table = pa.table(df)
>>> new_table = table.cast(pa.schema([('a', pa.int8())], metadata=table.schema.metadata))
>>> new_table.to_pandas()
...
~/scipy/repos/pandas-build-arrow/pandas/core/arrays/boolean.py in __from_arrow__(self, array)
    135                 mask = np.zeros(len(arr), dtype=bool)
    136 
--> 137             bool_arr = BooleanArray(data, mask)
    138             results.append(bool_arr)
    139 

~/scipy/repos/pandas-build-arrow/pandas/core/arrays/boolean.py in __init__(self, values, mask, copy)
    289     def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
    290         if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
--> 291             raise TypeError(
    292                 "values should be boolean numpy array. Use "
    293                 "the 'pd.array' function instead"

TypeError: values should be boolean numpy array. Use the 'pd.array' function instead

>>> new_table = table.cast(pa.schema([('a', pa.string())], metadata=table.schema.metadata))
>>> new_table.to_pandas()
...
~/scipy/repos/pandas-build-arrow/pandas/core/arrays/boolean.py in __from_arrow__(self, array)
    124         for arr in chunks:
    125             buflist = arr.buffers()
--> 126             data = pyarrow.BooleanArray.from_buffers(
    127                 arr.type, len(arr), [None, buflist[1]], offset=arr.offset
    128             ).to_numpy(zero_copy_only=False)

~/scipy/repos/arrow/python/pyarrow/array.pxi in pyarrow.lib.Array.from_buffers()

ValueError: Type's expected number of buffers (3) did not match the passed number (2).

And I think this is maybe something to fix on the pyarrow side (it should ignore the metadata on error).

Now, it's also easy to add a arr.type check to ensure it is boolean arrow type, so will do that.

if arr.null_count != 0:
mask = pyarrow.BooleanArray.from_buffers(
arr.type, len(arr), [None, buflist[0]], offset=arr.offset
).to_numpy(zero_copy_only=False)
mask = ~mask
else:
mask = np.zeros(len(arr), dtype=bool)

bool_arr = BooleanArray(data, mask)
results.append(bool_arr)

return BooleanArray._concat_same_type(results)
Expand Down
31 changes: 29 additions & 2 deletions pandas/tests/arrays/masked/test_arrow_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,39 @@ def test_arrow_from_arrow_uint():


@td.skip_if_no("pyarrow", min_version="0.16.0")
def test_arrow_sliced():
def test_arrow_sliced(data):
# https://github.com/pandas-dev/pandas/issues/38525
import pyarrow as pa

df = pd.DataFrame({"a": pd.array([0, None, 2, 3, None], dtype="Int64")})
df = pd.DataFrame({"a": data})
table = pa.table(df)
result = table.slice(2, None).to_pandas()
expected = df.iloc[2:].reset_index(drop=True)
tm.assert_frame_equal(result, expected)

# no missing values
df2 = df.fillna(data[0])
table = pa.table(df2)
result = table.slice(2, None).to_pandas()
expected = df2.iloc[2:].reset_index(drop=True)
tm.assert_frame_equal(result, expected)


@td.skip_if_no("pyarrow", min_version="0.16.0")
def test_from_arrow_type_error(request, data):
# ensure that __from_arrow__ returns a TypeError when getting a wrong
# array type
import pyarrow as pa

if data.dtype != "boolean":
# TODO numeric dtypes cast any incoming array to the correct dtype
# instead of erroring
request.node.add_marker(
pytest.mark.xfail(reason="numeric dtypes don't error but cast")
)

arr = pa.array(data).cast("string")
with pytest.raises(TypeError, match=None):
# we don't test the exact error message, only the fact that it raises
# a TypeError is relevant
data.dtype.__from_arrow__(arr)