diff --git a/pandas/_typing.py b/pandas/_typing.py
index c1769126a5776..96b4d2bba8291 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -528,3 +528,47 @@ def closed(self) -> bool:
 SequenceT = TypeVar("SequenceT", bound=Sequence[Hashable])
 
 SliceType = Optional[Hashable]
+
+
+# Structural types for the Arrow PyCapsule Interface; the protocol type hints
+# are taken from
+# https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#protocol-typehints
+
+
+class ArrowArrayExportable(Protocol):
+    """
+    Structural type for objects that define ``__arrow_c_array__``.
+
+    Defining this method marks an object as Arrow-compatible: it implements
+    the `Arrow PyCapsule Protocol`_, which exposes the `Arrow C Data Interface`_
+    in Python and enables zero-copy exchange of Arrow data between libraries.
+
+    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
+    .. _Arrow C Data Interface: https://arrow.apache.org/docs/format/CDataInterface.html
+
+    """
+
+    def __arrow_c_array__(
+        self,
+        requested_schema: object | None = None,
+    ) -> tuple[object, object]: ...
+
+
+class ArrowStreamExportable(Protocol):
+    """
+    Structural type for objects that define ``__arrow_c_stream__``.
+
+    Defining this method marks an object as Arrow-compatible: it implements
+    the `Arrow PyCapsule Protocol`_, which exposes the stream flavour of the
+    `Arrow C Data Interface`_ in Python and enables zero-copy exchange of
+    Arrow data between libraries.
+
+    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
+    .. _Arrow C Data Interface: https://arrow.apache.org/docs/format/CDataInterface.html
+
+    """
+
+    def __arrow_c_stream__(
+        self,
+        requested_schema: object | None = None,
+    ) -> object: ...
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index f47acf579d79c..ea39b91aa007f 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -205,6 +205,8 @@
     AnyAll,
     AnyArrayLike,
     ArrayLike,
+    ArrowArrayExportable,
+    ArrowStreamExportable,
     Axes,
     Axis,
     AxisInt,
@@ -1746,6 +1748,59 @@ def __rmatmul__(self, other) -> DataFrame:
     # ----------------------------------------------------------------------
     # IO methods (to / from other formats)
 
+    @classmethod
+    def from_arrow(
+        cls, data: ArrowArrayExportable | ArrowStreamExportable
+    ) -> DataFrame:
+        """
+        Construct a DataFrame from a tabular Arrow object.
+
+        This function accepts any tabular Arrow object implementing
+        the `Arrow PyCapsule Protocol`_ (i.e. having an ``__arrow_c_array__``
+        or ``__arrow_c_stream__`` method).
+
+        This function currently relies on ``pyarrow`` to convert the tabular
+        object in Arrow format to pandas.
+
+        .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
+
+        .. versionadded:: 3.0
+
+        Parameters
+        ----------
+        data : pyarrow.Table or Arrow-compatible table
+            Any tabular object implementing the Arrow PyCapsule Protocol
+            (i.e. has an ``__arrow_c_array__`` or ``__arrow_c_stream__``
+            method).
+
+        Returns
+        -------
+        DataFrame
+
+        Raises
+        ------
+        TypeError
+            If ``data`` is not a ``pyarrow.Table`` and has neither an
+            ``__arrow_c_array__`` nor an ``__arrow_c_stream__`` method.
+
+        """
+        pa = import_optional_dependency("pyarrow", min_version="14.0.0")
+        if isinstance(data, pa.Table):
+            return data.to_pandas()
+
+        if not (
+            hasattr(data, "__arrow_c_array__") or hasattr(data, "__arrow_c_stream__")
+        ):
+            # Check for the protocol methods explicitly: pa.table(..) also
+            # accepts various other input types (e.g. a dict), which must be
+            # rejected here.
+            raise TypeError(
+                "Expected an Arrow-compatible tabular object (i.e. having an "
+                "'__arrow_c_array__' or '__arrow_c_stream__' method), got "
+                f"'{type(data).__name__}' instead."
+            )
+        return pa.table(data).to_pandas()
+
     @classmethod
     def from_dict(
         cls,
diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py
index dc163268f64b9..9fd2e055421a1 100644
--- a/pandas/tests/frame/test_arrow_interface.py
+++ b/pandas/tests/frame/test_arrow_interface.py
@@ -7,6 +7,7 @@
 import pandas.util._test_decorators as td
 
 import pandas as pd
+import pandas._testing as tm
 
 pa = pytest.importorskip("pyarrow")
 
@@ -47,3 +48,48 @@ def test_dataframe_to_arrow():
     table = pa.RecordBatchReader.from_stream(df, schema=schema).read_all()
     expected = expected.cast(schema)
     assert table.equals(expected)
+
+
+# Minimal non-pyarrow object that only exposes ``__arrow_c_array__``.
+class ArrowArrayWrapper:
+    def __init__(self, batch):
+        self.array = batch
+
+    def __arrow_c_array__(self, requested_schema=None):
+        return self.array.__arrow_c_array__(requested_schema)
+
+
+# Minimal non-pyarrow object that only exposes ``__arrow_c_stream__``.
+class ArrowStreamWrapper:
+    def __init__(self, table):
+        self.stream = table
+
+    def __arrow_c_stream__(self, requested_schema=None):
+        return self.stream.__arrow_c_stream__(requested_schema)
+
+
+@td.skip_if_no("pyarrow", min_version="14.0")
+def test_dataframe_from_arrow():
+    # objects with __arrow_c_stream__
+    table = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+
+    result = pd.DataFrame.from_arrow(table)
+    expected = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+    tm.assert_frame_equal(result, expected)
+
+    # not only pyarrow objects are supported
+    result = pd.DataFrame.from_arrow(ArrowStreamWrapper(table))
+    tm.assert_frame_equal(result, expected)
+
+    # objects with __arrow_c_array__
+    batch = pa.record_batch([[1, 2, 3], ["a", "b", "c"]], names=["a", "b"])
+
+    result = pd.DataFrame.from_arrow(batch)
+    tm.assert_frame_equal(result, expected)
+
+    result = pd.DataFrame.from_arrow(ArrowArrayWrapper(batch))
+    tm.assert_frame_equal(result, expected)
+
+    # only accept actual Arrow objects
+    with pytest.raises(TypeError, match="Expected an Arrow-compatible tabular object"):
+        pd.DataFrame.from_arrow({"a": [1, 2, 3], "b": ["a", "b", "c"]})