From 84543aff080192b0b1c50da1dd1202c5fbabaa4b Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Fri, 23 Aug 2024 18:10:43 +0100
Subject: [PATCH 1/3] ENH: support Arrow PyCapsule Interface on Series for export

---
 doc/source/whatsnew/v3.0.0.rst              |  1 +
 pandas/core/series.py                       | 34 +++++++++++++++++++++
 pandas/tests/series/test_arrow_interface.py | 26 ++++++++++++++++
 3 files changed, 61 insertions(+)
 create mode 100644 pandas/tests/series/test_arrow_interface.py

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 1533f9267ce39..eaf9ce899f03a 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -43,6 +43,7 @@ Other enhancements
 - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
 - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
 - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
+- :class:`Series` now supports the Arrow PyCapsule Interface for export (:issue:`59518`)
 - :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header header cells (:issue:`35384`)
 - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
 - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 5c35c6c0d6d23..7a6f83156699c 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -34,6 +34,7 @@
 from pandas._libs.lib import is_range_indexer
 from pandas.compat import PYPY
 from pandas.compat._constants import REF_COUNT
+from pandas.compat._optional import import_optional_dependency
 from pandas.compat.numpy import function as nv
 from pandas.errors import (
     ChainedAssignmentError,
@@ -558,6 +559,39 @@ def _init_dict(
 
     # ----------------------------------------------------------------------
 
+    def __arrow_c_stream__(self, requested_schema=None):
+        """
+        Export the pandas Series as an Arrow C stream PyCapsule.
+
+        This relies on pyarrow to convert the pandas Series to the Arrow
+        format (and follows the default behaviour of ``pyarrow.Array.from_pandas``
+        in its handling of the index, i.e. to ignore it).
+        This conversion is not necessarily zero-copy.
+
+        Parameters
+        ----------
+        requested_schema : PyCapsule, default None
+            The schema to which the Series should be cast, passed as a
+            PyCapsule containing a C ArrowSchema representation of the
+            requested schema.
+
+        Returns
+        -------
+        PyCapsule
+        """
+        pa = import_optional_dependency("pyarrow", min_version="16.0.0")
+        if requested_schema is not None:
+            # todo: how should this be supported?
+            msg = (
+                "Passing `requested_schema` to `Series.__arrow_c_stream__` is not yet "
+                "supported"
+            )
+            raise NotImplementedError(msg)
+        ca = pa.chunked_array([pa.Array.from_pandas(self, type=requested_schema)])
+        return ca.__arrow_c_stream__()
+
+    # ----------------------------------------------------------------------
+
     @property
     def _constructor(self) -> type[Series]:
         return Series
diff --git a/pandas/tests/series/test_arrow_interface.py b/pandas/tests/series/test_arrow_interface.py
new file mode 100644
index 0000000000000..d649f7be76274
--- /dev/null
+++ b/pandas/tests/series/test_arrow_interface.py
@@ -0,0 +1,26 @@
+import ctypes
+
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+
+pa = pytest.importorskip("pyarrow")
+
+
+@td.skip_if_no("pyarrow", min_version="16.0")
+def test_series_arrow_interface():
+    s = pd.Series([1, 4, 2])
+
+    capsule = s.__arrow_c_stream__()
+    assert (
+        ctypes.pythonapi.PyCapsule_IsValid(
+            ctypes.py_object(capsule), b"arrow_array_stream"
+        )
+        == 1
+    )
+
+    ca = pa.chunked_array(s)
+    expected = pa.chunked_array([[1, 4, 2]])
+    assert ca.equals(expected)

From 5b3eb17ea42ad26f43fc910565cf366daedaed79 Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Fri, 23 Aug 2024 19:14:08 +0100
Subject: [PATCH 2/3] simplify

---
 pandas/core/series.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/pandas/core/series.py b/pandas/core/series.py
index 7a6f83156699c..8933b8c5d92bf 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -580,15 +580,10 @@ def __arrow_c_stream__(self, requested_schema=None):
         PyCapsule
         """
         pa = import_optional_dependency("pyarrow", min_version="16.0.0")
-        if requested_schema is not None:
-            # todo: how should this be supported?
-            msg = (
-                "Passing `requested_schema` to `Series.__arrow_c_stream__` is not yet "
-                "supported"
-            )
-            raise NotImplementedError(msg)
-        ca = pa.chunked_array([pa.Array.from_pandas(self, type=requested_schema)])
-        return ca.__arrow_c_stream__()
+        # ``requested_schema`` is a schema PyCapsule, not a ``pyarrow.DataType``, so
+        # the cast is delegated to ``ChunkedArray.__arrow_c_stream__``.
+        ca = pa.chunked_array([pa.Array.from_pandas(self)])
+        return ca.__arrow_c_stream__(requested_schema)
 
     # ----------------------------------------------------------------------
 

From 6db2b8ef29520cc58a944ef0f57de201039c5c31 Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Fri, 23 Aug 2024 19:14:39 +0100
Subject: [PATCH 3/3] simplify

---
 pandas/tests/series/test_arrow_interface.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pandas/tests/series/test_arrow_interface.py b/pandas/tests/series/test_arrow_interface.py
index d649f7be76274..34a2a638e4185 100644
--- a/pandas/tests/series/test_arrow_interface.py
+++ b/pandas/tests/series/test_arrow_interface.py
@@ -2,14 +2,11 @@
 
 import pytest
 
-import pandas.util._test_decorators as td
-
 import pandas as pd
 
-pa = pytest.importorskip("pyarrow")
+pa = pytest.importorskip("pyarrow", minversion="16.0")
 
 
-@td.skip_if_no("pyarrow", min_version="16.0")
 def test_series_arrow_interface():
     s = pd.Series([1, 4, 2])
 