From f09a14a181f821a82fffcc60008ff3a25cb823de Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 24 Mar 2023 22:09:03 -0700 Subject: [PATCH] Backport PR #52184: DOC: Clarify difference between StringDtype(pyarrow) and ArrowDtype(string) --- doc/source/reference/arrays.rst | 7 ++++--- doc/source/user_guide/pyarrow.rst | 20 +++++++++++++++++++- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index edcd3d2a40b1a..54e49448daca8 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -93,9 +93,10 @@ PyArrow type pandas extension type NumPy .. note:: - For string types (``pyarrow.string()``, ``string[pyarrow]``), PyArrow support is still facilitated - by :class:`arrays.ArrowStringArray` and ``StringDtype("pyarrow")``. See the :ref:`string section <api.arrays.string>` - below. + Pyarrow-backed string support is provided by both ``pd.StringDtype("pyarrow")`` and ``pd.ArrowDtype(pa.string())``. + ``pd.StringDtype("pyarrow")`` is described below in the :ref:`string section <api.arrays.string>` + and will be returned if the string alias ``"string[pyarrow]"`` is specified. ``pd.ArrowDtype(pa.string())`` + generally has better interoperability with :class:`ArrowDtype` of different types. While individual values in an :class:`arrays.ArrowExtensionArray` are stored as a PyArrow objects, scalars are **returned** as Python scalars corresponding to the data type, e.g. a PyArrow int64 will be returned as Python int, or :class:`NA` for missing diff --git a/doc/source/user_guide/pyarrow.rst b/doc/source/user_guide/pyarrow.rst index 8531216ecc61e..63937ed27b8b2 100644 --- a/doc/source/user_guide/pyarrow.rst +++ b/doc/source/user_guide/pyarrow.rst @@ -35,6 +35,23 @@ which is similar to a NumPy array. To construct these from the main pandas data df = pd.DataFrame([[1, 2], [3, 4]], dtype="uint64[pyarrow]") df +.. 
note:: + + The string alias ``"string[pyarrow]"`` maps to ``pd.StringDtype("pyarrow")`` which is not equivalent to + specifying ``dtype=pd.ArrowDtype(pa.string())``. Generally, operations on the data will behave similarly + except ``pd.StringDtype("pyarrow")`` can return NumPy-backed nullable types while ``pd.ArrowDtype(pa.string())`` + will return :class:`ArrowDtype`. + + .. ipython:: python + + import pyarrow as pa + data = list("abc") + ser_sd = pd.Series(data, dtype="string[pyarrow]") + ser_ad = pd.Series(data, dtype=pd.ArrowDtype(pa.string())) + ser_ad.dtype == ser_sd.dtype + ser_sd.str.contains("a") + ser_ad.str.contains("a") + For PyArrow types that accept parameters, you can pass in a PyArrow type with those parameters into :class:`ArrowDtype` to use in the ``dtype`` parameter. @@ -106,6 +123,7 @@ The following are just some examples of operations that are accelerated by nativ .. ipython:: python + import pyarrow as pa ser = pd.Series([-1.545, 0.211, None], dtype="float32[pyarrow]") ser.mean() ser + ser @@ -115,7 +133,7 @@ The following are just some examples of operations that are accelerated by nativ ser.isna() ser.fillna(0) - ser_str = pd.Series(["a", "b", None], dtype="string[pyarrow]") + ser_str = pd.Series(["a", "b", None], dtype=pd.ArrowDtype(pa.string())) ser_str.str.startswith("a") from datetime import datetime