pandas-dev · mroeschke · Aug 18, 2022 · Jul 25, 2022 · Jul 26, 2022 · Jul 29, 2022
diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst
@@ -19,19 +19,20 @@ objects contained with a :class:`Index`, :class:`Series`, or
 For some data types, pandas extends NumPy's type system. String aliases for these types
 can be found at :ref:`basics.dtypes`.
 
-=================== ========================= ================== =============================
-Kind of Data        pandas Data Type          Scalar             Array
-=================== ========================= ================== =============================
-TZ-aware datetime   :class:`DatetimeTZDtype`  :class:`Timestamp` :ref:`api.arrays.datetime`
-Timedeltas          (none)                    :class:`Timedelta` :ref:`api.arrays.timedelta`
-Period (time spans) :class:`PeriodDtype`      :class:`Period`    :ref:`api.arrays.period`
-Intervals           :class:`IntervalDtype`    :class:`Interval`  :ref:`api.arrays.interval`
-Nullable Integer    :class:`Int64Dtype`, ...  (none)             :ref:`api.arrays.integer_na`
-Categorical         :class:`CategoricalDtype` (none)             :ref:`api.arrays.categorical`
-Sparse              :class:`SparseDtype`      (none)             :ref:`api.arrays.sparse`
-Strings             :class:`StringDtype`      :class:`str`       :ref:`api.arrays.string`
-Boolean (with NA)   :class:`BooleanDtype`     :class:`bool`      :ref:`api.arrays.bool`
-=================== ========================= ================== =============================
+=================== ========================= ============================= =============================
+Kind of Data        pandas Data Type          Scalar                        Array
+=================== ========================= ============================= =============================
+PyArrow             :class:`ArrowDtype`       Python Scalars or :class:`NA` :ref:`api.arrays.arrow`
+TZ-aware datetime   :class:`DatetimeTZDtype`  :class:`Timestamp`            :ref:`api.arrays.datetime`
+Timedeltas          (none)                    :class:`Timedelta`            :ref:`api.arrays.timedelta`
+Period (time spans) :class:`PeriodDtype`      :class:`Period`               :ref:`api.arrays.period`
+Intervals           :class:`IntervalDtype`    :class:`Interval`             :ref:`api.arrays.interval`
+Nullable Integer    :class:`Int64Dtype`, ...  (none)                        :ref:`api.arrays.integer_na`
+Categorical         :class:`CategoricalDtype` (none)                        :ref:`api.arrays.categorical`
+Sparse              :class:`SparseDtype`      (none)                        :ref:`api.arrays.sparse`
+Strings             :class:`StringDtype`      :class:`str`                  :ref:`api.arrays.string`
+Boolean (with NA)   :class:`BooleanDtype`     :class:`bool`                 :ref:`api.arrays.bool`
+=================== ========================= ============================= =============================
 
 pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`).
 The top-level :meth:`array` method can be used to create a new array, which may be
@@ -42,6 +43,40 @@ stored in a :class:`Series`, :class:`Index`, or as a column in a :class:`DataFra
 
    array
 
+.. _api.arrays.arrow:
+
+PyArrow
+-------
+
+The :class:`arrays.ArrowExtensionArray` is backed by a :external+pyarrow:py:class:`pyarrow.ChunkedArray` with a
+:external+pyarrow:py:class:`pyarrow.DataType` instead of a NumPy array and data type. The ``.dtype`` of an :class:`arrays.ArrowExtensionArray`
+is an :class:`ArrowDtype`.
+
+`PyArrow <https://arrow.apache.org/docs/python/index.html>`__ provides array and `data type <https://arrow.apache.org/docs/python/api/datatypes.html>`__
+support similar to NumPy, including first-class nullability support for all data types, immutability and more.
+
+.. note::
+
+    For string types (``pyarrow.string()``, ``string[pyarrow]``), PyArrow support is still facilitated
+    by :class:`arrays.ArrowStringArray` and ``StringDtype("pyarrow")``. See the :ref:`string section <api.arrays.string>`
+    below.
+
+While individual values in an :class:`arrays.ArrowExtensionArray` are stored as PyArrow objects, scalars are **returned**
+as Python scalars corresponding to the data type, e.g. a PyArrow int64 will be returned as Python int, or :class:`NA` for missing
+values.
+
+.. autosummary::
+   :toctree: api/
+   :template: autosummary/class_without_autosummary.rst
+
+   arrays.ArrowExtensionArray
+
+.. autosummary::
+   :toctree: api/
+   :template: autosummary/class_without_autosummary.rst
+
+   ArrowDtype
+
 .. _api.arrays.datetime:
 
 Datetimes

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -14,6 +14,38 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
+.. _whatsnew_150.enhancements.arrow:
+
+Native PyArrow-backed ExtensionArray
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+With `PyArrow <https://arrow.apache.org/docs/python/index.html>`__ installed, users can now create pandas objects
+that are backed by a ``pyarrow.ChunkedArray`` and ``pyarrow.DataType``.
+
+The ``dtype`` argument can accept a string of a `pyarrow data type <https://arrow.apache.org/docs/python/api/datatypes.html>`__
+with ``pyarrow`` in brackets, e.g. ``int64[pyarrow]``, or, for pyarrow data types that take parameters, an :class:`ArrowDtype`
+initialized with a ``pyarrow.DataType``.
+
+.. ipython:: python
+
+    import pyarrow as pa
+    ser_float = pd.Series([1.0, 2.0, None], dtype="float32[pyarrow]")
+    ser_float
+
+    list_of_int_type = pd.ArrowDtype(pa.list_(pa.int64()))
+    ser_list = pd.Series([[1, 2], [3, None]], dtype=list_of_int_type)
+    ser_list
+
+Most operations are supported and have been implemented using `pyarrow compute <https://arrow.apache.org/docs/python/api/compute.html>`__ functions.
+We recommend installing the latest version of PyArrow to access the most recently implemented compute functions.
+
+.. ipython:: python
+
+    ser_list.take([1, 0])
+    ser_float * 5
+    ser_float.mean()
+    ser_float.dropna()
+
 .. _whatsnew_150.enhancements.dataframe_exchange:
 
 DataFrame exchange protocol implementation

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -159,8 +159,25 @@ def to_pyarrow_type(
 
 class ArrowExtensionArray(OpsMixin, ExtensionArray):
     """
-    Base class for ExtensionArray backed by Arrow ChunkedArray.
-    """
+    Pandas ExtensionArray backed by a PyArrow ChunkedArray.
+
+    Parameters
+    ----------
+    values : pyarrow.Array or pyarrow.ChunkedArray
+
+    Returns
+    -------
+    ArrowExtensionArray
+
+    Notes
+    -----
+    Most methods are implemented using `pyarrow compute functions <https://arrow.apache.org/docs/python/api/compute.html>`__.
+    Some methods may either raise an exception or raise a ``PerformanceWarning`` if an
+    associated compute function is not available based on the installed version of PyArrow.
+
+    Please install the latest version of PyArrow to enable the best functionality and avoid
+    potential bugs in prior versions of PyArrow.
+    """  # noqa: E501 (http link too long)
 
     _data: pa.ChunkedArray
     _dtype: ArrowDtype

diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py
@@ -17,9 +17,34 @@
 @register_extension_dtype
 class ArrowDtype(StorageExtensionDtype):
     """
-    Base class for dtypes for ArrowExtensionArray.
-    Modeled after BaseMaskedDtype
-    """
+    An ExtensionDtype for PyArrow data types.
+
+    While most ``dtype`` arguments can accept the "string"
+    constructor, e.g. ``int64[pyarrow]``, ArrowDtype is useful
+    if the data type contains parameters like ``pyarrow.timestamp``.
+
+    Parameters
+    ----------
+    pyarrow_dtype : pa.DataType
+        An instance of a `pyarrow.DataType <https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions>`__.
+
+    Returns
+    -------
+    ArrowDtype
+
+    Examples
+    --------
+    >>> import pyarrow as pa
+    >>> pd.ArrowDtype(pa.int64())
+    int64[pyarrow]
+
+    Types with parameters must be constructed with ArrowDtype.
+
+    >>> pd.ArrowDtype(pa.timestamp("s", tz="America/New_York"))
+    timestamp[s, tz=America/New_York][pyarrow]
+    >>> pd.ArrowDtype(pa.list_(pa.int64()))
+    list<item: int64>[pyarrow]
+    """  # noqa: E501
 
     _metadata = ("storage", "pyarrow_dtype")  # type: ignore[assignment]
 

diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py
@@ -150,6 +150,7 @@
     "LZMA",
     "Numba",
     "Timestamp",
+    "PyArrow",
 }
 
 CAP_EXCEPTIONS_DICT = {word.lower(): word for word in CAPITALIZATION_EXCEPTIONS}