
Commit 0c389e6

jorisvandenbossche authored and Mateusz Górski committed
ENH: Add StringArray.__arrow_array__ for conversion to Arrow (pandas-dev#29182)
1 parent 88a3bc0 commit 0c389e6

File tree

    doc/source/whatsnew/v1.0.0.rst
    pandas/core/arrays/string_.py
    pandas/tests/arrays/string_/test_string.py
    pandas/tests/arrays/test_integer.py
    pandas/tests/io/test_parquet.py

5 files changed (+43 −12 lines)

5 files changed

+43
-12
lines changed

doc/source/whatsnew/v1.0.0.rst

+6 −5

@@ -102,11 +102,12 @@ Other enhancements
 - :meth:`DataFrame.to_string` added the ``max_colwidth`` parameter to control when wide columns are truncated (:issue:`9784`)
 - :meth:`MultiIndex.from_product` infers level names from inputs if not explicitly provided (:issue:`27292`)
 - :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`)
-- The :ref:`integer dtype <integer_na>` with support for missing values can now be converted to
-  ``pyarrow`` (>= 0.15.0), which means that it is supported in writing to the Parquet file format
-  when using the ``pyarrow`` engine. It is currently not yet supported when converting back to
-  pandas (so it will become an integer or float dtype depending on the presence of missing data).
-  (:issue:`28368`)
+- The :ref:`integer dtype <integer_na>` with support for missing values and the
+  new :ref:`string dtype <text.types>` can now be converted to ``pyarrow`` (>=
+  0.15.0), which means that it is supported in writing to the Parquet file
+  format when using the ``pyarrow`` engine. It is currently not yet supported
+  when converting back to pandas, so it will become an integer or float
+  (depending on the presence of missing data) or object dtype column. (:issue:`28368`)
 - :meth:`DataFrame.to_json` now accepts an ``indent`` integer argument to enable pretty printing of JSON output (:issue:`12004`)
 - :meth:`read_stata` can read Stata 119 dta files. (:issue:`28250`)
 - Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`)
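
As a concrete illustration of the entry above, the sketch below converts a DataFrame with nullable integer and string columns to a pyarrow Table and back (a minimal sketch, assuming pyarrow >= 0.15.0 is installed; the column names are just examples):

    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame(
        {
            "a": pd.Series([1, 2, 3], dtype="Int64"),
            "b": pd.Series(["x", None, "z"], dtype="string"),
        }
    )

    table = pa.Table.from_pandas(df)  # columns are converted via __arrow_array__
    print(table.schema)               # a: int64, b: string (plus pandas metadata)

    roundtripped = table.to_pandas()  # conversion back does not restore the nullable dtypes yet
    print(roundtripped.dtypes)        # a: int64, b: object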

pandas/core/arrays/string_.py

+10
@@ -182,6 +182,16 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
     def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
         return cls._from_sequence(strings, dtype=dtype, copy=copy)

+    def __arrow_array__(self, type=None):
+        """
+        Convert myself into a pyarrow Array.
+        """
+        import pyarrow as pa
+
+        if type is None:
+            type = pa.string()
+        return pa.array(self._ndarray, type=type, from_pandas=True)
+
     def __setitem__(self, key, value):
         value = extract_array(value, extract_numpy=True)
         if isinstance(value, type(self)):
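
With this method in place, pyarrow.array() (pyarrow >= 0.15.0) detects __arrow_array__ on the input and delegates the conversion to it instead of iterating over the values. A minimal usage sketch:

    import pandas as pd
    import pyarrow as pa

    s = pd.array(["a", "b", "c"], dtype="string")  # StringArray
    arr = pa.array(s)   # pyarrow calls s.__arrow_array__() under the hood
    print(arr.type)     # string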

pandas/tests/arrays/string_/test_string.py

+13
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest

+import pandas.util._test_decorators as td
+
 import pandas as pd
 import pandas.util.testing as tm

@@ -158,3 +160,14 @@ def test_reduce_missing(skipna):
         assert result == "abc"
     else:
         assert pd.isna(result)
+
+
+@td.skip_if_no("pyarrow", min_version="0.15.0")
+def test_arrow_array():
+    # protocol added in 0.15.0
+    import pyarrow as pa
+
+    data = pd.array(["a", "b", "c"], dtype="string")
+    arr = pa.array(data)
+    expected = pa.array(list(data), type=pa.string(), from_pandas=True)
+    assert arr.equals(expected)
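
A note on the expected value in this test: the from_pandas=True flag tells pyarrow to treat pandas missing-value markers (such as NaN in an object array) as nulls, which is also why __arrow_array__ passes it when converting the underlying ndarray. A small sketch of the effect (assuming pyarrow >= 0.15.0; the values are illustrative):

    import numpy as np
    import pyarrow as pa

    values = np.array(["a", np.nan, "c"], dtype=object)
    arr = pa.array(values, type=pa.string(), from_pandas=True)  # NaN becomes a null
    print(arr.null_count)  # 1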

pandas/tests/arrays/test_integer.py

+1 −1

@@ -819,7 +819,7 @@ def test_ufunc_reduce_raises(values):
        np.add.reduce(a)


-@td.skip_if_no("pyarrow", min_version="0.14.1.dev")
+@td.skip_if_no("pyarrow", min_version="0.15.0")
 def test_arrow_array(data):
     # protocol added in 0.15.0
     import pyarrow as pa

pandas/tests/io/test_parquet.py

+13 −6

@@ -504,15 +504,22 @@ def test_empty_dataframe(self, pa):
         df = pd.DataFrame()
         check_round_trip(df, pa)

-    @td.skip_if_no("pyarrow", min_version="0.14.1.dev")
-    def test_nullable_integer(self, pa):
-        df = pd.DataFrame({"a": pd.Series([1, 2, 3], dtype="Int64")})
-        # currently de-serialized as plain int
-        expected = df.assign(a=df.a.astype("int64"))
+    @td.skip_if_no("pyarrow", min_version="0.15.0")
+    def test_additional_extension_arrays(self, pa):
+        # test additional ExtensionArrays that are supported through the
+        # __arrow_array__ protocol
+        df = pd.DataFrame(
+            {
+                "a": pd.Series([1, 2, 3], dtype="Int64"),
+                "b": pd.Series(["a", None, "c"], dtype="string"),
+            }
+        )
+        # currently de-serialized as plain int / object
+        expected = df.assign(a=df.a.astype("int64"), b=df.b.astype("object"))
         check_round_trip(df, pa, expected=expected)

         df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")})
-        # if missing values currently de-serialized as float
+        # if missing values in integer, currently de-serialized as float
         expected = df.assign(a=df.a.astype("float64"))
         check_round_trip(df, pa, expected=expected)
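
The user-facing behaviour exercised by check_round_trip above can be sketched as follows (a minimal example with a hypothetical file path, assuming pyarrow >= 0.15.0):

    import pandas as pd

    df = pd.DataFrame(
        {
            "a": pd.Series([1, 2, 3], dtype="Int64"),
            "b": pd.Series(["a", None, "c"], dtype="string"),
        }
    )
    df.to_parquet("example.parquet", engine="pyarrow")

    result = pd.read_parquet("example.parquet", engine="pyarrow")
    print(result.dtypes)  # a: int64, b: object -- the nullable dtypes are not restored yet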
