
Commit 0c389e6

jorisvandenbossche authored and Mateusz Górski committed
ENH: Add StringArray.__arrow_array__ for conversion to Arrow (pandas-dev#29182)
1 parent 88a3bc0 commit 0c389e6

File tree

    doc/source/whatsnew/v1.0.0.rst
    pandas/core/arrays/string_.py
    pandas/tests/arrays/string_/test_string.py
    pandas/tests/arrays/test_integer.py
    pandas/tests/io/test_parquet.py

5 files changed (+43 −12 lines)

5 files changed

+43
-12
lines changed

doc/source/whatsnew/v1.0.0.rst

+6 −5

@@ -102,11 +102,12 @@ Other enhancements
 - :meth:`DataFrame.to_string` added the ``max_colwidth`` parameter to control when wide columns are truncated (:issue:`9784`)
 - :meth:`MultiIndex.from_product` infers level names from inputs if not explicitly provided (:issue:`27292`)
 - :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`)
-- The :ref:`integer dtype <integer_na>` with support for missing values can now be converted to
-  ``pyarrow`` (>= 0.15.0), which means that it is supported in writing to the Parquet file format
-  when using the ``pyarrow`` engine. It is currently not yet supported when converting back to
-  pandas (so it will become an integer or float dtype depending on the presence of missing data).
-  (:issue:`28368`)
+- The :ref:`integer dtype <integer_na>` with support for missing values and the
+  new :ref:`string dtype <text.types>` can now be converted to ``pyarrow`` (>=
+  0.15.0), which means that it is supported in writing to the Parquet file
+  format when using the ``pyarrow`` engine. It is currently not yet supported
+  when converting back to pandas, so it will become an integer or float
+  (depending on the presence of missing data) or object dtype column. (:issue:`28368`)
 - :meth:`DataFrame.to_json` now accepts an ``indent`` integer argument to enable pretty printing of JSON output (:issue:`12004`)
 - :meth:`read_stata` can read Stata 119 dta files. (:issue:`28250`)
 - Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`)
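
As a concrete illustration of the entry above, the sketch below converts a DataFrame with nullable integer and string columns to a pyarrow Table and back (a minimal sketch, assuming pyarrow >= 0.15.0 is installed; the column names are just examples):

    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame(
        {
            "a": pd.Series([1, 2, 3], dtype="Int64"),
            "b": pd.Series(["x", None, "z"], dtype="string"),
        }
    )

    table = pa.Table.from_pandas(df)  # columns are converted via __arrow_array__
    print(table.schema)               # a: int64, b: string (plus pandas metadata)

    roundtripped = table.to_pandas()  # conversion back does not restore the nullable dtypes yet
    print(roundtripped.dtypes)        # a: int64, b: object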

pandas/core/arrays/string_.py

+10
@@ -182,6 +182,16 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
     def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
         return cls._from_sequence(strings, dtype=dtype, copy=copy)

+    def __arrow_array__(self, type=None):
+        """
+        Convert myself into a pyarrow Array.
+        """
+        import pyarrow as pa
+
+        if type is None:
+            type = pa.string()
+        return pa.array(self._ndarray, type=type, from_pandas=True)
+
     def __setitem__(self, key, value):
         value = extract_array(value, extract_numpy=True)
         if isinstance(value, type(self)):
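
With this method in place, pyarrow.array() (pyarrow >= 0.15.0) detects __arrow_array__ on the input and delegates the conversion to it instead of iterating over the values. A minimal usage sketch:

    import pandas as pd
    import pyarrow as pa

    s = pd.array(["a", "b", "c"], dtype="string")  # StringArray
    arr = pa.array(s)   # pyarrow calls s.__arrow_array__() under the hood
    print(arr.type)     # string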

pandas/tests/arrays/string_/test_string.py

+13
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest

+import pandas.util._test_decorators as td
+
 import pandas as pd
 import pandas.util.testing as tm

@@ -158,3 +160,14 @@ def test_reduce_missing(skipna):
         assert result == "abc"
     else:
         assert pd.isna(result)
+
+
+@td.skip_if_no("pyarrow", min_version="0.15.0")
+def test_arrow_array():
+    # protocol added in 0.15.0
+    import pyarrow as pa
+
+    data = pd.array(["a", "b", "c"], dtype="string")
+    arr = pa.array(data)
+    expected = pa.array(list(data), type=pa.string(), from_pandas=True)
+    assert arr.equals(expected)
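
A note on the expected value in this test: the from_pandas=True flag tells pyarrow to treat pandas missing-value markers (such as NaN in an object array) as nulls, which is also why __arrow_array__ passes it when converting the underlying ndarray. A small sketch of the effect (assuming pyarrow >= 0.15.0; the values are illustrative):

    import numpy as np
    import pyarrow as pa

    values = np.array(["a", np.nan, "c"], dtype=object)
    arr = pa.array(values, type=pa.string(), from_pandas=True)  # NaN becomes a null
    print(arr.null_count)  # 1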

pandas/tests/arrays/test_integer.py

+1 −1

@@ -819,7 +819,7 @@ def test_ufunc_reduce_raises(values):
        np.add.reduce(a)


-@td.skip_if_no("pyarrow", min_version="0.14.1.dev")
+@td.skip_if_no("pyarrow", min_version="0.15.0")
 def test_arrow_array(data):
     # protocol added in 0.15.0
     import pyarrow as pa

pandas/tests/io/test_parquet.py

+13 −6

@@ -504,15 +504,22 @@ def test_empty_dataframe(self, pa):
         df = pd.DataFrame()
         check_round_trip(df, pa)

-    @td.skip_if_no("pyarrow", min_version="0.14.1.dev")
-    def test_nullable_integer(self, pa):
-        df = pd.DataFrame({"a": pd.Series([1, 2, 3], dtype="Int64")})
-        # currently de-serialized as plain int
-        expected = df.assign(a=df.a.astype("int64"))
+    @td.skip_if_no("pyarrow", min_version="0.15.0")
+    def test_additional_extension_arrays(self, pa):
+        # test additional ExtensionArrays that are supported through the
+        # __arrow_array__ protocol
+        df = pd.DataFrame(
+            {
+                "a": pd.Series([1, 2, 3], dtype="Int64"),
+                "b": pd.Series(["a", None, "c"], dtype="string"),
+            }
+        )
+        # currently de-serialized as plain int / object
+        expected = df.assign(a=df.a.astype("int64"), b=df.b.astype("object"))
         check_round_trip(df, pa, expected=expected)

         df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")})
-        # if missing values currently de-serialized as float
+        # if missing values in integer, currently de-serialized as float
         expected = df.assign(a=df.a.astype("float64"))
         check_round_trip(df, pa, expected=expected)
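
The user-facing behaviour exercised by check_round_trip above can be sketched as follows (a minimal example with a hypothetical file path, assuming pyarrow >= 0.15.0):

    import pandas as pd

    df = pd.DataFrame(
        {
            "a": pd.Series([1, 2, 3], dtype="Int64"),
            "b": pd.Series(["a", None, "c"], dtype="string"),
        }
    )
    df.to_parquet("example.parquet", engine="pyarrow")

    result = pd.read_parquet("example.parquet", engine="pyarrow")
    print(result.dtypes)  # a: int64, b: object -- the nullable dtypes are not restored yet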
