diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 01a102f269886..5324d145eb3d0 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -102,11 +102,12 @@ Other enhancements - :meth:`DataFrame.to_string` added the ``max_colwidth`` parameter to control when wide columns are truncated (:issue:`9784`) - :meth:`MultiIndex.from_product` infers level names from inputs if not explicitly provided (:issue:`27292`) - :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`) -- The :ref:`integer dtype <integer_na>` with support for missing values can now be converted to - ``pyarrow`` (>= 0.15.0), which means that it is supported in writing to the Parquet file format - when using the ``pyarrow`` engine. It is currently not yet supported when converting back to - pandas (so it will become an integer or float dtype depending on the presence of missing data). - (:issue:`28368`) +- The :ref:`integer dtype <integer_na>` with support for missing values and the + new :ref:`string dtype <text.types>` can now be converted to ``pyarrow`` (>= + 0.15.0), which means that they are supported in writing to the Parquet file + format when using the ``pyarrow`` engine. Converting back to pandas is not yet + supported, so the result will be an integer or float (depending on the + presence of missing data) or object dtype column. (:issue:`28368`) - :meth:`DataFrame.to_json` now accepts an ``indent`` integer argument to enable pretty printing of JSON output (:issue:`12004`) - :meth:`read_stata` can read Stata 119 dta files.
(:issue:`28250`) - Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 87649ac651127..7c487b227de20 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -182,6 +182,16 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): return cls._from_sequence(strings, dtype=dtype, copy=copy) + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. + """ + import pyarrow as pa + + if type is None: + type = pa.string() + return pa.array(self._ndarray, type=type, from_pandas=True) + def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) if isinstance(value, type(self)): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 40221c34116ae..efe2b4e0b2deb 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas.util.testing as tm @@ -158,3 +160,14 @@ def test_reduce_missing(skipna): assert result == "abc" else: assert pd.isna(result) + + +@td.skip_if_no("pyarrow", min_version="0.15.0") +def test_arrow_array(): + # protocol added in 0.15.0 + import pyarrow as pa + + data = pd.array(["a", "b", "c"], dtype="string") + arr = pa.array(data) + expected = pa.array(list(data), type=pa.string(), from_pandas=True) + assert arr.equals(expected) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 55e25caafc4ee..793de66767cc3 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -819,7 +819,7 @@ def test_ufunc_reduce_raises(values): np.add.reduce(a) -@td.skip_if_no("pyarrow", min_version="0.14.1.dev") 
+@td.skip_if_no("pyarrow", min_version="0.15.0") def test_arrow_array(data): # protocol added in 0.15.0 import pyarrow as pa diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 2a95904d5668d..26bfefecc632d 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -504,15 +504,22 @@ def test_empty_dataframe(self, pa): df = pd.DataFrame() check_round_trip(df, pa) - @td.skip_if_no("pyarrow", min_version="0.14.1.dev") - def test_nullable_integer(self, pa): - df = pd.DataFrame({"a": pd.Series([1, 2, 3], dtype="Int64")}) - # currently de-serialized as plain int - expected = df.assign(a=df.a.astype("int64")) + @td.skip_if_no("pyarrow", min_version="0.15.0") + def test_additional_extension_arrays(self, pa): + # test additional ExtensionArrays that are supported through the + # __arrow_array__ protocol + df = pd.DataFrame( + { + "a": pd.Series([1, 2, 3], dtype="Int64"), + "b": pd.Series(["a", None, "c"], dtype="string"), + } + ) + # currently de-serialized as plain int / object + expected = df.assign(a=df.a.astype("int64"), b=df.b.astype("object")) check_round_trip(df, pa, expected=expected) df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")}) - # if missing values currently de-serialized as float + # if missing values in integer, currently de-serialized as float expected = df.assign(a=df.a.astype("float64")) check_round_trip(df, pa, expected=expected)