Skip to content

Switch arrow type for string array to large string #56220

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Dec 21, 2023
Merged
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,8 @@ Other enhancements
- Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`)
- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as "BMS" (:issue:`56243`)
- Improved error message when constructing :class:`Period` with invalid offsets such as "QS" (:issue:`55785`)
- The dtypes ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`)


.. ---------------------------------------------------------------------------
.. _whatsnew_220.notable_bug_fixes:
Expand Down
21 changes: 16 additions & 5 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@ def _from_sequence_of_strings(
pa_type is None
or pa.types.is_binary(pa_type)
or pa.types.is_string(pa_type)
or pa.types.is_large_string(pa_type)
):
# pa_type is None: Let pa.array infer
# pa_type is string/binary: scalars already correct type
Expand Down Expand Up @@ -632,7 +633,9 @@ def __invert__(self) -> Self:
# This is a bit wise op for integer types
if pa.types.is_integer(self._pa_array.type):
return type(self)(pc.bit_wise_not(self._pa_array))
elif pa.types.is_string(self._pa_array.type):
elif pa.types.is_string(self._pa_array.type) or pa.types.is_large_string(
self._pa_array.type
):
# Raise TypeError instead of pa.ArrowNotImplementedError
raise TypeError("__invert__ is not supported for string dtypes")
else:
Expand Down Expand Up @@ -692,7 +695,11 @@ def _evaluate_op_method(self, other, op, arrow_funcs):
pa_type = self._pa_array.type
other = self._box_pa(other)

if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type):
if (
pa.types.is_string(pa_type)
or pa.types.is_large_string(pa_type)
or pa.types.is_binary(pa_type)
):
if op in [operator.add, roperator.radd]:
sep = pa.scalar("", type=pa_type)
if op is operator.add:
Expand All @@ -709,7 +716,9 @@ def _evaluate_op_method(self, other, op, arrow_funcs):
result = pc.binary_repeat(binary, pa_integral)
return type(self)(result)
elif (
pa.types.is_string(other.type) or pa.types.is_binary(other.type)
pa.types.is_string(other.type)
or pa.types.is_binary(other.type)
or pa.types.is_large_string(other.type)
) and op in [operator.mul, roperator.rmul]:
binary = other
integral = self._pa_array
Expand Down Expand Up @@ -1471,7 +1480,7 @@ def _concat_same_type(cls, to_concat) -> Self:
chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
if to_concat[0].dtype == "string":
# StringDtype has no attribute pyarrow_dtype
pa_dtype = pa.string()
pa_dtype = pa.large_string()
else:
pa_dtype = to_concat[0].dtype.pyarrow_dtype
arr = pa.chunked_array(chunks, type=pa_dtype)
Expand Down Expand Up @@ -2253,7 +2262,9 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
return type(self)(result)

def _str_join(self, sep: str):
if pa.types.is_string(self._pa_array.type):
if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string(
self._pa_array.type
):
result = self._apply_elementwise(list)
result = pa.chunked_array(result, type=pa.list_(pa.string()))
else:
Expand Down
38 changes: 26 additions & 12 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,17 +126,40 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
_storage = "pyarrow"

def __init__(self, values) -> None:
_chk_pyarrow_available()
if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string(
values.type
):
values = pc.cast(values, pa.large_string())

super().__init__(values)
self._dtype = StringDtype(storage=self._storage)

if not pa.types.is_string(self._pa_array.type) and not (
if not pa.types.is_large_string(self._pa_array.type) and not (
pa.types.is_dictionary(self._pa_array.type)
and pa.types.is_string(self._pa_array.type.value_type)
and pa.types.is_large_string(self._pa_array.type.value_type)
):
raise ValueError(
"ArrowStringArray requires a PyArrow (chunked) array of string type"
"ArrowStringArray requires a PyArrow (chunked) array of "
"large_string type"
)

# Box ``value`` into a PyArrow scalar using the parent implementation, then
# upcast a ``string``-typed result to ``large_string`` when the caller did not
# request a specific ``pa_type``.
@classmethod
def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
pa_scalar = super()._box_pa_scalar(value, pa_type)
# Only upcast when no explicit type was requested; with an explicit
# ``pa_type`` the parent's result is returned unchanged.
if pa.types.is_string(pa_scalar.type) and pa_type is None:
pa_scalar = pc.cast(pa_scalar, pa.large_string())
return pa_scalar

# Box ``value`` into a PyArrow (chunked) array using the parent implementation,
# then upcast a ``string``-typed result to ``large_string`` when the caller did
# not request a specific ``pa_type``.
# NOTE(review): ``copy`` is accepted here but is not passed to the parent call
# below, so the parent runs with its own default -- confirm this is intended.
@classmethod
def _box_pa_array(
cls, value, pa_type: pa.DataType | None = None, copy: bool = False
) -> pa.Array | pa.ChunkedArray:
pa_array = super()._box_pa_array(value, pa_type)
# Only upcast when no explicit type was requested; with an explicit
# ``pa_type`` the parent's result is returned unchanged.
if pa.types.is_string(pa_array.type) and pa_type is None:
pa_array = pc.cast(pa_array, pa.large_string())
return pa_array

def __len__(self) -> int:
"""
Length of this array.
Expand Down Expand Up @@ -574,15 +597,6 @@ def _rank(
class ArrowStringArrayNumpySemantics(ArrowStringArray):
_storage = "pyarrow_numpy"

def __init__(self, values) -> None:
_chk_pyarrow_available()

if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_large_string(
values.type
):
values = pc.cast(values, pa.string())
super().__init__(values)

@classmethod
def _result_converter(cls, values, na=None):
if not isna(na):
Expand Down
14 changes: 11 additions & 3 deletions pandas/io/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,17 @@ def _convert_arrays_to_dataframe(
)
if dtype_backend == "pyarrow":
pa = import_optional_dependency("pyarrow")
arrays = [
ArrowExtensionArray(pa.array(arr, from_pandas=True)) for arr in arrays
]

result_arrays = []
for arr in arrays:
pa_array = pa.array(arr, from_pandas=True)
if arr.dtype == "string":
# TODO: Arrow still infers string arrays as regular strings instead
# of large_string, which is what we preserve everywhere else for
# dtype_backend="pyarrow". We may want to reconsider this
pa_array = pa_array.cast(pa.string())
result_arrays.append(ArrowExtensionArray(pa_array))
arrays = result_arrays # type: ignore[assignment]
if arrays:
df = DataFrame(dict(zip(list(range(len(columns))), arrays)))
df.columns = columns
Expand Down
16 changes: 12 additions & 4 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,13 +487,15 @@ def test_fillna_args(dtype, arrow_string_storage):
def test_arrow_array(dtype):
# protocol added in 0.15.0
pa = pytest.importorskip("pyarrow")
import pyarrow.compute as pc

data = pd.array(["a", "b", "c"], dtype=dtype)
arr = pa.array(data)
expected = pa.array(list(data), type=pa.string(), from_pandas=True)
expected = pa.array(list(data), type=pa.large_string(), from_pandas=True)
if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0:
expected = pa.chunked_array(expected)

if dtype.storage == "python":
expected = pc.cast(expected, pa.string())
assert arr.equals(expected)


Expand All @@ -512,7 +514,10 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
data = pd.array(["a", "b", None], dtype=dtype)
df = pd.DataFrame({"a": data})
table = pa.table(df)
assert table.field("a").type == "string"
if dtype.storage == "python":
assert table.field("a").type == "string"
else:
assert table.field("a").type == "large_string"
with pd.option_context("string_storage", string_storage2):
result = table.to_pandas()
assert isinstance(result["a"].dtype, pd.StringDtype)
Expand All @@ -539,7 +544,10 @@ def test_arrow_load_from_zero_chunks(
data = pd.array([], dtype=dtype)
df = pd.DataFrame({"a": data})
table = pa.table(df)
assert table.field("a").type == "string"
if dtype.storage == "python":
assert table.field("a").type == "string"
else:
assert table.field("a").type == "large_string"
# Instantiate the same table with no chunks at all
table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
with pd.option_context("string_storage", string_storage2):
Expand Down
9 changes: 6 additions & 3 deletions pandas/tests/arrays/string_/test_string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage
msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowExtensionArray"
else:
msg = re.escape(
"ArrowStringArray requires a PyArrow (chunked) array of string type"
"ArrowStringArray requires a PyArrow (chunked) array of large_string type"
)
with pytest.raises(ValueError, match=msg):
ArrowStringArray(arr)
Expand All @@ -76,17 +76,20 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked):
arr = pa.chunked_array(arr)

msg = re.escape(
"ArrowStringArray requires a PyArrow (chunked) array of string type"
"ArrowStringArray requires a PyArrow (chunked) array of large_string type"
)
with pytest.raises(ValueError, match=msg):
ArrowStringArray(arr)


@pytest.mark.xfail(
reason="dict conversion does not seem to be implemented for large string in arrow"
)
@pytest.mark.parametrize("chunked", [True, False])
def test_constructor_valid_string_type_value_dictionary(chunked):
pa = pytest.importorskip("pyarrow")

arr = pa.array(["1", "2", "3"], pa.dictionary(pa.int32(), pa.utf8()))
arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode()
if chunked:
arr = pa.chunked_array(arr)

Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2054,6 +2054,13 @@ def test_read_json_dtype_backend(
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))

elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))

else:
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/io/parser/test_read_fwf.py
Original file line number Diff line number Diff line change
Expand Up @@ -971,6 +971,12 @@ def test_dtype_backend(string_storage, dtype_backend):
if string_storage == "python":
arr = StringArray(np.array(["a", "b"], dtype=np.object_))
arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_))
elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

arr = ArrowExtensionArray(pa.array(["a", "b"]))
arr_na = ArrowExtensionArray(pa.array([None, "a"]))
else:
pa = pytest.importorskip("pyarrow")
arr = ArrowStringArray(pa.array(["a", "b"]))
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/io/test_clipboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,13 @@ def test_read_clipboard_dtype_backend(
string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))

elif dtype_backend == "pyarrow" and engine != "c":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["x", "y"]))
string_array_na = ArrowExtensionArray(pa.array(["x", None]))

else:
string_array = ArrowStringArray(pa.array(["x", "y"]))
string_array_na = ArrowStringArray(pa.array(["x", None]))
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/io/test_feather.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,12 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))

elif dtype_backend == "pyarrow":
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))

else:
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/io/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,12 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html):
if string_storage == "python":
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
else:
pa = pytest.importorskip("pyarrow")
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/io/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -3647,6 +3647,13 @@ def func(storage, dtype_backend, conn_name) -> DataFrame:
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))

elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) # type: ignore[assignment]
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) # type: ignore[assignment]

else:
pa = pytest.importorskip("pyarrow")
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/io/xml/test_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -2044,6 +2044,13 @@ def test_read_xml_nullable_dtypes(
string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))

elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["x", "y"]))
string_array_na = ArrowExtensionArray(pa.array(["x", None]))

else:
pa = pytest.importorskip("pyarrow")
string_array = ArrowStringArray(pa.array(["x", "y"]))
Expand Down