Skip to content

Commit 2488e5e

Browse files
authored
Switch arrow type for string array to large string (#56220)
1 parent 5ef4a35 commit 2488e5e

File tree

13 files changed

+118
-27
lines changed

13 files changed

+118
-27
lines changed

doc/source/whatsnew/v2.2.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,8 @@ Other enhancements
236236
- Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`)
237237
- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as "BMS" (:issue:`56243`)
238238
- Improved error message when constructing :class:`Period` with invalid offsets such as "QS" (:issue:`55785`)
239+
- The dtypes ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`)
240+
239241

240242
.. ---------------------------------------------------------------------------
241243
.. _whatsnew_220.notable_bug_fixes:

pandas/core/arrays/arrow/array.py

+16-5
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,7 @@ def _from_sequence_of_strings(
291291
pa_type is None
292292
or pa.types.is_binary(pa_type)
293293
or pa.types.is_string(pa_type)
294+
or pa.types.is_large_string(pa_type)
294295
):
295296
# pa_type is None: Let pa.array infer
296297
# pa_type is string/binary: scalars already correct type
@@ -632,7 +633,9 @@ def __invert__(self) -> Self:
632633
# This is a bit wise op for integer types
633634
if pa.types.is_integer(self._pa_array.type):
634635
return type(self)(pc.bit_wise_not(self._pa_array))
635-
elif pa.types.is_string(self._pa_array.type):
636+
elif pa.types.is_string(self._pa_array.type) or pa.types.is_large_string(
637+
self._pa_array.type
638+
):
636639
# Raise TypeError instead of pa.ArrowNotImplementedError
637640
raise TypeError("__invert__ is not supported for string dtypes")
638641
else:
@@ -692,7 +695,11 @@ def _evaluate_op_method(self, other, op, arrow_funcs):
692695
pa_type = self._pa_array.type
693696
other = self._box_pa(other)
694697

695-
if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type):
698+
if (
699+
pa.types.is_string(pa_type)
700+
or pa.types.is_large_string(pa_type)
701+
or pa.types.is_binary(pa_type)
702+
):
696703
if op in [operator.add, roperator.radd]:
697704
sep = pa.scalar("", type=pa_type)
698705
if op is operator.add:
@@ -709,7 +716,9 @@ def _evaluate_op_method(self, other, op, arrow_funcs):
709716
result = pc.binary_repeat(binary, pa_integral)
710717
return type(self)(result)
711718
elif (
712-
pa.types.is_string(other.type) or pa.types.is_binary(other.type)
719+
pa.types.is_string(other.type)
720+
or pa.types.is_binary(other.type)
721+
or pa.types.is_large_string(other.type)
713722
) and op in [operator.mul, roperator.rmul]:
714723
binary = other
715724
integral = self._pa_array
@@ -1467,7 +1476,7 @@ def _concat_same_type(cls, to_concat) -> Self:
14671476
chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
14681477
if to_concat[0].dtype == "string":
14691478
# StringDtype has no attribute pyarrow_dtype
1470-
pa_dtype = pa.string()
1479+
pa_dtype = pa.large_string()
14711480
else:
14721481
pa_dtype = to_concat[0].dtype.pyarrow_dtype
14731482
arr = pa.chunked_array(chunks, type=pa_dtype)
@@ -2271,7 +2280,9 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
22712280
return type(self)(result)
22722281

22732282
def _str_join(self, sep: str):
2274-
if pa.types.is_string(self._pa_array.type):
2283+
if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string(
2284+
self._pa_array.type
2285+
):
22752286
result = self._apply_elementwise(list)
22762287
result = pa.chunked_array(result, type=pa.list_(pa.string()))
22772288
else:

pandas/core/arrays/string_arrow.py

+26-12
Original file line numberDiff line numberDiff line change
@@ -126,17 +126,40 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
126126
_storage = "pyarrow"
127127

128128
def __init__(self, values) -> None:
129+
_chk_pyarrow_available()
130+
if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string(
131+
values.type
132+
):
133+
values = pc.cast(values, pa.large_string())
134+
129135
super().__init__(values)
130136
self._dtype = StringDtype(storage=self._storage)
131137

132-
if not pa.types.is_string(self._pa_array.type) and not (
138+
if not pa.types.is_large_string(self._pa_array.type) and not (
133139
pa.types.is_dictionary(self._pa_array.type)
134-
and pa.types.is_string(self._pa_array.type.value_type)
140+
and pa.types.is_large_string(self._pa_array.type.value_type)
135141
):
136142
raise ValueError(
137-
"ArrowStringArray requires a PyArrow (chunked) array of string type"
143+
"ArrowStringArray requires a PyArrow (chunked) array of "
144+
"large_string type"
138145
)
139146

147+
@classmethod
148+
def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
149+
pa_scalar = super()._box_pa_scalar(value, pa_type)
150+
if pa.types.is_string(pa_scalar.type) and pa_type is None:
151+
pa_scalar = pc.cast(pa_scalar, pa.large_string())
152+
return pa_scalar
153+
154+
@classmethod
155+
def _box_pa_array(
156+
cls, value, pa_type: pa.DataType | None = None, copy: bool = False
157+
) -> pa.Array | pa.ChunkedArray:
158+
pa_array = super()._box_pa_array(value, pa_type)
159+
if pa.types.is_string(pa_array.type) and pa_type is None:
160+
pa_array = pc.cast(pa_array, pa.large_string())
161+
return pa_array
162+
140163
def __len__(self) -> int:
141164
"""
142165
Length of this array.
@@ -574,15 +597,6 @@ def _rank(
574597
class ArrowStringArrayNumpySemantics(ArrowStringArray):
575598
_storage = "pyarrow_numpy"
576599

577-
def __init__(self, values) -> None:
578-
_chk_pyarrow_available()
579-
580-
if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_large_string(
581-
values.type
582-
):
583-
values = pc.cast(values, pa.string())
584-
super().__init__(values)
585-
586600
@classmethod
587601
def _result_converter(cls, values, na=None):
588602
if not isna(na):

pandas/io/sql.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -172,9 +172,17 @@ def _convert_arrays_to_dataframe(
172172
)
173173
if dtype_backend == "pyarrow":
174174
pa = import_optional_dependency("pyarrow")
175-
arrays = [
176-
ArrowExtensionArray(pa.array(arr, from_pandas=True)) for arr in arrays
177-
]
175+
176+
result_arrays = []
177+
for arr in arrays:
178+
pa_array = pa.array(arr, from_pandas=True)
179+
if arr.dtype == "string":
180+
# TODO: Arrow still infers string arrays as regular strings instead
181+
# of large_string, which is what we preserve everywhere else for
182+
# dtype_backend="pyarrow". We may want to reconsider this
183+
pa_array = pa_array.cast(pa.string())
184+
result_arrays.append(ArrowExtensionArray(pa_array))
185+
arrays = result_arrays # type: ignore[assignment]
178186
if arrays:
179187
df = DataFrame(dict(zip(list(range(len(columns))), arrays)))
180188
df.columns = columns

pandas/tests/arrays/string_/test_string.py

+12-4
Original file line numberDiff line numberDiff line change
@@ -487,13 +487,15 @@ def test_fillna_args(dtype, arrow_string_storage):
487487
def test_arrow_array(dtype):
488488
# protocol added in 0.15.0
489489
pa = pytest.importorskip("pyarrow")
490+
import pyarrow.compute as pc
490491

491492
data = pd.array(["a", "b", "c"], dtype=dtype)
492493
arr = pa.array(data)
493-
expected = pa.array(list(data), type=pa.string(), from_pandas=True)
494+
expected = pa.array(list(data), type=pa.large_string(), from_pandas=True)
494495
if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0:
495496
expected = pa.chunked_array(expected)
496-
497+
if dtype.storage == "python":
498+
expected = pc.cast(expected, pa.string())
497499
assert arr.equals(expected)
498500

499501

@@ -512,7 +514,10 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
512514
data = pd.array(["a", "b", None], dtype=dtype)
513515
df = pd.DataFrame({"a": data})
514516
table = pa.table(df)
515-
assert table.field("a").type == "string"
517+
if dtype.storage == "python":
518+
assert table.field("a").type == "string"
519+
else:
520+
assert table.field("a").type == "large_string"
516521
with pd.option_context("string_storage", string_storage2):
517522
result = table.to_pandas()
518523
assert isinstance(result["a"].dtype, pd.StringDtype)
@@ -539,7 +544,10 @@ def test_arrow_load_from_zero_chunks(
539544
data = pd.array([], dtype=dtype)
540545
df = pd.DataFrame({"a": data})
541546
table = pa.table(df)
542-
assert table.field("a").type == "string"
547+
if dtype.storage == "python":
548+
assert table.field("a").type == "string"
549+
else:
550+
assert table.field("a").type == "large_string"
543551
# Instantiate the same table with no chunks at all
544552
table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
545553
with pd.option_context("string_storage", string_storage2):

pandas/tests/arrays/string_/test_string_arrow.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage
6161
msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowExtensionArray"
6262
else:
6363
msg = re.escape(
64-
"ArrowStringArray requires a PyArrow (chunked) array of string type"
64+
"ArrowStringArray requires a PyArrow (chunked) array of large_string type"
6565
)
6666
with pytest.raises(ValueError, match=msg):
6767
ArrowStringArray(arr)
@@ -76,17 +76,20 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked):
7676
arr = pa.chunked_array(arr)
7777

7878
msg = re.escape(
79-
"ArrowStringArray requires a PyArrow (chunked) array of string type"
79+
"ArrowStringArray requires a PyArrow (chunked) array of large_string type"
8080
)
8181
with pytest.raises(ValueError, match=msg):
8282
ArrowStringArray(arr)
8383

8484

85+
@pytest.mark.xfail(
86+
reason="dict conversion does not seem to be implemented for large string in arrow"
87+
)
8588
@pytest.mark.parametrize("chunked", [True, False])
8689
def test_constructor_valid_string_type_value_dictionary(chunked):
8790
pa = pytest.importorskip("pyarrow")
8891

89-
arr = pa.array(["1", "2", "3"], pa.dictionary(pa.int32(), pa.utf8()))
92+
arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode()
9093
if chunked:
9194
arr = pa.chunked_array(arr)
9295

pandas/tests/io/json/test_pandas.py

+7
Original file line numberDiff line numberDiff line change
@@ -2054,6 +2054,13 @@ def test_read_json_dtype_backend(
20542054
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
20552055
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
20562056

2057+
elif dtype_backend == "pyarrow":
2058+
pa = pytest.importorskip("pyarrow")
2059+
from pandas.arrays import ArrowExtensionArray
2060+
2061+
string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
2062+
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
2063+
20572064
else:
20582065
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
20592066
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))

pandas/tests/io/parser/test_read_fwf.py

+6
Original file line numberDiff line numberDiff line change
@@ -971,6 +971,12 @@ def test_dtype_backend(string_storage, dtype_backend):
971971
if string_storage == "python":
972972
arr = StringArray(np.array(["a", "b"], dtype=np.object_))
973973
arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_))
974+
elif dtype_backend == "pyarrow":
975+
pa = pytest.importorskip("pyarrow")
976+
from pandas.arrays import ArrowExtensionArray
977+
978+
arr = ArrowExtensionArray(pa.array(["a", "b"]))
979+
arr_na = ArrowExtensionArray(pa.array([None, "a"]))
974980
else:
975981
pa = pytest.importorskip("pyarrow")
976982
arr = ArrowStringArray(pa.array(["a", "b"]))

pandas/tests/io/test_clipboard.py

+7
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,13 @@ def test_read_clipboard_dtype_backend(
359359
string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
360360
string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))
361361

362+
elif dtype_backend == "pyarrow" and engine != "c":
363+
pa = pytest.importorskip("pyarrow")
364+
from pandas.arrays import ArrowExtensionArray
365+
366+
string_array = ArrowExtensionArray(pa.array(["x", "y"]))
367+
string_array_na = ArrowExtensionArray(pa.array(["x", None]))
368+
362369
else:
363370
string_array = ArrowStringArray(pa.array(["x", "y"]))
364371
string_array_na = ArrowStringArray(pa.array(["x", None]))

pandas/tests/io/test_feather.py

+6
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,12 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
186186
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
187187
string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))
188188

189+
elif dtype_backend == "pyarrow":
190+
from pandas.arrays import ArrowExtensionArray
191+
192+
string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
193+
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
194+
189195
else:
190196
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
191197
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))

pandas/tests/io/test_html.py

+5
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,12 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html):
183183
if string_storage == "python":
184184
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
185185
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
186+
elif dtype_backend == "pyarrow":
187+
pa = pytest.importorskip("pyarrow")
188+
from pandas.arrays import ArrowExtensionArray
186189

190+
string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
191+
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
187192
else:
188193
pa = pytest.importorskip("pyarrow")
189194
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))

pandas/tests/io/test_sql.py

+7
Original file line numberDiff line numberDiff line change
@@ -3647,6 +3647,13 @@ def func(storage, dtype_backend, conn_name) -> DataFrame:
36473647
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
36483648
string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))
36493649

3650+
elif dtype_backend == "pyarrow":
3651+
pa = pytest.importorskip("pyarrow")
3652+
from pandas.arrays import ArrowExtensionArray
3653+
3654+
string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) # type: ignore[assignment]
3655+
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) # type: ignore[assignment]
3656+
36503657
else:
36513658
pa = pytest.importorskip("pyarrow")
36523659
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))

pandas/tests/io/xml/test_xml.py

+7
Original file line numberDiff line numberDiff line change
@@ -2044,6 +2044,13 @@ def test_read_xml_nullable_dtypes(
20442044
string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
20452045
string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))
20462046

2047+
elif dtype_backend == "pyarrow":
2048+
pa = pytest.importorskip("pyarrow")
2049+
from pandas.arrays import ArrowExtensionArray
2050+
2051+
string_array = ArrowExtensionArray(pa.array(["x", "y"]))
2052+
string_array_na = ArrowExtensionArray(pa.array(["x", None]))
2053+
20472054
else:
20482055
pa = pytest.importorskip("pyarrow")
20492056
string_array = ArrowStringArray(pa.array(["x", "y"]))

0 commit comments

Comments
 (0)