Skip to content

Commit ecb90b5

Browse files
Backport PR #58590 on branch 2.2.x (BUG: Use large_string in string array consistently) (#58597)
Backport PR #58590: BUG: Use large_string in string array consistently

Co-authored-by: Patrick Hoefler <[email protected]>
1 parent 35c2377 commit ecb90b5

File tree

1 file changed

+10
-6
lines changed

1 file changed

+10
-6
lines changed

pandas/core/arrays/string_arrow.py

+10-6
Original file line number | Diff line number | Diff line change
@@ -190,13 +190,13 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal
190190
na_values = scalars._mask
191191
result = scalars._data
192192
result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
193-
return cls(pa.array(result, mask=na_values, type=pa.string()))
193+
return cls(pa.array(result, mask=na_values, type=pa.large_string()))
194194
elif isinstance(scalars, (pa.Array, pa.ChunkedArray)):
195-
return cls(pc.cast(scalars, pa.string()))
195+
return cls(pc.cast(scalars, pa.large_string()))
196196

197197
# convert non-na-likes to str
198198
result = lib.ensure_string_array(scalars, copy=copy)
199-
return cls(pa.array(result, type=pa.string(), from_pandas=True))
199+
return cls(pa.array(result, type=pa.large_string(), from_pandas=True))
200200

201201
@classmethod
202202
def _from_sequence_of_strings(
@@ -239,7 +239,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
239239
value_set = [
240240
pa_scalar.as_py()
241241
for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values]
242-
if pa_scalar.type in (pa.string(), pa.null())
242+
if pa_scalar.type in (pa.string(), pa.null(), pa.large_string())
243243
]
244244

245245
# short-circuit to return all False array.
@@ -337,7 +337,9 @@ def _str_map(
337337
result = lib.map_infer_mask(
338338
arr, f, mask.view("uint8"), convert=False, na_value=na_value
339339
)
340-
result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True)
340+
result = pa.array(
341+
result, mask=mask, type=pa.large_string(), from_pandas=True
342+
)
341343
return type(self)(result)
342344
else:
343345
# This is when the result type is object. We reach this when
@@ -658,7 +660,9 @@ def _str_map(
658660
result = lib.map_infer_mask(
659661
arr, f, mask.view("uint8"), convert=False, na_value=na_value
660662
)
661-
result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True)
663+
result = pa.array(
664+
result, mask=mask, type=pa.large_string(), from_pandas=True
665+
)
662666
return type(self)(result)
663667
else:
664668
# This is when the result type is object. We reach this when

0 commit comments

Comments
 (0)