Skip to content

Commit f0b8db4

Browse files
BUG (string dtype): convert dictionary input to materialized string array in ArrowStringArray constructor (#59479)
1 parent 32d0ae9 commit f0b8db4

File tree

2 files changed

+15
-12
lines changed

2 files changed

+15
-12
lines changed

pandas/core/arrays/string_arrow.py

+10-6
Original file line numberDiff line numberDiff line change
@@ -130,18 +130,22 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
130130

131131
def __init__(self, values) -> None:
132132
_chk_pyarrow_available()
133-
if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string(
134-
values.type
133+
if isinstance(values, (pa.Array, pa.ChunkedArray)) and (
134+
pa.types.is_string(values.type)
135+
or (
136+
pa.types.is_dictionary(values.type)
137+
and (
138+
pa.types.is_string(values.type.value_type)
139+
or pa.types.is_large_string(values.type.value_type)
140+
)
141+
)
135142
):
136143
values = pc.cast(values, pa.large_string())
137144

138145
super().__init__(values)
139146
self._dtype = StringDtype(storage=self._storage, na_value=self._na_value)
140147

141-
if not pa.types.is_large_string(self._pa_array.type) and not (
142-
pa.types.is_dictionary(self._pa_array.type)
143-
and pa.types.is_large_string(self._pa_array.type.value_type)
144-
):
148+
if not pa.types.is_large_string(self._pa_array.type):
145149
raise ValueError(
146150
"ArrowStringArray requires a PyArrow (chunked) array of "
147151
"large_string type"

pandas/tests/arrays/string_/test_string_arrow.py

+5-6
Original file line numberDiff line numberDiff line change
@@ -88,19 +88,18 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked):
8888
ArrowStringArray(arr)
8989

9090

91-
@pytest.mark.xfail(
92-
reason="dict conversion does not seem to be implemented for large string in arrow"
93-
)
91+
@pytest.mark.parametrize("string_type", ["string", "large_string"])
9492
@pytest.mark.parametrize("chunked", [True, False])
95-
def test_constructor_valid_string_type_value_dictionary(chunked):
93+
def test_constructor_valid_string_type_value_dictionary(string_type, chunked):
9694
pa = pytest.importorskip("pyarrow")
9795

98-
arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode()
96+
arr = pa.array(["1", "2", "3"], getattr(pa, string_type)()).dictionary_encode()
9997
if chunked:
10098
arr = pa.chunked_array(arr)
10199

102100
arr = ArrowStringArray(arr)
103-
assert pa.types.is_string(arr._pa_array.type.value_type)
101+
# dictionary type gets converted to dense large string array
102+
assert pa.types.is_large_string(arr._pa_array.type)
104103

105104

106105
def test_constructor_from_list():

0 commit comments

Comments
 (0)