Switch arrow type for string array to large string (#56220)

phofl · web-flow · commit 2488e5e6d783 · 2023-12-21T22:05:38.000+01:00
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -236,6 +236,8 @@ Other enhancements
 - Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`)
 - Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as "BMS" (:issue:`56243`)
 - Improved error message when constructing :class:`Period` with invalid offsets such as "QS" (:issue:`55785`)
+- The dtypes ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`)
+
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_220.notable_bug_fixes:
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -291,6 +291,7 @@ def _from_sequence_of_strings(
             pa_type is None
             or pa.types.is_binary(pa_type)
             or pa.types.is_string(pa_type)
+            or pa.types.is_large_string(pa_type)
         ):
             # pa_type is None: Let pa.array infer
             # pa_type is string/binary: scalars already correct type
@@ -632,7 +633,9 @@ def __invert__(self) -> Self:
         # This is a bit wise op for integer types
         if pa.types.is_integer(self._pa_array.type):
             return type(self)(pc.bit_wise_not(self._pa_array))
-        elif pa.types.is_string(self._pa_array.type):
+        elif pa.types.is_string(self._pa_array.type) or pa.types.is_large_string(
+            self._pa_array.type
+        ):
             # Raise TypeError instead of pa.ArrowNotImplementedError
             raise TypeError("__invert__ is not supported for string dtypes")
         else:
@@ -692,7 +695,11 @@ def _evaluate_op_method(self, other, op, arrow_funcs):
         pa_type = self._pa_array.type
         other = self._box_pa(other)
 
-        if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type):
+        if (
+            pa.types.is_string(pa_type)
+            or pa.types.is_large_string(pa_type)
+            or pa.types.is_binary(pa_type)
+        ):
             if op in [operator.add, roperator.radd]:
                 sep = pa.scalar("", type=pa_type)
                 if op is operator.add:
@@ -709,7 +716,9 @@ def _evaluate_op_method(self, other, op, arrow_funcs):
                 result = pc.binary_repeat(binary, pa_integral)
                 return type(self)(result)
         elif (
-            pa.types.is_string(other.type) or pa.types.is_binary(other.type)
+            pa.types.is_string(other.type)
+            or pa.types.is_binary(other.type)
+            or pa.types.is_large_string(other.type)
         ) and op in [operator.mul, roperator.rmul]:
             binary = other
             integral = self._pa_array
@@ -1467,7 +1476,7 @@ def _concat_same_type(cls, to_concat) -> Self:
         chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
         if to_concat[0].dtype == "string":
             # StringDtype has no attribute pyarrow_dtype
-            pa_dtype = pa.string()
+            pa_dtype = pa.large_string()
         else:
             pa_dtype = to_concat[0].dtype.pyarrow_dtype
         arr = pa.chunked_array(chunks, type=pa_dtype)
@@ -2271,7 +2280,9 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
         return type(self)(result)
 
     def _str_join(self, sep: str):
-        if pa.types.is_string(self._pa_array.type):
+        if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string(
+            self._pa_array.type
+        ):
             result = self._apply_elementwise(list)
             result = pa.chunked_array(result, type=pa.list_(pa.string()))
         else:
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -126,17 +126,40 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
     _storage = "pyarrow"
 
     def __init__(self, values) -> None:
+        _chk_pyarrow_available()
+        if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string(
+            values.type
+        ):
+            values = pc.cast(values, pa.large_string())
+
         super().__init__(values)
         self._dtype = StringDtype(storage=self._storage)
 
-        if not pa.types.is_string(self._pa_array.type) and not (
+        if not pa.types.is_large_string(self._pa_array.type) and not (
             pa.types.is_dictionary(self._pa_array.type)
-            and pa.types.is_string(self._pa_array.type.value_type)
+            and pa.types.is_large_string(self._pa_array.type.value_type)
         ):
             raise ValueError(
-                "ArrowStringArray requires a PyArrow (chunked) array of string type"
+                "ArrowStringArray requires a PyArrow (chunked) array of "
+                "large_string type"
             )
 
+    @classmethod
+    def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
+        pa_scalar = super()._box_pa_scalar(value, pa_type)
+        if pa.types.is_string(pa_scalar.type) and pa_type is None:
+            pa_scalar = pc.cast(pa_scalar, pa.large_string())
+        return pa_scalar
+
+    @classmethod
+    def _box_pa_array(
+        cls, value, pa_type: pa.DataType | None = None, copy: bool = False
+    ) -> pa.Array | pa.ChunkedArray:
+        pa_array = super()._box_pa_array(value, pa_type)
+        if pa.types.is_string(pa_array.type) and pa_type is None:
+            pa_array = pc.cast(pa_array, pa.large_string())
+        return pa_array
+
     def __len__(self) -> int:
         """
         Length of this array.
@@ -574,15 +597,6 @@ def _rank(
 class ArrowStringArrayNumpySemantics(ArrowStringArray):
     _storage = "pyarrow_numpy"
 
-    def __init__(self, values) -> None:
-        _chk_pyarrow_available()
-
-        if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_large_string(
-            values.type
-        ):
-            values = pc.cast(values, pa.string())
-        super().__init__(values)
-
     @classmethod
     def _result_converter(cls, values, na=None):
         if not isna(na):
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
@@ -172,9 +172,17 @@ def _convert_arrays_to_dataframe(
     )
     if dtype_backend == "pyarrow":
         pa = import_optional_dependency("pyarrow")
-        arrays = [
-            ArrowExtensionArray(pa.array(arr, from_pandas=True)) for arr in arrays
-        ]
+
+        result_arrays = []
+        for arr in arrays:
+            pa_array = pa.array(arr, from_pandas=True)
+            if arr.dtype == "string":
+                # TODO: Arrow still infers string arrays as regular strings instead
+                # of large_string, which is what we preserve everywhere else for
+                # dtype_backend="pyarrow". We may want to reconsider this
+                pa_array = pa_array.cast(pa.string())
+            result_arrays.append(ArrowExtensionArray(pa_array))
+        arrays = result_arrays  # type: ignore[assignment]
     if arrays:
         df = DataFrame(dict(zip(list(range(len(columns))), arrays)))
         df.columns = columns
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -487,13 +487,15 @@ def test_fillna_args(dtype, arrow_string_storage):
 def test_arrow_array(dtype):
     # protocol added in 0.15.0
     pa = pytest.importorskip("pyarrow")
+    import pyarrow.compute as pc
 
     data = pd.array(["a", "b", "c"], dtype=dtype)
     arr = pa.array(data)
-    expected = pa.array(list(data), type=pa.string(), from_pandas=True)
+    expected = pa.array(list(data), type=pa.large_string(), from_pandas=True)
     if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0:
         expected = pa.chunked_array(expected)
-
+    if dtype.storage == "python":
+        expected = pc.cast(expected, pa.string())
     assert arr.equals(expected)
 
 
@@ -512,7 +514,10 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
     data = pd.array(["a", "b", None], dtype=dtype)
     df = pd.DataFrame({"a": data})
     table = pa.table(df)
-    assert table.field("a").type == "string"
+    if dtype.storage == "python":
+        assert table.field("a").type == "string"
+    else:
+        assert table.field("a").type == "large_string"
     with pd.option_context("string_storage", string_storage2):
         result = table.to_pandas()
     assert isinstance(result["a"].dtype, pd.StringDtype)
@@ -539,7 +544,10 @@ def test_arrow_load_from_zero_chunks(
     data = pd.array([], dtype=dtype)
     df = pd.DataFrame({"a": data})
     table = pa.table(df)
-    assert table.field("a").type == "string"
+    if dtype.storage == "python":
+        assert table.field("a").type == "string"
+    else:
+        assert table.field("a").type == "large_string"
     # Instantiate the same table with no chunks at all
     table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
     with pd.option_context("string_storage", string_storage2):
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -61,7 +61,7 @@ def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage
         msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowExtensionArray"
     else:
         msg = re.escape(
-            "ArrowStringArray requires a PyArrow (chunked) array of string type"
+            "ArrowStringArray requires a PyArrow (chunked) array of large_string type"
         )
     with pytest.raises(ValueError, match=msg):
         ArrowStringArray(arr)
@@ -76,17 +76,20 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked):
         arr = pa.chunked_array(arr)
 
     msg = re.escape(
-        "ArrowStringArray requires a PyArrow (chunked) array of string type"
+        "ArrowStringArray requires a PyArrow (chunked) array of large_string type"
     )
     with pytest.raises(ValueError, match=msg):
         ArrowStringArray(arr)
 
 
+@pytest.mark.xfail(
+    reason="dict conversion does not seem to be implemented for large string in arrow"
+)
 @pytest.mark.parametrize("chunked", [True, False])
 def test_constructor_valid_string_type_value_dictionary(chunked):
     pa = pytest.importorskip("pyarrow")
 
-    arr = pa.array(["1", "2", "3"], pa.dictionary(pa.int32(), pa.utf8()))
+    arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode()
     if chunked:
         arr = pa.chunked_array(arr)
 
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
@@ -2054,6 +2054,13 @@ def test_read_json_dtype_backend(
             string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
             string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
 
+        elif dtype_backend == "pyarrow":
+            pa = pytest.importorskip("pyarrow")
+            from pandas.arrays import ArrowExtensionArray
+
+            string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
+            string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
+
         else:
             string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
             string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
@@ -971,6 +971,12 @@ def test_dtype_backend(string_storage, dtype_backend):
     if string_storage == "python":
         arr = StringArray(np.array(["a", "b"], dtype=np.object_))
         arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_))
+    elif dtype_backend == "pyarrow":
+        pa = pytest.importorskip("pyarrow")
+        from pandas.arrays import ArrowExtensionArray
+
+        arr = ArrowExtensionArray(pa.array(["a", "b"]))
+        arr_na = ArrowExtensionArray(pa.array([None, "a"]))
     else:
         pa = pytest.importorskip("pyarrow")
         arr = ArrowStringArray(pa.array(["a", "b"]))
diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py
@@ -359,6 +359,13 @@ def test_read_clipboard_dtype_backend(
             string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
             string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))
 
+        elif dtype_backend == "pyarrow" and engine != "c":
+            pa = pytest.importorskip("pyarrow")
+            from pandas.arrays import ArrowExtensionArray
+
+            string_array = ArrowExtensionArray(pa.array(["x", "y"]))
+            string_array_na = ArrowExtensionArray(pa.array(["x", None]))
+
         else:
             string_array = ArrowStringArray(pa.array(["x", "y"]))
             string_array_na = ArrowStringArray(pa.array(["x", None]))
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
@@ -186,6 +186,12 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
             string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
             string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))
 
+        elif dtype_backend == "pyarrow":
+            from pandas.arrays import ArrowExtensionArray
+
+            string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
+            string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
+
         else:
             string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
             string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
@@ -183,7 +183,12 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html):
         if string_storage == "python":
             string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
             string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
+        elif dtype_backend == "pyarrow":
+            pa = pytest.importorskip("pyarrow")
+            from pandas.arrays import ArrowExtensionArray
 
+            string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
+            string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
         else:
             pa = pytest.importorskip("pyarrow")
             string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
@@ -3647,6 +3647,13 @@ def func(storage, dtype_backend, conn_name) -> DataFrame:
             string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
             string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))
 
+        elif dtype_backend == "pyarrow":
+            pa = pytest.importorskip("pyarrow")
+            from pandas.arrays import ArrowExtensionArray
+
+            string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))  # type: ignore[assignment]
+            string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))  # type: ignore[assignment]
+
         else:
             pa = pytest.importorskip("pyarrow")
             string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
@@ -2044,6 +2044,13 @@ def test_read_xml_nullable_dtypes(
         string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
         string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))
 
+    elif dtype_backend == "pyarrow":
+        pa = pytest.importorskip("pyarrow")
+        from pandas.arrays import ArrowExtensionArray
+
+        string_array = ArrowExtensionArray(pa.array(["x", "y"]))
+        string_array_na = ArrowExtensionArray(pa.array(["x", None]))
+
     else:
         pa = pytest.importorskip("pyarrow")
         string_array = ArrowStringArray(pa.array(["x", "y"]))