From dab87c74b3d14c61cdac3eb997b4bf608fbda55d Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Sun, 26 Nov 2023 21:28:45 +0100
Subject: [PATCH 1/2] BUG: numba raises for string columns or index

---
 doc/source/whatsnew/v2.2.0.rst   |  2 +-
 pandas/core/apply.py             | 12 +++++++++---
 pandas/tests/apply/test_numba.py | 19 ++++++++++++++++++-
 3 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index d252c19a95d4a..d4954e6caf2d0 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -496,8 +496,8 @@ Conversion
 Strings
 ^^^^^^^
 - Bug in :func:`pandas.api.types.is_string_dtype` while checking object array with no elements is of the string dtype (:issue:`54661`)
+- Bug in :meth:`DataFrame.apply` failing when ``engine="numba"`` and columns or index have ``StringDtype`` (:issue:`56189`)
 - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`)
--
 
 Interval
 ^^^^^^^^
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 3b79882d3c762..bb3cc3a03760f 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -1172,11 +1172,17 @@ def apply_with_numba(self) -> dict[int, Any]:
         )
         from pandas.core._numba.extensions import set_numba_data
 
+        index = self.obj.index
+        if index.dtype == "string":
+            index = index.astype(object)
+
+        columns = self.obj.columns
+        if columns.dtype == "string":
+            columns = columns.astype(object)
+
         # Convert from numba dict to regular dict
         # Our isinstance checks in the df constructor don't pass for numbas typed dict
-        with set_numba_data(self.obj.index) as index, set_numba_data(
-            self.columns
-        ) as columns:
+        with set_numba_data(index) as index, set_numba_data(columns) as columns:
             res = dict(nb_func(self.values, columns, index))
         return res
 
diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py
index ee239568d057d..85d7baee1bdf5 100644
--- a/pandas/tests/apply/test_numba.py
+++ b/pandas/tests/apply/test_numba.py
@@ -24,6 +24,22 @@ def test_numba_vs_python_noop(float_frame, apply_axis):
     tm.assert_frame_equal(result, expected)
 
 
+def test_numba_vs_python_string_index():
+    # GH#56189: engine="numba" must handle string-dtype index/columns like python
+    pytest.importorskip("pyarrow")
+    df = DataFrame(
+        1,
+        index=Index(["a", "b"], dtype="string[pyarrow_numpy]"),
+        columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"),
+    )
+    func = lambda x: x
+    result = df.apply(func, engine="numba", axis=0)
+    expected = df.apply(func, engine="python", axis=0)
+    tm.assert_frame_equal(
+        result, expected, check_column_type=False, check_index_type=False
+    )
+
+
 def test_numba_vs_python_indexing():
     frame = DataFrame(
         {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]},
@@ -88,7 +104,8 @@ def test_numba_unsupported_dtypes(apply_axis):
     df["c"] = df["c"].astype("double[pyarrow]")
 
     with pytest.raises(
-        ValueError, match="Column b must have a numeric dtype. Found 'object' instead"
+        ValueError,
+        match="Column b must have a numeric dtype. Found '(object|string)' instead",
     ):
         df.apply(f, engine="numba", axis=apply_axis)
 

From 78a21f4f088a4bf036a016feff5b623fc0c698f2 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Sun, 26 Nov 2023 21:32:09 +0100
Subject: [PATCH 2/2] Adjust tests for apply folder for new string option

---
 pandas/tests/apply/test_frame_apply.py  |  9 ++++++---
 pandas/tests/apply/test_invalid_arg.py  | 18 +++++++++++++++---
 pandas/tests/apply/test_series_apply.py |  4 ++--
 3 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index 2d7549e09a986..b7eac6b8f0ea1 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -1464,13 +1464,16 @@ def test_apply_datetime_tz_issue(engine, request):
 
 @pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})])
 @pytest.mark.parametrize("method", ["min", "max", "sum"])
-def test_mixed_column_raises(df, method):
+def test_mixed_column_raises(df, method, using_infer_string):
     # GH 16832
     if method == "sum":
-        msg = r'can only concatenate str \(not "int"\) to str'
+        msg = r'can only concatenate str \(not "int"\) to str|does not support'
     else:
         msg = "not supported between instances of 'str' and 'float'"
-    with pytest.raises(TypeError, match=msg):
+    if not using_infer_string:
+        with pytest.raises(TypeError, match=msg):
+            getattr(df, method)()
+    else:
         getattr(df, method)()
 
 
diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py
index 9f8611dd4b08b..ad269b745004b 100644
--- a/pandas/tests/apply/test_invalid_arg.py
+++ b/pandas/tests/apply/test_invalid_arg.py
@@ -223,9 +223,14 @@ def transform2(row):
         DataFrame([["a", "b"], ["b", "a"]]), [["cumprod", TypeError]]
     ),
 )
-def test_agg_cython_table_raises_frame(df, func, expected, axis):
+def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string):
     # GH 21224
-    msg = "can't multiply sequence by non-int of type 'str'"
+    if using_infer_string:
+        import pyarrow as pa
+
+        expected = (expected, pa.lib.ArrowNotImplementedError)
+
+    msg = "can't multiply sequence by non-int of type 'str'|has no kernel"
     warn = None if isinstance(func, str) else FutureWarning
     with pytest.raises(expected, match=msg):
         with tm.assert_produces_warning(warn, match="using DataFrame.cumprod"):
@@ -248,11 +253,18 @@ def test_agg_cython_table_raises_frame(df, func, expected, axis):
         )
     ),
 )
-def test_agg_cython_table_raises_series(series, func, expected):
+def test_agg_cython_table_raises_series(series, func, expected, using_infer_string):
     # GH21224
     msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type"
     if func == "median" or func is np.nanmedian or func is np.median:
         msg = r"Cannot convert \['a' 'b' 'c'\] to numeric"
+
+    if using_infer_string:
+        import pyarrow as pa
+
+        expected = (expected, pa.lib.ArrowNotImplementedError)
+
+    msg = msg + "|does not support|has no kernel"
     warn = None if isinstance(func, str) else FutureWarning
 
     with pytest.raises(expected, match=msg):
diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py
index 177dff2d771d4..24e48ebd4ed54 100644
--- a/pandas/tests/apply/test_series_apply.py
+++ b/pandas/tests/apply/test_series_apply.py
@@ -222,7 +222,7 @@ def f(x):
         assert result == "Asia/Tokyo"
 
 
-def test_apply_categorical(by_row):
+def test_apply_categorical(by_row, using_infer_string):
     values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True)
     ser = Series(values, name="XX", index=list("abcdefg"))
 
@@ -245,7 +245,7 @@ def test_apply_categorical(by_row):
     result = ser.apply(lambda x: "A")
     exp = Series(["A"] * 7, name="XX", index=list("abcdefg"))
     tm.assert_series_equal(result, exp)
-    assert result.dtype == object
+    assert result.dtype == ("string[pyarrow_numpy]" if using_infer_string else object)
 
 
 @pytest.mark.parametrize("series", [["1-1", "1-1", np.nan], ["1-1", "1-2", np.nan]])