From dab87c74b3d14c61cdac3eb997b4bf608fbda55d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 26 Nov 2023 21:28:45 +0100 Subject: [PATCH 1/2] BUG: numba raises for string columns or index --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/apply.py | 12 +++++++++--- pandas/tests/apply/test_numba.py | 19 ++++++++++++++++++- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d252c19a95d4a..d4954e6caf2d0 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -496,8 +496,8 @@ Conversion Strings ^^^^^^^ - Bug in :func:`pandas.api.types.is_string_dtype` while checking object array with no elements is of the string dtype (:issue:`54661`) +- Bug in :meth:`DataFrame.apply` failing when ``engine="numba"`` and columns or index have ``StringDtype`` (:issue:`56189`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`) -- Interval ^^^^^^^^ diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 3b79882d3c762..bb3cc3a03760f 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1172,11 +1172,17 @@ def apply_with_numba(self) -> dict[int, Any]: ) from pandas.core._numba.extensions import set_numba_data + index = self.obj.index + if index.dtype == "string": + index = index.astype(object) + + columns = self.obj.columns + if columns.dtype == "string": + columns = columns.astype(object) + # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict - with set_numba_data(self.obj.index) as index, set_numba_data( - self.columns - ) as columns: + with set_numba_data(index) as index, set_numba_data(columns) as columns: res = dict(nb_func(self.values, columns, index)) return res diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index ee239568d057d..85d7baee1bdf5 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -24,6 +24,22 @@ def test_numba_vs_python_noop(float_frame, apply_axis): tm.assert_frame_equal(result, expected) +def test_numba_vs_python_string_index(): + # GH#56189 + pytest.importorskip("pyarrow") + df = DataFrame( + 1, + index=Index(["a", "b"], dtype="string[pyarrow_numpy]"), + columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + ) + func = lambda x: x + result = df.apply(func, engine="numba", axis=0) + expected = df.apply(func, engine="python", axis=0) + tm.assert_frame_equal( + result, expected, check_column_type=False, check_index_type=False + ) + + def test_numba_vs_python_indexing(): frame = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}, @@ -88,7 +104,8 @@ def test_numba_unsupported_dtypes(apply_axis): df["c"] = df["c"].astype("double[pyarrow]") with pytest.raises( - ValueError, match="Column b must have a numeric dtype. Found 'object' instead" + ValueError, + match="Column b must have a numeric dtype. Found 'object|string' instead", ): df.apply(f, engine="numba", axis=apply_axis) From 78a21f4f088a4bf036a016feff5b623fc0c698f2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 26 Nov 2023 21:32:09 +0100 Subject: [PATCH 2/2] Adjust tests for apply folder for new string option --- pandas/tests/apply/test_frame_apply.py | 9 ++++++--- pandas/tests/apply/test_invalid_arg.py | 18 +++++++++++++++--- pandas/tests/apply/test_series_apply.py | 4 ++-- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 2d7549e09a986..b7eac6b8f0ea1 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -1464,13 +1464,16 @@ def test_apply_datetime_tz_issue(engine, request): @pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})]) @pytest.mark.parametrize("method", ["min", "max", "sum"]) -def test_mixed_column_raises(df, method): +def test_mixed_column_raises(df, method, using_infer_string): # GH 16832 if method == "sum": - msg = r'can only concatenate str \(not "int"\) to str' + msg = r'can only concatenate str \(not "int"\) to str|does not support' else: msg = "not supported between instances of 'str' and 'float'" - with pytest.raises(TypeError, match=msg): + if not using_infer_string: + with pytest.raises(TypeError, match=msg): + getattr(df, method)() + else: getattr(df, method)() diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index 9f8611dd4b08b..ad269b745004b 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -223,9 +223,14 @@ def transform2(row): DataFrame([["a", "b"], ["b", "a"]]), [["cumprod", TypeError]] ), ) -def test_agg_cython_table_raises_frame(df, func, expected, axis): +def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string): # GH 21224 - msg = "can't multiply sequence by non-int of type 'str'" + if using_infer_string: + import pyarrow as pa + + expected = (expected, pa.lib.ArrowNotImplementedError) + + msg = "can't multiply sequence by non-int of type 'str'|has no kernel" warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): with tm.assert_produces_warning(warn, match="using DataFrame.cumprod"): @@ -248,11 +253,18 @@ def test_agg_cython_table_raises_frame(df, func, expected, axis): ) ), ) -def test_agg_cython_table_raises_series(series, func, expected): +def test_agg_cython_table_raises_series(series, func, expected, using_infer_string): # GH21224 msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type" if func == "median" or func is np.nanmedian or func is np.median: msg = r"Cannot convert \['a' 'b' 'c'\] to numeric" + + if using_infer_string: + import pyarrow as pa + + expected = (expected, pa.lib.ArrowNotImplementedError) + + msg = msg + "|does not support|has no kernel" warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 177dff2d771d4..24e48ebd4ed54 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -222,7 +222,7 @@ def f(x): assert result == "Asia/Tokyo" -def test_apply_categorical(by_row): +def test_apply_categorical(by_row, using_infer_string): values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) ser = Series(values, name="XX", index=list("abcdefg")) @@ -245,7 +245,7 @@ def test_apply_categorical(by_row): result = ser.apply(lambda x: "A") exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - assert result.dtype == object + assert result.dtype == object if not using_infer_string else "string[pyarrow_numpy]" @pytest.mark.parametrize("series", [["1-1", "1-1", np.nan], ["1-1", "1-2", np.nan]])