From 0d3e86eceee25921b5819ce5b3d6e11e400bcd15 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 6 Sep 2023 00:32:36 +0200 Subject: [PATCH] Backport PR #54974: Include pyarrow_numpy string in efficient merge implementation --- pandas/core/reshape/merge.py | 3 ++- pandas/tests/reshape/merge/test_merge.py | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4eda8d2d75408..d36ceff800c56 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2417,7 +2417,8 @@ def _factorize_keys( elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( - isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow" + isinstance(lk.dtype, StringDtype) + and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"] ): import pyarrow as pa import pyarrow.compute as pc diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 2c97e8773b0d6..c4067363d934e 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2872,13 +2872,13 @@ def test_merge_ea_int_and_float_numpy(): tm.assert_frame_equal(result, expected.astype("float64")) -def test_merge_arrow_string_index(): +def test_merge_arrow_string_index(any_string_dtype): # GH#54894 pytest.importorskip("pyarrow") - left = DataFrame({"a": ["a", "b"]}, dtype="string[pyarrow]") - right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype="string[pyarrow]")) + left = DataFrame({"a": ["a", "b"]}, dtype=any_string_dtype) + right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype=any_string_dtype)) result = left.merge(right, left_on="a", right_index=True, how="left") expected = DataFrame( - {"a": Series(["a", "b"], dtype="string[pyarrow]"), "b": [1, np.nan]} + {"a": Series(["a", "b"], dtype=any_string_dtype), "b": [1, np.nan]} ) tm.assert_frame_equal(result, expected)