diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index b7e45d043d869..4c548d94a0d59 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -13,6 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression in :func:`merge` when merging over a PyArrow string index (:issue:`54894`) - Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtypes`` is a dict for ``engine="python"`` (:issue:`54868`) - Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`) - Fixed regression when comparing a :class:`Series` with ``datetime64`` dtype with ``None`` (:issue:`54870`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f3695fb87ea78..8ef3943ab0d8d 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2437,8 +2437,12 @@ def _factorize_keys( length = len(dc.dictionary) llab, rlab, count = ( - pc.fill_null(dc.indices[slice(len_lk)], length).to_numpy(), - pc.fill_null(dc.indices[slice(len_lk, None)], length).to_numpy(), + pc.fill_null(dc.indices[slice(len_lk)], length) + .to_numpy() + .astype(np.intp, copy=False), + pc.fill_null(dc.indices[slice(len_lk, None)], length) + .to_numpy() + .astype(np.intp, copy=False), len(dc.dictionary), ) if how == "right": diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 02d7e2059e8e1..9cada6964c094 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2947,3 +2947,15 @@ def test_merge_ea_int_and_float_numpy(): result = df2.merge(df1) tm.assert_frame_equal(result, expected.astype("float64")) + + +def test_merge_arrow_string_index(): + # GH#54894 + pytest.importorskip("pyarrow") + left = DataFrame({"a": ["a", "b"]}, dtype="string[pyarrow]") + right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype="string[pyarrow]")) + result = left.merge(right, left_on="a", right_index=True, how="left") + expected = DataFrame( + {"a": Series(["a", "b"], dtype="string[pyarrow]"), "b": [1, np.nan]} + ) + tm.assert_frame_equal(result, expected)