From 610cb56f85bbebe16b6dfc9b2c4fcecc2506f186 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 31 Aug 2023 11:34:06 +0200 Subject: [PATCH 1/3] REGR: Merge raising when left merging on arrow string index --- doc/source/whatsnew/v2.1.1.rst | 2 +- pandas/core/reshape/merge.py | 8 ++++++-- pandas/tests/reshape/merge/test_merge.py | 11 +++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 66f6d59d08cb2..53e3325684c64 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -13,7 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Fixed regression in :func:`merge` when merging over a PyArrow string index (:issue:`54894`) .. --------------------------------------------------------------------------- .. _whatsnew_211.bug_fixes: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 13bc1008698b2..71e91b574a9e7 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2436,8 +2436,12 @@ def _factorize_keys( length = len(dc.dictionary) llab, rlab, count = ( - pc.fill_null(dc.indices[slice(len_lk)], length).to_numpy(), - pc.fill_null(dc.indices[slice(len_lk, None)], length).to_numpy(), + pc.fill_null(dc.indices[slice(len_lk)], length) + .to_numpy() + .astype(np.intp), + pc.fill_null(dc.indices[slice(len_lk, None)], length) + .to_numpy() + .astype(np.intp), len(dc.dictionary), ) if how == "right": diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 02d7e2059e8e1..7d923fd59fc3f 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2947,3 +2947,14 @@ def test_merge_ea_int_and_float_numpy(): result = df2.merge(df1) tm.assert_frame_equal(result, expected.astype("float64")) + + +def test_merge_arrow_string_index(): + # GH#54894 + left = DataFrame({"a": ["a", "b"]}, dtype="string[pyarrow]") + right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype="string[pyarrow]")) + result = left.merge(right, left_on="a", right_index=True, how="left") + expected = DataFrame( + {"a": Series(["a", "b"], dtype="string[pyarrow]"), "b": [1, np.nan]} + ) + tm.assert_frame_equal(result, expected) From 5b2b7ab96391d1d24050d34fda13872da2f231b7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 31 Aug 2023 12:47:21 +0200 Subject: [PATCH 2/3] Fix --- pandas/tests/reshape/merge/test_merge.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 7d923fd59fc3f..9cada6964c094 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2951,6 +2951,7 @@ def test_merge_ea_int_and_float_numpy(): def test_merge_arrow_string_index(): # GH#54894 + pytest.importorskip("pyarrow") left = DataFrame({"a": ["a", "b"]}, dtype="string[pyarrow]") right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype="string[pyarrow]")) result = left.merge(right, left_on="a", right_index=True, how="left") From 4f1a996422b3c27dc04097a935a84029e5a56465 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 31 Aug 2023 22:59:24 +0200 Subject: [PATCH 3/3] Add copy false --- pandas/core/reshape/merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 71e91b574a9e7..6098ff2698e2e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2438,10 +2438,10 @@ def _factorize_keys( llab, rlab, count = ( pc.fill_null(dc.indices[slice(len_lk)], length) .to_numpy() - .astype(np.intp), + .astype(np.intp, copy=False), pc.fill_null(dc.indices[slice(len_lk, None)], length) .to_numpy() - .astype(np.intp), + .astype(np.intp, copy=False), len(dc.dictionary), ) if how == "right":