Skip to content

Commit 33670f5

Browse files
Backport PR pandas-dev#52422 on branch 2.0.x (BUG: merge with arrow and numpy dtypes raises) (pandas-dev#52480)
Backport PR pandas-dev#52422: BUG: merge with arrow and numpy dtypes raises Co-authored-by: Patrick Hoefler <[email protected]>
1 parent b9b06b3 commit 33670f5

File tree

3 files changed

+35
-2
lines changed

3 files changed

+35
-2
lines changed

doc/source/whatsnew/v2.0.1.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ Fixed regressions
2020

2121
Bug fixes
2222
~~~~~~~~~
23-
-
23+
- Fixed bug in :func:`merge` when merging with ``ArrowDtype`` on one side and a NumPy dtype on the other side (:issue:`52406`)
2424

2525
.. ---------------------------------------------------------------------------
2626
.. _whatsnew_201.other:

pandas/core/reshape/merge.py

+19-1
Original file line numberDiff line numberDiff line change
@@ -78,13 +78,15 @@
7878
)
7979

8080
from pandas import (
81+
ArrowDtype,
8182
Categorical,
8283
Index,
8384
MultiIndex,
8485
Series,
8586
)
8687
import pandas.core.algorithms as algos
8788
from pandas.core.arrays import (
89+
ArrowExtensionArray,
8890
BaseMaskedArray,
8991
ExtensionArray,
9092
)
@@ -2372,7 +2374,11 @@ def _factorize_keys(
23722374
rk = ensure_int64(rk.codes)
23732375

23742376
elif isinstance(lk, ExtensionArray) and is_dtype_equal(lk.dtype, rk.dtype):
2375-
if not isinstance(lk, BaseMaskedArray):
2377+
if not isinstance(lk, BaseMaskedArray) and not (
2378+
# exclude arrow dtypes that would get cast to object
2379+
isinstance(lk.dtype, ArrowDtype)
2380+
and is_numeric_dtype(lk.dtype.numpy_dtype)
2381+
):
23762382
lk, _ = lk._values_for_factorize()
23772383

23782384
# error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
@@ -2387,6 +2393,16 @@ def _factorize_keys(
23872393
assert isinstance(rk, BaseMaskedArray)
23882394
llab = rizer.factorize(lk._data, mask=lk._mask)
23892395
rlab = rizer.factorize(rk._data, mask=rk._mask)
2396+
elif isinstance(lk, ArrowExtensionArray):
2397+
assert isinstance(rk, ArrowExtensionArray)
2398+
# we can only get here with numeric dtypes
2399+
# TODO: Remove when we have a Factorizer for Arrow
2400+
llab = rizer.factorize(
2401+
lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna()
2402+
)
2403+
rlab = rizer.factorize(
2404+
rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna()
2405+
)
23902406
else:
23912407
# Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
23922408
# "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
@@ -2445,6 +2461,8 @@ def _convert_arrays_and_get_rizer_klass(
24452461
# Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]";
24462462
# expected type "Type[object]"
24472463
klass = _factorizers[lk.dtype.type] # type: ignore[index]
2464+
elif isinstance(lk.dtype, ArrowDtype):
2465+
klass = _factorizers[lk.dtype.numpy_dtype.type]
24482466
else:
24492467
klass = _factorizers[lk.dtype.type]
24502468

pandas/tests/reshape/merge/test_merge.py

+15
Original file line numberDiff line numberDiff line change
@@ -2735,3 +2735,18 @@ def test_merge_ea_and_non_ea(any_numeric_ea_dtype, join_type):
27352735
}
27362736
)
27372737
tm.assert_frame_equal(result, expected)
2738+
2739+
2740+
@pytest.mark.parametrize("dtype", ["int64", "int64[pyarrow]"])
2741+
def test_merge_arrow_and_numpy_dtypes(dtype):
2742+
# GH#52406
2743+
pytest.importorskip("pyarrow")
2744+
df = DataFrame({"a": [1, 2]}, dtype=dtype)
2745+
df2 = DataFrame({"a": [1, 2]}, dtype="int64[pyarrow]")
2746+
result = df.merge(df2)
2747+
expected = df.copy()
2748+
tm.assert_frame_equal(result, expected)
2749+
2750+
result = df2.merge(df)
2751+
expected = df2.copy()
2752+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)