Skip to content

Commit fdc13d6

Browse files
authored
BUG: merge with arrow and numpy dtypes raises (#52422)
1 parent b635369 commit fdc13d6

File tree

3 files changed

+35
-2
lines changed

3 files changed

+35
-2
lines changed

doc/source/whatsnew/v2.0.1.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ Fixed regressions
2020

2121
Bug fixes
2222
~~~~~~~~~
23-
-
23+
- Fixed bug in :func:`merge` when merging with ``ArrowDtype`` on one side and a NumPy dtype on the other side (:issue:`52406`)
2424

2525
.. ---------------------------------------------------------------------------
2626
.. _whatsnew_201.other:

pandas/core/reshape/merge.py

+19-1
Original file line numberDiff line numberDiff line change
@@ -80,13 +80,15 @@
8080
)
8181

8282
from pandas import (
83+
ArrowDtype,
8384
Categorical,
8485
Index,
8586
MultiIndex,
8687
Series,
8788
)
8889
import pandas.core.algorithms as algos
8990
from pandas.core.arrays import (
91+
ArrowExtensionArray,
9092
BaseMaskedArray,
9193
ExtensionArray,
9294
)
@@ -2377,7 +2379,11 @@ def _factorize_keys(
23772379
rk = ensure_int64(rk.codes)
23782380

23792381
elif isinstance(lk, ExtensionArray) and is_dtype_equal(lk.dtype, rk.dtype):
2380-
if not isinstance(lk, BaseMaskedArray):
2382+
if not isinstance(lk, BaseMaskedArray) and not (
2383+
# exclude arrow dtypes that would get cast to object
2384+
isinstance(lk.dtype, ArrowDtype)
2385+
and is_numeric_dtype(lk.dtype.numpy_dtype)
2386+
):
23812387
lk, _ = lk._values_for_factorize()
23822388

23832389
# error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
@@ -2392,6 +2398,16 @@ def _factorize_keys(
23922398
assert isinstance(rk, BaseMaskedArray)
23932399
llab = rizer.factorize(lk._data, mask=lk._mask)
23942400
rlab = rizer.factorize(rk._data, mask=rk._mask)
2401+
elif isinstance(lk, ArrowExtensionArray):
2402+
assert isinstance(rk, ArrowExtensionArray)
2403+
# we can only get here with numeric dtypes
2404+
# TODO: Remove when we have a Factorizer for Arrow
2405+
llab = rizer.factorize(
2406+
lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna()
2407+
)
2408+
rlab = rizer.factorize(
2409+
rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna()
2410+
)
23952411
else:
23962412
# Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
23972413
# "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
@@ -2450,6 +2466,8 @@ def _convert_arrays_and_get_rizer_klass(
24502466
# Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]";
24512467
# expected type "Type[object]"
24522468
klass = _factorizers[lk.dtype.type] # type: ignore[index]
2469+
elif isinstance(lk.dtype, ArrowDtype):
2470+
klass = _factorizers[lk.dtype.numpy_dtype.type]
24532471
else:
24542472
klass = _factorizers[lk.dtype.type]
24552473

pandas/tests/reshape/merge/test_merge.py

+15
Original file line numberDiff line numberDiff line change
@@ -2761,3 +2761,18 @@ def test_merge_ea_and_non_ea(any_numeric_ea_dtype, join_type):
27612761
}
27622762
)
27632763
tm.assert_frame_equal(result, expected)
2764+
2765+
2766+
@pytest.mark.parametrize("dtype", ["int64", "int64[pyarrow]"])
2767+
def test_merge_arrow_and_numpy_dtypes(dtype):
2768+
# GH#52406
2769+
pytest.importorskip("pyarrow")
2770+
df = DataFrame({"a": [1, 2]}, dtype=dtype)
2771+
df2 = DataFrame({"a": [1, 2]}, dtype="int64[pyarrow]")
2772+
result = df.merge(df2)
2773+
expected = df.copy()
2774+
tm.assert_frame_equal(result, expected)
2775+
2776+
result = df2.merge(df)
2777+
expected = df2.copy()
2778+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)