Skip to content

Commit 3e3ffef

Browse files
BUG: Outer/right merge with EA dtypes cast to object (#43152)
1 parent ea68a18 commit 3e3ffef

File tree

3 files changed

+67
-10
lines changed

3 files changed

+67
-10
lines changed

doc/source/whatsnew/v1.3.3.rst

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ Fixed regressions
1717
- Fixed regression in :class:`DataFrame` constructor failing to broadcast for a defined :class:`Index` and a length-one list of :class:`Timestamp` (:issue:`42810`)
1818
- Performance regression in :meth:`core.window.ewm.ExponentialMovingWindow.mean` (:issue:`42333`)
1919
- Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`)
20+
- Fixed regression in :meth:`merge` where ``on`` columns with ``ExtensionDtype`` or ``bool`` data types were cast to ``object`` in ``right`` and ``outer`` merges (:issue:`40073`)
2021
- Fixed regression in :meth:`RangeIndex.where` and :meth:`RangeIndex.putmask` raising ``AssertionError`` when result did not represent a :class:`RangeIndex` (:issue:`43240`)
2122
- Fixed regression in :meth:`read_parquet` where the ``fastparquet`` engine would not work properly with fastparquet 0.7.0 (:issue:`43075`)
2223

pandas/core/reshape/merge.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
Categorical,
7171
Index,
7272
MultiIndex,
73+
Series,
7374
)
7475
from pandas.core import groupby
7576
import pandas.core.algorithms as algos
@@ -81,10 +82,7 @@
8182
from pandas.core.sorting import is_int64_overflow_possible
8283

8384
if TYPE_CHECKING:
84-
from pandas import (
85-
DataFrame,
86-
Series,
87-
)
85+
from pandas import DataFrame
8886
from pandas.core.arrays import DatetimeArray
8987

9088

@@ -904,17 +902,22 @@ def _maybe_add_join_keys(
904902
# error: Item "bool" of "Union[Any, bool]" has no attribute "all"
905903
if mask_left.all(): # type: ignore[union-attr]
906904
key_col = Index(rvals)
905+
result_dtype = rvals.dtype
907906
# error: Item "bool" of "Union[Any, bool]" has no attribute "all"
908907
elif (
909908
right_indexer is not None
910909
and mask_right.all() # type: ignore[union-attr]
911910
):
912911
key_col = Index(lvals)
912+
result_dtype = lvals.dtype
913913
else:
914914
key_col = Index(lvals).where(~mask_left, rvals)
915+
result_dtype = lvals.dtype
915916

916917
if result._is_label_reference(name):
917-
result[name] = key_col
918+
result[name] = Series(
919+
key_col, dtype=result_dtype, index=result.index
920+
)
918921
elif result._is_level_reference(name):
919922
if isinstance(result.index, MultiIndex):
920923
key_col.name = name

pandas/tests/reshape/merge/test_merge.py

+58-5
Original file line numberDiff line numberDiff line change
@@ -356,8 +356,8 @@ def test_merge_join_key_dtype_cast(self):
356356
df = merge(df1, df2, how="outer")
357357

358358
# GH13169
359-
# this really should be bool
360-
assert df["key"].dtype == "object"
359+
# GH#40073
360+
assert df["key"].dtype == "bool"
361361

362362
df1 = DataFrame({"val": [1]})
363363
df2 = DataFrame({"val": [2]})
@@ -368,10 +368,12 @@ def test_merge_join_key_dtype_cast(self):
368368

369369
def test_handle_join_key_pass_array(self):
370370
left = DataFrame(
371-
{"key": [1, 1, 2, 2, 3], "value": np.arange(5)}, columns=["value", "key"]
371+
{"key": [1, 1, 2, 2, 3], "value": np.arange(5)},
372+
columns=["value", "key"],
373+
dtype="int64",
372374
)
373-
right = DataFrame({"rvalue": np.arange(6)})
374-
key = np.array([1, 1, 2, 3, 4, 5])
375+
right = DataFrame({"rvalue": np.arange(6)}, dtype="int64")
376+
key = np.array([1, 1, 2, 3, 4, 5], dtype="int64")
375377

376378
merged = merge(left, right, left_on="key", right_on=key, how="outer")
377379
merged2 = merge(right, left, left_on=key, right_on="key", how="outer")
@@ -1644,6 +1646,57 @@ def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals):
16441646
with pytest.raises(ValueError, match=msg):
16451647
merge(df2, df1, on=["A"])
16461648

1649+
@pytest.mark.parametrize(
1650+
"expected_data, how",
1651+
[
1652+
([1, 2], "outer"),
1653+
([], "inner"),
1654+
([2], "right"),
1655+
([1], "left"),
1656+
],
1657+
)
1658+
def test_merge_EA_dtype(self, any_numeric_ea_dtype, how, expected_data):
1659+
# GH#40073
1660+
d1 = DataFrame([(1,)], columns=["id"], dtype=any_numeric_ea_dtype)
1661+
d2 = DataFrame([(2,)], columns=["id"], dtype=any_numeric_ea_dtype)
1662+
result = merge(d1, d2, how=how)
1663+
expected = DataFrame(expected_data, columns=["id"], dtype=any_numeric_ea_dtype)
1664+
tm.assert_frame_equal(result, expected)
1665+
1666+
@pytest.mark.parametrize(
1667+
"expected_data, how",
1668+
[
1669+
(["a", "b"], "outer"),
1670+
([], "inner"),
1671+
(["b"], "right"),
1672+
(["a"], "left"),
1673+
],
1674+
)
1675+
def test_merge_string_dtype(self, how, expected_data, any_string_dtype):
1676+
# GH#40073
1677+
d1 = DataFrame([("a",)], columns=["id"], dtype=any_string_dtype)
1678+
d2 = DataFrame([("b",)], columns=["id"], dtype=any_string_dtype)
1679+
result = merge(d1, d2, how=how)
1680+
expected = DataFrame(expected_data, columns=["id"], dtype=any_string_dtype)
1681+
tm.assert_frame_equal(result, expected)
1682+
1683+
@pytest.mark.parametrize(
1684+
"how, expected_data",
1685+
[
1686+
("inner", [[True, 1, 4], [False, 5, 3]]),
1687+
("outer", [[True, 1, 4], [False, 5, 3]]),
1688+
("left", [[True, 1, 4], [False, 5, 3]]),
1689+
("right", [[False, 5, 3], [True, 1, 4]]),
1690+
],
1691+
)
1692+
def test_merge_bool_dtype(self, how, expected_data):
1693+
# GH#40073
1694+
df1 = DataFrame({"A": [True, False], "B": [1, 5]})
1695+
df2 = DataFrame({"A": [False, True], "C": [3, 4]})
1696+
result = merge(df1, df2, how=how)
1697+
expected = DataFrame(expected_data, columns=["A", "B", "C"])
1698+
tm.assert_frame_equal(result, expected)
1699+
16471700

16481701
@pytest.fixture
16491702
def left():

0 commit comments

Comments
 (0)