Skip to content

Commit e25acb9

Browse files
Backport PR #43152 on branch 1.3.x (BUG: Outer/right merge with EA dtypes cast to object) (#43389)
1 parent 363fd40 commit e25acb9

File tree

3 files changed

+69
-10
lines changed

3 files changed

+69
-10
lines changed

doc/source/whatsnew/v1.3.3.rst

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ Fixed regressions
1717
- Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` and length-one list of :class:`Timestamp` (:issue:`42810`)
1818
- Performance regression in :meth:`core.window.ewm.ExponentialMovingWindow.mean` (:issue:`42333`)
1919
- Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`)
20+
- Fixed regression in :meth:`merge` where ``on`` columns with ``ExtensionDtype`` or ``bool`` data types were cast to ``object`` in ``right`` and ``outer`` merges (:issue:`40073`)
2021
- Fixed regression in :meth:`RangeIndex.where` and :meth:`RangeIndex.putmask` raising ``AssertionError`` when result did not represent a :class:`RangeIndex` (:issue:`43240`)
2122
- Fixed regression in :meth:`read_parquet` where the ``fastparquet`` engine would not work properly with fastparquet 0.7.0 (:issue:`43075`)
2223

pandas/core/reshape/merge.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@
6969
Categorical,
7070
Index,
7171
MultiIndex,
72+
Series,
7273
)
7374
from pandas.core import groupby
7475
import pandas.core.algorithms as algos
@@ -80,10 +81,7 @@
8081
from pandas.core.sorting import is_int64_overflow_possible
8182

8283
if TYPE_CHECKING:
83-
from pandas import (
84-
DataFrame,
85-
Series,
86-
)
84+
from pandas import DataFrame
8785
from pandas.core.arrays import DatetimeArray
8886

8987

@@ -903,17 +901,22 @@ def _maybe_add_join_keys(
903901
# error: Item "bool" of "Union[Any, bool]" has no attribute "all"
904902
if mask_left.all(): # type: ignore[union-attr]
905903
key_col = Index(rvals)
904+
result_dtype = rvals.dtype
906905
# error: Item "bool" of "Union[Any, bool]" has no attribute "all"
907906
elif (
908907
right_indexer is not None
909908
and mask_right.all() # type: ignore[union-attr]
910909
):
911910
key_col = Index(lvals)
911+
result_dtype = lvals.dtype
912912
else:
913913
key_col = Index(lvals).where(~mask_left, rvals)
914+
result_dtype = lvals.dtype
914915

915916
if result._is_label_reference(name):
916-
result[name] = key_col
917+
result[name] = Series(
918+
key_col, dtype=result_dtype, index=result.index
919+
)
917920
elif result._is_level_reference(name):
918921
if isinstance(result.index, MultiIndex):
919922
key_col.name = name

pandas/tests/reshape/merge/test_merge.py

+60-5
Original file line numberDiff line numberDiff line change
@@ -354,8 +354,8 @@ def test_merge_join_key_dtype_cast(self):
354354
df = merge(df1, df2, how="outer")
355355

356356
# GH13169
357-
# this really should be bool
358-
assert df["key"].dtype == "object"
357+
# GH#40073
358+
assert df["key"].dtype == "bool"
359359

360360
df1 = DataFrame({"val": [1]})
361361
df2 = DataFrame({"val": [2]})
@@ -366,10 +366,12 @@ def test_merge_join_key_dtype_cast(self):
366366

367367
def test_handle_join_key_pass_array(self):
368368
left = DataFrame(
369-
{"key": [1, 1, 2, 2, 3], "value": np.arange(5)}, columns=["value", "key"]
369+
{"key": [1, 1, 2, 2, 3], "value": np.arange(5)},
370+
columns=["value", "key"],
371+
dtype="int64",
370372
)
371-
right = DataFrame({"rvalue": np.arange(6)})
372-
key = np.array([1, 1, 2, 3, 4, 5])
373+
right = DataFrame({"rvalue": np.arange(6)}, dtype="int64")
374+
key = np.array([1, 1, 2, 3, 4, 5], dtype="int64")
373375

374376
merged = merge(left, right, left_on="key", right_on=key, how="outer")
375377
merged2 = merge(right, left, left_on=key, right_on="key", how="outer")
@@ -1642,6 +1644,59 @@ def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals):
16421644
with pytest.raises(ValueError, match=msg):
16431645
merge(df2, df1, on=["A"])
16441646

1647+
@pytest.mark.parametrize(
1648+
"expected_data, how",
1649+
[
1650+
([1, 2], "outer"),
1651+
([], "inner"),
1652+
([2], "right"),
1653+
([1], "left"),
1654+
],
1655+
)
1656+
def test_merge_EA_dtype(self, any_nullable_numeric_dtype, how, expected_data):
1657+
# GH#40073
1658+
d1 = DataFrame([(1,)], columns=["id"], dtype=any_nullable_numeric_dtype)
1659+
d2 = DataFrame([(2,)], columns=["id"], dtype=any_nullable_numeric_dtype)
1660+
result = merge(d1, d2, how=how)
1661+
expected = DataFrame(
1662+
expected_data, columns=["id"], dtype=any_nullable_numeric_dtype
1663+
)
1664+
tm.assert_frame_equal(result, expected)
1665+
1666+
@pytest.mark.parametrize(
1667+
"expected_data, how",
1668+
[
1669+
(["a", "b"], "outer"),
1670+
([], "inner"),
1671+
(["b"], "right"),
1672+
(["a"], "left"),
1673+
],
1674+
)
1675+
def test_merge_string_dtype(self, how, expected_data, any_string_dtype):
1676+
# GH#40073
1677+
d1 = DataFrame([("a",)], columns=["id"], dtype=any_string_dtype)
1678+
d2 = DataFrame([("b",)], columns=["id"], dtype=any_string_dtype)
1679+
result = merge(d1, d2, how=how)
1680+
expected = DataFrame(expected_data, columns=["id"], dtype=any_string_dtype)
1681+
tm.assert_frame_equal(result, expected)
1682+
1683+
@pytest.mark.parametrize(
1684+
"how, expected_data",
1685+
[
1686+
("inner", [[True, 1, 4], [False, 5, 3]]),
1687+
("outer", [[True, 1, 4], [False, 5, 3]]),
1688+
("left", [[True, 1, 4], [False, 5, 3]]),
1689+
("right", [[False, 5, 3], [True, 1, 4]]),
1690+
],
1691+
)
1692+
def test_merge_bool_dtype(self, how, expected_data):
1693+
# GH#40073
1694+
df1 = DataFrame({"A": [True, False], "B": [1, 5]})
1695+
df2 = DataFrame({"A": [False, True], "C": [3, 4]})
1696+
result = merge(df1, df2, how=how)
1697+
expected = DataFrame(expected_data, columns=["A", "B", "C"])
1698+
tm.assert_frame_equal(result, expected)
1699+
16451700

16461701
@pytest.fixture
16471702
def left():

0 commit comments

Comments
 (0)