Skip to content

Commit 4e755cb

Browse files
phoflJulianWgs
authored and committed
Deprecate suffixes in merge producing duplicate columns (pandas-dev#40991)
1 parent 0bbf980 commit 4e755cb

File tree

4 files changed

+59
-2
lines changed

4 files changed

+59
-2
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -603,6 +603,7 @@ Deprecations
603603
- Deprecated support for ``np.ma.mrecords.MaskedRecords`` in the :class:`DataFrame` constructor, pass ``{name: data[name] for name in data.dtype.names}`` instead (:issue:`40363`)
604604
- Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`)
605605
- Deprecated the ``level`` keyword for :class:`DataFrame` and :class:`Series` aggregations; use groupby instead (:issue:`39983`)
606+
- Deprecated :func:`merge` producing duplicated columns when a label generated through the ``suffixes`` keyword collides with an already existing column (:issue:`22818`)
606607

607608
.. ---------------------------------------------------------------------------
608609

pandas/core/reshape/merge.py

+19-1
Original file line numberDiff line numberDiff line change
@@ -2311,4 +2311,22 @@ def renamer(x, suffix):
23112311
lrenamer = partial(renamer, suffix=lsuffix)
23122312
rrenamer = partial(renamer, suffix=rsuffix)
23132313

2314-
return (left._transform_index(lrenamer), right._transform_index(rrenamer))
2314+
llabels = left._transform_index(lrenamer)
2315+
rlabels = right._transform_index(rrenamer)
2316+
2317+
dups = []
2318+
if not llabels.is_unique:
2319+
# Only warn when duplicates are caused because of suffixes, already duplicated
2320+
# columns in origin should not warn
2321+
dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist()
2322+
if not rlabels.is_unique:
2323+
dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist())
2324+
if dups:
2325+
warnings.warn(
2326+
f"Passing 'suffixes' which cause duplicate columns {set(dups)} in the "
2327+
f"result is deprecated and will raise a MergeError in a future version.",
2328+
FutureWarning,
2329+
stacklevel=4,
2330+
)
2331+
2332+
return llabels, rlabels

pandas/tests/reshape/merge/test_join.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -629,7 +629,8 @@ def test_join_dups(self):
629629
dta = x.merge(y, left_index=True, right_index=True).merge(
630630
z, left_index=True, right_index=True, how="outer"
631631
)
632-
dta = dta.merge(w, left_index=True, right_index=True)
632+
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
633+
dta = dta.merge(w, left_index=True, right_index=True)
633634
expected = concat([x, y, z, w], axis=1)
634635
expected.columns = ["x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y"]
635636
tm.assert_frame_equal(dta, expected)

pandas/tests/reshape/merge/test_merge.py

+37
Original file line numberDiff line numberDiff line change
@@ -2409,3 +2409,40 @@ def test_merge_result_empty_index_and_on():
24092409

24102410
result = merge(df2, df1, left_index=True, right_on=["b"])
24112411
tm.assert_frame_equal(result, expected)
2412+
2413+
2414+
def test_merge_suffixes_produce_dup_columns_warns():
2415+
# GH#22818
2416+
left = DataFrame({"a": [1, 2, 3], "b": 1, "b_x": 2})
2417+
right = DataFrame({"a": [1, 2, 3], "b": 2})
2418+
expected = DataFrame(
2419+
[[1, 1, 2, 2], [2, 1, 2, 2], [3, 1, 2, 2]], columns=["a", "b_x", "b_x", "b_y"]
2420+
)
2421+
with tm.assert_produces_warning(FutureWarning):
2422+
result = merge(left, right, on="a")
2423+
tm.assert_frame_equal(result, expected)
2424+
2425+
with tm.assert_produces_warning(FutureWarning):
2426+
merge(right, left, on="a", suffixes=("_y", "_x"))
2427+
tm.assert_frame_equal(result, expected)
2428+
2429+
2430+
def test_merge_duplicate_columns_with_suffix_no_warning():
    """Check that pre-existing duplicate columns do not trigger the warning.

    ``left`` already contains two ``"b"`` columns, so the duplicated
    ``"b_x"`` labels in the result stem from the input rather than from a
    collision introduced by the suffixes.
    """
    # GH#22818
    # Do not raise warning when duplicates are caused by duplicates in origin
    left = DataFrame([[1, 1, 1], [2, 2, 2]], columns=["a", "b", "b"])
    right = DataFrame({"a": [1, 3], "b": 2})
    # Assert the absence of warnings explicitly; without this guard a spurious
    # FutureWarning would go unnoticed and the test would still pass.
    with tm.assert_produces_warning(None):
        result = merge(left, right, on="a")
    expected = DataFrame([[1, 1, 1, 2]], columns=["a", "b_x", "b_x", "b_y"])
    tm.assert_frame_equal(result, expected)
2438+
2439+
2440+
def test_merge_duplicate_columns_with_suffix_causing_another_duplicate():
2441+
# GH#22818
2442+
# This should raise warning because suffixes cause another collision
2443+
left = DataFrame([[1, 1, 1, 1], [2, 2, 2, 2]], columns=["a", "b", "b", "b_x"])
2444+
right = DataFrame({"a": [1, 3], "b": 2})
2445+
with tm.assert_produces_warning(FutureWarning):
2446+
result = merge(left, right, on="a")
2447+
expected = DataFrame([[1, 1, 1, 1, 2]], columns=["a", "b_x", "b_x", "b_x", "b_y"])
2448+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)