Skip to content

ENH: accept None behaviour for suffixes in DataFrame.merge #25243

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Other Enhancements

- Indexing of ``DataFrame`` and ``Series`` now accepts zerodim ``np.ndarray`` (:issue:`24919`)
- :meth:`Timestamp.replace` now supports the ``fold`` argument to disambiguate DST transition times (:issue:`25017`)
- :function:`pandas.merge` now accepts ``None`` as input of suffixes (:issue:`25242`)
- :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :meth:`datetime.time` objects with timezones (:issue:`24043`)
- ``Series.str`` has gained :meth:`Series.str.casefold` method to removes all case distinctions present in a string (:issue:`25405`)
- :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`)
Expand Down
8 changes: 0 additions & 8 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,14 +273,6 @@
4 bar 2 bar 6
5 baz 3 baz 7

Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

did we previously accept False here? or was this added in the previous PR?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this was not added in previous PR.
IIUC previously the code was something like
if not lsuffix and not rsuffix, so False and None are both not allowed, and will raise the ValueError,
but since in this PR, we don't raise error or warnings when there are duplicate column names, so i removed this part of docstring. @jreback @jorisvandenbossche

Traceback (most recent call last):
...
ValueError: columns overlap but no suffix specified:
Index(['value'], dtype='object')
"""

# -----------------------------------------------------------------------
Expand Down
3 changes: 0 additions & 3 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1963,9 +1963,6 @@ def items_overlap_with_suffix(left, lsuffix, right, rsuffix):
if len(to_rename) == 0:
return left, right
else:
if not lsuffix and not rsuffix:
raise ValueError('columns overlap but no suffix specified: '
'{rename}'.format(rename=to_rename))

def renamer(x, suffix):
"""Rename the left and right indices.
Expand Down
8 changes: 5 additions & 3 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,13 +159,13 @@ def merge_ordered(left, right, on=None,
left DataFrame
fill_method : {'ffill', None}, default None
Interpolation method for data
suffixes : Sequence, default is ("_x", "_y")
suffixes : Sequence or None, default is ("_x", "_y")
A length-2 sequence where each element is optionally a string
indicating the suffix to add to overlapping column names in
`left` and `right` respectively. Pass a value of `None` instead
of a string to indicate that the column name from `left` or
`right` should be left as-is, with no suffix. At least one of the
values must not be None.
`right` should be left as-is, with no suffix. `None` means
keep both name of overlapping columns as-is.
.. versionchanged:: 0.25.0
how : {'left', 'right', 'outer', 'inner'}, default 'outer'
Expand Down Expand Up @@ -495,6 +495,8 @@ def __init__(self, left, right, how='inner', on=None,

self.copy = copy
self.suffixes = suffixes
if self.suffixes is None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can change the signature I think to suffixes=None

self.suffixes = (None, None)
self.sort = sort

self.left_index = left_index
Expand Down
6 changes: 0 additions & 6 deletions pandas/tests/frame/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,6 @@ def test_join_index(frame):
with pytest.raises(ValueError, match='join method'):
f.join(f2, how='foo')

# corner case - overlapping columns
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, this used to error. I am not sure I like this change. cc @jorisvandenbossche

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, this removal is made because in last PR we found out pandas don't explicitly refuse overlapping columns, e.g.

df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4], "b_x": [5, 6]})
df2 = pd.DataFrame({"a": [1, 2], "b": [4, 5]})
df1.merge(df2, on="a")

the output will still contain two "b_x" columns. @jorisvandenbossche @simonjayhawkins

msg = 'columns overlap but no suffix'
for how in ('outer', 'left', 'inner'):
with pytest.raises(ValueError, match=msg):
frame.join(frame, how=how)


def test_join_index_more(frame):
af = frame.loc[:, ['A', 'B']]
Expand Down
46 changes: 12 additions & 34 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1618,7 +1618,18 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm):
("a", "a", dict(suffixes=[None, "_x"]), ["a", "a_x"]),
(0, 0, dict(suffixes=["_a", None]), ["0_a", 0]),
("a", "a", dict(), ["a_x", "a_y"]),
(0, 0, dict(), ["0_x", "0_y"])
(0, 0, dict(), ["0_x", "0_y"]),
# accept 2-length None alike suffixes input
(0, 0, dict(suffixes=[None, None]), [0, 0]),
(0, 0, dict(suffixes=(None, '')), [0, '0']),
(0, 0, dict(suffixes=['', '']), ['0', '0']),
("a", "a", dict(suffixes=[None, None]), ["a", "a"]),
("a", "a", dict(suffixes=["", None]), ["a", "a"]),
("a", "a", dict(suffixes=(None, None)), ["a", "a"]),
("a", "a", dict(suffixes=('', '')), ["a", "a"]),
# accept None as suffixes
(0, 0, dict(suffixes=None), [0, 0]),
("a", "a", dict(suffixes=None), ["a", "a"])
])
def test_merge_suffix(col1, col2, kwargs, expected_cols):
# issue: 24782
Expand All @@ -1633,36 +1644,3 @@ def test_merge_suffix(col1, col2, kwargs, expected_cols):

result = pd.merge(a, b, left_index=True, right_index=True, **kwargs)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("col1, col2, suffixes", [
("a", "a", [None, None]),
("a", "a", (None, None)),
("a", "a", ("", None)),
(0, 0, [None, None]),
(0, 0, (None, ""))
])
def test_merge_suffix_error(col1, col2, suffixes):
# issue: 24782
a = pd.DataFrame({col1: [1, 2, 3]})
b = pd.DataFrame({col2: [3, 4, 5]})

# TODO: might reconsider current raise behaviour, see issue 24782
msg = "columns overlap but no suffix specified"
with pytest.raises(ValueError, match=msg):
pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes)


@pytest.mark.parametrize("col1, col2, suffixes", [
("a", "a", None),
(0, 0, None)
])
def test_merge_suffix_none_error(col1, col2, suffixes):
# issue: 24782
a = pd.DataFrame({col1: [1, 2, 3]})
b = pd.DataFrame({col2: [3, 4, 5]})

# TODO: might reconsider current raise behaviour, see GH24782
msg = "iterable"
with pytest.raises(TypeError, match=msg):
pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes)
6 changes: 0 additions & 6 deletions pandas/tests/reshape/merge/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,12 +508,6 @@ def test_join_multi_levels(self):
with pytest.raises(ValueError):
household.join(portfolio, how='inner')

portfolio2 = portfolio.copy()
portfolio2.index.set_names(['household_id', 'foo'])

with pytest.raises(ValueError):
portfolio2.join(portfolio, how='inner')

def test_join_multi_levels2(self):

# some more advanced merges
Expand Down