diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3fab4850dd1ec..68a5641aec5e1 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -756,6 +756,8 @@ Reshaping - Bug in :func:`concat` resulting in a ``ValueError`` when at least one of both inputs had a non-unique index (:issue:`36263`) - Bug in :meth:`DataFrame.merge` and :meth:`pandas.merge` returning inconsistent ordering in result for ``how=right`` and ``how=left`` (:issue:`35382`) - Bug in :func:`merge_ordered` couldn't handle list-like ``left_by`` or ``right_by`` (:issue:`35269`) +- Bug in :func:`merge_ordered` returned wrong join result when length of ``left_by`` or ``right_by`` equals to the rows of ``left`` or ``right`` (:issue:`38166`) +- Bug in :func:`merge_ordered` didn't raise when elements in ``left_by`` or ``right_by`` not exist in ``left`` columns or ``right`` columns (:issue:`38167`) Sparse ^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 545117dd84f93..2c6cdb846221f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -114,11 +114,8 @@ def _groupby_and_merge(by, on, left: "DataFrame", right: "DataFrame", merge_piec # if we can groupby the rhs # then we can get vastly better perf - - try: + if all(item in right.columns for item in by): rby = right.groupby(by, sort=False) - except KeyError: - pass for key, lhs in lby: @@ -274,10 +271,20 @@ def _merger(x, y): if left_by is not None and right_by is not None: raise ValueError("Can only group either left or right frames") elif left_by is not None: + if isinstance(left_by, str): + left_by = [left_by] + check = set(left_by).difference(left.columns) + if len(check) != 0: + raise KeyError(f"{check} not found in left columns") result, _ = _groupby_and_merge( left_by, on, left, right, lambda x, y: _merger(x, y) ) elif right_by is not None: + if isinstance(right_by, str): + right_by = [right_by] + check = set(right_by).difference(right.columns) + if len(check) != 0: + raise KeyError(f"{check} not found in right columns") result, _ = _groupby_and_merge( right_by, on, right, left, lambda x, y: _merger(y, x) ) diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index 8389a6bb9be10..4a70719df5c57 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -177,3 +177,22 @@ def test_list_type_by(self, left, right, on, left_by, right_by, expected): ) tm.assert_frame_equal(result, expected) + + def test_left_by_length_equals_to_right_shape0(self): + # GH 38166 + left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHT")) + right = DataFrame([[2, 1]], columns=list("TE")) + result = merge_ordered(left, right, on="T", left_by=["G", "H"]) + expected = DataFrame( + {"G": ["g"] * 3, "H": ["h"] * 3, "T": [1, 2, 3], "E": [np.nan, 1.0, np.nan]} + ) + + tm.assert_frame_equal(result, expected) + + def test_elements_not_in_by_but_in_df(self): + # GH 38167 + left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHT")) + right = DataFrame([[2, 1]], columns=list("TE")) + msg = r"\{'h'\} not found in left columns" + with pytest.raises(KeyError, match=msg): + merge_ordered(left, right, on="T", left_by=["G", "h"])