Skip to content

Commit 909af8b

Browse files
authored
BUG: unexpected merge_ordered results caused by wrongly groupby (#38170)
1 parent 40ca2b9 commit 909af8b

File tree

3 files changed

+32
-4
lines changed

3 files changed

+32
-4
lines changed

doc/source/whatsnew/v1.2.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -787,6 +787,8 @@ Reshaping
787787
- Bug in :func:`concat` resulting in a ``ValueError`` when at least one of both inputs had a non-unique index (:issue:`36263`)
788788
- Bug in :meth:`DataFrame.merge` and :meth:`pandas.merge` returning inconsistent ordering in result for ``how=right`` and ``how=left`` (:issue:`35382`)
789789
- Bug in :func:`merge_ordered` where a list-like ``left_by`` or ``right_by`` could not be handled (:issue:`35269`)
790+
- Bug in :func:`merge_ordered` returning a wrong join result when the length of ``left_by`` or ``right_by`` equals the number of rows of ``left`` or ``right`` (:issue:`38166`)
791+
- Bug in :func:`merge_ordered` not raising when elements of ``left_by`` or ``right_by`` do not exist in the ``left`` or ``right`` columns (:issue:`38167`)
790792

791793
Sparse
792794
^^^^^^

pandas/core/reshape/merge.py

+11-4
Original file line numberDiff line numberDiff line change
@@ -114,11 +114,8 @@ def _groupby_and_merge(by, on, left: "DataFrame", right: "DataFrame", merge_piec
114114

115115
# if we can groupby the rhs
116116
# then we can get vastly better perf
117-
118-
try:
117+
if all(item in right.columns for item in by):
119118
rby = right.groupby(by, sort=False)
120-
except KeyError:
121-
pass
122119

123120
for key, lhs in lby:
124121

@@ -274,10 +271,20 @@ def _merger(x, y):
274271
if left_by is not None and right_by is not None:
275272
raise ValueError("Can only group either left or right frames")
276273
elif left_by is not None:
274+
if isinstance(left_by, str):
275+
left_by = [left_by]
276+
check = set(left_by).difference(left.columns)
277+
if len(check) != 0:
278+
raise KeyError(f"{check} not found in left columns")
277279
result, _ = _groupby_and_merge(
278280
left_by, on, left, right, lambda x, y: _merger(x, y)
279281
)
280282
elif right_by is not None:
283+
if isinstance(right_by, str):
284+
right_by = [right_by]
285+
check = set(right_by).difference(right.columns)
286+
if len(check) != 0:
287+
raise KeyError(f"{check} not found in right columns")
281288
result, _ = _groupby_and_merge(
282289
right_by, on, right, left, lambda x, y: _merger(y, x)
283290
)

pandas/tests/reshape/merge/test_merge_ordered.py

+19
Original file line numberDiff line numberDiff line change
@@ -177,3 +177,22 @@ def test_list_type_by(self, left, right, on, left_by, right_by, expected):
177177
)
178178

179179
tm.assert_frame_equal(result, expected)
180+
181+
def test_left_by_length_equals_to_right_shape0(self):
182+
# GH 38166
183+
left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHT"))
184+
right = DataFrame([[2, 1]], columns=list("TE"))
185+
result = merge_ordered(left, right, on="T", left_by=["G", "H"])
186+
expected = DataFrame(
187+
{"G": ["g"] * 3, "H": ["h"] * 3, "T": [1, 2, 3], "E": [np.nan, 1.0, np.nan]}
188+
)
189+
190+
tm.assert_frame_equal(result, expected)
191+
192+
def test_elements_not_in_by_but_in_df(self):
193+
# GH 38167
194+
left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHT"))
195+
right = DataFrame([[2, 1]], columns=list("TE"))
196+
msg = r"\{'h'\} not found in left columns"
197+
with pytest.raises(KeyError, match=msg):
198+
merge_ordered(left, right, on="T", left_by=["G", "h"])

0 commit comments

Comments
 (0)