diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 17a830788be3f..4188fb8006dad 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -936,6 +936,7 @@ Reshaping - Bug in :meth:`DataFrame.replace` casts columns to ``object`` dtype if items in ``to_replace`` not in values (:issue:`32988`) - Ensure only named functions can be used in :func:`eval()` (:issue:`32460`) - Fixed bug in :func:`melt` where melting MultiIndex columns with ``col_level`` > 0 would raise a ``KeyError`` on ``id_vars`` (:issue:`34129`) +- Fixed bug setting wrong values in result when joining one side over index and other side over column in case of join type not equal to inner (:issue:`17257`, :issue:`28220`, :issue:`28243` and :issue:`33232`) Sparse ^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0c796c8f45a52..a207869de8484 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -787,7 +787,25 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): take_left, take_right = None, None if name in result: - + array_like = is_array_like(rname) or is_array_like(lname) + right_in, left_in = False, False + if not array_like: + right_in = ( + name in self.right_on + or self.right_index is True + and name in self.right.index.names + ) + left_in = ( + name in self.left_on + or self.left_index is True + and name in self.left.index.names + ) + if ( + (not right_in or not left_in) + and not array_like + and self.how != "asof" + ): + continue if left_indexer is not None and right_indexer is not None: if name in self.left: diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4408aa0bbce4a..860908cd97adb 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -484,12 +484,10 @@ def check2(exp, kwarg): kwarg = dict(left_on="a", right_index=True) check1(exp_in, kwarg) - exp_out["a"] = [0, 1, 2] check2(exp_out, kwarg) kwarg = dict(left_on="a", right_on="x") check1(exp_in, kwarg) - exp_out["a"] = np.array([np.nan] * 3, dtype=object) check2(exp_out, kwarg) def test_merge_left_notempty_right_empty(self): @@ -1294,9 +1292,9 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index): [0, 0, 0], [1, 1, 1], [2, 2, 2], - [np.nan, 3, 3], - [np.nan, 4, 4], - [np.nan, 5, 5], + [np.nan, np.nan, 3], + [np.nan, np.nan, 4], + [np.nan, np.nan, 5], ], columns=["a", "key", "b"], ) @@ -1311,7 +1309,7 @@ def test_merge_right_index_right(self): right = pd.DataFrame({"b": [1, 2, 3]}) expected = pd.DataFrame( - {"a": [1, 2, 3, None], "key": [0, 1, 1, 2], "b": [1, 2, 2, 3]}, + {"a": [1, 2, 3, None], "key": [0, 1, 1, None], "b": [1, 2, 2, 3]}, columns=["a", "key", "b"], index=[0, 1, 2, np.nan], ) @@ -1347,7 +1345,7 @@ def test_merge_take_missing_values_from_index_of_other_dtype(self): expected = pd.DataFrame( { "a": [1, 2, 3, None], - "key": pd.Categorical(["a", "a", "b", "c"]), + "key": pd.Categorical(["a", "a", "b", None], dtype=right.index.dtype), "b": [1, 1, 2, 3], }, index=[0, 1, 2, np.nan], @@ -2227,3 +2225,39 @@ def test_categorical_non_unique_monotonic(n_categories): index=left_index, ) tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize( + ("how", "data"), + [ + ("left", {"y": [1, 2], "x": ["a", np.nan]}), + ("right", {"y": [1, np.nan], "x": ["a", "c"]}), + ], +) +def test_index_true_and_on_combination_values(how, data): + # gh 28220 and gh 28243 and gh 17257 + left = pd.DataFrame({"y": [1, 2]}, index=["a", "b"]) + right = pd.DataFrame({"x": ["a", "c"]}) + + result = pd.merge(left, right, left_index=True, right_on="x", how=how) + expected = pd.DataFrame(data, index=result.index) + tm.assert_frame_equal(result, expected) + + +def test_index_true_outer_join(): + # gh 33232 and gh 17257 + left = pd.DataFrame({"lkey": [0, 3], "value": [1, 5]}) + right = pd.DataFrame({"rkey": ["foo", "baz"], "value": [0, 1]}) + + result = pd.merge(left, right, how="outer", left_on="lkey", right_index=True) + + expected = pd.DataFrame( + { + "lkey": [0, 3, np.nan], + "value_x": [1, 5, np.nan], + "rkey": ["foo", np.nan, "baz"], + "value_y": [0, np.nan, 1], + }, + index=[0, 1, np.nan], + ) + tm.assert_frame_equal(result, expected)