From ef2d11f4b18f3680edfec4e60f96f9d6131934e2 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 31 May 2020 15:41:45 +0200 Subject: [PATCH 1/5] BUG: Fix wrong values when joining via index and column for left, right, outer --- pandas/core/reshape/merge.py | 54 +++++++++++++++--------- pandas/tests/reshape/merge/test_merge.py | 48 ++++++++++++++++++--- 2 files changed, 76 insertions(+), 26 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0c796c8f45a52..91f2f583b8754 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -787,33 +787,49 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): take_left, take_right = None, None if name in result: + array_like = is_array_like(rname) or is_array_like(lname) + right_in, left_in = False, False + if not array_like: + right_in = ( + name in self.right_on + or self.right_index is True + and name in self.right.index.names + ) + left_in = ( + name in self.left_on + or self.left_index is True + and name in self.left.index.names + ) + if right_in and left_in or array_like or self.how == "asof": - if left_indexer is not None and right_indexer is not None: - if name in self.left: + if left_indexer is not None and right_indexer is not None: + if name in self.left: - if left_has_missing is None: - left_has_missing = (left_indexer == -1).any() + if left_has_missing is None: + left_has_missing = (left_indexer == -1).any() - if left_has_missing: - take_right = self.right_join_keys[i] + if left_has_missing: + take_right = self.right_join_keys[i] - if not is_dtype_equal( - result[name].dtype, self.left[name].dtype - ): - take_left = self.left[name]._values + if not is_dtype_equal( + result[name].dtype, self.left[name].dtype + ): + take_left = self.left[name]._values - elif name in self.right: + elif name in self.right: - if right_has_missing is None: - right_has_missing = (right_indexer == -1).any() + if right_has_missing is None: + right_has_missing = (right_indexer == -1).any() - if right_has_missing: - take_left = self.left_join_keys[i] + if right_has_missing: + take_left = self.left_join_keys[i] - if not is_dtype_equal( - result[name].dtype, self.right[name].dtype - ): - take_right = self.right[name]._values + if not is_dtype_equal( + result[name].dtype, self.right[name].dtype + ): + take_right = self.right[name]._values + else: + continue elif left_indexer is not None and is_array_like(self.left_join_keys[i]): take_left = self.left_join_keys[i] diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4408aa0bbce4a..860908cd97adb 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -484,12 +484,10 @@ def check2(exp, kwarg): kwarg = dict(left_on="a", right_index=True) check1(exp_in, kwarg) - exp_out["a"] = [0, 1, 2] check2(exp_out, kwarg) kwarg = dict(left_on="a", right_on="x") check1(exp_in, kwarg) - exp_out["a"] = np.array([np.nan] * 3, dtype=object) check2(exp_out, kwarg) def test_merge_left_notempty_right_empty(self): @@ -1294,9 +1292,9 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index): [0, 0, 0], [1, 1, 1], [2, 2, 2], - [np.nan, 3, 3], - [np.nan, 4, 4], - [np.nan, 5, 5], + [np.nan, np.nan, 3], + [np.nan, np.nan, 4], + [np.nan, np.nan, 5], ], columns=["a", "key", "b"], ) @@ -1311,7 +1309,7 @@ def test_merge_right_index_right(self): right = pd.DataFrame({"b": [1, 2, 3]}) expected = pd.DataFrame( - {"a": [1, 2, 3, None], "key": [0, 1, 1, 2], "b": [1, 2, 2, 3]}, + {"a": [1, 2, 3, None], "key": [0, 1, 1, None], "b": [1, 2, 2, 3]}, columns=["a", "key", "b"], index=[0, 1, 2, np.nan], ) @@ -1347,7 +1345,7 @@ def test_merge_take_missing_values_from_index_of_other_dtype(self): expected = pd.DataFrame( { "a": [1, 2, 3, None], - "key": pd.Categorical(["a", "a", "b", "c"]), + "key": pd.Categorical(["a", "a", "b", None], dtype=right.index.dtype), "b": [1, 1, 2, 3], }, index=[0, 1, 2, np.nan], @@ -2227,3 +2225,39 @@ def test_categorical_non_unique_monotonic(n_categories): index=left_index, ) tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize( + ("how", "data"), + [ + ("left", {"y": [1, 2], "x": ["a", np.nan]}), + ("right", {"y": [1, np.nan], "x": ["a", "c"]}), + ], +) +def test_index_true_and_on_combination_values(how, data): + # gh 28220 and gh 28243 and gh 17257 + left = pd.DataFrame({"y": [1, 2]}, index=["a", "b"]) + right = pd.DataFrame({"x": ["a", "c"]}) + + result = pd.merge(left, right, left_index=True, right_on="x", how=how) + expected = pd.DataFrame(data, index=result.index) + tm.assert_frame_equal(result, expected) + + +def test_index_true_outer_join(): + # gh 33232 and gh 17257 + left = pd.DataFrame({"lkey": [0, 3], "value": [1, 5]}) + right = pd.DataFrame({"rkey": ["foo", "baz"], "value": [0, 1]}) + + result = pd.merge(left, right, how="outer", left_on="lkey", right_index=True) + + expected = pd.DataFrame( + { + "lkey": [0, 3, np.nan], + "value_x": [1, 5, np.nan], + "rkey": ["foo", np.nan, "baz"], + "value_y": [0, np.nan, 1], + }, + index=[0, 1, np.nan], + ) + tm.assert_frame_equal(result, expected) From f1f564714e911d3ed63ae2497747f19db03b3832 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 31 May 2020 15:44:14 +0200 Subject: [PATCH 2/5] Add whats new entry --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 17a830788be3f..86a25feabd26d 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -936,6 +936,7 @@ Reshaping - Bug in :meth:`DataFrame.replace` casts columns to ``object`` dtype if items in ``to_replace`` not in values (:issue:`32988`) - Ensure only named functions can be used in :func:`eval()` (:issue:`32460`) - Fixed bug in :func:`melt` where melting MultiIndex columns with ``col_level`` > 0 would raise a ``KeyError`` on ``id_vars`` (:issue:`34129`) +- Fixed bug setting wrong values in result when joining one side over index and other side over column in case of join type not equal to inner (:issue:`17257`, :issue`28220`, :issue:`28243`, :issue:`33232`) Sparse ^^^^^^ From 3a5a29248d1c687e0b14a97c54d5a4e2d6e0b6b2 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 31 May 2020 16:26:26 +0200 Subject: [PATCH 3/5] Fix whats new typo --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 86a25feabd26d..4188fb8006dad 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -936,7 +936,7 @@ Reshaping - Bug in :meth:`DataFrame.replace` casts columns to ``object`` dtype if items in ``to_replace`` not in values (:issue:`32988`) - Ensure only named functions can be used in :func:`eval()` (:issue:`32460`) - Fixed bug in :func:`melt` where melting MultiIndex columns with ``col_level`` > 0 would raise a ``KeyError`` on ``id_vars`` (:issue:`34129`) -- Fixed bug setting wrong values in result when joining one side over index and other side over column in case of join type not equal to inner (:issue:`17257`, :issue`28220`, :issue:`28243`, :issue:`33232`) +- Fixed bug setting wrong values in result when joining one side over index and other side over column in case of join type not equal to inner (:issue:`17257`, :issue:`28220`, :issue:`28243` and :issue:`33232`) Sparse ^^^^^^ From a3e6623dcba740bf1a45111257dcb0e082049e7c Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 31 May 2020 18:14:10 +0200 Subject: [PATCH 4/5] Change if condition --- pandas/core/reshape/merge.py | 44 +++++++++++++++++------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 91f2f583b8754..8a103a857b0e8 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -800,36 +800,34 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): or self.left_index is True and name in self.left.index.names ) - if right_in and left_in or array_like or self.how == "asof": - - if left_indexer is not None and right_indexer is not None: - if name in self.left: + if (not right_in or not left_in) and not array_like and self.how != "asof": + continue + if left_indexer is not None and right_indexer is not None: + if name in self.left: - if left_has_missing is None: - left_has_missing = (left_indexer == -1).any() + if left_has_missing is None: + left_has_missing = (left_indexer == -1).any() - if left_has_missing: - take_right = self.right_join_keys[i] + if left_has_missing: + take_right = self.right_join_keys[i] - if not is_dtype_equal( - result[name].dtype, self.left[name].dtype - ): - take_left = self.left[name]._values + if not is_dtype_equal( + result[name].dtype, self.left[name].dtype + ): + take_left = self.left[name]._values - elif name in self.right: + elif name in self.right: - if right_has_missing is None: - right_has_missing = (right_indexer == -1).any() + if right_has_missing is None: + right_has_missing = (right_indexer == -1).any() - if right_has_missing: - take_left = self.left_join_keys[i] + if right_has_missing: + take_left = self.left_join_keys[i] - if not is_dtype_equal( - result[name].dtype, self.right[name].dtype - ): - take_right = self.right[name]._values - else: - continue + if not is_dtype_equal( + result[name].dtype, self.right[name].dtype + ): + take_right = self.right[name]._values elif left_indexer is not None and is_array_like(self.left_join_keys[i]): take_left = self.left_join_keys[i] From e759612ebf9d11332f66701a85da4cf8ad3ecf82 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 31 May 2020 18:17:59 +0200 Subject: [PATCH 5/5] Run black pandas --- pandas/core/reshape/merge.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8a103a857b0e8..a207869de8484 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -800,7 +800,11 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): or self.left_index is True and name in self.left.index.names ) - if (not right_in or not left_in) and not array_like and self.how != "asof": + if ( + (not right_in or not left_in) + and not array_like + and self.how != "asof" + ): continue if left_indexer is not None and right_indexer is not None: if name in self.left: