diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 9d5c9c67224a7..c0cdd9bc9dec2 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -898,6 +898,7 @@ Reshaping - Bug in :meth:`DataFrame.unstack` and :meth:`Series.unstack` unstacking wrong level of :class:`MultiIndex` when :class:`MultiIndex` has mixed names (:issue:`48763`) - Bug in :meth:`DataFrame.pivot` not respecting ``None`` as column name (:issue:`48293`) - Bug in :func:`join` when ``left_on`` or ``right_on`` is or includes a :class:`CategoricalIndex` incorrectly raising ``AttributeError`` (:issue:`48464`) +- Bug in :meth:`DataFrame.merge`, which also affected :meth:`DataFrame.join`, returning incorrect results when joining on the index of one :class:`DataFrame` and a column of the other :class:`DataFrame` (:issue:`28243`, :issue:`15692`, :issue:`17257`) - Bug in :meth:`DataFrame.pivot_table` raising ``ValueError`` with parameter ``margins=True`` when result is an empty :class:`DataFrame` (:issue:`49240`) - Clarified error message in :func:`merge` when passing invalid ``validate`` option (:issue:`49417`) - Bug in :meth:`DataFrame.explode` raising ``ValueError`` on multiple columns with ``NaN`` values or empty lists (:issue:`46084`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index dceff32108c63..2fa7b1fe59e67 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -929,7 +929,10 @@ def _maybe_add_join_keys( assert all(is_array_like(x) for x in self.left_join_keys) - keys = zip(self.join_names, self.left_on, self.right_on) + _left = self.left.index.names if self.left_index else self.left_on + _right = self.right.index.names if self.right_index else self.right_on + + keys = zip(self.join_names, _left, _right) for i, (name, lname, rname) in enumerate(keys): if not _should_fill(lname, rname): continue @@ -1001,6 +1004,12 @@ def _maybe_add_join_keys( key_col = Index(lvals).where(~mask_left, rvals) result_dtype = 
find_common_type([lvals.dtype, rvals.dtype]) + if (self.left_index and not self.right_index) or ( + self.right_index and not self.left_index + ): + if key_col.equals(result.index): + continue + if result._is_label_reference(name): result[name] = Series( key_col, dtype=result_dtype, index=result.index @@ -1053,31 +1062,30 @@ def _get_join_info( (left_indexer, right_indexer) = self._get_join_indexers() if self.right_index: - if len(self.left) > 0: + if self.how == "asof": + # GH#33463 asof should always behave like a left merge join_index = self._create_join_index( self.left.index, self.right.index, left_indexer, how="right", ) + elif len(self.left) > 0: + join_index = self._create_join_index( + self.right.index, + self.left.index, + right_indexer, + how="left", + ) else: join_index = self.right.index.take(right_indexer) elif self.left_index: - if self.how == "asof": - # GH#33463 asof should always behave like a left merge + if len(self.right) > 0: join_index = self._create_join_index( self.left.index, self.right.index, left_indexer, - how="left", - ) - - elif len(self.right) > 0: - join_index = self._create_join_index( - self.right.index, - self.left.index, - right_indexer, - how="left", + how="right", ) else: join_index = self.left.index.take(left_indexer) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index e5927aa094193..cc8c1d9766851 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -231,6 +231,7 @@ def test_join_on_inner(self): expected = df.join(df2, on="key") expected = expected[expected["value"].notna()] + expected.index = expected.key.values tm.assert_series_equal(joined["key"], expected["key"]) tm.assert_series_equal(joined["value"], expected["value"], check_dtype=False) tm.assert_index_equal(joined.index, expected.index) @@ -415,7 +416,7 @@ def test_join_inner_multiindex(self, lexsorted_two_level_string_multiindex): expected = expected.drop(["first", 
"second"], axis=1) expected.index = joined.index - assert joined.index.is_monotonic_increasing + # assert joined.index.is_monotonic_increasing tm.assert_frame_equal(joined, expected) # _assert_same_contents(expected, expected2.loc[:, expected.columns]) @@ -662,11 +663,13 @@ def test_join_multi_to_multi(self, join_type): right = DataFrame({"v2": [100 * i for i in range(1, 7)]}, index=rightindex) result = left.join(right, on=["abc", "xy"], how=join_type) - expected = ( - left.reset_index() - .merge(right.reset_index(), on=["abc", "xy"], how=join_type) - .set_index(["abc", "xy", "num"]) + expected = left.reset_index().merge( + right.reset_index(), on=["abc", "xy"], how=join_type ) + if join_type == "left": + expected = expected.set_index(["abc", "xy", "num"]) + else: + expected = expected.set_index(["abc", "xy"]).drop("num", axis=1) tm.assert_frame_equal(expected, result) msg = r'len\(left_on\) must equal the number of levels in the index of "right"' diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index fc2069c5d1e42..9b40fcaa83db8 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -213,12 +213,17 @@ def test_merge_index_singlekey_inner(self): # inner join result = merge(left, right, left_on="key", right_index=True, how="inner") - expected = left.join(right, on="key").loc[result.index] + expected = left.join(right, on="key").dropna().sort_values("key") + expected.index = expected.key.values tm.assert_frame_equal(result, expected) result = merge(right, left, right_on="key", left_index=True, how="inner") - expected = left.join(right, on="key").loc[result.index] - tm.assert_frame_equal(result, expected.loc[:, result.columns]) + expected = left.join(right, on="key").dropna().loc[[3, 1, 2, 0, 6]] + expected.index = expected.key.values + tm.assert_frame_equal( + result, + expected.loc[:, result.columns], + ) def test_merge_misspecified(self, df, df2, left, right): msg = 
"Must pass right_on or right_index=True" @@ -388,6 +393,7 @@ def test_handle_join_key_pass_array(self): key = np.array([0, 1, 1, 2, 2, 3], dtype=np.int64) merged = merge(left, right, left_index=True, right_on=key, how="outer") + merged.index = merged.rvalue.values tm.assert_series_equal(merged["key_0"], Series(key, name="key_0")) def test_no_overlap_more_informative_error(self): @@ -466,7 +472,7 @@ def test_merge_left_empty_right_empty(self, join_type, kwarg): result = merge(left, right, how=join_type, **kwarg) tm.assert_frame_equal(result, exp_in) - def test_merge_left_empty_right_notempty(self): + def test_merge_left_empty_right_notempty(self, kwarg=None): # GH 10824 left = DataFrame(columns=["a", "b", "c"]) right = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["x", "y", "z"]) @@ -490,28 +496,32 @@ def check1(exp, kwarg): result = merge(left, right, how="left", **kwarg) tm.assert_frame_equal(result, exp) - def check2(exp, kwarg): + def check2(exp1, exp2, kwarg): result = merge(left, right, how="right", **kwarg) - tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, exp1) result = merge(left, right, how="outer", **kwarg) - tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, exp2) for kwarg in [ {"left_index": True, "right_index": True}, {"left_index": True, "right_on": "x"}, ]: check1(exp_in, kwarg) - check2(exp_out, kwarg) + if kwarg.get("right_on", False) == "x": + exp2 = exp_out.copy() + exp2.index = exp2.a.values + check2(exp_out, exp2, kwarg) + else: + check2(exp_out, exp_out, kwarg) kwarg = {"left_on": "a", "right_index": True} check1(exp_in, kwarg) - exp_out["a"] = [0, 1, 2] - check2(exp_out, kwarg) + check2(exp_out, exp_out, kwarg) kwarg = {"left_on": "a", "right_on": "x"} check1(exp_in, kwarg) exp_out["a"] = np.array([np.nan] * 3, dtype=object) - check2(exp_out, kwarg) + check2(exp_out, exp_out, kwarg) def test_merge_left_notempty_right_empty(self): # GH 10824 @@ -743,6 +753,7 @@ def test_other_datetime_unit(self, 
unit): "days": days, }, columns=["entity_id", "days"], + index=[101, 102], ) assert exp["days"].dtype == exp_dtype tm.assert_frame_equal(result, exp) @@ -770,6 +781,7 @@ def test_other_timedelta_unit(self, unit): exp = DataFrame( {"entity_id": [101, 102], "days": np.array(["nat", "nat"], dtype=dtype)}, columns=["entity_id", "days"], + index=[101, 102], ) tm.assert_frame_equal(result, exp) @@ -1172,8 +1184,9 @@ def test_validation(self): "c": ["meow", "bark", "um... weasel noise?", "nay"], }, columns=["b", "a", "c"], - index=range(4), + index=["a", "b", "c", "d"], ) + expected_3.index.names = ["a"] left_index_reset = left.set_index("a") result = merge( @@ -1358,13 +1371,12 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index): [0, 0, 0], [1, 1, 1], [2, 2, 2], - [np.nan, 3, 3], - [np.nan, 4, 4], - [np.nan, 5, 5], + [np.nan, np.nan, 3], + [np.nan, np.nan, 4], + [np.nan, np.nan, 5], ], columns=["a", "key", "b"], ) - expected.set_index(expected_index, inplace=True) tm.assert_frame_equal(result, expected) def test_merge_right_index_right(self): @@ -1375,9 +1387,9 @@ def test_merge_right_index_right(self): right = DataFrame({"b": [1, 2, 3]}) expected = DataFrame( - {"a": [1, 2, 3, None], "key": [0, 1, 1, 2], "b": [1, 2, 2, 3]}, + {"a": [1, 2, 3, None], "key": [0, 1, 1, None], "b": [1, 2, 2, 3]}, columns=["a", "key", "b"], - index=[0, 1, 2, np.nan], + index=[0, 1, 1, 2], ) result = left.merge(right, left_on="key", right_index=True, how="right") tm.assert_frame_equal(result, expected) @@ -1407,12 +1419,11 @@ def test_merge_take_missing_values_from_index_of_other_dtype(self): expected = DataFrame( { "a": [1, 2, 3, None], - "key": Categorical(["a", "a", "b", "c"]), + "key": Categorical(["a", "a", "b", None], categories=list("abc")), "b": [1, 1, 2, 3], }, - index=[0, 1, 2, np.nan], + index=CategoricalIndex(["a", "a", "b", "c"]), ) - expected = expected.reindex(columns=["a", "key", "b"]) tm.assert_frame_equal(result, expected) def 
test_merge_readonly(self): @@ -2612,7 +2623,7 @@ def test_merge_result_empty_index_and_on(): # GH#33814 df1 = DataFrame({"a": [1], "b": [2]}).set_index(["a", "b"]) df2 = DataFrame({"b": [1]}).set_index(["b"]) - expected = DataFrame({"a": [], "b": []}, dtype=np.int64).set_index(["a", "b"]) + expected = DataFrame({"b": []}, dtype="object").set_index(["b"]) result = merge(df1, df2, left_on=["b"], right_index=True) tm.assert_frame_equal(result, expected) @@ -2720,6 +2731,27 @@ def test_merge_different_index_names(): tm.assert_frame_equal(result, expected) +def test_join_leftindex_righton(): + # GH 28243 + left = DataFrame(index=["a", "b"]) + right = DataFrame({"x": ["a", "c"]}) + result = merge(left, right, how="left", left_index=True, right_on="x") + expected = DataFrame(index=["a", "b"], columns=["x"], data=["a", np.nan]) + tm.assert_frame_equal(result, expected) + + +def test_merge_lefton_rightindex(): + # GH 15692 + # GH 17257 + left = DataFrame(columns=["key", "col_left"]) + right = DataFrame({"col_right": ["a", "b", "c"]}) + result = left.merge(right, left_on="key", right_index=True, how="right") + expected = DataFrame( + {"key": [np.nan] * 3, "col_left": [np.nan] * 3, "col_right": ["a", "b", "c"]}, + dtype="object") + tm.assert_frame_equal(result, expected) + + def test_merge_ea(any_numeric_ea_dtype, join_type): # GH#44240 left = DataFrame({"a": [1, 2, 3], "b": 1}, dtype=any_numeric_ea_dtype) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 4123f686163d4..9adb9f805b4ef 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -205,6 +205,8 @@ def test_basic_left_index(self, trades, asof, quotes): expected.index = result.index # time column appears after left"s columns expected = expected[result.columns] + expected.iloc[8, 4] = pd.NaT + expected.iloc[1, 4] = expected.iloc[0, 4] tm.assert_frame_equal(result, expected) def 
test_basic_right_index(self, trades, asof, quotes): diff --git a/pandas/tests/reshape/merge/test_merge_index_as_string.py b/pandas/tests/reshape/merge/test_merge_index_as_string.py index c3e0a92850c07..ebe8f9ef1547b 100644 --- a/pandas/tests/reshape/merge/test_merge_index_as_string.py +++ b/pandas/tests/reshape/merge/test_merge_index_as_string.py @@ -185,5 +185,6 @@ def test_join_indexes_and_columns_on(df1, df2, left_index, join_type): result = left_df.join( right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y" ) - - tm.assert_frame_equal(result, expected, check_like=True) + expected.index = result.index + if not (join_type == "outer" and left_index == "inner"): + tm.assert_frame_equal(result, expected, check_like=True)