diff --git a/doc/source/getting_started/comparison/includes/merge.rst b/doc/source/getting_started/comparison/includes/merge.rst index b8e3f54fd132b..55cd352cf951f 100644 --- a/doc/source/getting_started/comparison/includes/merge.rst +++ b/doc/source/getting_started/comparison/includes/merge.rst @@ -15,3 +15,12 @@ data does not have to be sorted ahead of time, and different join types are acco outer_join = df1.merge(df2, on=["key"], how="outer") outer_join + + anti_left_join = df1.merge(df2, on=["key"], how="anti_left") + anti_left_join + + anti_right_join = df1.merge(df2, on=["key"], how="anti_right") + anti_right_join + + anti_full_join = df1.merge(df2, on=["key"], how="anti_full") + anti_full_join diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c31c4eb460209..93d95b8e4e9d7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -277,7 +277,8 @@ ----------%s right : DataFrame or named Series Object to merge with. -how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' +how : {'left', 'right', 'outer', 'inner', 'cross', 'anti_left', \ +'anti_right', 'anti_full'}, default 'inner' Type of merge to be performed. * left: use only keys from left frame, similar to a SQL left outer join; @@ -292,6 +293,15 @@ of the left keys. .. versionadded:: 1.2.0 + * anti_left: use only keys from left frame that are absent in right + frame; preserve key order. + * anti_right: use keys from the right frame that are absent in the + left frame; preserve key order. + * anti_full: use keys from the right frame that are absent in the + left frame, and the keys in the left frame that are absent in the + right frame; sort keys lexicographically. + + .. versionadded:: 1.4.0 on : label or list Column or index level names to join on. These must be found in both @@ -451,6 +461,33 @@ 1 foo 8 2 bar 7 3 bar 8 + +>>> df1 = pd.DataFrame({"A": [1, 2, 3], "C": [5, 6, 7]}) +>>> df2 = pd. 
DataFrame({"B": [1, 2, 4], "C": [7, 8, 9]}) +>>> df1 + A C +0 1 5 +1 2 6 +2 3 7 +>>> df2 + B C +0 1 7 +1 2 8 +2 4 9 +>>> df1.merge(df2, on="C", how="anti_left") + A C B +0 1 5 NaN +1 2 6 NaN +>>> df1.merge(df2, on="C", how="anti_right") + A C B +0 NaN 8 2 +1 NaN 9 4 +>>> df1.merge(df2, on="C", how="anti_full") + A C B +0 1.0 5 NaN +1 2.0 6 NaN +2 NaN 8 2.0 +3 NaN 9 4.0 """ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 3a39713f18d65..fa938361fdf8b 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -690,6 +690,8 @@ def __init__( cross_col, ) = self._create_cross_configuration(self.left, self.right) self.left_on = self.right_on = [cross_col] + elif self.how in ["anti_left", "anti_right", "anti_full"]: + self.left, self.right, self.how = self._anti_join_update() self._cross = cross_col # note this function has side effects @@ -743,6 +745,46 @@ def get_result(self) -> DataFrame: return result.__finalize__(self, method="merge") + def _anti_join_update(self): + """ + Converts `anti_left`, `anti_right` and `anti_full` configurations into `left`, + `right` and `outer` join configurations. + Calls `_anti_helper` with the indices or columns to be merged on. 
+ """ + if self.left_index and self.right_index: + # Merge using `right_index` and `left_index` + join_index_l, join_index_r, self.how = _anti_helper( + self.left.index, self.right.index, self.how + ) + elif self.on is not None or ( + None not in self.left_on and None not in self.right_on + ): + # Merge using `on` or `left_on` and `right_on` + if self.on is not None: + left_on = right_on = self.on + else: + left_on = self.left_on + right_on = self.right_on + join_index_l, join_index_r, self.how = _anti_helper( + self.left[left_on], self.right[right_on], self.how + ) + elif self.left_index and self.right_on is not None: + # Merge using `left_index` and `right_on` + join_index_l, join_index_r, self.how = _anti_helper( + self.left.index, self.right[self.right_on], self.how + ) + elif self.right_index and self.left_on is not None: + # Merge using `left_on` and `right_index` + join_index_l, join_index_r, self.how = _anti_helper( + self.left[self.left_on], self.right.index, self.how + ) + self.left = self.left.loc[join_index_l] + self.right = self.right.loc[join_index_r] + + # sanity check to ensure correct `how` + assert self.how in ["left", "right", "inner", "outer"] + return (self.left, self.right, self.how) + def _maybe_drop_cross_column( self, result: DataFrame, cross_col: str | None ) -> None: @@ -1453,6 +1495,59 @@ def _validate(self, validate: str) -> None: raise ValueError("Not a valid argument for validate") +def _anti_helper( + _left: Index | DataFrame, + _right: Index | DataFrame, + _how: str, +) -> tuple[npt.NDArray, npt.NDArray, str]: + """ + Converts `anti_left`, `anti_right` and `anti_full` configurations into `left`, + `right` and `outer` join configurations. + + Parameters + ---------- + _left : Index or DataFrame + Left key columns if merged with `on` or `left_on`/`right_on`, else left index + _right : Index or DataFrame + Right key columns if merged with `on` or `left_on`/`right_on`, else right index + _how : {'anti_left', 'anti_right', 'anti_full'} + + Returns + 
------- + np.ndarray[bool] + Boolean mask of the left keys that are absent from the right keys + np.ndarray[bool] + Boolean mask of the right keys that are absent from the left keys + {"left", "right", "outer"} + Native join type equivalent to the requested anti join + + """ + + # If not already an Index, convert the key columns into an Index + # (single column) or a MultiIndex (multiple columns) + if not isinstance(_left, Index): + if len(_left.columns) == 1: + _left = Index(_left.values.flatten(), dtype=_left.dtypes[0]) + else: + _left = MultiIndex.from_frame(_left) + if not isinstance(_right, Index): + if len(_right.columns) == 1: + _right = Index(_right.values.flatten(), dtype=_right.dtypes[0]) + else: + _right = MultiIndex.from_frame(_right) + + how_dict: dict[str, str] = { + "anti_left": "left", + "anti_right": "right", + "anti_full": "outer", + } + _how = how_dict[_how] + + join_index_l = ~_left.isin(_right) + join_index_r = ~_right.isin(_left) + return (join_index_l, join_index_r, _how) + + def get_join_indexers( left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: diff --git a/pandas/tests/reshape/merge/test_merge_anti.py b/pandas/tests/reshape/merge/test_merge_anti.py new file mode 100644 index 0000000000000..3ca4887706965 --- /dev/null +++ b/pandas/tests/reshape/merge/test_merge_anti.py @@ -0,0 +1,504 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, +) +import pandas._testing as tm +from pandas.core.reshape.merge import merge + + +class Test_AntiJoin: + @pytest.mark.parametrize( + "how, exp_index, exp_values", + [ + ("anti_left", ["c"], [[3, 30, np.nan, np.nan]]), + ("anti_right", ["d"], [[np.nan, np.nan, 4, 40]]), + ( + "anti_full", + ["c", "d"], + [[3, 30, np.nan, np.nan], [np.nan, np.nan, 4, 40]], + ), + ], + ) + def test_basic_anti_index(self, how, exp_index, exp_values): + # basic test containing NaNs w/o on param + left = DataFrame({"A": [1, 2, 3], "C": [10, 20, 30]}, index=["a", "b", "c"]) + right = DataFrame({"B": [1, 2, 4], "C": [10, 20, 40]}, index=["a", "b", "d"]) + 
expected = DataFrame( + exp_values, index=exp_index, columns=["A", "C_x", "B", "C_y"] + ) + result = merge(left, right, how=how, left_index=True, right_index=True) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "on, how, data", + [ + ( + ["C"], + "anti_left", + [[1, 5, np.nan], [2, 6, np.nan]], + ), + ( + ["C"], + "anti_right", + [[np.nan, 8, 2], [np.nan, 9, 4]], + ), + ( + ["C"], + "anti_full", + [[1, 5, np.nan], [2, 6, np.nan], [np.nan, 8, 2], [np.nan, 9, 4]], + ), + ( + None, + "anti_left", + [[1, 5, np.nan], [2, 6, np.nan]], + ), + ( + None, + "anti_right", + [[np.nan, 8, 2], [np.nan, 9, 4]], + ), + ( + None, + "anti_full", + [[1, 5, np.nan], [2, 6, np.nan], [np.nan, 8, 2], [np.nan, 9, 4]], + ), + ], + ) + def test_basic_anti_on(self, on, how, data): + # basic test containing NaNs with on param + left = DataFrame({"A": [1, 2, 3], "C": [5, 6, 7]}, index=["a", "b", "c"]) + right = DataFrame({"B": [1, 2, 4], "C": [7, 8, 9]}, index=["a", "b", "d"]) + expected = DataFrame(data, columns=["A", "C", "B"]) + result = merge(left, right, how=how, on=on) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "expected, how, left_on, right_on", + [ + ( + DataFrame({"A": 3, "C_x": 7, "B": np.nan, "C_y": np.nan}, index=[0]), + "anti_left", + ["A"], + ["B"], + ), + ( + DataFrame({"A": np.nan, "C_x": np.nan, "B": 4, "C_y": 9}, index=[0]), + "anti_right", + ["A"], + ["B"], + ), + ( + DataFrame( + { + "A": [3, np.nan], + "C_x": [7, np.nan], + "B": [np.nan, 4], + "C_y": [np.nan, 9], + }, + ), + "anti_full", + ["A"], + ["B"], + ), + ], + ) + def test_basic_anti_lefton_righton(self, expected, how, left_on, right_on): + # basic test containing NaNs with left_on / right_on params + left = DataFrame({"A": [1, 2, 3], "C": [5, 6, 7]}, index=["a", "b", "c"]) + right = DataFrame({"B": [1, 2, 4], "C": [7, 8, 9]}, index=["a", "b", "d"]) + result = merge( + left, + right, + how=how, + left_on=left_on, + right_on=right_on, + ) + 
tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "expected, how", + [ + ( + DataFrame( + { + "A": [3], + "B_x": [6], + "C_x": [7], + "B_y": [np.nan], + "C_y": [np.nan], + "D": ["c"], + }, + index=[np.nan], + ), + "anti_left", + ), + ( + DataFrame( + { + "A": [np.nan], + "B_x": [np.nan], + "C_x": [np.nan], + "B_y": [9], + "C_y": [7], + "D": ["d"], + }, + index=[2], + ), + "anti_right", + ), + ( + DataFrame( + { + "A": [3, np.nan], + "B_x": [6, np.nan], + "C_x": [7, np.nan], + "B_y": [np.nan, 9], + "C_y": [np.nan, 7], + "D": ["c", "d"], + }, + index=[np.nan, 2], + ), + "anti_full", + ), + ], + ) + def test_anti_index_with_col(self, expected, how): + # basic test containing NaNs with left_index and right_on params + left = DataFrame( + {"A": [1, 2, 3], "B": [4, 5, 6], "C": [5, 6, 7]}, index=["a", "b", "c"] + ) + right = DataFrame({"B": [5, 5, 9], "C": [4, 6, 7], "D": ["a", "b", "d"]}) + result = merge(left, right, how=how, left_index=True, right_on=["D"]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "expected, how, on", + [ + ( + DataFrame( + {"A": [1, 3], "B": [4, 6], "C": [5, 7], "D": [np.nan, np.nan]} + ).astype({"D": object}), + "anti_left", + None, + ), + ( + DataFrame( + {"A": [np.nan, np.nan], "B": [5, 9], "C": [4, 7], "D": ["a", "d"]} + ), + "anti_right", + None, + ), + ( + DataFrame( + { + "A": [1, 3, np.nan, np.nan], + "B": [4, 6, 5, 9], + "C": [5, 7, 4, 7], + "D": [np.nan, np.nan, "a", "d"], + } + ), + "anti_full", + None, + ), + ( + DataFrame( + {"A": [1, 3], "B": [4, 6], "C": [5, 7], "D": [np.nan, np.nan]} + ).astype({"D": object}), + "anti_left", + ["B", "C"], + ), + ( + DataFrame( + {"A": [np.nan, np.nan], "B": [5, 9], "C": [4, 7], "D": ["a", "d"]} + ), + "anti_right", + ["B", "C"], + ), + ( + DataFrame( + { + "A": [1, 3, np.nan, np.nan], + "B": [4, 6, 5, 9], + "C": [5, 7, 4, 7], + "D": [np.nan, np.nan, "a", "d"], + } + ), + "anti_full", + ["B", "C"], + ), + ], + ) + def test_anti_multicol(self, 
expected, how, on): + # test with multicol with and w/o on param + right = DataFrame({"B": [5, 5, 9], "C": [4, 6, 7], "D": ["a", "b", "d"]}) + left = DataFrame( + {"A": [1, 2, 3], "B": [4, 5, 6], "C": [5, 6, 7]}, index=["a", "b", "c"] + ) + result = merge(left, right, how=how, on=on) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "expected, how, on, left_on, right_on", + [ + ( + DataFrame({"A": np.nan, "B": "c", "C": np.nan}, index=[0]), + "anti_right", + None, + None, + None, + ), + ( + DataFrame({"A": np.nan, "B": 3, "C": np.nan}, index=[0]).astype( + {"B": object} + ), + "anti_left", + None, + None, + None, + ), + ( + DataFrame({"A": np.nan, "B": "c", "C": np.nan}, index=[0]), + "anti_right", + ["B"], + None, + None, + ), + ( + DataFrame({"A": np.nan, "B": 3, "C": np.nan}, index=[0]).astype( + {"B": object} + ), + "anti_left", + ["B"], + None, + None, + ), + ( + DataFrame( + {"A": [2.0], "B_x": [2], "C": [np.nan], "B_y": [np.nan]} + ).astype({"B_x": object, "B_y": object}), + "anti_left", + None, + ["A"], + ["C"], + ), + ( + DataFrame( + { + "A": [np.nan, np.nan], + "B_x": [np.nan, np.nan], + "C": [1.0, 3.0], + "B_y": ["a", 2], + } + ).astype({"B_x": object}), + "anti_right", + None, + ["A"], + ["C"], + ), + ], + ) + def test_anti_with_nan(self, expected, how, on, left_on, right_on): + # basic anti_joins with mixed dtypes + left = DataFrame({"A": [np.nan, 2, np.nan], "B": ["a", 2, 3]}) + right = DataFrame({"C": [1, 3, np.nan], "B": ["a", 2, "c"]}) + result = merge(left, right, on=on, how=how, left_on=left_on, right_on=right_on) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "expected, how, left_on, right_on", + [ + ( + DataFrame({"A": [np.nan, pd.NA], "B": ["a", 3], "C": [np.nan, np.nan]}), + "anti_left", + "B", + "B", + ), + ( + DataFrame( + {"A": [np.nan, np.nan], "B": [pd.NA, "c"], "C": [1, np.nan]} + ).astype({"A": object}), + "anti_right", + "B", + "B", + ), + ( + DataFrame( + { + "A": [np.nan, pd.NA, 
np.nan, np.nan], + "B": ["a", 3, pd.NA, "c"], + "C": [np.nan, np.nan, 1, np.nan], + } + ), + "anti_full", + "B", + "B", + ), + ( + DataFrame( + { + "A": [2, pd.NA], + "B_x": [2, 3], + "C": [np.nan, np.nan], + "B_y": [np.nan, np.nan], + } + ).astype({"B_x": object, "B_y": object}), + "anti_left", + "A", + "C", + ), + ( + DataFrame( + { + "A": [np.nan, np.nan], + "B_x": [np.nan, np.nan], + "C": [1.0, 3], + "B_y": [pd.NA, 2], + } + ).astype( + { + "A": object, + "B_x": object, + } + ), + "anti_right", + "A", + "C", + ), + ( + DataFrame( + { + "A": [2, pd.NA, np.nan, np.nan], + "B_x": [2, 3, np.nan, np.nan], + "C": [np.nan, np.nan, 1, 3], + "B_y": [np.nan, np.nan, pd.NA, 2], + } + ).astype({"B_x": object}), + "anti_full", + "A", + "C", + ), + ], + ) + def test_anti_with_nan_and_NA(self, expected, how, left_on, right_on): + # test to check np.nan isn't matched with pd.NA + left = DataFrame({"A": [np.nan, 2, pd.NA], "B": ["a", 2, 3]}) + right = DataFrame({"C": [1, 3, np.nan], "B": [pd.NA, 2, "c"]}) + result = merge(left, right, how=how, left_on=left_on, right_on=right_on) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "how, expected", + [ + ( + "anti_left", + DataFrame( + {"vals_x": [20, 17], "vals_y": [np.nan] * 2}, + index=pd.date_range("1/2/2010", periods=2, freq="2d"), + ), + ), + ( + "anti_right", + DataFrame( + {"vals_x": [np.nan] * 2, "vals_y": [17, 21]}, + index=pd.date_range("1/7/2010", periods=2, freq="2d"), + ), + ), + ( + "anti_full", + DataFrame( + { + "vals_x": [20, 17, np.nan, np.nan], + "vals_y": [np.nan, np.nan, 17, 21], + }, + index=pd.date_range("1/7/2010", periods=2, freq="2d").union( + pd.date_range("1/2/2010", periods=2, freq="2d") + ), + ), + ), + ], + ) + def test_anti_datetime(self, how, expected): + left = DataFrame( + {"vals": [10, 20, 15, 17, 21]}, + index=pd.date_range("1/1/2010", periods=5, freq="D"), + ) + right = DataFrame( + {"vals": [10, 20, 15, 17, 21]}, + index=pd.date_range("1/1/2010", periods=5, freq="2D"), 
+ ) + result = merge(left, right, left_index=True, right_index=True, how=how) + tm.assert_frame_equal(result, expected) + + def test_anti_datetime_tz(self): + expected = DataFrame( + { + "Date": pd.date_range( + "10-20-2021", periods=2, freq="6D", tz="Asia/Kolkata" + ), + "a_x": [3, 4], + "a_y": [np.nan, np.nan], + } + ) + left = DataFrame( + { + "Date": pd.date_range( + "10-02-2021", periods=5, freq="6D", tz="Asia/Kolkata" + ), + "a": range(5), + } + ) + right = DataFrame( + { + "Date": pd.date_range( + "10-02-2021", periods=5, freq="3D", tz="Asia/Kolkata" + ), + "a": range(5), + } + ) + result = merge(left, right, how="anti_left", on="Date") + tm.assert_frame_equal(result, expected) + + def test_anti_categorical(self): + left = DataFrame({"A": list("abca"), "B": list("bccd")}, dtype="category") + right = DataFrame({"A": list("dad"), "C": list("gap")}, dtype="category") + expected = DataFrame( + { + "A": ["b", "c"], + "B": Categorical(["c", "c"], categories=list("bcd")), + "C": Categorical([np.nan, np.nan], categories=list("agp")), + } + ) + result = merge(left, right, how="anti_left") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", ["Int64", "Int32", "UInt32", "UInt64", "Float32", "Float64"] + ) + def test_anti_EA_dtypes(self, dtype): + left = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, dtype=dtype) + right = DataFrame({"A": [1, 4, 5], "C": [7, 6, 8]}, dtype=dtype) + result = merge(left, right, how="anti_right") + expected = DataFrame( + {"A": [4, 5], "B": [pd.NA, pd.NA], "C": [6, 8]}, dtype=dtype + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", ["Int64", "Int32", "UInt32", "UInt64", "Float32", "Float64"] + ) + def test_anti_EA_dtypes_with_multicol(self, dtype): + left = DataFrame( + {"A": [1, 2, 3], "B": [4, 5, 6], "C": [5, 6, 7]}, + index=["a", "b", "c"], + dtype=dtype, + ) + right = DataFrame({"B": [5, 5, 9], "C": [4, 6, 7], "D": [1, 0, 0]}, dtype=dtype) + expected = DataFrame( + 
columns=list("ABCD"), data=[[1, 4, 5, pd.NA], [3, 6, 7, pd.NA]], dtype=dtype + ) + result = merge(left, right, how="anti_left") + tm.assert_frame_equal(result, expected)