From fba0bb5ae53d618c2c511889d26f2b99e600ae28 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sun, 15 Aug 2021 23:38:42 +0530 Subject: [PATCH 01/31] ENH:included anti join functionality --- pandas/core/reshape/merge.py | 49 +++++++++ pandas/tests/reshape/merge/test_merge.py | 126 +++++++++++++++++++++++ 2 files changed, 175 insertions(+) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index a3baf8ade9c2e..b3e616188edb7 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -693,6 +693,9 @@ def __init__( self.left_on = self.right_on = [cross_col] self._cross = cross_col + if self.how in ["anti_left", "anti_right", "anti_full"]: + self._anti_join_update() + # note this function has side effects ( self.left_join_keys, @@ -744,6 +747,52 @@ def get_result(self) -> DataFrame: return result.__finalize__(self, method="merge") + def _anti_join_update(self): + if self.left_index and self.right_index: + if self.how == "anti_right": + join_index = set(self.right.index).difference(set(self.left.index)) + self.how = "right" + elif self.how == "anti_left": + join_index = set(self.left.index).difference(set(self.right.index)) + self.how = "left" + else: + join_index = set(self.left.index.union(self.right.index)) - set( + self.left.index.intersection(self.right.index) + ) + self.how = "outer" + self.left = self.left[self.left.index.isin(join_index)] + self.right = self.right[self.right.index.isin(join_index)] + else: + if self.on is not None: + left_on = right_on = self.on + else: + left_on = self.left_on + right_on = self.right_on + if self.how == "anti_right": + join_index = set(self.right[right_on].values.flatten()).difference( + set(self.left[left_on].values.flatten()) + ) + self.how = "right" + elif self.how == "anti_left": + join_index = set(self.left[left_on].values.flatten()).difference( + set(self.right[right_on].values.flatten()) + ) + self.how = "left" + else: + join_index = set(self.left[left_on].values.flatten()).union( + self.right[right_on].values.flatten() + ) - set(self.left[left_on].values.flatten()).intersection( + self.right[right_on].values.flatten() + ) + self.how = "outer" + self.left = self.left[self.left[left_on].isin(join_index).values.flatten()] + self.right = self.right[ + self.right[right_on].isin(join_index).values.flatten() + ] + + # sanity check to ensure correct `how` + assert self.how in ["left", "right", "inner", "outer"] + def _maybe_drop_cross_column( self, result: DataFrame, cross_col: str | None ) -> None: diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index cd07b3814d023..f3f0b02f04dd8 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2487,3 +2487,129 @@ def test_mergeerror_on_left_index_mismatched_dtypes(): df_2 = DataFrame(data=["X"], columns=["C"], index=[999]) with pytest.raises(MergeError, match="Can only pass argument"): merge(df_1, df_2, on=["C"], left_index=True) + + +@pytest.mark.parametrize( + "expected, how, on, left_on, right_on, left_index, right_index", + [ + ( + DataFrame({"A": 3, "C_x": 7, "B": np.nan, "C_y": np.nan}, index=["c"]), + "anti_left", + None, + None, + None, + True, + True, + ), + ( + DataFrame({"A": np.nan, "C_x": np.nan, "B": 4, "C_y": 9}, index=["d"]), + "anti_right", + None, + None, + None, + True, + True, + ), + ( + DataFrame( + { + "A": [3, np.nan], + "C_x": [7, np.nan], + "B": [np.nan, 4], + "C_y": [np.nan, 9], + }, + index=["c", "d"], + ), + "anti_full", + None, + None, + None, + True, + True, + ), + ( + DataFrame({"A": [1, 2], "C": [5, 6], "B": [np.nan, np.nan]}, index=[0, 1]), + "anti_left", + ["C"], + None, + None, + False, + False, + ), + ( + DataFrame({"A": [np.nan, np.nan], "B": [2, 4], "C": [8, 9]}, index=[0, 1]), + "anti_right", + ["C"], + None, + None, + False, + False, + ), + ( + DataFrame( + { + "A": [1, 2, np.nan, np.nan], + "C": [5, 6, 8, 9], + "B": [np.nan, np.nan, 2, 4], + }, + index=[0, 1, 2, 3], + ), + "anti_full", + ["C"], + None, + None, + False, + False, + ), + ( + DataFrame({"A": 3, "C_x": 7, "B": np.nan, "C_y": np.nan}, index=[0]), + "anti_left", + None, + ["A"], + ["B"], + False, + False, + ), + ( + DataFrame({"A": np.nan, "C_x": np.nan, "B": 4, "C_y": 9}, index=[0]), + "anti_right", + None, + ["A"], + ["B"], + False, + False, + ), + ( + DataFrame( + { + "A": [3, np.nan], + "C_x": [7, np.nan], + "B": [np.nan, 4], + "C_y": [np.nan, 9], + }, + index=[0, 1], + ), + "anti_full", + None, + ["A"], + ["B"], + False, + False, + ), + ], +) +def test_anti_join(expected, how, on, left_on, right_on, left_index, right_index): + # GH#42916 + df_l = DataFrame({"A": [1, 2, 3], "C": [5, 6, 7]}, index=["a", "b", "c"]) + df_r = DataFrame({"B": [1, 2, 4], "C": [7, 8, 9]}, index=["a", "b", "d"]) + result = merge( + df_l, + df_r, + how=how, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + ) + tm.assert_frame_equal(result, expected) From 53fd41d4a5d92b06480b372de86b4f1a60657300 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Tue, 17 Aug 2021 00:25:38 +0530 Subject: [PATCH 02/31] included multicol join --- pandas/core/reshape/merge.py | 75 +++-- pandas/tests/reshape/merge/test_merge.py | 395 ++++++++++++++++------- 2 files changed, 331 insertions(+), 139 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index b3e616188edb7..3f6809778ab52 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -748,6 +748,37 @@ def get_result(self) -> DataFrame: return result.__finalize__(self, method="merge") def _anti_join_update(self): + def isin_nd(a, b, invert=False): + # a,b are the nD input arrays to give us + # "isin-like" functionality across them + a = np.ascontiguousarray(a) + b = np.ascontiguousarray(b) + void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1])) + A, B = a.view(void_dt).ravel(), b.view(void_dt).ravel() + return np.isin(A, B, invert=invert) + + def _multi_columns(arr_l, arr_r, how): + nrows, ncols = arr_l.shape + if how == "anti_right": + join_index_l = join_index_r = isin_nd(arr_r, arr_l, invert=True) + _how = "right" + elif how == "anti_left": + join_index_l = join_index_r = isin_nd(arr_l, arr_r, invert=True) + _how = "left" + else: + try: + _union = np.unique(np.vstack((arr_l, arr_r)), axis=0) + except TypeError: + _union = np.vstack((arr_l, arr_r)) + _intersect = np.array( + list({tuple(x) for x in arr_l} & {tuple(x) for x in arr_r}) + ) + _union_index = _union[isin_nd(_union, _intersect, invert=True)] + join_index_l = isin_nd(arr_l, _union_index) + join_index_r = isin_nd(arr_r, _union_index) + _how = "outer" + return (join_index_l, join_index_r, _how) + if self.left_index and self.right_index: if self.how == "anti_right": join_index = set(self.right.index).difference(set(self.left.index)) @@ -768,27 +799,33 @@ def _anti_join_update(self): else: left_on = self.left_on right_on = self.right_on - if self.how == "anti_right": - join_index = set(self.right[right_on].values.flatten()).difference( - set(self.left[left_on].values.flatten()) + if is_list_like(left_on) and len(left_on) > 1: + join_index_l, join_index_r, self.how = _multi_columns( + self.left[left_on].values, self.right[right_on].values, self.how ) - self.how = "right" - elif self.how == "anti_left": - join_index = set(self.left[left_on].values.flatten()).difference( - set(self.right[right_on].values.flatten()) - ) - self.how = "left" + self.left = self.left[join_index_l] + self.right = self.right[join_index_r] + else: - join_index = set(self.left[left_on].values.flatten()).union( - self.right[right_on].values.flatten() - ) - set(self.left[left_on].values.flatten()).intersection( - self.right[right_on].values.flatten() - ) - self.how = "outer" - self.left = self.left[self.left[left_on].isin(join_index).values.flatten()] - self.right = self.right[ - self.right[right_on].isin(join_index).values.flatten() - ] + if self.how == "anti_right": + join_index = set(self.right[right_on].values.flatten()).difference( + set(self.left[left_on].values.flatten()) + ) + self.how = "right" + elif self.how == "anti_left": + join_index = set(self.left[left_on].values.flatten()).difference( + set(self.right[right_on].values.flatten()) + ) + self.how = "left" + else: + join_index = set(self.left[left_on].values.flatten()).union( + self.right[right_on].values.flatten() + ) - set(self.left[left_on].values.flatten()).intersection( + self.right[right_on].values.flatten() + ) + self.how = "outer" + self.left = self.left[self.left[left_on].isin(join_index).values] + self.right = self.right[self.right[right_on].isin(join_index).values] # sanity check to ensure correct `how` assert self.how in ["left", "right", "inner", "outer"] diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f3f0b02f04dd8..94c0d880a9707 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2489,127 +2489,282 @@ def test_mergeerror_on_left_index_mismatched_dtypes(): merge(df_1, df_2, on=["C"], left_index=True) -@pytest.mark.parametrize( - "expected, how, on, left_on, right_on, left_index, right_index", - [ - ( - DataFrame({"A": 3, "C_x": 7, "B": np.nan, "C_y": np.nan}, index=["c"]), - "anti_left", - None, - None, - None, - True, - True, - ), - ( - DataFrame({"A": np.nan, "C_x": np.nan, "B": 4, "C_y": 9}, index=["d"]), - "anti_right", - None, - None, - None, - True, - True, - ), - ( - DataFrame( - { - "A": [3, np.nan], - "C_x": [7, np.nan], - "B": [np.nan, 4], - "C_y": [np.nan, 9], - }, - index=["c", "d"], +class Test_AntiJoin: + @pytest.mark.parametrize( + "expected, how, on, left_on, right_on, left_index, right_index", + [ + ( + DataFrame({"A": 3, "C_x": 7, "B": np.nan, "C_y": np.nan}, index=["c"]), + "anti_left", + None, + None, + None, + True, + True, ), - "anti_full", - None, - None, - None, - True, - True, - ), - ( - DataFrame({"A": [1, 2], "C": [5, 6], "B": [np.nan, np.nan]}, index=[0, 1]), - "anti_left", - ["C"], - None, - None, - False, - False, - ), - ( - DataFrame({"A": [np.nan, np.nan], "B": [2, 4], "C": [8, 9]}, index=[0, 1]), - "anti_right", - ["C"], - None, - None, - False, - False, - ), - ( - DataFrame( - { - "A": [1, 2, np.nan, np.nan], - "C": [5, 6, 8, 9], - "B": [np.nan, np.nan, 2, 4], - }, - index=[0, 1, 2, 3], + ( + DataFrame({"A": np.nan, "C_x": np.nan, "B": 4, "C_y": 9}, index=["d"]), + "anti_right", + None, + None, + None, + True, + True, ), - "anti_full", - ["C"], - None, - None, - False, - False, - ), - ( - DataFrame({"A": 3, "C_x": 7, "B": np.nan, "C_y": np.nan}, index=[0]), - "anti_left", - None, - ["A"], - ["B"], - False, - False, - ), - ( - DataFrame({"A": np.nan, "C_x": np.nan, "B": 4, "C_y": 9}, index=[0]), - "anti_right", - None, - ["A"], - ["B"], - False, - False, - ), - ( - DataFrame( - { - "A": [3, np.nan], - "C_x": [7, np.nan], - "B": [np.nan, 4], - "C_y": [np.nan, 9], - }, - index=[0, 1], + ( + DataFrame( + { + "A": [3, np.nan], + "C_x": [7, np.nan], + "B": [np.nan, 4], + "C_y": [np.nan, 9], + }, + index=["c", "d"], + ), + "anti_full", + None, + None, + None, + True, + True, ), - "anti_full", - None, - ["A"], - ["B"], - False, - False, - ), - ], -) -def test_anti_join(expected, how, on, left_on, right_on, left_index, right_index): - # GH#42916 - df_l = DataFrame({"A": [1, 2, 3], "C": [5, 6, 7]}, index=["a", "b", "c"]) - df_r = DataFrame({"B": [1, 2, 4], "C": [7, 8, 9]}, index=["a", "b", "d"]) - result = merge( - df_l, - df_r, - how=how, - on=on, - left_on=left_on, - right_on=right_on, - left_index=left_index, - right_index=right_index, + ( + DataFrame( + {"A": [1, 2], "C": [5, 6], "B": [np.nan, np.nan]}, index=[0, 1] + ), + "anti_left", + ["C"], + None, + None, + False, + False, + ), + ( + DataFrame( + {"A": [np.nan, np.nan], "B": [2, 4], "C": [8, 9]}, index=[0, 1] + ), + "anti_right", + ["C"], + None, + None, + False, + False, + ), + ( + DataFrame( + { + "A": [1, 2, np.nan, np.nan], + "C": [5, 6, 8, 9], + "B": [np.nan, np.nan, 2, 4], + }, + index=[0, 1, 2, 3], + ), + "anti_full", + ["C"], + None, + None, + False, + False, + ), + ( + DataFrame({"A": 3, "C_x": 7, "B": np.nan, "C_y": np.nan}, index=[0]), + "anti_left", + None, + ["A"], + ["B"], + False, + False, + ), + ( + DataFrame({"A": np.nan, "C_x": np.nan, "B": 4, "C_y": 9}, index=[0]), + "anti_right", + None, + ["A"], + ["B"], + False, + False, + ), + ( + DataFrame( + { + "A": [3, np.nan], + "C_x": [7, np.nan], + "B": [np.nan, 4], + "C_y": [np.nan, 9], + }, + index=[0, 1], + ), + "anti_full", + None, + ["A"], + ["B"], + False, + False, + ), + ( + DataFrame( + { + "A": [1, 2], + "C": [5, 6], + "B": [np.nan, np.nan], + }, + index=[0, 1], + ), + "anti_left", + None, + None, + None, + False, + False, + ), + ( + DataFrame( + { + "A": [np.nan, np.nan], + "B": [2, 4], + "C": [8, 9], + }, + index=[0, 1], + ), + "anti_right", + None, + None, + None, + False, + False, + ), + ( + DataFrame( + { + "A": [1, 2, np.nan, np.nan], + "C": [5, 6, 8, 9], + "B": [np.nan, np.nan, 2, 4], + }, + index=[0, 1, 2, 3], + ), + "anti_full", + None, + None, + None, + False, + False, + ), + ], ) - tm.assert_frame_equal(result, expected) + def test_anti_join( + self, expected, how, on, left_on, right_on, left_index, right_index + ): + # GH#42916 + df_l = DataFrame({"A": [1, 2, 3], "C": [5, 6, 7]}, index=["a", "b", "c"]) + df_r = DataFrame({"B": [1, 2, 4], "C": [7, 8, 9]}, index=["a", "b", "d"]) + result = merge( + df_l, + df_r, + how=how, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "expected, how, on, left_on, right_on, left_index, right_index", + [ + ( + DataFrame( + {"A": [1, 3], "B": [4, 6], "C": [5, 7], "D": [np.nan, np.nan]} + ).astype({"D": object}), + "anti_left", + None, + None, + None, + False, + False, + ), + ( + DataFrame( + {"A": [np.nan, np.nan], "B": [5, 9], "C": [4, 7], "D": ["a", "d"]} + ), + "anti_right", + None, + None, + None, + False, + False, + ), + ( + DataFrame( + { + "A": [1, 3, np.nan, np.nan], + "B": [4, 6, 5, 9], + "C": [5, 7, 4, 7], + "D": [np.nan, np.nan, "a", "d"], + } + ), + "anti_full", + None, + None, + None, + False, + False, + ), + ( + DataFrame( + {"A": [1, 3], "B": [4, 6], "C": [5, 7], "D": [np.nan, np.nan]} + ).astype({"D": object}), + "anti_left", + ["B", "C"], + None, + None, + False, + False, + ), + ( + DataFrame( + {"A": [np.nan, np.nan], "B": [5, 9], "C": [4, 7], "D": ["a", "d"]} + ), + "anti_right", + ["B", "C"], + None, + None, + False, + False, + ), + ( + DataFrame( + { + "A": [1, 3, np.nan, np.nan], + "B": [4, 6, 5, 9], + "C": [5, 7, 4, 7], + "D": [np.nan, np.nan, "a", "d"], + } + ), + "anti_full", + ["B", "C"], + None, + None, + False, + False, + ), + ], + ) + def test_anti_join_multicol( + self, expected, how, on, left_on, right_on, left_index, right_index + ): + df_2 = DataFrame({"B": [5, 5, 9], "C": [4, 6, 7], "D": ["a", "b", "d"]}) + df_1 = DataFrame( + {"A": [1, 2, 3], "B": [4, 5, 6], "C": [5, 6, 7]}, index=["a", "b", "c"] + ) + result = merge( + df_1, + df_2, + how=how, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + ) + tm.assert_frame_equal(result, expected) From 448373ba69044fd427177121dc8861abe74072d5 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Tue, 17 Aug 2021 16:24:25 +0530 Subject: [PATCH 03/31] handling index and col --- pandas/core/reshape/merge.py | 65 ++++++++++-------------- pandas/tests/reshape/merge/test_merge.py | 23 ++++++++- 2 files changed, 49 insertions(+), 39 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 3f6809778ab52..61674d5bb7dc0 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -758,7 +758,13 @@ def isin_nd(a, b, invert=False): return np.isin(A, B, invert=invert) def _multi_columns(arr_l, arr_r, how): - nrows, ncols = arr_l.shape + if len(arr_l.shape) == 1: + arr_l = np.atleast_2d(arr_l).reshape(-1, np.atleast_2d(arr_l).shape[0]) + if len(arr_r.shape) == 1: + arr_r = np.atleast_2d(arr_r).reshape(-1, np.atleast_2d(arr_r).shape[0]) + if arr_l.dtype.kind == "O" or arr_r.dtype.kind == "O": + arr_l = arr_l.astype(str) + arr_r = arr_r.astype(str) if how == "anti_right": join_index_l = join_index_r = isin_nd(arr_r, arr_l, invert=True) _how = "right" @@ -780,52 +786,37 @@ def _multi_columns(arr_l, arr_r, how): return (join_index_l, join_index_r, _how) if self.left_index and self.right_index: - if self.how == "anti_right": - join_index = set(self.right.index).difference(set(self.left.index)) - self.how = "right" - elif self.how == "anti_left": - join_index = set(self.left.index).difference(set(self.right.index)) - self.how = "left" - else: - join_index = set(self.left.index.union(self.right.index)) - set( - self.left.index.intersection(self.right.index) - ) - self.how = "outer" - self.left = self.left[self.left.index.isin(join_index)] - self.right = self.right[self.right.index.isin(join_index)] - else: + join_index_l, join_index_r, self.how = _multi_columns( + self.left.index.values, self.right.index.values, self.how + ) + self.left = self.left[join_index_l] + self.right = self.right[join_index_r] + elif self.on is not None or ( + None not in self.left_on and None not in self.right_on + ): if self.on is not None: left_on = right_on = self.on else: left_on = self.left_on right_on = self.right_on - if is_list_like(left_on) and len(left_on) > 1: + if is_list_like(left_on): join_index_l, join_index_r, self.how = _multi_columns( self.left[left_on].values, self.right[right_on].values, self.how ) self.left = self.left[join_index_l] self.right = self.right[join_index_r] - - else: - if self.how == "anti_right": - join_index = set(self.right[right_on].values.flatten()).difference( - set(self.left[left_on].values.flatten()) - ) - self.how = "right" - elif self.how == "anti_left": - join_index = set(self.left[left_on].values.flatten()).difference( - set(self.right[right_on].values.flatten()) - ) - self.how = "left" - else: - join_index = set(self.left[left_on].values.flatten()).union( - self.right[right_on].values.flatten() - ) - set(self.left[left_on].values.flatten()).intersection( - self.right[right_on].values.flatten() - ) - self.how = "outer" - self.left = self.left[self.left[left_on].isin(join_index).values] - self.right = self.right[self.right[right_on].isin(join_index).values] + elif self.left_index and self.right_on is not None: + join_index_l, join_index_r, self.how = _multi_columns( + self.left.index.values, self.right[self.right_on].values, self.how + ) + self.left = self.left[join_index_l] + self.right = self.right[join_index_r] + elif self.right_index and self.left_on is not None: + join_index_l, join_index_r, self.how = _multi_columns( + self.left[self.left_on].values, self.right.index.values, self.how + ) + self.left = self.left[join_index_l] + self.right = self.right[join_index_r] # sanity check to ensure correct `how` assert self.how in ["left", "right", "inner", "outer"] diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 94c0d880a9707..5f302d0c9816b 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2541,7 +2541,7 @@ class Test_AntiJoin: ), ( DataFrame( - {"A": [np.nan, np.nan], "B": [2, 4], "C": [8, 9]}, index=[0, 1] + {"A": [np.nan, np.nan], "C": [8, 9], "B": [2, 4]}, index=[0, 1] ), "anti_right", ["C"], @@ -2621,8 +2621,8 @@ class Test_AntiJoin: DataFrame( { "A": [np.nan, np.nan], - "B": [2, 4], "C": [8, 9], + "B": [2, 4], }, index=[0, 1], ), @@ -2748,6 +2748,25 @@ def test_anti_join( False, False, ), + ( + DataFrame( + { + "A": [3], + "B_x": [6], + "C_x": [7], + "B_y": [np.nan], + "C_y": [np.nan], + "D": ["c"], + }, + index=[np.nan], + ), + "anti_left", + None, + None, + ["D"], + True, + False, + ), ], ) def test_anti_join_multicol( From 4dd802df3f004855ddf7f9485ad482e00cc2b40c Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Tue, 17 Aug 2021 17:39:19 +0530 Subject: [PATCH 04/31] added test on nan --- pandas/tests/reshape/merge/test_merge.py | 84 ++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 5f302d0c9816b..2cbdcb9ce4220 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2787,3 +2787,87 @@ def test_anti_join_multicol( right_index=right_index, ) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "expected, how, on, left_on, right_on", + [ + ( + DataFrame({"A": np.nan, "B": "c", "C": np.nan}, index=[0]), + "anti_right", + None, + None, + None, + ), + ( + DataFrame({"A": np.nan, "B": 3, "C": np.nan}, index=[0]).astype( + {"B": object} + ), + "anti_left", + None, + None, + None, + ), + ( + DataFrame( + {"A": [np.nan, np.nan], "B": [3.0, "c"], "C": [np.nan, np.nan]} + ), + "anti_full", + None, + None, + None, + ), + ( + DataFrame({"A": np.nan, "B": "c", "C": np.nan}, index=[0]), + "anti_right", + ["B"], + None, + None, + ), + ( + DataFrame({"A": np.nan, "B": 3, "C": np.nan}, index=[0]).astype( + {"B": object} + ), + "anti_left", + ["B"], + None, + None, + ), + ( + DataFrame( + {"A": [np.nan, np.nan], "B": [3.0, "c"], "C": [np.nan, np.nan]} + ), + "anti_full", + ["B"], + None, + None, + ), + ( + DataFrame( + {"A": [2.0], "B_x": [2], "C": [np.nan], "B_y": [np.nan]} + ).astype({"B_x": object, "B_y": object}), + "anti_left", + None, + ["A"], + ["C"], + ), + ( + DataFrame( + { + "A": [np.nan, np.nan], + "B_x": [np.nan, np.nan], + "C": [1.0, 3.0], + "B_y": ["a", 2], + } + ).astype({"B_x": object}), + "anti_right", + None, + ["A"], + ["C"], + ), + ], + ) + def test_nan_anti_join(self, expected, how, on, left_on, right_on): + df_1 = DataFrame({"A": [np.nan, 2, np.nan], "B": ["a", 2, 3]}) + df_2 = DataFrame({"C": [1, 3, np.nan], "B": ["a", 2, "c"]}) + result = merge(df_1, df_2, on=on, how=how, left_on=left_on, right_on=right_on) + tm.assert_frame_equal(result, expected) From 6e3d1a47e6e85115ca9d374c54e417baf1160f98 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Tue, 17 Aug 2021 19:16:27 +0530 Subject: [PATCH 05/31] removed tests cases with warning --- pandas/tests/reshape/merge/test_merge.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 2cbdcb9ce4220..026b9f360fe6a 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2807,15 +2807,6 @@ def test_anti_join_multicol( None, None, ), - ( - DataFrame( - {"A": [np.nan, np.nan], "B": [3.0, "c"], "C": [np.nan, np.nan]} - ), - "anti_full", - None, - None, - None, - ), ( DataFrame({"A": np.nan, "B": "c", "C": np.nan}, index=[0]), "anti_right", @@ -2832,15 +2823,6 @@ def test_anti_join_multicol( None, None, ), - ( - DataFrame( - {"A": [np.nan, np.nan], "B": [3.0, "c"], "C": [np.nan, np.nan]} - ), - "anti_full", - ["B"], - None, - None, - ), ( DataFrame( {"A": [2.0], "B_x": [2], "C": [np.nan], "B_y": [np.nan]} From 6427f0983fd464b01bcbdb152e8eedf62c55655a Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 18 Aug 2021 17:53:54 +0530 Subject: [PATCH 06/31] seperated antijoin tests to another file --- pandas/core/reshape/merge.py | 22 +- pandas/tests/reshape/merge/test_merge.py | 366 ------------------ pandas/tests/reshape/merge/test_merge_anti.py | 325 ++++++++++++++++ 3 files changed, 331 insertions(+), 382 deletions(-) create mode 100644 pandas/tests/reshape/merge/test_merge_anti.py diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 61674d5bb7dc0..b6539791319d5 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -772,10 +772,7 @@ def _multi_columns(arr_l, arr_r, how): join_index_l = join_index_r = isin_nd(arr_l, arr_r, invert=True) _how = "left" else: - try: - _union = np.unique(np.vstack((arr_l, arr_r)), axis=0) - except TypeError: - _union = np.vstack((arr_l, arr_r)) + _union = np.unique(np.vstack((arr_l, arr_r)), axis=0) _intersect = np.array( list({tuple(x) for x in arr_l} & {tuple(x) for x in arr_r}) ) @@ -789,8 +786,6 @@ def _multi_columns(arr_l, arr_r, how): join_index_l, join_index_r, self.how = _multi_columns( self.left.index.values, self.right.index.values, self.how ) - self.left = self.left[join_index_l] - self.right = self.right[join_index_r] elif self.on is not None or ( None not in self.left_on and None not in self.right_on ): @@ -799,24 +794,19 @@ def _multi_columns(arr_l, arr_r, how): else: left_on = self.left_on right_on = self.right_on - if is_list_like(left_on): - join_index_l, join_index_r, self.how = _multi_columns( - self.left[left_on].values, self.right[right_on].values, self.how - ) - self.left = self.left[join_index_l] - self.right = self.right[join_index_r] + join_index_l, join_index_r, self.how = _multi_columns( + self.left[left_on].values, self.right[right_on].values, self.how + ) elif self.left_index and self.right_on is not None: join_index_l, join_index_r, self.how = _multi_columns( self.left.index.values, self.right[self.right_on].values, self.how ) - self.left = self.left[join_index_l] - self.right = self.right[join_index_r] elif self.right_index and self.left_on is not None: join_index_l, join_index_r, self.how = _multi_columns( self.left[self.left_on].values, self.right.index.values, self.how ) - self.left = self.left[join_index_l] - self.right = self.right[join_index_r] + self.left = self.left[join_index_l] + self.right = self.right[join_index_r] # sanity check to ensure correct `how` assert self.how in ["left", "right", "inner", "outer"] diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 026b9f360fe6a..cd07b3814d023 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2487,369 +2487,3 @@ def test_mergeerror_on_left_index_mismatched_dtypes(): df_2 = DataFrame(data=["X"], columns=["C"], index=[999]) with pytest.raises(MergeError, match="Can only pass argument"): merge(df_1, df_2, on=["C"], left_index=True) - - -class Test_AntiJoin: - @pytest.mark.parametrize( - "expected, how, on, left_on, right_on, left_index, right_index", - [ - ( - DataFrame({"A": 3, "C_x": 7, "B": np.nan, "C_y": np.nan}, index=["c"]), - "anti_left", - None, - None, - None, - True, - True, - ), - ( - DataFrame({"A": np.nan, "C_x": np.nan, "B": 4, "C_y": 9}, index=["d"]), - "anti_right", - None, - None, - None, - True, - True, - ), - ( - DataFrame( - { - "A": [3, np.nan], - "C_x": [7, np.nan], - "B": [np.nan, 4], - "C_y": [np.nan, 9], - }, - index=["c", "d"], - ), - "anti_full", - None, - None, - None, - True, - True, - ), - ( - DataFrame( - {"A": [1, 2], "C": [5, 6], "B": [np.nan, np.nan]}, index=[0, 1] - ), - "anti_left", - ["C"], - None, - None, - False, - False, - ), - ( - DataFrame( - {"A": [np.nan, np.nan], "C": [8, 9], "B": [2, 4]}, index=[0, 1] - ), - "anti_right", - ["C"], - None, - None, - False, - False, - ), - ( - DataFrame( - { - "A": [1, 2, np.nan, np.nan], - "C": [5, 6, 8, 9], - "B": [np.nan, np.nan, 2, 4], - }, - index=[0, 1, 2, 3], - ), - "anti_full", - ["C"], - None, - None, - False, - False, - ), - ( - DataFrame({"A": 3, "C_x": 7, "B": np.nan, "C_y": np.nan}, index=[0]), - "anti_left", - None, - ["A"], - ["B"], - False, - False, - ), - ( - DataFrame({"A": np.nan, "C_x": np.nan, "B": 4, "C_y": 9}, index=[0]), - "anti_right", - None, - ["A"], - ["B"], - False, - False, - ), - ( - DataFrame( - { - "A": [3, np.nan], - "C_x": [7, np.nan], - "B": [np.nan, 4], - "C_y": [np.nan, 9], - }, - index=[0, 1], - ), - "anti_full", - None, - ["A"], - ["B"], - False, - False, - ), - ( - DataFrame( - { - "A": [1, 2], - "C": [5, 6], - "B": [np.nan, np.nan], - }, - index=[0, 1], - ), - "anti_left", - None, - None, - None, - False, - False, - ), - ( - DataFrame( - { - "A": [np.nan, np.nan], - "C": [8, 9], - "B": [2, 4], - }, - index=[0, 1], - ), - "anti_right", - None, - None, - None, - False, - False, - ), - ( - DataFrame( - { - "A": [1, 2, np.nan, np.nan], - "C": [5, 6, 8, 9], - "B": [np.nan, np.nan, 2, 4], - }, - index=[0, 1, 2, 3], - ), - "anti_full", - None, - None, - None, - False, - False, - ), - ], - ) - def test_anti_join( - self, expected, how, on, left_on, right_on, left_index, right_index - ): - # GH#42916 - df_l = DataFrame({"A": [1, 2, 3], "C": [5, 6, 7]}, index=["a", "b", "c"]) - df_r = DataFrame({"B": [1, 2, 4], "C": [7, 8, 9]}, index=["a", "b", "d"]) - result = merge( - df_l, - df_r, - how=how, - on=on, - left_on=left_on, - right_on=right_on, - left_index=left_index, - right_index=right_index, - ) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "expected, how, on, left_on, right_on, left_index, right_index", - [ - ( - DataFrame( - {"A": [1, 3], "B": [4, 6], "C": [5, 7], "D": [np.nan, np.nan]} - ).astype({"D": object}), - "anti_left", - None, - None, - None, - False, - False, - ), - ( - DataFrame( - {"A": [np.nan, np.nan], "B": [5, 9], "C": [4, 7], "D": ["a", "d"]} - ), - "anti_right", - None, - None, - None, - False, - False, - ), - ( - DataFrame( - { - "A": [1, 3, np.nan, np.nan], - "B": [4, 6, 5, 9], - "C": [5, 7, 4, 7], - "D": [np.nan, np.nan, "a", "d"], - } - ), - "anti_full", - None, - None, - None, - False, - False, - ), - ( - DataFrame( - {"A": [1, 3], "B": [4, 6], "C": [5, 7], "D": [np.nan, np.nan]} - ).astype({"D": object}), - "anti_left", - ["B", "C"], - None, - None, - False, - False, - ), - ( - DataFrame( - {"A": [np.nan, np.nan], "B": [5, 9], "C": [4, 7], "D": ["a", "d"]} - ), - "anti_right", - ["B", "C"], - None, - None, - False, - False, - ), - ( - DataFrame( - { - "A": [1, 3, np.nan, np.nan], - "B": [4, 6, 5, 9], - "C": [5, 7, 4, 7], - "D": [np.nan, np.nan, "a", "d"], - } - ), - "anti_full", - ["B", "C"], - None, - None, - False, - False, - ), - ( - DataFrame( - { - "A": [3], - "B_x": [6], - "C_x": [7], - "B_y": [np.nan], - "C_y": [np.nan], - "D": ["c"], - }, - index=[np.nan], - ), - "anti_left", - None, - None, - ["D"], - True, - False, - ), - ], - ) - def test_anti_join_multicol( - self, expected, how, on, left_on, right_on, left_index, right_index - ): - df_2 = DataFrame({"B": [5, 5, 9], "C": [4, 6, 7], "D": ["a", "b", "d"]}) - df_1 = DataFrame( - {"A": [1, 2, 3], "B": [4, 5, 6], "C": [5, 6, 7]}, index=["a", "b", "c"] - ) - result = merge( - df_1, - df_2, - how=how, - on=on, - left_on=left_on, - right_on=right_on, - left_index=left_index, - right_index=right_index, - ) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "expected, how, on, left_on, right_on", - [ - ( - DataFrame({"A": np.nan, "B": "c", "C": np.nan}, index=[0]), - "anti_right", - None, - None, - None, - ), - ( - DataFrame({"A": np.nan, "B": 3, "C": np.nan}, index=[0]).astype( - {"B": object} - ), - "anti_left", - None, - None, - None, - ), - ( - DataFrame({"A": np.nan, "B": "c", "C": np.nan}, index=[0]), - "anti_right", - ["B"], - None, - None, - ), - ( - DataFrame({"A": np.nan, "B": 3, "C": np.nan}, index=[0]).astype( - {"B": object} - ), - "anti_left", - ["B"], - None, - None, - ), - ( - DataFrame( - {"A": [2.0], "B_x": [2], "C": [np.nan], "B_y": [np.nan]} - ).astype({"B_x": object, "B_y": object}), - "anti_left", - None, - ["A"], - ["C"], - ), - ( - DataFrame( - { - "A": [np.nan, np.nan], - "B_x": [np.nan, np.nan], - "C": [1.0, 3.0], - "B_y": ["a", 2], - } - ).astype({"B_x": object}), - "anti_right", - None, - ["A"], - ["C"], - ), - ], - ) - def test_nan_anti_join(self, expected, how, on, left_on, right_on): - df_1 = DataFrame({"A": [np.nan, 2, np.nan], "B": ["a", 2, 3]}) - df_2 = DataFrame({"C": [1, 3, np.nan], "B": ["a", 2, "c"]}) - result = merge(df_1, df_2, on=on, how=how, left_on=left_on, right_on=right_on) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge_anti.py b/pandas/tests/reshape/merge/test_merge_anti.py new file mode 100644 index 0000000000000..3bb9d0420bf66 --- /dev/null +++ b/pandas/tests/reshape/merge/test_merge_anti.py @@ -0,0 +1,325 @@ +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm +from pandas.core.reshape.merge import merge + + +class Test_AntiJoin: + @pytest.mark.parametrize( + "how, expected", + [ + ( + "anti_left", + DataFrame({"A": 3, "C_x": 7, "B": np.nan, "C_y": np.nan}, index=["c"]), + ), + ( + "anti_right", + DataFrame({"A": np.nan, "C_x": np.nan, "B": 4, "C_y": 9}, index=["d"]), + ), + ( + "anti_full", + DataFrame( + { + "A": [3, np.nan], + "C_x": [7, np.nan], + "B": [np.nan, 4], + "C_y": [np.nan, 9], + }, + index=["c", "d"], + ), + ), + ], + ) + def test_basic_anti_index(self, how, expected): + left = DataFrame({"A": [1, 2, 3], "C": [5, 6, 7]}, index=["a", "b", "c"]) + right = DataFrame({"B": [1, 2, 4], "C": [7, 8, 9]}, index=["a", "b", "d"]) + result = merge(left, right, how=how, left_index=True, right_index=True) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "on, how, expected", + [ + ( + ["C"], + "anti_left", + DataFrame( + {"A": [1, 2], "C": [5, 6], "B": [np.nan, np.nan]}, index=[0, 1] + ), + ), + ( + ["C"], + "anti_right", + DataFrame( + {"A": [np.nan, np.nan], "C": [8, 9], "B": [2, 4]}, index=[0, 1] + ), + ), + ( + ["C"], + "anti_full", + DataFrame( + { + "A": [1, 2, np.nan, np.nan], + "C": [5, 6, 8, 9], + "B": [np.nan, np.nan, 2, 4], + }, + index=[0, 1, 2, 3], + ), + ), + ( + None, + "anti_left", + DataFrame({"A": [1, 2], "C": [5, 6], "B": [np.nan, np.nan]}), + ), + ( + None, + "anti_right", + DataFrame({"A": [np.nan, np.nan], "C": [8, 9], "B": [2, 4]}), + ), + ( + None, + "anti_full", + DataFrame( + { + "A": [1, 2, np.nan, np.nan], + "C": [5, 6, 8, 9], + "B": [np.nan, np.nan, 2, 4], + }, + ), + ), + ], + ) + def test_basic_anti_on(self, on, how, expected): + left = DataFrame({"A": [1, 2, 3], "C": [5, 6, 7]}, index=["a", "b", "c"]) + right = DataFrame({"B": [1, 2, 4], "C": [7, 8, 9]}, index=["a", "b", "d"]) + result = merge(left, right, how=how, on=on) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "expected, how, left_on, right_on", + [ + ( + DataFrame({"A": 3, "C_x": 7, "B": np.nan, "C_y": np.nan}, index=[0]), + "anti_left", + ["A"], + ["B"], + ), + ( + DataFrame({"A": np.nan, "C_x": np.nan, "B": 4, "C_y": 9}, index=[0]), + "anti_right", + ["A"], + ["B"], + ), + ( + DataFrame( + { + "A": [3, np.nan], + "C_x": [7, np.nan], + "B": [np.nan, 4], + "C_y": [np.nan, 9], + }, + ), + "anti_full", + ["A"], + ["B"], + ), + ], + ) + def test_basic_anti_lefton_righton(self, expected, how, left_on, right_on): + left = DataFrame({"A": [1, 2, 3], "C": [5, 6, 7]}, index=["a", "b", "c"]) + right = DataFrame({"B": [1, 2, 4], "C": [7, 8, 9]}, index=["a", "b", "d"]) + result = merge( + left, + right, + how=how, + left_on=left_on, + right_on=right_on, + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "expected, how", + [ + ( + DataFrame( + { + "A": [3], + "B_x": [6], + "C_x": [7], + "B_y": [np.nan], + "C_y": [np.nan], + "D": ["c"], + }, + index=[np.nan], + ), + "anti_left", + ), + ( + DataFrame( + { + "A": [np.nan], + "B_x": [np.nan], + "C_x": [np.nan], + "B_y": [9], + "C_y": [7], + "D": ["d"], + }, + index=[2], + ), + "anti_right", + ), + ( + DataFrame( + { + "A": [3, np.nan], + "B_x": [6, np.nan], + "C_x": [7, np.nan], + "B_y": [np.nan, 9], + "C_y": [np.nan, 7], + "D": ["c", "d"], + }, + index=[np.nan, 2], + ), + "anti_full", + ), + ], + ) + def test_anti_index_with_col(self, expected, how): + left = DataFrame( + {"A": [1, 2, 3], "B": [4, 5, 6], "C": [5, 6, 7]}, index=["a", "b", "c"] + ) + right = DataFrame({"B": [5, 5, 9], "C": [4, 6, 7], "D": ["a", "b", "d"]}) + result = merge(left, right, how=how, left_index=True, right_on=["D"]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "expected, how, on", + [ + ( + DataFrame( + {"A": [1, 3], "B": [4, 6], "C": [5, 7], "D": [np.nan, np.nan]} + ).astype({"D": object}), + "anti_left", + None, + ), + ( + DataFrame( + {"A": [np.nan, np.nan], "B": [5, 9], "C": [4, 7], "D": ["a", "d"]} + ), + "anti_right", + None, + ), + ( + DataFrame( + { + "A": [1, 3, np.nan, np.nan], + "B": [4, 6, 5, 9], + "C": [5, 7, 4, 7], + "D": [np.nan, np.nan, "a", "d"], + } + ), + "anti_full", + None, + ), + ( + DataFrame( + {"A": [1, 3], "B": [4, 6], "C": [5, 7], "D": [np.nan, np.nan]} + ).astype({"D": object}), + "anti_left", + ["B", "C"], + ), + ( + DataFrame( + {"A": [np.nan, np.nan], "B": [5, 9], "C": [4, 7], "D": ["a", "d"]} + ), + "anti_right", + ["B", "C"], + ), + ( + DataFrame( + { + "A": [1, 3, np.nan, np.nan], + "B": [4, 6, 5, 9], + "C": [5, 7, 4, 7], + "D": [np.nan, np.nan, "a", "d"], + } + ), + "anti_full", + ["B", "C"], + ), + ], + ) + def test_anti_multicol(self, expected, how, on): + df_2 = DataFrame({"B": [5, 5, 9], "C": [4, 6, 7], "D": ["a", "b", "d"]}) + df_1 = DataFrame( + {"A": [1, 2, 3], "B": [4, 5, 6], "C": [5, 6, 7]}, index=["a", "b", "c"] + ) + result = merge(df_1, df_2, how=how, on=on) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "expected, how, on, left_on, right_on", + [ + ( + DataFrame({"A": np.nan, "B": "c", "C": np.nan}, index=[0]), + "anti_right", + None, + None, + None, + ), + ( + DataFrame({"A": np.nan, "B": 3, "C": np.nan}, index=[0]).astype( + {"B": object} + ), + "anti_left", + None, + None, + None, + ), + ( + DataFrame({"A": np.nan, "B": "c", "C": np.nan}, index=[0]), + "anti_right", + ["B"], + None, + None, + ), + ( + DataFrame({"A": np.nan, "B": 3, "C": np.nan}, index=[0]).astype( + {"B": object} + ), + "anti_left", + ["B"], + None, + None, + ), + ( + DataFrame( + {"A": [2.0], "B_x": [2], "C": [np.nan], "B_y": [np.nan]} + ).astype({"B_x": object, "B_y": object}), + "anti_left", + None, + ["A"], + ["C"], + ), + ( + DataFrame( + { + "A": [np.nan, np.nan], + "B_x": [np.nan, np.nan], + "C": [1.0, 3.0], + "B_y": ["a", 2], + } + ).astype({"B_x": object}), + "anti_right", + None, + ["A"], + ["C"], + ), + ], + ) + def test_anti_with_nan(self, expected, how, on, left_on, right_on): + df_1 = DataFrame({"A": [np.nan, 2, np.nan], "B": ["a", 2, 3]}) + df_2 = DataFrame({"C": [1, 3, np.nan], "B": ["a", 2, "c"]}) + result = merge(df_1, df_2, on=on, how=how, left_on=left_on, right_on=right_on) + tm.assert_frame_equal(result, expected) From 86ddac97760492eaf8c380c53af4e382e561398a Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 19 Aug 2021 00:21:19 +0530 Subject: [PATCH 07/31] replaced np funcs with pd --- pandas/core/reshape/merge.py | 69 +++++++++++++++--------------------- 1 file changed, 28 insertions(+), 41 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index b6539791319d5..a8eddd333c470 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -694,7 +694,7 @@ def __init__( self._cross = cross_col if self.how in ["anti_left", "anti_right", "anti_full"]: - self._anti_join_update() + self.left, self.right, self.how = self._anti_join_update() # note this function has side effects ( @@ -748,43 +748,29 @@ def get_result(self) -> DataFrame: return result.__finalize__(self, method="merge") def _anti_join_update(self): - def isin_nd(a, b, invert=False): - # a,b are the nD input arrays to give us - # "isin-like" functionality across them - a = np.ascontiguousarray(a) - b = np.ascontiguousarray(b) - void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1])) - A, B = a.view(void_dt).ravel(), b.view(void_dt).ravel() - return np.isin(A, B, invert=invert) - - def _multi_columns(arr_l, arr_r, how): - if len(arr_l.shape) == 1: - arr_l = np.atleast_2d(arr_l).reshape(-1, np.atleast_2d(arr_l).shape[0]) - if len(arr_r.shape) == 1: - arr_r = np.atleast_2d(arr_r).reshape(-1, np.atleast_2d(arr_r).shape[0]) - if arr_l.dtype.kind == "O" or arr_r.dtype.kind == "O": - arr_l = arr_l.astype(str) - arr_r = arr_r.astype(str) - if how == "anti_right": - join_index_l = join_index_r = isin_nd(arr_r, arr_l, invert=True) - _how = "right" - elif how == "anti_left": - join_index_l = join_index_r = isin_nd(arr_l, arr_r, invert=True) - _how = "left" + def _anti_helper(_left, _right, _how): + if not isinstance(_left, Index): + if len(_left.columns) == 1: + _left = Index(_left.values.flatten()) + else: + _left = MultiIndex.from_frame(_left) + if not isinstance(_right, Index): + if len(_right.columns) == 1: + _right = Index(_right.values.flatten()) + else: + _right = MultiIndex.from_frame(_right) + + if _how in ["anti_left", "anti_right"]: + _how = _how.split("_")[1] else: - _union = np.unique(np.vstack((arr_l, arr_r)), axis=0) - _intersect = np.array( - list({tuple(x) for x in arr_l} & {tuple(x) for x in arr_r}) - ) - _union_index = _union[isin_nd(_union, _intersect, invert=True)] - join_index_l = isin_nd(arr_l, _union_index) - join_index_r = isin_nd(arr_r, _union_index) _how = "outer" + join_index_l = ~_left.isin(_right) + join_index_r = ~_right.isin(_left) return (join_index_l, join_index_r, _how) if self.left_index and self.right_index: - join_index_l, join_index_r, self.how = _multi_columns( - self.left.index.values, self.right.index.values, self.how + join_index_l, join_index_r, self.how = _anti_helper( + self.left.index, self.right.index, self.how ) elif self.on is not None or ( None not in self.left_on and None not in self.right_on @@ -794,22 +780,23 @@ def _multi_columns(arr_l, arr_r, how): else: left_on = self.left_on right_on = self.right_on - join_index_l, join_index_r, self.how = _multi_columns( - self.left[left_on].values, self.right[right_on].values, self.how + join_index_l, join_index_r, self.how = _anti_helper( + self.left[left_on], self.right[right_on], self.how ) elif self.left_index and self.right_on is not None: - join_index_l, join_index_r, self.how = _multi_columns( - self.left.index.values, self.right[self.right_on].values, self.how + join_index_l, join_index_r, self.how = _anti_helper( + self.left.index, self.right[self.right_on], self.how ) elif self.right_index and self.left_on is not None: - join_index_l, join_index_r, self.how = _multi_columns( - self.left[self.left_on].values, self.right.index.values, self.how + join_index_l, join_index_r, self.how = _anti_helper( + self.left[self.left_on], self.right.index, self.how ) - self.left = self.left[join_index_l] - self.right = self.right[join_index_r] + self.left = self.left.loc[join_index_l] + self.right = self.right.loc[join_index_r] # sanity check to ensure correct `how` assert self.how in ["left", "right", "inner", "outer"] + return (self.left, self.right, self.how) def _maybe_drop_cross_column( self, result: DataFrame, cross_col: str | None From 84294e40afef2cb88321485aa266aeaa5b7ccf00 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 19 Aug 2021 01:26:17 +0530 Subject: [PATCH 08/31] added test with pd.NA --- pandas/tests/reshape/merge/test_merge_anti.py | 94 +++++++++++++++++-- 1 file changed, 88 insertions(+), 6 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge_anti.py b/pandas/tests/reshape/merge/test_merge_anti.py index 3bb9d0420bf66..08addef3eac02 100644 --- a/pandas/tests/reshape/merge/test_merge_anti.py +++ b/pandas/tests/reshape/merge/test_merge_anti.py @@ -1,6 +1,7 @@ import numpy as np import pytest +import pandas as pd from pandas import DataFrame import pandas._testing as tm from pandas.core.reshape.merge import merge @@ -251,11 +252,11 @@ def test_anti_index_with_col(self, expected, how): ], ) def test_anti_multicol(self, expected, how, on): - df_2 = DataFrame({"B": [5, 5, 9], "C": [4, 6, 7], "D": ["a", "b", "d"]}) - df_1 = DataFrame( + right = DataFrame({"B": [5, 5, 9], "C": [4, 6, 7], "D": ["a", "b", "d"]}) + left = DataFrame( {"A": [1, 2, 3], "B": [4, 5, 6], "C": [5, 6, 7]}, index=["a", "b", "c"] ) - result = merge(df_1, df_2, how=how, on=on) + result = merge(left, right, how=how, on=on) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -319,7 +320,88 @@ def test_anti_multicol(self, expected, how, on): ], ) def test_anti_with_nan(self, expected, how, on, left_on, right_on): - df_1 = DataFrame({"A": [np.nan, 2, np.nan], "B": ["a", 2, 3]}) - df_2 = DataFrame({"C": [1, 3, np.nan], "B": ["a", 2, "c"]}) - result = merge(df_1, df_2, on=on, how=how, left_on=left_on, right_on=right_on) + left = DataFrame({"A": [np.nan, 2, np.nan], "B": ["a", 2, 3]}) + right = DataFrame({"C": [1, 3, np.nan], "B": ["a", 2, "c"]}) + result = merge(left, right, on=on, how=how, left_on=left_on, right_on=right_on) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "expected, how, left_on, right_on", + [ + ( + DataFrame({"A": [np.nan, pd.NA], "B": ["a", 3], "C": [np.nan, np.nan]}), + "anti_left", + "B", + "B", + ), + ( + DataFrame( + {"A": [np.nan, np.nan], "B": [pd.NA, "c"], "C": [1, np.nan]} + ).astype({"A": object}), + "anti_right", + "B", + "B", + ), + ( + DataFrame( + { + "A": [np.nan, pd.NA, np.nan, np.nan], + "B": ["a", 3, pd.NA, "c"], + "C": [np.nan, np.nan, 1, np.nan], + } + ), + "anti_full", + "B", + "B", + ), + ( + DataFrame( + { + "A": [2, pd.NA], + "B_x": [2, 3], + "C": [np.nan, np.nan], + "B_y": [np.nan, np.nan], + } + ).astype({"B_x": object, "B_y": object}), + "anti_left", + "A", + "C", + ), + ( + DataFrame( + { + "A": [np.nan, np.nan], + "B_x": [np.nan, np.nan], + "C": [1.0, 3], + "B_y": [pd.NA, 2], + } + ).astype( + { + "A": object, + "B_x": object, + } + ), + "anti_right", + "A", + "C", + ), + ( + DataFrame( + { + "A": [2, pd.NA, np.nan, np.nan], + "B_x": [2, 3, np.nan, np.nan], + "C": [np.nan, np.nan, 1, 3], + "B_y": [np.nan, np.nan, pd.NA, 2], + } + ).astype({"B_x": object}), + "anti_full", + "A", + "C", + ), + ], + ) + def test_anti_with_nan_and_NA(self, expected, how, left_on, right_on): + left = DataFrame({"A": [np.nan, 2, pd.NA], "B": ["a", 2, 3]}) + right = DataFrame({"C": [1, 3, np.nan], "B": [pd.NA, 2, "c"]}) + result = merge(left, right, how=how, left_on=left_on, right_on=right_on) tm.assert_frame_equal(result, expected) From 43ae0a12382be35d4544fbdb5f1e5c761c9eb27a Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 19 Aug 2021 13:33:23 +0530 Subject: [PATCH 09/31] suggested changes --- pandas/core/frame.py | 39 +++++++++++++++- pandas/core/reshape/merge.py | 87 ++++++++++++++++++++++++++---------- 2 files changed, 102 insertions(+), 24 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 823de2133f0b3..03ef23c164c13 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -269,7 +269,8 @@ ----------%s right : DataFrame or named Series Object to merge with. -how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' +how : {'left', 'right', 'outer', 'inner', 'cross', + 'anti_left', 'anti_right', 'anti_full'}, default 'inner' Type of merge to be performed. * left: use only keys from left frame, similar to a SQL left outer join; @@ -284,6 +285,15 @@ of the left keys. .. versionadded:: 1.2.0 + * anti_left: use only keys from left frame that are absent in right + frame; preserve key order. + * anti_right: use keys from the right frame that are absent in the + left frame; preserve key order. + * anti_full: use keys from the right frame that are absent in the + left frame, and the keys in the left frame that are absent in the + right frame; sort keys lexicographically. + + .. versionadded:: 1.4.0 on : label or list Column or index level names to join on. These must be found in both @@ -443,6 +453,33 @@ 1 foo 8 2 bar 7 3 bar 8 + +>>> df1 = pd.DataFrame({"A": [1, 2, 3], "C": [5, 6, 7]}) +>>> df2 = pd. DataFrame({"B": [1, 2, 4], "C": [7, 8, 9]}) +>>> df1 + A C +0 1 5 +1 2 6 +2 3 7 +>>> df2 + B C +0 1 7 +1 2 8 +2 4 9 +>>> df1.merge(df2, on="C", how="anti_left") + A C B +0 1 5 NaN +1 2 6 NaN +>>> df1.merge(df2, on="C", how="anti_right") + A C B +0 NaN 8 2 +1 NaN 9 4 +>>> df1.merge(df2, on="C", how="anti_full") + A C B +0 1.0 5 NaN +1 2.0 6 NaN +2 NaN 8 2.0 +3 NaN 9 4.0 """ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index a8eddd333c470..845baf08c5a06 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -691,10 +691,9 @@ def __init__( cross_col, ) = self._create_cross_configuration(self.left, self.right) self.left_on = self.right_on = [cross_col] - self._cross = cross_col - - if self.how in ["anti_left", "anti_right", "anti_full"]: + elif self.how in ["anti_left", "anti_right", "anti_full"]: self.left, self.right, self.how = self._anti_join_update() + self._cross = cross_col # note this function has side effects ( @@ -748,33 +747,20 @@ def get_result(self) -> DataFrame: return result.__finalize__(self, method="merge") def _anti_join_update(self): - def _anti_helper(_left, _right, _how): - if not isinstance(_left, Index): - if len(_left.columns) == 1: - _left = Index(_left.values.flatten()) - else: - _left = MultiIndex.from_frame(_left) - if not isinstance(_right, Index): - if len(_right.columns) == 1: - _right = Index(_right.values.flatten()) - else: - _right = MultiIndex.from_frame(_right) - - if _how in ["anti_left", "anti_right"]: - _how = _how.split("_")[1] - else: - _how = "outer" - join_index_l = ~_left.isin(_right) - join_index_r = ~_right.isin(_left) - return (join_index_l, join_index_r, _how) - + """ + Converts `anti_left`, `anti_right` and `anti_full` configurations into `left`, + `right` and `outer` join configurations. + Calls `_anti_helper` with the indices or columns to be merged on. + """ if self.left_index and self.right_index: + # Merge using `right_index` and `left_index` join_index_l, join_index_r, self.how = _anti_helper( self.left.index, self.right.index, self.how ) elif self.on is not None or ( None not in self.left_on and None not in self.right_on ): + # Merge using `on` or `left_on` and `right_on` if self.on is not None: left_on = right_on = self.on else: @@ -784,10 +770,12 @@ def _anti_helper(_left, _right, _how): self.left[left_on], self.right[right_on], self.how ) elif self.left_index and self.right_on is not None: + # Merge using `left_index` and `right_on` join_index_l, join_index_r, self.how = _anti_helper( self.left.index, self.right[self.right_on], self.how ) elif self.right_index and self.left_on is not None: + # Merge using `left_on` and `right_index` join_index_l, join_index_r, self.how = _anti_helper( self.left[self.left_on], self.right.index, self.how ) @@ -1501,6 +1489,59 @@ def _validate(self, validate: str) -> None: raise ValueError("Not a valid argument for validate") +def _anti_helper( + _left: Index | DataFrame, + _right: Index | DataFrame, + _how: str, +) -> tuple[npt.NDArray, npt.NDArray, str]: + """ + Converts `anti_left`, `anti_right` and `anti_full` configurations into `left`, + `right` and `outer` join configurations + + Parameters + ---------- + _left : DataFrame, Index + left frame with columns if merged with `on` or `left/right_on`, else Index + _right : DataFrame, Index + right frame with columns if merged with `on` or `left/right_on`, else Index + _how : {'anti_left', 'anti_right', 'anti_full'} + + Returns + ------- + np.ndarray[bool] + Indexer of left_keys + np.ndarray[bool] + Indexer of right_keys + {"left", "right", "outer"} + Native join configurations + + """ + + # If not Index. Convert the columns into Index or + # MultiIndex as required + if not isinstance(_left, Index): + if len(_left.columns) == 1: + _left = Index(_left.values.flatten()) + else: + _left = MultiIndex.from_frame(_left) + if not isinstance(_right, Index): + if len(_right.columns) == 1: + _right = Index(_right.values.flatten()) + else: + _right = MultiIndex.from_frame(_right) + + how_dict: dict[str, str] = { + "anti_left": "left", + "anti_right": "right", + "anti_full": "outer", + } + _how = how_dict[_how] + + join_index_l = ~_left.isin(_right) + join_index_r = ~_right.isin(_left) + return (join_index_l, join_index_r, _how) + + def get_join_indexers( left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: From c36705cb75efe91de629ae377b4d1f0125226d31 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 19 Aug 2021 18:48:38 +0530 Subject: [PATCH 10/31] added tests covering Categorcal, EA, datetime,datetime w tz, EA+multicol --- pandas/core/reshape/merge.py | 4 +- pandas/tests/reshape/merge/test_merge_anti.py | 118 +++++++++++++++++- 2 files changed, 119 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 845baf08c5a06..833b7e58ed53f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1521,12 +1521,12 @@ def _anti_helper( # MultiIndex as required if not isinstance(_left, Index): if len(_left.columns) == 1: - _left = Index(_left.values.flatten()) + _left = Index(_left.values.flatten(), dtype=_left.dtypes[0]) else: _left = MultiIndex.from_frame(_left) if not isinstance(_right, Index): if len(_right.columns) == 1: - _right = Index(_right.values.flatten()) + _right = Index(_right.values.flatten(), dtype=_right.dtypes[0]) else: _right = MultiIndex.from_frame(_right) diff --git a/pandas/tests/reshape/merge/test_merge_anti.py b/pandas/tests/reshape/merge/test_merge_anti.py index 08addef3eac02..4686e1130902a 100644 --- a/pandas/tests/reshape/merge/test_merge_anti.py +++ b/pandas/tests/reshape/merge/test_merge_anti.py @@ -2,7 +2,10 @@ import pytest import pandas as pd -from pandas import DataFrame +from pandas import ( + Categorical, + DataFrame, +) import pandas._testing as tm from pandas.core.reshape.merge import merge @@ -405,3 +408,116 @@ def test_anti_with_nan_and_NA(self, expected, how, left_on, right_on): right = DataFrame({"C": [1, 3, np.nan], "B": [pd.NA, 2, "c"]}) result = merge(left, right, how=how, left_on=left_on, right_on=right_on) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "how, expected", + [ + ( + "anti_left", + DataFrame( + {"vals_x": [20, 17], "vals_y": [np.nan] * 2}, + index=pd.date_range("1/2/2010", periods=2, freq="2d"), + ), + ), + ( + "anti_right", + DataFrame( + {"vals_x": [np.nan] * 2, "vals_y": [17, 21]}, + index=pd.date_range("1/7/2010", periods=2, freq="2d"), + ), + ), + ( + "anti_full", + DataFrame( + { + "vals_x": [20, 17, np.nan, np.nan], + "vals_y": [np.nan, np.nan, 17, 21], + }, + index=pd.date_range("1/7/2010", periods=2, freq="2d").union( + pd.date_range("1/2/2010", periods=2, freq="2d") + ), + ), + ), + ], + ) + def test_anti_datetime(self, how, expected): + left = DataFrame( + {"vals": [10, 20, 15, 17, 21]}, + index=pd.date_range("1/1/2010", periods=5, freq="D"), + ) + right = DataFrame( + {"vals": [10, 20, 15, 17, 21]}, + index=pd.date_range("1/1/2010", periods=5, freq="2D"), + ) + result = merge(left, right, left_index=True, right_index=True, how=how) + tm.assert_frame_equal(result, expected) + + def test_anti_datetime_tz(self): + expected = DataFrame( + { + "Date": pd.date_range( + "20-10-2021", periods=2, freq="6D", tz="Asia/Kolkata" + ), + "a_x": [3, 4], + "a_y": [np.nan, np.nan], + } + ) + left = DataFrame( + { + "Date": pd.date_range( + "10-02-2021", periods=5, freq="6D", tz="Asia/Kolkata" + ), + "a": range(5), + } + ) + right = DataFrame( + { + "Date": pd.date_range( + "10-02-2021", periods=5, freq="3D", tz="Asia/Kolkata" + ), + "a": range(5), + } + ) + result = merge(left, right, how="anti_left", on="Date") + tm.assert_frame_equal(result, expected) + + def test_anti_categorical(self): + left = DataFrame({"A": list("abca"), "B": list("bccd")}, dtype="category") + right = DataFrame({"A": list("dad"), "C": list("gap")}, dtype="category") + expected = DataFrame( + { + "A": ["b", "c"], + "B": Categorical(["c", "c"], categories=list("bcd")), + "C": Categorical([np.nan, np.nan], categories=list("agp")), + } + ) + result = merge(left, right, how="anti_left") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", ["Int64", "Int32", "UInt32", "UInt64", "Float32", "Float64"] + ) + def test_anti_EA_dtypes(self, dtype): + left = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, dtype=dtype) + right = DataFrame({"A": [1, 4, 5], "C": [7, 6, 8]}, dtype=dtype) + result = merge(left, right, how="anti_right") + expected = DataFrame( + {"A": [4, 5], "B": [pd.NA, pd.NA], "C": [6, 8]}, dtype=dtype + ).astype({"A": object}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", ["Int64", "Int32", "UInt32", "UInt64", "Float32", "Float64"] + ) + def test_anti_EA_dtypes_with_multicol(self, dtype): + left = DataFrame( + {"A": [1, 2, 3], "B": [4, 5, 6], "C": [5, 6, 7]}, + index=["a", "b", "c"], + dtype=dtype, + ) + right = DataFrame({"B": [5, 5, 9], "C": [4, 6, 7], "D": [1, 0, 0]}, dtype=dtype) + expected = DataFrame( + columns=list("ABCD"), data=[[1, 4, 5, pd.NA], [3, 6, 7, pd.NA]], dtype=dtype + ) + result = merge(left, right, how="anti_left") + tm.assert_frame_equal(result, expected) From 951406a98aebf1b8930be24426b78150cdd0d621 Mon Sep 17 00:00:00 2001 From: Shoham Debnath Date: Fri, 20 Aug 2021 13:19:56 +0530 Subject: [PATCH 11/31] Update pandas/tests/reshape/merge/test_merge_anti.py simplified test_basic_anti_index (thx to @attck68) Co-authored-by: attack68 <24256554+attack68@users.noreply.github.com> --- pandas/tests/reshape/merge/test_merge_anti.py | 35 +++++-------------- 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge_anti.py b/pandas/tests/reshape/merge/test_merge_anti.py index 4686e1130902a..91152404659f1 100644 --- a/pandas/tests/reshape/merge/test_merge_anti.py +++ b/pandas/tests/reshape/merge/test_merge_anti.py @@ -12,33 +12,16 @@ class Test_AntiJoin: @pytest.mark.parametrize( - "how, expected", - [ - ( - "anti_left", - DataFrame({"A": 3, "C_x": 7, "B": np.nan, "C_y": np.nan}, index=["c"]), - ), - ( - "anti_right", - DataFrame({"A": np.nan, "C_x": np.nan, "B": 4, "C_y": 9}, index=["d"]), - ), - ( - "anti_full", - DataFrame( - { - "A": [3, np.nan], - "C_x": [7, np.nan], - "B": [np.nan, 4], - "C_y": [np.nan, 9], - }, - index=["c", "d"], - ), - ), - ], + "how, exp_index, exp_values", [ + ("anti_left", ["c"], [3, 30, np.nan, np.nan]), + ("anti_right", ["d"], [np.nan, np.nan, 4, 40]), + ("anti_full", ["c", "d"], [[3, 30, np.nan, np.nan], [np.nan, np.nan, 4, 40]]) + ] ) - def test_basic_anti_index(self, how, expected): - left = DataFrame({"A": [1, 2, 3], "C": [5, 6, 7]}, index=["a", "b", "c"]) - right = DataFrame({"B": [1, 2, 4], "C": [7, 8, 9]}, index=["a", "b", "d"]) + def test_basic_anti_index(self, how, exp_index, exp_values): + left = DataFrame({"A": [1, 2, 3], "C": [10, 20, 30]}, index=["a", "b", "c"]) + right = DataFrame({"B": [1, 2, 4], "C": [10, 20, 40]}, index=["a", "b", "d"]) + expected = DataFrame(exp_values, index=exp_index, columns=["A", "C_x", "B", "C_y"]) result = merge(left, right, how=how, left_index=True, right_index=True) tm.assert_frame_equal(result, expected) From db80abf461bddc1e4572c8b95fbc68af88a91f6f Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Fri, 20 Aug 2021 13:29:06 +0530 Subject: [PATCH 12/31] formatted with black --- pandas/tests/reshape/merge/test_merge_anti.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge_anti.py b/pandas/tests/reshape/merge/test_merge_anti.py index 91152404659f1..bad0e83dacc2a 100644 --- a/pandas/tests/reshape/merge/test_merge_anti.py +++ b/pandas/tests/reshape/merge/test_merge_anti.py @@ -12,16 +12,23 @@ class Test_AntiJoin: @pytest.mark.parametrize( - "how, exp_index, exp_values", [ + "how, exp_index, exp_values", + [ ("anti_left", ["c"], [3, 30, np.nan, np.nan]), ("anti_right", ["d"], [np.nan, np.nan, 4, 40]), - ("anti_full", ["c", "d"], [[3, 30, np.nan, np.nan], [np.nan, np.nan, 4, 40]]) - ] + ( + "anti_full", + ["c", "d"], + [[3, 30, np.nan, np.nan], [np.nan, np.nan, 4, 40]], + ), + ], ) def test_basic_anti_index(self, how, exp_index, exp_values): left = DataFrame({"A": [1, 2, 3], "C": [10, 20, 30]}, index=["a", "b", "c"]) right = DataFrame({"B": [1, 2, 4], "C": [10, 20, 40]}, index=["a", "b", "d"]) - expected = DataFrame(exp_values, index=exp_index, columns=["A", "C_x", "B", "C_y"]) + expected = DataFrame( + exp_values, index=exp_index, columns=["A", "C_x", "B", "C_y"] + ) result = merge(left, right, how=how, left_index=True, right_index=True) tm.assert_frame_equal(result, expected) From d93c0acb5abfa6fa196acbc97c70658e2738670c Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Fri, 20 Aug 2021 13:49:42 +0530 Subject: [PATCH 13/31] changed a few test setup --- pandas/tests/reshape/merge/test_merge_anti.py | 38 ++++++------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge_anti.py b/pandas/tests/reshape/merge/test_merge_anti.py index bad0e83dacc2a..358171c269d2d 100644 --- a/pandas/tests/reshape/merge/test_merge_anti.py +++ b/pandas/tests/reshape/merge/test_merge_anti.py @@ -14,8 +14,8 @@ class Test_AntiJoin: @pytest.mark.parametrize( "how, exp_index, exp_values", [ - ("anti_left", ["c"], [3, 30, np.nan, np.nan]), - ("anti_right", ["d"], [np.nan, np.nan, 4, 40]), + ("anti_left", ["c"], [[3, 30, np.nan, np.nan]]), + ("anti_right", ["d"], [[np.nan, np.nan, 4, 40]]), ( "anti_full", ["c", "d"], @@ -33,60 +33,44 @@ def test_basic_anti_index(self, how, exp_index, exp_values): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - "on, how, expected", + "on, how, data", [ ( ["C"], "anti_left", - DataFrame( - {"A": [1, 2], "C": [5, 6], "B": [np.nan, np.nan]}, index=[0, 1] - ), + [[1, 5, np.nan], [2, 6, np.nan]], ), ( ["C"], "anti_right", - DataFrame( - {"A": [np.nan, np.nan], "C": [8, 9], "B": [2, 4]}, index=[0, 1] - ), + [[np.nan, 8, 2], [np.nan, 9, 4]], ), ( ["C"], "anti_full", - DataFrame( - { - "A": [1, 2, np.nan, np.nan], - "C": [5, 6, 8, 9], - "B": [np.nan, np.nan, 2, 4], - }, - index=[0, 1, 2, 3], - ), + [[1, 5, np.nan], [2, 6, np.nan], [np.nan, 8, 2], [np.nan, 9, 4]], ), ( None, "anti_left", - DataFrame({"A": [1, 2], "C": [5, 6], "B": [np.nan, np.nan]}), + [[1, 5, np.nan], [2, 6, np.nan]], ), ( None, "anti_right", - DataFrame({"A": [np.nan, np.nan], "C": [8, 9], "B": [2, 4]}), + [[np.nan, 8, 2], [np.nan, 9, 4]], ), ( None, "anti_full", - DataFrame( - { - "A": [1, 2, np.nan, np.nan], - "C": [5, 6, 8, 9], - "B": [np.nan, np.nan, 2, 4], - }, - ), + [[1, 5, np.nan], [2, 6, np.nan], [np.nan, 8, 2], [np.nan, 9, 4]], ), ], ) - def test_basic_anti_on(self, on, how, expected): + def test_basic_anti_on(self, on, how, data): left = DataFrame({"A": [1, 2, 3], "C": [5, 6, 7]}, index=["a", "b", "c"]) right = DataFrame({"B": [1, 2, 4], "C": [7, 8, 9]}, index=["a", "b", "d"]) + expected = DataFrame(data, columns=["A", "C", "B"]) result = merge(left, right, how=how, on=on) tm.assert_frame_equal(result, expected) From 79bbbb9d528588c11e68643d030782195e028c3d Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Fri, 10 Sep 2021 14:01:40 +0530 Subject: [PATCH 14/31] removed object cast for EA dtypes; xref #43152 --- pandas/tests/reshape/merge/test_merge_anti.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge_anti.py b/pandas/tests/reshape/merge/test_merge_anti.py index 358171c269d2d..5998353f5855a 100644 --- a/pandas/tests/reshape/merge/test_merge_anti.py +++ b/pandas/tests/reshape/merge/test_merge_anti.py @@ -477,7 +477,7 @@ def test_anti_EA_dtypes(self, dtype): result = merge(left, right, how="anti_right") expected = DataFrame( {"A": [4, 5], "B": [pd.NA, pd.NA], "C": [6, 8]}, dtype=dtype - ).astype({"A": object}) + ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( From 14d0d4c2e09622c061ec54ad52149ea56dd5d399 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sat, 25 Sep 2021 00:39:32 +0530 Subject: [PATCH 15/31] more comments --- pandas/core/frame.py | 8 ++++++-- pandas/tests/reshape/merge/test_merge_anti.py | 5 +++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 96d965e7f1c72..e3be19f0d8de3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -275,8 +275,8 @@ ----------%s right : DataFrame or named Series Object to merge with. -how : {'left', 'right', 'outer', 'inner', 'cross', - 'anti_left', 'anti_right', 'anti_full'}, default 'inner' +how : {'left', 'right', 'outer', 'inner', 'cross', 'anti_left', \ +'anti_right', 'anti_full'}, default 'inner' Type of merge to be performed. * left: use only keys from left frame, similar to a SQL left outer join; @@ -473,14 +473,18 @@ 1 2 8 2 4 9 >>> df1.merge(df2, on="C", how="anti_left") +# because `7` is common in column `C` it's dropped from df1 A C B 0 1 5 NaN 1 2 6 NaN >>> df1.merge(df2, on="C", how="anti_right") +# because `7` is common in column `C` it's dropped from df2 A C B 0 NaN 8 2 1 NaN 9 4 >>> df1.merge(df2, on="C", how="anti_full") +# because `7` is common in column `C` it's dropped from both +# df1 and df2, then outer merged A C B 0 1.0 5 NaN 1 2.0 6 NaN diff --git a/pandas/tests/reshape/merge/test_merge_anti.py b/pandas/tests/reshape/merge/test_merge_anti.py index 5998353f5855a..cebecf900e9f1 100644 --- a/pandas/tests/reshape/merge/test_merge_anti.py +++ b/pandas/tests/reshape/merge/test_merge_anti.py @@ -24,6 +24,7 @@ class Test_AntiJoin: ], ) def test_basic_anti_index(self, how, exp_index, exp_values): + # basic test containing NaNs w/o on param left = DataFrame({"A": [1, 2, 3], "C": [10, 20, 30]}, index=["a", "b", "c"]) right = DataFrame({"B": [1, 2, 4], "C": [10, 20, 40]}, index=["a", "b", "d"]) expected = DataFrame( @@ -68,6 +69,7 @@ def test_basic_anti_index(self, how, exp_index, exp_values): ], ) def test_basic_anti_on(self, on, how, data): + # basic test containing NaNs with on param left = DataFrame({"A": [1, 2, 3], "C": [5, 6, 7]}, index=["a", "b", "c"]) right = DataFrame({"B": [1, 2, 4], "C": [7, 8, 9]}, index=["a", "b", "d"]) expected = DataFrame(data, columns=["A", "C", "B"]) @@ -105,6 +107,7 @@ def test_basic_anti_on(self, on, how, data): ], ) def test_basic_anti_lefton_righton(self, expected, how, left_on, right_on): + # basic test containing NaNs with left_on / right_on params left = DataFrame({"A": [1, 2, 3], "C": [5, 6, 7]}, index=["a", "b", "c"]) right = DataFrame({"B": [1, 2, 4], "C": [7, 8, 9]}, index=["a", "b", "d"]) result = merge( @@ -164,6 +167,7 @@ def test_basic_anti_lefton_righton(self, expected, how, left_on, right_on): ], ) def test_anti_index_with_col(self, expected, how): + # basic test containing NaNs with left_index and right_on params left = DataFrame( {"A": [1, 2, 3], "B": [4, 5, 6], "C": [5, 6, 7]}, index=["a", "b", "c"] ) @@ -229,6 +233,7 @@ def test_anti_index_with_col(self, expected, how): ], ) def test_anti_multicol(self, expected, how, on): + # test with multicol with and w/o on param right = DataFrame({"B": [5, 5, 9], "C": [4, 6, 7], "D": ["a", "b", "d"]}) left = DataFrame( {"A": [1, 2, 3], "B": [4, 5, 6], "C": [5, 6, 7]}, index=["a", "b", "c"] From 3fe64f41fd7273ff918cfc656e2db7684a3c3ed1 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sat, 25 Sep 2021 00:52:25 +0530 Subject: [PATCH 16/31] added in merge.rst --- doc/source/getting_started/comparison/includes/merge.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/source/getting_started/comparison/includes/merge.rst b/doc/source/getting_started/comparison/includes/merge.rst index b8e3f54fd132b..55cd352cf951f 100644 --- a/doc/source/getting_started/comparison/includes/merge.rst +++ b/doc/source/getting_started/comparison/includes/merge.rst @@ -15,3 +15,12 @@ data does not have to be sorted ahead of time, and different join types are acco outer_join = df1.merge(df2, on=["key"], how="outer") outer_join + + anti_left_join = df1.merge(df2, on=["key"], how="anti_left") + anti_left_join + + anti_right_join = df1.merge(df2, on=["key"], how="anti_right") + anti_right_join + + anti_full_join = df1.merge(df2, on=["key"], how="anti_full") + anti_full_join From fc50027dee02b780acab0a9f7d24341854657f11 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sat, 25 Sep 2021 16:40:11 +0530 Subject: [PATCH 17/31] removed comments from example; failing doctests --- pandas/core/frame.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e3be19f0d8de3..02b96c19e0272 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -473,18 +473,14 @@ 1 2 8 2 4 9 >>> df1.merge(df2, on="C", how="anti_left") -# because `7` is common in column `C` it's dropped from df1 A C B 0 1 5 NaN 1 2 6 NaN >>> df1.merge(df2, on="C", how="anti_right") -# because `7` is common in column `C` it's dropped from df2 A C B 0 NaN 8 2 1 NaN 9 4 >>> df1.merge(df2, on="C", how="anti_full") -# because `7` is common in column `C` it's dropped from both -# df1 and df2, then outer merged A C B 0 1.0 5 NaN 1 2.0 6 NaN From aba9a300d9e2ffede32cc43606d239762ea2013c Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sat, 25 Sep 2021 16:58:49 +0530 Subject: [PATCH 18/31] reversed mm dd order in test_anti_datetime_tz to prevent UserWarning --- folder/subfolder/out.csv | 3 +++ pandas/tests/reshape/merge/test_merge_anti.py | 2 +- path_to_file.xlsx | Bin 0 -> 5582 bytes 3 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 folder/subfolder/out.csv create mode 100644 path_to_file.xlsx diff --git a/folder/subfolder/out.csv b/folder/subfolder/out.csv new file mode 100644 index 0000000000000..98e237e417a30 --- /dev/null +++ b/folder/subfolder/out.csv @@ -0,0 +1,3 @@ +,name,mask,weapon +0,Raphael,red,sai +1,Donatello,purple,bo staff diff --git a/pandas/tests/reshape/merge/test_merge_anti.py b/pandas/tests/reshape/merge/test_merge_anti.py index cebecf900e9f1..4f573a9728f2e 100644 --- a/pandas/tests/reshape/merge/test_merge_anti.py +++ b/pandas/tests/reshape/merge/test_merge_anti.py @@ -435,7 +435,7 @@ def test_anti_datetime_tz(self): expected = DataFrame( { "Date": pd.date_range( - "20-10-2021", periods=2, freq="6D", tz="Asia/Kolkata" + "10-20-2021", periods=2, freq="6D", tz="Asia/Kolkata" ), "a_x": [3, 4], "a_y": [np.nan, np.nan], diff --git a/path_to_file.xlsx b/path_to_file.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..e413ed7339a719e99f6eae3558d8585fe165a08f GIT binary patch literal 5582 zcmZ`-1y~e&_g+9+YGn}tmktG4Is}n!Wa*M-$)%PK0VxTkLs|qxI;50FTv9+9mM%$I zq(Sgouh0LxSN`|g*_nCfdFDOyJM*6No-+rkf^&@$000mG_CiIq@TJr&o?@=5FgrPB zw{o_GBAi{^Am%PE+}@6GwU`IQ*u3PjKYLx9>!sSIEhL9DE33);Xd!V{GS$)nEu!tQ zPGDzh`FSFnyGuo+_rkSfmzfgC0HB*ub&re-o#;|+3oLyAVqfD2o)0Q2nU#!!-x3z2 zt*Kp-#>nR&NWo>^kJdJ;Twv@U3m9=pej*&_ibe%@*$z|a$I)VTCeS~YD+(tm`GQ&I z92NjT@z3R2IwN4e*84eLD!PM*JnU!GiV3@;M?2JS3Z~9UK-4=IZZ-PKPKHnO;@Dbc zr{fzvT4aB&>9}2cnsXM<@Y%SQnTjjIEtT!=PU=$UHb`=m9YI~#^Mz$^2=7@Y(&bw0 z*>i

0V)%PTM$&0Brt;Y*8Qh((g1R(GaBd5Anjgm;y;sCB|*nLF6I(R6FaNADpI> zn;~Osm8MS|eLMPv;;X(SmA}^UKdzx1Tr)nsth%7|V2l<|j4plYy#Kf-?5@o`=*QwY zu4h>l{&`2IdfSKKz=~4(KmDgZlh>}q1pqW@0e~AA|GnW5cUzbv?DvuP*BjU|P>V&$ zQn&1L%D2?V+z|U*o?a$2l^?cKS2A1kwkz^MRpBka(zG5YH<=v@LP8Dln=DRr&}(v=rVED(347MnpX;IlwW2K9d;0-_C%3vz39-M+IxW?K?j$=+*BzMC_VtOg63)=N zQdr`uP)t8<>uM9S_H5R5nC5*G83v@onLy=H91_x4ag~+b+6RYJ$o9ZL+AIwXvTjm} z3Q>zq#b)+$-&iMjSBHzh4iIM`OA4b-A#$oIGrX+YE-N|8j3@%t&=^1bndHtY#Z&&( zor!LMFsr4}6=pNgYJxUJ9{M&r4W{k)VND*lYuXTQ9T`2cL2BWEQu~@31uNEc_h^11 zfdP3V8{V`af6J#{spvApk=b%R^f&C?dt38U-#jxwP4#WUsptz%TDAJ9K`Dy79&f^| zPj28eQLD}=66Pjc^f(t56^Bt@d9Z{=3F02|`29K#0~xojfRufD0>u&5#HT$dG?TC` zWt$Pl!CasRm49Gw%S*|cu-ImzC& z|HdC7M5-iB+52seml3bpFG=^u2Zbl?tn_!>V_%cLkx#w84o!=68lqm}6TCx^TYh4_ z%f0DhZP6+dQ>@5A(67oxSkka}KbwP&RuIWM5-(g8rK${m6lUt$piRCJ#}ng=6+bNN zW~9ow`ORWo7FoGlW%pQ)NUn=*MXmpR%*^7lQ|R*OTYfdB+vmA-GEo*agUK3%CS`GO z64$n#UYGd-*{q7v-60-6c12GcrrvVnjaFW@0ej?gY2kqR=Qe0ZZ-F%BC3j+zuycB$ zx!dAOM}#Y9=Plr>Tic^8AIk_G9f`wlD(XD^G_c{=VyxFQI1xeT5G3m$hUwHfw^J$$6eBQd{kZ6hgZ zS=$w2(U_Y<@)iDFhr>ha#q;(L9W;7#k6vm4CX&{?t?lO0hT)$xpd!-`prt0_2u;U3%0~#% z>iiGVU9Xb$j=5HrYaCygB?R}0;0EwHNWKhHLVee<>vf1et!ZDYaku9rDUVPL(-za0 z$JGgNIND?}ST{YM=_k8s8I(&~Rw$RZ`XkzKvQS1vo_VPRBo+9=@B%arrpf_DaH`cT8*Llx9gz- zTWkf)h?$L7Q*SnD!Y_WJSC-3P`VWA~sDfwq`o4*C z6!@c1v-%QDaeIja08sxHcrRy!gPSc3=I#dh>-3k(hb5w%P(0VeO+O9c$VzlZ8jIki zUzhXZWw5REXw=eVHJrF^s!S*%MMBrB+ufD&eYKu+DIM}==wnpO9;06z8J7?qo=qlD1EneTM+~|rl0FxPp!2(43tti7dnxAagA@Ms20a#FNMWh4 zVozw&kDG)W(*B{zGemDr#nV>}D8CY)C3uC_;|eHbe;)RsvGGZeH#mz)G0nR{-Vl|p}=#{*jZ|nS&MO5On6OfLX5hj?Cb8JaZ*?KMGGRo({z*xwr5nR6B2Ev(tcUa`=3Y~D zRk}hnOdq{m+lnf-;*Cw!8lzR|vAT0w!J3bC(yfc&e7Ux7n_{Dp30YT}6K%qU()$Rh ze6`D5Ric4yh}HktB)#wtxZw#AyUAf2!JK(di)wNtD+efWAu?|iVtYVccc9vKhJD}p zdArJzf|IZJ*uy`WrF)!$dmY1S4+#K(`Ty$8+}wTOFt=ZH&gvOCpm-=RIXiaKG8zP3 zBXEqAH_UE@9hm!RTZmF9rm*+1ZdByNtcav;)@)JJs^;q7pFbTi7RnN`hD;**UNVZ^ zDQ1hULyjhjIE@>z+3ib>l~yYj)#M|=hip#vpKZSYl%l-dJ(7_L-|wcYmFUhXjl|X3 zb0#X}naKWFc8gc|cyOO-ZsswFE4BqCsLzF0U%zr0P_3_;(#JsxbRLNGYsAwejS7ad zYd%WH^LD-22{$a>q*ej?JQY-BWmeZ-`I1T0GEAq>n*4aX+51{>El~M}WwhA?5~*Qv zKks#QXwE>)O7ieP{V^K+uw-C8XV1sJ3}*(XxIIi{>pFq;NIK0&cMlt~nB`Ci!|)FM zaV?2s-bR4Phsc;~$CZzCm2ax%fM%nuwc3>}3DceH&;rHb+sVAj(`xM-@;alRT+A~0 zB56m(`HX~ZTQq9~H{-ZCJjUloJq)=+g>}jX7<+|1ny1CwB4wnyAR*Bq=9wSXh3beM z*gAnVEy+(f!-ZCieGpmY3w$~k<3|-JU;azLk$RaAiru;*u#c;m`aEeEs{&GnW}_y!$_l`UTqKW;IfZVf1dn1JA2Q9fRy8ObPM zW0gtcD)%>B#8+MFb`dZ!bC7vJB>_*2jd{Rjp+C}A0{qc}7C)Z1_f{U=AQ7Y6@y8#b zqG`s9s_(dzhAB(d{lU-8V#e;NEkfWV(InI_>Cp!*hRd@^_LW1?Hs7=9P_wk9#0Xf_ z1WlaFY*0LG(aAC(=&1)*#%(nH5e`3SjfK!16iw3DE|LJF#(qbg{Y5~oII#mVL%*HE zqLdOw(bhr1LKxoW*h#^K9f2j2(D?)((xR}pC63AB?a#DHjcTN8!=%d%Yyf~5 zb2df5;BFA^U&pM3_f8*qu1f}f8p`IuA7YH)hwV{aJH^s{O*RR{jgrf1t(Y!wjU&G0 z^FSOp1Gbag5y`KME~365*R(SnYrs;hV~w>Nh?g!OMeDivBzt$alPdT~KaSnK{{r#(QM-(sM!S zOD>Z>Rk~cy$m`n$fw#Mk;zl+qESYM2jn+?HZMBJFW-M8syIRy-3I_L(Z3h7UbQI!= z!Z*h#|18E)vfujftF&7n_hZ-Ce8Jj4`9!`aepRH_QML-0Z>05n%BV@ zk;%%1NVd~ME{)kPA&m5yVFU6PX80+UC+5zL;=BI4!ETq5_f#|9xo;mE;|Zo$z9qT9 zCn|1=XB~k!<9fg4uo>anr>B^2b(Kzh$F)~2yTw5VWcnVAzINGUB4`EH08X;9dsn|H z(xCsq0+#icVS>`y$NF3U#GXxJrK%K=%ahbnpAFu|w%0S?(a(P$oEaAJG_u0s%RXby zW_VpzF8yRJ!Re)F--TMrK`NJ>nIHfNeT}!bbS$!ae#|W-w5i;nbwB(0n@bIvJM>M* z#W^H5adsEIjQp|xWaGs#mthTtg-8q=G{4#S9eMxc3VGhtXi3HcJ*pR~E-;F0^3`$K$rhcM%=A5SFxTTs3At2ebQNE5Q8?(nHOz+2l& zJ$F8;XQyxY%Jsw*^g?vx3hhHV2+9|ZuBRdW-gL7^x00*r)Z)&m?QOd*K+L>vO=yI4 zYp(Q;bw`G3I9mp<5~%YVM3t)M_w_C!k->0O(N0?MYnilkW z*zsuayF16CoDwDTEbanR71!9gu*75`D&FLg)7-F|U)Ui{+vC%Sjd_1QPo+O#GH)n z&$Hs=68Gs7ep8a&p0yiL6)bE@oc}#F!^F>DzaY#@{{J(ytLUpktlwAwAPB4CpXmP^ zX Date: Sun, 26 Sep 2021 16:35:05 +0530 Subject: [PATCH 19/31] Update pandas/core/reshape/merge.py Co-authored-by: JHM Darbyshire <24256554+attack68@users.noreply.github.com> --- pandas/core/reshape/merge.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 90a9ac0d67b86..18c5be9c63432 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1522,16 +1522,12 @@ def _anti_helper( # If not Index. Convert the columns into Index or # MultiIndex as required - if not isinstance(_left, Index): - if len(_left.columns) == 1: - _left = Index(_left.values.flatten(), dtype=_left.dtypes[0]) - else: - _left = MultiIndex.from_frame(_left) - if not isinstance(_right, Index): - if len(_right.columns) == 1: - _right = Index(_right.values.flatten(), dtype=_right.dtypes[0]) - else: - _right = MultiIndex.from_frame(_right) + for _side in [_left, _right]: + if not isinstance(_side, Index): + if len(_side.columns) == 1: + _side = Index(_side.values.flatten(), dtype=_side.dtypes[0]) + else: + _side = MultiIndex.from_frame(_side) how_dict: dict[str, str] = { "anti_left": "left", From 417ea1374cd64fa2f2bdcd5aedd820bf0ad8b3ba Mon Sep 17 00:00:00 2001 From: Shoham Debnath Date: Sun, 26 Sep 2021 16:35:45 +0530 Subject: [PATCH 20/31] Delete out.csv --- folder/subfolder/out.csv | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 folder/subfolder/out.csv diff --git a/folder/subfolder/out.csv b/folder/subfolder/out.csv deleted file mode 100644 index 98e237e417a30..0000000000000 --- a/folder/subfolder/out.csv +++ /dev/null @@ -1,3 +0,0 @@ -,name,mask,weapon -0,Raphael,red,sai -1,Donatello,purple,bo staff From 09426c6391f5174c71531182cf590adaeed582fc Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sun, 26 Sep 2021 21:12:32 +0530 Subject: [PATCH 21/31] Revert "Update pandas/core/reshape/merge.py" This reverts commit 411bcaa34dd75400ef41f07cd86ccd42e738d9d8. --- pandas/core/reshape/merge.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 18c5be9c63432..90a9ac0d67b86 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1522,12 +1522,16 @@ def _anti_helper( # If not Index. Convert the columns into Index or # MultiIndex as required - for _side in [_left, _right]: - if not isinstance(_side, Index): - if len(_side.columns) == 1: - _side = Index(_side.values.flatten(), dtype=_side.dtypes[0]) - else: - _side = MultiIndex.from_frame(_side) + if not isinstance(_left, Index): + if len(_left.columns) == 1: + _left = Index(_left.values.flatten(), dtype=_left.dtypes[0]) + else: + _left = MultiIndex.from_frame(_left) + if not isinstance(_right, Index): + if len(_right.columns) == 1: + _right = Index(_right.values.flatten(), dtype=_right.dtypes[0]) + else: + _right = MultiIndex.from_frame(_right) how_dict: dict[str, str] = { "anti_left": "left", From b6e72aaabd8ed57da0b3f6830eab6759a6a738fa Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sun, 26 Sep 2021 21:13:42 +0530 Subject: [PATCH 22/31] removed files added by mistake --- path_to_file.xlsx | Bin 5582 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 path_to_file.xlsx diff --git a/path_to_file.xlsx b/path_to_file.xlsx deleted file mode 100644 index e413ed7339a719e99f6eae3558d8585fe165a08f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5582 zcmZ`-1y~e&_g+9+YGn}tmktG4Is}n!Wa*M-$)%PK0VxTkLs|qxI;50FTv9+9mM%$I zq(Sgouh0LxSN`|g*_nCfdFDOyJM*6No-+rkf^&@$000mG_CiIq@TJr&o?@=5FgrPB zw{o_GBAi{^Am%PE+}@6GwU`IQ*u3PjKYLx9>!sSIEhL9DE33);Xd!V{GS$)nEu!tQ zPGDzh`FSFnyGuo+_rkSfmzfgC0HB*ub&re-o#;|+3oLyAVqfD2o)0Q2nU#!!-x3z2 zt*Kp-#>nR&NWo>^kJdJ;Twv@U3m9=pej*&_ibe%@*$z|a$I)VTCeS~YD+(tm`GQ&I z92NjT@z3R2IwN4e*84eLD!PM*JnU!GiV3@;M?2JS3Z~9UK-4=IZZ-PKPKHnO;@Dbc zr{fzvT4aB&>9}2cnsXM<@Y%SQnTjjIEtT!=PU=$UHb`=m9YI~#^Mz$^2=7@Y(&bw0 z*>i

0V)%PTM$&0Brt;Y*8Qh((g1R(GaBd5Anjgm;y;sCB|*nLF6I(R6FaNADpI> zn;~Osm8MS|eLMPv;;X(SmA}^UKdzx1Tr)nsth%7|V2l<|j4plYy#Kf-?5@o`=*QwY zu4h>l{&`2IdfSKKz=~4(KmDgZlh>}q1pqW@0e~AA|GnW5cUzbv?DvuP*BjU|P>V&$ zQn&1L%D2?V+z|U*o?a$2l^?cKS2A1kwkz^MRpBka(zG5YH<=v@LP8Dln=DRr&}(v=rVED(347MnpX;IlwW2K9d;0-_C%3vz39-M+IxW?K?j$=+*BzMC_VtOg63)=N zQdr`uP)t8<>uM9S_H5R5nC5*G83v@onLy=H91_x4ag~+b+6RYJ$o9ZL+AIwXvTjm} z3Q>zq#b)+$-&iMjSBHzh4iIM`OA4b-A#$oIGrX+YE-N|8j3@%t&=^1bndHtY#Z&&( zor!LMFsr4}6=pNgYJxUJ9{M&r4W{k)VND*lYuXTQ9T`2cL2BWEQu~@31uNEc_h^11 zfdP3V8{V`af6J#{spvApk=b%R^f&C?dt38U-#jxwP4#WUsptz%TDAJ9K`Dy79&f^| zPj28eQLD}=66Pjc^f(t56^Bt@d9Z{=3F02|`29K#0~xojfRufD0>u&5#HT$dG?TC` zWt$Pl!CasRm49Gw%S*|cu-ImzC& z|HdC7M5-iB+52seml3bpFG=^u2Zbl?tn_!>V_%cLkx#w84o!=68lqm}6TCx^TYh4_ z%f0DhZP6+dQ>@5A(67oxSkka}KbwP&RuIWM5-(g8rK${m6lUt$piRCJ#}ng=6+bNN zW~9ow`ORWo7FoGlW%pQ)NUn=*MXmpR%*^7lQ|R*OTYfdB+vmA-GEo*agUK3%CS`GO z64$n#UYGd-*{q7v-60-6c12GcrrvVnjaFW@0ej?gY2kqR=Qe0ZZ-F%BC3j+zuycB$ zx!dAOM}#Y9=Plr>Tic^8AIk_G9f`wlD(XD^G_c{=VyxFQI1xeT5G3m$hUwHfw^J$$6eBQd{kZ6hgZ zS=$w2(U_Y<@)iDFhr>ha#q;(L9W;7#k6vm4CX&{?t?lO0hT)$xpd!-`prt0_2u;U3%0~#% z>iiGVU9Xb$j=5HrYaCygB?R}0;0EwHNWKhHLVee<>vf1et!ZDYaku9rDUVPL(-za0 z$JGgNIND?}ST{YM=_k8s8I(&~Rw$RZ`XkzKvQS1vo_VPRBo+9=@B%arrpf_DaH`cT8*Llx9gz- zTWkf)h?$L7Q*SnD!Y_WJSC-3P`VWA~sDfwq`o4*C z6!@c1v-%QDaeIja08sxHcrRy!gPSc3=I#dh>-3k(hb5w%P(0VeO+O9c$VzlZ8jIki zUzhXZWw5REXw=eVHJrF^s!S*%MMBrB+ufD&eYKu+DIM}==wnpO9;06z8J7?qo=qlD1EneTM+~|rl0FxPp!2(43tti7dnxAagA@Ms20a#FNMWh4 zVozw&kDG)W(*B{zGemDr#nV>}D8CY)C3uC_;|eHbe;)RsvGGZeH#mz)G0nR{-Vl|p}=#{*jZ|nS&MO5On6OfLX5hj?Cb8JaZ*?KMGGRo({z*xwr5nR6B2Ev(tcUa`=3Y~D zRk}hnOdq{m+lnf-;*Cw!8lzR|vAT0w!J3bC(yfc&e7Ux7n_{Dp30YT}6K%qU()$Rh ze6`D5Ric4yh}HktB)#wtxZw#AyUAf2!JK(di)wNtD+efWAu?|iVtYVccc9vKhJD}p zdArJzf|IZJ*uy`WrF)!$dmY1S4+#K(`Ty$8+}wTOFt=ZH&gvOCpm-=RIXiaKG8zP3 zBXEqAH_UE@9hm!RTZmF9rm*+1ZdByNtcav;)@)JJs^;q7pFbTi7RnN`hD;**UNVZ^ zDQ1hULyjhjIE@>z+3ib>l~yYj)#M|=hip#vpKZSYl%l-dJ(7_L-|wcYmFUhXjl|X3 zb0#X}naKWFc8gc|cyOO-ZsswFE4BqCsLzF0U%zr0P_3_;(#JsxbRLNGYsAwejS7ad zYd%WH^LD-22{$a>q*ej?JQY-BWmeZ-`I1T0GEAq>n*4aX+51{>El~M}WwhA?5~*Qv zKks#QXwE>)O7ieP{V^K+uw-C8XV1sJ3}*(XxIIi{>pFq;NIK0&cMlt~nB`Ci!|)FM zaV?2s-bR4Phsc;~$CZzCm2ax%fM%nuwc3>}3DceH&;rHb+sVAj(`xM-@;alRT+A~0 zB56m(`HX~ZTQq9~H{-ZCJjUloJq)=+g>}jX7<+|1ny1CwB4wnyAR*Bq=9wSXh3beM z*gAnVEy+(f!-ZCieGpmY3w$~k<3|-JU;azLk$RaAiru;*u#c;m`aEeEs{&GnW}_y!$_l`UTqKW;IfZVf1dn1JA2Q9fRy8ObPM zW0gtcD)%>B#8+MFb`dZ!bC7vJB>_*2jd{Rjp+C}A0{qc}7C)Z1_f{U=AQ7Y6@y8#b zqG`s9s_(dzhAB(d{lU-8V#e;NEkfWV(InI_>Cp!*hRd@^_LW1?Hs7=9P_wk9#0Xf_ z1WlaFY*0LG(aAC(=&1)*#%(nH5e`3SjfK!16iw3DE|LJF#(qbg{Y5~oII#mVL%*HE zqLdOw(bhr1LKxoW*h#^K9f2j2(D?)((xR}pC63AB?a#DHjcTN8!=%d%Yyf~5 zb2df5;BFA^U&pM3_f8*qu1f}f8p`IuA7YH)hwV{aJH^s{O*RR{jgrf1t(Y!wjU&G0 z^FSOp1Gbag5y`KME~365*R(SnYrs;hV~w>Nh?g!OMeDivBzt$alPdT~KaSnK{{r#(QM-(sM!S zOD>Z>Rk~cy$m`n$fw#Mk;zl+qESYM2jn+?HZMBJFW-M8syIRy-3I_L(Z3h7UbQI!= z!Z*h#|18E)vfujftF&7n_hZ-Ce8Jj4`9!`aepRH_QML-0Z>05n%BV@ zk;%%1NVd~ME{)kPA&m5yVFU6PX80+UC+5zL;=BI4!ETq5_f#|9xo;mE;|Zo$z9qT9 zCn|1=XB~k!<9fg4uo>anr>B^2b(Kzh$F)~2yTw5VWcnVAzINGUB4`EH08X;9dsn|H z(xCsq0+#icVS>`y$NF3U#GXxJrK%K=%ahbnpAFu|w%0S?(a(P$oEaAJG_u0s%RXby zW_VpzF8yRJ!Re)F--TMrK`NJ>nIHfNeT}!bbS$!ae#|W-w5i;nbwB(0n@bIvJM>M* z#W^H5adsEIjQp|xWaGs#mthTtg-8q=G{4#S9eMxc3VGhtXi3HcJ*pR~E-;F0^3`$K$rhcM%=A5SFxTTs3At2ebQNE5Q8?(nHOz+2l& zJ$F8;XQyxY%Jsw*^g?vx3hhHV2+9|ZuBRdW-gL7^x00*r)Z)&m?QOd*K+L>vO=yI4 zYp(Q;bw`G3I9mp<5~%YVM3t)M_w_C!k->0O(N0?MYnilkW z*zsuayF16CoDwDTEbanR71!9gu*75`D&FLg)7-F|U)Ui{+vC%Sjd_1QPo+O#GH)n z&$Hs=68Gs7ep8a&p0yiL6)bE@oc}#F!^F>DzaY#@{{J(ytLUpktlwAwAPB4CpXmP^ zX Date: Sun, 26 Sep 2021 21:23:17 +0530 Subject: [PATCH 23/31] more comments on tests --- pandas/tests/reshape/merge/test_merge_anti.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/reshape/merge/test_merge_anti.py b/pandas/tests/reshape/merge/test_merge_anti.py index 4f573a9728f2e..3ca4887706965 100644 --- a/pandas/tests/reshape/merge/test_merge_anti.py +++ b/pandas/tests/reshape/merge/test_merge_anti.py @@ -302,6 +302,7 @@ def test_anti_multicol(self, expected, how, on): ], ) def test_anti_with_nan(self, expected, how, on, left_on, right_on): + # basic anti_joins with mixed dtypes left = DataFrame({"A": [np.nan, 2, np.nan], "B": ["a", 2, 3]}) right = DataFrame({"C": [1, 3, np.nan], "B": ["a", 2, "c"]}) result = merge(left, right, on=on, how=how, left_on=left_on, right_on=right_on) @@ -383,6 +384,7 @@ def test_anti_with_nan(self, expected, how, on, left_on, right_on): ], ) def test_anti_with_nan_and_NA(self, expected, how, left_on, right_on): + # test to check np.nan isn't matched with pd.NA left = DataFrame({"A": [np.nan, 2, pd.NA], "B": ["a", 2, 3]}) right = DataFrame({"C": [1, 3, np.nan], "B": [pd.NA, 2, "c"]}) result = merge(left, right, how=how, left_on=left_on, right_on=right_on) From bf76fda8d54abe891861246e02faaf245a764396 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 26 Oct 2022 02:38:57 +0530 Subject: [PATCH 24/31] attempt 2 --- pandas/core/reshape/merge.py | 27 +++++++++++++++++-- pandas/tests/reshape/merge/test_merge_anti.py | 3 +-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index fa938361fdf8b..3ec2348cd5266 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -690,8 +690,8 @@ def __init__( cross_col, ) = self._create_cross_configuration(self.left, self.right) self.left_on = self.right_on = [cross_col] - elif self.how in ["anti_left", "anti_right", "anti_full"]: - self.left, self.right, self.how = self._anti_join_update() +# elif self.how in ["anti_left", "anti_right", "anti_full"]: +# self.left, self.right, self.how = self._anti_join_update() self._cross = cross_col # note this function has side effects @@ -991,6 +991,29 @@ def _get_join_info( left_ax = self.left.axes[self.axis] right_ax = self.right.axes[self.axis] + if self.how in ["anti_left", "anti_right", "anti_full"]: + how_dict: dict[str, str] = { + "anti_left": "left", + "anti_right": "right", + "anti_full": "outer", + } + self.how = how_dict[self.how] + if self.left_index and self.right_index: + # Merge using `right_index` and `left_index` + left_ax = self.left.index[~self.left.index.isin(self.right.index)] + right_ax = self.right.index[~self.right.index.isin(self.left.index)] + self.left = self.left.loc[left_ax] + self.right = self.right.loc[right_ax] + elif self.on is not None or (self.left_on is not None and self.right_on is not None): + # Merge using `on` or `left_on` and `right_on` + _left = [~np.isin(self.left_join_keys[x], self.right_join_keys[x]) for x in range(len(self.left_join_keys))] + _right = [~np.isin(self.right_join_keys[x], self.left_join_keys[x]) for x in range(len(self.left_join_keys))] + self.left = self.left[np.sum(np.stack(_left, axis=0), axis=0)>0] + self.right = self.right[np.sum(np.stack(_right, axis=0), axis=0)>0] + + self.left_join_keys = [x[np.sum(np.stack(_left, axis=0), axis=0)>0] for x in self.left_join_keys] + self.right_join_keys = [x[np.sum(np.stack(_right, axis=0), axis=0)>0] for x in self.right_join_keys] + if self.left_index and self.right_index and self.how != "asof": join_index, left_indexer, right_indexer = left_ax.join( right_ax, how=self.how, return_indexers=True, sort=self.sort diff --git a/pandas/tests/reshape/merge/test_merge_anti.py b/pandas/tests/reshape/merge/test_merge_anti.py index 3ca4887706965..b2527bacd2b50 100644 --- a/pandas/tests/reshape/merge/test_merge_anti.py +++ b/pandas/tests/reshape/merge/test_merge_anti.py @@ -145,8 +145,7 @@ def test_basic_anti_lefton_righton(self, expected, how, left_on, right_on): "B_y": [9], "C_y": [7], "D": ["d"], - }, - index=[2], + } ), "anti_right", ), From 93150b73a6a5bda723e26c1356fe5d106a841837 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 26 Oct 2022 13:35:52 +0530 Subject: [PATCH 25/31] fixed broken tests --- pandas/core/reshape/merge.py | 4 ++-- pandas/tests/reshape/merge/test_merge_anti.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index eb460e27615b8..cb54117ead48d 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1072,8 +1072,8 @@ def _get_join_info( self.right = self.right.loc[right_ax] elif self.on is not None or (self.left_on is not None and self.right_on is not None): # Merge using `on` or `left_on` and `right_on` - _left = [~np.isin(self.left_join_keys[x], self.right_join_keys[x]) for x in range(len(self.left_join_keys))] - _right = [~np.isin(self.right_join_keys[x], self.left_join_keys[x]) for x in range(len(self.left_join_keys))] + _left = [~Index(self.left_join_keys[x]).isin(Index(self.right_join_keys[x])) for x in range(len(self.left_join_keys))] + _right = [~Index(self.right_join_keys[x]).isin(Index(self.left_join_keys[x]))for x in range(len(self.left_join_keys))] self.left = self.left[np.sum(np.stack(_left, axis=0), axis=0)>0] self.right = self.right[np.sum(np.stack(_right, axis=0), axis=0)>0] diff --git a/pandas/tests/reshape/merge/test_merge_anti.py b/pandas/tests/reshape/merge/test_merge_anti.py index b2527bacd2b50..9e71b71ab6590 100644 --- a/pandas/tests/reshape/merge/test_merge_anti.py +++ b/pandas/tests/reshape/merge/test_merge_anti.py @@ -396,7 +396,7 @@ def test_anti_with_nan_and_NA(self, expected, how, left_on, right_on): "anti_left", DataFrame( {"vals_x": [20, 17], "vals_y": [np.nan] * 2}, - index=pd.date_range("1/2/2010", periods=2, freq="2d"), + index=pd.date_range(start="1/2/2010", end="1/4/2010", periods=2), ), ), ( From f63c903d52f2ed53abc5c9c5463a2460cab888c7 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 26 Oct 2022 13:37:47 +0530 Subject: [PATCH 26/31] removed prev implementation --- pandas/core/reshape/merge.py | 96 ------------------------------------ 1 file changed, 96 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index cb54117ead48d..6c293a19f08fd 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -698,8 +698,6 @@ def __init__( cross_col, ) = self._create_cross_configuration(self.left, self.right) self.left_on = self.right_on = [cross_col] -# elif self.how in ["anti_left", "anti_right", "anti_full"]: -# self.left, self.right, self.how = self._anti_join_update() self._cross = cross_col # note this function has side effects @@ -795,46 +793,6 @@ def get_result(self, copy: bool = True) -> DataFrame: return result.__finalize__(self, method="merge") - def _anti_join_update(self): - """ - Converts `anti_left`, `anti_right` and `anti_full` configurations into `left`, - `right` and `outer` join configurations. - Calls `_anti_helper` with the indices or columns to be merged on. - """ - if self.left_index and self.right_index: - # Merge using `right_index` and `left_index` - join_index_l, join_index_r, self.how = _anti_helper( - self.left.index, self.right.index, self.how - ) - elif self.on is not None or ( - None not in self.left_on and None not in self.right_on - ): - # Merge using `on` or `left_on` and `right_on` - if self.on is not None: - left_on = right_on = self.on - else: - left_on = self.left_on - right_on = self.right_on - join_index_l, join_index_r, self.how = _anti_helper( - self.left[left_on], self.right[right_on], self.how - ) - elif self.left_index and self.right_on is not None: - # Merge using `left_index` and `right_on` - join_index_l, join_index_r, self.how = _anti_helper( - self.left.index, self.right[self.right_on], self.how - ) - elif self.right_index and self.left_on is not None: - # Merge using `left_on` and `right_index` - join_index_l, join_index_r, self.how = _anti_helper( - self.left[self.left_on], self.right.index, self.how - ) - self.left = self.left.loc[join_index_l] - self.right = self.right.loc[join_index_r] - - # sanity check to ensure correct `how` - assert self.how in ["left", "right", "inner", "outer"] - return (self.left, self.right, self.how) - def _maybe_drop_cross_column( self, result: DataFrame, cross_col: str | None ) -> None: @@ -1637,60 +1595,6 @@ def _validate(self, validate: str) -> None: else: raise ValueError("Not a valid argument for validate") - -def _anti_helper( - _left: Index | DataFrame, - _right: Index | DataFrame, - _how: str, -) -> tuple[npt.NDArray, npt.NDArray, str]: - """ - Converts `anti_left`, `anti_right` and `anti_full` configurations into `left`, - `right` and `outer` join configurations - - Parameters - ---------- - _left : DataFrame, Index - left frame with columns if merged with `on` or `left/right_on`, else Index - _right : DataFrame, Index - right frame with columns if merged with `on` or `left/right_on`, else Index - _how : {'anti_left', 'anti_right', 'anti_full'} - - Returns - ------- - np.ndarray[bool] - Indexer of left_keys - np.ndarray[bool] - Indexer of right_keys - {"left", "right", "outer"} - Native join configurations - - """ - - # If not Index. Convert the columns into Index or - # MultiIndex as required - if not isinstance(_left, Index): - if len(_left.columns) == 1: - _left = Index(_left.values.flatten(), dtype=_left.dtypes[0]) - else: - _left = MultiIndex.from_frame(_left) - if not isinstance(_right, Index): - if len(_right.columns) == 1: - _right = Index(_right.values.flatten(), dtype=_right.dtypes[0]) - else: - _right = MultiIndex.from_frame(_right) - - how_dict: dict[str, str] = { - "anti_left": "left", - "anti_right": "right", - "anti_full": "outer", - } - _how = how_dict[_how] - - join_index_l = ~_left.isin(_right) - join_index_r = ~_right.isin(_left) - return (join_index_l, join_index_r, _how) - - def get_join_indexers( left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: From 9a00186b4c513e583c3f027111d174f5a17b7204 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 26 Oct 2022 13:39:57 +0530 Subject: [PATCH 27/31] fix lint --- pandas/core/reshape/merge.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6c293a19f08fd..56533589fef00 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1028,15 +1028,29 @@ def _get_join_info( right_ax = self.right.index[~self.right.index.isin(self.left.index)] self.left = self.left.loc[left_ax] self.right = self.right.loc[right_ax] - elif self.on is not None or (self.left_on is not None and self.right_on is not None): + elif self.on is not None or ( + self.left_on is not None and self.right_on is not None + ): # Merge using `on` or `left_on` and `right_on` - _left = [~Index(self.left_join_keys[x]).isin(Index(self.right_join_keys[x])) for x in range(len(self.left_join_keys))] - _right = [~Index(self.right_join_keys[x]).isin(Index(self.left_join_keys[x]))for x in range(len(self.left_join_keys))] - self.left = self.left[np.sum(np.stack(_left, axis=0), axis=0)>0] - self.right = self.right[np.sum(np.stack(_right, axis=0), axis=0)>0] + _left = [ + ~Index(self.left_join_keys[x]).isin(Index(self.right_join_keys[x])) + for x in range(len(self.left_join_keys)) + ] + _right = [ + ~Index(self.right_join_keys[x]).isin(Index(self.left_join_keys[x])) + for x in range(len(self.left_join_keys)) + ] + self.left = self.left[np.sum(np.stack(_left, axis=0), axis=0) > 0] + self.right = self.right[np.sum(np.stack(_right, axis=0), axis=0) > 0] - self.left_join_keys = [x[np.sum(np.stack(_left, axis=0), axis=0)>0] for x in self.left_join_keys] - self.right_join_keys = [x[np.sum(np.stack(_right, axis=0), axis=0)>0] for x in self.right_join_keys] + self.left_join_keys = [ + x[np.sum(np.stack(_left, axis=0), axis=0) > 0] + for x in self.left_join_keys + ] + self.right_join_keys = [ + x[np.sum(np.stack(_right, axis=0), axis=0) > 0] + for x in self.right_join_keys + ] if self.left_index and self.right_index and self.how != "asof": join_index, left_indexer, right_indexer = left_ax.join( @@ -1595,6 +1609,7 @@ def _validate(self, validate: str) -> None: else: raise ValueError("Not a valid argument for validate") + def get_join_indexers( left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: From e021a0c32db594bbaa0ca00aefb43c0c45b07c00 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 26 Oct 2022 13:50:22 +0530 Subject: [PATCH 28/31] added whatsnew --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 0f8afe14a2369..de289e5c89618 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -49,6 +49,7 @@ Other enhancements - Fix ``test`` optional_extra by adding missing test package ``pytest-asyncio`` (:issue:`48361`) - :func:`DataFrame.astype` exception message thrown improved to include column name when type conversion is not possible. (:issue:`47571`) - :meth:`DataFrame.to_json` now supports a ``mode`` keyword with supported inputs 'w' and 'a'. Defaulting to 'w', 'a' can be used when lines=True and orient='records' to append record oriented json lines to an existing json file. (:issue:`35849`) +- :meth:`DataFrame.join` now supports ``how`` with ``anti_left``, ``anti_right`` and ``anti_full`` (:issue:`42916`) .. --------------------------------------------------------------------------- .. _whatsnew_200.notable_bug_fixes: From 378c858a4306fce2de2196323b56235a935cacc9 Mon Sep 17 00:00:00 2001 From: Shoham Debnath Date: Thu, 27 Oct 2022 11:10:37 +0530 Subject: [PATCH 29/31] updated version Co-authored-by: William Ayd --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index aa5cfe1d6344c..3173a80fae3b8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -319,7 +319,7 @@ left frame, and the keys in the left frame that are absent in the right frame; sort keys lexicographically. - .. versionadded:: 1.4.0 + .. versionadded:: 2.0.0 on : label or list Column or index level names to join on. These must be found in both From 645b02dadd75565951bdad76511e197b5b30c12c Mon Sep 17 00:00:00 2001 From: Shoham Debnath Date: Thu, 27 Oct 2022 11:12:13 +0530 Subject: [PATCH 30/31] remove unnecessary typing Co-authored-by: William Ayd --- pandas/core/reshape/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 56533589fef00..b08f62f4eabb3 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1016,7 +1016,7 @@ def _get_join_info( right_ax = self.right.axes[self.axis] if self.how in ["anti_left", "anti_right", "anti_full"]: - how_dict: dict[str, str] = { + how_dict = { "anti_left": "left", "anti_right": "right", "anti_full": "outer", From e2a94238de01044fcce836fd70e837fbfde7f039 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 27 Oct 2022 12:27:02 +0530 Subject: [PATCH 31/31] added asv benchmark for anti_joins --- asv_bench/benchmarks/join_merge.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 2309347ac96d8..d7eb22d53621c 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -292,10 +292,13 @@ def time_merge_dataframe_empty_left(self, sort): def time_merge_dataframes_cross(self, sort): merge(self.left.loc[:2000], self.right.loc[:2000], how="cross", sort=sort) + def time_merge_dataframes_anti(self, sort): + merge(self.left, self.right, how="anti_left", sort=sort) + class I8Merge: - params = ["inner", "outer", "left", "right"] + params = ["inner", "outer", "left", "right", "anti_left", "anti_right", "anti_full"] param_names = ["how"] def setup(self, how): @@ -472,7 +475,7 @@ class MergeMultiIndex: ("datetime64[ns]", "int64"), ("Int64", "Int64"), ], - ["left", "right", "inner", "outer"], + ["left", "right", "inner", "outer", "anti_left", "anti_right", "anti_full"], ] param_names = ["dtypes", "how"]