Skip to content

Commit 2cc9b21

Browse files
authored
ENH: Support 'left_anti' and 'right_anti' joins in pd.merge (#60732)
* ENH: Support 'left_anti' and 'right_anti' joins in pd.merge * Fix mypy errors * Fix another mypy error * Restructure a bit * Fix mypy typing error * Fix test * Fix arrow string test * Fix future string test * Retry fix * Address review comment
1 parent 0e245de commit 2cc9b21

File tree

7 files changed

+400
-16
lines changed

7 files changed

+400
-16
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ Other enhancements
3535
- :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
3636
- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
3737
- :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)
38+
- :func:`pandas.merge`, :meth:`DataFrame.merge` and :meth:`DataFrame.join` now support anti joins (``left_anti`` and ``right_anti``) in the ``how`` parameter (:issue:`42916`)
3839
- :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`)
3940
- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
4041
- :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`).

pandas/_typing.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -442,7 +442,9 @@ def closed(self) -> bool:
442442
AnyAll = Literal["any", "all"]
443443

444444
# merge
445-
MergeHow = Literal["left", "right", "inner", "outer", "cross"]
445+
MergeHow = Literal[
446+
"left", "right", "inner", "outer", "cross", "left_anti", "right_anti"
447+
]
446448
MergeValidate = Literal[
447449
"one_to_one",
448450
"1:1",

pandas/core/frame.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,8 @@
315315
----------%s
316316
right : DataFrame or named Series
317317
Object to merge with.
318-
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
318+
how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'},
319+
default 'inner'
319320
Type of merge to be performed.
320321
321322
* left: use only keys from left frame, similar to a SQL left outer join;
@@ -328,6 +329,10 @@
328329
join; preserve the order of the left keys.
329330
* cross: creates the cartesian product from both frames, preserves the order
330331
of the left keys.
332+
* left_anti: use only keys from left frame that are not in right frame, similar
333+
to SQL left anti join; preserve key order.
334+
* right_anti: use only keys from right frame that are not in left frame, similar
335+
to SQL right anti join; preserve key order.
331336
on : label or list
332337
Column or index level names to join on. These must be found in both
333338
DataFrames. If `on` is None and not merging on indexes then this defaults
@@ -10613,7 +10618,8 @@ def join(
1061310618
values given, the `other` DataFrame must have a MultiIndex. Can
1061410619
pass an array as the join key if it is not already contained in
1061510620
the calling DataFrame. Like an Excel VLOOKUP operation.
10616-
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left'
10621+
how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'},
10622+
default 'left'
1061710623
How to handle the operation of the two objects.
1061810624
1061910625
* left: use calling frame's index (or column if on is specified)
@@ -10625,6 +10631,10 @@ def join(
1062510631
of the calling's one.
1062610632
* cross: creates the cartesian product from both frames, preserves the order
1062710633
of the left keys.
10634+
* left_anti: use set difference of calling frame's index and `other`'s
10635+
index.
10636+
* right_anti: use set difference of `other`'s index and calling frame's
10637+
index.
1062810638
lsuffix : str, default ''
1062910639
Suffix to use from left frame's overlapping columns.
1063010640
rsuffix : str, default ''

pandas/core/reshape/merge.py

+86-11
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,8 @@ def merge(
180180
First pandas object to merge.
181181
right : DataFrame or named Series
182182
Second pandas object to merge.
183-
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
183+
how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'},
184+
default 'inner'
184185
Type of merge to be performed.
185186
186187
* left: use only keys from left frame, similar to a SQL left outer join;
@@ -193,6 +194,10 @@ def merge(
193194
join; preserve the order of the left keys.
194195
* cross: creates the cartesian product from both frames, preserves the order
195196
of the left keys.
197+
* left_anti: use only keys from left frame that are not in right frame, similar
198+
to SQL left anti join; preserve key order.
199+
* right_anti: use only keys from right frame that are not in left frame, similar
200+
to SQL right anti join; preserve key order.
196201
on : label or list
197202
Column or index level names to join on. These must be found in both
198203
DataFrames. If `on` is None and not merging on indexes then this defaults
@@ -953,7 +958,7 @@ def __init__(
953958
self,
954959
left: DataFrame | Series,
955960
right: DataFrame | Series,
956-
how: JoinHow | Literal["asof"] = "inner",
961+
how: JoinHow | Literal["left_anti", "right_anti", "asof"] = "inner",
957962
on: IndexLabel | AnyArrayLike | None = None,
958963
left_on: IndexLabel | AnyArrayLike | None = None,
959964
right_on: IndexLabel | AnyArrayLike | None = None,
@@ -968,7 +973,7 @@ def __init__(
968973
_right = _validate_operand(right)
969974
self.left = self.orig_left = _left
970975
self.right = self.orig_right = _right
971-
self.how = how
976+
self.how, self.anti_join = self._validate_how(how)
972977

973978
self.on = com.maybe_make_list(on)
974979

@@ -998,14 +1003,6 @@ def __init__(
9981003
)
9991004
raise MergeError(msg)
10001005

1001-
# GH 59435: raise when "how" is not a valid Merge type
1002-
merge_type = {"left", "right", "inner", "outer", "cross", "asof"}
1003-
if how not in merge_type:
1004-
raise ValueError(
1005-
f"'{how}' is not a valid Merge type: "
1006-
f"left, right, inner, outer, cross, asof"
1007-
)
1008-
10091006
self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)
10101007

10111008
(
@@ -1035,6 +1032,37 @@ def __init__(
10351032
if validate is not None:
10361033
self._validate_validate_kwd(validate)
10371034

1035+
@final
1036+
def _validate_how(
1037+
self, how: JoinHow | Literal["left_anti", "right_anti", "asof"]
1038+
) -> tuple[JoinHow | Literal["asof"], bool]:
1039+
"""
1040+
Validate the 'how' parameter and return the actual join type and whether
1041+
this is an anti join.
1042+
"""
1043+
# GH 59435: raise when "how" is not a valid Merge type
1044+
merge_type = {
1045+
"left",
1046+
"right",
1047+
"inner",
1048+
"outer",
1049+
"left_anti",
1050+
"right_anti",
1051+
"cross",
1052+
"asof",
1053+
}
1054+
if how not in merge_type:
1055+
raise ValueError(
1056+
f"'{how}' is not a valid Merge type: "
1057+
f"left, right, inner, outer, left_anti, right_anti, cross, asof"
1058+
)
1059+
anti_join = False
1060+
if how in {"left_anti", "right_anti"}:
1061+
how = how.split("_")[0] # type: ignore[assignment]
1062+
anti_join = True
1063+
how = cast(JoinHow | Literal["asof"], how)
1064+
return how, anti_join
1065+
10381066
def _maybe_require_matching_dtypes(
10391067
self, left_join_keys: list[ArrayLike], right_join_keys: list[ArrayLike]
10401068
) -> None:
@@ -1405,6 +1433,11 @@ def _get_join_info(
14051433
n = len(left_ax) if left_indexer is None else len(left_indexer)
14061434
join_index = default_index(n)
14071435

1436+
if self.anti_join:
1437+
join_index, left_indexer, right_indexer = self._handle_anti_join(
1438+
join_index, left_indexer, right_indexer
1439+
)
1440+
14081441
return join_index, left_indexer, right_indexer
14091442

14101443
@final
@@ -1447,6 +1480,48 @@ def _create_join_index(
14471480
return index.copy()
14481481
return index.take(indexer)
14491482

1483+
@final
1484+
def _handle_anti_join(
1485+
self,
1486+
join_index: Index,
1487+
left_indexer: npt.NDArray[np.intp] | None,
1488+
right_indexer: npt.NDArray[np.intp] | None,
1489+
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
1490+
"""
1491+
Handle anti join by returning the correct join index and indexers
1492+
1493+
Parameters
1494+
----------
1495+
join_index : Index
1496+
join index
1497+
left_indexer : np.ndarray[np.intp] or None
1498+
left indexer
1499+
right_indexer : np.ndarray[np.intp] or None
1500+
right indexer
1501+
1502+
Returns
1503+
-------
1504+
Index, np.ndarray[np.intp] or None, np.ndarray[np.intp] or None
1505+
"""
1506+
# Make sure indexers are not None
1507+
if left_indexer is None:
1508+
left_indexer = np.arange(len(self.left))
1509+
if right_indexer is None:
1510+
right_indexer = np.arange(len(self.right))
1511+
1512+
assert self.how in {"left", "right"}
1513+
if self.how == "left":
1514+
# Filter to rows where left keys are not in right keys
1515+
filt = right_indexer == -1
1516+
else:
1517+
# Filter to rows where right keys are not in left keys
1518+
filt = left_indexer == -1
1519+
join_index = join_index[filt]
1520+
left_indexer = left_indexer[filt]
1521+
right_indexer = right_indexer[filt]
1522+
1523+
return join_index, left_indexer, right_indexer
1524+
14501525
@final
14511526
def _get_merge_keys(
14521527
self,

pandas/tests/frame/methods/test_join.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,20 @@ def test_join_index(float_frame):
277277
tm.assert_index_equal(joined.index, float_frame.index.sort_values())
278278
tm.assert_index_equal(joined.columns, expected_columns)
279279

280-
join_msg = "'foo' is not a valid Merge type: left, right, inner, outer, cross, asof"
280+
# left anti
281+
joined = f.join(f2, how="left_anti")
282+
tm.assert_index_equal(joined.index, float_frame.index[:5])
283+
tm.assert_index_equal(joined.columns, expected_columns)
284+
285+
# right anti
286+
joined = f.join(f2, how="right_anti")
287+
tm.assert_index_equal(joined.index, float_frame.index[10:][::-1])
288+
tm.assert_index_equal(joined.columns, expected_columns)
289+
290+
join_msg = (
291+
"'foo' is not a valid Merge type: left, right, inner, outer, "
292+
"left_anti, right_anti, cross, asof"
293+
)
281294
with pytest.raises(ValueError, match=re.escape(join_msg)):
282295
f.join(f2, how="foo")
283296

pandas/tests/reshape/merge/test_merge.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1464,7 +1464,10 @@ def test_merge_how_validation(self):
14641464
data2 = DataFrame(
14651465
np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"]
14661466
)
1467-
msg = "'full' is not a valid Merge type: left, right, inner, outer, cross, asof"
1467+
msg = (
1468+
"'full' is not a valid Merge type: left, right, inner, outer, "
1469+
"left_anti, right_anti, cross, asof"
1470+
)
14681471
with pytest.raises(ValueError, match=re.escape(msg)):
14691472
data1.merge(data2, how="full")
14701473

0 commit comments

Comments
 (0)