Skip to content

ENH:included anti join functionality #43056

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 35 commits into from
Closed
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
fba0bb5
ENH:included anti join functionality
debnathshoham Aug 15, 2021
53fd41d
included multicol join
debnathshoham Aug 16, 2021
627c31e
Merge branch 'master' into 42916
debnathshoham Aug 16, 2021
448373b
handling index and col
debnathshoham Aug 17, 2021
4dd802d
added test on nan
debnathshoham Aug 17, 2021
3b17c59
Merge branch 'master' into 42916
debnathshoham Aug 17, 2021
6e3d1a4
removed tests cases with warning
debnathshoham Aug 17, 2021
6427f09
seperated antijoin tests to another file
debnathshoham Aug 18, 2021
86ddac9
replaced np funcs with pd
debnathshoham Aug 18, 2021
84294e4
added test with pd.NA
debnathshoham Aug 18, 2021
43ae0a1
suggested changes
debnathshoham Aug 19, 2021
c36705c
added tests covering Categorcal, EA, datetime,datetime w tz, EA+multicol
debnathshoham Aug 19, 2021
951406a
Update pandas/tests/reshape/merge/test_merge_anti.py
debnathshoham Aug 20, 2021
db80abf
formatted with black
debnathshoham Aug 20, 2021
d93c0ac
changed a few test setup
debnathshoham Aug 20, 2021
90af576
Merge branch 'master' into 42916
debnathshoham Sep 8, 2021
80ce02e
Merge branch 'master' into 42916
debnathshoham Sep 10, 2021
79bbbb9
removed object cast for EA dtypes; xref #43152
debnathshoham Sep 10, 2021
ee7cc16
Merge branch 'master' into 42916
debnathshoham Sep 10, 2021
76cd5c6
Merge branch 'master' into 42916
debnathshoham Sep 19, 2021
d358efc
Merge branch 'master' into 42916
debnathshoham Sep 24, 2021
14d0d4c
more comments
debnathshoham Sep 24, 2021
3fe64f4
added in merge.rst
debnathshoham Sep 24, 2021
fc50027
removed comments from example; failing doctests
debnathshoham Sep 25, 2021
aba9a30
reversed mm dd order in test_anti_datetime_tz to prevent UserWarning
debnathshoham Sep 25, 2021
411bcaa
Update pandas/core/reshape/merge.py
debnathshoham Sep 26, 2021
417ea13
Delete out.csv
debnathshoham Sep 26, 2021
09426c6
Revert "Update pandas/core/reshape/merge.py"
debnathshoham Sep 26, 2021
b6e72aa
removed files added by mistake
debnathshoham Sep 26, 2021
8338a48
more comments on tests
debnathshoham Sep 26, 2021
cc6c8ea
Merge branch 'master' into 42916
debnathshoham Sep 27, 2021
f33fe48
Merge branch 'master' into 42916
debnathshoham Sep 29, 2021
74e172b
Merge branch 'master' into 42916
debnathshoham Oct 4, 2021
594f80a
Merge branch 'master' into 42916
debnathshoham Dec 15, 2021
f395cbb
Merge branch 'main' of https://github.com/pandas-dev/pandas into 42916
debnathshoham Jan 14, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions doc/source/getting_started/comparison/includes/merge.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,12 @@ data does not have to be sorted ahead of time, and different join types are acco

outer_join = df1.merge(df2, on=["key"], how="outer")
outer_join

anti_left_join = df1.merge(df2, on=["key"], how="anti_left")
anti_left_join

anti_right_join = df1.merge(df2, on=["key"], how="anti_right")
anti_right_join

anti_full_join = df1.merge(df2, on=["key"], how="anti_full")
anti_full_join
3 changes: 3 additions & 0 deletions folder/subfolder/out.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
,name,mask,weapon
0,Raphael,red,sai
1,Donatello,purple,bo staff
39 changes: 38 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,8 @@
----------%s
right : DataFrame or named Series
Object to merge with.
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
how : {'left', 'right', 'outer', 'inner', 'cross', 'anti_left', \
'anti_right', 'anti_full'}, default 'inner'
Type of merge to be performed.

* left: use only keys from left frame, similar to a SQL left outer join;
Expand All @@ -290,6 +291,15 @@
of the left keys.

.. versionadded:: 1.2.0
* anti_left: use only keys from left frame that are absent in right
frame; preserve key order.
* anti_right: use keys from the right frame that are absent in the
left frame; preserve key order.
* anti_full: use keys from the right frame that are absent in the
left frame, and the keys in the left frame that are absent in the
right frame; sort keys lexicographically.

.. versionadded:: 1.4.0

on : label or list
Column or index level names to join on. These must be found in both
Expand Down Expand Up @@ -449,6 +459,33 @@
1 foo 8
2 bar 7
3 bar 8

>>> df1 = pd.DataFrame({"A": [1, 2, 3], "C": [5, 6, 7]})
>>> df2 = pd.DataFrame({"B": [1, 2, 4], "C": [7, 8, 9]})
>>> df1
A C
0 1 5
1 2 6
2 3 7
>>> df2
B C
0 1 7
1 2 8
2 4 9
>>> df1.merge(df2, on="C", how="anti_left")
A C B
0 1 5 NaN
1 2 6 NaN
>>> df1.merge(df2, on="C", how="anti_right")
A C B
0 NaN 8 2
1 NaN 9 4
>>> df1.merge(df2, on="C", how="anti_full")
A C B
0 1.0 5 NaN
1 2.0 6 NaN
2 NaN 8 2.0
3 NaN 9 4.0
"""


Expand Down
95 changes: 95 additions & 0 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,8 @@ def __init__(
cross_col,
) = self._create_cross_configuration(self.left, self.right)
self.left_on = self.right_on = [cross_col]
elif self.how in ["anti_left", "anti_right", "anti_full"]:
self.left, self.right, self.how = self._anti_join_update()
self._cross = cross_col

# note this function has side effects
Expand Down Expand Up @@ -743,6 +745,46 @@ def get_result(self) -> DataFrame:

return result.__finalize__(self, method="merge")

def _anti_join_update(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok with this being a method on _MergeOperation but it should have NO side effects, e.g. simply return left, right and assign at this level.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure if I understand what you mean by, being a method on _MergeOperation.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this needs to execute in the get_result() portion, NOT during validation. validation shouldn't do the actual computation.

IOW during _get_join_info could dispatch based on how.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the actual calculation is still happening in get_result().
I have just changed the configuration of left, right and how to utilise the already existing joining methods. E.g. For anti_left I have changed the left, right and how to left such that left join would give the result

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hi @jreback - wondering if you got a chance to look at this? and if this implementation is fine?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no the implementation is quite convoluted

i am not sure how to fix what u have here

"""
Converts `anti_left`, `anti_right` and `anti_full` configurations into `left`,
`right` and `outer` join configurations.
Calls `_anti_helper` with the indices or columns to be merged on.
"""
if self.left_index and self.right_index:
# Merge using `right_index` and `left_index`
join_index_l, join_index_r, self.how = _anti_helper(
self.left.index, self.right.index, self.how
)
elif self.on is not None or (
None not in self.left_on and None not in self.right_on
):
# Merge using `on` or `left_on` and `right_on`
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to be honest these comments are obvious, as you said. it was more the general concept that you could have expanded in the main docstring, why you have implemented it this way and what is the advantage. Just in case you come back to look at it after two years, or another developer comes in and tries to figure out what you are doing.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the comments in merge.py look sufficient to me. But again, I wrote it and a second pair of eyes can give a fresh perspective.
Do you think the logic in merge.py is too difficult to follow? (if you could point to any specific portion)

if self.on is not None:
left_on = right_on = self.on
else:
left_on = self.left_on
right_on = self.right_on
join_index_l, join_index_r, self.how = _anti_helper(
self.left[left_on], self.right[right_on], self.how
)
elif self.left_index and self.right_on is not None:
# Merge using `left_index` and `right_on`
join_index_l, join_index_r, self.how = _anti_helper(
self.left.index, self.right[self.right_on], self.how
)
elif self.right_index and self.left_on is not None:
# Merge using `left_on` and `right_index`
join_index_l, join_index_r, self.how = _anti_helper(
self.left[self.left_on], self.right.index, self.how
)
self.left = self.left.loc[join_index_l]
self.right = self.right.loc[join_index_r]

# sanity check to ensure correct `how`
assert self.how in ["left", "right", "inner", "outer"]
return (self.left, self.right, self.how)

def _maybe_drop_cross_column(
self, result: DataFrame, cross_col: str | None
) -> None:
Expand Down Expand Up @@ -1450,6 +1492,59 @@ def _validate(self, validate: str) -> None:
raise ValueError("Not a valid argument for validate")


def _anti_helper(
_left: Index | DataFrame,
_right: Index | DataFrame,
_how: str,
) -> tuple[npt.NDArray, npt.NDArray, str]:
"""
Converts `anti_left`, `anti_right` and `anti_full` configurations into `left`,
`right` and `outer` join configurations

Parameters
----------
_left : DataFrame, Index
left frame with columns if merged with `on` or `left/right_on`, else Index
_right : DataFrame, Index
right frame with columns if merged with `on` or `left/right_on`, else Index
_how : {'anti_left', 'anti_right', 'anti_full'}

Returns
-------
np.ndarray[bool]
Indexer of left_keys
np.ndarray[bool]
Indexer of right_keys
{"left", "right", "outer"}
Native join configurations

"""

# If not Index. Convert the columns into Index or
# MultiIndex as required
if not isinstance(_left, Index):
if len(_left.columns) == 1:
_left = Index(_left.values.flatten(), dtype=_left.dtypes[0])
else:
_left = MultiIndex.from_frame(_left)
if not isinstance(_right, Index):
if len(_right.columns) == 1:
_right = Index(_right.values.flatten(), dtype=_right.dtypes[0])
else:
_right = MultiIndex.from_frame(_right)

how_dict: dict[str, str] = {
"anti_left": "left",
"anti_right": "right",
"anti_full": "outer",
}
_how = how_dict[_how]

join_index_l = ~_left.isin(_right)
join_index_r = ~_right.isin(_left)
return (join_index_l, join_index_r, _how)


def get_join_indexers(
left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
Expand Down
Loading