Skip to content

REF: separate out cross-merge, make less stateful #53810

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 23, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 75 additions & 65 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,79 @@ def merge(
indicator: str | bool = False,
validate: str | None = None,
) -> DataFrame:
if how == "cross":
return _cross_merge(
left,
right,
on=on,
left_on=left_on,
right_on=right_on,
left_index=left_index,
right_index=right_index,
sort=sort,
suffixes=suffixes,
indicator=indicator,
validate=validate,
copy=copy,
)
else:
op = _MergeOperation(
left,
right,
how=how,
on=on,
left_on=left_on,
right_on=right_on,
left_index=left_index,
right_index=right_index,
sort=sort,
suffixes=suffixes,
indicator=indicator,
validate=validate,
)
return op.get_result(copy=copy)


def _cross_merge(
left: DataFrame | Series,
right: DataFrame | Series,
on: IndexLabel | None = None,
left_on: IndexLabel | None = None,
right_on: IndexLabel | None = None,
left_index: bool = False,
right_index: bool = False,
sort: bool = False,
suffixes: Suffixes = ("_x", "_y"),
copy: bool | None = None,
indicator: str | bool = False,
validate: str | None = None,
) -> DataFrame:
"""
See merge.__doc__ with how='cross'
"""

cross_col = f"_cross_{uuid.uuid4()}"
left = left.assign(**{cross_col: 1})
right = right.assign(**{cross_col: 1})

if (
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this validation could come before the assign but no big deal

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good idea will update

left_index
or right_index
or right_on is not None
or left_on is not None
or on is not None
):
raise MergeError(
"Can not pass on, right_on, left_on or set right_index=True or "
"left_index=True"
)

left_on = right_on = [cross_col]

op = _MergeOperation(
left,
right,
how=how,
how="inner",
on=on,
left_on=left_on,
right_on=right_on,
Expand All @@ -159,7 +228,9 @@ def merge(
indicator=indicator,
validate=validate,
)
return op.get_result(copy=copy)
res = op.get_result(copy=copy)
del res[cross_col]
return res


def _groupby_and_merge(by, left: DataFrame, right: DataFrame, merge_pieces):
Expand Down Expand Up @@ -706,17 +777,6 @@ def __init__(

self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)

cross_col = None
if self.how == "cross":
(
self.left,
self.right,
self.how,
cross_col,
) = self._create_cross_configuration(self.left, self.right)
self.left_on = self.right_on = [cross_col]
self._cross = cross_col

# note this function has side effects
(
self.left_join_keys,
Expand Down Expand Up @@ -809,17 +869,8 @@ def get_result(self, copy: bool | None = True) -> DataFrame:

self._maybe_restore_index_levels(result)

self._maybe_drop_cross_column(result, self._cross)

return result.__finalize__(self, method="merge")

@final
def _maybe_drop_cross_column(
self, result: DataFrame, cross_col: str | None
) -> None:
if cross_col is not None:
del result[cross_col]

@final
@cache_readonly
def _indicator_name(self) -> str | None:
Expand Down Expand Up @@ -1437,53 +1488,12 @@ def _maybe_coerce_merge_keys(self) -> None:
self.right = self.right.copy()
self.right[name] = self.right[name].astype(typ)

@final
def _create_cross_configuration(
self, left: DataFrame, right: DataFrame
) -> tuple[DataFrame, DataFrame, JoinHow, str]:
"""
Creates the configuration to dispatch the cross operation to inner join,
e.g. adding a join column and resetting parameters. Join column is added
to a new object, no inplace modification

Parameters
----------
left : DataFrame
right : DataFrame

Returns
-------
a tuple (left, right, how, cross_col) representing the adjusted
DataFrames with cross_col, the merge operation set to inner and the column
to join over.
"""
cross_col = f"_cross_{uuid.uuid4()}"
how: JoinHow = "inner"
return (
left.assign(**{cross_col: 1}),
right.assign(**{cross_col: 1}),
how,
cross_col,
)

def _validate_left_right_on(self, left_on, right_on):
left_on = com.maybe_make_list(left_on)
right_on = com.maybe_make_list(right_on)

if self.how == "cross":
if (
self.left_index
or self.right_index
or right_on is not None
or left_on is not None
or self.on is not None
):
raise MergeError(
"Can not pass on, right_on, left_on or set right_index=True or "
"left_index=True"
)
# Hm, any way to make this logic less complicated??
elif self.on is None and left_on is None and right_on is None:
if self.on is None and left_on is None and right_on is None:
if self.left_index and self.right_index:
left_on, right_on = (), ()
elif self.left_index:
Expand Down Expand Up @@ -1551,7 +1561,7 @@ def _validate_left_right_on(self, left_on, right_on):
'of levels in the index of "left"'
)
left_on = [None] * n
if self.how != "cross" and len(right_on) != len(left_on):
if len(right_on) != len(left_on):
raise ValueError("len(right_on) must equal len(left_on)")

return left_on, right_on
Expand Down