Skip to content

Commit d8a2e74

Browse files
authored
ENH: Implement cross method for Merge Operations (pandas-dev#37864)
1 parent 6ef5b4b commit d8a2e74

File tree

6 files changed

+230
-3
lines changed

6 files changed

+230
-3
lines changed

asv_bench/benchmarks/join_merge.py

+6
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,9 @@ def time_join_dataframe_index_single_key_small(self, sort):
132132
def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort):
133133
self.df_shuf.join(self.df_key2, on="key2", sort=sort)
134134

135+
def time_join_dataframes_cross(self, sort):
    # Benchmark DataFrame.join with how="cross" on a label slice of self.df
    # (presumably sliced to keep the cartesian product manageable).
    left_slice = self.df.loc[:2000]
    left_slice.join(self.df_key1, how="cross", sort=sort)
137+
135138

136139
class JoinIndex:
137140
def setup(self):
@@ -205,6 +208,9 @@ def time_merge_dataframe_integer_2key(self, sort):
205208
def time_merge_dataframe_integer_key(self, sort):
206209
merge(self.df, self.df2, on="key1", sort=sort)
207210

211+
def time_merge_dataframes_cross(self, sort):
    # Benchmark merge(how="cross") on label slices of both operands
    # (presumably sliced to keep the cartesian product manageable).
    lhs = self.left.loc[:2000]
    rhs = self.right.loc[:2000]
    merge(lhs, rhs, how="cross", sort=sort)
213+
208214

209215
class I8Merge:
210216

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,7 @@ Other enhancements
255255
- Improve error reporting for :meth:`DataFrame.merge` when invalid merge column definitions were given (:issue:`16228`)
256256
- Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`)
257257
- Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`)
258+
- Added the ``cross`` merge type (``how='cross'``) to :meth:`DataFrame.merge` and :meth:`DataFrame.join`, which produces the Cartesian product of both frames (:issue:`5401`)
258259

259260
.. ---------------------------------------------------------------------------
260261

pandas/core/frame.py

+55-1
Original file line numberDiff line numberDiff line change
@@ -205,12 +205,14 @@
205205
The join is done on columns or indexes. If joining columns on
206206
columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
207207
on indexes or indexes on a column or columns, the index will be passed on.
208+
When performing a cross merge, no column specifications to merge on are
209+
allowed.
208210
209211
Parameters
210212
----------%s
211213
right : DataFrame or named Series
212214
Object to merge with.
213-
how : {'left', 'right', 'outer', 'inner'}, default 'inner'
215+
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
214216
Type of merge to be performed.
215217
216218
* left: use only keys from left frame, similar to a SQL left outer join;
@@ -221,6 +223,11 @@
221223
join; sort keys lexicographically.
222224
* inner: use intersection of keys from both frames, similar to a SQL inner
223225
join; preserve the order of the left keys.
226+
* cross: creates the cartesian product from both frames, preserves the order
227+
of the left keys.
228+
229+
.. versionadded:: 1.2.0
230+
224231
on : label or list
225232
Column or index level names to join on. These must be found in both
226233
DataFrames. If `on` is None and not merging on indexes then this defaults
@@ -341,6 +348,44 @@
341348
...
342349
ValueError: columns overlap but no suffix specified:
343350
Index(['value'], dtype='object')
351+
352+
>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
353+
>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
354+
>>> df1
355+
a b
356+
0 foo 1
357+
1 bar 2
358+
>>> df2
359+
a c
360+
0 foo 3
361+
1 baz 4
362+
363+
>>> df1.merge(df2, how='inner', on='a')
364+
a b c
365+
0 foo 1 3
366+
367+
>>> df1.merge(df2, how='left', on='a')
368+
a b c
369+
0 foo 1 3.0
370+
1 bar 2 NaN
371+
372+
>>> df1 = pd.DataFrame({'left': ['foo', 'bar']})
373+
>>> df2 = pd.DataFrame({'right': [7, 8]})
374+
>>> df1
375+
left
376+
0 foo
377+
1 bar
378+
>>> df2
379+
right
380+
0 7
381+
1 8
382+
383+
>>> df1.merge(df2, how='cross')
384+
left right
385+
0 foo 7
386+
1 foo 8
387+
2 bar 7
388+
3 bar 8
344389
"""
345390

346391

@@ -8083,6 +8128,15 @@ def _join_compat(
80838128
other = DataFrame({other.name: other})
80848129

80858130
if isinstance(other, DataFrame):
8131+
if how == "cross":
8132+
return merge(
8133+
self,
8134+
other,
8135+
how=how,
8136+
on=on,
8137+
suffixes=(lsuffix, rsuffix),
8138+
sort=sort,
8139+
)
80868140
return merge(
80878141
self,
80888142
other,

pandas/core/reshape/merge.py

+61-2
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import copy
66
import datetime
77
from functools import partial
8+
import hashlib
89
import string
910
from typing import TYPE_CHECKING, Optional, Tuple, cast
1011
import warnings
@@ -643,6 +644,17 @@ def __init__(
643644

644645
self._validate_specification()
645646

647+
cross_col = None
648+
if self.how == "cross":
649+
(
650+
self.left,
651+
self.right,
652+
self.how,
653+
cross_col,
654+
) = self._create_cross_configuration(self.left, self.right)
655+
self.left_on = self.right_on = [cross_col]
656+
self._cross = cross_col
657+
646658
# note this function has side effects
647659
(
648660
self.left_join_keys,
@@ -690,8 +702,14 @@ def get_result(self):
690702

691703
self._maybe_restore_index_levels(result)
692704

705+
self._maybe_drop_cross_column(result, self._cross)
706+
693707
return result.__finalize__(self, method="merge")
694708

709+
def _maybe_drop_cross_column(self, result: "DataFrame", cross_col: Optional[str]):
710+
if cross_col is not None:
711+
result.drop(columns=cross_col, inplace=True)
712+
695713
def _indicator_pre_merge(
696714
self, left: "DataFrame", right: "DataFrame"
697715
) -> Tuple["DataFrame", "DataFrame"]:
@@ -1200,9 +1218,50 @@ def _maybe_coerce_merge_keys(self):
12001218
typ = rk.categories.dtype if rk_is_cat else object
12011219
self.right = self.right.assign(**{name: self.right[name].astype(typ)})
12021220

1221+
def _create_cross_configuration(
1222+
self, left, right
1223+
) -> Tuple["DataFrame", "DataFrame", str, str]:
1224+
"""
1225+
Creates the configuration to dispatch the cross operation to inner join,
1226+
e.g. adding a join column and resetting parameters. Join column is added
1227+
to a new object, no inplace modification
1228+
1229+
Parameters
1230+
----------
1231+
left: DataFrame
1232+
right DataFrame
1233+
1234+
Returns
1235+
-------
1236+
a tuple (left, right, how, cross_col) representing the adjusted
1237+
DataFrames with cross_col, the merge operation set to inner and the column
1238+
to join over.
1239+
"""
1240+
cross_col = f"_cross_{hashlib.md5().hexdigest()}"
1241+
how = "inner"
1242+
return (
1243+
left.assign(**{cross_col: 1}),
1244+
right.assign(**{cross_col: 1}),
1245+
how,
1246+
cross_col,
1247+
)
1248+
12031249
def _validate_specification(self):
1250+
if self.how == "cross":
1251+
if (
1252+
self.left_index
1253+
or self.right_index
1254+
or self.right_on is not None
1255+
or self.left_on is not None
1256+
or self.on is not None
1257+
):
1258+
raise MergeError(
1259+
"Can not pass on, right_on, left_on or set right_index=True or "
1260+
"left_index=True"
1261+
)
1262+
return
12041263
# Hm, any way to make this logic less complicated??
1205-
if self.on is None and self.left_on is None and self.right_on is None:
1264+
elif self.on is None and self.left_on is None and self.right_on is None:
12061265

12071266
if self.left_index and self.right_index:
12081267
self.left_on, self.right_on = (), ()
@@ -1266,7 +1325,7 @@ def _validate_specification(self):
12661325
'of levels in the index of "left"'
12671326
)
12681327
self.left_on = [None] * n
1269-
if len(self.right_on) != len(self.left_on):
1328+
if self.how != "cross" and len(self.right_on) != len(self.left_on):
12701329
raise ValueError("len(right_on) must equal len(left_on)")
12711330

12721331
def _validate(self, validate: str):

pandas/tests/reshape/merge/test_join.py

+12
Original file line numberDiff line numberDiff line change
@@ -803,3 +803,15 @@ def test_join_inner_multiindex_deterministic_order():
803803
index=MultiIndex.from_tuples([(2, 1, 4, 3)], names=("b", "a", "d", "c")),
804804
)
805805
tm.assert_frame_equal(result, expected)
806+
807+
808+
@pytest.mark.parametrize(
    ("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])]
)
def test_join_cross(input_col, output_cols):
    # GH#5401
    # A cross join pairs every left row with every right row; the second
    # parametrization shares the column name "a" and expects suffixed names.
    first_name, second_name = output_cols
    expected = DataFrame({first_name: [1, 1, 3, 3], second_name: [3, 4, 3, 4]})

    lhs = DataFrame({"a": [1, 3]})
    rhs = DataFrame({input_col: [3, 4]})
    result = lhs.join(rhs, how="cross", lsuffix="_x", rsuffix="_y")

    tm.assert_frame_equal(result, expected)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import pytest
2+
3+
from pandas import DataFrame
4+
import pandas._testing as tm
5+
from pandas.core.reshape.merge import MergeError, merge
6+
7+
8+
@pytest.mark.parametrize(
    ("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])]
)
def test_merge_cross(input_col, output_cols):
    # GH#5401
    lhs = DataFrame({"a": [1, 3]})
    rhs = DataFrame({input_col: [3, 4]})
    # Snapshots taken before merging, to verify the inputs are left untouched.
    lhs_before = lhs.copy()
    rhs_before = rhs.copy()

    result = merge(lhs, rhs, how="cross")

    first_name, second_name = output_cols
    expected = DataFrame({first_name: [1, 1, 3, 3], second_name: [3, 4, 3, 4]})
    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(lhs, lhs_before)
    tm.assert_frame_equal(rhs, rhs_before)
22+
23+
24+
@pytest.mark.parametrize(
    "kwargs",
    [
        {"left_index": True},
        {"right_index": True},
        {"on": "a"},
        {"left_on": "a"},
        {"right_on": "b"},
    ],
)
def test_merge_cross_error_reporting(kwargs):
    # GH#5401
    # Each parametrized key specification must be rejected for how="cross".
    expected_msg = (
        "Can not pass on, right_on, left_on or set right_index=True or "
        "left_index=True"
    )
    lhs = DataFrame({"a": [1, 3]})
    rhs = DataFrame({"b": [3, 4]})
    with pytest.raises(MergeError, match=expected_msg):
        merge(lhs, rhs, how="cross", **kwargs)
44+
45+
46+
def test_merge_cross_mixed_dtypes():
    # GH#5401
    # String column on the left, integer column on the right.
    strings = DataFrame(["a", "b", "c"], columns=["A"])
    numbers = DataFrame(range(2), columns=["B"])
    expected = DataFrame({"A": ["a", "a", "b", "b", "c", "c"], "B": [0, 1, 0, 1, 0, 1]})

    result = merge(strings, numbers, how="cross")

    tm.assert_frame_equal(result, expected)
53+
54+
55+
def test_merge_cross_more_than_one_column():
    # GH#5401
    # Both frames carry two columns; all four must appear in the product.
    lhs = DataFrame({"A": list("ab"), "B": [2, 1]})
    rhs = DataFrame({"C": range(2), "D": range(4, 6)})
    expected_data = {
        "A": ["a", "a", "b", "b"],
        "B": [2, 2, 1, 1],
        "C": [0, 1, 0, 1],
        "D": [4, 5, 4, 5],
    }
    expected = DataFrame(expected_data)

    result = merge(lhs, rhs, how="cross")

    tm.assert_frame_equal(result, expected)
69+
70+
71+
def test_merge_cross_null_values(nulls_fixture):
    # GH#5401
    # The null value in the left frame must be carried into the product.
    lhs = DataFrame({"a": [1, nulls_fixture]})
    rhs = DataFrame({"b": ["a", "b"], "c": [1.0, 2.0]})
    expected_data = {
        "a": [1, 1, nulls_fixture, nulls_fixture],
        "b": ["a", "b", "a", "b"],
        "c": [1.0, 2.0, 1.0, 2.0],
    }
    expected = DataFrame(expected_data)

    result = merge(lhs, rhs, how="cross")

    tm.assert_frame_equal(result, expected)
84+
85+
86+
def test_join_cross_error_reporting():
    # GH#5401
    # DataFrame.join must reject ``on`` together with how="cross".
    expected_msg = (
        "Can not pass on, right_on, left_on or set right_index=True or "
        "left_index=True"
    )
    lhs = DataFrame({"a": [1, 3]})
    rhs = DataFrame({"a": [3, 4]})
    with pytest.raises(MergeError, match=expected_msg):
        lhs.join(rhs, how="cross", on="a")

0 commit comments

Comments
 (0)