Skip to content

Commit c51c2a7

Browse files
lukemanley and jreback authored
PERF: Merge empty frame (pandas-dev#45838)
* faster merge with empty frame
* whatsnew
* docs, tests, asvs
* fix whatsnew

Co-authored-by: Jeff Reback <[email protected]>
1 parent 70f2ed0 commit c51c2a7

File tree

4 files changed

+98
-2
lines changed

4 files changed

+98
-2
lines changed

asv_bench/benchmarks/join_merge.py

+6
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,12 @@ def time_merge_dataframe_integer_2key(self, sort):
216216
def time_merge_dataframe_integer_key(self, sort):
217217
merge(self.df, self.df2, on="key1", sort=sort)
218218

219+
def time_merge_dataframe_empty_right(self, sort):
    """Benchmark ``merge`` of ``self.left`` against a zero-row right operand."""
    # ``iloc[:0]`` takes an empty positional slice of ``self.right``, so the
    # right-hand side of the merge has no rows.
    merge(self.left, self.right.iloc[:0], sort=sort)
221+
222+
def time_merge_dataframe_empty_left(self, sort):
    """Benchmark ``merge`` of a zero-row left operand against ``self.right``."""
    # ``iloc[:0]`` takes an empty positional slice of ``self.left``, so the
    # left-hand side of the merge has no rows.
    merge(self.left.iloc[:0], self.right, sort=sort)
224+
219225
def time_merge_dataframes_cross(self, sort):
220226
merge(self.left.loc[:2000], self.right.loc[:2000], how="cross", sort=sort)
221227

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,7 @@ Performance improvements
244244
- Performance improvement in :meth:`DataFrame.duplicated` when subset consists of only one column (:issue:`45236`)
245245
- Performance improvement in :meth:`.GroupBy.transform` when broadcasting values for user-defined functions (:issue:`45708`)
246246
- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions when only a single group exists (:issue:`44977`)
247+
- Performance improvement in :func:`merge` when left and/or right are empty (:issue:`45838`)
247248
- Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
248249
-
249250

pandas/core/reshape/merge.py

+51-2
Original file line numberDiff line numberDiff line change
@@ -977,7 +977,6 @@ def _get_join_info(
977977
)
978978
else:
979979
join_index = self.right.index.take(right_indexer)
980-
left_indexer = np.array([-1] * len(join_index), dtype=np.intp)
981980
elif self.left_index:
982981
if self.how == "asof":
983982
# GH#33463 asof should always behave like a left merge
@@ -997,7 +996,6 @@ def _get_join_info(
997996
)
998997
else:
999998
join_index = self.left.index.take(left_indexer)
1000-
right_indexer = np.array([-1] * len(join_index), dtype=np.intp)
1001999
else:
10021000
join_index = Index(np.arange(len(left_indexer)))
10031001

@@ -1477,6 +1475,20 @@ def get_join_indexers(
14771475
right_keys
14781476
), "left_key and right_keys must be the same length"
14791477

1478+
# fast-path for empty left/right
1479+
left_n = len(left_keys[0])
1480+
right_n = len(right_keys[0])
1481+
if left_n == 0:
1482+
if how in ["left", "inner", "cross"]:
1483+
return _get_empty_indexer()
1484+
elif not sort and how in ["right", "outer"]:
1485+
return _get_no_sort_one_missing_indexer(right_n, True)
1486+
elif right_n == 0:
1487+
if how in ["right", "inner", "cross"]:
1488+
return _get_empty_indexer()
1489+
elif not sort and how in ["left", "outer"]:
1490+
return _get_no_sort_one_missing_indexer(left_n, False)
1491+
14801492
# get left & right join labels and num. of levels at each location
14811493
mapped = (
14821494
_factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
@@ -2055,6 +2067,43 @@ def _get_single_indexer(
20552067
return libjoin.left_outer_join(left_key, right_key, count, sort=sort)
20562068

20572069

2070+
def _get_empty_indexer() -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2071+
"""Return empty join indexers."""
2072+
return (
2073+
np.array([], dtype=np.intp),
2074+
np.array([], dtype=np.intp),
2075+
)
2076+
2077+
2078+
def _get_no_sort_one_missing_indexer(
2079+
n: int, left_missing: bool
2080+
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2081+
"""
2082+
Return join indexers where all of one side is selected without sorting
2083+
and none of the other side is selected.
2084+
2085+
Parameters
2086+
----------
2087+
n : int
2088+
Length of indexers to create.
2089+
left_missing : bool
2090+
If True, the left indexer will contain only -1's.
2091+
If False, the right indexer will contain only -1's.
2092+
2093+
Returns
2094+
-------
2095+
np.ndarray[np.intp]
2096+
Left indexer
2097+
np.ndarray[np.intp]
2098+
Right indexer
2099+
"""
2100+
idx = np.arange(n, dtype=np.intp)
2101+
idx_missing = np.full(shape=n, fill_value=-1, dtype=np.intp)
2102+
if left_missing:
2103+
return idx_missing, idx
2104+
return idx, idx_missing
2105+
2106+
20582107
def _left_join_on_index(
20592108
left_ax: Index, right_ax: Index, join_keys, sort: bool = False
20602109
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]:

pandas/tests/reshape/merge/test_merge.py

+40
Original file line numberDiff line numberDiff line change
@@ -1717,6 +1717,46 @@ def test_merge_ea_with_string(self, join_type, string_dtype):
17171717
)
17181718
tm.assert_series_equal(merged.dtypes, expected)
17191719

1720+
@pytest.mark.parametrize(
    "left_empty, how, exp",
    [
        (False, "left", "left"),
        (False, "right", "empty"),
        (False, "inner", "empty"),
        (False, "outer", "left"),
        (False, "cross", "empty_cross"),
        (True, "left", "empty"),
        (True, "right", "right"),
        (True, "inner", "empty"),
        (True, "outer", "right"),
        (True, "cross", "empty_cross"),
    ],
)
def test_merge_empty(self, left_empty, how, exp):
    """
    Check ``DataFrame.merge`` when exactly one operand has no rows.

    Parameters
    ----------
    left_empty : bool
        If True the left frame is emptied, otherwise the right frame is.
    how : str
        Merge type passed through to ``DataFrame.merge``.
    exp : str
        Tag selecting the expected frame: "left", "right", "empty" or
        "empty_cross".
    """
    left = DataFrame({"A": [2, 1], "B": [3, 4]})
    right = DataFrame({"A": [1], "C": [5]}, dtype="int64")

    # Empty exactly one side; ``head(0)`` keeps no rows.
    if left_empty:
        left = left.head(0)
    else:
        right = right.head(0)

    result = left.merge(right, how=how)

    if exp == "left":
        # All left rows are kept; the right-only column is filled with NaN.
        expected = DataFrame({"A": [2, 1], "B": [3, 4], "C": [np.nan, np.nan]})
    elif exp == "right":
        # All right rows are kept; the left-only column is filled with NaN.
        expected = DataFrame({"B": [np.nan], "A": [1], "C": [5]})
    elif exp == "empty":
        expected = DataFrame(columns=["A", "B", "C"], dtype="int64")
        if left_empty:
            # Expected column order differs when the left frame is the empty one.
            expected = expected[["B", "A", "C"]]
    elif exp == "empty_cross":
        # A cross merge suffixes the overlapping "A" column on both sides.
        expected = DataFrame(columns=["A_x", "B", "A_y", "C"], dtype="int64")

    tm.assert_frame_equal(result, expected)
1759+
17201760

17211761
@pytest.fixture
17221762
def left():

0 commit comments

Comments
 (0)