Skip to content

Commit aafa7a9

Browse files
authored
PERF: join empty frame (pandas-dev#46015)
* faster joins when left and/or right is empty * whatsnew * cleanup * add asv for joining with empty frame * asv
1 parent 7f97e27 commit aafa7a9

File tree

4 files changed

+74
-9
lines changed

4 files changed

+74
-9
lines changed

asv_bench/benchmarks/join_merge.py

+13
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,19 @@ def time_left_outer_join_index(self):
158158
self.left.join(self.right, on="jim")
159159

160160

161+
class JoinEmpty:
    """Benchmark ``DataFrame.join`` when one of the two frames has no rows."""

    def setup(self):
        """Create one populated frame and one empty frame for the timings."""
        N = 100_000
        # Single integer column "A" holding 0 .. N-1.
        self.df = DataFrame({"A": np.arange(N)})
        # No rows; two int64 columns that do not collide with "A".
        self.df_empty = DataFrame(columns=["B", "C"], dtype="int64")

    def time_inner_join_left_empty(self):
        """Time an inner join whose calling (left) frame is the empty one."""
        self.df_empty.join(self.df, how="inner")

    def time_inner_join_right_empty(self):
        """Time an inner join whose argument (right) frame is the empty one."""
        self.df.join(self.df_empty, how="inner")
172+
173+
161174
class JoinNonUnique:
162175
# outer join of non-unique
163176
# GH 6329

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,7 @@ Performance improvements
256256
- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions when only a single group exists (:issue:`44977`)
257257
- Performance improvement in :meth:`MultiIndex.get_locs` (:issue:`45681`)
258258
- Performance improvement in :func:`merge` when left and/or right are empty (:issue:`45838`)
259+
- Performance improvement in :meth:`DataFrame.join` when left and/or right are empty (:issue:`46015`)
259260
- Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
260261
-
261262

pandas/core/indexes/base.py

+19-9
Original file line numberDiff line numberDiff line change
@@ -4542,15 +4542,25 @@ def join(
45424542
if level is not None and (self._is_multi or other._is_multi):
45434543
return self._join_level(other, level, how=how)
45444544

4545-
if len(other) == 0 and how in ("left", "outer"):
4546-
join_index = self._view()
4547-
rindexer = np.repeat(np.intp(-1), len(join_index))
4548-
return join_index, None, rindexer
4549-
4550-
if len(self) == 0 and how in ("right", "outer"):
4551-
join_index = other._view()
4552-
lindexer = np.repeat(np.intp(-1), len(join_index))
4553-
return join_index, lindexer, None
4545+
if len(other) == 0:
4546+
if how in ("left", "outer"):
4547+
join_index = self._view()
4548+
rindexer = np.broadcast_to(np.intp(-1), len(join_index))
4549+
return join_index, None, rindexer
4550+
elif how in ("right", "inner", "cross"):
4551+
join_index = other._view()
4552+
lindexer = np.array([])
4553+
return join_index, lindexer, None
4554+
4555+
if len(self) == 0:
4556+
if how in ("right", "outer"):
4557+
join_index = other._view()
4558+
lindexer = np.broadcast_to(np.intp(-1), len(join_index))
4559+
return join_index, lindexer, None
4560+
elif how in ("left", "inner", "cross"):
4561+
join_index = self._view()
4562+
rindexer = np.array([])
4563+
return join_index, None, rindexer
45544564

45554565
if self._join_precedence < other._join_precedence:
45564566
how = {"right": "left", "left": "right"}.get(how, how)

pandas/tests/reshape/merge/test_join.py

+41
Original file line numberDiff line numberDiff line change
@@ -881,3 +881,44 @@ def test_join_multiindex_not_alphabetical_categorical(categories, values):
881881
}
882882
).set_index(["first", "second"])
883883
tm.assert_frame_equal(result, expected)
884+
885+
886+
@pytest.mark.parametrize(
887+
"left_empty, how, exp",
888+
[
889+
(False, "left", "left"),
890+
(False, "right", "empty"),
891+
(False, "inner", "empty"),
892+
(False, "outer", "left"),
893+
(False, "cross", "empty"),
894+
(True, "left", "empty"),
895+
(True, "right", "right"),
896+
(True, "inner", "empty"),
897+
(True, "outer", "right"),
898+
(True, "cross", "empty"),
899+
],
900+
)
901+
def test_join_empty(left_empty, how, exp):
902+
903+
left = DataFrame({"A": [2, 1], "B": [3, 4]}, dtype="int64").set_index("A")
904+
right = DataFrame({"A": [1], "C": [5]}, dtype="int64").set_index("A")
905+
906+
if left_empty:
907+
left = left.head(0)
908+
else:
909+
right = right.head(0)
910+
911+
result = left.join(right, how=how)
912+
913+
if exp == "left":
914+
expected = DataFrame({"A": [2, 1], "B": [3, 4], "C": [np.nan, np.nan]})
915+
expected = expected.set_index("A")
916+
elif exp == "right":
917+
expected = DataFrame({"B": [np.nan], "A": [1], "C": [5]})
918+
expected = expected.set_index("A")
919+
elif exp == "empty":
920+
expected = DataFrame(index=Index([]), columns=["B", "C"], dtype="int64")
921+
if how != "cross":
922+
expected = expected.rename_axis("A")
923+
924+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)