diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
index 6e27d9355997f..e3c6bf9bd4e07 100644
--- a/asv_bench/benchmarks/join_merge.py
+++ b/asv_bench/benchmarks/join_merge.py
@@ -158,6 +158,20 @@ def time_left_outer_join_index(self):
         self.left.join(self.right, on="jim")
 
 
+class JoinEmpty:
+    # inner join where one of the two frames has no rows
+    def setup(self):
+        N = 100_000
+        self.df = DataFrame({"A": np.arange(N)})
+        self.df_empty = DataFrame(columns=["B", "C"], dtype="int64")
+
+    def time_inner_join_left_empty(self):
+        self.df_empty.join(self.df, how="inner")
+
+    def time_inner_join_right_empty(self):
+        self.df.join(self.df_empty, how="inner")
+
+
 class JoinNonUnique:
     # outer join of non-unique
     # GH 6329
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index e3f772ac026ab..199a52f9d770f 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -256,6 +256,7 @@ Performance improvements
 - Performance improvement in :meth:`.GroupBy.transform` for user-defined functions when only a single group exists (:issue:`44977`)
 - Performance improvement in :meth:`MultiIndex.get_locs` (:issue:`45681`)
 - Performance improvement in :func:`merge` when left and/or right are empty (:issue:`45838`)
+- Performance improvement in :meth:`DataFrame.join` when left and/or right are empty (:issue:`46015`)
 - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
 -
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index c346f88e75ebe..d7594f2483569 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -4542,15 +4542,28 @@ def join(
         if level is not None and (self._is_multi or other._is_multi):
             return self._join_level(other, level, how=how)
 
-        if len(other) == 0 and how in ("left", "outer"):
-            join_index = self._view()
-            rindexer = np.repeat(np.intp(-1), len(join_index))
-            return join_index, None, rindexer
-
-        if len(self) == 0 and how in ("right", "outer"):
-            join_index = other._view()
-            lindexer = np.repeat(np.intp(-1), len(join_index))
-            return join_index, lindexer, None
+        # Fast paths when one side is empty: the result is a view of one
+        # operand, and the only indexer needed is either all -1 or empty.
+        # Empty indexers are built as np.intp to match the -1-filled ones.
+        if len(other) == 0:
+            if how in ("left", "outer"):
+                join_index = self._view()
+                rindexer = np.broadcast_to(np.intp(-1), len(join_index))
+                return join_index, None, rindexer
+            elif how in ("right", "inner", "cross"):
+                join_index = other._view()
+                lindexer = np.array([], dtype=np.intp)
+                return join_index, lindexer, None
+
+        if len(self) == 0:
+            if how in ("right", "outer"):
+                join_index = other._view()
+                lindexer = np.broadcast_to(np.intp(-1), len(join_index))
+                return join_index, lindexer, None
+            elif how in ("left", "inner", "cross"):
+                join_index = self._view()
+                rindexer = np.array([], dtype=np.intp)
+                return join_index, None, rindexer
 
         if self._join_precedence < other._join_precedence:
             how = {"right": "left", "left": "right"}.get(how, how)
diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py
index 7b932a3bb80c0..629c8d53ce9bd 100644
--- a/pandas/tests/reshape/merge/test_join.py
+++ b/pandas/tests/reshape/merge/test_join.py
@@ -881,3 +881,44 @@ def test_join_multiindex_not_alphabetical_categorical(categories, values):
         }
     ).set_index(["first", "second"])
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "left_empty, how, exp",
+    [
+        (False, "left", "left"),
+        (False, "right", "empty"),
+        (False, "inner", "empty"),
+        (False, "outer", "left"),
+        (False, "cross", "empty"),
+        (True, "left", "empty"),
+        (True, "right", "right"),
+        (True, "inner", "empty"),
+        (True, "outer", "right"),
+        (True, "cross", "empty"),
+    ],
+)
+def test_join_empty(left_empty, how, exp):
+    # GH 46015: join where exactly one of the two frames has no rows
+    left = DataFrame({"A": [2, 1], "B": [3, 4]}, dtype="int64").set_index("A")
+    right = DataFrame({"A": [1], "C": [5]}, dtype="int64").set_index("A")
+
+    if left_empty:
+        left = left.head(0)
+    else:
+        right = right.head(0)
+
+    result = left.join(right, how=how)
+
+    if exp == "left":
+        expected = DataFrame({"A": [2, 1], "B": [3, 4], "C": [np.nan, np.nan]})
+        expected = expected.set_index("A")
+    elif exp == "right":
+        expected = DataFrame({"B": [np.nan], "A": [1], "C": [5]})
+        expected = expected.set_index("A")
+    elif exp == "empty":
+        expected = DataFrame(index=Index([]), columns=["B", "C"], dtype="int64")
+        if how != "cross":
+            expected = expected.rename_axis("A")
+
+    tm.assert_frame_equal(result, expected)