Skip to content

Commit 8d1b7fc

Browse files
authored
ENH: Avoid copy when possible in merge (#51327)
1 parent c03ac84 commit 8d1b7fc

File tree

3 files changed

+53
-10
lines changed

3 files changed

+53
-10
lines changed

pandas/core/frame.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -9809,7 +9809,7 @@ def merge(
98099809
right_index: bool = False,
98109810
sort: bool = False,
98119811
suffixes: Suffixes = ("_x", "_y"),
9812-
copy: bool = True,
9812+
copy: bool | None = None,
98139813
indicator: str | bool = False,
98149814
validate: str | None = None,
98159815
) -> DataFrame:

pandas/core/reshape/merge.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
join as libjoin,
2626
lib,
2727
)
28+
from pandas._libs.lib import is_range_indexer
2829
from pandas._typing import (
2930
AnyArrayLike,
3031
ArrayLike,
@@ -757,7 +758,7 @@ def _reindex_and_concat(
757758
self.left._info_axis, self.right._info_axis, self.suffixes
758759
)
759760

760-
if left_indexer is not None:
761+
if left_indexer is not None and not is_range_indexer(left_indexer, len(left)):
761762
# Pinning the index here (and in the right code just below) is not
762763
# necessary, but makes the `.take` more performant if we have e.g.
763764
# a MultiIndex for left.index.
@@ -773,7 +774,9 @@ def _reindex_and_concat(
773774
left = left._constructor(lmgr)
774775
left.index = join_index
775776

776-
if right_indexer is not None:
777+
if right_indexer is not None and not is_range_indexer(
778+
right_indexer, len(right)
779+
):
777780
rmgr = right._mgr.reindex_indexer(
778781
join_index,
779782
right_indexer,

pandas/tests/copy_view/test_functions.py

+47-7
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import numpy as np
2-
3-
import pandas.util._test_decorators as td
2+
import pytest
43

54
from pandas import (
65
DataFrame,
@@ -182,19 +181,25 @@ def test_concat_mixed_series_frame(using_copy_on_write):
182181
tm.assert_frame_equal(result, expected)
183182

184183

185-
@td.skip_copy_on_write_not_yet_implemented # TODO(CoW)
186-
def test_merge_on_key(using_copy_on_write):
184+
@pytest.mark.parametrize(
185+
"func",
186+
[
187+
lambda df1, df2, **kwargs: df1.merge(df2, **kwargs),
188+
lambda df1, df2, **kwargs: merge(df1, df2, **kwargs),
189+
],
190+
)
191+
def test_merge_on_key(using_copy_on_write, func):
187192
df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]})
188193
df2 = DataFrame({"key": ["a", "b", "c"], "b": [4, 5, 6]})
189194
df1_orig = df1.copy()
190195
df2_orig = df2.copy()
191196

192-
result = merge(df1, df2, on="key")
197+
result = func(df1, df2, on="key")
193198

194199
if using_copy_on_write:
195200
assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
196201
assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
197-
assert not np.shares_memory(get_array(result, "key"), get_array(df1, "key"))
202+
assert np.shares_memory(get_array(result, "key"), get_array(df1, "key"))
198203
assert not np.shares_memory(get_array(result, "key"), get_array(df2, "key"))
199204
else:
200205
assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
@@ -239,4 +244,39 @@ def test_merge_on_index(using_copy_on_write):
239244
tm.assert_frame_equal(df2, df2_orig)
240245

241246

242-
# TODO(CoW) add merge tests where one of left/right isn't copied
247+
@pytest.mark.parametrize(
    "func, how",
    [
        (lambda df1, df2, **kwargs: merge(df2, df1, on="key", **kwargs), "right"),
        (lambda df1, df2, **kwargs: merge(df1, df2, on="key", **kwargs), "left"),
    ],
)
def test_merge_on_key_enlarging_one(using_copy_on_write, func, how):
    """Check buffer sharing when a merge on "key" enlarges only one input.

    ``df1`` holds three keys and ``df2`` only two of them.  The parametrized
    ``func``/``how`` pairs place ``df1`` on the side named by ``how`` (left
    operand with ``how="left"``, right operand with ``how="right"``).

    With ``using_copy_on_write`` set, the test expects ``result`` to share
    the "a" buffer with ``df1``, to share nothing with ``df2``, and to share
    the "key" buffer with ``df1`` only for ``how="left"``.  Without it, the
    test expects neither "a" nor "b" to be shared.  In both modes, writing
    into ``result`` must leave ``df1`` and ``df2`` unchanged.
    """
    df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]})
    df2 = DataFrame({"key": ["a", "b"], "b": [4, 5]})
    df1_orig, df2_orig = df1.copy(), df2.copy()

    result = func(df1, df2, how=how)

    def shares_with(frame, column):
        # True when ``result[column]`` and ``frame[column]`` overlap in memory.
        return np.shares_memory(get_array(result, column), get_array(frame, column))

    if not using_copy_on_write:
        assert not shares_with(df1, "a")
        assert not shares_with(df2, "b")
    else:
        assert shares_with(df1, "a")
        assert not shares_with(df2, "b")
        # Both blocks of df2 must be reported as unreferenced by its manager.
        assert df2._mgr._has_no_reference(1)
        assert df2._mgr._has_no_reference(0)
        assert shares_with(df1, "key") is (how == "left")
        assert not shares_with(df2, "key")

    # Positional column written to: 1 for the left merge, 2 for the right
    # one (presumably the position of "a" in each result -- the operand
    # order is swapped between the two parametrizations).
    target_column = 1 if how == "left" else 2
    result.iloc[0, target_column] = 0
    if using_copy_on_write:
        # After the write, "a" must no longer be shared with df1.
        assert not shares_with(df1, "a")
    tm.assert_frame_equal(df1, df1_orig)
    tm.assert_frame_equal(df2, df2_orig)

0 commit comments

Comments
 (0)