Skip to content

Commit 0582e35

Browse files
authored
ENH: Avoid copying whole block for single block case (#51435)
1 parent 248c966 commit 0582e35

File tree

4 files changed

+49
-15
lines changed

4 files changed

+49
-15
lines changed

pandas/core/internals/managers.py

+23 -1
Original file line number | Diff line number | Diff line change
@@ -370,8 +370,30 @@ def setitem(self, indexer, value) -> Self:
370370
raise ValueError(f"Cannot set values with ndim > {self.ndim}")
371371

372372
if using_copy_on_write() and not self._has_no_reference(0):
373-
# if being referenced -> perform Copy-on-Write and clear the reference
374373
# this method is only called if there is a single block -> hardcoded 0
374+
# Split blocks to only copy the columns we want to modify
375+
if self.ndim == 2 and isinstance(indexer, tuple):
376+
blk_loc = self.blklocs[indexer[1]]
377+
if is_list_like(blk_loc) and blk_loc.ndim == 2:
378+
blk_loc = np.squeeze(blk_loc, axis=0)
379+
elif not is_list_like(blk_loc):
380+
# Keep dimension and copy data later
381+
blk_loc = [blk_loc] # type: ignore[assignment]
382+
if len(blk_loc) == 0:
383+
return self.copy(deep=False)
384+
385+
values = self.blocks[0].values
386+
if values.ndim == 2:
387+
values = values[blk_loc]
388+
# "T" has no attribute "_iset_split_block"
389+
self._iset_split_block( # type: ignore[attr-defined]
390+
0, blk_loc, values
391+
)
392+
# first block equals values
393+
self.blocks[0].setitem((indexer[0], np.arange(len(blk_loc))), value)
394+
return self
395+
# No need to split if we either set all columns or on a single block
396+
# manager
375397
self = self.copy()
376398

377399
return self.apply("setitem", indexer=indexer, value=value)

pandas/tests/copy_view/test_indexing.py

+5 -6
Original file line number | Diff line number | Diff line change
@@ -1028,16 +1028,15 @@ def test_dataframe_add_column_from_series(backend, using_copy_on_write):
10281028
(tm.iloc, (slice(None), 0)),
10291029
],
10301030
)
1031+
@pytest.mark.parametrize(
1032+
"col", [[0.1, 0.2, 0.3], [7, 8, 9]], ids=["mixed-block", "single-block"]
1033+
)
10311034
def test_set_value_copy_only_necessary_column(
1032-
using_copy_on_write,
1033-
indexer_func,
1034-
indexer,
1035-
val,
1035+
using_copy_on_write, indexer_func, indexer, val, col
10361036
):
10371037
# When setting inplace, only copy column that is modified instead of the whole
10381038
# block (by splitting the block)
1039-
# TODO multi-block only for now
1040-
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
1039+
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": col})
10411040
df_orig = df.copy()
10421041
view = df[:]
10431042

pandas/tests/copy_view/test_methods.py

+14 -8
Original file line number | Diff line number | Diff line change
@@ -489,7 +489,8 @@ def test_shift_no_op(using_copy_on_write):
489489

490490
df.iloc[0, 0] = 0
491491
if using_copy_on_write:
492-
assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
492+
assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
493+
assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
493494
tm.assert_frame_equal(df2, df_orig)
494495

495496

@@ -532,16 +533,16 @@ def test_shift_columns(using_copy_on_write):
532533
df2 = df.shift(periods=1, axis=1)
533534

534535
assert np.shares_memory(get_array(df2, "2020-01-02"), get_array(df, "2020-01-01"))
535-
df.iloc[0, 1] = 0
536+
df.iloc[0, 0] = 0
536537
if using_copy_on_write:
537538
assert not np.shares_memory(
538539
get_array(df2, "2020-01-02"), get_array(df, "2020-01-01")
539540
)
540-
expected = DataFrame(
541-
[[np.nan, 1], [np.nan, 3], [np.nan, 5]],
542-
columns=date_range("2020-01-01", "2020-01-02"),
543-
)
544-
tm.assert_frame_equal(df2, expected)
541+
expected = DataFrame(
542+
[[np.nan, 1], [np.nan, 3], [np.nan, 5]],
543+
columns=date_range("2020-01-01", "2020-01-02"),
544+
)
545+
tm.assert_frame_equal(df2, expected)
545546

546547

547548
def test_pop(using_copy_on_write):
@@ -1335,13 +1336,18 @@ def test_droplevel(using_copy_on_write):
13351336

13361337
if using_copy_on_write:
13371338
assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
1339+
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
13381340
else:
13391341
assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
1342+
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
13401343

13411344
# mutating df2 triggers a copy-on-write for that column / block
13421345
df2.iloc[0, 0] = 0
13431346

1344-
assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
1347+
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
1348+
if using_copy_on_write:
1349+
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
1350+
13451351
tm.assert_frame_equal(df, df_orig)
13461352

13471353

pandas/tests/indexing/test_loc.py

+7
Original file line number | Diff line number | Diff line change
@@ -2646,6 +2646,13 @@ def test_loc_indexer_all_false_broadcast(self):
26462646
df.loc[np.array([False], dtype=np.bool_), ["a"]] = df["b"]
26472647
tm.assert_frame_equal(df, expected)
26482648

2649+
def test_loc_indexer_length_one(self):
2650+
# GH#51435
2651+
df = DataFrame({"a": ["x"], "b": ["y"]}, dtype=object)
2652+
expected = DataFrame({"a": ["y"], "b": ["y"]}, dtype=object)
2653+
df.loc[np.array([True], dtype=np.bool_), ["a"]] = df["b"]
2654+
tm.assert_frame_equal(df, expected)
2655+
26492656

26502657
class TestLocListlike:
26512658
@pytest.mark.parametrize("box", [lambda x: x, np.asarray, list])

0 commit comments

Comments
 (0)