Skip to content

ENH: Avoid copying unnecessary columns in setitem by splitting blocks for CoW #51031

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Feb 10, 2023
3 changes: 3 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1605,6 +1605,9 @@ def delete(self, loc) -> list[Block]:
values = self.values.delete(loc)
mgr_locs = self._mgr_locs.delete(loc)
return [type(self)(values, placement=mgr_locs, ndim=self.ndim)]
elif self.values.ndim == 1:
# We get here through to_stata
return []
return super().delete(loc)

@cache_readonly
Expand Down
32 changes: 21 additions & 11 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1250,7 +1250,10 @@ def value_getitem(placement):
self._known_consolidated = False

def _iset_split_block(
self, blkno_l: int, blk_locs: np.ndarray, value: ArrayLike | None = None
self,
blkno_l: int,
blk_locs: np.ndarray | list[int],
value: ArrayLike | None = None,
) -> None:
"""Removes columns from a block by splitting the block.

Expand All @@ -1271,12 +1274,8 @@ def _iset_split_block(

nbs_tup = tuple(blk.delete(blk_locs))
if value is not None:
# Argument 1 to "BlockPlacement" has incompatible type "BlockPlacement";
# expected "Union[int, slice, ndarray[Any, Any]]"
first_nb = new_block_2d(
value,
BlockPlacement(blk.mgr_locs[blk_locs]), # type: ignore[arg-type]
)
locs = blk.mgr_locs.as_array[blk_locs]
first_nb = new_block_2d(value, BlockPlacement(locs))
else:
first_nb = nbs_tup[0]
nbs_tup = tuple(nbs_tup[1:])
Expand All @@ -1287,6 +1286,10 @@ def _iset_split_block(
)
self.blocks = blocks_tup

if not nbs_tup and value is not None:
# No need to update anything if split did not happen
return

self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))

for i, nb in enumerate(nbs_tup):
Expand Down Expand Up @@ -1330,11 +1333,18 @@ def column_setitem(
intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
"""
if using_copy_on_write() and not self._has_no_reference(loc):
# otherwise perform Copy-on-Write and clear the reference
blkno = self.blknos[loc]
blocks = list(self.blocks)
blocks[blkno] = blocks[blkno].copy()
self.blocks = tuple(blocks)
# Split blocks to only copy the column we want to modify
blk_loc = self.blklocs[loc]
# Copy our values
values = self.blocks[blkno].values
if values.ndim == 1:
values = values.copy()
else:
# Use [blk_loc] as indexer to keep ndim=2, this already results in a
# copy
values = values[[blk_loc]]
self._iset_split_block(blkno, [blk_loc], values)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need to call this for the values.ndim == 1 case that already copied the values, or does this just also does the "wrapping in block and updating self.blocks" for that case?

And _iset_split_block assumes that values is always already copied, right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes and Yes

Not calling it would duplicate the logic


# this manager is only created temporarily to mutate the values in place
# so don't track references, otherwise the `setitem` would perform CoW again
Expand Down
36 changes: 36 additions & 0 deletions pandas/tests/copy_view/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -883,3 +883,39 @@ def test_dataframe_add_column_from_series():
df.loc[2, "new"] = 100
expected_s = Series([0, 11, 12])
tm.assert_series_equal(s, expected_s)


@pytest.mark.parametrize("val", [100, "a"])
@pytest.mark.parametrize(
"indexer_func, indexer",
[
(tm.loc, (0, "a")),
(tm.iloc, (0, 0)),
(tm.loc, ([0], "a")),
(tm.iloc, ([0], 0)),
(tm.loc, (slice(None), "a")),
(tm.iloc, (slice(None), 0)),
],
)
def test_set_value_copy_only_necessary_column(
using_copy_on_write, indexer_func, indexer, val
):
# When setting inplace, only copy column that is modified instead of the whole
# block (by splitting the block)
# TODO multi-block only for now
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
df_orig = df.copy()
view = df[:]

indexer_func(df)[indexer] = val

if using_copy_on_write:
assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
tm.assert_frame_equal(view, df_orig)
else:
assert np.shares_memory(get_array(df, "c"), get_array(view, "c"))
if val == "a":
assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
else:
assert np.shares_memory(get_array(df, "a"), get_array(view, "a"))