Skip to content

ENH: Avoid copying unnecessary columns in setitem by splitting blocks for CoW #51031

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Feb 10, 2023
3 changes: 3 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1605,6 +1605,9 @@ def delete(self, loc) -> list[Block]:
values = self.values.delete(loc)
mgr_locs = self._mgr_locs.delete(loc)
return [type(self)(values, placement=mgr_locs, ndim=self.ndim)]
elif self.values.ndim == 1:
# We get here through to_stata
return []
return super().delete(loc)

@cache_readonly
Expand Down
30 changes: 19 additions & 11 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1250,7 +1250,10 @@ def value_getitem(placement):
self._known_consolidated = False

def _iset_split_block(
self, blkno_l: int, blk_locs: np.ndarray, value: ArrayLike | None = None
self,
blkno_l: int,
blk_locs: np.ndarray | list[int],
value: ArrayLike | None = None,
) -> None:
"""Removes columns from a block by splitting the block.

Expand All @@ -1271,12 +1274,8 @@ def _iset_split_block(

nbs_tup = tuple(blk.delete(blk_locs))
if value is not None:
# Argument 1 to "BlockPlacement" has incompatible type "BlockPlacement";
# expected "Union[int, slice, ndarray[Any, Any]]"
first_nb = new_block_2d(
value,
BlockPlacement(blk.mgr_locs[blk_locs]), # type: ignore[arg-type]
)
locs = blk.mgr_locs.as_array[blk_locs]
first_nb = new_block_2d(value, BlockPlacement(locs))
else:
first_nb = nbs_tup[0]
nbs_tup = tuple(nbs_tup[1:])
Expand All @@ -1287,6 +1286,10 @@ def _iset_split_block(
)
self.blocks = blocks_tup

if not nbs_tup and value is not None:
# No need to update anything if split did not happen
return

self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))

for i, nb in enumerate(nbs_tup):
Expand Down Expand Up @@ -1330,11 +1333,16 @@ def column_setitem(
intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
"""
if using_copy_on_write() and not self._has_no_reference(loc):
# otherwise perform Copy-on-Write and clear the reference
blkno = self.blknos[loc]
blocks = list(self.blocks)
blocks[blkno] = blocks[blkno].copy()
self.blocks = tuple(blocks)
# Split blocks to only copy the column we want to modify
blk_loc = self.blklocs[loc]
# Copy our values
values = self.blocks[blkno].values
if values.ndim == 1:
values = values.copy()
else:
values = values[[blk_loc]]
self._iset_split_block(blkno, [blk_loc], values)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need to call this for the values.ndim == 1 case that already copied the values, or does this just also does the "wrapping in block and updating self.blocks" for that case?

And _iset_split_block assumes that values is always already copied, right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes and Yes

Not calling it would duplicate the logic


# this manager is only created temporarily to mutate the values in place
# so don't track references, otherwise the `setitem` would perform CoW again
Expand Down
32 changes: 32 additions & 0 deletions pandas/tests/copy_view/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -883,3 +883,35 @@ def test_dataframe_add_column_from_series():
df.loc[2, "new"] = 100
expected_s = Series([0, 11, 12])
tm.assert_series_equal(s, expected_s)


@pytest.mark.parametrize("val", [100, "a"])
@pytest.mark.parametrize(
"method",
[
lambda df: df.loc[0, "a"],
lambda df: df.iloc[0, 0],
lambda df: df.loc[[0], "a"],
lambda df: df.iloc[[0], 0],
lambda df: df.loc[:, "a"],
lambda df: df.iloc[:, 0],
],
)
def test_set_value_loc_copy_only_necessary_column(using_copy_on_write, method, val):
# Only copy column that is modified, multi-block only for now
Copy link
Member

@jorisvandenbossche jorisvandenbossche Feb 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def test_set_value_loc_copy_only_necessary_column(using_copy_on_write, method, val):
# Only copy column that is modified, multi-block only for now
def test_set_value_copy_only_necessary_column(using_copy_on_write, method, val):
# When setting inplace, only copy column that is modified instead of the whole
# block (by splitting the block)
# TODO multi-block only for now

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed

df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1.5})
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1.5})
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})

(small nitpick, but almost every test in this file is already using that)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed

df_orig = df.copy()
view = df[:]

df.loc[0, "a"] = val
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this use method ? Hmm, but that doesn't work for setitem. So you will have to parametrize with tuples of (method, indexer), so you can do method(df)[indexer] = val?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah that's probably why I missed it. That's a pita...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added


if using_copy_on_write:
assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
tm.assert_frame_equal(view, df_orig)
else:
assert np.shares_memory(get_array(df, "c"), get_array(view, "c"))
if val == "a":
assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
else:
assert np.shares_memory(get_array(df, "a"), get_array(view, "a"))