From b855f09e720b45fac3ac7edfd3cb78d173399f3d Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 6 Jan 2023 21:25:15 -0800 Subject: [PATCH 01/15] WIP: ENH: CoW support for replace --- pandas/core/frame.py | 2 +- pandas/core/generic.py | 4 +++- pandas/core/internals/managers.py | 32 ++++++++++++++++++++++++-- pandas/tests/copy_view/test_methods.py | 30 ++++++++++++++++++++++++ 4 files changed, 64 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6491081c54592..de363d3616e73 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5522,7 +5522,7 @@ def _replace_columnwise( DataFrame or None """ # Operate column-wise - res = self if inplace else self.copy() + res = self if inplace else self.copy(deep=None) ax = self.columns for i, ax_value in enumerate(ax): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 561422c868e91..a64eb9e337f98 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7045,6 +7045,7 @@ def replace( to_replace = [to_replace] if isinstance(to_replace, (tuple, list)): + # TODO: Consider copy-on-write for non-replaced columns's here if isinstance(self, ABCDataFrame): from pandas import Series @@ -7104,7 +7105,8 @@ def replace( if not self.size: if inplace: return None - return self.copy() + # TODO: Is it even worth doing the CoW here? + return self.copy(deep=None) if is_dict_like(to_replace): if is_dict_like(value): # {'A' : NA} -> {'A' : 0} diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c90787ee24277..b6cffac874aa4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1211,8 +1211,12 @@ def value_getitem(placement): return value[placement.indexer] # Accessing public blknos ensures the public versions are initialized + # print(f"Loc is {loc}") blknos = self.blknos[loc] + # print(f"Blknos is {blknos}") blklocs = self.blklocs[loc].copy() + # print(f"Blklocs is {blklocs}") + # print(f"Blocks is {self.blocks}") unfit_mgr_locs = [] unfit_val_locs = [] @@ -1223,8 +1227,32 @@ def value_getitem(placement): if inplace and blk.should_store(value): # Updating inplace -> check if we need to do Copy-on-Write if using_copy_on_write() and not self._has_no_reference_block(blkno_l): - blk.set_inplace(blk_locs, value_getitem(val_locs), copy=True) - self._clear_reference_block(blkno_l) + + # print("Hit here") + # print(f"Blk locs is {blk_locs}") + # print(f"Blkno_l is {blkno_l}") + prev_refs = self.refs[blkno_l] + leftover_blocks = blk.delete(blk_locs) + num_extra_blocks = len(leftover_blocks) + # Preserve refs for unchanged blocks + extra_refs = [prev_refs] * num_extra_blocks + self.refs = self.refs[:blkno_l] + extra_refs + self.refs[blkno_l:] + + # TODO: Are we sure we want 2D? + nb = new_block_2d(value, placement=blk._mgr_locs) + old_blocks = self.blocks + new_blocks = ( + old_blocks[:blkno_l] + + tuple(leftover_blocks[:blkno_l]) + + (nb,) + + tuple(leftover_blocks[blkno_l:]) + + old_blocks[blkno_l + num_extra_blocks :] + ) + self.blocks = new_blocks + + self._rebuild_blknos_and_blklocs() + # blk.set_inplace(blk_locs, value_getitem(val_locs), copy=True) + # self._clear_reference_block(blkno_l) else: blk.set_inplace(blk_locs, value_getitem(val_locs)) else: diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index af900fd5268ef..a95a8c41e1d9b 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -721,3 +721,33 @@ def test_squeeze(using_copy_on_write): # Without CoW the original will be modified assert np.shares_memory(series.values, get_array(df, "a")) assert df.loc[0, "a"] == 0 + + +@pytest.mark.parametrize( + "to_replace", + [ + {"a": 1, "b": 4}, + # Test CoW splits blocks to avoid copying unchanged columns + {"a": 1}, + # TODO: broken, fix by fixing Block.replace() + # 1 + ], +) +def test_replace(using_copy_on_write, to_replace): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]}) + df_orig = df.copy() + + df_replaced = df.replace(to_replace=to_replace, value=-1) + + # Should share memory regardless of CoW since squeeze is just an iloc + if using_copy_on_write: + if (df_replaced["b"] == df["b"]).all(): + assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) + + # mutating squeezed df triggers a copy-on-write for that column/block + df_replaced.loc[0, "c"] = -1 + if using_copy_on_write: + assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "a")) + + tm.assert_frame_equal(df, df_orig) From e27d7886c2f9e852b6869e17d05df9d1c9c91cb7 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 7 Jan 2023 11:36:12 -0800 Subject: [PATCH 02/15] Fix bugs, should work for 1-D --- pandas/core/internals/managers.py | 38 ++++++++++++++------------ pandas/tests/copy_view/test_methods.py | 14 ++++++---- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b6cffac874aa4..2b2416b65d16b 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1211,12 +1211,8 @@ def value_getitem(placement): return value[placement.indexer] # Accessing public blknos ensures the public versions are initialized - # print(f"Loc is {loc}") blknos = self.blknos[loc] - # print(f"Blknos is {blknos}") blklocs = self.blklocs[loc].copy() - # print(f"Blklocs is {blklocs}") - # print(f"Blocks is {self.blocks}") unfit_mgr_locs = [] unfit_val_locs = [] @@ -1227,30 +1223,38 @@ def value_getitem(placement): if inplace and blk.should_store(value): # Updating inplace -> check if we need to do Copy-on-Write if using_copy_on_write() and not self._has_no_reference_block(blkno_l): - - # print("Hit here") - # print(f"Blk locs is {blk_locs}") - # print(f"Blkno_l is {blkno_l}") prev_refs = self.refs[blkno_l] - leftover_blocks = blk.delete(blk_locs) - num_extra_blocks = len(leftover_blocks) + leftover_blocks = tuple(blk.delete(blk_locs)) # Preserve refs for unchanged blocks - extra_refs = [prev_refs] * num_extra_blocks - self.refs = self.refs[:blkno_l] + extra_refs + self.refs[blkno_l:] + # Add new block where old block was and remaining blocks at + # the end to avoid updating all block numbers + extra_refs = [prev_refs] * len(leftover_blocks) + self.refs = ( + self.refs[:blkno_l] + + [None] + + self.refs[blkno_l + 1 :] + + extra_refs + ) # TODO: Are we sure we want 2D? - nb = new_block_2d(value, placement=blk._mgr_locs) + # Also generalize this to bigger than 1-D locs + nb = new_block_2d( + value, + placement=BlockPlacement(slice(blk_locs[0], blk_locs[0] + 1)), + ) old_blocks = self.blocks new_blocks = ( old_blocks[:blkno_l] - + tuple(leftover_blocks[:blkno_l]) + (nb,) - + tuple(leftover_blocks[blkno_l:]) - + old_blocks[blkno_l + num_extra_blocks :] + + old_blocks[blkno_l + 1 :] + + leftover_blocks ) + self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) + for i, nb in enumerate(leftover_blocks): + self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) + self._blknos[nb.mgr_locs.indexer] = i + len(old_blocks) self.blocks = new_blocks - self._rebuild_blknos_and_blklocs() # blk.set_inplace(blk_locs, value_getitem(val_locs), copy=True) # self._clear_reference_block(blkno_l) else: diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index a95a8c41e1d9b..da7fb688f1ca0 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -724,22 +724,24 @@ def test_squeeze(using_copy_on_write): @pytest.mark.parametrize( - "to_replace", + "replace_kwargs", [ - {"a": 1, "b": 4}, + {"to_replace": {"a": 1, "b": 4}, "value": -1}, # Test CoW splits blocks to avoid copying unchanged columns - {"a": 1}, + {"to_replace": {"a": 1}, "value": -1}, + {"to_replace": {"b": 4}, "value": -1}, + # TODO: broken, fix by fixing Block.replace_list() + # {"to_replace": {"b": 1}}, # TODO: broken, fix by fixing Block.replace() # 1 ], ) -def test_replace(using_copy_on_write, to_replace): +def test_replace(using_copy_on_write, replace_kwargs): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]}) df_orig = df.copy() - df_replaced = df.replace(to_replace=to_replace, value=-1) + df_replaced = df.replace(**replace_kwargs) - # Should share memory regardless of CoW since squeeze is just an iloc if using_copy_on_write: if (df_replaced["b"] == df["b"]).all(): assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b")) From f638abdde3654463903f01d39f268f59c5a75eff Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 9 Jan 2023 12:23:25 -0800 Subject: [PATCH 03/15] Refactor to support more than one loc --- pandas/core/internals/managers.py | 55 +++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 2b2416b65d16b..b270767a52df4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1235,19 +1235,62 @@ def value_getitem(placement): + self.refs[blkno_l + 1 :] + extra_refs ) + if len(blk_locs) > 1: + # Add the refs for the other deleted blocks + self.refs += [None] * (len(blk_locs) - 1) + + # Fill before the first leftover block + nbs = [] + leftover_start_i = leftover_blocks[0].mgr_locs.as_slice.start + + if leftover_start_i != 0: + nbs.append( + new_block_2d( + np.tile(value, (leftover_start_i, 1)), + placement=BlockPlacement(slice(0, leftover_start_i)), + ) + ) + + # Every hole in between leftover blocks is where we need to insert + # a new block + for i in range(len(leftover_blocks) - 1): + curr_block = leftover_blocks[i] + next_block = leftover_blocks[i + 1] + + curr_end = curr_block.as_slice.end + next_start = next_block.as_slice.start + + num_to_fill = next_start - (curr_end - 1) + nb = new_block_2d( + np.tile(value, (num_to_fill, 1)), + placement=BlockPlacement( + slice(curr_end, curr_end + num_to_fill) + ), + ) + nbs.append(nb) + + # Fill after the last leftover block + last_del_loc = blk_locs[-1] + 1 + last_leftover_loc = leftover_blocks[-1].mgr_locs.as_slice.stop + if last_del_loc > last_leftover_loc: + diff = last_del_loc - last_leftover_loc + nbs.append( + new_block_2d( + np.tile(value, (diff, 1)), + placement=BlockPlacement( + slice(last_leftover_loc, last_del_loc) + ), + ) + ) - # TODO: Are we sure we want 2D? - # Also generalize this to bigger than 1-D locs - nb = new_block_2d( - value, - placement=BlockPlacement(slice(blk_locs[0], blk_locs[0] + 1)), - ) old_blocks = self.blocks + nb = nbs[0] new_blocks = ( old_blocks[:blkno_l] + (nb,) + old_blocks[blkno_l + 1 :] + leftover_blocks + + tuple(nbs) ) self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) for i, nb in enumerate(leftover_blocks): From bc3af5c28df1c4de73cfe9e5897e825addfab49e Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 10 Jan 2023 11:09:40 -0800 Subject: [PATCH 04/15] Don't access refs if None --- pandas/core/internals/managers.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b270767a52df4..7dff217492144 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1223,21 +1223,22 @@ def value_getitem(placement): if inplace and blk.should_store(value): # Updating inplace -> check if we need to do Copy-on-Write if using_copy_on_write() and not self._has_no_reference_block(blkno_l): - prev_refs = self.refs[blkno_l] leftover_blocks = tuple(blk.delete(blk_locs)) - # Preserve refs for unchanged blocks - # Add new block where old block was and remaining blocks at - # the end to avoid updating all block numbers - extra_refs = [prev_refs] * len(leftover_blocks) - self.refs = ( - self.refs[:blkno_l] - + [None] - + self.refs[blkno_l + 1 :] - + extra_refs - ) - if len(blk_locs) > 1: - # Add the refs for the other deleted blocks - self.refs += [None] * (len(blk_locs) - 1) + if self.refs is not None: + prev_refs = self.refs[blkno_l] + # Preserve refs for unchanged blocks + # Add new block where old block was and remaining blocks at + # the end to avoid updating all block numbers + extra_refs = [prev_refs] * len(leftover_blocks) + self.refs = ( + self.refs[:blkno_l] + + [None] + + self.refs[blkno_l + 1 :] + + extra_refs + ) + if len(blk_locs) > 1: + # Add the refs for the other deleted blocks + self.refs += [None] * (len(blk_locs) - 1) # Fill before the first leftover block nbs = [] From 66b957f4a735df2392daddc7a01fd3e231ed5f1d Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 12 Jan 2023 14:01:59 -0800 Subject: [PATCH 05/15] Flesh out tests, fix issues --- pandas/core/generic.py | 1 - pandas/core/internals/managers.py | 35 +++++++----------- pandas/tests/copy_view/test_internals.py | 46 ++++++++++++++++++++++++ pandas/tests/copy_view/test_methods.py | 6 ++-- 4 files changed, 63 insertions(+), 25 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a64eb9e337f98..f11bcdcdec18f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7105,7 +7105,6 @@ def replace( if not self.size: if inplace: return None - # TODO: Is it even worth doing the CoW here? return self.copy(deep=None) if is_dict_like(to_replace): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 7dff217492144..ac3177f4be45a 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1224,21 +1224,6 @@ def value_getitem(placement): # Updating inplace -> check if we need to do Copy-on-Write if using_copy_on_write() and not self._has_no_reference_block(blkno_l): leftover_blocks = tuple(blk.delete(blk_locs)) - if self.refs is not None: - prev_refs = self.refs[blkno_l] - # Preserve refs for unchanged blocks - # Add new block where old block was and remaining blocks at - # the end to avoid updating all block numbers - extra_refs = [prev_refs] * len(leftover_blocks) - self.refs = ( - self.refs[:blkno_l] - + [None] - + self.refs[blkno_l + 1 :] - + extra_refs - ) - if len(blk_locs) > 1: - # Add the refs for the other deleted blocks - self.refs += [None] * (len(blk_locs) - 1) # Fill before the first leftover block nbs = [] @@ -1258,8 +1243,8 @@ def value_getitem(placement): curr_block = leftover_blocks[i] next_block = leftover_blocks[i + 1] - curr_end = curr_block.as_slice.end - next_start = next_block.as_slice.start + curr_end = curr_block.mgr_locs.as_slice.stop + next_start = next_block.mgr_locs.as_slice.start num_to_fill = next_start - (curr_end - 1) nb = new_block_2d( @@ -1283,7 +1268,8 @@ def value_getitem(placement): ), ) ) - + # Add new block where old block was and remaining blocks at + # the end to avoid updating all block numbers old_blocks = self.blocks nb = nbs[0] new_blocks = ( @@ -1291,16 +1277,21 @@ def value_getitem(placement): + (nb,) + old_blocks[blkno_l + 1 :] + leftover_blocks - + tuple(nbs) + + tuple(nbs[1:]) ) + if self.refs is not None: + prev_refs = self.refs[blkno_l] + extra_refs = [prev_refs] * len(leftover_blocks) + self.refs[blkno_l] = None + self.refs += extra_refs + if len(blk_locs) > 1: + # Add the refs for the other deleted blocks + self.refs += [None] * (len(blk_locs) - 1) self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) for i, nb in enumerate(leftover_blocks): self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) self._blknos[nb.mgr_locs.indexer] = i + len(old_blocks) self.blocks = new_blocks - - # blk.set_inplace(blk_locs, value_getitem(val_locs), copy=True) - # self._clear_reference_block(blkno_l) else: blk.set_inplace(blk_locs, value_getitem(val_locs)) else: diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index 7a2965f2e1c61..648e027ec2228 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -5,6 +5,7 @@ import pandas as pd from pandas import DataFrame +import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -92,3 +93,48 @@ def test_switch_options(): subset.iloc[0, 0] = 0 # df updated with CoW disabled assert df.iloc[0, 0] == 0 + + +@td.skip_array_manager_invalid_test +@pytest.mark.parametrize( + "locs", + [ + [0], + [1], + [3], + [0, 1], + [0, 2], + [0, 1, 2], + [1, 2], + [1, 3], + [0, 5], + ], +) +def test_iset_splits_blocks_inplace(using_copy_on_write, locs): + # Nothing currently calls iset with + # more than 1 loc with inplace=True (only happens with inplace=False) + # but ensure that it works + df = DataFrame( + { + "a": [1, 2, 3], + "b": [4, 5, 6], + "c": [7, 8, 9], + "d": [10, 11, 12], + "e": [13, 14, 15], + "f": ["foo", "bar", "baz"], + } + ) + df_orig = df.copy() + df2 = df.copy(deep=None) # Trigger a CoW (if enabled, otherwise makes copy) + arr = np.array([-1, -2, -3]) + df2._mgr.iset(locs, arr, inplace=True) + + tm.assert_frame_equal(df, df_orig) + + if using_copy_on_write: + for i, col in enumerate(df.columns): + if i not in locs: + assert np.shares_memory(get_array(df, col), get_array(df2, col)) + else: + for col in df.columns: + assert not np.shares_memory(get_array(df, col), get_array(df2, col)) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index da7fb688f1ca0..e919c43a85bfe 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -730,9 +730,11 @@ def test_squeeze(using_copy_on_write): # Test CoW splits blocks to avoid copying unchanged columns {"to_replace": {"a": 1}, "value": -1}, {"to_replace": {"b": 4}, "value": -1}, - # TODO: broken, fix by fixing Block.replace_list() + {"to_replace": {"b": {4: 1}}}, + # TODO: Add these in a further optimization + # We would need to see which columns got replaced in the mask + # which could be expensive # {"to_replace": {"b": 1}}, - # TODO: broken, fix by fixing Block.replace() # 1 ], ) From 2266ab8786f8c8ce7559e2675ad0152458430d90 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 12 Jan 2023 14:50:52 -0800 Subject: [PATCH 06/15] fix tests --- pandas/tests/copy_view/test_internals.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index 648e027ec2228..fa8e73ce7ebb3 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -97,20 +97,19 @@ def test_switch_options(): @td.skip_array_manager_invalid_test @pytest.mark.parametrize( - "locs", + "locs, arr", [ - [0], - [1], - [3], - [0, 1], - [0, 2], - [0, 1, 2], - [1, 2], - [1, 3], - [0, 5], + ([0], np.array([-1, -2, -3])), + ([1], np.array([-1, -2, -3])), + ([5], np.array([-1, -2, -3])), + ([0, 1], np.array([-1, -2, -3])), + ([0, 2], np.array([-1, -2, -3])), + ([0, 1, 2], np.array([-1, -2, -3])), + ([1, 2], np.array([-1, -2, -3])), + ([1, 3], np.array([-1, -2, -3])), ], ) -def test_iset_splits_blocks_inplace(using_copy_on_write, locs): +def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr): # Nothing currently calls iset with # more than 1 loc with inplace=True (only happens with inplace=False) # but ensure that it works @@ -126,7 +125,6 @@ def test_iset_splits_blocks_inplace(using_copy_on_write, locs): ) df_orig = df.copy() df2 = df.copy(deep=None) # Trigger a CoW (if enabled, otherwise makes copy) - arr = np.array([-1, -2, -3]) df2._mgr.iset(locs, arr, inplace=True) tm.assert_frame_equal(df, df_orig) From 7aee8476bb334632d4d3e9d79185e87037a6fed3 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 13 Jan 2023 19:51:22 -0800 Subject: [PATCH 07/15] fixes --- pandas/core/internals/managers.py | 16 +++++++++++----- pandas/tests/copy_view/test_internals.py | 1 + 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f2858cde5c403..aff0d7db134c5 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1224,6 +1224,9 @@ def value_getitem(placement): # Updating inplace -> check if we need to do Copy-on-Write if using_copy_on_write() and not self._has_no_reference_block(blkno_l): leftover_blocks = tuple(blk.delete(blk_locs)) + vals = np.broadcast_to( + value_getitem(val_locs), blk.values[blk_locs].shape + ) # Fill before the first leftover block nbs = [] @@ -1232,13 +1235,14 @@ def value_getitem(placement): if leftover_start_i != 0: nbs.append( new_block_2d( - np.tile(value, (leftover_start_i, 1)), + vals[:leftover_start_i], placement=BlockPlacement(slice(0, leftover_start_i)), ) ) # Every hole in between leftover blocks is where we need to insert # a new block + curr_idx = leftover_start_i for i in range(len(leftover_blocks) - 1): curr_block = leftover_blocks[i] next_block = leftover_blocks[i + 1] @@ -1246,14 +1250,15 @@ def value_getitem(placement): curr_end = curr_block.mgr_locs.as_slice.stop next_start = next_block.mgr_locs.as_slice.start - num_to_fill = next_start - (curr_end - 1) + num_to_fill = next_start - curr_end nb = new_block_2d( - np.tile(value, (num_to_fill, 1)), + vals[curr_idx : curr_idx + num_to_fill], placement=BlockPlacement( slice(curr_end, curr_end + num_to_fill) ), ) nbs.append(nb) + curr_idx += num_to_fill # Fill after the last leftover block last_del_loc = blk_locs[-1] + 1 @@ -1262,7 +1267,7 @@ def value_getitem(placement): diff = last_del_loc - last_leftover_loc nbs.append( new_block_2d( - np.tile(value, (diff, 1)), + vals[curr_idx : curr_idx + diff], placement=BlockPlacement( slice(last_leftover_loc, last_del_loc) ), @@ -1271,6 +1276,7 @@ def value_getitem(placement): # Add new block where old block was and remaining blocks at # the end to avoid updating all block numbers old_blocks = self.blocks + nbs = tuple(nbs) nb = nbs[0] new_blocks = ( old_blocks[:blkno_l] @@ -1288,7 +1294,7 @@ def value_getitem(placement): # Add the refs for the other deleted blocks self.refs += [None] * (len(blk_locs) - 1) self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) - for i, nb in enumerate(leftover_blocks): + for i, nb in enumerate(leftover_blocks + nbs[1:]): self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) self._blknos[nb.mgr_locs.indexer] = i + len(old_blocks) self.blocks = new_blocks diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index fa8e73ce7ebb3..d9df1f0040769 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -107,6 +107,7 @@ def test_switch_options(): ([0, 1, 2], np.array([-1, -2, -3])), ([1, 2], np.array([-1, -2, -3])), ([1, 3], np.array([-1, -2, -3])), + ([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]]).T), ], ) def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr): From 6dc05e185e8d5a350d465d39c9bf397131983840 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 14 Jan 2023 20:10:17 +0100 Subject: [PATCH 08/15] Simplify block splitting logic --- pandas/core/internals/managers.py | 103 ++++++------------------------ 1 file changed, 18 insertions(+), 85 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 0d3ccf94b822b..0ed59bbde408d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1220,83 +1220,15 @@ def value_getitem(placement): if inplace and blk.should_store(value): # Updating inplace -> check if we need to do Copy-on-Write if using_copy_on_write() and not self._has_no_reference_block(blkno_l): - leftover_blocks = tuple(blk.delete(blk_locs)) - vals = np.broadcast_to( - value_getitem(val_locs), blk.values[blk_locs].shape - ) - - # Fill before the first leftover block - nbs = [] - leftover_start_i = leftover_blocks[0].mgr_locs.as_slice.start - - if leftover_start_i != 0: - nbs.append( - new_block_2d( - vals[:leftover_start_i], - placement=BlockPlacement(slice(0, leftover_start_i)), - ) - ) - - # Every hole in between leftover blocks is where we need to insert - # a new block - curr_idx = leftover_start_i - for i in range(len(leftover_blocks) - 1): - curr_block = leftover_blocks[i] - next_block = leftover_blocks[i + 1] - - curr_end = curr_block.mgr_locs.as_slice.stop - next_start = next_block.mgr_locs.as_slice.start - - num_to_fill = next_start - curr_end - nb = new_block_2d( - vals[curr_idx : curr_idx + num_to_fill], - placement=BlockPlacement( - slice(curr_end, curr_end + num_to_fill) - ), - ) - nbs.append(nb) - curr_idx += num_to_fill - - # Fill after the last leftover block - last_del_loc = blk_locs[-1] + 1 - last_leftover_loc = leftover_blocks[-1].mgr_locs.as_slice.stop - if last_del_loc > last_leftover_loc: - diff = last_del_loc - last_leftover_loc - nbs.append( - new_block_2d( - vals[curr_idx : curr_idx + diff], - placement=BlockPlacement( - slice(last_leftover_loc, last_del_loc) - ), - ) - ) - # Add new block where old block was and remaining blocks at - # the end to avoid updating all block numbers - old_blocks = self.blocks - nbs = tuple(nbs) - nb = nbs[0] - new_blocks = ( - old_blocks[:blkno_l] - + (nb,) - + old_blocks[blkno_l + 1 :] - + leftover_blocks - + tuple(nbs[1:]) + nbs_tup = tuple(blk.delete(blk_locs)) + first_nb = new_block_2d( + value_getitem(val_locs), BlockPlacement(blk.mgr_locs[blk_locs]) ) if self.refs is not None: - prev_refs = self.refs[blkno_l] - extra_refs = [prev_refs] * len(leftover_blocks) - self.refs[blkno_l] = None - self.refs += extra_refs - if len(blk_locs) > 1: - # Add the refs for the other deleted blocks - self.refs += [None] * (len(blk_locs) - 1) - self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) - for i, nb in enumerate(leftover_blocks + nbs[1:]): - self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) - self._blknos[nb.mgr_locs.indexer] = i + len(old_blocks) - self.blocks = new_blocks + self.refs.extend([self.refs[blkno_l]] * len(nbs_tup)) else: blk.set_inplace(blk_locs, value_getitem(val_locs)) + continue else: unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) unfit_val_locs.append(val_locs) @@ -1304,25 +1236,26 @@ def value_getitem(placement): # If all block items are unfit, schedule the block for removal. if len(val_locs) == len(blk.mgr_locs): removed_blknos.append(blkno_l) + continue else: nbs = blk.delete(blk_locs) # Add first block where old block was and remaining blocks at # the end to avoid updating all block numbers first_nb = nbs[0] nbs_tup = tuple(nbs[1:]) - nr_blocks = len(self.blocks) - blocks_tup = ( - self.blocks[:blkno_l] - + (first_nb,) - + self.blocks[blkno_l + 1 :] - + nbs_tup - ) - self.blocks = blocks_tup - self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb)) + nr_blocks = len(self.blocks) + blocks_tup = ( + self.blocks[:blkno_l] + + (first_nb,) + + self.blocks[blkno_l + 1 :] + + nbs_tup + ) + self.blocks = blocks_tup + self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb)) - for i, nb in enumerate(nbs_tup): - self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) - self._blknos[nb.mgr_locs.indexer] = i + nr_blocks + for i, nb in enumerate(nbs_tup): + self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) + self._blknos[nb.mgr_locs.indexer] = i + nr_blocks if len(removed_blknos): # Remove blocks & update blknos and refs accordingly From 6514131fe77bfb1664eec1a6742cdbd3145c8802 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 14 Jan 2023 20:37:59 +0100 Subject: [PATCH 09/15] Clear ref --- pandas/core/internals/managers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 0ed59bbde408d..07c8194227283 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1226,6 +1226,7 @@ def value_getitem(placement): ) if self.refs is not None: self.refs.extend([self.refs[blkno_l]] * len(nbs_tup)) + self._clear_reference_block(blkno) else: blk.set_inplace(blk_locs, value_getitem(val_locs)) continue From 384984223fa6f7861ff5618dbc427ae3f36f7eda Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 14 Jan 2023 20:42:02 +0100 Subject: [PATCH 10/15] Adjust test --- pandas/tests/copy_view/test_methods.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index b1b286b813d04..5a7efbb4bae8d 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -934,6 +934,10 @@ def test_replace(using_copy_on_write, replace_kwargs): if using_copy_on_write: assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "a")) + if "a" in replace_kwargs: + arr = get_array(df_replaced, "a") + df_replaced.loc[0, "a"] = 100 + assert np.shares_memory(get_array(df_replaced, "a"), arr) tm.assert_frame_equal(df, df_orig) From f2b8a97c79ee1cbb06bcff9af09dfa85ebbb4fbc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 14 Jan 2023 20:45:12 +0100 Subject: [PATCH 11/15] Fix test --- pandas/tests/copy_view/test_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 5a7efbb4bae8d..f5987cebea9c2 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -934,7 +934,7 @@ def test_replace(using_copy_on_write, replace_kwargs): if using_copy_on_write: assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "a")) - if "a" in replace_kwargs: + if "a" in replace_kwargs["to_replace"]: arr = get_array(df_replaced, "a") df_replaced.loc[0, "a"] = 100 assert np.shares_memory(get_array(df_replaced, "a"), arr) From c1d90ca3d0c03ddff143de4312b37d0171ee3b52 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 14 Jan 2023 21:30:57 +0100 Subject: [PATCH 12/15] Small fix --- pandas/core/internals/managers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 07c8194227283..1a0aba0778da5 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1226,7 +1226,7 @@ def value_getitem(placement): ) if self.refs is not None: self.refs.extend([self.refs[blkno_l]] * len(nbs_tup)) - self._clear_reference_block(blkno) + self._clear_reference_block(blkno_l) else: blk.set_inplace(blk_locs, value_getitem(val_locs)) continue From e8e1ff243e2c27db158fc9194a98f88d8850b08e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 14 Jan 2023 21:50:19 +0100 Subject: [PATCH 13/15] Fix 32 bit test --- pandas/tests/copy_view/test_internals.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index bd2dca5c80cfa..f16a262b2ea16 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -100,15 +100,15 @@ def test_switch_options(): @pytest.mark.parametrize( "locs, arr", [ - ([0], np.array([-1, -2, -3])), - ([1], np.array([-1, -2, -3])), - ([5], np.array([-1, -2, -3])), - ([0, 1], np.array([-1, -2, -3])), - ([0, 2], np.array([-1, -2, -3])), - ([0, 1, 2], np.array([-1, -2, -3])), - ([1, 2], np.array([-1, -2, -3])), - ([1, 3], np.array([-1, -2, -3])), - ([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]]).T), + ([0], np.array([-1, -2, -3], dtype=np.intp)), + ([1], np.array([-1, -2, -3], dtype=np.intp)), + ([5], np.array([-1, -2, -3], dtype=np.intp)), + ([0, 1], np.array([-1, -2, -3], dtype=np.intp)), + ([0, 2], np.array([-1, -2, -3], dtype=np.intp)), + ([0, 1, 2], np.array([-1, -2, -3], dtype=np.intp)), + ([1, 2], np.array([-1, -2, -3], dtype=np.intp)), + ([1, 3], np.array([-1, -2, -3], dtype=np.intp)), + ([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]], dtype=np.intp).T), ], ) def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr): @@ -122,9 +122,10 @@ def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr): "c": [7, 8, 9], "d": [10, 11, 12], "e": [13, 14, 15], - "f": ["foo", "bar", "baz"], - } + }, + dtype=np.intp, ) + df["f"] = ["a", "b", "c"] df_orig = df.copy() df2 = df.copy(deep=None) # Trigger a CoW (if enabled, otherwise makes copy) df2._mgr.iset(locs, arr, inplace=True) From 9e84de41e44860325ab55630ebaccc4c5ba9d7ec Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 14 Jan 2023 22:49:33 +0100 Subject: [PATCH 14/15] Adjust test --- pandas/tests/copy_view/test_internals.py | 25 ++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index f16a262b2ea16..506ae7d3465c5 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -97,21 +97,22 @@ def test_switch_options(): @td.skip_array_manager_invalid_test +@pytest.mark.parametrize("dtype", [np.intp, np.int8]) @pytest.mark.parametrize( "locs, arr", [ - ([0], np.array([-1, -2, -3], dtype=np.intp)), - ([1], np.array([-1, -2, -3], dtype=np.intp)), - ([5], np.array([-1, -2, -3], dtype=np.intp)), - ([0, 1], np.array([-1, -2, -3], dtype=np.intp)), - ([0, 2], np.array([-1, -2, -3], dtype=np.intp)), - ([0, 1, 2], np.array([-1, -2, -3], dtype=np.intp)), - ([1, 2], np.array([-1, -2, -3], dtype=np.intp)), - ([1, 3], np.array([-1, -2, -3], dtype=np.intp)), - ([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]], dtype=np.intp).T), + ([0], np.array([-1, -2, -3])), + ([1], np.array([-1, -2, -3])), + ([5], np.array([-1, -2, -3])), + ([0, 1], np.array([[-1, -2, -3], [-4, -5, -6]]).T), + ([0, 2], np.array([[-1, -2, -3], [-4, -5, -6]]).T), + ([0, 1, 2], np.array([[-1, -2, -3], [-4, -5, -6], [-4, -5, -6]]).T), + ([1, 2], np.array([[-1, -2, -3], [-4, -5, -6]]).T), + ([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]]).T), + ([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]]).T), ], ) -def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr): +def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype): # Nothing currently calls iset with # more than 1 loc with inplace=True (only happens with inplace=False) # but ensure that it works @@ -122,10 +123,10 @@ def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr): "c": [7, 8, 9], "d": [10, 11, 12], "e": [13, 14, 15], + "f": ["a", "b", "c"], }, - dtype=np.intp, ) - df["f"] = ["a", "b", "c"] + arr = arr.astype(dtype) df_orig = df.copy() df2 = df.copy(deep=None) # Trigger a CoW (if enabled, otherwise makes copy) df2._mgr.iset(locs, arr, inplace=True) From 7988a163b7898f6965ea344b2eb8e32f3b08c08f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 16 Jan 2023 17:19:06 +0100 Subject: [PATCH 15/15] Update pandas/tests/copy_view/test_methods.py Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- pandas/tests/copy_view/test_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index f5987cebea9c2..d1184c3a70610 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -932,7 +932,7 @@ def test_replace(using_copy_on_write, replace_kwargs): # mutating squeezed df triggers a copy-on-write for that column/block df_replaced.loc[0, "c"] = -1 if using_copy_on_write: - assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "a")) + assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) if "a" in replace_kwargs["to_replace"]: arr = get_array(df_replaced, "a")