From 94c982e23b6f26ad9bbb02c14f99b59471bcb82f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 24 Dec 2022 11:04:12 +0100 Subject: [PATCH 01/22] Start with implmenting concat for cow --- pandas/conftest.py | 1 + pandas/core/reshape/concat.py | 24 ++++++++++++++++-- pandas/tests/copy_view/test_functions.py | 32 ++++++++++++++++++++++++ pandas/tests/copy_view/test_methods.py | 17 +++++++++++++ 4 files changed, 72 insertions(+), 2 deletions(-) create mode 100644 pandas/tests/copy_view/test_functions.py diff --git a/pandas/conftest.py b/pandas/conftest.py index 3b167d9ef4fe2..14c4e59778c9a 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1903,6 +1903,7 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ + pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index aced5a73a1f02..8375f36e881cd 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -17,6 +17,12 @@ import numpy as np +from pandas._config import ( + config, + get_option, +) + +from pandas._libs import lib from pandas._typing import ( Axis, AxisInt, @@ -47,6 +53,7 @@ get_unanimous_names, ) from pandas.core.internals import concatenate_managers +from pandas.core.internals.construction import dict_to_mgr if TYPE_CHECKING: from pandas import ( @@ -155,7 +162,7 @@ def concat( names=None, verify_integrity: bool = False, sort: bool = False, - copy: bool = True, + copy: bool = lib.NoDefault, ) -> DataFrame | Series: """ Concatenate pandas objects along a particular axis. @@ -363,6 +370,12 @@ def concat( 0 1 2 1 3 4 """ + if copy is lib.NoDefault: + if config.get_option("mode.copy_on_write"): + copy = None + else: + copy = True + op = _Concatenator( objs, axis=axis, @@ -584,7 +597,14 @@ def get_result(self): cons = sample._constructor_expanddim index, columns = self.new_axes - df = cons(data, index=index, copy=self.copy) + mgr = dict_to_mgr( + data, + index, + None, + copy=self.copy, + typ=get_option("mode.data_manager"), + ) + df = cons(mgr, copy=self.copy) df.columns = columns return df.__finalize__(self, method="concat") diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py new file mode 100644 index 0000000000000..5e74ffd062206 --- /dev/null +++ b/pandas/tests/copy_view/test_functions.py @@ -0,0 +1,32 @@ +import numpy as np + +from pandas import ( + DataFrame, + concat, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +def test_concat_frames(using_copy_on_write): + df = DataFrame({"b": ["a"] * 3}) + df2 = DataFrame({"a": ["a"] * 3}) + + result = concat([df, df2], axis=1) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + else: + assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + + result.iloc[0, 0] = "d" + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + + result.iloc[0, 1] = "d" + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + tm.assert_frame_equal(df, result) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index f5c7b31e59bc5..8c76720b1c1fb 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -339,6 +339,23 @@ def test_assign(using_copy_on_write): tm.assert_frame_equal(df, df_orig) +def test_round(using_copy_on_write): + df = DataFrame({"a": [1, 2], "b": "c"}) + df2 = df.round() + df_orig = df.copy() + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + + df2.iloc[0, 1] = "d" + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + tm.assert_frame_equal(df, df_orig) + + def test_reorder_levels(using_copy_on_write): index = MultiIndex.from_tuples( [(1, 1), (1, 2), (2, 1), (2, 2)], names=["one", "two"] From 0db1c3949993619e98e193ea6e82860754bcd484 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 30 Dec 2022 12:21:11 +0100 Subject: [PATCH 02/22] Fix concat logic --- pandas/conftest.py | 1 - pandas/core/reshape/concat.py | 7 ++++++- pandas/tests/copy_view/test_functions.py | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 14c4e59778c9a..3b167d9ef4fe2 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1903,7 +1903,6 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ - pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 8375f36e881cd..d0cff699f7cd6 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -54,6 +54,7 @@ ) from pandas.core.internals import concatenate_managers from pandas.core.internals.construction import dict_to_mgr +from pandas.core.internals.managers import _using_copy_on_write if TYPE_CHECKING: from pandas import ( @@ -604,6 +605,8 @@ def get_result(self): copy=self.copy, typ=get_option("mode.data_manager"), ) + if _using_copy_on_write() and not self.copy: + mgr = mgr.copy(deep=False) df = cons(mgr, copy=self.copy) df.columns = columns return df.__finalize__(self, method="concat") @@ -631,8 +634,10 @@ def get_result(self): new_data = concatenate_managers( mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy ) - if not self.copy: + if not self.copy and not _using_copy_on_write(): new_data._consolidate_inplace() + elif _using_copy_on_write() and not self.copy: + new_data = new_data.copy(deep=False) cons = sample._constructor return cons(new_data).__finalize__(self, method="concat") diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 5e74ffd062206..29e6ed570e3c3 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -11,7 +11,7 @@ def test_concat_frames(using_copy_on_write): df = DataFrame({"b": ["a"] * 3}) df2 = DataFrame({"a": ["a"] * 3}) - + df_orig = df.copy() result = concat([df, df2], axis=1) if using_copy_on_write: @@ -29,4 +29,4 @@ def test_concat_frames(using_copy_on_write): result.iloc[0, 1] = "d" if using_copy_on_write: assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) - tm.assert_frame_equal(df, result) + tm.assert_frame_equal(df, df_orig) From fc564f8b41f73384b724c47489369ddb30444ef7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 30 Dec 2022 13:05:36 +0100 Subject: [PATCH 03/22] Add series test --- pandas/tests/copy_view/test_functions.py | 25 ++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 29e6ed570e3c3..09206f8e7d82a 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -2,6 +2,7 @@ from pandas import ( DataFrame, + Series, concat, ) import pandas._testing as tm @@ -30,3 +31,27 @@ def test_concat_frames(using_copy_on_write): if using_copy_on_write: assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) tm.assert_frame_equal(df, df_orig) + + +def test_concat_series(using_copy_on_write): + ser = Series([1, 2], name="a") + ser2 = Series([3, 4], name="b") + ser_orig = ser.copy() + result = concat([ser, ser2], axis=1) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), ser.values) + assert np.shares_memory(get_array(result, "b"), ser2.values) + else: + assert not np.shares_memory(get_array(result, "a"), ser.values) + assert not np.shares_memory(get_array(result, "b"), ser2.values) + + result.iloc[0, 0] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), ser.values) + assert np.shares_memory(get_array(result, "b"), ser2.values) + + result.iloc[0, 1] = 1000 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "b"), ser2.values) + tm.assert_series_equal(ser, ser_orig) From a7165c2013b45b233af19184f2c18ee89b1dbfa3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 30 Dec 2022 13:07:31 +0100 Subject: [PATCH 04/22] Clean up --- pandas/core/reshape/concat.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index d0cff699f7cd6..010ff4560bfc7 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -17,12 +17,8 @@ import numpy as np -from pandas._config import ( - config, - get_option, -) +from pandas._config import get_option -from pandas._libs import lib from pandas._typing import ( Axis, AxisInt, @@ -163,7 +159,7 @@ def concat( names=None, verify_integrity: bool = False, sort: bool = False, - copy: bool = lib.NoDefault, + copy: bool | None = None, ) -> DataFrame | Series: """ Concatenate pandas objects along a particular axis. @@ -371,12 +367,6 @@ def concat( 0 1 2 1 3 4 """ - if copy is lib.NoDefault: - if config.get_option("mode.copy_on_write"): - copy = None - else: - copy = True - op = _Concatenator( objs, axis=axis, From 20a62bb33d5894832f74632b994c563526bea027 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 30 Dec 2022 13:57:51 +0100 Subject: [PATCH 05/22] Skip test --- pandas/core/reshape/concat.py | 6 ++++++ pandas/tests/io/pytables/test_store.py | 2 ++ 2 files changed, 8 insertions(+) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 010ff4560bfc7..1069b7dcfd191 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -367,6 +367,12 @@ def concat( 0 1 2 1 3 4 """ + if copy is None: + if _using_copy_on_write(): + copy = False + else: + copy = True + op = _Concatenator( objs, axis=axis, diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 1263d61b55cd5..9b11df8eaf58d 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -23,6 +23,7 @@ timedelta_range, ) import pandas._testing as tm +from pandas.core.internals.managers import _using_copy_on_write from pandas.tests.io.pytables.common import ( _maybe_remove, ensure_clean_store, @@ -1009,6 +1010,7 @@ def test_to_hdf_with_object_column_names(tmp_path, setup_path): assert len(result) +@pytest.mark.skipif(_using_copy_on_write(), reason="strides buggy with cow") def test_hdfstore_strides(setup_path): # GH22073 df = DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) From bca212340375849ec788858bfd403ec4bbfc80bf Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 30 Dec 2022 15:46:51 +0100 Subject: [PATCH 06/22] Fix test --- pandas/tests/reshape/concat/test_concat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 3dc6f2404444b..fb59985e42636 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -50,7 +50,7 @@ def test_append_concat(self): assert isinstance(result.index, PeriodIndex) assert result.index[0] == s1.index[0] - def test_concat_copy(self, using_array_manager): + def test_concat_copy(self, using_array_manager, using_copy_on_write): df = DataFrame(np.random.randn(4, 3)) df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1)) df3 = DataFrame({5: "foo"}, index=range(4)) @@ -81,7 +81,7 @@ def test_concat_copy(self, using_array_manager): result = concat([df, df2, df3, df4], axis=1, copy=False) for arr in result._mgr.arrays: if arr.dtype.kind == "f": - if using_array_manager: + if using_array_manager or using_copy_on_write: # this is a view on some array in either df or df4 assert any( np.shares_memory(arr, other) From 3c4abd210c956f852f6f6cc057d63c39ea3495e6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 7 Jan 2023 12:46:38 +0100 Subject: [PATCH 07/22] Add check --- pandas/conftest.py | 1 + pandas/tests/copy_view/test_functions.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/pandas/conftest.py b/pandas/conftest.py index 3b167d9ef4fe2..14c4e59778c9a 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1903,6 +1903,7 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ + pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 09206f8e7d82a..29a90a42ed246 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -37,6 +37,7 @@ def test_concat_series(using_copy_on_write): ser = Series([1, 2], name="a") ser2 = Series([3, 4], name="b") ser_orig = ser.copy() + ser2_orig = ser2.copy() result = concat([ser, ser2], axis=1) if using_copy_on_write: @@ -55,3 +56,4 @@ def test_concat_series(using_copy_on_write): if using_copy_on_write: assert not np.shares_memory(get_array(result, "b"), ser2.values) tm.assert_series_equal(ser, ser_orig) + tm.assert_series_equal(ser2, ser2_orig) From 8bc3272cb3f20e8431bb28f15515dcca229f7bac Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 7 Jan 2023 13:08:22 +0100 Subject: [PATCH 08/22] Push logic down --- pandas/conftest.py | 1 - pandas/core/internals/concat.py | 18 ++++++++++++++++-- pandas/core/reshape/concat.py | 12 +++++------- pandas/tests/copy_view/test_functions.py | 24 ++++++++++++++++++++++++ 4 files changed, 45 insertions(+), 10 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 14c4e59778c9a..3b167d9ef4fe2 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1903,7 +1903,6 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ - pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 364025d583b7d..d2c2d3b284cb7 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -7,6 +7,7 @@ Sequence, cast, ) +import weakref import numpy as np @@ -61,7 +62,10 @@ ensure_block_shape, new_block_2d, ) -from pandas.core.internals.managers import BlockManager +from pandas.core.internals.managers import ( + BlockManager, + using_copy_on_write, +) if TYPE_CHECKING: from pandas import Index @@ -267,6 +271,8 @@ def _concat_managers_axis0( offset = 0 blocks = [] + refs = [] + parent = None for i, mgr in enumerate(mgrs): # If we already reindexed, then we definitely don't need another copy made_copy = had_reindexers[i] @@ -283,8 +289,16 @@ def _concat_managers_axis0( nb._mgr_locs = nb._mgr_locs.add(offset) blocks.append(nb) + if not made_copy and not copy and using_copy_on_write(): + refs.extend([weakref.ref(blk) for blk in mgr.blocks]) + parent = mgr + offset += len(mgr.items) - return BlockManager(tuple(blocks), axes) + + result = BlockManager(tuple(blocks), axes) + result.parent = parent + result.refs = refs if refs else None + return result def _maybe_reindex_columns_na_proxy( diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 1069b7dcfd191..25c68ac436a47 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -50,7 +50,7 @@ ) from pandas.core.internals import concatenate_managers from pandas.core.internals.construction import dict_to_mgr -from pandas.core.internals.managers import _using_copy_on_write +from pandas.core.internals.managers import using_copy_on_write if TYPE_CHECKING: from pandas import ( @@ -368,7 +368,7 @@ def concat( 1 3 4 """ if copy is None: - if _using_copy_on_write(): + if using_copy_on_write(): copy = False else: copy = True @@ -601,9 +601,9 @@ def get_result(self): copy=self.copy, typ=get_option("mode.data_manager"), ) - if _using_copy_on_write() and not self.copy: + if using_copy_on_write() and not self.copy: mgr = mgr.copy(deep=False) - df = cons(mgr, copy=self.copy) + df = cons(mgr, copy=False) df.columns = columns return df.__finalize__(self, method="concat") @@ -630,10 +630,8 @@ def get_result(self): new_data = concatenate_managers( mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy ) - if not self.copy and not _using_copy_on_write(): + if not self.copy and not using_copy_on_write(): new_data._consolidate_inplace() - elif _using_copy_on_write() and not self.copy: - new_data = new_data.copy(deep=False) cons = sample._constructor return cons(new_data).__finalize__(self, method="concat") diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 29a90a42ed246..56c20845ee5c9 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -33,6 +33,30 @@ def test_concat_frames(using_copy_on_write): tm.assert_frame_equal(df, df_orig) +def test_concat_frames_updating_input(using_copy_on_write): + df = DataFrame({"b": ["a"] * 3}) + df2 = DataFrame({"a": ["a"] * 3}) + result = concat([df, df2], axis=1) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + else: + assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + + expected = result.copy() + df.iloc[0, 0] = "d" + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + + df2.iloc[0, 0] = "d" + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + tm.assert_frame_equal(result, expected) + + def test_concat_series(using_copy_on_write): ser = Series([1, 2], name="a") ser2 = Series([3, 4], name="b") From 79cedd433d5a483eea9d6de1c88e58169062e313 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 7 Jan 2023 13:09:59 +0100 Subject: [PATCH 09/22] Fix test --- pandas/tests/io/pytables/test_store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 93ba5e00ceec2..e1d02d3f76f9b 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -23,7 +23,7 @@ timedelta_range, ) import pandas._testing as tm -from pandas.core.internals.managers import _using_copy_on_write +from pandas.core.internals.managers import using_copy_on_write from pandas.tests.io.pytables.common import ( _maybe_remove, ensure_clean_store, @@ -1010,7 +1010,7 @@ def test_to_hdf_with_object_column_names(tmp_path, setup_path): assert len(result) -@pytest.mark.skipif(_using_copy_on_write(), reason="strides buggy with cow") +@pytest.mark.skipif(using_copy_on_write(), reason="strides buggy with cow") def test_hdfstore_strides(setup_path): # GH22073 df = DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) From 6c8a7049d37883aa80d83f49d7c9b050a88a8954 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 7 Jan 2023 22:47:47 +0100 Subject: [PATCH 10/22] Fix typing --- pandas/core/internals/concat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index d2c2d3b284cb7..442aced639553 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -4,6 +4,8 @@ import itertools from typing import ( TYPE_CHECKING, + List, + Optional, Sequence, cast, ) @@ -297,7 +299,7 @@ def _concat_managers_axis0( result = BlockManager(tuple(blocks), axes) result.parent = parent - result.refs = refs if refs else None + result.refs = cast(Optional[List[Optional[weakref.ref]]], refs) if refs else None return result From dddc9f0f7ef17a73b46ba98f51e27d1345ffeb8b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 13 Jan 2023 10:46:26 +0100 Subject: [PATCH 11/22] Fix annotation --- pandas/core/internals/concat.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 442aced639553..68e68b84de3e2 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -4,8 +4,6 @@ import itertools from typing import ( TYPE_CHECKING, - List, - Optional, Sequence, cast, ) @@ -273,7 +271,7 @@ def _concat_managers_axis0( offset = 0 blocks = [] - refs = [] + refs: list[weakref.ref | None] = [] parent = None for i, mgr in enumerate(mgrs): # If we already reindexed, then we definitely don't need another copy @@ -299,7 +297,7 @@ def _concat_managers_axis0( result = BlockManager(tuple(blocks), axes) result.parent = parent - result.refs = cast(Optional[List[Optional[weakref.ref]]], refs) if refs else None + result.refs = refs if refs else None return result From d18416dd586c8ec3a17e0fc258e2bace45baf9a6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 13 Jan 2023 14:35:26 +0100 Subject: [PATCH 12/22] Fix chained concat --- pandas/core/internals/concat.py | 6 +++--- pandas/tests/copy_view/test_functions.py | 25 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 68e68b84de3e2..ffa1816ebeab8 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -272,7 +272,7 @@ def _concat_managers_axis0( offset = 0 blocks = [] refs: list[weakref.ref | None] = [] - parent = None + parents: list = [] for i, mgr in enumerate(mgrs): # If we already reindexed, then we definitely don't need another copy made_copy = had_reindexers[i] @@ -291,12 +291,12 @@ def _concat_managers_axis0( if not made_copy and not copy and using_copy_on_write(): refs.extend([weakref.ref(blk) for blk in mgr.blocks]) - parent = mgr + parents.append(mgr) offset += len(mgr.items) result = BlockManager(tuple(blocks), axes) - result.parent = parent + result.parent = parents if parents else None result.refs = refs if refs else None return result diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 56c20845ee5c9..cd27b619de820 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -81,3 +81,28 @@ def test_concat_series(using_copy_on_write): assert not np.shares_memory(get_array(result, "b"), ser2.values) tm.assert_series_equal(ser, ser_orig) tm.assert_series_equal(ser2, ser2_orig) + + +def test_chained_concat(using_copy_on_write): + df1 = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + + df2 = DataFrame({"c": [4, 5, 6]}) + + df3 = DataFrame({"d": [4, 5, 6]}) + result = concat([concat([df1, df2], axis=1), df3], axis=1) + expected = result.copy() + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "c"), get_array(df2, "c")) + assert np.shares_memory(get_array(result, "d"), get_array(df3, "d")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert not np.shares_memory(get_array(result, "c"), get_array(df2, "c")) + assert not np.shares_memory(get_array(result, "d"), get_array(df3, "d")) + + df1.iloc[0, 0] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + + tm.assert_frame_equal(result, expected) From 5ccf23e5e336e266beddf66272df815c170291c4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 13 Jan 2023 14:36:54 +0100 Subject: [PATCH 13/22] Pass to bm --- pandas/conftest.py | 1 + pandas/core/internals/concat.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index e4e32ab4a7350..b594befc7fac1 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1903,6 +1903,7 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ + pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index ffa1816ebeab8..ba9da21e305f7 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -295,7 +295,7 @@ def _concat_managers_axis0( offset += len(mgr.items) - result = BlockManager(tuple(blocks), axes) + result = BlockManager(tuple(blocks), axes, parent=parents, refs=refs) result.parent = parents if parents else None result.refs = refs if refs else None return result From 7648b23276428809a19199bf46e2a26efb2ab6dc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 13 Jan 2023 14:44:47 +0100 Subject: [PATCH 14/22] Fix series case --- pandas/conftest.py | 1 - pandas/core/internals/concat.py | 8 +++++--- pandas/core/reshape/concat.py | 4 +++- pandas/tests/copy_view/test_functions.py | 25 ++++++++++++++++++++++++ 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index b594befc7fac1..e4e32ab4a7350 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1903,7 +1903,6 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ - pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index ba9da21e305f7..7bf5b1f06198c 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -295,9 +295,11 @@ def _concat_managers_axis0( offset += len(mgr.items) - result = BlockManager(tuple(blocks), axes, parent=parents, refs=refs) - result.parent = parents if parents else None - result.refs = refs if refs else None + result_parents = parents if parents else None + result_ref = refs if refs else None + result = BlockManager(tuple(blocks), axes, parent=result_parents, refs=result_ref) + result.parent = result_parents + result.refs = result_ref return result diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 25c68ac436a47..2df4a9e7b7c0f 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -14,6 +14,7 @@ cast, overload, ) +import weakref import numpy as np @@ -602,7 +603,8 @@ def get_result(self): typ=get_option("mode.data_manager"), ) if using_copy_on_write() and not self.copy: - mgr = mgr.copy(deep=False) + mgr.parent = [obj._mgr for obj in self.objs] + mgr.refs = [weakref.ref(obj._mgr.blocks[0]) for obj in self.objs] df = cons(mgr, copy=False) df.columns = columns return df.__finalize__(self, method="concat") diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index cd27b619de820..d1e51ae5079c3 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -106,3 +106,28 @@ def test_chained_concat(using_copy_on_write): assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) tm.assert_frame_equal(result, expected) + + +def test_concat_series_parent_obj(using_copy_on_write): + ser = Series([1, 2], name="a") + ser2 = Series([3, 4], name="b") + expected = DataFrame({"a": [1, 2], "b": [3, 4]}) + result = concat([ser, ser2], axis=1) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(ser, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(ser, "a")) + assert not np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) + + ser.iloc[0] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(ser, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) + tm.assert_frame_equal(result, expected) + + ser2.iloc[0] = 1000 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) + tm.assert_frame_equal(result, expected) From 99da573acc47319ef28f42bec741b96a9d16a3fc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 13 Jan 2023 15:45:41 +0100 Subject: [PATCH 15/22] Update pandas/core/internals/concat.py Co-authored-by: Joris Van den Bossche --- pandas/core/internals/concat.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 7bf5b1f06198c..028147e907e55 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -298,8 +298,6 @@ def _concat_managers_axis0( result_parents = parents if parents else None result_ref = refs if refs else None result = BlockManager(tuple(blocks), axes, parent=result_parents, refs=result_ref) - result.parent = result_parents - result.refs = result_ref return result From a6afa8bc4ba32181fae93643a82206addb44ba26 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 13 Jan 2023 17:05:28 +0100 Subject: [PATCH 16/22] Fix concat --- pandas/core/reshape/concat.py | 6 +++-- pandas/tests/copy_view/test_functions.py | 29 ++++++++++++++++++++---- pandas/tests/io/pytables/test_store.py | 3 ++- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 2df4a9e7b7c0f..ae38a82f0c2ca 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -18,7 +18,10 @@ import numpy as np -from pandas._config import get_option +from pandas._config import ( + get_option, + using_copy_on_write, +) from pandas._typing import ( Axis, @@ -51,7 +54,6 @@ ) from pandas.core.internals import concatenate_managers from pandas.core.internals.construction import dict_to_mgr -from pandas.core.internals.managers import using_copy_on_write if TYPE_CHECKING: from pandas import ( diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index d1e51ae5079c3..4023a47216b4c 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -83,11 +83,9 @@ def test_concat_series(using_copy_on_write): tm.assert_series_equal(ser2, ser2_orig) -def test_chained_concat(using_copy_on_write): +def test_concat_frames_chained(using_copy_on_write): df1 = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) - df2 = DataFrame({"c": [4, 5, 6]}) - df3 = DataFrame({"d": [4, 5, 6]}) result = concat([concat([df1, df2], axis=1), df3], axis=1) expected = result.copy() @@ -108,7 +106,30 @@ def test_chained_concat(using_copy_on_write): tm.assert_frame_equal(result, expected) -def test_concat_series_parent_obj(using_copy_on_write): +def test_concat_series_chained(using_copy_on_write): + ser1 = Series([1, 2, 3], name="a") + ser2 = Series([4, 5, 6], name="c") + ser3 = Series([4, 5, 6], name="d") + result = concat([concat([ser1, ser2], axis=1), ser3], axis=1) + expected = result.copy() + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(ser1, "a")) + assert np.shares_memory(get_array(result, "c"), get_array(ser2, "c")) + assert np.shares_memory(get_array(result, "d"), get_array(ser3, "d")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(ser1, "a")) + assert not np.shares_memory(get_array(result, "c"), get_array(ser2, "c")) + assert not np.shares_memory(get_array(result, "d"), get_array(ser3, "d")) + + ser1.iloc[0] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(ser1, "a")) + + tm.assert_frame_equal(result, expected) + + +def test_concat_series_updating_input(using_copy_on_write): ser = Series([1, 2], name="a") ser2 = Series([3, 4], name="b") expected = DataFrame({"a": [1, 2], "b": [3, 4]}) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index fae26ff3a855f..3bc98325c1bec 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -11,6 +11,8 @@ import numpy as np import pytest +from pandas._config import using_copy_on_write + import pandas as pd from pandas import ( DataFrame, @@ -24,7 +26,6 @@ timedelta_range, ) import pandas._testing as tm -from pandas.core.internals.managers import using_copy_on_write from pandas.tests.io.pytables.common import ( _maybe_remove, ensure_clean_store, From 8b492106d266fc4c86451a7cb9196bf405c14550 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 13 Jan 2023 17:42:19 +0100 Subject: [PATCH 17/22] Fix chained series case --- pandas/core/frame.py | 1 + pandas/core/reshape/concat.py | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fbc78da26c4b6..a25fe3288ca65 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -730,6 +730,7 @@ def __init__( columns, dtype=dtype, typ=manager, + copy=copy, ) else: mgr = ndarray_to_mgr( diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index ae38a82f0c2ca..5449e3a3e2af9 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -536,18 +536,19 @@ def __init__( ) else: - name = getattr(obj, "name", None) + name = new_name = getattr(obj, "name", None) if ignore_index or name is None: - name = current_column + new_name = current_column current_column += 1 # doing a row-wise concatenation so need everything # to line up if self._is_frame and axis == 1: - name = 0 + new_name = 0 # mypy needs to know sample is not an NDFrame sample = cast("DataFrame | Series", sample) - obj = sample._constructor({name: obj}) + obj = sample._constructor(obj, columns=[name], copy=False) + obj.columns = [new_name] self.objs.append(obj) From 8ba35fe26cb350d030e6698e210053d9ed352ee5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 15 Jan 2023 16:10:40 +0100 Subject: [PATCH 18/22] Fix mypy --- pandas/core/reshape/concat.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 5449e3a3e2af9..0747299bc2dfa 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -606,8 +606,13 @@ def get_result(self): typ=get_option("mode.data_manager"), ) if using_copy_on_write() and not self.copy: - mgr.parent = [obj._mgr for obj in self.objs] - mgr.refs = [weakref.ref(obj._mgr.blocks[0]) for obj in self.objs] + parents = [obj._mgr for obj in self.objs] + mgr.parent = parents # type: ignore[union-attr] + refs = [ + weakref.ref(obj._mgr.blocks[0]) # type: ignore[union-attr] + for obj in self.objs + ] + mgr.refs = refs # type: ignore[union-attr] df = cons(mgr, copy=False) df.columns = columns return df.__finalize__(self, method="concat") From 2047d5c03431323605652c6ae9771725663d6dba Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 15 Jan 2023 16:18:16 +0100 Subject: [PATCH 19/22] Fix bug --- pandas/core/internals/concat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 028147e907e55..d46b51a2ee954 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -292,6 +292,8 @@ def _concat_managers_axis0( if not made_copy and not copy and using_copy_on_write(): refs.extend([weakref.ref(blk) for blk in mgr.blocks]) parents.append(mgr) + elif using_copy_on_write(): + refs.extend([None] * len(mgr.blocks)) offset += len(mgr.items) From e959c531673c40176a044fd9717b2098fb3d2896 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 16 Jan 2023 11:43:40 +0100 Subject: [PATCH 20/22] Add test and flag --- pandas/core/frame.py | 3 ++- pandas/core/reshape/concat.py | 6 ++++++ pandas/tests/copy_view/test_functions.py | 25 ++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index aaecbaeff5f51..b2fbc7b088eb3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -723,6 +723,7 @@ def __init__( ) elif getattr(data, "name", None) is not None: # i.e. Series/Index with non-None name + _copy = copy if using_copy_on_write() else True mgr = dict_to_mgr( # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no # attribute "name" @@ -731,7 +732,7 @@ def __init__( columns, dtype=dtype, typ=manager, - copy=copy, + copy=_copy, ) else: mgr = ndarray_to_mgr( diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 0747299bc2dfa..e60b998306e55 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -536,6 +536,7 @@ def __init__( ) else: + original_obj = obj name = new_name = getattr(obj, "name", None) if ignore_index or name is None: new_name = current_column @@ -548,6 +549,11 @@ def __init__( # mypy needs to know sample is not an NDFrame sample = cast("DataFrame | Series", sample) obj = sample._constructor(obj, columns=[name], copy=False) + if using_copy_on_write(): + # TODO(CoW): Remove when ref tracking in constructors works + obj._mgr.parent = original_obj + obj._mgr.refs = [weakref.ref(original_obj._mgr.blocks[0])] + obj.columns = [new_name] self.objs.append(obj) diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 4023a47216b4c..569cbc4ad7583 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -152,3 +152,28 @@ def test_concat_series_updating_input(using_copy_on_write): if using_copy_on_write: assert not np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) tm.assert_frame_equal(result, expected) + + +def test_concat_mixed_series_frame(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "c": 1}) + ser = Series([4, 5, 6], name="d") + result = concat([df, ser], axis=1) + expected = result.copy() + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(result, "c"), get_array(df, "c")) + assert np.shares_memory(get_array(result, "d"), get_array(ser, "d")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(result, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(result, "d"), get_array(ser, "d")) + + ser.iloc[0] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "d"), get_array(ser, "d")) + + df.iloc[0, 0] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + tm.assert_frame_equal(result, expected) From 891e07c48534b05327d428754b72fc6a1878f217 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 16 Jan 2023 11:46:18 +0100 Subject: [PATCH 21/22] Fix hdf test --- pandas/io/pytables.py | 2 +- pandas/tests/io/pytables/test_store.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 8c0d5c3da385c..079317c1ed18d 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3205,7 +3205,7 @@ def read( dfs.append(df) if len(dfs) > 0: - out = concat(dfs, axis=1) + out = concat(dfs, axis=1, copy=True) out = out.reindex(columns=items, copy=False) return out diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 33b30587eeec3..06684f076aefe 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -11,8 +11,6 @@ import numpy as np import pytest -from pandas._config import using_copy_on_write - import pandas as pd from pandas import ( DataFrame, @@ -999,7 +997,6 @@ def test_to_hdf_with_object_column_names(tmp_path, setup_path): assert len(result) -@pytest.mark.skipif(using_copy_on_write(), reason="strides buggy with cow") def test_hdfstore_strides(setup_path): # GH22073 df = DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) From 04adc17ade446c95e92d709eb415f17b9a3e15dd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 16 Jan 2023 20:03:42 +0100 Subject: [PATCH 22/22] Fix mypy --- pandas/core/reshape/concat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index e60b998306e55..f8220649bf890 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -551,8 +551,8 @@ def __init__( obj = sample._constructor(obj, columns=[name], copy=False) if using_copy_on_write(): # TODO(CoW): Remove when ref tracking in constructors works - obj._mgr.parent = original_obj - obj._mgr.refs = [weakref.ref(original_obj._mgr.blocks[0])] + obj._mgr.parent = original_obj # type: ignore[union-attr] + obj._mgr.refs = [weakref.ref(original_obj._mgr.blocks[0])] # type: ignore[union-attr] # noqa: E501 obj.columns = [new_name]