diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 46f8c73027f48..b2fbc7b088eb3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -723,6 +723,7 @@ def __init__( ) elif getattr(data, "name", None) is not None: # i.e. Series/Index with non-None name + _copy = copy if using_copy_on_write() else True mgr = dict_to_mgr( # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no # attribute "name" @@ -731,6 +732,7 @@ def __init__( columns, dtype=dtype, typ=manager, + copy=_copy, ) else: mgr = ndarray_to_mgr( diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 364025d583b7d..d46b51a2ee954 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -7,6 +7,7 @@ Sequence, cast, ) +import weakref import numpy as np @@ -61,7 +62,10 @@ ensure_block_shape, new_block_2d, ) -from pandas.core.internals.managers import BlockManager +from pandas.core.internals.managers import ( + BlockManager, + using_copy_on_write, +) if TYPE_CHECKING: from pandas import Index @@ -267,6 +271,8 @@ def _concat_managers_axis0( offset = 0 blocks = [] + refs: list[weakref.ref | None] = [] + parents: list = [] for i, mgr in enumerate(mgrs): # If we already reindexed, then we definitely don't need another copy made_copy = had_reindexers[i] @@ -283,8 +289,18 @@ def _concat_managers_axis0( nb._mgr_locs = nb._mgr_locs.add(offset) blocks.append(nb) + if not made_copy and not copy and using_copy_on_write(): + refs.extend([weakref.ref(blk) for blk in mgr.blocks]) + parents.append(mgr) + elif using_copy_on_write(): + refs.extend([None] * len(mgr.blocks)) + offset += len(mgr.items) - return BlockManager(tuple(blocks), axes) + + result_parents = parents if parents else None + result_ref = refs if refs else None + result = BlockManager(tuple(blocks), axes, parent=result_parents, refs=result_ref) + return result def _maybe_reindex_columns_na_proxy( diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index aced5a73a1f02..f8220649bf890 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -14,9 +14,15 @@ cast, overload, ) +import weakref import numpy as np +from pandas._config import ( + get_option, + using_copy_on_write, +) + from pandas._typing import ( Axis, AxisInt, @@ -47,6 +53,7 @@ get_unanimous_names, ) from pandas.core.internals import concatenate_managers +from pandas.core.internals.construction import dict_to_mgr if TYPE_CHECKING: from pandas import ( @@ -155,7 +162,7 @@ def concat( names=None, verify_integrity: bool = False, sort: bool = False, - copy: bool = True, + copy: bool | None = None, ) -> DataFrame | Series: """ Concatenate pandas objects along a particular axis. @@ -363,6 +370,12 @@ def concat( 0 1 2 1 3 4 """ + if copy is None: + if using_copy_on_write(): + copy = False + else: + copy = True + op = _Concatenator( objs, axis=axis, @@ -523,18 +536,25 @@ def __init__( ) else: - name = getattr(obj, "name", None) + original_obj = obj + name = new_name = getattr(obj, "name", None) if ignore_index or name is None: - name = current_column + new_name = current_column current_column += 1 # doing a row-wise concatenation so need everything # to line up if self._is_frame and axis == 1: - name = 0 + new_name = 0 # mypy needs to know sample is not an NDFrame sample = cast("DataFrame | Series", sample) - obj = sample._constructor({name: obj}) + obj = sample._constructor(obj, columns=[name], copy=False) + if using_copy_on_write(): + # TODO(CoW): Remove when ref tracking in constructors works + obj._mgr.parent = original_obj # type: ignore[union-attr] + obj._mgr.refs = [weakref.ref(original_obj._mgr.blocks[0])] # type: ignore[union-attr] # noqa: E501 + + obj.columns = [new_name] self.objs.append(obj) @@ -584,7 +604,22 @@ def get_result(self): cons = sample._constructor_expanddim index, columns = self.new_axes - df = cons(data, index=index, copy=self.copy) + mgr = dict_to_mgr( + data, + index, + None, + copy=self.copy, + typ=get_option("mode.data_manager"), + ) + if using_copy_on_write() and not self.copy: + parents = [obj._mgr for obj in self.objs] + mgr.parent = parents # type: ignore[union-attr] + refs = [ + weakref.ref(obj._mgr.blocks[0]) # type: ignore[union-attr] + for obj in self.objs + ] + mgr.refs = refs # type: ignore[union-attr] + df = cons(mgr, copy=False) df.columns = columns return df.__finalize__(self, method="concat") @@ -611,7 +646,7 @@ def get_result(self): new_data = concatenate_managers( mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy ) - if not self.copy: + if not self.copy and not using_copy_on_write(): new_data._consolidate_inplace() cons = sample._constructor diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 8c0d5c3da385c..079317c1ed18d 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3205,7 +3205,7 @@ def read( dfs.append(df) if len(dfs) > 0: - out = concat(dfs, axis=1) + out = concat(dfs, axis=1, copy=True) out = out.reindex(columns=items, copy=False) return out diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py new file mode 100644 index 0000000000000..569cbc4ad7583 --- /dev/null +++ b/pandas/tests/copy_view/test_functions.py @@ -0,0 +1,179 @@ +import numpy as np + +from pandas import ( + DataFrame, + Series, + concat, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +def test_concat_frames(using_copy_on_write): + df = DataFrame({"b": ["a"] * 3}) + df2 = DataFrame({"a": ["a"] * 3}) + df_orig = df.copy() + result = concat([df, df2], axis=1) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + else: + assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + + result.iloc[0, 0] = "d" + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + + result.iloc[0, 1] = "d" + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + tm.assert_frame_equal(df, df_orig) + + +def test_concat_frames_updating_input(using_copy_on_write): + df = DataFrame({"b": ["a"] * 3}) + df2 = DataFrame({"a": ["a"] * 3}) + result = concat([df, df2], axis=1) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + else: + assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + + expected = result.copy() + df.iloc[0, 0] = "d" + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + + df2.iloc[0, 0] = "d" + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + tm.assert_frame_equal(result, expected) + + +def test_concat_series(using_copy_on_write): + ser = Series([1, 2], name="a") + ser2 = Series([3, 4], name="b") + ser_orig = ser.copy() + ser2_orig = ser2.copy() + result = concat([ser, ser2], axis=1) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), ser.values) + assert np.shares_memory(get_array(result, "b"), ser2.values) + else: + assert not np.shares_memory(get_array(result, "a"), ser.values) + assert not np.shares_memory(get_array(result, "b"), ser2.values) + + result.iloc[0, 0] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), ser.values) + assert np.shares_memory(get_array(result, "b"), ser2.values) + + result.iloc[0, 1] = 1000 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "b"), ser2.values) + tm.assert_series_equal(ser, ser_orig) + tm.assert_series_equal(ser2, ser2_orig) + + +def test_concat_frames_chained(using_copy_on_write): + df1 = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + df2 = DataFrame({"c": [4, 5, 6]}) + df3 = DataFrame({"d": [4, 5, 6]}) + result = concat([concat([df1, df2], axis=1), df3], axis=1) + expected = result.copy() + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "c"), get_array(df2, "c")) + assert np.shares_memory(get_array(result, "d"), get_array(df3, "d")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert not np.shares_memory(get_array(result, "c"), get_array(df2, "c")) + assert not np.shares_memory(get_array(result, "d"), get_array(df3, "d")) + + df1.iloc[0, 0] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + + tm.assert_frame_equal(result, expected) + + +def test_concat_series_chained(using_copy_on_write): + ser1 = Series([1, 2, 3], name="a") + ser2 = Series([4, 5, 6], name="c") + ser3 = Series([4, 5, 6], name="d") + result = concat([concat([ser1, ser2], axis=1), ser3], axis=1) + expected = result.copy() + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(ser1, "a")) + assert np.shares_memory(get_array(result, "c"), get_array(ser2, "c")) + assert np.shares_memory(get_array(result, "d"), get_array(ser3, "d")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(ser1, "a")) + assert not np.shares_memory(get_array(result, "c"), get_array(ser2, "c")) + assert not np.shares_memory(get_array(result, "d"), get_array(ser3, "d")) + + ser1.iloc[0] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(ser1, "a")) + + tm.assert_frame_equal(result, expected) + + +def test_concat_series_updating_input(using_copy_on_write): + ser = Series([1, 2], name="a") + ser2 = Series([3, 4], name="b") + expected = DataFrame({"a": [1, 2], "b": [3, 4]}) + result = concat([ser, ser2], axis=1) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(ser, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(ser, "a")) + assert not np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) + + ser.iloc[0] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(ser, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) + tm.assert_frame_equal(result, expected) + + ser2.iloc[0] = 1000 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) + tm.assert_frame_equal(result, expected) + + +def test_concat_mixed_series_frame(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "c": 1}) + ser = Series([4, 5, 6], name="d") + result = concat([df, ser], axis=1) + expected = result.copy() + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(result, "c"), get_array(df, "c")) + assert np.shares_memory(get_array(result, "d"), get_array(ser, "d")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(result, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(result, "d"), get_array(ser, "d")) + + ser.iloc[0] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "d"), get_array(ser, "d")) + + df.iloc[0, 0] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 5c24a180c4d6d..daee175ee9bac 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -717,6 +717,23 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, using_array_manag assert np.shares_memory(get_array(obj, "a"), get_array(view, "a")) +def test_round(using_copy_on_write): + df = DataFrame({"a": [1, 2], "b": "c"}) + df2 = df.round() + df_orig = df.copy() + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + + df2.iloc[0, 1] = "d" + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + tm.assert_frame_equal(df, df_orig) + + def test_reorder_levels(using_copy_on_write): index = MultiIndex.from_tuples( [(1, 1), (1, 2), (2, 1), (2, 2)], names=["one", "two"] diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 5fa989419a7d4..73be92d1a5dcf 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -51,7 +51,7 @@ def test_append_concat(self): assert isinstance(result.index, PeriodIndex) assert result.index[0] == s1.index[0] - def test_concat_copy(self, using_array_manager): + def test_concat_copy(self, using_array_manager, using_copy_on_write): df = DataFrame(np.random.randn(4, 3)) df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1)) df3 = DataFrame({5: "foo"}, index=range(4)) @@ -82,7 +82,7 @@ def test_concat_copy(self, using_array_manager): result = concat([df, df2, df3, df4], axis=1, copy=False) for arr in result._mgr.arrays: if arr.dtype.kind == "f": - if using_array_manager: + if using_array_manager or using_copy_on_write: # this is a view on some array in either df or df4 assert any( np.shares_memory(arr, other)