diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index d1bc322a6ec7a..173793098027f 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -243,6 +243,10 @@ Copy-on-Write improvements a modification to the data happens) when constructing a Series from an existing Series with the default of ``copy=False`` (:issue:`50471`) +- The :class:`DataFrame` constructor, when constructing a DataFrame from a dictionary + of Series objects and specifying ``copy=False``, will now use a lazy copy + of those Series objects for the columns of the DataFrame (:issue:`50777`) + - Trying to set values using chained assignment (for example, ``df["a"][1:3] = 0``) will now always raise an exception when Copy-on-Write is enabled. In this mode, chained assignment can never work because we are always setting into a temporary diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 401b9e60308be..669406eba34ce 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2207,7 +2207,9 @@ def get_block_type(dtype: DtypeObj): return cls -def new_block_2d(values: ArrayLike, placement: BlockPlacement): +def new_block_2d( + values: ArrayLike, placement: BlockPlacement, refs: BlockValuesRefs | None = None +): # new_block specialized to case with # ndim=2 # isinstance(placement, BlockPlacement) @@ -2215,7 +2217,7 @@ def new_block_2d(values: ArrayLike, placement: BlockPlacement): klass = get_block_type(values.dtype) values = maybe_coerce_values(values) - return klass(values, ndim=2, placement=placement) + return klass(values, ndim=2, placement=placement, refs=refs) def new_block( diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index ce709917d7123..e5b2e5ce42fcd 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -116,7 +116,7 @@ def arrays_to_mgr( index = ensure_index(index) # don't force copy because getting jammed in an ndarray anyway - arrays = _homogenize(arrays, index, dtype) + arrays, refs = _homogenize(arrays, index, dtype) # _homogenize ensures # - all(len(x) == len(index) for x in arrays) # - all(x.ndim == 1 for x in arrays) @@ -126,8 +126,10 @@ def arrays_to_mgr( else: index = ensure_index(index) arrays = [extract_array(x, extract_numpy=True) for x in arrays] + # with _from_arrays, the passed arrays should never be Series objects + refs = [None] * len(arrays) - # Reached via DataFrame._from_arrays; we do validation here + # Reached via DataFrame._from_arrays; we do minimal validation here for arr in arrays: if ( not isinstance(arr, (np.ndarray, ExtensionArray)) @@ -148,7 +150,7 @@ def arrays_to_mgr( if typ == "block": return create_block_manager_from_column_arrays( - arrays, axes, consolidate=consolidate + arrays, axes, consolidate=consolidate, refs=refs ) elif typ == "array": return ArrayManager(arrays, [index, columns]) @@ -547,9 +549,13 @@ def _ensure_2d(values: np.ndarray) -> np.ndarray: return values -def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: +def _homogenize( + data, index: Index, dtype: DtypeObj | None +) -> tuple[list[ArrayLike], list[Any]]: oindex = None homogenized = [] + # if the original array-like in `data` is a Series, keep track of this Series' refs + refs: list[Any] = [] for val in data: if isinstance(val, ABCSeries): @@ -559,7 +565,10 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: # Forces alignment. No need to copy data since we # are putting it into an ndarray later val = val.reindex(index, copy=False) - + if isinstance(val._mgr, SingleBlockManager): + refs.append(val._mgr._block.refs) + else: + refs.append(None) val = val._values else: if isinstance(val, dict): @@ -578,10 +587,11 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: val = sanitize_array(val, index, dtype=dtype, copy=False) com.require_length_match(val, index) + refs.append(None) homogenized.append(val) - return homogenized + return homogenized, refs def _extract_index(data) -> Index: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1439c174b4f43..95ce4774bfe1d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -2071,7 +2071,8 @@ def create_block_manager_from_blocks( def create_block_manager_from_column_arrays( arrays: list[ArrayLike], axes: list[Index], - consolidate: bool = True, + consolidate: bool, + refs: list, ) -> BlockManager: # Assertions disabled for performance (caller is responsible for verifying) # assert isinstance(axes, list) @@ -2085,7 +2086,7 @@ def create_block_manager_from_column_arrays( # verify_integrity=False below. try: - blocks = _form_blocks(arrays, consolidate) + blocks = _form_blocks(arrays, consolidate, refs) mgr = BlockManager(blocks, axes, verify_integrity=False) except ValueError as e: raise_construction_error(len(arrays), arrays[0].shape, axes, e) @@ -2139,13 +2140,17 @@ def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, bool, DtypeObj]: return sep, isinstance(dtype, np.dtype), dtype -def _form_blocks(arrays: list[ArrayLike], consolidate: bool) -> list[Block]: +def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]: tuples = list(enumerate(arrays)) if not consolidate: - nbs = _tuples_to_blocks_no_consolidate(tuples) + nbs = _tuples_to_blocks_no_consolidate(tuples, refs) return nbs + # when consolidating, we can ignore refs (either stacking always copies, + # or the EA is already copied in the calling dict_to_mgr) + # TODO(CoW) check if this is also valid for rec_array_to_mgr + # group by dtype grouper = itertools.groupby(tuples, _grouping_func) @@ -2183,11 +2188,13 @@ def _form_blocks(arrays: list[ArrayLike], consolidate: bool) -> list[Block]: return nbs -def _tuples_to_blocks_no_consolidate(tuples) -> list[Block]: +def _tuples_to_blocks_no_consolidate(tuples, refs) -> list[Block]: # tuples produced within _form_blocks are of the form (placement, array) return [ - new_block_2d(ensure_block_shape(x[1], ndim=2), placement=BlockPlacement(x[0])) - for x in tuples + new_block_2d( + ensure_block_shape(arr, ndim=2), placement=BlockPlacement(i), refs=ref + ) + for ((i, arr), ref) in zip(tuples, refs) ] diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 43e0b77a90a85..0de995d2f773f 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -17,10 +17,7 @@ import numpy as np -from pandas._config import ( - get_option, - using_copy_on_write, -) +from pandas._config import using_copy_on_write from pandas._typing import ( Axis, @@ -52,7 +49,6 @@ get_unanimous_names, ) from pandas.core.internals import concatenate_managers -from pandas.core.internals.construction import dict_to_mgr if TYPE_CHECKING: from pandas import ( @@ -535,26 +531,18 @@ def __init__( ) else: - original_obj = obj - name = new_name = getattr(obj, "name", None) + name = getattr(obj, "name", None) if ignore_index or name is None: - new_name = current_column + name = current_column current_column += 1 # doing a row-wise concatenation so need everything # to line up if self._is_frame and axis == 1: - new_name = 0 + name = 0 # mypy needs to know sample is not an NDFrame sample = cast("DataFrame | Series", sample) - obj = sample._constructor(obj, columns=[name], copy=False) - if using_copy_on_write(): - # TODO(CoW): Remove when ref tracking in constructors works - for i, block in enumerate(original_obj._mgr.blocks): # type: ignore[union-attr] # noqa - obj._mgr.blocks[i].refs = block.refs # type: ignore[union-attr] # noqa - obj._mgr.blocks[i].refs.add_reference(obj._mgr.blocks[i]) # type: ignore[arg-type, union-attr] # noqa - - obj.columns = [new_name] + obj = sample._constructor({name: obj}, copy=False) self.objs.append(obj) @@ -604,18 +592,7 @@ def get_result(self): cons = sample._constructor_expanddim index, columns = self.new_axes - mgr = dict_to_mgr( - data, - index, - None, - copy=self.copy, - typ=get_option("mode.data_manager"), - ) - if using_copy_on_write() and not self.copy: - for i, obj in enumerate(self.objs): - mgr.blocks[i].refs = obj._mgr.blocks[0].refs # type: ignore[union-attr] # noqa - mgr.blocks[i].refs.add_reference(mgr.blocks[i]) # type: ignore[arg-type, union-attr] # noqa - df = cons(mgr, copy=False) + df = cons(data, index=index, copy=self.copy) df.columns = columns return df.__finalize__(self, method="concat") diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 1b709a8aa8076..2cacf0d6f6f91 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -1,7 +1,12 @@ import numpy as np import pytest -from pandas import Series +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array # ----------------------------------------------------------------------------- # Copy/view behaviour for Series / DataFrame constructors @@ -75,3 +80,83 @@ def test_series_from_series_with_reindex(using_copy_on_write): assert not np.shares_memory(ser.values, result.values) if using_copy_on_write: assert not result._mgr.blocks[0].refs.has_reference() + + +@pytest.mark.parametrize("dtype", [None, "int64", "Int64"]) +@pytest.mark.parametrize("index", [None, [0, 1, 2]]) +@pytest.mark.parametrize("columns", [None, ["a", "b"], ["a", "b", "c"]]) +def test_dataframe_from_dict_of_series( + request, using_copy_on_write, columns, index, dtype +): + # Case: constructing a DataFrame from Series objects with copy=False + # has to do a lazy following CoW rules + # (the default for DataFrame(dict) is still to copy to ensure consolidation) + s1 = Series([1, 2, 3]) + s2 = Series([4, 5, 6]) + s1_orig = s1.copy() + expected = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6]}, index=index, columns=columns, dtype=dtype + ) + + result = DataFrame( + {"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False + ) + + # the shallow copy still shares memory + assert np.shares_memory(get_array(result, "a"), get_array(s1)) + + # mutating the new dataframe doesn't mutate original + result.iloc[0, 0] = 10 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(s1)) + tm.assert_series_equal(s1, s1_orig) + else: + assert s1.iloc[0] == 10 + + # the same when modifying the parent series + s1 = Series([1, 2, 3]) + s2 = Series([4, 5, 6]) + result = DataFrame( + {"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False + ) + s1.iloc[0] = 10 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(s1)) + tm.assert_frame_equal(result, expected) + else: + assert result.iloc[0, 0] == 10 + + +@pytest.mark.parametrize("dtype", [None, "int64"]) +def test_dataframe_from_dict_of_series_with_reindex(dtype): + # Case: constructing a DataFrame from Series objects with copy=False + # and passing an index that requires an actual (no-view) reindex -> need + # to ensure the result doesn't have refs set up to unnecessarily trigger + # a copy on write + s1 = Series([1, 2, 3]) + s2 = Series([4, 5, 6]) + df = DataFrame({"a": s1, "b": s2}, index=[1, 2, 3], dtype=dtype, copy=False) + + # df should own its memory, so mutating shouldn't trigger a copy + arr_before = get_array(df, "a") + assert not np.shares_memory(arr_before, get_array(s1)) + df.iloc[0, 0] = 100 + arr_after = get_array(df, "a") + assert np.shares_memory(arr_before, arr_after) + + +@pytest.mark.parametrize("index", [None, [0, 1, 2]]) +def test_dataframe_from_dict_of_series_with_dtype(index): + # Variant of above, but now passing a dtype that causes a copy + # -> need to ensure the result doesn't have refs set up to unnecessarily + # trigger a copy on write + s1 = Series([1.0, 2.0, 3.0]) + s2 = Series([4, 5, 6]) + df = DataFrame({"a": s1, "b": s2}, index=index, dtype="int64", copy=False) + + # df should own its memory, so mutating shouldn't trigger a copy + arr_before = get_array(df, "a") + assert not np.shares_memory(arr_before, get_array(s1)) + df.iloc[0, 0] = 100 + arr_after = get_array(df, "a") + assert np.shares_memory(arr_before, arr_after)