From 6ce330604b8434ba214a7bda0efc9fb9ef07a571 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 16 Jan 2023 18:34:26 +0100 Subject: [PATCH 01/16] API / CoW: DataFrame(, copy=False) constructor now gives lazy copy --- pandas/core/internals/construction.py | 32 +++++++++++------ pandas/core/internals/managers.py | 26 +++++++++++++- pandas/tests/copy_view/test_constructors.py | 39 ++++++++++++++++++++- 3 files changed, 85 insertions(+), 12 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 9bdfd7991689b..b84ae52d16369 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -116,7 +116,7 @@ def arrays_to_mgr( index = ensure_index(index) # don't force copy because getting jammed in an ndarray anyway - arrays = _homogenize(arrays, index, dtype) + arrays, parents = _homogenize(arrays, index, dtype) # _homogenize ensures # - all(len(x) == len(index) for x in arrays) # - all(x.ndim == 1 for x in arrays) @@ -125,7 +125,8 @@ def arrays_to_mgr( else: index = ensure_index(index) - arrays = [extract_array(x, extract_numpy=True) for x in arrays] + # arrays = [extract_array(x, extract_numpy=True) for x in arrays] + parents = None # Reached via DataFrame._from_arrays; we do validation here for arr in arrays: @@ -148,7 +149,10 @@ def arrays_to_mgr( if typ == "block": return create_block_manager_from_column_arrays( - arrays, axes, consolidate=consolidate + arrays, + axes, + consolidate=consolidate, + parents=parents, ) elif typ == "array": return ArrayManager(arrays, [index, columns]) @@ -550,17 +554,22 @@ def _ensure_2d(values: np.ndarray) -> np.ndarray: def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: oindex = None homogenized = [] + # if the original array-like in `data` is a Series, keep track of this Series + parents = [] for val in data: if isinstance(val, ABCSeries): + hval = val if dtype is not None: - val = val.astype(dtype, copy=False) - if val.index is not index: + hval = hval.astype(dtype, copy=False) + if hval.index is not index: # Forces alignment. No need to copy data since we # are putting it into an ndarray later - val = val.reindex(index, copy=False) - - val = val._values + hval = hval.reindex(index, copy=False) + if hval is val: + hval = val.copy(deep=False) + homogenized.append(hval._values) + parents.append(hval) else: if isinstance(val, dict): # GH#41785 this _should_ be equivalent to (but faster than) @@ -578,10 +587,13 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: val = sanitize_array(val, index, dtype=dtype, copy=False) com.require_length_match(val, index) + homogenized.append(val) + parents.append(None) - homogenized.append(val) + if com.all_none(*parents): + parents = None - return homogenized + return homogenized, parents def _extract_index(data) -> Index: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 2ce294f257d75..eba68850d9fb4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -2125,6 +2125,7 @@ def create_block_manager_from_column_arrays( arrays: list[ArrayLike], axes: list[Index], consolidate: bool = True, + parents: list | None = None, ) -> BlockManager: # Assertions disabled for performance (caller is responsible for verifying) # assert isinstance(axes, list) @@ -2139,7 +2140,30 @@ def create_block_manager_from_column_arrays( try: blocks = _form_blocks(arrays, consolidate) - mgr = BlockManager(blocks, axes, verify_integrity=False) + refs = None + parent = None + if parents is not None and using_copy_on_write(): + # elements in `parents` are Series objects *if* the original input + # for the column was a Series, or otherwise None + # -> in case of a Series, keep track of its refs if it has those + # (this Series is already a view on the original one, so we can + # directly use its ref instead of creating a new ref to this Series) + refs = [] + parent = [] + for ser in parents: + if ( + ser is not None + and ser._mgr.refs is not None + and (ref := ser._mgr.refs[0]) is not None + ): + refs.append(ref) + parent.append(ser) + else: + refs.append(None) + + mgr = BlockManager( + blocks, axes, refs=refs, parent=parent, verify_integrity=False + ) except ValueError as e: raise_construction_error(len(arrays), arrays[0].shape, axes, e) if consolidate: diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index c04c733e5ee1d..c88aab428bda7 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -1,6 +1,11 @@ import numpy as np -from pandas import Series +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array # ----------------------------------------------------------------------------- # Copy/view behaviour for Series / DataFrame constructors @@ -73,3 +78,35 @@ def test_series_from_series_with_reindex(using_copy_on_write): assert not np.shares_memory(ser.values, result.values) if using_copy_on_write: assert result._mgr.refs is None or result._mgr.refs[0] is None + + +def test_dataframe_from_dict_of_series(using_copy_on_write): + # Case: constructing a DataFrame from Series objects with copy=False + # has to do a lazy following CoW rules + # (the default for DataFrame(dict) is still to copy to ensure consolidation) + s1 = Series([1, 2, 3]) + s2 = Series([4, 5, 6]) + s1_orig = s1.copy() + expected = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + result = DataFrame({"a": s1, "b": s2}, copy=False) + + # the shallow copy still shares memory + assert np.shares_memory(get_array(result, "a"), s1.values) + + # mutating the new dataframe doesn't mutate original + result.iloc[0, 0] = 10 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), s1.values) + tm.assert_series_equal(s1, s1_orig) + else: + assert s1.iloc[0] == 10 + + # the same when modifying the parent series + result = DataFrame({"a": s1, "b": s2}, copy=False) + s1.iloc[0] = 10 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), s1.values) + tm.assert_frame_equal(result, expected) + else: + assert result.iloc[0, 0] == 10 From 104cab907eb7ac6917600c8435dad014ac0dd868 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 16 Jan 2023 22:15:48 +0100 Subject: [PATCH 02/16] expand test --- pandas/core/internals/construction.py | 12 +++++++++++- pandas/tests/copy_view/test_constructors.py | 11 +++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index b84ae52d16369..31ede7b905549 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -10,10 +10,13 @@ Hashable, Sequence, ) +import weakref import numpy as np from numpy import ma +from pandas._config import using_copy_on_write + from pandas._libs import lib from pandas._typing import ( ArrayLike, @@ -56,6 +59,7 @@ ExtensionArray, FloatingArray, IntegerArray, + PandasArray, ) from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import ( @@ -125,7 +129,9 @@ def arrays_to_mgr( else: index = ensure_index(index) - # arrays = [extract_array(x, extract_numpy=True) for x in arrays] + arrays = [ + arr.to_numpy() if isinstance(arr, PandasArray) else arr for arr in arrays + ] parents = None # Reached via DataFrame._from_arrays; we do validation here @@ -562,6 +568,10 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: hval = val if dtype is not None: hval = hval.astype(dtype, copy=False) + if using_copy_on_write() and hval.values is val.values: + # TODO(CoW) remove when astype() has implemented CoW + hval._mgr.refs = [weakref.ref(val._mgr._block)] + hval._mgr.parent = val._mgr if hval.index is not index: # Forces alignment. No need to copy data since we # are putting it into an ndarray later diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index c88aab428bda7..01c21a800a885 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas import ( DataFrame, @@ -80,16 +81,18 @@ def test_series_from_series_with_reindex(using_copy_on_write): assert result._mgr.refs is None or result._mgr.refs[0] is None -def test_dataframe_from_dict_of_series(using_copy_on_write): +@pytest.mark.parametrize("dtype", [None, "int64"]) +@pytest.mark.parametrize("columns", [None, ["a", "b"], ["a", "b", "c"]]) +def test_dataframe_from_dict_of_series(using_copy_on_write, columns, dtype): # Case: constructing a DataFrame from Series objects with copy=False # has to do a lazy following CoW rules # (the default for DataFrame(dict) is still to copy to ensure consolidation) s1 = Series([1, 2, 3]) s2 = Series([4, 5, 6]) s1_orig = s1.copy() - expected = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + expected = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=dtype, columns=columns) - result = DataFrame({"a": s1, "b": s2}, copy=False) + result = DataFrame({"a": s1, "b": s2}, columns=columns, dtype=dtype, copy=False) # the shallow copy still shares memory assert np.shares_memory(get_array(result, "a"), s1.values) @@ -103,7 +106,7 @@ def test_dataframe_from_dict_of_series(using_copy_on_write): assert s1.iloc[0] == 10 # the same when modifying the parent series - result = DataFrame({"a": s1, "b": s2}, copy=False) + result = DataFrame({"a": s1, "b": s2}, columns=columns, dtype=dtype, copy=False) s1.iloc[0] = 10 if using_copy_on_write: assert not np.shares_memory(get_array(result, "a"), s1.values) From e192a632934b40ec3f4febb88a833bfc860b143c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 16 Jan 2023 23:44:52 +0100 Subject: [PATCH 03/16] try fix mypy --- pandas/core/internals/construction.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 31ede7b905549..cad776e1c3b9a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -557,11 +557,13 @@ def _ensure_2d(values: np.ndarray) -> np.ndarray: return values -def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: +def _homogenize( + data, index: Index, dtype: DtypeObj | None +) -> tuple[list[ArrayLike], list[Any] | None]: oindex = None homogenized = [] # if the original array-like in `data` is a Series, keep track of this Series - parents = [] + parents: list[Any] = [] for val in data: if isinstance(val, ABCSeries): @@ -600,10 +602,7 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: homogenized.append(val) parents.append(None) - if com.all_none(*parents): - parents = None - - return homogenized, parents + return homogenized, None if com.all_none(*parents) else parents def _extract_index(data) -> Index: From a300b0889a4c8658101961252fc0119ba2ee4a12 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 17 Jan 2023 09:30:18 +0100 Subject: [PATCH 04/16] clean-up usage of internals in reshape/concat.py --- pandas/core/reshape/concat.py | 39 ++++++----------------------------- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index f8220649bf890..0de995d2f773f 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -14,14 +14,10 @@ cast, overload, ) -import weakref import numpy as np -from pandas._config import ( - get_option, - using_copy_on_write, -) +from pandas._config import using_copy_on_write from pandas._typing import ( Axis, @@ -53,7 +49,6 @@ get_unanimous_names, ) from pandas.core.internals import concatenate_managers -from pandas.core.internals.construction import dict_to_mgr if TYPE_CHECKING: from pandas import ( @@ -536,25 +531,18 @@ def __init__( ) else: - original_obj = obj - name = new_name = getattr(obj, "name", None) + name = getattr(obj, "name", None) if ignore_index or name is None: - new_name = current_column + name = current_column current_column += 1 # doing a row-wise concatenation so need everything # to line up if self._is_frame and axis == 1: - new_name = 0 + name = 0 # mypy needs to know sample is not an NDFrame sample = cast("DataFrame | Series", sample) - obj = sample._constructor(obj, columns=[name], copy=False) - if using_copy_on_write(): - # TODO(CoW): Remove when ref tracking in constructors works - obj._mgr.parent = original_obj # type: ignore[union-attr] - obj._mgr.refs = [weakref.ref(original_obj._mgr.blocks[0])] # type: ignore[union-attr] # noqa: E501 - - obj.columns = [new_name] + obj = sample._constructor({name: obj}, copy=False) self.objs.append(obj) @@ -604,22 +592,7 @@ def get_result(self): cons = sample._constructor_expanddim index, columns = self.new_axes - mgr = dict_to_mgr( - data, - index, - None, - copy=self.copy, - typ=get_option("mode.data_manager"), - ) - if using_copy_on_write() and not self.copy: - parents = [obj._mgr for obj in self.objs] - mgr.parent = parents # type: ignore[union-attr] - refs = [ - weakref.ref(obj._mgr.blocks[0]) # type: ignore[union-attr] - for obj in self.objs - ] - mgr.refs = refs # type: ignore[union-attr] - df = cons(mgr, copy=False) + df = cons(data, index=index, copy=self.copy) df.columns = columns return df.__finalize__(self, method="concat") From df21ca2b55d910f199c522651e43c851241fd606 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 17 Jan 2023 09:39:27 +0100 Subject: [PATCH 05/16] further fix mypy --- pandas/core/internals/construction.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index cad776e1c3b9a..c8eaa71de6a63 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -572,8 +572,9 @@ def _homogenize( hval = hval.astype(dtype, copy=False) if using_copy_on_write() and hval.values is val.values: # TODO(CoW) remove when astype() has implemented CoW - hval._mgr.refs = [weakref.ref(val._mgr._block)] - hval._mgr.parent = val._mgr + refs = [weakref.ref(val._mgr._block)] # type: ignore[union-attr] + hval._mgr.refs = refs # type: ignore[union-attr] + hval._mgr.parent = val._mgr # type: ignore[union-attr] if hval.index is not index: # Forces alignment. No need to copy data since we # are putting it into an ndarray later From e7d2f0f0d297819a89005bc3f1fe69096f785758 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 Jan 2023 16:04:58 +0100 Subject: [PATCH 06/16] expand tests --- pandas/tests/copy_view/test_constructors.py | 48 +++++++++++++++++++-- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 01c21a800a885..45b142d9e9458 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -82,17 +82,22 @@ def test_series_from_series_with_reindex(using_copy_on_write): @pytest.mark.parametrize("dtype", [None, "int64"]) +@pytest.mark.parametrize("index", [None, [0, 1, 2]]) @pytest.mark.parametrize("columns", [None, ["a", "b"], ["a", "b", "c"]]) -def test_dataframe_from_dict_of_series(using_copy_on_write, columns, dtype): +def test_dataframe_from_dict_of_series(using_copy_on_write, columns, index, dtype): # Case: constructing a DataFrame from Series objects with copy=False # has to do a lazy following CoW rules # (the default for DataFrame(dict) is still to copy to ensure consolidation) s1 = Series([1, 2, 3]) s2 = Series([4, 5, 6]) s1_orig = s1.copy() - expected = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=dtype, columns=columns) + expected = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6]}, index=index, columns=columns, dtype=dtype + ) - result = DataFrame({"a": s1, "b": s2}, columns=columns, dtype=dtype, copy=False) + result = DataFrame( + {"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False + ) # the shallow copy still shares memory assert np.shares_memory(get_array(result, "a"), s1.values) @@ -106,10 +111,45 @@ def test_dataframe_from_dict_of_series(using_copy_on_write, columns, dtype): assert s1.iloc[0] == 10 # the same when modifying the parent series - result = DataFrame({"a": s1, "b": s2}, columns=columns, dtype=dtype, copy=False) + result = DataFrame( + {"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False + ) s1.iloc[0] = 10 if using_copy_on_write: assert not np.shares_memory(get_array(result, "a"), s1.values) tm.assert_frame_equal(result, expected) else: assert result.iloc[0, 0] == 10 + + +@pytest.mark.parametrize("dtype", [None, "int64"]) +def test_dataframe_from_dict_of_series_with_reindex(dtype): + # Case: constructing a DataFrame from Series objects with copy=False + # and passing an index that requires an actual (no-view) reindex -> need + # to ensure the result doesn't have refs set up to unnecessarily trigger + # a copy on write + s1 = Series([1, 2, 3]) + s2 = Series([4, 5, 6]) + df = DataFrame({"a": s1, "b": s2}, index=[1, 2, 3], dtype=dtype, copy=False) + + # df should own its memory, so mutating shouldn't trigger a copy + arr_before = get_array(df, "a") + df.iloc[0, 0] = 100 + arr_after = get_array(df, "a") + assert np.shares_memory(arr_before, arr_after) + + +@pytest.mark.parametrize("index", [None, [0, 1, 2]]) +def test_dataframe_from_dict_of_series_with_dtype(index): + # Variant of above, but now passing a dtype that causes a copy + # -> need to ensure the result doesn't have refs set up to unnecessarily + # trigger a copy on write + s1 = Series([1.0, 2.0, 3.0]) + s2 = Series([4, 5, 6]) + df = DataFrame({"a": s1, "b": s2}, index=index, dtype="int64", copy=False) + + # df should own its memory, so mutating shouldn't trigger a copy + arr_before = get_array(df, "a") + df.iloc[0, 0] = 100 + arr_after = get_array(df, "a") + assert np.shares_memory(arr_before, arr_after) From 27e8549fc2657d617fe446310dfdeac248f1c499 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 Jan 2023 16:13:07 +0100 Subject: [PATCH 07/16] fix reindex case --- pandas/core/internals/construction.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index c8eaa71de6a63..45899e8423a94 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -568,6 +568,7 @@ def _homogenize( for val in data: if isinstance(val, ABCSeries): hval = val + is_copy = False if dtype is not None: hval = hval.astype(dtype, copy=False) if using_copy_on_write() and hval.values is val.values: @@ -575,14 +576,20 @@ def _homogenize( refs = [weakref.ref(val._mgr._block)] # type: ignore[union-attr] hval._mgr.refs = refs # type: ignore[union-attr] hval._mgr.parent = val._mgr # type: ignore[union-attr] + if using_copy_on_write(): + if hval._mgr.refs is None: # type: ignore[union-attr] + is_copy = True if hval.index is not index: # Forces alignment. No need to copy data since we # are putting it into an ndarray later hval = hval.reindex(index, copy=False) + if using_copy_on_write(): + if hval._mgr.refs is None: # type: ignore[union-attr] + is_copy = True if hval is val: hval = val.copy(deep=False) homogenized.append(hval._values) - parents.append(hval) + parents.append(hval if not is_copy else None) else: if isinstance(val, dict): # GH#41785 this _should_ be equivalent to (but faster than) From 10238ba60fbf7c0204e579bc2c70d965000f8391 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 23 Jan 2023 10:56:28 +0100 Subject: [PATCH 08/16] clean-up --- pandas/core/internals/construction.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 45899e8423a94..1b3f905289dd2 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -59,7 +59,6 @@ ExtensionArray, FloatingArray, IntegerArray, - PandasArray, ) from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import ( @@ -128,13 +127,12 @@ def arrays_to_mgr( # - all(type(x) is not PandasArray for x in arrays) else: + # Reached via DataFrame._from_arrays; we do minimal validation here index = ensure_index(index) - arrays = [ - arr.to_numpy() if isinstance(arr, PandasArray) else arr for arr in arrays - ] + arrays = [extract_array(x, extract_numpy=True) for x in arrays] + # with _from_arrays, the passed arrays should never be Series objects parents = None - # Reached via DataFrame._from_arrays; we do validation here for arr in arrays: if ( not isinstance(arr, (np.ndarray, ExtensionArray)) From 6d860bce23072c5ebd36348f6e4b35b85262e90a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 23 Jan 2023 11:00:34 +0100 Subject: [PATCH 09/16] add whatsnew --- doc/source/whatsnew/v2.0.0.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 605f1d4b26e13..43c2752a91bd1 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -125,6 +125,10 @@ Copy-on-Write improvements a modification to the data happens) when constructing a Series from an existing Series with the default of ``copy=False`` (:issue:`50471`) +- The :class:`DataFrame` constructor, when constructing a DataFrame from a dictionary + of Series objects and specifying ``copy=False``, will now use_inf_as_null a lazy copy + of those Series objects for the columns of the DataFrame (:issue:`50777`) + Copy-on-Write can be enabled through .. code-block:: python From b24ea5f2bd7a85b560fc9838b110c12fab21ccb7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 30 Jan 2023 16:50:48 +0100 Subject: [PATCH 10/16] add xfailed test for int64->Int64 --- pandas/tests/copy_view/test_constructors.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 45b142d9e9458..7f1fe413fd3ee 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -81,10 +81,12 @@ def test_series_from_series_with_reindex(using_copy_on_write): assert result._mgr.refs is None or result._mgr.refs[0] is None -@pytest.mark.parametrize("dtype", [None, "int64"]) +@pytest.mark.parametrize("dtype", [None, "int64", "Int64"]) @pytest.mark.parametrize("index", [None, [0, 1, 2]]) @pytest.mark.parametrize("columns", [None, ["a", "b"], ["a", "b", "c"]]) -def test_dataframe_from_dict_of_series(using_copy_on_write, columns, index, dtype): +def test_dataframe_from_dict_of_series( + request, using_copy_on_write, columns, index, dtype +): # Case: constructing a DataFrame from Series objects with copy=False # has to do a lazy following CoW rules # (the default for DataFrame(dict) is still to copy to ensure consolidation) @@ -99,6 +101,12 @@ def test_dataframe_from_dict_of_series(using_copy_on_write, columns, index, dtyp {"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False ) + if using_copy_on_write and dtype == "Int64": + # TODO(CoW) remove this once astype properly tracks refs + request.node.add_marker( + pytest.mark.xfail(reason="astype with Int64 is giving view") + ) + # the shallow copy still shares memory assert np.shares_memory(get_array(result, "a"), s1.values) From 6e32a342539cec0bc0bbd83b60cf844c957550a1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 10 Feb 2023 10:20:45 +0100 Subject: [PATCH 11/16] refactor --- pandas/core/internals/blocks.py | 6 ++-- pandas/core/internals/construction.py | 50 +++++++++------------------ pandas/core/internals/managers.py | 45 ++++++++---------------- 3 files changed, 34 insertions(+), 67 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e66011acb978b..6df4d59597c7c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2167,7 +2167,9 @@ def get_block_type(dtype: DtypeObj): return cls -def new_block_2d(values: ArrayLike, placement: BlockPlacement): +def new_block_2d( + values: ArrayLike, placement: BlockPlacement, refs: BlockValuesRefs | None = None +): # new_block specialized to case with # ndim=2 # isinstance(placement, BlockPlacement) @@ -2175,7 +2177,7 @@ def new_block_2d(values: ArrayLike, placement: BlockPlacement): klass = get_block_type(values.dtype) values = maybe_coerce_values(values) - return klass(values, ndim=2, placement=placement) + return klass(values, ndim=2, placement=placement, refs=refs) def new_block( diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 6e94c8106cb4d..89814e61cc1af 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -10,13 +10,10 @@ Hashable, Sequence, ) -import weakref import numpy as np from numpy import ma -from pandas._config import using_copy_on_write - from pandas._libs import lib from pandas._typing import ( ArrayLike, @@ -119,7 +116,7 @@ def arrays_to_mgr( index = ensure_index(index) # don't force copy because getting jammed in an ndarray anyway - arrays, parents = _homogenize(arrays, index, dtype) + arrays, refs = _homogenize(arrays, index, dtype) # _homogenize ensures # - all(len(x) == len(index) for x in arrays) # - all(x.ndim == 1 for x in arrays) @@ -131,7 +128,7 @@ def arrays_to_mgr( index = ensure_index(index) arrays = [extract_array(x, extract_numpy=True) for x in arrays] # with _from_arrays, the passed arrays should never be Series objects - parents = None + refs = [None] * len(arrays) for arr in arrays: if ( @@ -153,10 +150,7 @@ def arrays_to_mgr( if typ == "block": return create_block_manager_from_column_arrays( - arrays, - axes, - consolidate=consolidate, - parents=parents, + arrays, axes, consolidate=consolidate, refs=refs ) elif typ == "array": return ArrayManager(arrays, [index, columns]) @@ -557,37 +551,25 @@ def _ensure_2d(values: np.ndarray) -> np.ndarray: def _homogenize( data, index: Index, dtype: DtypeObj | None -) -> tuple[list[ArrayLike], list[Any] | None]: +) -> tuple[list[ArrayLike], list[Any]]: oindex = None homogenized = [] # if the original array-like in `data` is a Series, keep track of this Series - parents: list[Any] = [] + refs: list[Any] = [] for val in data: if isinstance(val, ABCSeries): - hval = val - is_copy = False + orig = val if dtype is not None: - hval = hval.astype(dtype, copy=False) - if using_copy_on_write() and hval.values is val.values: - # TODO(CoW) remove when astype() has implemented CoW - refs = [weakref.ref(val._mgr._block)] # type: ignore[union-attr] - hval._mgr.refs = refs # type: ignore[union-attr] - hval._mgr.parent = val._mgr # type: ignore[union-attr] - if using_copy_on_write(): - if hval._mgr.refs is None: # type: ignore[union-attr] - is_copy = True - if hval.index is not index: + val = val.astype(dtype, copy=False) + if val.index is not index: # Forces alignment. No need to copy data since we # are putting it into an ndarray later - hval = hval.reindex(index, copy=False) - if using_copy_on_write(): - if hval._mgr.refs is None: # type: ignore[union-attr] - is_copy = True - if hval is val: - hval = val.copy(deep=False) - homogenized.append(hval._values) - parents.append(hval if not is_copy else None) + val = val.reindex(index, copy=False) + if val is orig: + val = val.copy(deep=False) + refs.append(val._mgr._block.refs) + val = val._values else: if isinstance(val, dict): # GH#41785 this _should_ be equivalent to (but faster than) @@ -605,10 +587,10 @@ def _homogenize( val = sanitize_array(val, index, dtype=dtype, copy=False) com.require_length_match(val, index) - homogenized.append(val) - parents.append(None) + refs.append(None) + homogenized.append(val) - return homogenized, None if com.all_none(*parents) else parents + return homogenized, refs def _extract_index(data) -> Index: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index bb445e3d5b8be..df77a877f0842 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -2076,7 +2076,7 @@ def create_block_manager_from_column_arrays( arrays: list[ArrayLike], axes: list[Index], consolidate: bool = True, - parents: list | None = None, + refs: list = None, ) -> BlockManager: # Assertions disabled for performance (caller is responsible for verifying) # assert isinstance(axes, list) @@ -2090,31 +2090,8 @@ def create_block_manager_from_column_arrays( # verify_integrity=False below. try: - blocks = _form_blocks(arrays, consolidate) - refs = None - parent = None - if parents is not None and using_copy_on_write(): - # elements in `parents` are Series objects *if* the original input - # for the column was a Series, or otherwise None - # -> in case of a Series, keep track of its refs if it has those - # (this Series is already a view on the original one, so we can - # directly use its ref instead of creating a new ref to this Series) - refs = [] - parent = [] - for ser in parents: - if ( - ser is not None - and ser._mgr.refs is not None - and (ref := ser._mgr.refs[0]) is not None - ): - refs.append(ref) - parent.append(ser) - else: - refs.append(None) - - mgr = BlockManager( - blocks, axes, refs=refs, parent=parent, verify_integrity=False - ) + blocks = _form_blocks(arrays, consolidate, refs) + mgr = BlockManager(blocks, axes, verify_integrity=False) except ValueError as e: raise_construction_error(len(arrays), arrays[0].shape, axes, e) if consolidate: @@ -2167,13 +2144,17 @@ def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, bool, DtypeObj]: return sep, isinstance(dtype, np.dtype), dtype -def _form_blocks(arrays: list[ArrayLike], consolidate: bool) -> list[Block]: +def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]: tuples = list(enumerate(arrays)) if not consolidate: - nbs = _tuples_to_blocks_no_consolidate(tuples) + nbs = _tuples_to_blocks_no_consolidate(tuples, refs) return nbs + # when consolidating, we can ignore refs (either stacking always copies, + # or the EA is already copied in the calling dict_to_mgr) + # TODO(CoW) check if this is also valid for rec_array_to_mgr + # group by dtype grouper = itertools.groupby(tuples, _grouping_func) @@ -2211,11 +2192,13 @@ def _form_blocks(arrays: list[ArrayLike], consolidate: bool) -> list[Block]: return nbs -def _tuples_to_blocks_no_consolidate(tuples) -> list[Block]: +def _tuples_to_blocks_no_consolidate(tuples, refs) -> list[Block]: # tuples produced within _form_blocks are of the form (placement, array) return [ - new_block_2d(ensure_block_shape(x[1], ndim=2), placement=BlockPlacement(x[0])) - for x in tuples + new_block_2d( + ensure_block_shape(arr, ndim=2), placement=BlockPlacement(i), refs=ref + ) + for ((i, arr), ref) in zip(tuples, refs) ] From 82bc4bb612379356a61ff31d0a0f018a06d1482c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 10 Feb 2023 10:24:03 +0100 Subject: [PATCH 12/16] cleanup --- pandas/core/internals/construction.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 89814e61cc1af..e54176aee3a26 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -124,12 +124,12 @@ def arrays_to_mgr( # - all(type(x) is not PandasArray for x in arrays) else: - # Reached via DataFrame._from_arrays; we do minimal validation here index = ensure_index(index) arrays = [extract_array(x, extract_numpy=True) for x in arrays] # with _from_arrays, the passed arrays should never be Series objects refs = [None] * len(arrays) + # Reached via DataFrame._from_arrays; we do minimal validation here for arr in arrays: if ( not isinstance(arr, (np.ndarray, ExtensionArray)) @@ -554,7 +554,7 @@ def _homogenize( ) -> tuple[list[ArrayLike], list[Any]]: oindex = None homogenized = [] - # if the original array-like in `data` is a Series, keep track of this Series + # if the original array-like in `data` is a Series, keep track of this Series' refs refs: list[Any] = [] for val in data: @@ -588,6 +588,7 @@ def _homogenize( val = sanitize_array(val, index, dtype=dtype, copy=False) com.require_length_match(val, index) refs.append(None) + homogenized.append(val) return homogenized, refs From 7fc85945d18b50547c202301510cc9c9931de2e5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 10 Feb 2023 10:52:24 +0100 Subject: [PATCH 13/16] fix for AM --- pandas/core/internals/construction.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index e54176aee3a26..b8bc7adb9bfcc 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -568,7 +568,10 @@ def _homogenize( val = val.reindex(index, copy=False) if val is orig: val = val.copy(deep=False) - refs.append(val._mgr._block.refs) + if isinstance(val._mgr, BlockManager): + refs.append(val._mgr._block.refs) + else: + refs.append(None) val = val._values else: if isinstance(val, dict): From 3d2d5e6fd0414421314471ebe183842672c0b631 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 10 Feb 2023 12:46:40 +0100 Subject: [PATCH 14/16] fix + address feedback --- pandas/core/internals/construction.py | 5 +---- pandas/tests/copy_view/test_constructors.py | 10 +++++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index b8bc7adb9bfcc..e5b2e5ce42fcd 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -559,16 +559,13 @@ def _homogenize( for val in data: if isinstance(val, ABCSeries): - orig = val if dtype is not None: val = val.astype(dtype, copy=False) if val.index is not index: # Forces alignment. No need to copy data since we # are putting it into an ndarray later val = val.reindex(index, copy=False) - if val is orig: - val = val.copy(deep=False) - if isinstance(val._mgr, BlockManager): + if isinstance(val._mgr, SingleBlockManager): refs.append(val._mgr._block.refs) else: refs.append(None) diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index fc59ac271430e..2cacf0d6f6f91 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -103,23 +103,25 @@ def test_dataframe_from_dict_of_series( ) # the shallow copy still shares memory - assert np.shares_memory(get_array(result, "a"), s1.values) + assert np.shares_memory(get_array(result, "a"), get_array(s1)) # mutating the new dataframe doesn't mutate original result.iloc[0, 0] = 10 if using_copy_on_write: - assert not np.shares_memory(get_array(result, "a"), s1.values) + assert not np.shares_memory(get_array(result, "a"), get_array(s1)) tm.assert_series_equal(s1, s1_orig) else: assert s1.iloc[0] == 10 # the same when modifying the parent series + s1 = Series([1, 2, 3]) + s2 = Series([4, 5, 6]) result = DataFrame( {"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False ) s1.iloc[0] = 10 if using_copy_on_write: - assert not np.shares_memory(get_array(result, "a"), s1.values) + assert not np.shares_memory(get_array(result, "a"), get_array(s1)) tm.assert_frame_equal(result, expected) else: assert result.iloc[0, 0] == 10 @@ -137,6 +139,7 @@ def test_dataframe_from_dict_of_series_with_reindex(dtype): # df should own its memory, so mutating shouldn't trigger a copy arr_before = get_array(df, "a") + assert not np.shares_memory(arr_before, get_array(s1)) df.iloc[0, 0] = 100 arr_after = get_array(df, "a") assert np.shares_memory(arr_before, arr_after) @@ -153,6 +156,7 @@ def test_dataframe_from_dict_of_series_with_dtype(index): # df should own its memory, so mutating shouldn't trigger a copy arr_before = get_array(df, "a") + assert not np.shares_memory(arr_before, get_array(s1)) df.iloc[0, 0] = 100 arr_after = get_array(df, "a") assert np.shares_memory(arr_before, arr_after) From 48a00a111f1ccc09542eb345bc82394e1e63023d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 10 Feb 2023 14:18:16 +0100 Subject: [PATCH 15/16] fixup typing --- pandas/core/internals/managers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index ef4f2149d8ac7..15f721a774e72 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -2072,7 +2072,7 @@ def create_block_manager_from_column_arrays( arrays: list[ArrayLike], axes: list[Index], consolidate: bool = True, - refs: list = None, + refs: list | None = None, ) -> BlockManager: # Assertions disabled for performance (caller is responsible for verifying) # assert isinstance(axes, list) From 5476780e67dbdc3c52e0352259d160199968874a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 10 Feb 2023 15:39:48 +0100 Subject: [PATCH 16/16] try fix typing --- pandas/core/internals/managers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 15f721a774e72..95ce4774bfe1d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -2071,8 +2071,8 @@ def create_block_manager_from_blocks( def create_block_manager_from_column_arrays( arrays: list[ArrayLike], axes: list[Index], - consolidate: bool = True, - refs: list | None = None, + consolidate: bool, + refs: list, ) -> BlockManager: # Assertions disabled for performance (caller is responsible for verifying) # assert isinstance(axes, list)