-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
CoW: Delay copy when setting Series into DataFrame #51698
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
b5d4f84
93a5d6c
e3a50dd
290facd
03ca70b
d75f46a
c61dffb
7bdff6b
90777f5
46256fe
2bc836b
14d77da
fcb0bb4
d6ed1cc
eca3446
f86f077
1997e3f
0e3d3f7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -199,6 +199,7 @@ | |
if TYPE_CHECKING: | ||
import datetime | ||
|
||
from pandas._libs.internals import BlockValuesRefs | ||
from pandas._typing import ( | ||
AggFuncType, | ||
AlignJoin, | ||
|
@@ -3944,11 +3945,11 @@ def isetitem(self, loc, value) -> None: | |
) | ||
|
||
for i, idx in enumerate(loc): | ||
arraylike = self._sanitize_column(value.iloc[:, i]) | ||
arraylike, _ = self._sanitize_column(value.iloc[:, i]) | ||
self._iset_item_mgr(idx, arraylike, inplace=False) | ||
return | ||
|
||
arraylike = self._sanitize_column(value) | ||
arraylike, _ = self._sanitize_column(value) | ||
self._iset_item_mgr(loc, arraylike, inplace=False) | ||
|
||
def __setitem__(self, key, value): | ||
|
@@ -4117,7 +4118,7 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None: | |
return | ||
|
||
# now align rows | ||
arraylike = _reindex_for_setitem(value, self.index) | ||
arraylike, _ = _reindex_for_setitem(value, self.index) | ||
self._set_item_mgr(key, arraylike) | ||
return | ||
|
||
|
@@ -4130,20 +4131,26 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None: | |
self[key] = value[value.columns[0]] | ||
|
||
def _iset_item_mgr( | ||
self, loc: int | slice | np.ndarray, value, inplace: bool = False | ||
self, | ||
loc: int | slice | np.ndarray, | ||
value, | ||
inplace: bool = False, | ||
refs: BlockValuesRefs | None = None, | ||
) -> None: | ||
# when called from _set_item_mgr loc can be anything returned from get_loc | ||
self._mgr.iset(loc, value, inplace=inplace) | ||
self._mgr.iset(loc, value, inplace=inplace, refs=refs) | ||
self._clear_item_cache() | ||
|
||
def _set_item_mgr(self, key, value: ArrayLike) -> None: | ||
def _set_item_mgr( | ||
self, key, value: ArrayLike, refs: BlockValuesRefs | None = None | ||
) -> None: | ||
try: | ||
loc = self._info_axis.get_loc(key) | ||
except KeyError: | ||
# This item wasn't present, just insert at end | ||
self._mgr.insert(len(self._info_axis), key, value) | ||
self._mgr.insert(len(self._info_axis), key, value, refs) | ||
else: | ||
self._iset_item_mgr(loc, value) | ||
self._iset_item_mgr(loc, value, refs=refs) | ||
|
||
# check if we are modifying a copy | ||
# try to set first as we want an invalid | ||
|
@@ -4173,7 +4180,7 @@ def _set_item(self, key, value) -> None: | |
Series/TimeSeries will be conformed to the DataFrames index to | ||
ensure homogeneity. | ||
""" | ||
value = self._sanitize_column(value) | ||
value, refs = self._sanitize_column(value, using_cow=using_copy_on_write()) | ||
|
||
if ( | ||
key in self.columns | ||
|
@@ -4185,8 +4192,9 @@ def _set_item(self, key, value) -> None: | |
existing_piece = self[key] | ||
if isinstance(existing_piece, DataFrame): | ||
value = np.tile(value, (len(existing_piece.columns), 1)).T | ||
refs = None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was wondering if this would still be a view in case we are only setting a single column (i.e. |
||
|
||
self._set_item_mgr(key, value) | ||
self._set_item_mgr(key, value, refs) | ||
|
||
def _set_value( | ||
self, index: IndexLabel, col, value: Scalar, takeable: bool = False | ||
|
@@ -4813,7 +4821,7 @@ def insert( | |
elif isinstance(value, DataFrame): | ||
value = value.iloc[:, 0] | ||
|
||
value = self._sanitize_column(value) | ||
value, _ = self._sanitize_column(value) | ||
self._mgr.insert(loc, column, value) | ||
|
||
def assign(self, **kwargs) -> DataFrame: | ||
|
@@ -4884,7 +4892,9 @@ def assign(self, **kwargs) -> DataFrame: | |
data[k] = com.apply_if_callable(v, data) | ||
return data | ||
|
||
def _sanitize_column(self, value) -> ArrayLike: | ||
def _sanitize_column( | ||
self, value, using_cow: bool = False | ||
) -> tuple[ArrayLike, BlockValuesRefs | None]: | ||
""" | ||
Ensures new columns (which go into the BlockManager as new blocks) are | ||
always copied and converted into an array. | ||
|
@@ -4902,11 +4912,13 @@ def _sanitize_column(self, value) -> ArrayLike: | |
# Using a DataFrame would mean coercing values to one dtype | ||
assert not isinstance(value, DataFrame) | ||
if is_dict_like(value): | ||
return _reindex_for_setitem(Series(value), self.index) | ||
if not isinstance(value, Series): | ||
value = Series(value) | ||
return _reindex_for_setitem(value, self.index, using_cow=using_cow) | ||
|
||
if is_list_like(value): | ||
com.require_length_match(value, self.index) | ||
return sanitize_array(value, self.index, copy=True, allow_2d=True) | ||
return sanitize_array(value, self.index, copy=True, allow_2d=True), None | ||
|
||
@property | ||
def _series(self): | ||
|
@@ -12031,11 +12043,15 @@ def _from_nested_dict(data) -> collections.defaultdict: | |
return new_data | ||
|
||
|
||
def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike: | ||
def _reindex_for_setitem( | ||
value: DataFrame | Series, index: Index, using_cow: bool = False | ||
) -> tuple[ArrayLike, BlockValuesRefs | None]: | ||
# reindex if necessary | ||
|
||
if value.index.equals(index) or not len(index): | ||
return value._values.copy() | ||
if using_cow and isinstance(value, Series): | ||
return value._values, value._references | ||
return value._values.copy(), None | ||
|
||
# GH#4107 | ||
try: | ||
|
@@ -12049,4 +12065,4 @@ def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike: | |
raise TypeError( | ||
"incompatible index of inserted column with frame index" | ||
) from err | ||
return reindexed_value | ||
return reindexed_value, None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we return here the result of |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -977,28 +977,24 @@ def test_column_as_series_no_item_cache( | |
# TODO add tests for other indexing methods on the Series | ||
|
||
|
||
def test_dataframe_add_column_from_series(backend): | ||
def test_dataframe_add_column_from_series(backend, using_copy_on_write): | ||
# Case: adding a new column to a DataFrame from an existing column/series | ||
# -> always already takes a copy on assignment | ||
# (no change in behaviour here) | ||
# TODO can we achieve the same behaviour with Copy-on-Write? | ||
# -> delays copy under CoW | ||
_, DataFrame, Series = backend | ||
df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) | ||
|
||
s = Series([10, 11, 12]) | ||
df["new"] = s | ||
assert not np.shares_memory(get_array(df, "new"), s.values) | ||
if using_copy_on_write: | ||
assert np.shares_memory(get_array(df, "new"), get_array(s)) | ||
else: | ||
assert not np.shares_memory(get_array(df, "new"), get_array(s)) | ||
|
||
# editing series -> doesn't modify column in frame | ||
s[0] = 0 | ||
expected = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "new": [10, 11, 12]}) | ||
tm.assert_frame_equal(df, expected) | ||
|
||
# editing column in frame -> doesn't modify series | ||
df.loc[2, "new"] = 100 | ||
expected_s = Series([0, 11, 12]) | ||
tm.assert_series_equal(s, expected_s) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Was there a reason to remove this? Or just that testing the modification in both ways ideally is done by recreating the original There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah the copy was already triggered in the step before, so this part of the test was useless since the objects didn't share memory anyway |
||
|
||
|
||
@pytest.mark.parametrize("val", [100, "a"]) | ||
@pytest.mark.parametrize( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
abstraction-wise ive been thinking of refs as being handled at the Manager/Block level. is this wrong? is there a viable way to keep this at that level?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not if we don't want to move sanitise column to the same level. We have to keep track of the refs when we extract the array from values to set