-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
CoW warning mode: add warning for single block setitem #55838
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
e55b985
5a5a1d5
b570356
8b25667
956da9d
92044e6
a356b8d
b6ed119
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12392,7 +12392,7 @@ def _inplace_method(self, other, op) -> Self: | |
""" | ||
warn = True | ||
if not PYPY and warn_copy_on_write(): | ||
if sys.getrefcount(self) <= 5: | ||
if sys.getrefcount(self) <= 4: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How did you arrive at the conclusion to change this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From testing in practice, but my assumption is that it is the consequence of removing the item cache, which means one reference less (so this change sounds logical in hindsight) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah yeah that makes a lot of sense Thx |
||
# we are probably in an inplace setitem context (e.g. df['a'] += 1) | ||
warn = False | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,11 +19,12 @@ def index_view(index_data=[1, 2]): | |
return idx, view | ||
|
||
|
||
def test_set_index_update_column(using_copy_on_write): | ||
def test_set_index_update_column(using_copy_on_write, warn_copy_on_write): | ||
df = DataFrame({"a": [1, 2], "b": 1}) | ||
df = df.set_index("a", drop=False) | ||
expected = df.index.copy(deep=True) | ||
df.iloc[0, 0] = 100 | ||
with tm.assert_cow_warning(warn_copy_on_write): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This looks like a false positive to me, shouldn't set_index copy the data in non-Cow? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
You would assume so, but no .. (that's kind of a horrible bug I would say):
See #42934 (it seems I actually have an old draft PR for this) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yikes this is terrible and unexpected. |
||
df.iloc[0, 0] = 100 | ||
if using_copy_on_write: | ||
tm.assert_index_equal(df.index, expected) | ||
else: | ||
|
@@ -39,49 +40,53 @@ def test_set_index_drop_update_column(using_copy_on_write): | |
tm.assert_index_equal(df.index, expected) | ||
|
||
|
||
def test_set_index_series(using_copy_on_write): | ||
def test_set_index_series(using_copy_on_write, warn_copy_on_write): | ||
df = DataFrame({"a": [1, 2], "b": 1.5}) | ||
ser = Series([10, 11]) | ||
df = df.set_index(ser) | ||
expected = df.index.copy(deep=True) | ||
ser.iloc[0] = 100 | ||
with tm.assert_cow_warning(warn_copy_on_write): | ||
ser.iloc[0] = 100 | ||
if using_copy_on_write: | ||
tm.assert_index_equal(df.index, expected) | ||
else: | ||
tm.assert_index_equal(df.index, Index([100, 11])) | ||
|
||
|
||
def test_assign_index_as_series(using_copy_on_write): | ||
def test_assign_index_as_series(using_copy_on_write, warn_copy_on_write): | ||
df = DataFrame({"a": [1, 2], "b": 1.5}) | ||
ser = Series([10, 11]) | ||
df.index = ser | ||
expected = df.index.copy(deep=True) | ||
ser.iloc[0] = 100 | ||
with tm.assert_cow_warning(warn_copy_on_write): | ||
ser.iloc[0] = 100 | ||
if using_copy_on_write: | ||
tm.assert_index_equal(df.index, expected) | ||
else: | ||
tm.assert_index_equal(df.index, Index([100, 11])) | ||
|
||
|
||
def test_assign_index_as_index(using_copy_on_write): | ||
def test_assign_index_as_index(using_copy_on_write, warn_copy_on_write): | ||
df = DataFrame({"a": [1, 2], "b": 1.5}) | ||
ser = Series([10, 11]) | ||
rhs_index = Index(ser) | ||
df.index = rhs_index | ||
rhs_index = None # overwrite to clear reference | ||
expected = df.index.copy(deep=True) | ||
ser.iloc[0] = 100 | ||
with tm.assert_cow_warning(warn_copy_on_write): | ||
ser.iloc[0] = 100 | ||
if using_copy_on_write: | ||
tm.assert_index_equal(df.index, expected) | ||
else: | ||
tm.assert_index_equal(df.index, Index([100, 11])) | ||
|
||
|
||
def test_index_from_series(using_copy_on_write): | ||
def test_index_from_series(using_copy_on_write, warn_copy_on_write): | ||
ser = Series([1, 2]) | ||
idx = Index(ser) | ||
expected = idx.copy(deep=True) | ||
ser.iloc[0] = 100 | ||
with tm.assert_cow_warning(warn_copy_on_write): | ||
ser.iloc[0] = 100 | ||
if using_copy_on_write: | ||
tm.assert_index_equal(idx, expected) | ||
else: | ||
|
@@ -96,12 +101,13 @@ def test_index_from_series_copy(using_copy_on_write): | |
assert np.shares_memory(get_array(ser), arr) | ||
|
||
|
||
def test_index_from_index(using_copy_on_write): | ||
def test_index_from_index(using_copy_on_write, warn_copy_on_write): | ||
ser = Series([1, 2]) | ||
idx = Index(ser) | ||
idx = Index(idx) | ||
expected = idx.copy(deep=True) | ||
ser.iloc[0] = 100 | ||
with tm.assert_cow_warning(warn_copy_on_write): | ||
ser.iloc[0] = 100 | ||
if using_copy_on_write: | ||
tm.assert_index_equal(idx, expected) | ||
else: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I needed to make a decision here about what to do with the item cache in the warning mode.
In the end, the current version of the PR updates the warning mode to also not populate and use the item cache, like for Cow mode.
This will cause some behaviour changes when enabling the warning mode. But it also avoids a bunch of otherwise false-positive warnings. So it's a trade-off.
Examples that would otherwise give false positives:
or whenever we access columns under the hood in a method, like
df2 = df.astype({..})
, even when df2 fully consists of copied data because there was an actual cast,df
will have referenced because of the item cache.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fine by me