-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Add lazy copy for take and between_time #50476
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
7c5513c
a21a2f3
7d48fea
700be46
697bb14
f694113
d66e5e3
258b1bc
36287a3
eabf2fa
297a828
64afeb2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
DataFrame, | ||
MultiIndex, | ||
Series, | ||
date_range, | ||
) | ||
import pandas._testing as tm | ||
from pandas.tests.copy_view.util import get_array | ||
|
@@ -387,6 +388,39 @@ def test_assign_drop_duplicates(using_copy_on_write, method): | |
tm.assert_frame_equal(df, df_orig) | ||
|
||
|
||
@pytest.mark.parametrize("obj", [Series([1, 2]), DataFrame({"a": [1, 2]})]) | ||
def test_take(using_copy_on_write, obj): | ||
obj_orig = obj.copy() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Small nitpick: can you add a comment here that is testing the corner case of taking all rows? (because in general take always by definition returns a copy) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added |
||
obj2 = obj.take([0, 1]) | ||
|
||
if using_copy_on_write: | ||
assert np.shares_memory(obj2.values, obj.values) | ||
else: | ||
assert not np.shares_memory(obj2.values, obj.values) | ||
|
||
obj2.iloc[0] = 0 | ||
if using_copy_on_write: | ||
assert not np.shares_memory(obj2.values, obj.values) | ||
tm.assert_equal(obj, obj_orig) | ||
|
||
|
||
@pytest.mark.parametrize("obj", [Series([1, 2]), DataFrame({"a": [1, 2]})]) | ||
def test_between_time(using_copy_on_write, obj): | ||
obj.index = date_range("2018-04-09", periods=2, freq="1D20min") | ||
obj_orig = obj.copy() | ||
obj2 = obj.between_time("0:00", "1:00") | ||
|
||
if using_copy_on_write: | ||
assert np.shares_memory(obj2.values, obj.values) | ||
else: | ||
assert not np.shares_memory(obj2.values, obj.values) | ||
|
||
obj2.iloc[0] = 0 | ||
if using_copy_on_write: | ||
assert not np.shares_memory(obj2.values, obj.values) | ||
tm.assert_equal(obj, obj_orig) | ||
|
||
|
||
def test_reindex_like(using_copy_on_write): | ||
df = DataFrame({"a": [1, 2], "b": "a"}) | ||
other = DataFrame({"b": "a", "a": [1, 2]}) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we do a quick check of how costly this is compared to the actual
take
? (to ensure we don't introduce a performance regression for the other cases)Maybe could also first check if the length of
indices
is equal tolen(self)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
np.array_equal
actually already does that, so doing that ourselves is not neededThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I profiled this, the cost is reduced if the DataFrame gets larger.
The actual problem is a bit different though: array_equal has no early exit, even if the first value is different, it checks the whole array (wanted to bring this up tomorrow as well). We should probably write something ourselves, because I guess we will need that a couple of times.