Skip to content

ENH: Add lazy copy for take and between_time #50476

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Jan 13, 2023
13 changes: 13 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from pandas._config import config

from pandas._libs import lib
from pandas._libs.lib import array_equal_fast
from pandas._libs.tslibs import (
Period,
Tick,
Expand Down Expand Up @@ -3789,6 +3790,18 @@ def _take(

See the docstring of `take` for full explanation of the parameters.
"""
if not isinstance(indices, slice):
indices = np.asarray(indices, dtype=np.intp)
if (
axis == 0
and indices.ndim == 1
and using_copy_on_write()
and array_equal_fast(
indices,
np.arange(0, len(self), dtype=np.intp),
Comment on lines +3764 to +3766
Copy link
Member

@jorisvandenbossche jorisvandenbossche Jan 11, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Further idea to potentially speed this up: right now we only use array_equal_fast to check an array against a standard 0...n indexer, I think?
For that specific case, we don't actually need to create this second array, but could just use the iteration variable inside array_equal_fast (it only needs the length).

(now, I don't know if we would want to start using array_equal_fast for other cases)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(that also avoids creating this additional array even for the fast cases where the length doesn't even match)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’ll take a look at this in a follow up if ok to check how big the performance improvement would be

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure

)
):
return self.copy(deep=None)

new_data = self._mgr.take(
indices,
Expand Down
14 changes: 13 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,10 @@
properties,
reshape,
)
from pandas._libs.lib import no_default
from pandas._libs.lib import (
array_equal_fast,
no_default,
)
from pandas._typing import (
AggFuncType,
AlignJoin,
Expand Down Expand Up @@ -143,6 +146,7 @@
SingleArrayManager,
SingleBlockManager,
)
from pandas.core.internals.managers import using_copy_on_write
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import (
ensure_key_mapped,
Expand Down Expand Up @@ -876,6 +880,14 @@ def take(self, indices, axis: Axis = 0, **kwargs) -> Series:
nv.validate_take((), kwargs)

indices = ensure_platform_int(indices)

if (
indices.ndim == 1
and using_copy_on_write()
and array_equal_fast(indices, np.arange(0, len(self), dtype=indices.dtype))
):
return self.copy(deep=None)

new_index = self.index.take(indices)
new_values = self._values.take(indices)

Expand Down
33 changes: 33 additions & 0 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,39 @@ def test_assign_drop_duplicates(using_copy_on_write, method):
tm.assert_frame_equal(df, df_orig)


@pytest.mark.parametrize("obj", [Series([1, 2]), DataFrame({"a": [1, 2]})])
def test_take(using_copy_on_write, obj):
obj_orig = obj.copy()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Small nitpick: can you add a comment here that is testing the corner case of taking all rows? (because in general take always by definition returns a copy)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added

obj2 = obj.take([0, 1])

if using_copy_on_write:
assert np.shares_memory(obj2.values, obj.values)
else:
assert not np.shares_memory(obj2.values, obj.values)

obj2.iloc[0] = 0
if using_copy_on_write:
assert not np.shares_memory(obj2.values, obj.values)
tm.assert_equal(obj, obj_orig)


@pytest.mark.parametrize("obj", [Series([1, 2]), DataFrame({"a": [1, 2]})])
def test_between_time(using_copy_on_write, obj):
obj.index = date_range("2018-04-09", periods=2, freq="1D20min")
obj_orig = obj.copy()
obj2 = obj.between_time("0:00", "1:00")

if using_copy_on_write:
assert np.shares_memory(obj2.values, obj.values)
else:
assert not np.shares_memory(obj2.values, obj.values)

obj2.iloc[0] = 0
if using_copy_on_write:
assert not np.shares_memory(obj2.values, obj.values)
tm.assert_equal(obj, obj_orig)


def test_reindex_like(using_copy_on_write):
df = DataFrame({"a": [1, 2], "b": "a"})
other = DataFrame({"b": "a", "a": [1, 2]})
Expand Down