From 7c5513c276f92603046bc3820fa2ba1fc90c0347 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 28 Dec 2022 23:24:19 +0100 Subject: [PATCH 1/9] ENH: Add lazy copy for take and between_time --- pandas/core/generic.py | 2 ++ pandas/core/series.py | 5 ++++ pandas/tests/copy_view/test_methods.py | 34 ++++++++++++++++++++++++++ 3 files changed, 41 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c893e9ce3d9a9..f9dffc9392585 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3779,6 +3779,8 @@ def _take( See the docstring of `take` for full explanation of the parameters. """ + if axis == 0 and np.array_equal(indices, np.arange(0, len(self))): + return self.copy(deep=None) new_data = self._mgr.take( indices, diff --git a/pandas/core/series.py b/pandas/core/series.py index b69fb4c1b58aa..2850b339700d3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -143,6 +143,7 @@ SingleArrayManager, SingleBlockManager, ) +from pandas.core.internals.managers import _using_copy_on_write from pandas.core.shared_docs import _shared_docs from pandas.core.sorting import ( ensure_key_mapped, @@ -877,6 +878,10 @@ def take(self, indices, axis: Axis = 0, **kwargs) -> Series: nv.validate_take((), kwargs) indices = ensure_platform_int(indices) + + if _using_copy_on_write() and np.array_equal(indices, np.arange(0, len(self))): + return self.copy(deep=None) + new_index = self.index.take(indices) new_values = self._values.take(indices) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 878f1d8089d33..aa84f31ee85da 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -5,6 +5,7 @@ DataFrame, MultiIndex, Series, + date_range, ) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -387,6 +388,39 @@ def test_assign_drop_duplicates(using_copy_on_write, method): tm.assert_frame_equal(df, df_orig) +@pytest.mark.parametrize("obj", [Series([1, 2]), DataFrame({"a": [1, 2]})]) +def test_take(using_copy_on_write, obj): + obj_orig = obj.copy() + obj2 = obj.take([0, 1]) + + if using_copy_on_write: + assert np.shares_memory(obj2.values, obj.values) + else: + assert not np.shares_memory(obj2.values, obj.values) + + obj2.iloc[0] = 0 + if using_copy_on_write: + assert not np.shares_memory(obj2.values, obj.values) + tm.assert_equal(obj, obj_orig) + + +@pytest.mark.parametrize("obj", [Series([1, 2]), DataFrame({"a": [1, 2]})]) +def test_between_time(using_copy_on_write, obj): + obj.index = date_range("2018-04-09", periods=2, freq="1D20min") + obj_orig = obj.copy() + obj2 = obj.between_time("0:00", "1:00") + + if using_copy_on_write: + assert np.shares_memory(obj2.values, obj.values) + else: + assert not np.shares_memory(obj2.values, obj.values) + + obj2.iloc[0] = 0 + if using_copy_on_write: + assert not np.shares_memory(obj2.values, obj.values) + tm.assert_equal(obj, obj_orig) + + def test_reindex_like(using_copy_on_write): df = DataFrame({"a": [1, 2], "b": "a"}) other = DataFrame({"b": "a", "a": [1, 2]}) From 700be46ff9a3a090428970ce1586e85def84c6af Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 6 Jan 2023 19:47:29 +0100 Subject: [PATCH 2/9] Use array equal fast --- pandas/core/generic.py | 10 +++++++++- pandas/core/series.py | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 858c80c163284..ee11666112617 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -32,6 +32,7 @@ from pandas._config import config from pandas._libs import lib +from pandas._libs.lib import array_equal_fast from pandas._libs.tslibs import ( Period, Tick, @@ -3789,7 +3790,14 @@ def _take( See the docstring of `take` for full explanation of the parameters. """ - if axis == 0 and np.array_equal(indices, np.arange(0, len(self))): + if ( + axis == 0 + and using_copy_on_write() + and array_equal_fast( + np.asarray(indices, dtype=np.intp), + np.arange(0, len(self), dtype=np.intp), + ) + ): return self.copy(deep=None) new_data = self._mgr.take( diff --git a/pandas/core/series.py b/pandas/core/series.py index aa32078047a6e..dead0ba9b176c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -143,7 +143,7 @@ SingleArrayManager, SingleBlockManager, ) -from pandas.core.internals.managers import _using_copy_on_write +from pandas.core.internals.managers import using_copy_on_write from pandas.core.shared_docs import _shared_docs from pandas.core.sorting import ( ensure_key_mapped, @@ -878,7 +878,7 @@ def take(self, indices, axis: Axis = 0, **kwargs) -> Series: indices = ensure_platform_int(indices) - if _using_copy_on_write() and np.array_equal(indices, np.arange(0, len(self))): + if using_copy_on_write() and np.array_equal(indices, np.arange(0, len(self))): return self.copy(deep=None) new_index = self.index.take(indices) From 697bb14bbf444c5c2b2c8cf98340172a2c73f9db Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 6 Jan 2023 22:53:37 +0100 Subject: [PATCH 3/9] Fix cond --- pandas/core/generic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ee11666112617..270f11412b877 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3790,11 +3790,13 @@ def _take( See the docstring of `take` for full explanation of the parameters. """ + indices = np.asarray(indices, dtype=np.intp) if ( axis == 0 + and indices.ndim == 1 and using_copy_on_write() and array_equal_fast( - np.asarray(indices, dtype=np.intp), + indices, np.arange(0, len(self), dtype=np.intp), ) ): From f69411372fd3be8b4089bd2e8437d7cd8885b81c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 6 Jan 2023 22:56:32 +0100 Subject: [PATCH 4/9] Fix condition --- pandas/core/generic.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 270f11412b877..d87075e0dd9de 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3790,17 +3790,18 @@ def _take( See the docstring of `take` for full explanation of the parameters. """ - indices = np.asarray(indices, dtype=np.intp) - if ( - axis == 0 - and indices.ndim == 1 - and using_copy_on_write() - and array_equal_fast( - indices, - np.arange(0, len(self), dtype=np.intp), - ) - ): - return self.copy(deep=None) + if not isinstance(indices, slice): + indices = np.asarray(indices, dtype=np.intp) + if ( + axis == 0 + and indices.ndim == 1 + and using_copy_on_write() + and array_equal_fast( + indices, + np.arange(0, len(self), dtype=np.intp), + ) + ): + return self.copy(deep=None) new_data = self._mgr.take( indices, From d66e5e3e53e0a13ac2a0f2bc9c30f8b243eacd4c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 7 Jan 2023 12:38:44 +0100 Subject: [PATCH 5/9] Use array_equal_fast --- pandas/core/series.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index dead0ba9b176c..5daf543f2729d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -29,7 +29,10 @@ properties, reshape, ) -from pandas._libs.lib import no_default +from pandas._libs.lib import ( + array_equal_fast, + no_default, +) from pandas._typing import ( AggFuncType, AlignJoin, @@ -878,7 +881,7 @@ def take(self, indices, axis: Axis = 0, **kwargs) -> Series: indices = ensure_platform_int(indices) - if using_copy_on_write() and np.array_equal(indices, np.arange(0, len(self))): + if using_copy_on_write() and array_equal_fast(indices, np.arange(0, len(self))): return self.copy(deep=None) new_index = self.index.take(indices) From 258b1bc0c4a78aa8284c132149e2db16bcd765b8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 7 Jan 2023 15:41:06 +0100 Subject: [PATCH 6/9] Update pandas/core/series.py Co-authored-by: Joris Van den Bossche --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 5daf543f2729d..589f56bcbc4a9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -881,7 +881,7 @@ def take(self, indices, axis: Axis = 0, **kwargs) -> Series: indices = ensure_platform_int(indices) - if using_copy_on_write() and array_equal_fast(indices, np.arange(0, len(self))): + if using_copy_on_write() and array_equal_fast(indices, np.arange(0, len(self), dtype=indices.dtype)): return self.copy(deep=None) new_index = self.index.take(indices) From 36287a35584ab02f3e53be883bf22e1270687da2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 7 Jan 2023 15:49:40 +0100 Subject: [PATCH 7/9] Fix test --- pandas/core/series.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 589f56bcbc4a9..8f61ab20171db 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -881,7 +881,11 @@ def take(self, indices, axis: Axis = 0, **kwargs) -> Series: indices = ensure_platform_int(indices) - if using_copy_on_write() and array_equal_fast(indices, np.arange(0, len(self), dtype=indices.dtype)): + if ( + indices.ndim == 1 + and using_copy_on_write() + and array_equal_fast(indices, np.arange(0, len(self), dtype=indices.dtype)) + ): return self.copy(deep=None) new_index = self.index.take(indices) From eabf2fac1164bd2c5ce7c0d1df4131da58502843 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 11 Jan 2023 13:47:27 +0100 Subject: [PATCH 8/9] Add comment --- pandas/tests/copy_view/test_methods.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 3899e36ea1623..446d2d44b02cd 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -483,6 +483,7 @@ def test_assign_drop_duplicates(using_copy_on_write, method): @pytest.mark.parametrize("obj", [Series([1, 2]), DataFrame({"a": [1, 2]})]) def test_take(using_copy_on_write, obj): + # Check that no copy is made when we take all rows in original order obj_orig = obj.copy() obj2 = obj.take([0, 1]) From 64afeb2d5ac2f97d2867792675badae31cbc8a84 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 12 Jan 2023 23:14:51 +0100 Subject: [PATCH 9/9] Remove import --- pandas/core/series.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index beef8c0217218..c38eb4e7c5d34 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -149,7 +149,6 @@ SingleArrayManager, SingleBlockManager, ) -from pandas.core.internals.managers import using_copy_on_write from pandas.core.shared_docs import _shared_docs from pandas.core.sorting import ( ensure_key_mapped,