From 6c10ecbbb87f0169b451b6fe08956c69b47584ad Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 24 Dec 2022 19:26:06 +0100 Subject: [PATCH 1/2] ENH: Add lazy copy for drop duplicates --- pandas/core/frame.py | 7 ++++++- pandas/tests/copy_view/test_methods.py | 5 +++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 21b3a0c033702..be8dffbc686ed 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -205,6 +205,7 @@ to_arrays, treat_as_nested, ) +from pandas.core.internals.managers import _using_copy_on_write from pandas.core.reshape.melt import melt from pandas.core.series import Series from pandas.core.shared_docs import _shared_docs @@ -3718,6 +3719,10 @@ def _getitem_bool_array(self, key): # check_bool_indexer will throw exception if Series key cannot # be reindexed to match DataFrame rows key = check_bool_indexer(self.index, key) + + if _using_copy_on_write() and key.all(): + return self.copy(deep=None) + indexer = key.nonzero()[0] return self._take_with_is_copy(indexer, axis=0) @@ -6418,7 +6423,7 @@ def drop_duplicates( 4 Indomie pack 5.0 """ if self.empty: - return self.copy() + return self.copy(deep=None) inplace = validate_bool_kwarg(inplace, "inplace") ignore_index = validate_bool_kwarg(ignore_index, "ignore_index") diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index f5c7b31e59bc5..b3be215bcf000 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -322,10 +322,11 @@ def test_head_tail(method, using_copy_on_write): tm.assert_frame_equal(df, df_orig) -def test_assign(using_copy_on_write): +@pytest.mark.parametrize("method", ["assign", "drop_duplicates"]) +def test_assign_drop_duplicates(using_copy_on_write, method): df = DataFrame({"a": [1, 2, 3]}) df_orig = df.copy() - df2 = df.assign() + df2 = getattr(df, method)() df2._mgr._verify_integrity() if using_copy_on_write: From 04ccfa8a03a42da2e9d23da7cbab4b7941ef19dc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 24 Dec 2022 19:36:20 +0100 Subject: [PATCH 2/2] Improve performance --- pandas/core/frame.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index be8dffbc686ed..9702a5b374356 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -205,7 +205,6 @@ to_arrays, treat_as_nested, ) -from pandas.core.internals.managers import _using_copy_on_write from pandas.core.reshape.melt import melt from pandas.core.series import Series from pandas.core.shared_docs import _shared_docs @@ -3720,7 +3719,7 @@ def _getitem_bool_array(self, key): # be reindexed to match DataFrame rows key = check_bool_indexer(self.index, key) - if _using_copy_on_write() and key.all(): + if key.all(): return self.copy(deep=None) indexer = key.nonzero()[0]