From d9feafcf7a720eb5cb82b4973f6508cd11947751 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 21 Jan 2022 13:28:27 +0100 Subject: [PATCH 1/6] BUG: df.getitem returning copy instead of view for unique column in dup index --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/core/frame.py | 7 ++++--- pandas/tests/frame/indexing/test_getitem.py | 8 ++++++++ pandas/tests/indexing/test_chaining_and_caching.py | 13 ------------- 4 files changed, 13 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 1ae76984484af..f1ba0e46455b0 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -210,6 +210,7 @@ Indexing ^^^^^^^^ - Bug in :meth:`loc.__getitem__` with a list of keys causing an internal inconsistency that could lead to a disconnect between ``frame.at[x, y]`` vs ``frame[y].loc[x]`` (:issue:`22372`) - Bug in :meth:`DataFrame.iloc` where indexing a single row on a :class:`DataFrame` with a single ExtensionDtype column gave a copy instead of a view on the underlying data (:issue:`45241`) +- Bug in :meth:`DataFrame.__getitem__` returning copy when :class:`DataFrame` has duplicated columns even if a unique column is selected (:issue:`45316`) - Bug in :meth:`Series.__setitem__` with a non-integer :class:`Index` when using an integer key to set a value that cannot be set inplace where a ``ValueError`` was raised insead of casting to a common dtype (:issue:`45070`) - Bug when setting a value too large for a :class:`Series` dtype failing to coerce to a common type (:issue:`26049`, :issue:`32878`) - diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 88b848f34fbe5..00b995aa5ff71 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3481,11 +3481,12 @@ def __getitem__(self, key): if is_hashable(key) and not is_iterator(key): # is_iterator to exclude generator e.g. test_getitem_listlike # shortcut if the key is in columns - if self.columns.is_unique and key in self.columns: - if isinstance(self.columns, MultiIndex): - return self._getitem_multilevel(key) + is_mi = isinstance(self.columns, MultiIndex) + if not is_mi and key in self.columns.drop_duplicates(keep=False): return self._get_item_cache(key) + elif is_mi and self.columns.is_unique and key in self.columns: + return self._getitem_multilevel(key) # Do we have a slicer (on rows)? indexer = convert_to_index_sliceable(self, key) if indexer is not None: diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index 0d4ab84175aab..b2fc440be991f 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -350,6 +350,14 @@ def test_getitem_empty_frame_with_boolean(self): df2 = df[df > 0] tm.assert_frame_equal(df, df2) + def test_getitem_returns_view_when_column_is_unique_in_df(self): + # GH#45316 + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) + view = df["b"] + view.loc[:] = 100 + expected = DataFrame([[1, 2, 100], [4, 5, 100]], columns=["a", "a", "b"]) + tm.assert_frame_equal(df, expected) + class TestGetitemSlice: def test_getitem_slice_float64(self, frame_or_series): diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index c8837a617bd9a..7a75a2ef99759 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -421,19 +421,6 @@ def test_detect_chained_assignment_warnings_errors(self): with pytest.raises(com.SettingWithCopyError, match=msg): df.loc[0]["A"] = 111 - def test_detect_chained_assignment_warnings_filter_and_dupe_cols(self): - # xref gh-13017. - with option_context("chained_assignment", "warn"): - df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"]) - - with tm.assert_produces_warning(com.SettingWithCopyWarning): - df.c.loc[df.c > 0] = None - - expected = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"] - ) - tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize("rhs", [3, DataFrame({0: [1, 2, 3, 4]})]) def test_detect_chained_assignment_warning_stacklevel(self, rhs): # GH#42570 From 9a565421f7e0504cafe96dac6ff09d6bdc4c9d6f Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 23 Jan 2022 19:38:31 +0100 Subject: [PATCH 2/6] Add comment --- pandas/core/frame.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 00b995aa5ff71..ee08cb1a0b804 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3482,7 +3482,11 @@ def __getitem__(self, key): # is_iterator to exclude generator e.g. test_getitem_listlike # shortcut if the key is in columns is_mi = isinstance(self.columns, MultiIndex) - if not is_mi and key in self.columns.drop_duplicates(keep=False): + # GH#45316 Return view if key is not duplicated + if not is_mi and ( + self.columns.is_unique + or key in self.columns.drop_duplicates(keep=False) + ): return self._get_item_cache(key) elif is_mi and self.columns.is_unique and key in self.columns: From 910216caa7e39b6c41a61b73ec3bca6cf6993731 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 23 Jan 2022 20:33:30 +0100 Subject: [PATCH 3/6] Fix check --- pandas/core/frame.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ee08cb1a0b804..38a04f623193f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3485,6 +3485,7 @@ def __getitem__(self, key): # GH#45316 Return view if key is not duplicated if not is_mi and ( self.columns.is_unique + and key in self.columns or key in self.columns.drop_duplicates(keep=False) ): return self._get_item_cache(key) From de0fd5277e6161ff72fdc0b1c8d3be1c9dffd65e Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 15 Feb 2022 22:18:28 +0100 Subject: [PATCH 4/6] Add test --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/tests/frame/indexing/test_getitem.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 0e49ba3a0f9e0..993db2ad99531 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -316,7 +316,7 @@ Indexing ^^^^^^^^ - Bug in :meth:`loc.__getitem__` with a list of keys causing an internal inconsistency that could lead to a disconnect between ``frame.at[x, y]`` vs ``frame[y].loc[x]`` (:issue:`22372`) - Bug in :meth:`DataFrame.iloc` where indexing a single row on a :class:`DataFrame` with a single ExtensionDtype column gave a copy instead of a view on the underlying data (:issue:`45241`) -- Bug in :meth:`DataFrame.__getitem__` returning copy when :class:`DataFrame` has duplicated columns even if a unique column is selected (:issue:`45316`) +- Bug in :meth:`DataFrame.__getitem__` returning copy when :class:`DataFrame` has duplicated columns even if a unique column is selected (:issue:`45316`, :issue:`41062`) - Bug in setting a NA value (``None`` or ``np.nan``) into a :class:`Series` with int-based :class:`IntervalDtype` incorrectly casting to object dtype instead of a float-based :class:`IntervalDtype` (:issue:`45568`) - Bug in :meth:`Series.__setitem__` with a non-integer :class:`Index` when using an integer key to set a value that cannot be set inplace where a ``ValueError`` was raised instead of casting to a common dtype (:issue:`45070`) - Bug in :meth:`Series.__setitem__` when setting incompatible values into a ``PeriodDtype`` or ``IntervalDtype`` :class:`Series` raising when indexing with a boolean mask but coercing when indexing with otherwise-equivalent indexers; these now consistently coerce, along with :meth:`Series.mask` and :meth:`Series.where` (:issue:`45768`) diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index b2fc440be991f..4fcf222f0bb4d 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -358,6 +358,13 @@ def test_getitem_returns_view_when_column_is_unique_in_df(self): expected = DataFrame([[1, 2, 100], [4, 5, 100]], columns=["a", "a", "b"]) tm.assert_frame_equal(df, expected) + def test_getitem_frozenset_unique_in_column(self): + # GH#41062 + df = DataFrame([[1, 2, 3, 4]], columns=[frozenset(["KEY"]), "B", "C", "C"]) + result = df[frozenset(["KEY"])] + expected = Series([1], name=frozenset(["KEY"])) + tm.assert_series_equal(result, expected) + class TestGetitemSlice: def test_getitem_slice_float64(self, frame_or_series): From 367bdaa2f507ec792023840ac14ba842081300e2 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 14 Jun 2022 10:53:16 +0200 Subject: [PATCH 5/6] Remove test again --- pandas/tests/indexing/test_chaining_and_caching.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 47f929d87bd6f..adc001695579c 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -424,19 +424,6 @@ def test_detect_chained_assignment_warnings_errors(self): with pytest.raises(SettingWithCopyError, match=msg): df.loc[0]["A"] = 111 - def test_detect_chained_assignment_warnings_filter_and_dupe_cols(self): - # xref gh-13017. - with option_context("chained_assignment", "warn"): - df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"]) - - with tm.assert_produces_warning(SettingWithCopyWarning): - df.c.loc[df.c > 0] = None - - expected = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"] - ) - tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize("rhs", [3, DataFrame({0: [1, 2, 3, 4]})]) def test_detect_chained_assignment_warning_stacklevel(self, rhs): # GH#42570 From 491c7bade1824a39d8390c54db8a16ddb8853e19 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 14 Jun 2022 18:53:14 +0200 Subject: [PATCH 6/6] Comment --- pandas/core/frame.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8dbc6db9e46b2..25b6e8a1ce3f9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3614,6 +3614,7 @@ def __getitem__(self, key): # shortcut if the key is in columns is_mi = isinstance(self.columns, MultiIndex) # GH#45316 Return view if key is not duplicated + # Only use drop_duplicates with duplicates for performance if not is_mi and ( self.columns.is_unique and key in self.columns