From 9ce8713b3397de8b89d65bcee1733a7f01e3c06f Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 25 Jan 2021 21:29:03 +0100 Subject: [PATCH 1/9] Clean up DataFrame.setitem behavior for duplicate columns --- pandas/core/frame.py | 13 ++++++++++--- pandas/tests/frame/indexing/test_setitem.py | 10 +++++++++- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c7585b21abe99..ec67ae5e760b1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3211,6 +3211,14 @@ def _setitem_slice(self, key: slice, value): self._check_setitem_copy() self.iloc[key] = value + def _check_key_length(self, key, value: DataFrame): + if self.columns.is_unique: + if len(value.columns) != len(key): + raise ValueError("Columns must be same length as key") + else: + if len(self.columns.get_indexer_non_unique(key)[0]) != len(value.columns): + raise ValueError("Columns must be same length as key") + def _setitem_array(self, key, value): # also raises Exception if object array with NA values if com.is_bool_indexer(key): @@ -3223,9 +3231,8 @@ def _setitem_array(self, key, value): self._check_setitem_copy() self.iloc[indexer] = value else: - if isinstance(value, DataFrame) and self.columns.is_unique: - if len(value.columns) != len(key): - raise ValueError("Columns must be same length as key") + if isinstance(value, DataFrame): + self._check_key_length(key, value) for k1, k2 in zip(key, value.columns): self[k1] = value[k2] else: diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 4f8ac49cb17ec..244e4b2174d35 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -378,11 +378,19 @@ def test_setitem_df_wrong_column_number(self, cols): def test_setitem_listlike_indexer_duplicate_columns(self): # GH#38604 df = DataFrame([[1, 2, 3]], columns=["a", "b", "b"]) - rhs = DataFrame([[10, 11, 12]], columns=["d", "e", "c"]) + rhs = DataFrame([[10, 11, 12]], columns=["a", "b", "b"]) df[["a", "b"]] = rhs expected = DataFrame([[10, 11, 12]], columns=["a", "b", "b"]) tm.assert_frame_equal(df, expected) + def test_setitem_listlike_indexer_duplicate_columns_not_equal_length(self): + # GH# + df = DataFrame([[1, 2, 3]], columns=["a", "b", "b"]) + rhs = DataFrame([[10, 11]], columns=["a", "b"]) + msg = "Columns must be same length as key" + with pytest.raises(ValueError, match=msg): + df[["a", "b"]] = rhs + class TestDataFrameSetItemWithExpansion: def test_setitem_listlike_views(self): From d9c80261c8f50b6d3ad6a327ff943d714a73c4e2 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 25 Jan 2021 21:30:46 +0100 Subject: [PATCH 2/9] Add gh reference --- pandas/tests/frame/indexing/test_setitem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 244e4b2174d35..d507682ad6108 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -384,7 +384,7 @@ def test_setitem_listlike_indexer_duplicate_columns(self): tm.assert_frame_equal(df, expected) def test_setitem_listlike_indexer_duplicate_columns_not_equal_length(self): - # GH# + # GH#39403 df = DataFrame([[1, 2, 3]], columns=["a", "b", "b"]) rhs = DataFrame([[10, 11]], columns=["a", "b"]) msg = "Columns must be same length as key" From 4d579740bac3b9c26cd4dee68e89c2bb5a2bddca Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 26 Jan 2021 21:16:05 +0100 Subject: [PATCH 3/9] Move function --- pandas/core/frame.py | 11 ++--------- pandas/core/indexers.py | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ec67ae5e760b1..8791f309d67c0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -128,6 +128,7 @@ from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.construction import extract_array, sanitize_masked_array from pandas.core.generic import NDFrame, _shared_docs +from pandas.core.indexers import check_key_length from pandas.core.indexes import base as ibase from pandas.core.indexes.api import ( DatetimeIndex, @@ -3211,14 +3212,6 @@ def _setitem_slice(self, key: slice, value): self._check_setitem_copy() self.iloc[key] = value - def _check_key_length(self, key, value: DataFrame): - if self.columns.is_unique: - if len(value.columns) != len(key): - raise ValueError("Columns must be same length as key") - else: - if len(self.columns.get_indexer_non_unique(key)[0]) != len(value.columns): - raise ValueError("Columns must be same length as key") - def _setitem_array(self, key, value): # also raises Exception if object array with NA values if com.is_bool_indexer(key): @@ -3232,7 +3225,7 @@ def _setitem_array(self, key, value): self.iloc[indexer] = value else: if isinstance(value, DataFrame): - self._check_key_length(key, value) + check_key_length(self.columns, key, value) for k1, k2 in zip(key, value.columns): self[k1] = value[k2] else: diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 399953fc17c73..15fa9b1bd231b 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -1,6 +1,7 @@ """ Low-dependency indexing utilities. """ +from typing import TYPE_CHECKING import warnings import numpy as np @@ -17,6 +18,9 @@ ) from pandas.core.dtypes.generic import ABCIndex, ABCSeries +if TYPE_CHECKING: + from pandas.core.frame import DataFrame + # ----------------------------------------------------------- # Indexer Identification @@ -376,6 +380,25 @@ def unpack_1tuple(tup): return tup +def check_key_length(columns, key, value: DataFrame): + """Checks if a key used as indexer has the same length as the columns it is + associated with. + + Parameters + ---------- + columns: The columns of the DataFrame to index. + key: A list-like of keys to index with. + value: The value to set for the keys. + + """ + if columns.is_unique: + if len(value.columns) != len(key): + raise ValueError("Columns must be same length as key") + else: + if len(columns.get_indexer_non_unique(key)[0]) != len(value.columns): + raise ValueError("Columns must be same length as key") + + # ----------------------------------------------------------- # Public indexer validation From d0bdc78fbeaf25732b797079da6415cefca10cb9 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 26 Jan 2021 21:34:58 +0100 Subject: [PATCH 4/9] Remove type --- pandas/core/indexers.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 15fa9b1bd231b..0107acd532f43 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -1,7 +1,6 @@ """ Low-dependency indexing utilities. """ -from typing import TYPE_CHECKING import warnings import numpy as np @@ -18,9 +17,6 @@ ) from pandas.core.dtypes.generic import ABCIndex, ABCSeries -if TYPE_CHECKING: - from pandas.core.frame import DataFrame - # ----------------------------------------------------------- # Indexer Identification @@ -380,7 +376,7 @@ def unpack_1tuple(tup): return tup -def check_key_length(columns, key, value: DataFrame): +def check_key_length(columns, key, value): """Checks if a key used as indexer has the same length as the columns it is associated with. From 6f5f419b8a78ddc9062254b9cb4280cb1f415d71 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 27 Jan 2021 19:43:58 +0100 Subject: [PATCH 5/9] Add note --- pandas/core/indexers.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 0107acd532f43..f2f631f82c4f1 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -377,12 +377,13 @@ def unpack_1tuple(tup): def check_key_length(columns, key, value): - """Checks if a key used as indexer has the same length as the columns it is + """ + Checks if a key used as indexer has the same length as the columns it is associated with. Parameters ---------- - columns: The columns of the DataFrame to index. + columns: Index The columns of the DataFrame to index. key: A list-like of keys to index with. value: The value to set for the keys. @@ -391,6 +392,7 @@ def check_key_length(columns, key, value): if len(value.columns) != len(key): raise ValueError("Columns must be same length as key") else: + # Missing keys in columns are represented as -1 if len(columns.get_indexer_non_unique(key)[0]) != len(value.columns): raise ValueError("Columns must be same length as key") From e0c047ef692146c060e559b0bee073ae4c179e9a Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 27 Jan 2021 19:45:55 +0100 Subject: [PATCH 6/9] Adjust test --- pandas/tests/frame/indexing/test_setitem.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index d507682ad6108..9318764a1b5ad 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -383,6 +383,10 @@ def test_setitem_listlike_indexer_duplicate_columns(self): expected = DataFrame([[10, 11, 12]], columns=["a", "b", "b"]) tm.assert_frame_equal(df, expected) + df[["c", "b"]] = rhs + expected = DataFrame([[10, 11, 12, 10]], columns=["a", "b", "b", "c"]) + tm.assert_frame_equal(df, expected) + def test_setitem_listlike_indexer_duplicate_columns_not_equal_length(self): # GH#39403 df = DataFrame([[1, 2, 3]], columns=["a", "b", "b"]) From 9a82e6c75d315e5b9f790997c7d7bb2726ea97e4 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 28 Jan 2021 19:57:43 +0100 Subject: [PATCH 7/9] Add spaces --- pandas/core/indexers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index f2f631f82c4f1..0b2560b161df0 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -383,9 +383,9 @@ def check_key_length(columns, key, value): Parameters ---------- - columns: Index The columns of the DataFrame to index. - key: A list-like of keys to index with. - value: The value to set for the keys. + columns : Index The columns of the DataFrame to index. + key : A list-like of keys to index with. + value : The value to set for the keys. """ if columns.is_unique: From 2dad847b6446dfe8d4ed87ae10cdfe29fbb8a27d Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 28 Jan 2021 20:05:38 +0100 Subject: [PATCH 8/9] Add raises section --- pandas/core/indexers.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 0b2560b161df0..1b67b9a7992db 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -385,8 +385,13 @@ def check_key_length(columns, key, value): ---------- columns : Index The columns of the DataFrame to index. key : A list-like of keys to index with. - value : The value to set for the keys. + value : DataFrame The value to set for the keys. + Raises + ------ + ValueError: If the length of key is not equal to the number of columns in value + or if the number of columns referenced by key is not equal to number + of columns. """ if columns.is_unique: if len(value.columns) != len(key): From ee4a849e4302aae919b8a065fc6d34ba6bf2bb81 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 28 Jan 2021 21:13:24 +0100 Subject: [PATCH 9/9] Add types --- pandas/core/indexers.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 1b67b9a7992db..c7011b4339fe7 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -1,6 +1,9 @@ """ Low-dependency indexing utilities. """ +from __future__ import annotations + +from typing import TYPE_CHECKING import warnings import numpy as np @@ -17,6 +20,10 @@ ) from pandas.core.dtypes.generic import ABCIndex, ABCSeries +if TYPE_CHECKING: + from pandas.core.frame import DataFrame + from pandas.core.indexes.base import Index + # ----------------------------------------------------------- # Indexer Identification @@ -376,7 +383,7 @@ def unpack_1tuple(tup): return tup -def check_key_length(columns, key, value): +def check_key_length(columns: Index, key, value: DataFrame): """ Checks if a key used as indexer has the same length as the columns it is associated with.