From 25ad42223005c397849febbcfa5a915fc5b2d7b9 Mon Sep 17 00:00:00 2001 From: How Si Wei Date: Thu, 20 Feb 2020 00:48:39 +0800 Subject: [PATCH 1/6] Add tests for setting missing columns --- pandas/tests/frame/indexing/test_indexing.py | 64 +++++++++++++++++--- pandas/tests/indexing/test_loc.py | 58 ++++++++++++++++++ 2 files changed, 115 insertions(+), 7 deletions(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 997414eceeb86..d66be2eacdc8b 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -215,6 +215,63 @@ def test_setitem_list_of_tuples(self, float_frame): expected = Series(tuples, index=float_frame.index, name="tuples") tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "columns,box,expected", + [ + ( + ["A", "B", "C", "D"], + 7, + pd.DataFrame( + [[7, 7, 7, 7], [7, 7, 7, 7], [7, 7, 7, 7]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["C", "D"], + [7, 8], + pd.DataFrame( + [[1, 2, 7, 8], [3, 4, 7, 8], [5, 6, 7, 8]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["A", "B", "C"], + np.array([7, 8, 9], dtype=np.int64), + pd.DataFrame( + [[7, 8, 9], [7, 8, 9], [7, 8, 9]], columns=["A", "B", "C"] + ), + ), + ( + ["B", "C", "D"], + [[7, 8, 9], [10, 11, 12], [13, 14, 15]], + pd.DataFrame( + [[1, 7, 8, 9], [3, 10, 11, 12], [5, 13, 14, 15]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["C", "A", "D"], + np.array([[7, 8, 9], [10, 11, 12], [13, 14, 15]], dtype=np.int64), + pd.DataFrame( + [[8, 2, 7, 9], [11, 4, 10, 12], [14, 6, 13, 15]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["A", "C"], + pd.DataFrame([[7, 8], [9, 10], [11, 12]], columns=["A", "C"]), + pd.DataFrame( + [[7, 2, 8], [9, 4, 10], [11, 6, 12]], columns=["A", "B", "C"] + ), + ), + ], + ) + def test_setitem_list_missing_columns(self, columns, box, expected): + # GH 29334 + df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) + df[columns] = 
box + tm.assert_frame_equal(df, expected) + def test_setitem_multi_index(self): # GH7655, test that assigning to a sub-frame of a frame # with multi-index columns aligns both rows and columns @@ -459,13 +516,6 @@ def test_setitem(self, float_frame): float_frame["col6"] = series tm.assert_series_equal(series, float_frame["col6"], check_names=False) - msg = ( - r"\"None of \[Float64Index\(\[.*dtype='float64'\)\] are in the " - r"\[columns\]\"" - ) - with pytest.raises(KeyError, match=msg): - float_frame[np.random.randn(len(float_frame) + 1)] = 1 - # set ndarray arr = np.random.randn(len(float_frame)) float_frame["col9"] = arr diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 4d042af8d59b4..010ef89393b85 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -631,6 +631,64 @@ def test_loc_setitem_with_scalar_index(self, indexer, value): assert is_scalar(result) and result == "Z" + @pytest.mark.parametrize( + "index,box,expected", + [ + ( + ([0, 2], ["A", "B", "C", "D"]), + 7, + pd.DataFrame( + [[7, 7, 7, 7], [3, 4, np.nan, np.nan], [7, 7, 7, 7]], + columns=["A", "B", "C", "D"], + ), + ), + ( + (1, ["C", "D"]), + [7, 8], + pd.DataFrame( + [[1, 2, np.nan, np.nan], [3, 4, 7, 8], [5, 6, np.nan, np.nan]], + columns=["A", "B", "C", "D"], + ), + ), + ( + (1, ["A", "B", "C"]), + np.array([7, 8, 9], dtype=np.int64), + pd.DataFrame( + [[1, 2, np.nan], [7, 8, 9], [5, 6, np.nan]], + columns=["A", "B", "C"], + ), + ), + ( + (slice(1, 3, None), ["B", "C", "D"]), + [[7, 8, 9], [10, 11, 12]], + pd.DataFrame( + [[1, 2, np.nan, np.nan], [3, 7, 8, 9], [5, 10, 11, 12]], + columns=["A", "B", "C", "D"], + ), + ), + ( + (slice(1, 3, None), ["C", "A", "D"]), + np.array([[7, 8, 9], [10, 11, 12]], dtype=np.int64), + pd.DataFrame( + [[1, 2, np.nan, np.nan], [8, 4, 7, 9], [11, 6, 10, 12]], + columns=["A", "B", "C", "D"], + ), + ), + ( + (slice(None, None, None), ["A", "C"]), + pd.DataFrame([[7, 8], [9, 10], [11, 12]], 
columns=["A", "C"]), + pd.DataFrame( + [[7, 2, 8], [9, 4, 10], [11, 6, 12]], columns=["A", "B", "C"] + ), + ), + ], + ) + def test_loc_setitem_missing_columns(self, index, box, expected): + # GH 29334 + df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) + df.loc[index] = box + tm.assert_frame_equal(df, expected) + def test_loc_coercion(self): # 12411 From b8d3c48377a4ead92531715f36b658c134af9340 Mon Sep 17 00:00:00 2001 From: How Si Wei Date: Thu, 20 Feb 2020 00:49:29 +0800 Subject: [PATCH 2/6] Fix assignment to missing columns --- pandas/core/frame.py | 1 + pandas/core/indexing.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f304fadbab871..e9ddaaf34e777 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2685,6 +2685,7 @@ def _setitem_array(self, key, value): for k1, k2 in zip(key, value.columns): self[k1] = value[k2] else: + self.loc._ensure_listlike_indexer(key) indexer = self.loc._get_listlike_indexer( key, axis=1, raise_missing=False )[1] diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 3ab180bafd156..87ab491653356 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -8,6 +8,7 @@ from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( + is_hashable, is_integer, is_iterator, is_list_like, @@ -581,6 +582,9 @@ def _get_setitem_indexer(self, key): """ Convert a potentially-label-based key into a positional indexer. """ + if self.name == "loc": + self._ensure_listlike_indexer(key) + if self.axis is not None: return self._convert_tuple(key, is_setter=True) @@ -611,6 +615,39 @@ def _get_setitem_indexer(self, key): raise raise IndexingError(key) from e + def _ensure_listlike_indexer(self, key): + """ + Ensure that a list-like of column labels are all present by adding them if + they do not already exist. 
+ + Parameters + ---------- + key : _LocIndexer key or list-like of column labels + Target labels. + """ + column_axis = 1 + + # column only exists in 2-dimensional DataFrame + if self.ndim != 2: + return + + if isinstance(key, tuple): + # key may be a tuple if key is a _LocIndexer key + # in that case, set key to the column part of key + key = key[column_axis] + + if ( + not isinstance(self.obj._get_axis(column_axis), ABCMultiIndex) + and is_list_like_indexer(key) + and not com.is_bool_indexer(key) + and all(is_hashable(k) for k in key) + ): + for k in key: + try: + self.obj[k] + except KeyError: + self.obj[k] = np.nan + def __setitem__(self, key, value): if isinstance(key, tuple): key = tuple(com.apply_if_callable(x, self.obj) for x in key) From 660d0f28bf1206bd3bea71cfbf950a96d2d5ccc9 Mon Sep 17 00:00:00 2001 From: How Si Wei Date: Thu, 20 Feb 2020 00:53:01 +0800 Subject: [PATCH 3/6] Add whatsnew --- doc/source/whatsnew/v1.1.0.rst | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 0f18a1fd81815..225b1081e182b 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -259,6 +259,35 @@ Indexing - Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` when indexing with an integer key on a object-dtype :class:`Index` that is not all-integers (:issue:`31905`) - +Assignment to multiple columns of a DataFrame when some columns do not exist +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Assignment to multiple columns of a :class:`DataFrame` when some of the columns do not exist would previously assign the values to the last column. Now, new columns would be constructed with the right values. (:issue:`13658`) + +.. ipython:: python + + df = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 5]}) + df + +*Previous behavior*: + +.. 
code-block:: ipython + + In [3]: df[['a', 'c']] = 1 + In [4]: df + Out[4]: + a b + 0 1 1 + 1 1 1 + 2 1 1 + +*New behavior*: + +.. ipython:: python + + df[['a', 'c']] = 1 + df + Missing ^^^^^^^ From d7184e72c9a6fcc3c565ce78dbc3e3030119a463 Mon Sep 17 00:00:00 2001 From: How Si Wei Date: Thu, 20 Feb 2020 00:54:45 +0800 Subject: [PATCH 4/6] Pass axis to _ensure_listlike_indexer --- pandas/core/frame.py | 2 +- pandas/core/indexing.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e9ddaaf34e777..65dfe61b93f2a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2685,7 +2685,7 @@ def _setitem_array(self, key, value): for k1, k2 in zip(key, value.columns): self[k1] = value[k2] else: - self.loc._ensure_listlike_indexer(key) + self.loc._ensure_listlike_indexer(key, axis=1) indexer = self.loc._get_listlike_indexer( key, axis=1, raise_missing=False )[1] diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 87ab491653356..8bafc69eb6be7 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -615,7 +615,7 @@ def _get_setitem_indexer(self, key): raise raise IndexingError(key) from e - def _ensure_listlike_indexer(self, key): + def _ensure_listlike_indexer(self, key, axis=None): """ Ensure that a list-like of column labels are all present by adding them if they do not already exist. @@ -624,6 +624,7 @@ def _ensure_listlike_indexer(self, key): ---------- key : _LocIndexer key or list-like of column labels Target labels. 
+ axis : key axis if known """ column_axis = 1 @@ -635,9 +636,11 @@ def _ensure_listlike_indexer(self, key): # key may be a tuple if key is a _LocIndexer key # in that case, set key to the column part of key key = key[column_axis] + axis = column_axis if ( - not isinstance(self.obj._get_axis(column_axis), ABCMultiIndex) + axis == column_axis + and not isinstance(self.obj._get_axis(column_axis), ABCMultiIndex) and is_list_like_indexer(key) and not com.is_bool_indexer(key) and all(is_hashable(k) for k in key) From a509eb57989c01a470c2308477095b9bff857c58 Mon Sep 17 00:00:00 2001 From: How Si Wei Date: Sat, 7 Mar 2020 03:20:40 +0800 Subject: [PATCH 5/6] Use DataFrame.columns --- pandas/core/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 8bafc69eb6be7..2e727ab55c706 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -640,7 +640,7 @@ def _ensure_listlike_indexer(self, key, axis=None): if ( axis == column_axis - and not isinstance(self.obj._get_axis(column_axis), ABCMultiIndex) + and not isinstance(self.obj.columns, ABCMultiIndex) and is_list_like_indexer(key) and not com.is_bool_indexer(key) and all(is_hashable(k) for k in key) From 26ba2a998f8d593f5dbc7d0b5a37905ec1e8f3f1 Mon Sep 17 00:00:00 2001 From: How Si Wei Date: Sun, 15 Mar 2020 03:40:09 +0800 Subject: [PATCH 6/6] Update documentation --- doc/source/whatsnew/v1.1.0.rst | 60 ++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 4f8c67374bf3e..caf37a77f8216 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -168,6 +168,37 @@ key and type of :class:`Index`. These now consistently raise ``KeyError`` (:iss .. --------------------------------------------------------------------------- +.. 
_whatsnew_110.api_breaking.assignment_to_multiple_columns: + +Assignment to multiple columns of a DataFrame when some columns do not exist +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Assignment to multiple columns of a :class:`DataFrame` when some of the columns do not exist would previously assign the values to the last column. Now, new columns will be constructed with the right values. (:issue:`13658`) + +.. ipython:: python + + df = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 5]}) + df + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: df[['a', 'c']] = 1 + In [4]: df + Out[4]: + a b + 0 1 1 + 1 1 1 + 2 1 1 + +*New behavior*: + +.. ipython:: python + + df[['a', 'c']] = 1 + df + .. _whatsnew_110.deprecations: Deprecations @@ -267,35 +298,6 @@ Indexing - Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` when indexing with an integer key on a object-dtype :class:`Index` that is not all-integers (:issue:`31905`) - Bug in :meth:`DataFrame.iloc.__setitem__` on a :class:`DataFrame` with duplicate columns incorrectly setting values for all matching columns (:issue:`15686`, :issue:`22036`) -Assignment to multiple columns of a DataFrame when some columns do not exist -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Assignment to multiple columns of a :class:`DataFrame` when some of the columns do not exist would previously assign the values to the last column. Now, new columns would be constructed with the right values. (:issue:`13658`) - -.. ipython:: python - - df = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 5]}) - df - -*Previous behavior*: - -.. code-block:: ipython - - In [3]: df[['a', 'c']] = 1 - In [4]: df - Out[4]: - a b - 0 1 1 - 1 1 1 - 2 1 1 - -*New behavior*: - -.. ipython:: python - - df[['a', 'c']] = 1 - df - Missing ^^^^^^^