From 5469912f74b554a8dc5fd1b6d1fcc1a8c07ef321 Mon Sep 17 00:00:00 2001 From: How Si Wei Date: Fri, 26 Jul 2019 14:42:57 +0800 Subject: [PATCH 1/2] Fix assignment to multiple columns when some column do not exist --- doc/source/whatsnew/v1.0.0.rst | 31 +++++++++++++++ pandas/core/frame.py | 6 +++ pandas/core/indexing.py | 14 +++++++ pandas/tests/frame/test_indexing.py | 59 ++++++++++++++++++++--------- pandas/tests/indexing/test_loc.py | 40 +++++++++++++++++++ 5 files changed, 132 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 0be4ebc627b30..67a7a9387589c 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -40,6 +40,37 @@ Backwards incompatible API changes - :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`). - +.. _whatsnew_1000.api_breaking.multicolumn_assignment: + +Assignment to multiple columns of a DataFrame when some columns do not exist +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Assignment to multiple columns of a :class:`DataFrame` when some of the columns do not exist would previously assign the values to the last column. Now, new columns will be constructed with the right values. (:issue:`13658`) + +.. ipython:: python + + df = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 5]}) + df + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: df[['a', 'c']] = 1 + In [4]: df + Out[4]: + a b + 0 1 1 + 1 1 1 + 2 1 1 + +*New behavior*: + +.. ipython:: python + + df[['a', 'c']] = 1 + df + .. 
_whatsnew_1000.api.other: Other API changes diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 603a615c1f8cb..77120a717ecd1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3007,6 +3007,12 @@ def _setitem_array(self, key, value): for k1, k2 in zip(key, value.columns): self[k1] = value[k2] else: + if all(is_hashable(k) for k in key): + for k in key: + try: + self[k] + except KeyError: + self[k] = np.nan indexer = self.loc._get_listlike_indexer( key, axis=1, raise_missing=False )[1] diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ea00737f776ee..df8d73ebfa9fc 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -12,6 +12,7 @@ from pandas.core.dtypes.common import ( ensure_platform_int, is_float, + is_hashable, is_integer, is_integer_dtype, is_iterator, @@ -197,6 +198,19 @@ def _get_setitem_indexer(self, key): def __setitem__(self, key, value): if isinstance(key, tuple): key = tuple(com.apply_if_callable(x, self.obj) for x in key) + if ( + self.name == "loc" + and len(key) > 1 + and is_list_like_indexer(key[1]) + and not isinstance(key[1], tuple) + and not com.is_bool_indexer(key[1]) + and all(is_hashable(k) for k in key[1]) + ): + for k in key[1]: + try: + self.obj[k] + except KeyError: + self.obj[k] = np.nan else: key = com.apply_if_callable(key, self.obj) indexer = self._get_setitem_indexer(key) diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index ae14563e5952a..7799832e277ca 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -208,6 +208,47 @@ def test_setitem_list_of_tuples(self, float_frame): expected = Series(tuples, index=float_frame.index, name="tuples") assert_series_equal(result, expected) + def test_setitem_list_all_missing_columns_scalar(self, float_frame): + # GH 26534 + result = float_frame.copy() + result[["E", "F"]] = 1 + expected = float_frame.copy() + # force the dtypes to be float as currently multcolumn 
assignment does not + # change column dtype from float to int even when it's being assigned an int + expected["E"] = 1.0 + expected["F"] = 1.0 + assert_frame_equal(result, expected) + + def test_setitem_list_some_missing_columns_list(self, float_frame): + # GH 26534 + result = float_frame.copy() + result[["A", "E"]] = [1, 2] + expected = float_frame.copy() + # force the dtypes to be float as currently multcolumn assignment does not + # change column dtype from float to int even when it's being assigned an int + expected["A"] = 1.0 + expected["E"] = 2.0 + assert_frame_equal(result, expected) + + def test_setitem_list_some_missing_columns_dataframe(self, float_frame): + # GH 26534 + result = float_frame.copy() + result[["A", "E"]] = float_frame[["B", "C"]] + expected = float_frame.copy() + expected["A"] = float_frame["B"] + expected["E"] = float_frame["C"] + assert_frame_equal(result, expected) + + def test_setitem_list_some_missing_columns_2dlist(self): + # GH 26534 + result = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) + result[["B", "C", "D"]] = [[7, 8, 9], [10, 11, 12], [13, 14, 15]] + expected = pd.DataFrame( + [[1, 7, 8, 9], [3, 10, 11, 12], [5, 13, 14, 15]], + columns=["A", "B", "C", "D"], + ) + tm.assert_frame_equal(result, expected) + def test_setitem_mulit_index(self): # GH7655, test that assigning to a sub-frame of a frame # with multi-index columns aligns both rows and columns @@ -501,13 +542,6 @@ def test_setitem(self, float_frame): float_frame["col6"] = series tm.assert_series_equal(series, float_frame["col6"], check_names=False) - msg = ( - r"\"None of \[Float64Index\(\[.*dtype='float64'\)\] are in the" - r" \[columns\]\"" - ) - with pytest.raises(KeyError, match=msg): - float_frame[np.random.randn(len(float_frame) + 1)] = 1 - # set ndarray arr = np.random.randn(len(float_frame)) float_frame["col9"] = arr @@ -1143,17 +1177,6 @@ def test_fancy_index_int_labels_exceptions(self, float_frame): ) with pytest.raises(KeyError, match=msg): 
float_frame.ix[["foo", "bar", "baz"]] = 1 - msg = ( - r"None of \[Index\(\['E'\], dtype='object'\)\] are in the" - r" \[columns\]" - ) - with pytest.raises(KeyError, match=msg): - float_frame.ix[:, ["E"]] = 1 - - # FIXME: don't leave commented-out - # partial setting now allows this GH2578 - # pytest.raises(KeyError, float_frame.ix.__setitem__, - # (slice(None, None), 'E'), 1) def test_setitem_fancy_mixed_2d(self, float_string_frame): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index abe0cd86c90d7..56e1343433401 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -808,6 +808,46 @@ def test_loc_setitem_with_scalar_index(self, indexer, value): assert is_scalar(result) and result == "Z" + def test_loc_setitem_missing_columns_scalar_index_list_value(self): + # GH 26534 + df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) + df.loc[1, ["C", "D"]] = [7, 8] + expected = pd.DataFrame( + [[1, 2, np.nan, np.nan], [3, 4, 7, 8], [5, 6, np.nan, np.nan]], + columns=["A", "B", "C", "D"], + ) + tm.assert_frame_equal(df, expected) + + def test_loc_setitem_missing_columns_full_index_dataframe_value(self): + # GH 26534 + df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) + df2 = pd.DataFrame([[7, 8], [9, 10], [11, 12]], columns=["A", "C"]) + df.loc[:, ["A", "C"]] = df2 + expected = pd.DataFrame( + [[7, 2, 8], [9, 4, 10], [11, 6, 12]], columns=["A", "B", "C"] + ) + tm.assert_frame_equal(df, expected) + + def test_loc_setitem_missing_columns_list_index_scalar_value(self): + # GH 26534 + df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) + df.loc[[0, 2], ["B", "C", "D"]] = 9 + expected = pd.DataFrame( + [[1, 9, 9, 9], [3, 4, np.nan, np.nan], [5, 9, 9, 9]], + columns=["A", "B", "C", "D"], + ) + tm.assert_frame_equal(df, expected) + + def test_loc_setitem_missing_columns_range_index_2dlist_value(self): + # GH 26534 + df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", 
"B"]) + df.loc[1:3, ["B", "C", "D"]] = [[7, 8, 9], [10, 11, 12]] + expected = pd.DataFrame( + [[1, 2, np.nan, np.nan], [3, 7, 8, 9], [5, 10, 11, 12]], + columns=["A", "B", "C", "D"], + ) + tm.assert_frame_equal(df, expected) + def test_loc_coercion(self): # 12411 From 3622744780c951e0f7e562ba543091478791a19c Mon Sep 17 00:00:00 2001 From: How Si Wei Date: Sun, 18 Aug 2019 15:19:33 +0800 Subject: [PATCH 2/2] Parametrize some tests --- pandas/tests/frame/test_indexing.py | 54 +++++++++------------------ pandas/tests/indexing/test_loc.py | 58 +++++++++++------------------ 2 files changed, 39 insertions(+), 73 deletions(-) diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 7799832e277ca..e4ed5f88afd84 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -208,47 +208,29 @@ def test_setitem_list_of_tuples(self, float_frame): expected = Series(tuples, index=float_frame.index, name="tuples") assert_series_equal(result, expected) - def test_setitem_list_all_missing_columns_scalar(self, float_frame): - # GH 26534 - result = float_frame.copy() - result[["E", "F"]] = 1 - expected = float_frame.copy() - # force the dtypes to be float as currently multcolumn assignment does not - # change column dtype from float to int even when it's being assigned an int - expected["E"] = 1.0 - expected["F"] = 1.0 - assert_frame_equal(result, expected) - - def test_setitem_list_some_missing_columns_list(self, float_frame): - # GH 26534 - result = float_frame.copy() - result[["A", "E"]] = [1, 2] - expected = float_frame.copy() - # force the dtypes to be float as currently multcolumn assignment does not - # change column dtype from float to int even when it's being assigned an int - expected["A"] = 1.0 - expected["E"] = 2.0 - assert_frame_equal(result, expected) - - def test_setitem_list_some_missing_columns_dataframe(self, float_frame): + @pytest.mark.parametrize("columns", [["A", "E"], ["E", "F"]]) + 
@pytest.mark.parametrize( + "box", + [ + lambda x: 1, + lambda x: [1, 2], + lambda x: np.array([1, 2]), + lambda x: x[["B", "C"]], + lambda x: x[["B", "A"]].values, + lambda x: x[["A", "C"]].values.tolist(), + ], + ) + def test_setitem_list_missing_columns(self, float_frame, columns, box): # GH 26534 result = float_frame.copy() - result[["A", "E"]] = float_frame[["B", "C"]] + result[columns] = box(float_frame) expected = float_frame.copy() - expected["A"] = float_frame["B"] - expected["E"] = float_frame["C"] + for col in columns: + if col not in expected.columns: + expected[col] = np.nan + expected[columns] = box(float_frame) assert_frame_equal(result, expected) - def test_setitem_list_some_missing_columns_2dlist(self): - # GH 26534 - result = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) - result[["B", "C", "D"]] = [[7, 8, 9], [10, 11, 12], [13, 14, 15]] - expected = pd.DataFrame( - [[1, 7, 8, 9], [3, 10, 11, 12], [5, 13, 14, 15]], - columns=["A", "B", "C", "D"], - ) - tm.assert_frame_equal(result, expected) - def test_setitem_mulit_index(self): # GH7655, test that assigning to a sub-frame of a frame # with multi-index columns aligns both rows and columns diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 56e1343433401..ca559e0ca59c7 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -808,45 +808,29 @@ def test_loc_setitem_with_scalar_index(self, indexer, value): assert is_scalar(result) and result == "Z" - def test_loc_setitem_missing_columns_scalar_index_list_value(self): - # GH 26534 - df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) - df.loc[1, ["C", "D"]] = [7, 8] - expected = pd.DataFrame( - [[1, 2, np.nan, np.nan], [3, 4, 7, 8], [5, 6, np.nan, np.nan]], - columns=["A", "B", "C", "D"], - ) - tm.assert_frame_equal(df, expected) - - def test_loc_setitem_missing_columns_full_index_dataframe_value(self): - # GH 26534 - df = pd.DataFrame([[1, 2], [3, 4], [5, 
6]], columns=["A", "B"]) - df2 = pd.DataFrame([[7, 8], [9, 10], [11, 12]], columns=["A", "C"]) - df.loc[:, ["A", "C"]] = df2 - expected = pd.DataFrame( - [[7, 2, 8], [9, 4, 10], [11, 6, 12]], columns=["A", "B", "C"] - ) - tm.assert_frame_equal(df, expected) - - def test_loc_setitem_missing_columns_list_index_scalar_value(self): - # GH 26534 - df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) - df.loc[[0, 2], ["B", "C", "D"]] = 9 - expected = pd.DataFrame( - [[1, 9, 9, 9], [3, 4, np.nan, np.nan], [5, 9, 9, 9]], - columns=["A", "B", "C", "D"], - ) - tm.assert_frame_equal(df, expected) - - def test_loc_setitem_missing_columns_range_index_2dlist_value(self): + @pytest.mark.parametrize( + "index,box", + [ + ((1, ["C", "D"]), [7, 8]), + ( + (slice(None, None, None), ["A", "C"]), + pd.DataFrame([[7, 8], [9, 10], [11, 12]], columns=["A", "C"]), + ), + (([0, 2], ["B", "C", "D"]), 9), + ((slice(1, 3, None), ["B", "C", "D"]), [[7, 8, 9], [10, 11, 12]]), + ], + ) + def test_loc_setitem_missing_columns(self, index, box): # GH 26534 df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) - df.loc[1:3, ["B", "C", "D"]] = [[7, 8, 9], [10, 11, 12]] - expected = pd.DataFrame( - [[1, 2, np.nan, np.nan], [3, 7, 8, 9], [5, 10, 11, 12]], - columns=["A", "B", "C", "D"], - ) - tm.assert_frame_equal(df, expected) + result = df.copy() + result.loc[index] = box + expected = df.copy() + for col in index[1]: + if col not in expected.columns: + expected[col] = np.nan + expected.loc[index] = box + tm.assert_frame_equal(result, expected) def test_loc_coercion(self):