From 040ceb71d04145f30cf471cb68a482f321b109b3 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 13 Feb 2024 14:15:45 +0000 Subject: [PATCH 1/3] whatsnew --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/indexing.py | 15 +++++++++++++-- pandas/tests/frame/indexing/test_setitem.py | 16 ++++++++++++++++ 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 94e26ff6aa46a..fa60e28ed28a4 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -24,6 +24,7 @@ Fixed regressions - Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) - Fixed regression in :meth:`DataFrame.groupby` raising ``ValueError`` when grouping by a :class:`Series` in some cases (:issue:`57276`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) +- Fixed regression in :meth:`DataFrame.loc` which was unnecessarily throwing "incompatible dtype warning" when expanding with partial row indexer and multiple columns (see `PDEP6 <https://pandas.pydata.org/pdeps/0006-ban-upcasting.html>`_) (:issue:`56503`) - Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) - Fixed regression in :meth:`DataFrame.shift` raising ``AssertionError`` for ``axis=1`` and empty :class:`DataFrame` (:issue:`57301`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index c6759a9f54509..0e8e621bf6dbd 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -854,7 +854,6 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None: if self.ndim != 2: return - orig_key = key if isinstance(key, tuple) 
and len(key) > 1: # key may be a tuple if we are .loc # if length of key is > 1 set key to column part @@ -872,7 +871,7 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None: keys = self.obj.columns.union(key, sort=False) diff = Index(key).difference(self.obj.columns, sort=False) - if len(diff) and com.is_null_slice(orig_key[0]): + if len(diff): # e.g. if we are doing df.loc[:, ["A", "B"]] = 7 and "B" # is a new column, add the new columns with dtype=np.void # so that later when we go through setitem_single_column @@ -2165,6 +2164,18 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: else: # set value into the column (first attempting to operate inplace, then # falling back to casting if necessary) + dtype = self.obj.dtypes.iloc[loc] + if dtype == np.void: + # This means we're expanding, with multiple columns, e.g. + # df = pd.DataFrame({'A': [1,2,3], 'B': [4,5,6]}) + # df.loc[df.index <= 2, ['F', 'G']] = (1, 'abc') + # Columns F and G will initially be set to np.void. + # Here, we replace those temporary `np.void` columns with + # columns of the appropriate dtype, based on `value`. 
+ arr = sanitize_array(value, Index(range(1)), copy=False) + taker = -1 * np.ones(len(self.obj), dtype=np.intp) + empty_value = algos.take_nd(arr, taker) + self.obj.iloc[:, loc] = empty_value self.obj._mgr.column_setitem(loc, plane_indexer, value) def _setitem_single_block(self, indexer, value, name: str) -> None: diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 20e7651f8af83..658fafd3ea2cc 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1369,3 +1369,19 @@ def test_full_setter_loc_incompatible_dtype(): df.loc[:, "a"] = {0: 3, 1: 4} expected = DataFrame({"a": [3, 4]}) tm.assert_frame_equal(df, expected) + + +def test_setitem_partial_row_multiple_columns(): + # https://github.com/pandas-dev/pandas/issues/56503 + df = DataFrame({"A": [1, 2, 3], "B": [4.0, 5, 6]}) + # should not warn + df.loc[df.index <= 1, ["F", "G"]] = (1, "abc") + expected = DataFrame( + { + "A": [1, 2, 3], + "B": [4.0, 5, 6], + "F": [1.0, 1, float("nan")], + "G": ["abc", "abc", float("nan")], + } + ) + tm.assert_frame_equal(df, expected) From eb6fdc86dc06a97aa00b7db3a8c56a1871f17c8f Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 15 Feb 2024 17:43:38 +0000 Subject: [PATCH 2/3] factor out construct_1d_array_from_inferred_fill_value --- pandas/core/dtypes/missing.py | 14 ++++++++++++++ pandas/core/indexing.py | 18 +++++++----------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 17c1ad5e4d8d9..282c431f42256 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -646,6 +646,20 @@ def infer_fill_value(val): return np.nan +def construct_1d_array_from_inferred_fill_value( + value: object, length: int +) -> np.ndarray: + # Find our empty_value dtype by constructing an array + # from our value and doing a .take on it + 
from pandas.core.algorithms import take_nd + from pandas.core.construction import sanitize_array + from pandas.core.indexes.base import Index + + arr = sanitize_array(value, Index(range(1)), copy=False) + taker = -1 * np.ones(length, dtype=np.intp) + return take_nd(arr, taker) + + def maybe_fill(arr: np.ndarray) -> np.ndarray: """ Fill numpy.ndarray with NaN, unless we have a integer or boolean dtype. diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 0e8e621bf6dbd..c7a938dbc4449 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -50,6 +50,7 @@ ABCSeries, ) from pandas.core.dtypes.missing import ( + construct_1d_array_from_inferred_fill_value, infer_fill_value, is_valid_na_for_dtype, isna, @@ -61,7 +62,6 @@ from pandas.core.construction import ( array as pd_array, extract_array, - sanitize_array, ) from pandas.core.indexers import ( check_array_indexer, @@ -1877,12 +1877,9 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc") -> None: self.obj[key] = empty_value elif not is_list_like(value): - # Find our empty_value dtype by constructing an array - # from our value and doing a .take on it - arr = sanitize_array(value, Index(range(1)), copy=False) - taker = -1 * np.ones(len(self.obj), dtype=np.intp) - empty_value = algos.take_nd(arr, taker) - self.obj[key] = empty_value + self.obj[key] = construct_1d_array_from_inferred_fill_value( + value, len(self.obj) + ) else: # FIXME: GH#42099#issuecomment-864326014 self.obj[key] = infer_fill_value(value) @@ -2172,10 +2169,9 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: # Columns F and G will initially be set to np.void. # Here, we replace those temporary `np.void` columns with # columns of the appropriate dtype, based on `value`. 
- arr = sanitize_array(value, Index(range(1)), copy=False) - taker = -1 * np.ones(len(self.obj), dtype=np.intp) - empty_value = algos.take_nd(arr, taker) - self.obj.iloc[:, loc] = empty_value + self.obj.iloc[:, loc] = construct_1d_array_from_inferred_fill_value( + value, len(self.obj) + ) self.obj._mgr.column_setitem(loc, plane_indexer, value) def _setitem_single_block(self, indexer, value, name: str) -> None: From 79835772d9b1a3a8c4964168dc357c649cf2d44d Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 16 Feb 2024 10:28:38 +0000 Subject: [PATCH 3/3] :label: typing --- pandas/core/dtypes/missing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 282c431f42256..9e00eb657f800 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -648,7 +648,7 @@ def infer_fill_value(val): def construct_1d_array_from_inferred_fill_value( value: object, length: int -) -> np.ndarray: +) -> ArrayLike: # Find our empty_value dtype by constructing an array # from our value and doing a .take on it from pandas.core.algorithms import take_nd