diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 8732e1c397ce5..85f0f16c44b89 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -543,6 +543,7 @@ Indexing - Bug when setting string-backed :class:`Categorical` values that can be parsed to datetimes into a :class:`DatetimeArray` or :class:`Series` or :class:`DataFrame` column backed by :class:`DatetimeArray` failing to parse these strings (:issue:`44236`) - Bug in :meth:`Series.__setitem__` with an integer dtype other than ``int64`` setting with a ``range`` object unnecessarily upcasting to ``int64`` (:issue:`44261`) - Bug in :meth:`Series.__setitem__` with a boolean mask indexer setting a listlike value of length 1 incorrectly broadcasting that value (:issue:`44265`) +- Bug in :meth:`DataFrame.loc.__setitem__` and :meth:`DataFrame.iloc.__setitem__` with mixed dtypes sometimes failing to operate in-place (:issue:`44345`) - Missing diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 669274e034905..4aa9d251b04c7 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1859,10 +1859,19 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): # in case of slice ser = value[pi] else: - # set the item, possibly having a dtype change - ser = ser.copy() - ser._mgr = ser._mgr.setitem(indexer=(pi,), value=value) - ser._maybe_update_cacher(clear=True, inplace=True) + # set the item, first attempting to operate inplace, then + # falling back to casting if necessary; see + # _whatsnew_130.notable_bug_fixes.setitem_column_try_inplace + + orig_values = ser._values + ser._mgr = ser._mgr.setitem((pi,), value) + + if ser._values is orig_values: + # The setitem happened inplace, so the DataFrame's values + # were modified inplace. + return + self.obj._iset_item(loc, ser, inplace=True) + return # reset the sliced object if unique self.obj._iset_item(loc, ser, inplace=True) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index f6c93e6f751c8..9803a2e4e3309 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -600,6 +600,8 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: # Cast from unsupported types to supported types is_nullable_int = isinstance(data[col].dtype, (_IntegerDtype, BooleanDtype)) orig = data[col] + # We need to find orig_missing before altering data below + orig_missing = orig.isna() if is_nullable_int: missing_loc = data[col].isna() if missing_loc.any(): @@ -650,11 +652,10 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: f"supported by Stata ({float64_max})" ) if is_nullable_int: - missing = orig.isna() - if missing.any(): + if orig_missing.any(): # Replace missing by Stata sentinel value sentinel = StataMissingValue.BASE_MISSING_VALUES[data[col].dtype.name] - data.loc[missing, col] = sentinel + data.loc[orig_missing, col] = sentinel if ws: warnings.warn(ws, PossiblePrecisionLoss) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index d735f0dbec8a5..389bf56ab6035 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -384,7 +384,7 @@ def test_setitem_frame_length_0_str_key(self, indexer): expected["A"] = expected["A"].astype("object") tm.assert_frame_equal(df, expected) - def test_setitem_frame_duplicate_columns(self, using_array_manager): + def test_setitem_frame_duplicate_columns(self, using_array_manager, request): # GH#15695 cols = ["A", "B", "C"] * 2 df = DataFrame(index=range(3), columns=cols) @@ -407,6 +407,11 @@ def test_setitem_frame_duplicate_columns(self, using_array_manager): expected["C"] = expected["C"].astype("int64") # TODO(ArrayManager) .loc still overwrites expected["B"] = expected["B"].astype("int64") + + mark = pytest.mark.xfail( + reason="Both 'A' columns get set with 3 instead of 0 and 3" + ) + request.node.add_marker(mark) else: # set these with unique columns to be extra-unambiguous expected[2] = expected[2].astype(np.int64) @@ -995,22 +1000,37 @@ def test_setitem_always_copy(self, float_frame): float_frame["E"][5:10] = np.nan assert notna(s[5:10]).all() - def test_setitem_clear_caches(self): - # see GH#304 + @pytest.mark.parametrize("consolidate", [True, False]) + def test_setitem_partial_column_inplace(self, consolidate, using_array_manager): + # This setting should be in-place, regardless of whether frame is + # single-block or multi-block + # GH#304 this used to be incorrectly not-inplace, in which case + # we needed to ensure _item_cache was cleared. + df = DataFrame( {"x": [1.1, 2.1, 3.1, 4.1], "y": [5.1, 6.1, 7.1, 8.1]}, index=[0, 1, 2, 3] ) df.insert(2, "z", np.nan) + if not using_array_manager: + if consolidate: + df._consolidate_inplace() + assert len(df._mgr.blocks) == 1 + else: + assert len(df._mgr.blocks) == 2 - # cache it - foo = df["z"] - df.loc[df.index[2:], "z"] = 42 + zvals = df["z"]._values - expected = Series([np.nan, np.nan, 42, 42], index=df.index, name="z") + df.loc[2:, "z"] = 42 - assert df["z"] is not foo + expected = Series([np.nan, np.nan, 42, 42], index=df.index, name="z") tm.assert_series_equal(df["z"], expected) + # check setting occurred in-place + tm.assert_numpy_array_equal(zvals, expected.values) + assert np.shares_memory(zvals, df["z"]._values) + if not consolidate: + assert df["z"]._values is zvals + def test_setitem_duplicate_columns_not_inplace(self): # GH#39510 cols = ["A", "B"] * 2 diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index d2704876c31c5..c6938abb57d64 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -366,12 +366,7 @@ def test_xs_droplevel_false_view(self, using_array_manager): assert np.shares_memory(result.iloc[:, 0]._values, df.iloc[:, 0]._values) # modifying original df also modifies result when having a single block df.iloc[0, 0] = 2 - if not using_array_manager: - expected = DataFrame({"a": [2]}) - else: - # TODO(ArrayManager) iloc does not update the array inplace using - # "split" path - expected = DataFrame({"a": [1]}) + expected = DataFrame({"a": [2]}) tm.assert_frame_equal(result, expected) # with mixed dataframe, modifying the parent doesn't modify result @@ -379,7 +374,13 @@ def test_xs_droplevel_false_view(self, using_array_manager): df = DataFrame([[1, 2.5, "a"]], columns=Index(["a", "b", "c"])) result = df.xs("a", axis=1, drop_level=False) df.iloc[0, 0] = 2 - expected = DataFrame({"a": [1]}) + if using_array_manager: + # Here the behavior is consistent + expected = DataFrame({"a": [2]}) + else: + # FIXME: iloc does not update the array inplace using + # "split" path + expected = DataFrame({"a": [1]}) tm.assert_frame_equal(result, expected) def test_xs_list_indexer_droplevel_false(self): diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 919d8ab14778e..fc2c138538ac9 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -789,6 +789,10 @@ def test_std_timedelta64_skipna_false(self): # GH#37392 tdi = pd.timedelta_range("1 Day", periods=10) df = DataFrame({"A": tdi, "B": tdi}) + # Copy is needed for ArrayManager case, otherwise setting df.iloc + # below edits tdi, alterting both df['A'] and df['B'] + # FIXME: passing copy=True to constructor does not fix this + df = df.copy() df.iloc[-2, -1] = pd.NaT result = df.std(skipna=False) @@ -1017,7 +1021,9 @@ def test_idxmax_mixed_dtype(self): # don't cast to object, which would raise in nanops dti = date_range("2016-01-01", periods=3) - df = DataFrame({1: [0, 2, 1], 2: range(3)[::-1], 3: dti}) + # Copying dti is needed for ArrayManager otherwise when we set + # df.loc[0, 3] = pd.NaT below it edits dti + df = DataFrame({1: [0, 2, 1], 2: range(3)[::-1], 3: dti.copy(deep=True)}) result = df.idxmax() expected = Series([1, 0, 2], index=[1, 2, 3]) @@ -1074,6 +1080,10 @@ def test_idxmax_idxmin_convert_dtypes(self, op, expected_value): def test_idxmax_dt64_multicolumn_axis1(self): dti = date_range("2016-01-01", periods=3) df = DataFrame({3: dti, 4: dti[::-1]}) + # FIXME: copy needed for ArrayManager, otherwise setting with iloc + # below also sets df.iloc[-1, 1]; passing copy=True to DataFrame + # does not solve this. + df = df.copy() df.iloc[0, 0] = pd.NaT df._consolidate_inplace()