diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index f7204ceb9d412..68130baff1847 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -181,6 +181,46 @@ Preserve dtypes in :meth:`~pandas.DataFrame.combine_first` combined.dtypes +Try operating inplace when setting values with ``loc`` and ``iloc`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When setting an entire column using ``loc`` or ``iloc``, pandas will try to +insert the values into the existing data rather than create an entirely new array. + +.. ipython:: python + + df = pd.DataFrame(range(3), columns=["A"], dtype="float64") + values = df.values + new = np.array([5, 6, 7], dtype="int64") + df.loc[[0, 1, 2], "A"] = new + +In both the new and old behavior, the data in ``values`` is overwritten, but in +the old behavior the dtype of ``df["A"]`` changed to ``int64``. + +*pandas 1.2.x* + +.. code-block:: ipython + + In [1]: df.dtypes + Out[1]: + A int64 + dtype: object + In [2]: np.shares_memory(df["A"].values, new) + Out[2]: False + In [3]: np.shares_memory(df["A"].values, values) + Out[3]: False + +In pandas 1.3.0, ``df`` continues to share data with ``values``. + +*pandas 1.3.0* + +.. ipython:: python + + df.dtypes + np.shares_memory(df["A"], new) + np.shares_memory(df["A"], values) + + .. 
_whatsnew_130.notable_bug_fixes.setitem_with_bool_casting: Consistent Casting With Setting Into Boolean Series diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index bded503a1e6db..31d2baf6c64f1 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1866,7 +1866,6 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): ser = value elif is_array_like(value) and is_exact_shape_match(ser, value): ser = value - else: # set the item, possibly having a dtype change ser = ser.copy() diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f2b8499a316b7..63fae32acf3ff 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -92,6 +92,8 @@ Categorical, DatetimeArray, ExtensionArray, + FloatingArray, + IntegerArray, PandasArray, ) from pandas.core.base import PandasObject @@ -994,6 +996,7 @@ def setitem(self, indexer, value): # length checking check_setitem_lengths(indexer, value, values) exact_match = is_exact_shape_match(values, arr_value) + if is_empty_indexer(indexer, arr_value): # GH#8669 empty indexers pass @@ -1007,18 +1010,14 @@ def setitem(self, indexer, value): # GH25495 - If the current dtype is not categorical, # we need to create a new categorical block values[indexer] = value - if values.ndim == 2: - # TODO(EA2D): special case not needed with 2D EAs - if values.shape[-1] != 1: - # shouldn't get here (at least until 2D EAs) - raise NotImplementedError - values = values[:, 0] - return self.make_block(Categorical(values, dtype=arr_value.dtype)) elif exact_match and is_ea_value: # GH#32395 if we're going to replace the values entirely, just # substitute in the new array - return self.make_block(arr_value) + if not self.is_object and isinstance(value, (IntegerArray, FloatingArray)): + values[indexer] = value.to_numpy(value.dtype.numpy_dtype) + else: + values[indexer] = np.asarray(value) # if we are an exact match (ex-broadcasting), # then use the resultant dtype @@ 
-1026,8 +1025,6 @@ def setitem(self, indexer, value): # We are setting _all_ of the array's values, so can cast to new dtype values[indexer] = value - values = values.astype(arr_value.dtype, copy=False) - elif is_ea_value: # GH#38952 if values.ndim == 1: @@ -1892,6 +1889,10 @@ class NumericBlock(Block): is_numeric = True def _can_hold_element(self, element: Any) -> bool: + element = extract_array(element, extract_numpy=True) + if isinstance(element, (IntegerArray, FloatingArray)): + if element._mask.any(): + return False return can_hold_element(self.dtype, element) @property diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 16b9b8e8efdea..34c9c097fbfd5 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -339,13 +339,20 @@ def test_setitem_with_expansion_dataframe_column(self, data, full_indexer): key = full_indexer(df) result.loc[key, "data"] = df["data"] + self.assert_frame_equal(result, expected) def test_setitem_series(self, data, full_indexer): # https://github.com/pandas-dev/pandas/issues/32395 - ser = expected = pd.Series(data, name="data") + ser = pd.Series(data, name="data") result = pd.Series(index=ser.index, dtype=object, name="data") + # because result has object dtype, the attempt to do setting inplace + # is successful, and object dtype is retained key = full_indexer(ser) result.loc[key] = ser + + expected = pd.Series( + data.astype(object), index=ser.index, name="data", dtype=object + ) self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 98e173ee23f01..e8995bc654428 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -425,6 +425,23 @@ def test_setitem_slice(self, data, box_in_series): def test_setitem_loc_iloc_slice(self, data): super().test_setitem_loc_iloc_slice(data) + def test_setitem_with_expansion_dataframe_column(self, 
data, full_indexer): + # https://github.com/pandas-dev/pandas/issues/32395 + df = expected = pd.DataFrame({"data": pd.Series(data)}) + result = pd.DataFrame(index=df.index) + + # because result has object dtype, the attempt to do setting inplace + # is successful, and object dtype is retained + key = full_indexer(df) + result.loc[key, "data"] = df["data"] + + # base class method has expected = df; PandasArray behaves oddly because + # we patch _typ for these tests. + if data.dtype.numpy_dtype != object: + if not isinstance(key, slice) or key != slice(None): + expected = pd.DataFrame({"data": data.to_numpy()}) + self.assert_frame_equal(result, expected) + @skip_nested class TestParsing(BaseNumPyTests, base.BaseParsingTests): diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index d0fdf81121c71..ad5f54174952d 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -67,10 +67,6 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key): frame = DataFrame({0: range(3)}, dtype=object) cat = Categorical(["alpha", "beta", "gamma"]) - expected = DataFrame({0: cat}) - # NB: pending GH#38896, the expected likely should become - # expected= DataFrame({"A": cat.astype(object)}) - # and should remain a view on the original values assert frame._mgr.blocks[0]._can_hold_element(cat) @@ -78,22 +74,24 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key): orig_vals = df.values indexer(df)[key, 0] = cat - overwrite = not isinstance(key, slice) + overwrite = isinstance(key, slice) and key == slice(None) - tm.assert_frame_equal(df, expected) - - # TODO: this inconsistency is likely undesired GH#39986 if overwrite: - # check that we overwrote underlying - tm.assert_numpy_array_equal(orig_vals, df.values) + # TODO: GH#39986 this probably shouldn't behave differently + expected = DataFrame({0: cat}) + assert not np.shares_memory(df.values, orig_vals) + else: + expected = DataFrame({0: 
cat}).astype(object) + assert np.shares_memory(df.values, orig_vals) - # but we don't have a view on orig_vals - orig_vals[0, 0] = 19 - assert df.iloc[0, 0] != 19 + tm.assert_frame_equal(df, expected) # check we dont have a view on cat (may be undesired GH#39986) df.iloc[0, 0] = "gamma" - assert cat[0] != "gamma" + if overwrite: + assert cat[0] != "gamma" + else: + assert cat[0] != "gamma" @pytest.mark.parametrize("box", [pd_array, Series]) def test_iloc_setitem_ea_inplace(self, frame_or_series, box): diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index f55a0ae2c199b..7f0fed71ca5f1 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -620,6 +620,7 @@ def test_float_index_non_scalar_assignment(self): expected = DataFrame({"a": [1, 1, 3], "b": [1, 1, 5]}, index=df.index) tm.assert_frame_equal(expected, df) + def test_loc_setitem_fullindex_views(self): df = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}, index=[1.0, 2.0, 3.0]) df2 = df.copy() df.loc[df.index] = df.loc[df.index] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 5b6c042a11332..9dbce283d2a8f 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -588,32 +588,19 @@ def test_loc_modify_datetime(self): tm.assert_frame_equal(df, expected) - def test_loc_setitem_frame(self): - df = DataFrame(np.random.randn(4, 4), index=list("abcd"), columns=list("ABCD")) - - df.loc["a", "A"] = 1 - result = df.loc["a", "A"] - assert result == 1 - - result = df.iloc[0, 0] - assert result == 1 - - df.loc[:, "B":"D"] = 0 - expected = df.loc[:, "B":"D"] - result = df.iloc[:, 1:] - tm.assert_frame_equal(result, expected) - - # GH 6254 - # setting issue - df = DataFrame(index=[3, 5, 4], columns=["A"]) + def test_loc_setitem_frame_with_reindex(self): + # GH#6254 setting issue + df = DataFrame(index=[3, 5, 4], columns=["A"], dtype=float) df.loc[[4, 3, 5], "A"] = 
np.array([1, 2, 3], dtype="int64") - expected = DataFrame({"A": Series([1, 2, 3], index=[4, 3, 5])}).reindex( - index=[3, 5, 4] - ) + + # setting integer values into a float dataframe with loc is inplace, + # so we retain float dtype + ser = Series([2, 3, 1], index=[3, 5, 4], dtype=float) + expected = DataFrame({"A": ser}) tm.assert_frame_equal(df, expected) - # GH 6252 - # setting with an empty frame + def test_loc_setitem_empty_frame(self): + # GH#6252 setting with an empty frame keys1 = ["@" + str(i) for i in range(5)] val1 = np.arange(5, dtype="int64") @@ -628,11 +615,31 @@ def test_loc_setitem_frame(self): df["B"] = np.nan df.loc[keys2, "B"] = val2 - expected = DataFrame( - {"A": Series(val1, index=keys1), "B": Series(val2, index=keys2)} - ).reindex(index=index) + # Because df["A"] was initialized as float64, setting values into it + # is inplace, so that dtype is retained + sera = Series(val1, index=keys1, dtype=np.float64) + serb = Series(val2, index=keys2) + expected = DataFrame({"A": sera, "B": serb}).reindex(index=index) tm.assert_frame_equal(df, expected) + def test_loc_setitem_frame(self): + df = DataFrame(np.random.randn(4, 4), index=list("abcd"), columns=list("ABCD")) + + result = df.iloc[0, 0] + + df.loc["a", "A"] = 1 + result = df.loc["a", "A"] + assert result == 1 + + result = df.iloc[0, 0] + assert result == 1 + + df.loc[:, "B":"D"] = 0 + expected = df.loc[:, "B":"D"] + result = df.iloc[:, 1:] + tm.assert_frame_equal(result, expected) + + def test_loc_setitem_frame_nan_int_coercion_invalid(self): # GH 8669 # invalid coercion of nan -> int df = DataFrame({"A": [1, 2, 3], "B": np.nan}) @@ -640,6 +647,7 @@ def test_loc_setitem_frame(self): expected = DataFrame({"A": [1, 2, 3], "B": np.nan}) tm.assert_frame_equal(df, expected) + def test_loc_setitem_frame_mixed_labels(self): # GH 6546 # setting with mixed labels df = DataFrame({1: [1, 2], 2: [3, 4], "a": ["a", "b"]}) @@ -1063,8 +1071,15 @@ def test_loc_setitem_str_to_small_float_conversion_type(self): 
expected = DataFrame(col_data, columns=["A"], dtype=object) tm.assert_frame_equal(result, expected) - # change the dtype of the elements from object to float one by one + # assigning with loc/iloc attempts to set the values inplace, which + # in this case is successful result.loc[result.index, "A"] = [float(x) for x in col_data] + expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object) + tm.assert_frame_equal(result, expected) + + # assigning the entire column using __setitem__ swaps in the new array + # GH#??? + result["A"] = [float(x) for x in col_data] expected = DataFrame(col_data, columns=["A"], dtype=float) tm.assert_frame_equal(result, expected) @@ -1219,7 +1234,9 @@ def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): tz = tz_naive_fixture idx = date_range(start="2015-07-12", periods=3, freq="H", tz=tz) expected = DataFrame(1.2, index=idx, columns=["var"]) - result = DataFrame(index=idx, columns=["var"]) + # if result started off with object dtype, then the .loc.__setitem__ + # below would retain object dtype + result = DataFrame(index=idx, columns=["var"], dtype=np.float64) result.loc[:, idxer] = expected tm.assert_frame_equal(result, expected)