From 2d2858fccb0ae0f5c6617d1d476de3e63d2f9705 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 13 Jan 2021 17:51:57 -0800 Subject: [PATCH 1/3] API/BUG: always try to operate inplace when setting with loc/iloc[foo, bar] --- pandas/core/indexers.py | 23 ++++++- pandas/core/indexing.py | 3 + pandas/core/internals/blocks.py | 22 +++--- pandas/tests/extension/base/setitem.py | 9 ++- pandas/tests/extension/test_numpy.py | 17 +++++ pandas/tests/frame/indexing/test_indexing.py | 4 +- pandas/tests/indexing/test_indexing.py | 1 + pandas/tests/indexing/test_loc.py | 71 ++++++++++++-------- 8 files changed, 109 insertions(+), 41 deletions(-) diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index a221f77f26170..79479c6db8d9d 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -5,7 +5,7 @@ import numpy as np -from pandas._typing import Any, AnyArrayLike +from pandas._typing import Any, AnyArrayLike, ArrayLike from pandas.core.dtypes.common import ( is_array_like, @@ -270,6 +270,27 @@ def maybe_convert_indices(indices, n: int): # Unsorted +def is_exact_shape_match(target: ArrayLike, value: ArrayLike) -> bool: + """ + Is setting this value into this target overwriting the entire column? + + Parameters + ---------- + target : np.ndarray or ExtensionArray + value : np.ndarray or ExtensionArray + + Returns + ------- + bool + """ + return ( + len(value.shape) > 0 + and len(target.shape) > 0 + and value.shape[0] == target.shape[0] + and value.size == target.size + ) + + def length_of_indexer(indexer, target=None) -> int: """ Return the expected length of target[indexer] diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ac0955c6c3e4c..f33e5e990fab2 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -33,6 +33,7 @@ from pandas.core.construction import array as pd_array from pandas.core.indexers import ( check_array_indexer, + is_exact_shape_match, is_list_like_indexer, length_of_indexer, ) @@ -1815,6 +1816,8 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): # GH#6149 (null slice), GH#10408 (full bounds) if com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)): ser = value + elif is_array_like(value) and is_exact_shape_match(value, ser): + ser = value else: # set the item, possibly having a dtype change ser = ser.copy() diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 9eb4bdc5dbae3..8f4753c9d0cfc 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -65,6 +65,8 @@ Categorical, DatetimeArray, ExtensionArray, + FloatingArray, + IntegerArray, PandasArray, PandasDtype, TimedeltaArray, @@ -75,6 +77,7 @@ from pandas.core.indexers import ( check_setitem_lengths, is_empty_indexer, + is_exact_shape_match, is_scalar_indexer, ) import pandas.core.missing as missing @@ -947,11 +950,8 @@ def setitem(self, indexer, value): # length checking check_setitem_lengths(indexer, value, values) - exact_match = ( - len(arr_value.shape) - and arr_value.shape[0] == values.shape[0] - and arr_value.size == values.size - ) + exact_match = is_exact_shape_match(values, arr_value) + if is_empty_indexer(indexer, arr_value): # GH#8669 empty indexers pass @@ -965,12 +965,14 @@ def setitem(self, indexer, value): # GH25495 - If the current dtype is not categorical, # we need to create a new categorical block values[indexer] = value - return self.make_block(Categorical(self.values, dtype=arr_value.dtype)) elif exact_match and is_ea_value: # GH#32395 if we're going to replace the values entirely, just # substitute in the new array - return self.make_block(arr_value) + if not self.is_object and isinstance(value, (IntegerArray, FloatingArray)): + values[indexer] = value.to_numpy(value.dtype.numpy_dtype) + else: + values[indexer] = np.asarray(value) # if we are an exact match (ex-broadcasting), # then use the resultant dtype @@ -978,8 +980,6 @@ def setitem(self, indexer, value): # We are setting _all_ of the array's values, so can cast to new dtype values[indexer] = value - values = values.astype(arr_value.dtype, copy=False) - elif is_ea_value: # GH#38952 if values.ndim == 1: @@ -1907,6 +1907,10 @@ class NumericBlock(Block): _can_hold_na = True def _can_hold_element(self, element: Any) -> bool: + element = extract_array(element, extract_numpy=True) + if isinstance(element, (IntegerArray, FloatingArray)): + if element._mask.any(): + return False return can_hold_element(self.dtype, element) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index cb9a19b438feb..e610c1659a268 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -341,13 +341,20 @@ def test_setitem_with_expansion_dataframe_column(self, data, full_indexer): key = full_indexer(df) result.loc[key, "data"] = df["data"] + self.assert_frame_equal(result, expected) def test_setitem_series(self, data, full_indexer): # https://github.com/pandas-dev/pandas/issues/32395 - ser = expected = pd.Series(data, name="data") + ser = pd.Series(data, name="data") result = pd.Series(index=ser.index, dtype=object, name="data") + # because result has object dtype, the attempt to do setting inplace + # is successful, and object dtype is retained key = full_indexer(ser) result.loc[key] = ser + + expected = pd.Series( + data.astype(object), index=ser.index, name="data", dtype=object + ) self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 1f0181eec8830..08608ce0a3d15 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -501,6 +501,23 @@ def test_setitem_slice(self, data, box_in_series): def test_setitem_loc_iloc_slice(self, data): super().test_setitem_loc_iloc_slice(data) + def test_setitem_with_expansion_dataframe_column(self, data, full_indexer): + # https://github.com/pandas-dev/pandas/issues/32395 + df = expected = pd.DataFrame({"data": pd.Series(data)}) + result = pd.DataFrame(index=df.index) + + # because result has object dtype, the attempt to do setting inplace + # is successful, and object dtype is retained + key = full_indexer(df) + result.loc[key, "data"] = df["data"] + + # base class method has expected = df; PandasArray behaves oddly because + # we patch _typ for these tests. + if data.dtype.numpy_dtype != object: + if not isinstance(key, slice) or key != slice(None): + expected = pd.DataFrame({"data": data.to_numpy()}) + self.assert_frame_equal(result, expected) + @skip_nested class TestParsing(BaseNumPyTests, base.BaseParsingTests): diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 6a16b93c8da2f..69a411047c7d2 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1465,7 +1465,9 @@ def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): tz = tz_naive_fixture idx = date_range(start="2015-07-12", periods=3, freq="H", tz=tz) expected = DataFrame(1.2, index=idx, columns=["var"]) - result = DataFrame(index=idx, columns=["var"]) + # if result started off with object dtype, tehn the .loc.__setitem__ + # below would retain object dtype + result = DataFrame(index=idx, columns=["var"], dtype=np.float64) result.loc[:, idxer] = expected tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index f67341ab176d7..509537f967bb1 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -637,6 +637,7 @@ def test_float_index_non_scalar_assignment(self): expected = DataFrame({"a": [1, 1, 3], "b": [1, 1, 5]}, index=df.index) tm.assert_frame_equal(expected, df) + def test_loc_setitem_fullindex_views(self): df = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}, index=[1.0, 2.0, 3.0]) df2 = df.copy() df.loc[df.index] = df.loc[df.index] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 7c73917e44b22..30d2b067b5918 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -556,34 +556,19 @@ def test_loc_modify_datetime(self): tm.assert_frame_equal(df, expected) - def test_loc_setitem_frame(self): - df = DataFrame(np.random.randn(4, 4), index=list("abcd"), columns=list("ABCD")) - - result = df.iloc[0, 0] - - df.loc["a", "A"] = 1 - result = df.loc["a", "A"] - assert result == 1 - - result = df.iloc[0, 0] - assert result == 1 - - df.loc[:, "B":"D"] = 0 - expected = df.loc[:, "B":"D"] - result = df.iloc[:, 1:] - tm.assert_frame_equal(result, expected) - - # GH 6254 - # setting issue - df = DataFrame(index=[3, 5, 4], columns=["A"]) + def test_loc_setitem_frame_with_reindex(self): + # GH#6254 setting issue + df = DataFrame(index=[3, 5, 4], columns=["A"], dtype=float) df.loc[[4, 3, 5], "A"] = np.array([1, 2, 3], dtype="int64") - expected = DataFrame({"A": Series([1, 2, 3], index=[4, 3, 5])}).reindex( - index=[3, 5, 4] - ) + + # setting integer values into a float dataframe with loc is inplace, + # so we retain float dtype + ser = Series([2, 3, 1], index=[3, 5, 4], dtype=float) + expected = DataFrame({"A": ser}) tm.assert_frame_equal(df, expected) - # GH 6252 - # setting with an empty frame + def test_loc_setitem_empty_frame(self): + # GH#6252 setting with an empty frame keys1 = ["@" + str(i) for i in range(5)] val1 = np.arange(5, dtype="int64") @@ -598,11 +583,31 @@ def test_loc_setitem_frame(self): df["B"] = np.nan df.loc[keys2, "B"] = val2 - expected = DataFrame( - {"A": Series(val1, index=keys1), "B": Series(val2, index=keys2)} - ).reindex(index=index) + # Becasue df["A"] was initialized as float64, setting values into it + # is inplace, so that dtype is retained + sera = Series(val1, index=keys1, dtype=np.float64) + serb = Series(val2, index=keys2) + expected = DataFrame({"A": sera, "B": serb}).reindex(index=index) tm.assert_frame_equal(df, expected) + def test_loc_setitem_frame(self): + df = DataFrame(np.random.randn(4, 4), index=list("abcd"), columns=list("ABCD")) + + result = df.iloc[0, 0] + + df.loc["a", "A"] = 1 + result = df.loc["a", "A"] + assert result == 1 + + result = df.iloc[0, 0] + assert result == 1 + + df.loc[:, "B":"D"] = 0 + expected = df.loc[:, "B":"D"] + result = df.iloc[:, 1:] + tm.assert_frame_equal(result, expected) + + def test_loc_setitem_frame_nan_int_coercion_invalid(self): # GH 8669 # invalid coercion of nan -> int df = DataFrame({"A": [1, 2, 3], "B": np.nan}) @@ -610,6 +615,7 @@ def test_loc_setitem_frame(self): expected = DataFrame({"A": [1, 2, 3], "B": np.nan}) tm.assert_frame_equal(df, expected) + def test_loc_setitem_frame_mixed_labels(self): # GH 6546 # setting with mixed labels df = DataFrame({1: [1, 2], 2: [3, 4], "a": ["a", "b"]}) @@ -1035,8 +1041,15 @@ def test_loc_setitem_str_to_small_float_conversion_type(self): expected = DataFrame(col_data, columns=["A"], dtype=object) tm.assert_frame_equal(result, expected) - # change the dtype of the elements from object to float one by one + # assigning with loc/iloc attempts to set the values inplace, which + # in this case is succesful result.loc[result.index, "A"] = [float(x) for x in col_data] + expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object) + tm.assert_frame_equal(result, expected) + + # assigning the entire column using __setitem__ swaps in the new array + # GH#??? + result["A"] = [float(x) for x in col_data] expected = DataFrame(col_data, columns=["A"], dtype=float) tm.assert_frame_equal(result, expected) From 515726de9bab5fd196243ffc42f8811fa7995917 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 20 Jan 2021 16:10:21 -0800 Subject: [PATCH 2/3] more whatsnew --- doc/source/whatsnew/v1.3.0.rst | 39 ++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index b061c4040096e..2f84c594e7b64 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -95,6 +95,45 @@ Preserve dtypes in :meth:`~pandas.DataFrame.combine_first` combined.dtypes +Try operating inplace when setting values with ``loc`` and ``iloc`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When setting an entire column using ``loc`` or ``iloc``, pandas will try to +insert the values into the existing data rather than create an entirely new array. + +.. ipython:: python + + df = pd.DataFrame(range(3), columns=["A"], dtype="float64") + values = df.values + new = np.array([5, 6, 7], dtype="int64") + df.loc[[0, 1, 2], "A"] = new + +In both the new and old behavior, the data in ``values`` is overwritten, but in +the old behavior neither ``values`` nor ``new`` is a view on ``df["A"]``. + +*pandas 1.2.x* + +.. code-block:: ipython + + In [1]: df.iloc[0, 0] = 9 + In [2]: values # does not have values[0, 0] == 9 + Out[2]: + array([[5.], + [6.], + [7.]]) + In [3]: new # does not have new[0] == 9 + Out [3]: array([5, 6, 7]) + +In pandas 1.3.0, ``df`` continues to share data with ``values`` + +*pandas 1.3.0* + +.. ipython:: python + + df.iloc[0, 0] = 9 + values # _does_ have values[0, 0] == 9 + + .. _whatsnew_130.api_breaking.deps: Increased minimum versions for dependencies From 09c8ffda44bfd41f3b76cdb54128d7d8a143f513 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 9 Feb 2021 21:33:52 -0800 Subject: [PATCH 3/3] re-write whatnsew note --- doc/source/whatsnew/v1.3.0.rst | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index fb43d7154485e..dcdc0eae6b93e 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -115,20 +115,20 @@ insert the values into the existing data rather than create an entirely new arra df.loc[[0, 1, 2], "A"] = new In both the new and old behavior, the data in ``values`` is overwritten, but in -the old behavior neither ``values`` nor ``new`` is a view on ``df["A"]``. +the old behavior the dtype of ``df["A"]`` changed to ``int64``. *pandas 1.2.x* .. code-block:: ipython - In [1]: df.iloc[0, 0] = 9 - In [2]: values # does not have values[0, 0] == 9 - Out[2]: - array([[5.], - [6.], - [7.]]) - In [3]: new # does not have new[0] == 9 - Out [3]: array([5, 6, 7]) + In [1]: df.dtypes + Out[1]: + A int64 + dtype: object + In [2]: np.shares_memory(df["A"].values, new) + Out[2]: False + In [3]: np.shares_memory(df["A"].values, values) + Out[3]: False In pandas 1.3.0, ``df`` continues to share data with ``values`` @@ -136,8 +136,9 @@ In pandas 1.3.0, ``df`` continues to share data with ``values`` .. ipython:: python - df.iloc[0, 0] = 9 - values # _does_ have values[0, 0] == 9 + df.dtypes + np.shares_memory(df["A"], new) + np.shares_memory(df["A"], values) .. _whatsnew_130.notable_bug_fixes.setitem_with_bool_casting: