REGR: setitem writing into RangeIndex instead of creating a copy (pandas-dev#47143)

phofl · yehoshuadimarsky · commit 1728768231bb · 2022-07-13T10:18:03.000-04:00
diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst
@@ -15,6 +15,7 @@ including other versions of pandas.
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
 - Fixed regression in :meth:`DataFrame.replace` when the replacement value was explicitly ``None`` when passed in a dictionary to ``to_replace`` also casting other columns to object dtype even when there were no values to replace (:issue:`46634`)
+- Fixed regression where setting values with :meth:`DataFrame.loc` also modified the :class:`RangeIndex` when the index had been assigned as a new column and that column was updated afterwards (:issue:`47128`)
 - Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when ``np.nan`` in the sorting column (:issue:`46589`)
 - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`)
 - Fixed regression in :func:`concat` not sorting columns for mixed column names (:issue:`47127`)
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
@@ -531,7 +531,7 @@ def sanitize_array(
         dtype = dtype.numpy_dtype
 
     # extract ndarray or ExtensionArray, ensure we have no PandasArray
-    data = extract_array(data, extract_numpy=True)
+    data = extract_array(data, extract_numpy=True, extract_range=True)
 
     if isinstance(data, np.ndarray) and data.ndim == 0:
         if dtype is None:
@@ -610,7 +610,7 @@ def sanitize_array(
         # materialize e.g. generators, convert e.g. tuples, abc.ValueView
         if hasattr(data, "__array__"):
             # e.g. dask array GH#38645
-            data = np.asarray(data)
+            data = np.array(data, copy=copy)
         else:
             data = list(data)
 
diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py
@@ -60,21 +60,14 @@ def test_set_column_with_index(using_copy_on_write):
     idx.values[0] = 0
     tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c"))
 
-    # however, in case of a RangeIndex, we currently don't copy the cached
-    # "materialized" values
     idx = RangeIndex(1, 4)
     arr = idx.values
 
     df["d"] = idx
 
-    if using_copy_on_write:
-        assert not np.shares_memory(df["d"].values, arr)
-        arr[0] = 0
-        tm.assert_series_equal(df["d"], Series([1, 2, 3], name="d"))
-    else:
-        assert np.shares_memory(df["d"].values, arr)
-        arr[0] = 0
-        tm.assert_series_equal(df["d"], Series([0, 2, 3], name="d"))
+    assert not np.shares_memory(df["d"].values, arr)
+    arr[0] = 0
+    tm.assert_series_equal(df["d"], Series([1, 2, 3], name="d"))
 
 
 def test_set_columns_with_dataframe(using_copy_on_write):
diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
@@ -857,6 +857,15 @@ def test_frame_setitem_newcol_timestamp(self):
         data[ts] = np.nan  # works, mostly a smoke-test
         assert np.isnan(data[ts]).all()
 
+    def test_frame_setitem_rangeindex_into_new_col(self):
+        # GH#47128: writing into a column created from the frame's own index
+        df = DataFrame({"a": ["a", "b"]})
+        df["b"] = df.index  # new column is built from the default index
+        df.loc[[False, True], "b"] = 100  # boolean mask: update second row only
+        result = df.loc[[1], :]  # label lookup; expected below keeps index label 1
+        expected = DataFrame({"a": ["b"], "b": [100]}, index=[1])
+        tm.assert_frame_equal(result, expected)
+
 
 class TestDataFrameSetItemSlicing:
     def test_setitem_slice_position(self):
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
@@ -304,3 +304,27 @@ def test_missing_required_dependency():
     output = exc.value.stdout.decode()
     for name in ["numpy", "pytz", "dateutil"]:
         assert name in output
+
+
+def test_frame_setitem_dask_array_into_new_col():
+    # GH#47128: one dask array assigned to two columns, then one column updated
+
+    # dask sets "compute.use_numexpr" to False, so capture the current value
+    # and restore it in the ``finally`` block to avoid impacting other tests
+    olduse = pd.get_option("compute.use_numexpr")
+
+    try:
+        dask = import_module("dask")  # noqa:F841
+
+        import dask.array as da
+
+        dda = da.array([1, 2])
+        df = DataFrame({"a": ["a", "b"]})
+        df["b"] = dda
+        df["c"] = dda  # same source as "b"; expected below keeps "c" unchanged
+        df.loc[[False, True], "b"] = 100  # boolean mask: update second row only
+        result = df.loc[[1], :]
+        expected = DataFrame({"a": ["b"], "b": [100], "c": [2]}, index=[1])
+        tm.assert_frame_equal(result, expected)
+    finally:
+        pd.set_option("compute.use_numexpr", olduse)