From 0c6de301e2333f2f5a9ef006dfe38ce734e71bd9 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 27 May 2022 15:33:47 +0200 Subject: [PATCH 1/5] REGR: setitem writing into RangeIndex instead of creating a copy --- doc/source/whatsnew/v1.4.3.rst | 1 + pandas/core/construction.py | 4 +++- pandas/tests/frame/indexing/test_setitem.py | 9 ++++++++ pandas/tests/test_downstream.py | 25 +++++++++++++++++++++ 4 files changed, 38 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index bf414ab77cf65..ed2469c3267a4 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -15,6 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.replace` when the replacement value was explicitly ``None`` when passed in a dictionary to ``to_replace`` also casting other columns to object dtype even when there were no values to replace (:issue:`46634`) +- Fixed regression in :meth:`DataFrame.loc.__setitem__` updating :class:`RangeIndex` when index was set as new column and column was updated afterwards (:issue:`47128`) - Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when ``np.nan`` in the sorting column (:issue:`46589`) - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`) - Fixed regression in :meth:`.Groupby.transform` and :meth:`.Groupby.agg` failing with ``engine="numba"`` when the index was a :class:`MultiIndex` (:issue:`46867`) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 434302b39fef9..145a0bbfc6476 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -531,7 +531,7 @@ def sanitize_array( dtype = dtype.numpy_dtype # extract ndarray or ExtensionArray, ensure we have no PandasArray - data = extract_array(data, extract_numpy=True) + data = extract_array(data, extract_numpy=True, extract_range=True) if isinstance(data, np.ndarray) and data.ndim == 0: if dtype is None: @@ -611,6 +611,8 @@ def sanitize_array( if hasattr(data, "__array__"): # e.g. dask array GH#38645 data = np.asarray(data) + if copy: + data = np.copy(data) else: data = list(data) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index fda37fdedb92a..8c5ebf9c1b125 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -854,6 +854,15 @@ def test_frame_setitem_newcol_timestamp(self): data[ts] = np.nan # works, mostly a smoke-test assert np.isnan(data[ts]).all() + def test_frame_setitem_rangeindex_into_new_col(self): + # GH#47128 + df = DataFrame({"a": ["a", "b"]}) + df["b"] = df.index + df.loc[[False, True], "b"] = 100 + result = df.loc[[1], :] + expected = DataFrame({"a": ["b"], "b": [100]}, index=[1]) + tm.assert_frame_equal(result, expected) + class TestDataFrameSetItemSlicing: def test_setitem_slice_position(self): diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index b4887cc321785..56fe1ef51d668 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -304,3 +304,28 @@ def test_missing_required_dependency(): output = exc.value.stdout.decode() for name in ["numpy", "pytz", "dateutil"]: assert name in output + + +def test_frame_setitem_dask_array_into_new_col(): + # GH#47128 + + # dask sets "compute.use_numexpr" to False, so catch the current value + # and ensure to reset it afterwards to avoid impacting other tests + olduse = pd.get_option("compute.use_numexpr") + + try: + toolz = import_module("toolz") # noqa:F841 + dask = import_module("dask") # noqa:F841 + + import dask.array as da + + dda = da.array([1, 2]) + df = DataFrame({"a": ["a", "b"]}) + df["b"] = dda + df["c"] = dda + df.loc[[False, True], "b"] = 100 + result = df.loc[[1], :] + expected = DataFrame({"a": ["b"], "b": [100], "c": [2]}, index=[1]) + tm.assert_frame_equal(result, expected) + finally: + pd.set_option("compute.use_numexpr", olduse) From 4dd122b8764a2591cd7353fb7884d84190f75f99 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 27 May 2022 15:37:20 +0200 Subject: [PATCH 2/5] Remove import --- pandas/tests/test_downstream.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 56fe1ef51d668..119ffd8cfd5a1 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -314,7 +314,6 @@ def test_frame_setitem_dask_array_into_new_col(): olduse = pd.get_option("compute.use_numexpr") try: - toolz = import_module("toolz") # noqa:F841 dask = import_module("dask") # noqa:F841 import dask.array as da From f0dcf7f6f36e4de527f01f7f0f1a114757b994fd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 27 May 2022 15:49:53 +0200 Subject: [PATCH 3/5] Update doc/source/whatsnew/v1.4.3.rst Co-authored-by: Simon Hawkins --- doc/source/whatsnew/v1.4.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index ed2469c3267a4..72475d018a61f 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -15,7 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.replace` when the replacement value was explicitly ``None`` when passed in a dictionary to ``to_replace`` also casting other columns to object dtype even when there were no values to replace (:issue:`46634`) -- Fixed regression in :meth:`DataFrame.loc.__setitem__` updating :class:`RangeIndex` when index was set as new column and column was updated afterwards (:issue:`47128`) +- Fixed regression when setting values with :meth:`DataFrame.loc` updating :class:`RangeIndex` when index was set as new column and column was updated afterwards (:issue:`47128`) - Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when ``np.nan`` in the sorting column (:issue:`46589`) - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`) - Fixed regression in :meth:`.Groupby.transform` and :meth:`.Groupby.agg` failing with ``engine="numba"`` when the index was a :class:`MultiIndex` (:issue:`46867`) From b03d1c9a297097fa312c07c5ed2f7921f018095c Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 27 May 2022 16:25:03 +0200 Subject: [PATCH 4/5] Fix test --- pandas/tests/copy_view/test_setitem.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py index 37714e346eee7..dd42983179806 100644 --- a/pandas/tests/copy_view/test_setitem.py +++ b/pandas/tests/copy_view/test_setitem.py @@ -60,21 +60,14 @@ def test_set_column_with_index(using_copy_on_write): idx.values[0] = 0 tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c")) - # however, in case of a RangeIndex, we currently don't copy the cached - # "materialized" values idx = RangeIndex(1, 4) arr = idx.values df["d"] = idx - if using_copy_on_write: - assert not np.shares_memory(df["d"].values, arr) - arr[0] = 0 - tm.assert_series_equal(df["d"], Series([1, 2, 3], name="d")) - else: - assert np.shares_memory(df["d"].values, arr) - arr[0] = 0 - tm.assert_series_equal(df["d"], Series([0, 2, 3], name="d")) + assert not np.shares_memory(df["d"].values, arr) + arr[0] = 0 + tm.assert_series_equal(df["d"], Series([1, 2, 3], name="d")) def test_set_columns_with_dataframe(using_copy_on_write): From b7a57c154e69c2bab014b4c73be5fe7897d3b992 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 3 Jun 2022 14:09:22 +0200 Subject: [PATCH 5/5] Change to array --- pandas/core/construction.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 145a0bbfc6476..8d26284a5ce45 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -610,9 +610,7 @@ def sanitize_array( # materialize e.g. generators, convert e.g. tuples, abc.ValueView if hasattr(data, "__array__"): # e.g. dask array GH#38645 - data = np.asarray(data) - if copy: - data = np.copy(data) + data = np.array(data, copy=copy) else: data = list(data)