From 35073731ed03fb4b319320271bcd96e86525baa8 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 26 Feb 2021 18:24:35 -0800 Subject: [PATCH 1/4] BUG: df.loc setitem-with-expansion with duplicate index --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/core/indexes/datetimelike.py | 11 ++-- pandas/core/indexes/extension.py | 19 +++++-- pandas/core/indexes/multi.py | 7 +-- pandas/core/indexing.py | 7 ++- .../indexes/categorical/test_category.py | 14 +++--- pandas/tests/indexing/test_categorical.py | 43 ++++++++++++---- pandas/tests/indexing/test_loc.py | 50 +++++++++++++++++++ 8 files changed, 115 insertions(+), 38 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 32a2514b3b6a3..fdf7c6b5c80fe 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -374,7 +374,7 @@ Indexing - Bug in :meth:`RangeIndex.append` where a single object of length 1 was concatenated incorrectly (:issue:`39401`) - Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`) - Bug in setting numeric values into a into a boolean-dtypes :class:`Series` using ``at`` or ``iat`` failing to cast to object-dtype (:issue:`39582`) -- +- Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`???`) Missing ^^^^^^^ diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 6d5992540ef49..caee981f37c02 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -618,13 +618,10 @@ def delete(self: _T, loc) -> _T: @doc(NDArrayBackedExtensionIndex.insert) def insert(self, loc: int, item): - try: - result = super().insert(loc, item) - except (ValueError, TypeError): - # i.e. self._data._validate_scalar raised - return self.astype(object).insert(loc, item) - - result._data._freq = self._get_insert_freq(loc, item) + result = super().insert(loc, item) + if isinstance(result, type(self)): + # i.e. parent class method did not cast + result._data._freq = self._get_insert_freq(loc, item) return result # -------------------------------------------------------------------- diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index f1418869713d6..514222c4ea11c 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -16,6 +16,10 @@ doc, ) +from pandas.core.dtypes.cast import ( + find_common_type, + infer_dtype_from, +) from pandas.core.dtypes.common import ( is_dtype_equal, is_object_dtype, @@ -370,11 +374,16 @@ def insert(self: _T, loc: int, item) -> _T: ValueError if the item is not valid for this dtype. """ arr = self._data - code = arr._validate_scalar(item) - - new_vals = np.concatenate((arr._ndarray[:loc], [code], arr._ndarray[loc:])) - new_arr = arr._from_backing_data(new_vals) - return type(self)._simple_new(new_arr, name=self.name) + try: + code = arr._validate_scalar(item) + except (ValueError, TypeError): + dtype, _ = infer_dtype_from(item, pandas_dtype=True) + dtype = find_common_type([self.dtype, dtype]) + return self.astype(dtype).insert(loc, item) + else: + new_vals = np.concatenate((arr._ndarray[:loc], [code], arr._ndarray[loc:])) + new_arr = arr._from_backing_data(new_vals) + return type(self)._simple_new(new_arr, name=self.name) def putmask(self, mask, value): res_values = self._data.copy() diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 1889821c79756..88b92c7b304ae 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3719,12 +3719,7 @@ def insert(self, loc: int, item) -> MultiIndex: # must insert at end otherwise you have to recompute all the # other codes lev_loc = len(level) - try: - level = level.insert(lev_loc, k) - except TypeError: - # TODO: Should this be done inside insert? - # TODO: smarter casting rules? - level = level.astype(object).insert(lev_loc, k) + level = level.insert(lev_loc, k) else: lev_loc = level.get_loc(k) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index cfe16627d5c64..ca9dc4efb47a3 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1641,7 +1641,12 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): # so the object is the same index = self.obj._get_axis(i) labels = index.insert(len(index), key) - self.obj._mgr = self.obj.reindex(labels, axis=i)._mgr + taker = list(range(len(index))) + [-1] + reindexers = {i: (labels, taker)} + new_obj = self.obj._reindex_with_indexers( + reindexers, allow_dups=True + ) + self.obj._mgr = new_obj._mgr self.obj._maybe_update_cacher(clear=True) self.obj._is_copy = None diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 8c9caf2e59011..5d9a9dd6d2f74 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -97,10 +97,10 @@ def test_insert(self): expected = CategoricalIndex(["a"], categories=categories) tm.assert_index_equal(result, expected, exact=True) - # invalid - msg = "'fill_value=d' is not present in this Categorical's categories" - with pytest.raises(TypeError, match=msg): - ci.insert(0, "d") + # invalid -> cast to object + expected = ci.astype(object).insert(0, "d") + result = ci.insert(0, "d") + tm.assert_index_equal(result, expected, exact=True) # GH 18295 (test missing) expected = CategoricalIndex(["a", np.nan, "a", "b", "c", "b"]) @@ -110,9 +110,9 @@ def test_insert(self): def test_insert_na_mismatched_dtype(self): ci = CategoricalIndex([0, 1, 1]) - msg = "'fill_value=NaT' is not present in this Categorical's categories" - with pytest.raises(TypeError, match=msg): - ci.insert(0, pd.NaT) + result = ci.insert(0, pd.NaT) + expected = Index([pd.NaT, 0, 1, 1], dtype=object) + tm.assert_index_equal(result, expected) def test_delete(self): diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 68ae1a0dd6f3d..ad11a16baebc9 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -37,20 +37,24 @@ def setup_method(self, method): ) def test_loc_scalar(self): + dtype = CDT(list("cab")) result = self.df.loc["a"] - expected = DataFrame( - {"A": [0, 1, 5], "B": (Series(list("aaa")).astype(CDT(list("cab"))))} - ).set_index("B") + bidx = Series(list("aaa"), name="B").astype(dtype) + assert bidx.dtype == dtype + + expected = DataFrame({"A": [0, 1, 5]}, index=Index(bidx)) tm.assert_frame_equal(result, expected) df = self.df.copy() df.loc["a"] = 20 + bidx2 = Series(list("aabbca"), name="B").astype(dtype) + assert bidx2.dtype == dtype expected = DataFrame( { "A": [20, 20, 2, 3, 4, 20], - "B": (Series(list("aabbca")).astype(CDT(list("cab")))), - } - ).set_index("B") + }, + index=Index(bidx2), + ) tm.assert_frame_equal(df, expected) # value not in the categories @@ -64,11 +68,28 @@ def test_loc_scalar(self): df2.loc["d"] = 10 tm.assert_frame_equal(df2, expected) - msg = "'fill_value=d' is not present in this Categorical's categories" - with pytest.raises(TypeError, match=msg): - df.loc["d", "A"] = 10 - with pytest.raises(TypeError, match=msg): - df.loc["d", "C"] = 10 + # Setting-with-expansion with a new key "d" that is not among caegories + df3 = df.copy() + df3.loc["d", "A"] = 10 + bidx3 = Index(list("aabbcad"), name="B") + expected3 = DataFrame( + { + "A": [20, 20, 2, 3, 4, 20, 10.0], + }, + index=Index(bidx3), + ) + tm.assert_frame_equal(df3, expected3) + + df4 = df.copy() + df4.loc["d", "C"] = 10 + expected3 = DataFrame( + { + "A": [20, 20, 2, 3, 4, 20, np.nan], + "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 10], + }, + index=Index(bidx3), + ) + tm.assert_frame_equal(df4, expected3) with pytest.raises(KeyError, match="^1$"): df.loc[1] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 7c9b11650f05a..58d45d1fba0b0 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -23,6 +23,7 @@ DatetimeIndex, Index, IndexSlice, + IntervalIndex, MultiIndex, Period, Series, @@ -1657,6 +1658,55 @@ def test_loc_setitem_with_expansion_inf_upcast_empty(self): expected = pd.Float64Index([0, 1, np.inf]) tm.assert_index_equal(result, expected) + @pytest.mark.filterwarnings("ignore:indexing past lexsort depth") + def test_loc_setitem_with_expansion_nonunique_index(self, index, request): + if not len(index): + return + if isinstance(index, IntervalIndex): + mark = pytest.mark.xfail(reason="IntervalIndex raises") + request.node.add_marker(mark) + + index = index.repeat(2) # ensure non-unique + N = len(index) + arr = np.arange(N).astype(np.int64) + + orig = DataFrame(arr, index=index, columns=[0]) + + # key that will requiring object-dtype casting in the index + key = "kapow" + assert key not in index # otherwise test is invalid + # TODO: using a tuple key breaks here in many cases + + exp_index = index.insert(len(index), key) + if isinstance(index, MultiIndex): + exp_index = index.insert(len(index), key) + assert exp_index[-1][0] == key + else: + assert exp_index[-1] == key + exp_data = np.arange(N + 1).astype(np.float64) + expected = DataFrame(exp_data, index=exp_index, columns=[0]) + + # Add new row, but no new columns + df = orig.copy() + df.loc[key, 0] = N + tm.assert_frame_equal(df, expected) + + # add new row on a Series + ser = orig.copy()[0] + ser.loc[key] = N + # the series machinery lets us preserve int dtype instead of float + expected = expected[0].astype(np.int64) + tm.assert_series_equal(ser, expected) + + # add new row and new column + df = orig.copy() + df.loc[key, 1] = N + expected = DataFrame( + {0: list(arr) + [np.nan], 1: [np.nan] * N + [float(N)]}, + index=exp_index, + ) + tm.assert_frame_equal(df, expected) + class TestLocCallable: def test_frame_loc_getitem_callable(self): From 09eb6c583c821a18dd0a29764636ff2759442db2 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 26 Feb 2021 18:26:05 -0800 Subject: [PATCH 2/4] GH ref --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/tests/indexing/test_loc.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index fdf7c6b5c80fe..32c36c1ffd03f 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -374,7 +374,7 @@ Indexing - Bug in :meth:`RangeIndex.append` where a single object of length 1 was concatenated incorrectly (:issue:`39401`) - Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`) - Bug in setting numeric values into a into a boolean-dtypes :class:`Series` using ``at`` or ``iat`` failing to cast to object-dtype (:issue:`39582`) -- Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`???`) +- Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`40096`) Missing ^^^^^^^ diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 58d45d1fba0b0..2ae7830dd0495 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1660,6 +1660,7 @@ def test_loc_setitem_with_expansion_inf_upcast_empty(self): @pytest.mark.filterwarnings("ignore:indexing past lexsort depth") def test_loc_setitem_with_expansion_nonunique_index(self, index, request): + # GH#40096 if not len(index): return if isinstance(index, IntervalIndex): From 0afec6311f47c9cfd25c725264a419b1df231308 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 1 Mar 2021 20:00:15 -0800 Subject: [PATCH 3/4] comments, split test --- pandas/core/indexes/extension.py | 3 +++ pandas/core/indexing.py | 4 ++++ pandas/tests/indexing/test_categorical.py | 9 ++++++++- pandas/tests/indexing/test_loc.py | 1 - 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 514222c4ea11c..ac70200c0c404 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -377,6 +377,9 @@ def insert(self: _T, loc: int, item) -> _T: try: code = arr._validate_scalar(item) except (ValueError, TypeError): + # e.g. trying to insert an integer into a DatetimeIndex + # We cannot keep the same dtype, so cast to the (often object) + # minimal shared dtype before doing the insert. dtype, _ = infer_dtype_from(item, pandas_dtype=True) dtype = find_common_type([self.dtype, dtype]) return self.astype(dtype).insert(loc, item) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ca9dc4efb47a3..261f152099be7 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1641,6 +1641,10 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): # so the object is the same index = self.obj._get_axis(i) labels = index.insert(len(index), key) + + # We are expanding the Series/DataFrame values to match + # the length of thenew index `labels`. GH#40096 ensure + # this is valid even if the index has duplicates. taker = list(range(len(index))) + [-1] reindexers = {i: (labels, taker)} new_obj = self.obj._reindex_with_indexers( diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index ad11a16baebc9..f104587ebbded 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -68,7 +68,12 @@ def test_loc_scalar(self): df2.loc["d"] = 10 tm.assert_frame_equal(df2, expected) + def test_loc_setitem_with_expansion_non_category(self): # Setting-with-expansion with a new key "d" that is not among caegories + df = self.df + df.loc["a"] = 20 + + # Setting a new row on an existing column df3 = df.copy() df3.loc["d", "A"] = 10 bidx3 = Index(list("aabbcad"), name="B") @@ -80,6 +85,7 @@ def test_loc_scalar(self): ) tm.assert_frame_equal(df3, expected3) + # Settig a new row _and_ new column df4 = df.copy() df4.loc["d", "C"] = 10 expected3 = DataFrame( @@ -91,8 +97,9 @@ def test_loc_scalar(self): ) tm.assert_frame_equal(df4, expected3) + def test_loc_getitem_scalar_non_category(self): with pytest.raises(KeyError, match="^1$"): - df.loc[1] + self.df.loc[1] def test_slicing(self): cat = Series(Categorical([1, 2, 3, 4])) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 2ae7830dd0495..5b6c042a11332 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1680,7 +1680,6 @@ def test_loc_setitem_with_expansion_nonunique_index(self, index, request): exp_index = index.insert(len(index), key) if isinstance(index, MultiIndex): - exp_index = index.insert(len(index), key) assert exp_index[-1][0] == key else: assert exp_index[-1] == key From a280932457435f8ffd9112b888845ba1ccd56d1d Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Mar 2021 09:12:55 -0800 Subject: [PATCH 4/4] avoid constructing list --- pandas/core/indexing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 261f152099be7..411480a1b1201 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1645,7 +1645,8 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): # We are expanding the Series/DataFrame values to match # the length of thenew index `labels`. GH#40096 ensure # this is valid even if the index has duplicates. - taker = list(range(len(index))) + [-1] + taker = np.arange(len(index) + 1, dtype=np.intp) + taker[-1] = -1 reindexers = {i: (labels, taker)} new_obj = self.obj._reindex_with_indexers( reindexers, allow_dups=True