From 35073731ed03fb4b319320271bcd96e86525baa8 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 26 Feb 2021 18:24:35 -0800
Subject: [PATCH 1/4] BUG: df.loc setitem-with-expansion with duplicate index

---
 doc/source/whatsnew/v1.3.0.rst                |  2 +-
 pandas/core/indexes/datetimelike.py           | 11 ++--
 pandas/core/indexes/extension.py              | 19 +++++--
 pandas/core/indexes/multi.py                  |  7 +--
 pandas/core/indexing.py                       |  7 ++-
 .../indexes/categorical/test_category.py      | 14 +++---
 pandas/tests/indexing/test_categorical.py     | 43 ++++++++++++----
 pandas/tests/indexing/test_loc.py             | 50 +++++++++++++++++++
 8 files changed, 115 insertions(+), 38 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 32a2514b3b6a3..fdf7c6b5c80fe 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -374,7 +374,7 @@ Indexing
 - Bug in :meth:`RangeIndex.append` where a single object of length 1 was concatenated incorrectly (:issue:`39401`)
 - Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`)
 - Bug in setting numeric values into a into a boolean-dtypes :class:`Series` using ``at`` or ``iat`` failing to cast to object-dtype (:issue:`39582`)
--
+- Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`???`)
 
 Missing
 ^^^^^^^
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index 6d5992540ef49..caee981f37c02 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -618,13 +618,10 @@ def delete(self: _T, loc) -> _T:
 
     @doc(NDArrayBackedExtensionIndex.insert)
     def insert(self, loc: int, item):
-        try:
-            result = super().insert(loc, item)
-        except (ValueError, TypeError):
-            # i.e. self._data._validate_scalar raised
-            return self.astype(object).insert(loc, item)
-
-        result._data._freq = self._get_insert_freq(loc, item)
+        result = super().insert(loc, item)
+        if isinstance(result, type(self)):
+            # i.e. parent class method did not cast
+            result._data._freq = self._get_insert_freq(loc, item)
         return result
 
     # --------------------------------------------------------------------
diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py
index f1418869713d6..514222c4ea11c 100644
--- a/pandas/core/indexes/extension.py
+++ b/pandas/core/indexes/extension.py
@@ -16,6 +16,10 @@
     doc,
 )
 
+from pandas.core.dtypes.cast import (
+    find_common_type,
+    infer_dtype_from,
+)
 from pandas.core.dtypes.common import (
     is_dtype_equal,
     is_object_dtype,
@@ -370,11 +374,16 @@ def insert(self: _T, loc: int, item) -> _T:
         ValueError if the item is not valid for this dtype.
         """
         arr = self._data
-        code = arr._validate_scalar(item)
-
-        new_vals = np.concatenate((arr._ndarray[:loc], [code], arr._ndarray[loc:]))
-        new_arr = arr._from_backing_data(new_vals)
-        return type(self)._simple_new(new_arr, name=self.name)
+        try:
+            code = arr._validate_scalar(item)
+        except (ValueError, TypeError):
+            dtype, _ = infer_dtype_from(item, pandas_dtype=True)
+            dtype = find_common_type([self.dtype, dtype])
+            return self.astype(dtype).insert(loc, item)
+        else:
+            new_vals = np.concatenate((arr._ndarray[:loc], [code], arr._ndarray[loc:]))
+            new_arr = arr._from_backing_data(new_vals)
+            return type(self)._simple_new(new_arr, name=self.name)
 
     def putmask(self, mask, value):
         res_values = self._data.copy()
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 1889821c79756..88b92c7b304ae 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -3719,12 +3719,7 @@ def insert(self, loc: int, item) -> MultiIndex:
                 # must insert at end otherwise you have to recompute all the
                 # other codes
                 lev_loc = len(level)
-                try:
-                    level = level.insert(lev_loc, k)
-                except TypeError:
-                    # TODO: Should this be done inside insert?
-                    # TODO: smarter casting rules?
-                    level = level.astype(object).insert(lev_loc, k)
+                level = level.insert(lev_loc, k)
             else:
                 lev_loc = level.get_loc(k)
 
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index cfe16627d5c64..ca9dc4efb47a3 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -1641,7 +1641,12 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"):
                     # so the object is the same
                     index = self.obj._get_axis(i)
                     labels = index.insert(len(index), key)
-                    self.obj._mgr = self.obj.reindex(labels, axis=i)._mgr
+                    taker = list(range(len(index))) + [-1]
+                    reindexers = {i: (labels, taker)}
+                    new_obj = self.obj._reindex_with_indexers(
+                        reindexers, allow_dups=True
+                    )
+                    self.obj._mgr = new_obj._mgr
                     self.obj._maybe_update_cacher(clear=True)
                     self.obj._is_copy = None
 
diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py
index 8c9caf2e59011..5d9a9dd6d2f74 100644
--- a/pandas/tests/indexes/categorical/test_category.py
+++ b/pandas/tests/indexes/categorical/test_category.py
@@ -97,10 +97,10 @@ def test_insert(self):
         expected = CategoricalIndex(["a"], categories=categories)
         tm.assert_index_equal(result, expected, exact=True)
 
-        # invalid
-        msg = "'fill_value=d' is not present in this Categorical's categories"
-        with pytest.raises(TypeError, match=msg):
-            ci.insert(0, "d")
+        # invalid -> cast to object
+        expected = ci.astype(object).insert(0, "d")
+        result = ci.insert(0, "d")
+        tm.assert_index_equal(result, expected, exact=True)
 
         # GH 18295 (test missing)
         expected = CategoricalIndex(["a", np.nan, "a", "b", "c", "b"])
@@ -110,9 +110,9 @@ def test_insert(self):
 
     def test_insert_na_mismatched_dtype(self):
         ci = CategoricalIndex([0, 1, 1])
-        msg = "'fill_value=NaT' is not present in this Categorical's categories"
-        with pytest.raises(TypeError, match=msg):
-            ci.insert(0, pd.NaT)
+        result = ci.insert(0, pd.NaT)
+        expected = Index([pd.NaT, 0, 1, 1], dtype=object)
+        tm.assert_index_equal(result, expected)
 
     def test_delete(self):
 
diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py
index 68ae1a0dd6f3d..ad11a16baebc9 100644
--- a/pandas/tests/indexing/test_categorical.py
+++ b/pandas/tests/indexing/test_categorical.py
@@ -37,20 +37,24 @@ def setup_method(self, method):
         )
 
     def test_loc_scalar(self):
+        dtype = CDT(list("cab"))
         result = self.df.loc["a"]
-        expected = DataFrame(
-            {"A": [0, 1, 5], "B": (Series(list("aaa")).astype(CDT(list("cab"))))}
-        ).set_index("B")
+        bidx = Series(list("aaa"), name="B").astype(dtype)
+        assert bidx.dtype == dtype
+
+        expected = DataFrame({"A": [0, 1, 5]}, index=Index(bidx))
         tm.assert_frame_equal(result, expected)
 
         df = self.df.copy()
         df.loc["a"] = 20
+        bidx2 = Series(list("aabbca"), name="B").astype(dtype)
+        assert bidx2.dtype == dtype
         expected = DataFrame(
             {
                 "A": [20, 20, 2, 3, 4, 20],
-                "B": (Series(list("aabbca")).astype(CDT(list("cab")))),
-            }
-        ).set_index("B")
+            },
+            index=Index(bidx2),
+        )
         tm.assert_frame_equal(df, expected)
 
         # value not in the categories
@@ -64,11 +68,28 @@ def test_loc_scalar(self):
         df2.loc["d"] = 10
         tm.assert_frame_equal(df2, expected)
 
-        msg = "'fill_value=d' is not present in this Categorical's categories"
-        with pytest.raises(TypeError, match=msg):
-            df.loc["d", "A"] = 10
-        with pytest.raises(TypeError, match=msg):
-            df.loc["d", "C"] = 10
+        # Setting-with-expansion with a new key "d" that is not among caegories
+        df3 = df.copy()
+        df3.loc["d", "A"] = 10
+        bidx3 = Index(list("aabbcad"), name="B")
+        expected3 = DataFrame(
+            {
+                "A": [20, 20, 2, 3, 4, 20, 10.0],
+            },
+            index=Index(bidx3),
+        )
+        tm.assert_frame_equal(df3, expected3)
+
+        df4 = df.copy()
+        df4.loc["d", "C"] = 10
+        expected3 = DataFrame(
+            {
+                "A": [20, 20, 2, 3, 4, 20, np.nan],
+                "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 10],
+            },
+            index=Index(bidx3),
+        )
+        tm.assert_frame_equal(df4, expected3)
 
         with pytest.raises(KeyError, match="^1$"):
             df.loc[1]
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index 7c9b11650f05a..58d45d1fba0b0 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -23,6 +23,7 @@
     DatetimeIndex,
     Index,
     IndexSlice,
+    IntervalIndex,
     MultiIndex,
     Period,
     Series,
@@ -1657,6 +1658,55 @@ def test_loc_setitem_with_expansion_inf_upcast_empty(self):
         expected = pd.Float64Index([0, 1, np.inf])
         tm.assert_index_equal(result, expected)
 
+    @pytest.mark.filterwarnings("ignore:indexing past lexsort depth")
+    def test_loc_setitem_with_expansion_nonunique_index(self, index, request):
+        if not len(index):
+            return
+        if isinstance(index, IntervalIndex):
+            mark = pytest.mark.xfail(reason="IntervalIndex raises")
+            request.node.add_marker(mark)
+
+        index = index.repeat(2)  # ensure non-unique
+        N = len(index)
+        arr = np.arange(N).astype(np.int64)
+
+        orig = DataFrame(arr, index=index, columns=[0])
+
+        # key that will requiring object-dtype casting in the index
+        key = "kapow"
+        assert key not in index  # otherwise test is invalid
+        # TODO: using a tuple key breaks here in many cases
+
+        exp_index = index.insert(len(index), key)
+        if isinstance(index, MultiIndex):
+            exp_index = index.insert(len(index), key)
+            assert exp_index[-1][0] == key
+        else:
+            assert exp_index[-1] == key
+        exp_data = np.arange(N + 1).astype(np.float64)
+        expected = DataFrame(exp_data, index=exp_index, columns=[0])
+
+        # Add new row, but no new columns
+        df = orig.copy()
+        df.loc[key, 0] = N
+        tm.assert_frame_equal(df, expected)
+
+        # add new row on a Series
+        ser = orig.copy()[0]
+        ser.loc[key] = N
+        # the series machinery lets us preserve int dtype instead of float
+        expected = expected[0].astype(np.int64)
+        tm.assert_series_equal(ser, expected)
+
+        # add new row and new column
+        df = orig.copy()
+        df.loc[key, 1] = N
+        expected = DataFrame(
+            {0: list(arr) + [np.nan], 1: [np.nan] * N + [float(N)]},
+            index=exp_index,
+        )
+        tm.assert_frame_equal(df, expected)
+
 
 class TestLocCallable:
     def test_frame_loc_getitem_callable(self):

From 09eb6c583c821a18dd0a29764636ff2759442db2 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 26 Feb 2021 18:26:05 -0800
Subject: [PATCH 2/4] GH ref

---
 doc/source/whatsnew/v1.3.0.rst    | 2 +-
 pandas/tests/indexing/test_loc.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index fdf7c6b5c80fe..32c36c1ffd03f 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -374,7 +374,7 @@ Indexing
 - Bug in :meth:`RangeIndex.append` where a single object of length 1 was concatenated incorrectly (:issue:`39401`)
 - Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`)
 - Bug in setting numeric values into a into a boolean-dtypes :class:`Series` using ``at`` or ``iat`` failing to cast to object-dtype (:issue:`39582`)
-- Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`???`)
+- Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`40096`)
 
 Missing
 ^^^^^^^
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index 58d45d1fba0b0..2ae7830dd0495 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -1660,6 +1660,7 @@ def test_loc_setitem_with_expansion_inf_upcast_empty(self):
 
     @pytest.mark.filterwarnings("ignore:indexing past lexsort depth")
     def test_loc_setitem_with_expansion_nonunique_index(self, index, request):
+        # GH#40096
         if not len(index):
             return
         if isinstance(index, IntervalIndex):

From 0afec6311f47c9cfd25c725264a419b1df231308 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 1 Mar 2021 20:00:15 -0800
Subject: [PATCH 3/4] comments, split test

---
 pandas/core/indexes/extension.py          | 3 +++
 pandas/core/indexing.py                   | 4 ++++
 pandas/tests/indexing/test_categorical.py | 9 ++++++++-
 pandas/tests/indexing/test_loc.py         | 1 -
 4 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py
index 514222c4ea11c..ac70200c0c404 100644
--- a/pandas/core/indexes/extension.py
+++ b/pandas/core/indexes/extension.py
@@ -377,6 +377,9 @@ def insert(self: _T, loc: int, item) -> _T:
         try:
             code = arr._validate_scalar(item)
         except (ValueError, TypeError):
+            # e.g. trying to insert an integer into a DatetimeIndex
+            #  We cannot keep the same dtype, so cast to the (often object)
+            #  minimal shared dtype before doing the insert.
             dtype, _ = infer_dtype_from(item, pandas_dtype=True)
             dtype = find_common_type([self.dtype, dtype])
             return self.astype(dtype).insert(loc, item)
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index ca9dc4efb47a3..261f152099be7 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -1641,6 +1641,10 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"):
                     # so the object is the same
                     index = self.obj._get_axis(i)
                     labels = index.insert(len(index), key)
+
+                    # We are expanding the Series/DataFrame values to match
+                    #  the length of thenew index `labels`.  GH#40096 ensure
+                    #  this is valid even if the index has duplicates.
                     taker = list(range(len(index))) + [-1]
                     reindexers = {i: (labels, taker)}
                     new_obj = self.obj._reindex_with_indexers(
diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py
index ad11a16baebc9..f104587ebbded 100644
--- a/pandas/tests/indexing/test_categorical.py
+++ b/pandas/tests/indexing/test_categorical.py
@@ -68,7 +68,12 @@ def test_loc_scalar(self):
         df2.loc["d"] = 10
         tm.assert_frame_equal(df2, expected)
 
+    def test_loc_setitem_with_expansion_non_category(self):
         # Setting-with-expansion with a new key "d" that is not among caegories
+        df = self.df
+        df.loc["a"] = 20
+
+        # Setting a new row on an existing column
         df3 = df.copy()
         df3.loc["d", "A"] = 10
         bidx3 = Index(list("aabbcad"), name="B")
@@ -80,6 +85,7 @@ def test_loc_scalar(self):
         )
         tm.assert_frame_equal(df3, expected3)
 
+        # Settig a new row _and_ new column
         df4 = df.copy()
         df4.loc["d", "C"] = 10
         expected3 = DataFrame(
@@ -91,8 +97,9 @@ def test_loc_scalar(self):
         )
         tm.assert_frame_equal(df4, expected3)
 
+    def test_loc_getitem_scalar_non_category(self):
         with pytest.raises(KeyError, match="^1$"):
-            df.loc[1]
+            self.df.loc[1]
 
     def test_slicing(self):
         cat = Series(Categorical([1, 2, 3, 4]))
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index 2ae7830dd0495..5b6c042a11332 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -1680,7 +1680,6 @@ def test_loc_setitem_with_expansion_nonunique_index(self, index, request):
 
         exp_index = index.insert(len(index), key)
         if isinstance(index, MultiIndex):
-            exp_index = index.insert(len(index), key)
             assert exp_index[-1][0] == key
         else:
             assert exp_index[-1] == key

From a280932457435f8ffd9112b888845ba1ccd56d1d Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 2 Mar 2021 09:12:55 -0800
Subject: [PATCH 4/4] avoid constructing list

---
 pandas/core/indexing.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 261f152099be7..411480a1b1201 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -1645,7 +1645,8 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"):
                     # We are expanding the Series/DataFrame values to match
                     #  the length of thenew index `labels`.  GH#40096 ensure
                     #  this is valid even if the index has duplicates.
-                    taker = list(range(len(index))) + [-1]
+                    taker = np.arange(len(index) + 1, dtype=np.intp)
+                    taker[-1] = -1
                     reindexers = {i: (labels, taker)}
                     new_obj = self.obj._reindex_with_indexers(
                         reindexers, allow_dups=True