Skip to content

Commit 13a97c2

Browse files
authored
BUG: df.loc setitem-with-expansion with duplicate index (#40096)
1 parent 4f18821 commit 13a97c2

File tree

8 files changed

+131
-39
lines changed

8 files changed

+131
-39
lines changed

doc/source/whatsnew/v1.3.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -442,7 +442,7 @@ Indexing
442442
- Bug in :meth:`RangeIndex.append` where a single object of length 1 was concatenated incorrectly (:issue:`39401`)
443443
- Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`)
444444
- Bug in setting numeric values into a boolean-dtype :class:`Series` using ``at`` or ``iat`` failing to cast to object-dtype (:issue:`39582`)
445-
-
445+
- Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`40096`)
446446

447447
Missing
448448
^^^^^^^

pandas/core/indexes/datetimelike.py

+4-7
Original file line numberDiff line numberDiff line change
@@ -614,13 +614,10 @@ def delete(self: _T, loc) -> _T:
614614

615615
@doc(NDArrayBackedExtensionIndex.insert)
616616
def insert(self, loc: int, item):
617-
try:
618-
result = super().insert(loc, item)
619-
except (ValueError, TypeError):
620-
# i.e. self._data._validate_scalar raised
621-
return self.astype(object).insert(loc, item)
622-
623-
result._data._freq = self._get_insert_freq(loc, item)
617+
result = super().insert(loc, item)
618+
if isinstance(result, type(self)):
619+
# i.e. parent class method did not cast
620+
result._data._freq = self._get_insert_freq(loc, item)
624621
return result
625622

626623
# --------------------------------------------------------------------

pandas/core/indexes/extension.py

+17-5
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@
1616
doc,
1717
)
1818

19+
from pandas.core.dtypes.cast import (
20+
find_common_type,
21+
infer_dtype_from,
22+
)
1923
from pandas.core.dtypes.common import (
2024
is_dtype_equal,
2125
is_object_dtype,
@@ -370,11 +374,19 @@ def insert(self: _T, loc: int, item) -> _T:
370374
ValueError if the item is not valid for this dtype.
371375
"""
372376
arr = self._data
373-
code = arr._validate_scalar(item)
374-
375-
new_vals = np.concatenate((arr._ndarray[:loc], [code], arr._ndarray[loc:]))
376-
new_arr = arr._from_backing_data(new_vals)
377-
return type(self)._simple_new(new_arr, name=self.name)
377+
try:
378+
code = arr._validate_scalar(item)
379+
except (ValueError, TypeError):
380+
# e.g. trying to insert an integer into a DatetimeIndex
381+
# We cannot keep the same dtype, so cast to the (often object)
382+
# minimal shared dtype before doing the insert.
383+
dtype, _ = infer_dtype_from(item, pandas_dtype=True)
384+
dtype = find_common_type([self.dtype, dtype])
385+
return self.astype(dtype).insert(loc, item)
386+
else:
387+
new_vals = np.concatenate((arr._ndarray[:loc], [code], arr._ndarray[loc:]))
388+
new_arr = arr._from_backing_data(new_vals)
389+
return type(self)._simple_new(new_arr, name=self.name)
378390

379391
def putmask(self, mask, value):
380392
res_values = self._data.copy()

pandas/core/indexes/multi.py

+1-6
Original file line numberDiff line numberDiff line change
@@ -3719,12 +3719,7 @@ def insert(self, loc: int, item) -> MultiIndex:
37193719
# must insert at end otherwise you have to recompute all the
37203720
# other codes
37213721
lev_loc = len(level)
3722-
try:
3723-
level = level.insert(lev_loc, k)
3724-
except TypeError:
3725-
# TODO: Should this be done inside insert?
3726-
# TODO: smarter casting rules?
3727-
level = level.astype(object).insert(lev_loc, k)
3722+
level = level.insert(lev_loc, k)
37283723
else:
37293724
lev_loc = level.get_loc(k)
37303725

pandas/core/indexing.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -1641,7 +1641,17 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"):
16411641
# so the object is the same
16421642
index = self.obj._get_axis(i)
16431643
labels = index.insert(len(index), key)
1644-
self.obj._mgr = self.obj.reindex(labels, axis=i)._mgr
1644+
1645+
# We are expanding the Series/DataFrame values to match
1646+
# the length of the new index `labels`. GH#40096 ensure
1647+
# this is valid even if the index has duplicates.
1648+
taker = np.arange(len(index) + 1, dtype=np.intp)
1649+
taker[-1] = -1
1650+
reindexers = {i: (labels, taker)}
1651+
new_obj = self.obj._reindex_with_indexers(
1652+
reindexers, allow_dups=True
1653+
)
1654+
self.obj._mgr = new_obj._mgr
16451655
self.obj._maybe_update_cacher(clear=True)
16461656
self.obj._is_copy = None
16471657

pandas/tests/indexes/categorical/test_category.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,10 @@ def test_insert(self):
5050
expected = CategoricalIndex(["a"], categories=categories)
5151
tm.assert_index_equal(result, expected, exact=True)
5252

53-
# invalid
54-
msg = "'fill_value=d' is not present in this Categorical's categories"
55-
with pytest.raises(TypeError, match=msg):
56-
ci.insert(0, "d")
53+
# invalid -> cast to object
54+
expected = ci.astype(object).insert(0, "d")
55+
result = ci.insert(0, "d")
56+
tm.assert_index_equal(result, expected, exact=True)
5757

5858
# GH 18295 (test missing)
5959
expected = CategoricalIndex(["a", np.nan, "a", "b", "c", "b"])
@@ -63,9 +63,9 @@ def test_insert(self):
6363

6464
def test_insert_na_mismatched_dtype(self):
6565
ci = CategoricalIndex([0, 1, 1])
66-
msg = "'fill_value=NaT' is not present in this Categorical's categories"
67-
with pytest.raises(TypeError, match=msg):
68-
ci.insert(0, pd.NaT)
66+
result = ci.insert(0, pd.NaT)
67+
expected = Index([pd.NaT, 0, 1, 1], dtype=object)
68+
tm.assert_index_equal(result, expected)
6969

7070
def test_delete(self):
7171

pandas/tests/indexing/test_categorical.py

+40-12
Original file line numberDiff line numberDiff line change
@@ -37,20 +37,24 @@ def setup_method(self, method):
3737
)
3838

3939
def test_loc_scalar(self):
40+
dtype = CDT(list("cab"))
4041
result = self.df.loc["a"]
41-
expected = DataFrame(
42-
{"A": [0, 1, 5], "B": (Series(list("aaa")).astype(CDT(list("cab"))))}
43-
).set_index("B")
42+
bidx = Series(list("aaa"), name="B").astype(dtype)
43+
assert bidx.dtype == dtype
44+
45+
expected = DataFrame({"A": [0, 1, 5]}, index=Index(bidx))
4446
tm.assert_frame_equal(result, expected)
4547

4648
df = self.df.copy()
4749
df.loc["a"] = 20
50+
bidx2 = Series(list("aabbca"), name="B").astype(dtype)
51+
assert bidx2.dtype == dtype
4852
expected = DataFrame(
4953
{
5054
"A": [20, 20, 2, 3, 4, 20],
51-
"B": (Series(list("aabbca")).astype(CDT(list("cab")))),
52-
}
53-
).set_index("B")
55+
},
56+
index=Index(bidx2),
57+
)
5458
tm.assert_frame_equal(df, expected)
5559

5660
# value not in the categories
@@ -64,14 +68,38 @@ def test_loc_scalar(self):
6468
df2.loc["d"] = 10
6569
tm.assert_frame_equal(df2, expected)
6670

67-
msg = "'fill_value=d' is not present in this Categorical's categories"
68-
with pytest.raises(TypeError, match=msg):
69-
df.loc["d", "A"] = 10
70-
with pytest.raises(TypeError, match=msg):
71-
df.loc["d", "C"] = 10
71+
def test_loc_setitem_with_expansion_non_category(self):
72+
# Setting-with-expansion with a new key "d" that is not among categories
73+
df = self.df
74+
df.loc["a"] = 20
75+
76+
# Setting a new row on an existing column
77+
df3 = df.copy()
78+
df3.loc["d", "A"] = 10
79+
bidx3 = Index(list("aabbcad"), name="B")
80+
expected3 = DataFrame(
81+
{
82+
"A": [20, 20, 2, 3, 4, 20, 10.0],
83+
},
84+
index=Index(bidx3),
85+
)
86+
tm.assert_frame_equal(df3, expected3)
87+
88+
# Setting a new row _and_ new column
89+
df4 = df.copy()
90+
df4.loc["d", "C"] = 10
91+
expected3 = DataFrame(
92+
{
93+
"A": [20, 20, 2, 3, 4, 20, np.nan],
94+
"C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 10],
95+
},
96+
index=Index(bidx3),
97+
)
98+
tm.assert_frame_equal(df4, expected3)
7299

100+
def test_loc_getitem_scalar_non_category(self):
73101
with pytest.raises(KeyError, match="^1$"):
74-
df.loc[1]
102+
self.df.loc[1]
75103

76104
def test_slicing(self):
77105
cat = Series(Categorical([1, 2, 3, 4]))

pandas/tests/indexing/test_loc.py

+50
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
DatetimeIndex,
2424
Index,
2525
IndexSlice,
26+
IntervalIndex,
2627
MultiIndex,
2728
Period,
2829
Series,
@@ -1657,6 +1658,55 @@ def test_loc_setitem_with_expansion_inf_upcast_empty(self):
16571658
expected = pd.Float64Index([0, 1, np.inf])
16581659
tm.assert_index_equal(result, expected)
16591660

1661+
@pytest.mark.filterwarnings("ignore:indexing past lexsort depth")
1662+
def test_loc_setitem_with_expansion_nonunique_index(self, index, request):
1663+
# GH#40096
1664+
if not len(index):
1665+
return
1666+
if isinstance(index, IntervalIndex):
1667+
mark = pytest.mark.xfail(reason="IntervalIndex raises")
1668+
request.node.add_marker(mark)
1669+
1670+
index = index.repeat(2) # ensure non-unique
1671+
N = len(index)
1672+
arr = np.arange(N).astype(np.int64)
1673+
1674+
orig = DataFrame(arr, index=index, columns=[0])
1675+
1676+
# key that will require object-dtype casting in the index
1677+
key = "kapow"
1678+
assert key not in index # otherwise test is invalid
1679+
# TODO: using a tuple key breaks here in many cases
1680+
1681+
exp_index = index.insert(len(index), key)
1682+
if isinstance(index, MultiIndex):
1683+
assert exp_index[-1][0] == key
1684+
else:
1685+
assert exp_index[-1] == key
1686+
exp_data = np.arange(N + 1).astype(np.float64)
1687+
expected = DataFrame(exp_data, index=exp_index, columns=[0])
1688+
1689+
# Add new row, but no new columns
1690+
df = orig.copy()
1691+
df.loc[key, 0] = N
1692+
tm.assert_frame_equal(df, expected)
1693+
1694+
# add new row on a Series
1695+
ser = orig.copy()[0]
1696+
ser.loc[key] = N
1697+
# the series machinery lets us preserve int dtype instead of float
1698+
expected = expected[0].astype(np.int64)
1699+
tm.assert_series_equal(ser, expected)
1700+
1701+
# add new row and new column
1702+
df = orig.copy()
1703+
df.loc[key, 1] = N
1704+
expected = DataFrame(
1705+
{0: list(arr) + [np.nan], 1: [np.nan] * N + [float(N)]},
1706+
index=exp_index,
1707+
)
1708+
tm.assert_frame_equal(df, expected)
1709+
16601710

16611711
class TestLocCallable:
16621712
def test_frame_loc_getitem_callable(self):

0 commit comments

Comments
 (0)