Skip to content

BUG: df.loc setitem-with-expansion with duplicate index #40096

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Mar 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ Indexing
- Bug in :meth:`RangeIndex.append` where a single object of length 1 was concatenated incorrectly (:issue:`39401`)
- Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`)
- Bug in setting numeric values into a into a boolean-dtypes :class:`Series` using ``at`` or ``iat`` failing to cast to object-dtype (:issue:`39582`)
-
- Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`40096`)

Missing
^^^^^^^
Expand Down
11 changes: 4 additions & 7 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -614,13 +614,10 @@ def delete(self: _T, loc) -> _T:

@doc(NDArrayBackedExtensionIndex.insert)
def insert(self, loc: int, item):
try:
result = super().insert(loc, item)
except (ValueError, TypeError):
# i.e. self._data._validate_scalar raised
return self.astype(object).insert(loc, item)

result._data._freq = self._get_insert_freq(loc, item)
result = super().insert(loc, item)
if isinstance(result, type(self)):
# i.e. parent class method did not cast
result._data._freq = self._get_insert_freq(loc, item)
return result

# --------------------------------------------------------------------
Expand Down
22 changes: 17 additions & 5 deletions pandas/core/indexes/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
doc,
)

from pandas.core.dtypes.cast import (
find_common_type,
infer_dtype_from,
)
from pandas.core.dtypes.common import (
is_dtype_equal,
is_object_dtype,
Expand Down Expand Up @@ -370,11 +374,19 @@ def insert(self: _T, loc: int, item) -> _T:
ValueError if the item is not valid for this dtype.
"""
arr = self._data
code = arr._validate_scalar(item)

new_vals = np.concatenate((arr._ndarray[:loc], [code], arr._ndarray[loc:]))
new_arr = arr._from_backing_data(new_vals)
return type(self)._simple_new(new_arr, name=self.name)
try:
code = arr._validate_scalar(item)
except (ValueError, TypeError):
# e.g. trying to insert an integer into a DatetimeIndex
# We cannot keep the same dtype, so cast to the (often object)
# minimal shared dtype before doing the insert.
dtype, _ = infer_dtype_from(item, pandas_dtype=True)
dtype = find_common_type([self.dtype, dtype])
return self.astype(dtype).insert(loc, item)
else:
new_vals = np.concatenate((arr._ndarray[:loc], [code], arr._ndarray[loc:]))
new_arr = arr._from_backing_data(new_vals)
return type(self)._simple_new(new_arr, name=self.name)

def putmask(self, mask, value):
res_values = self._data.copy()
Expand Down
7 changes: 1 addition & 6 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -3719,12 +3719,7 @@ def insert(self, loc: int, item) -> MultiIndex:
# must insert at end otherwise you have to recompute all the
# other codes
lev_loc = len(level)
try:
level = level.insert(lev_loc, k)
except TypeError:
# TODO: Should this be done inside insert?
# TODO: smarter casting rules?
level = level.astype(object).insert(lev_loc, k)
level = level.insert(lev_loc, k)
else:
lev_loc = level.get_loc(k)

Expand Down
12 changes: 11 additions & 1 deletion pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1641,7 +1641,17 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"):
# so the object is the same
index = self.obj._get_axis(i)
labels = index.insert(len(index), key)
self.obj._mgr = self.obj.reindex(labels, axis=i)._mgr

# We are expanding the Series/DataFrame values to match
# the length of thenew index `labels`. GH#40096 ensure
# this is valid even if the index has duplicates.
taker = np.arange(len(index) + 1, dtype=np.intp)
taker[-1] = -1
reindexers = {i: (labels, taker)}
new_obj = self.obj._reindex_with_indexers(
reindexers, allow_dups=True
)
self.obj._mgr = new_obj._mgr
self.obj._maybe_update_cacher(clear=True)
self.obj._is_copy = None

Expand Down
14 changes: 7 additions & 7 deletions pandas/tests/indexes/categorical/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,10 @@ def test_insert(self):
expected = CategoricalIndex(["a"], categories=categories)
tm.assert_index_equal(result, expected, exact=True)

# invalid
msg = "'fill_value=d' is not present in this Categorical's categories"
with pytest.raises(TypeError, match=msg):
ci.insert(0, "d")
# invalid -> cast to object
expected = ci.astype(object).insert(0, "d")
result = ci.insert(0, "d")
tm.assert_index_equal(result, expected, exact=True)

# GH 18295 (test missing)
expected = CategoricalIndex(["a", np.nan, "a", "b", "c", "b"])
Expand All @@ -63,9 +63,9 @@ def test_insert(self):

def test_insert_na_mismatched_dtype(self):
ci = CategoricalIndex([0, 1, 1])
msg = "'fill_value=NaT' is not present in this Categorical's categories"
with pytest.raises(TypeError, match=msg):
ci.insert(0, pd.NaT)
result = ci.insert(0, pd.NaT)
expected = Index([pd.NaT, 0, 1, 1], dtype=object)
tm.assert_index_equal(result, expected)

def test_delete(self):

Expand Down
52 changes: 40 additions & 12 deletions pandas/tests/indexing/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,20 +37,24 @@ def setup_method(self, method):
)

def test_loc_scalar(self):
dtype = CDT(list("cab"))
result = self.df.loc["a"]
expected = DataFrame(
{"A": [0, 1, 5], "B": (Series(list("aaa")).astype(CDT(list("cab"))))}
).set_index("B")
bidx = Series(list("aaa"), name="B").astype(dtype)
assert bidx.dtype == dtype

expected = DataFrame({"A": [0, 1, 5]}, index=Index(bidx))
tm.assert_frame_equal(result, expected)

df = self.df.copy()
df.loc["a"] = 20
bidx2 = Series(list("aabbca"), name="B").astype(dtype)
assert bidx2.dtype == dtype
expected = DataFrame(
{
"A": [20, 20, 2, 3, 4, 20],
"B": (Series(list("aabbca")).astype(CDT(list("cab")))),
}
).set_index("B")
},
index=Index(bidx2),
)
tm.assert_frame_equal(df, expected)

# value not in the categories
Expand All @@ -64,14 +68,38 @@ def test_loc_scalar(self):
df2.loc["d"] = 10
tm.assert_frame_equal(df2, expected)

msg = "'fill_value=d' is not present in this Categorical's categories"
with pytest.raises(TypeError, match=msg):
df.loc["d", "A"] = 10
with pytest.raises(TypeError, match=msg):
df.loc["d", "C"] = 10
def test_loc_setitem_with_expansion_non_category(self):
# Setting-with-expansion with a new key "d" that is not among caegories
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

might be worth splitting this test (e.g. errors to a new tests) and even these cases

df = self.df
df.loc["a"] = 20

# Setting a new row on an existing column
df3 = df.copy()
df3.loc["d", "A"] = 10
bidx3 = Index(list("aabbcad"), name="B")
expected3 = DataFrame(
{
"A": [20, 20, 2, 3, 4, 20, 10.0],
},
index=Index(bidx3),
)
tm.assert_frame_equal(df3, expected3)

# Settig a new row _and_ new column
df4 = df.copy()
df4.loc["d", "C"] = 10
expected3 = DataFrame(
{
"A": [20, 20, 2, 3, 4, 20, np.nan],
"C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 10],
},
index=Index(bidx3),
)
tm.assert_frame_equal(df4, expected3)

def test_loc_getitem_scalar_non_category(self):
with pytest.raises(KeyError, match="^1$"):
df.loc[1]
self.df.loc[1]

def test_slicing(self):
cat = Series(Categorical([1, 2, 3, 4]))
Expand Down
50 changes: 50 additions & 0 deletions pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
DatetimeIndex,
Index,
IndexSlice,
IntervalIndex,
MultiIndex,
Period,
Series,
Expand Down Expand Up @@ -1657,6 +1658,55 @@ def test_loc_setitem_with_expansion_inf_upcast_empty(self):
expected = pd.Float64Index([0, 1, np.inf])
tm.assert_index_equal(result, expected)

@pytest.mark.filterwarnings("ignore:indexing past lexsort depth")
def test_loc_setitem_with_expansion_nonunique_index(self, index, request):
# GH#40096
if not len(index):
return
if isinstance(index, IntervalIndex):
mark = pytest.mark.xfail(reason="IntervalIndex raises")
request.node.add_marker(mark)

index = index.repeat(2) # ensure non-unique
N = len(index)
arr = np.arange(N).astype(np.int64)

orig = DataFrame(arr, index=index, columns=[0])

# key that will requiring object-dtype casting in the index
key = "kapow"
assert key not in index # otherwise test is invalid
# TODO: using a tuple key breaks here in many cases

exp_index = index.insert(len(index), key)
if isinstance(index, MultiIndex):
assert exp_index[-1][0] == key
else:
assert exp_index[-1] == key
exp_data = np.arange(N + 1).astype(np.float64)
expected = DataFrame(exp_data, index=exp_index, columns=[0])

# Add new row, but no new columns
df = orig.copy()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same might be worth splitting this up a bit

df.loc[key, 0] = N
tm.assert_frame_equal(df, expected)

# add new row on a Series
ser = orig.copy()[0]
ser.loc[key] = N
# the series machinery lets us preserve int dtype instead of float
expected = expected[0].astype(np.int64)
tm.assert_series_equal(ser, expected)

# add new row and new column
df = orig.copy()
df.loc[key, 1] = N
expected = DataFrame(
{0: list(arr) + [np.nan], 1: [np.nan] * N + [float(N)]},
index=exp_index,
)
tm.assert_frame_equal(df, expected)


class TestLocCallable:
def test_frame_loc_getitem_callable(self):
Expand Down