Skip to content

REGR: non-unique, masked dtype index raising IndexError #57061

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Fixed regressions
- Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`)
- Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`)
- Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`)
- Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where the result has more than 10,000 rows (:issue:`57027`)
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`)
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`)
- Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`)
Expand Down
63 changes: 31 additions & 32 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,20 @@ cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None):
return indexer.view(bool)


cdef _maybe_resize_array(ndarray values, Py_ssize_t loc, Py_ssize_t max_length):
    """
    Grow ``values`` so that position ``loc`` is in bounds, if necessary.

    Parameters
    ----------
    values : ndarray
        Array the caller is about to write into.
    loc : Py_ssize_t
        Highest position the caller intends to write.
    max_length : Py_ssize_t
        Upper bound on the length of a resized array.

    Returns
    -------
    ndarray
        ``values`` unchanged when ``loc`` is already in bounds; otherwise the
        result of ``np.resize`` with the current length doubled until it
        exceeds ``loc``, capped at ``max_length``.
    """
    cdef:
        Py_ssize_t n = len(values)

    if loc >= n:
        # Doubling a length of zero never makes progress, so seed the growth
        # with one element to guarantee the loop below terminates.
        if n == 0:
            n = 1
        while loc >= n:
            n *= 2
        # Never allocate more than the caller-supplied maximum.
        values = np.resize(values, min(n, max_length))
    return values


# Don't populate hash tables in monotonic indexes larger than this
# NOTE(review): presumably a count of index elements; the code that reads
# this constant is outside this excerpt -- confirm against its usage.
_SIZE_CUTOFF = 1_000_000

Expand Down Expand Up @@ -456,27 +470,18 @@ cdef class IndexEngine:
# found
if val in d:
key = val

result = _maybe_resize_array(
result,
count + len(d[key]) - 1,
max_alloc
)
for j in d[key]:

# realloc if needed
if count >= n_alloc:
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc
result = np.resize(result, n_alloc)

result[count] = j
count += 1

# value not found
else:

if count >= n_alloc:
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc
result = np.resize(result, n_alloc)
result = _maybe_resize_array(result, count, max_alloc)
result[count] = -1
count += 1
missing[count_missing] = i
Expand Down Expand Up @@ -1214,37 +1219,31 @@ cdef class MaskedIndexEngine(IndexEngine):

if PySequence_GetItem(target_mask, i):
if na_pos:
result = _maybe_resize_array(
result,
count + len(na_pos) - 1,
max_alloc,
)
for na_idx in na_pos:
# realloc if needed
if count >= n_alloc:
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc

result[count] = na_idx
count += 1
continue

elif val in d:
# found
key = val

result = _maybe_resize_array(
result,
count + len(d[key]) - 1,
max_alloc,
)
for j in d[key]:

# realloc if needed
if count >= n_alloc:
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc

result[count] = j
count += 1
continue

# value not found
if count >= n_alloc:
n_alloc += 10_000
result = np.resize(result, n_alloc)
result = _maybe_resize_array(result, count, max_alloc)
result[count] = -1
count += 1
missing[count_missing] = i
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3347,3 +3347,15 @@ def test_getitem_loc_str_periodindex(self):
index = pd.period_range(start="2000", periods=20, freq="B")
series = Series(range(20), index=index)
assert series.loc["2000-01-14"] == 9

def test_loc_nonunique_masked_index(self):
# GH 57027
ids = list(range(11))
index = Index(ids * 1000, dtype="Int64")
df = DataFrame({"val": np.arange(len(index), dtype=np.intp)}, index=index)
result = df.loc[ids]
expected = DataFrame(
{"val": index.argsort(kind="stable").astype(np.intp)},
index=Index(np.array(ids).repeat(1000), dtype="Int64"),
)
tm.assert_frame_equal(result, expected)