Skip to content

Commit a302b1b

Browse files
authored
REGR: non-unique, masked dtype index raising IndexError (#57061)
* fix masked indexing regression
* fix test
* fix test
* dedup resizing logic
* add types
1 parent ab3d4bf commit a302b1b

File tree

3 files changed

+44
-32
lines changed

3 files changed

+44
-32
lines changed

doc/source/whatsnew/v2.2.1.rst

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ Fixed regressions
1717
- Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`)
1818
- Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`)
1919
- Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`)
20+
- Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`)
2021
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`)
2122
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`)
2223
- Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`)

pandas/_libs/index.pyx

+31-32
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,20 @@ cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None):
9898
return indexer.view(bool)
9999

100100

101+
# Helper introduced by this commit: make sure position ``loc`` can be written
# into ``values``, replacing the reallocation code that was repeated inline in
# the engine loops shown in the hunks further down.
# Returns ``values`` unchanged when ``loc`` is already in bounds; otherwise
# returns the array produced by ``np.resize`` (a new array), which is why every
# call site rebinds ``result`` to the return value.
cdef _maybe_resize_array(ndarray values, Py_ssize_t loc, Py_ssize_t max_length):
102+
"""
103+
Resize array if loc is out of bounds.
104+
"""
105+
cdef:
106+
# current capacity of the buffer
Py_ssize_t n = len(values)
107+
108+
if loc >= n:
109+
# keep doubling the capacity until it is strictly greater than loc
# NOTE(review): if ``values`` is empty, n stays 0 and this loop would not
# terminate -- presumably callers never pass an empty buffer; confirm.
while loc >= n:
110+
n *= 2
111+
# never grow beyond max_length
# NOTE(review): if loc >= max_length the resized array still does not
# contain loc -- presumably max_length bounds loc at the call sites; confirm.
values = np.resize(values, min(n, max_length))
112+
return values
113+
114+
101115
# Don't populate hash tables in monotonic indexes larger than this
102116
_SIZE_CUTOFF = 1_000_000
103117

@@ -456,27 +470,18 @@ cdef class IndexEngine:
456470
# found
457471
if val in d:
458472
key = val
459-
473+
result = _maybe_resize_array(
474+
result,
475+
count + len(d[key]) - 1,
476+
max_alloc
477+
)
460478
for j in d[key]:
461-
462-
# realloc if needed
463-
if count >= n_alloc:
464-
n_alloc *= 2
465-
if n_alloc > max_alloc:
466-
n_alloc = max_alloc
467-
result = np.resize(result, n_alloc)
468-
469479
result[count] = j
470480
count += 1
471481

472482
# value not found
473483
else:
474-
475-
if count >= n_alloc:
476-
n_alloc *= 2
477-
if n_alloc > max_alloc:
478-
n_alloc = max_alloc
479-
result = np.resize(result, n_alloc)
484+
result = _maybe_resize_array(result, count, max_alloc)
480485
result[count] = -1
481486
count += 1
482487
missing[count_missing] = i
@@ -1214,37 +1219,31 @@ cdef class MaskedIndexEngine(IndexEngine):
12141219

12151220
if PySequence_GetItem(target_mask, i):
12161221
if na_pos:
1222+
result = _maybe_resize_array(
1223+
result,
1224+
count + len(na_pos) - 1,
1225+
max_alloc,
1226+
)
12171227
for na_idx in na_pos:
1218-
# realloc if needed
1219-
if count >= n_alloc:
1220-
n_alloc *= 2
1221-
if n_alloc > max_alloc:
1222-
n_alloc = max_alloc
1223-
12241228
result[count] = na_idx
12251229
count += 1
12261230
continue
12271231

12281232
elif val in d:
12291233
# found
12301234
key = val
1231-
1235+
result = _maybe_resize_array(
1236+
result,
1237+
count + len(d[key]) - 1,
1238+
max_alloc,
1239+
)
12321240
for j in d[key]:
1233-
1234-
# realloc if needed
1235-
if count >= n_alloc:
1236-
n_alloc *= 2
1237-
if n_alloc > max_alloc:
1238-
n_alloc = max_alloc
1239-
12401241
result[count] = j
12411242
count += 1
12421243
continue
12431244

12441245
# value not found
1245-
if count >= n_alloc:
1246-
n_alloc += 10_000
1247-
result = np.resize(result, n_alloc)
1246+
result = _maybe_resize_array(result, count, max_alloc)
12481247
result[count] = -1
12491248
count += 1
12501249
missing[count_missing] = i

pandas/tests/indexing/test_loc.py

+12
Original file line numberDiff line numberDiff line change
@@ -3347,3 +3347,15 @@ def test_getitem_loc_str_periodindex(self):
33473347
index = pd.period_range(start="2000", periods=20, freq="B")
33483348
series = Series(range(20), index=index)
33493349
assert series.loc["2000-01-14"] == 9
3350+
3351+
# Regression test: ``.loc`` with a list key on a non-unique ``Int64`` (masked
# dtype) index whose result has 11,000 rows, i.e. more than the 10,000 rows
# named in the changelog entry for this fix.
def test_loc_nonunique_masked_index(self):
3352+
# GH 57027
3353+
ids = list(range(11))
3354+
# 11 distinct labels, each repeated 1000 times -> 11,000-entry non-unique index
index = Index(ids * 1000, dtype="Int64")
3355+
# "val" holds each row's original position, so the selection order is observable
df = DataFrame({"val": np.arange(len(index), dtype=np.intp)}, index=index)
3356+
# select every label; each label matches 1000 rows
result = df.loc[ids]
3357+
# expected rows are grouped by label, keeping original order within each label:
# a stable argsort of the index yields exactly those original positions
expected = DataFrame(
3358+
{"val": index.argsort(kind="stable").astype(np.intp)},
3359+
index=Index(np.array(ids).repeat(1000), dtype="Int64"),
3360+
)
3361+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)