diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index ff5fadec735e6..386e8cbc51f6f 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) +- Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 2ea0e51236534..a277272040a69 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -98,6 +98,20 @@ cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None): return indexer.view(bool) +cdef _maybe_resize_array(ndarray values, Py_ssize_t loc, Py_ssize_t max_length): + """ + Resize array if loc is out of bounds. + """ + cdef: + Py_ssize_t n = len(values) + + if loc >= n: + while loc >= n: + n *= 2 + values = np.resize(values, min(n, max_length)) + return values + + # Don't populate hash tables in monotonic indexes larger than this _SIZE_CUTOFF = 1_000_000 @@ -456,27 +470,18 @@ cdef class IndexEngine: # found if val in d: key = val - + result = _maybe_resize_array( + result, + count + len(d[key]) - 1, + max_alloc + ) for j in d[key]: - - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result = np.resize(result, n_alloc) - result[count] = j count += 1 # value not found else: - - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result = np.resize(result, n_alloc) + result = _maybe_resize_array(result, count, max_alloc) result[count] = -1 count += 1 missing[count_missing] = i @@ -1214,13 +1219,12 @@ cdef class MaskedIndexEngine(IndexEngine): if PySequence_GetItem(target_mask, i): if na_pos: + result = _maybe_resize_array( + result, + count + len(na_pos) - 1, + max_alloc, + ) for na_idx in na_pos: - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result[count] = na_idx count += 1 continue @@ -1228,23 +1232,18 @@ cdef class MaskedIndexEngine(IndexEngine): elif val in d: # found key = val - + result = _maybe_resize_array( + result, + count + len(d[key]) - 1, + max_alloc, + ) for j in d[key]: - - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result[count] = j count += 1 continue # value not found - if count >= n_alloc: - n_alloc += 10_000 - result = np.resize(result, n_alloc) + result = _maybe_resize_array(result, count, max_alloc) result[count] = -1 count += 1 missing[count_missing] = i diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index c446f2c44b745..b155c3aabd287 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -3347,3 +3347,15 @@ def test_getitem_loc_str_periodindex(self): index = pd.period_range(start="2000", periods=20, freq="B") series = Series(range(20), index=index) assert series.loc["2000-01-14"] == 9 + + def test_loc_nonunique_masked_index(self): + # GH 57027 + ids = list(range(11)) + index = Index(ids * 1000, dtype="Int64") + df = DataFrame({"val": np.arange(len(index), dtype=np.intp)}, index=index) + result = df.loc[ids] + expected = DataFrame( + {"val": index.argsort(kind="stable").astype(np.intp)}, + index=Index(np.array(ids).repeat(1000), dtype="Int64"), + ) + tm.assert_frame_equal(result, expected)