Skip to content

Commit 5d82d8b

Browse files
authored
PERF: IndexEngine.get_indexer_non_unique (#55816)
* resize array by factor of 2
* whatsnew
1 parent e31a686 commit 5d82d8b

File tree

2 files changed

+17
-8
lines changed

2 files changed

+17
-8
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -326,6 +326,7 @@ Performance improvements
326326
- Performance improvement in :meth:`Index.difference` (:issue:`55108`)
327327
- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`)
328328
- Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`)
329+
- Performance improvement when indexing into a non-unique index (:issue:`55816`)
329330
- Performance improvement when indexing with more than 4 keys (:issue:`54550`)
330331
- Performance improvement when localizing time to UTC (:issue:`55241`)
331332

pandas/_libs/index.pyx

+16 -8
Original file line number | Diff line number | Diff line change
@@ -354,7 +354,7 @@ cdef class IndexEngine:
354354
dict d = {}
355355
object val
356356
Py_ssize_t count = 0, count_missing = 0
357-
Py_ssize_t i, j, n, n_t, n_alloc, start, end
357+
Py_ssize_t i, j, n, n_t, n_alloc, max_alloc, start, end
358358
bint check_na_values = False
359359

360360
values = self.values
@@ -364,6 +364,7 @@ cdef class IndexEngine:
364364

365365
n = len(values)
366366
n_t = len(targets)
367+
max_alloc = n * n_t
367368
if n > 10_000:
368369
n_alloc = 10_000
369370
else:
@@ -453,7 +454,9 @@ cdef class IndexEngine:
453454

454455
# realloc if needed
455456
if count >= n_alloc:
456-
n_alloc += 10_000
457+
n_alloc *= 2
458+
if n_alloc > max_alloc:
459+
n_alloc = max_alloc
457460
result = np.resize(result, n_alloc)
458461

459462
result[count] = j
@@ -463,7 +466,9 @@ cdef class IndexEngine:
463466
else:
464467

465468
if count >= n_alloc:
466-
n_alloc += 10_000
469+
n_alloc *= 2
470+
if n_alloc > max_alloc:
471+
n_alloc = max_alloc
467472
result = np.resize(result, n_alloc)
468473
result[count] = -1
469474
count += 1
@@ -1211,7 +1216,7 @@ cdef class MaskedIndexEngine(IndexEngine):
12111216
dict d = {}
12121217
object val
12131218
Py_ssize_t count = 0, count_missing = 0
1214-
Py_ssize_t i, j, n, n_t, n_alloc, start, end, na_idx
1219+
Py_ssize_t i, j, n, n_t, n_alloc, max_alloc, start, end, na_idx
12151220

12161221
target_vals = self._get_data(targets)
12171222
target_mask = self._get_mask(targets)
@@ -1224,6 +1229,7 @@ cdef class MaskedIndexEngine(IndexEngine):
12241229

12251230
n = len(values)
12261231
n_t = len(target_vals)
1232+
max_alloc = n * n_t
12271233
if n > 10_000:
12281234
n_alloc = 10_000
12291235
else:
@@ -1274,8 +1280,9 @@ cdef class MaskedIndexEngine(IndexEngine):
12741280
for na_idx in na_pos:
12751281
# realloc if needed
12761282
if count >= n_alloc:
1277-
n_alloc += 10_000
1278-
result = np.resize(result, n_alloc)
1283+
n_alloc *= 2
1284+
if n_alloc > max_alloc:
1285+
n_alloc = max_alloc
12791286

12801287
result[count] = na_idx
12811288
count += 1
@@ -1289,8 +1296,9 @@ cdef class MaskedIndexEngine(IndexEngine):
12891296

12901297
# realloc if needed
12911298
if count >= n_alloc:
1292-
n_alloc += 10_000
1293-
result = np.resize(result, n_alloc)
1299+
n_alloc *= 2
1300+
if n_alloc > max_alloc:
1301+
n_alloc = max_alloc
12941302

12951303
result[count] = j
12961304
count += 1

0 commit comments

Comments
 (0)