Skip to content

Commit cf36911

Browse files
committed
PERF: improve get_loc on unsorted, non-unique indexes
closes pandas-dev#19478
1 parent d5a7e7c commit cf36911

File tree

3 files changed

+22
-47
lines changed

3 files changed

+22
-47
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -465,6 +465,7 @@ Performance Improvements
465465
- :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`)
466466
- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)
467467
- Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`)
468+
- Improved performance of :func:`Index.get_loc` for unsorted, non-unique indexes (:issue:`19478`)
468469
- Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`)
469470

470471
.. _whatsnew_0230.docs:

pandas/_libs/index.pyx

+10-25
Original file line numberDiff line numberDiff line change
@@ -183,32 +183,20 @@ cdef class IndexEngine:
183183

184184
cdef _maybe_get_bool_indexer(self, object val):
185185
cdef:
186-
ndarray[uint8_t] indexer
187-
ndarray[object] values
188-
int count = 0
189-
Py_ssize_t i, n
190-
int last_true
186+
ndarray[cnp.uint8_t, ndim=1, cast=True] indexer
187+
ndarray[int64_t, ndim=1] found
188+
int count
191189

192-
values = np.array(self._get_index_values(), copy=False)
193-
n = len(values)
194-
195-
result = np.empty(n, dtype=bool)
196-
indexer = result.view(np.uint8)
190+
indexer = self._get_index_values() == val
191+
found = np.where(indexer)[0]
192+
count = len(found)
197193

198-
for i in range(n):
199-
if values[i] == val:
200-
count += 1
201-
indexer[i] = 1
202-
last_true = i
203-
else:
204-
indexer[i] = 0
205-
206-
if count == 0:
207-
raise KeyError(val)
194+
if count > 1:
195+
return indexer
208196
if count == 1:
209-
return last_true
197+
return found[0]
210198

211-
return result
199+
raise KeyError(val)
212200

213201
def sizeof(self, deep=False):
214202
""" return the sizeof our mapping """
@@ -542,9 +530,6 @@ cdef class PeriodEngine(Int64Engine):
542530

543531
return super(PeriodEngine, self).get_indexer_non_unique(ordinal_array)
544532

545-
cdef _get_index_values_for_bool_indexer(self):
546-
return self._get_index_values().view('i8')
547-
548533

549534
cpdef convert_scalar(ndarray arr, object value):
550535
# we don't turn integers

pandas/_libs/index_class_helper.pxi.in

+11-22
Original file line numberDiff line numberDiff line change
@@ -55,40 +55,29 @@ cdef class {{name}}Engine(IndexEngine):
5555

5656
cdef _maybe_get_bool_indexer(self, object val):
5757
cdef:
58-
ndarray[uint8_t, cast=True] indexer
58+
ndarray[cnp.uint8_t, ndim=1, cast=True] indexer
59+
ndarray[int64_t, ndim=1] found
5960
ndarray[{{ctype}}] values
6061
int count = 0
61-
Py_ssize_t i, n
62-
int last_true
6362

6463
{{if name != 'Float64'}}
6564
if not util.is_integer_object(val):
6665
raise KeyError(val)
6766
{{endif}}
6867

69-
values = self._get_index_values_for_bool_indexer()
70-
n = len(values)
68+
# A view is needed for some subclasses, such as PeriodEngine:
69+
values = self._get_index_values().view('{{dtype}}')
70+
indexer = values == val
71+
found = np.where(indexer)[0]
72+
count = len(found)
7173

72-
result = np.empty(n, dtype=bool)
73-
indexer = result.view(np.uint8)
74-
75-
for i in range(n):
76-
if values[i] == val:
77-
count += 1
78-
indexer[i] = 1
79-
last_true = i
80-
else:
81-
indexer[i] = 0
82-
83-
if count == 0:
84-
raise KeyError(val)
74+
if count > 1:
75+
return indexer
8576
if count == 1:
86-
return last_true
77+
return found[0]
8778

88-
return result
79+
raise KeyError(val)
8980

90-
cdef _get_index_values_for_bool_indexer(self):
91-
return self._get_index_values()
9281
{{endif}}
9382

9483
{{endfor}}

0 commit comments

Comments
 (0)