diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index eb6c212731822..fa048c8707ff1 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -1063,6 +1063,7 @@ Performance Improvements
 - Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)
 - Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`)
 - Improved performance of :func:`MultiIndex.remove_unused_levels` when there are no unused levels, at the cost of a reduction in performance when there are (:issue:`19289`)
+- Improved performance of :func:`Index.get_loc` for non-unique indexes (:issue:`19478`)
 - Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`)
 - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` (:issue:`15779`)
 - Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`)
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index 6b23e487aad3a..9968d398e9040 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -183,32 +183,20 @@ cdef class IndexEngine:
 
     cdef _maybe_get_bool_indexer(self, object val):
         cdef:
-            ndarray[uint8_t] indexer
-            ndarray[object] values
-            int count = 0
-            Py_ssize_t i, n
-            int last_true
+            ndarray[cnp.uint8_t, ndim=1, cast=True] indexer
+            ndarray[cnp.intp_t, ndim=1] found  # np.where returns intp indices
+            int count
 
-        values = np.array(self._get_index_values(), copy=False)
-        n = len(values)
-
-        result = np.empty(n, dtype=bool)
-        indexer = result.view(np.uint8)
+        indexer = self._get_index_values() == val
+        found = np.where(indexer)[0]
+        count = len(found)
 
-        for i in range(n):
-            if values[i] == val:
-                count += 1
-                indexer[i] = 1
-                last_true = i
-            else:
-                indexer[i] = 0
-
-        if count == 0:
-            raise KeyError(val)
+        if count > 1:
+            return indexer
         if count == 1:
-            return last_true
+            return int(found[0])
 
-        return result
+        raise KeyError(val)
 
     def sizeof(self, deep=False):
         """ return the sizeof our mapping """
@@ -542,9 +530,6 @@ cdef class PeriodEngine(Int64Engine):
 
         return super(PeriodEngine, self).get_indexer_non_unique(ordinal_array)
 
-    cdef _get_index_values_for_bool_indexer(self):
-        return self._get_index_values().view('i8')
-
 
 cpdef convert_scalar(ndarray arr, object value):
     # we don't turn integers
diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in
index b9fc0ddd7ea1c..6f726dd49f11e 100644
--- a/pandas/_libs/index_class_helper.pxi.in
+++ b/pandas/_libs/index_class_helper.pxi.in
@@ -55,40 +55,29 @@ cdef class {{name}}Engine(IndexEngine):
 
     cdef _maybe_get_bool_indexer(self, object val):
         cdef:
-            ndarray[uint8_t, cast=True] indexer
+            ndarray[cnp.uint8_t, ndim=1, cast=True] indexer
+            ndarray[cnp.intp_t, ndim=1] found  # np.where returns intp indices
             ndarray[{{ctype}}] values
             int count = 0
-            Py_ssize_t i, n
-            int last_true
 
         {{if name != 'Float64'}}
         if not util.is_integer_object(val):
             raise KeyError(val)
         {{endif}}
 
-        values = self._get_index_values_for_bool_indexer()
-        n = len(values)
+        # A view is needed for some subclasses, such as PeriodEngine:
+        values = self._get_index_values().view('{{dtype}}')
+        indexer = values == val
+        found = np.where(indexer)[0]
+        count = len(found)
 
-        result = np.empty(n, dtype=bool)
-        indexer = result.view(np.uint8)
-
-        for i in range(n):
-            if values[i] == val:
-                count += 1
-                indexer[i] = 1
-                last_true = i
-            else:
-                indexer[i] = 0
-
-        if count == 0:
-            raise KeyError(val)
+        if count > 1:
+            return indexer
         if count == 1:
-            return last_true
+            return int(found[0])
 
-        return result
+        raise KeyError(val)
 
-    cdef _get_index_values_for_bool_indexer(self):
-        return self._get_index_values()
 {{endif}}
 
 {{endfor}}