Skip to content

PERF: improve get_loc on unsorted, non-unique indexes #19539

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 8, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1063,6 +1063,7 @@ Performance Improvements
- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)
- Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`)
- Improved performance of :func:`MultiIndex.remove_unused_levels` when there are no unused levels, at the cost of a reduction in performance when there are (:issue:`19289`)
- Improved performance of :func:`Index.get_loc` for non-unique indexes (:issue:`19478`)
- Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`)
- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` (:issue:`15779`)
- Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`)
Expand Down
35 changes: 10 additions & 25 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -183,32 +183,20 @@ cdef class IndexEngine:

cdef _maybe_get_bool_indexer(self, object val):
cdef:
ndarray[uint8_t] indexer
ndarray[object] values
int count = 0
Py_ssize_t i, n
int last_true
ndarray[cnp.uint8_t, ndim=1, cast=True] indexer
ndarray[int64_t, ndim=1] found
int count

values = np.array(self._get_index_values(), copy=False)
n = len(values)

result = np.empty(n, dtype=bool)
indexer = result.view(np.uint8)
indexer = self._get_index_values() == val
found = np.where(indexer)[0]
count = len(found)

for i in range(n):
if values[i] == val:
count += 1
indexer[i] = 1
last_true = i
else:
indexer[i] = 0

if count == 0:
raise KeyError(val)
if count > 1:
return indexer
if count == 1:
return last_true
return int(found[0])

return result
raise KeyError(val)

def sizeof(self, deep=False):
""" return the sizeof our mapping """
Expand Down Expand Up @@ -542,9 +530,6 @@ cdef class PeriodEngine(Int64Engine):

return super(PeriodEngine, self).get_indexer_non_unique(ordinal_array)

cdef _get_index_values_for_bool_indexer(self):
return self._get_index_values().view('i8')


cpdef convert_scalar(ndarray arr, object value):
# we don't turn integers
Expand Down
33 changes: 11 additions & 22 deletions pandas/_libs/index_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -55,40 +55,29 @@ cdef class {{name}}Engine(IndexEngine):

cdef _maybe_get_bool_indexer(self, object val):
cdef:
ndarray[uint8_t, cast=True] indexer
ndarray[cnp.uint8_t, ndim=1, cast=True] indexer
ndarray[int64_t, ndim=1] found
ndarray[{{ctype}}] values
int count = 0
Py_ssize_t i, n
int last_true

{{if name != 'Float64'}}
if not util.is_integer_object(val):
raise KeyError(val)
{{endif}}

values = self._get_index_values_for_bool_indexer()
n = len(values)
# A view is needed for some subclasses, such as PeriodEngine:
values = self._get_index_values().view('{{dtype}}')
indexer = values == val
found = np.where(indexer)[0]
count = len(found)

result = np.empty(n, dtype=bool)
indexer = result.view(np.uint8)

for i in range(n):
if values[i] == val:
count += 1
indexer[i] = 1
last_true = i
else:
indexer[i] = 0

if count == 0:
raise KeyError(val)
if count > 1:
return indexer
if count == 1:
return last_true
return int(found[0])

return result
raise KeyError(val)

cdef _get_index_values_for_bool_indexer(self):
return self._get_index_values()
{{endif}}

{{endfor}}