PERF: fast non-unique indexing #15468
@@ -44,6 +44,117 @@ PyDateTime_IMPORT
cdef extern from "Python.h":
    int PySlice_Check(object)


@cython.boundscheck(False)
@cython.wraparound(False)
@cython.initializedcheck(False)
cdef _count(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
            int64_t[:] mapping_count, int64_t[:] missing_count):

    cdef:
        int64_t n_v = values.shape[0]
        int64_t n_t = targets.shape[0]
        int64_t i = 0
        int64_t j = 0

    while i < n_v and j < n_t:

        val0 = values[idx0[i]]
        val1 = targets[idx1[j]]

        if val0 == val1:

            while i < n_v and values[idx0[i]] == val1:
                i += 1
                mapping_count[idx1[j]] += 1

            j += 1
            while j < n_t and val0 == targets[idx1[j]]:
                mapping_count[idx1[j]] = mapping_count[idx1[j-1]]
                j += 1

        elif val0 > val1:

            mapping_count[idx1[j]] += 1
            missing_count[idx1[j]] = 1
            j += 1

        else:
            i += 1

    while j < n_t:
        mapping_count[idx1[j]] += 1
        missing_count[idx1[j]] = 1
        j += 1

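To make the counting pass concrete, here is a rough sketch of what `_count` is meant to produce for some made-up toy data (the arrays and expected values below are illustrative, not taken from the PR): for each target position, `mapping_count` holds the number of matching positions in `values` (or 1 when the target is absent, so a slot is still reserved for the `-1` marker), and `missing_count` flags absent targets.

```python
import numpy as np

# Hypothetical toy data: a non-unique "index" and the labels to locate in it.
values = np.array([5, 2, 5, 1])    # non-unique index values
targets = np.array([5, 3, 2])      # labels being looked up

# Per-target counts equivalent to what the counting pass fills in:
mapping_count = np.array([max(np.count_nonzero(values == t), 1) for t in targets])
missing_count = np.array([int(t not in values) for t in targets])

print(mapping_count)   # [2 1 1]  -> 5 matches twice, 3 is absent, 2 matches once
print(missing_count)   # [0 1 0]  -> only target 3 is missing
```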
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.initializedcheck(False)
cdef _map(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
          int64_t[:] start_mapping, int64_t[:] start_missing,
          int64_t[:] mapping, int64_t[:] missing):

    cdef:
        int64_t n_v = values.shape[0]
        int64_t n_t = targets.shape[0]
        int64_t i = 0
        int64_t j = 0
        int64_t c

    while i < n_v and j < n_t:

        val0 = values[idx0[i]]
        val1 = targets[idx1[j]]

        if val0 == val1:

            c = 0
            while i < n_v and values[idx0[i]] == val1:
                mapping[start_mapping[idx1[j]] + c] = idx0[i]
                i += 1
                c += 1

            j += 1
            while j < n_t and val0 == targets[idx1[j]]:
                for ii in range(c):
                    mapping[start_mapping[idx1[j]] + ii] = \
                        mapping[start_mapping[idx1[j-1]] + ii]
                j += 1

        elif val0 > val1:

            mapping[start_mapping[idx1[j]]] = -1
            missing[start_missing[idx1[j]]] = idx1[j]
            j += 1

        else:
            i += 1

    while j < n_t:

        mapping[start_mapping[idx1[j]]] = -1
        missing[start_missing[idx1[j]]] = idx1[j]
        j += 1

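`_map` writes each target's matches into the flat `mapping` array at per-target start offsets, which the helper below (`_map_targets_to_values`) derives from the counts via a cumulative sum. Continuing the toy data above, the offset arithmetic works out as follows (again a sketch, not PR code):

```python
import numpy as np

# Counts from the toy example above, turned into start offsets for `mapping`.
mapping_count = np.array([2, 1, 1], dtype=np.int64)
np.cumsum(mapping_count, out=mapping_count)   # [2, 3, 4]; last entry is the total size
mapping_count[1:] = mapping_count[:-1]        # shift right ...
mapping_count[0] = 0                          # ... giving start offsets [0, 2, 3]

# Target 0 writes mapping[0:2], target 1 writes mapping[2:3], target 2 writes mapping[3:4].
```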
def _map_targets_to_values(values, targets, idx0, idx1):
    mapping_count = np.zeros(len(targets), np.int64)
    missing_count = np.zeros(len(targets), np.int64)

    _count(values, targets, idx0, idx1, mapping_count, missing_count)

    np.cumsum(mapping_count, out=mapping_count)
    np.cumsum(missing_count, out=missing_count)

    mapping = np.empty(mapping_count[-1], np.int64)
    missing = np.empty(missing_count[-1], np.int64)

    mapping_count[1:] = mapping_count[:-1]
    mapping_count[0] = 0
    missing_count -= 1

    _map(values, targets, idx0, idx1, mapping_count, missing_count, mapping,
         missing)

    return mapping, missing


cdef inline is_definitely_invalid_key(object val):
    if PyTuple_Check(val):
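Putting the two passes together: the intended result of `_map_targets_to_values` can be checked against a straightforward (and much slower) pure-Python reference. The function below is a sketch written for this review only; its name is made up and it is not part of the PR:

```python
import numpy as np

def map_targets_to_values_ref(values, targets):
    """Slow pure-Python reference for the count + map pipeline (illustration only)."""
    mapping, missing = [], []
    for t_pos, t in enumerate(targets):
        hits = np.flatnonzero(values == t)   # positions of t in `values`
        if len(hits):
            mapping.extend(hits)             # every matching position, in order
        else:
            mapping.append(-1)               # one -1 slot reserved for a miss
            missing.append(t_pos)            # remember which target was absent
    return (np.asarray(mapping, dtype=np.int64),
            np.asarray(missing, dtype=np.int64))

values = np.array([5, 2, 5, 1])
targets = np.array([5, 3, 2])
print(map_targets_to_values_ref(values, targets))
# (array([ 0,  2, -1,  1]), array([1]))
```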
@@ -371,6 +482,16 @@ cdef class IndexEngine:

        return result[0:count], missing[0:count_missing]

    def get_indexer_non_unique_orderable(self, ndarray targets,
                                         int64_t[:] idx0,
                                         int64_t[:] idx1):

Review comment: this is a massive amount of added code, what exactly are you doing?

Author reply: Notice that I have removed the code regarding lists. Most of the code consists of two functions:

        cdef:
            ndarray values

Review comment: you can add 1 function in cython, anything else is WAY too complicated.

        self._ensure_mapping_populated()
        values = np.array(self._get_index_values(), copy=False)
        return _map_targets_to_values(values, targets, idx0, idx1)


cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
    cdef:

@@ -2530,10 +2530,40 @@ def get_indexer_non_unique(self, target):
        if self.is_all_dates:
            self = Index(self.asi8)
            tgt_values = target.asi8
            src_values = self.asi8
        else:
            tgt_values = target._values
            src_values = self._values

Review comment: this is quite messy, what are you trying to do here?

Author reply: Agreed! Just factorised that.

Review comment: you are adding much more code. Where exactly does this routine fail? The only scenario I could see is with actual mixed types (which I am happy to raise on when you have a non-unique index). Further, it looks like you are duplicating lots of functionality that already exists; see pandas/core/algorithms.py w.r.t. counting / sorting / factorizing.

Author reply: I didn't find routines in pandas/core/algorithms.py that I could use. I have now described the functions in index.pyx, so I hope it is now clearer. Yes, I was having problems with mixed types. For example,

    v = np.array([1, 'danilo'], object)
    v[0] < v[1]

raises a TypeError.
        try:
            src_values[0] < tgt_values[0]
            src_values[0] > tgt_values[0]
        except (TypeError, IndexError):
            orderable = False
        else:
            try:
                if self.is_monotonic_increasing:
                    idx0 = np.arange(len(src_values), dtype=np.int64)
                else:
                    idx0 = np.argsort(src_values, kind='mergesort')
                    idx0 = np.asarray(idx0, dtype=np.int64)

                if target.is_monotonic_increasing:
                    idx1 = np.arange(len(tgt_values), dtype=np.int64)
                else:
                    idx1 = np.argsort(tgt_values, kind='mergesort')
                    idx1 = np.asarray(idx1, dtype=np.int64)

            except TypeError:
                orderable = False
            else:
                orderable = True

        if orderable:
            indexer, missing = self._engine.get_indexer_non_unique_orderable(tgt_values, idx0, idx1)
        else:
            indexer, missing = self._engine.get_indexer_non_unique(tgt_values)

        return Index(indexer), missing

    def get_indexer_for(self, target, **kwargs):

Review comment: this signature is WAY too complicated.
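For reference, the user-facing behaviour being optimized here is `Index.get_indexer_non_unique`. Below is a minimal usage sketch with the same toy data; the expected values are approximate and the exact container types returned have varied across pandas versions (the code above wraps the indexer in an `Index`).

```python
import pandas as pd

idx = pd.Index([5, 2, 5, 1])                                   # non-unique index
indexer, missing = idx.get_indexer_non_unique(pd.Index([5, 3, 2]))

# Every matching position per target, with -1 for the absent label 3,
# plus the positions of the targets that were not found.
print(indexer)   # roughly: [0, 2, -1, 1]
print(missing)   # roughly: [1]
```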