PERF: fast non-unique indexing #15468

@@ -44,6 +44,190 @@ PyDateTime_IMPORT

cdef extern from "Python.h":
    int PySlice_Check(object)

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.initializedcheck(False)
cdef _count(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
            int64_t[:] mapping_count, int64_t[:] missing_count):
    """
    Compute the number of times a `targets` value is in `values`.

    We also want an indication for the case in which a `targets` value is not
    found in `values`.

    Parameters
    ----------
    values : Sequence of comparable values.
    targets : Sequence of comparable values.
    idx0 : np.argsort on `values`.
    idx1 : np.argsort on `targets`.
    mapping_count : Number of `targets` elements found in `values`.
    missing_count : Number of `targets` elements not found in `values`.

    Examples
    --------
    Let

    .. code-block:: python

        >>> values = np.array([5, 4, 5, 3, 3, 5, 1])
        >>> targets = np.array([3, 5, 1, 2, 1])

    be two integer arrays. The first element of `targets` occurs in `values`
    in two positions and the second one in three positions. The fourth one,
    on the other hand, is not found in `values`; it still gets a count of 1,
    reserving one slot for the missing marker. The result is
    `mapping_count = np.array([2, 3, 1, 1, 1])` and
    `missing_count = np.array([0, 0, 0, 1, 0])`.
    """

    cdef:
        int64_t n_v = values.shape[0]
        int64_t n_t = targets.shape[0]
        int64_t i = 0
        int64_t j = 0

    while i < n_v and j < n_t:

        val0 = values[idx0[i]]
        val1 = targets[idx1[j]]

        if val0 == val1:

            while i < n_v and values[idx0[i]] == val1:
                i += 1
                mapping_count[idx1[j]] += 1

            j += 1
            while j < n_t and val0 == targets[idx1[j]]:
                mapping_count[idx1[j]] = mapping_count[idx1[j-1]]
                j += 1

        elif val0 > val1:

            mapping_count[idx1[j]] += 1
            missing_count[idx1[j]] = 1
            j += 1

        else:
            i += 1

    while j < n_t:
        mapping_count[idx1[j]] += 1
        missing_count[idx1[j]] = 1
        j += 1
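
For intuition, here is a rough pure-NumPy rendition of what `_count` produces for the docstring example. It is a reference sketch only (the helper name `py_count` is invented for illustration), not the code added by the patch:

    import numpy as np

    def py_count(values, targets):
        # Reference behaviour: count how often each target occurs in `values`.
        # A missing target still gets a count of 1 (it reserves the slot that
        # later holds the -1 marker) and is flagged in `missing_count`.
        mapping_count = np.zeros(len(targets), dtype=np.int64)
        missing_count = np.zeros(len(targets), dtype=np.int64)
        for j, t in enumerate(targets):
            hits = int((values == t).sum())
            mapping_count[j] = hits if hits else 1
            missing_count[j] = 0 if hits else 1
        return mapping_count, missing_count

    vals = np.array([5, 4, 5, 3, 3, 5, 1])
    tgts = np.array([3, 5, 1, 2, 1])
    print(py_count(vals, tgts))
    # -> (array([2, 3, 1, 1, 1]), array([0, 0, 0, 1, 0]))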

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.initializedcheck(False)
cdef _map(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
          int64_t[:] start_mapping, int64_t[:] start_missing,
          int64_t[:] mapping, int64_t[:] missing):

Reviewer: this signature is WAY too complicated.

""" | ||
Map `targets` values to `values` positions. | ||
|
||
We also want to know the indices of `targets` whose values are not found in | ||
`targets`. | ||
|
||
Parameters | ||
---------- | ||
values : Sequence of comparable values. | ||
targets : Sequence of comparable values. | ||
idx0 : np.argsort on `values`. | ||
idx1 : np.argsort on `targets`. | ||
start_mapping : Index of the starting block of `mapping`. | ||
start_missing : Index of the starting block of `missing`. | ||
mapping : `values` indices of `targets` values found in `values`. | ||
missing : `targets` indices of `targets` values not found in `values`. | ||
|
||
Examples | ||
-------- | ||
Let | ||
|
||
.. code-block:: python | ||
|
||
>>> values = np.array([5, 4, 5, 3, 3, 5, 1]) | ||
>>> targets = np.array([3, 5, 1, 2, 1]) | ||
|
||
be two integer arrays. The first block of `mapping` corresponds to the | ||
first element of `targets` and will have size `2`. The second block of | ||
`mapping` corresponds to the second element of `targets` and will have size | ||
`3`. The resulting map will be | ||
`mapping = np.array([3, 4, 0, 2, 5, 6, -1, 6])`. Moreover, the resulting | ||
`missing` will be `np.array([3])`. | ||
""" | ||

    cdef:
        int64_t n_v = values.shape[0]
        int64_t n_t = targets.shape[0]
        int64_t i = 0
        int64_t j = 0
        int64_t c

    while i < n_v and j < n_t:

        val0 = values[idx0[i]]
        val1 = targets[idx1[j]]

        if val0 == val1:

            c = 0
            while i < n_v and values[idx0[i]] == val1:
                mapping[start_mapping[idx1[j]] + c] = idx0[i]
                i += 1
                c += 1

            j += 1
            while j < n_t and val0 == targets[idx1[j]]:
                for ii in range(c):
                    mapping[start_mapping[idx1[j]] + ii] = \
                        mapping[start_mapping[idx1[j-1]] + ii]
                j += 1

        elif val0 > val1:

            mapping[start_mapping[idx1[j]]] = -1
            missing[start_missing[idx1[j]]] = idx1[j]
            j += 1

        else:
            i += 1

    while j < n_t:

        mapping[start_mapping[idx1[j]]] = -1
        missing[start_missing[idx1[j]]] = idx1[j]
        j += 1
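
To make the `start_mapping` and `start_missing` parameters concrete: they are exclusive prefix sums of the counts produced by `_count`, which is what the driver below builds with `np.cumsum` plus a shift. A small NumPy illustration for the docstring example (variable names are illustrative only):

    import numpy as np

    mapping_count = np.array([2, 3, 1, 1, 1], dtype=np.int64)  # from _count
    start_mapping = np.zeros_like(mapping_count)
    start_mapping[1:] = np.cumsum(mapping_count)[:-1]          # exclusive prefix sum
    print(start_mapping)  # [0 2 5 6 7], block starts inside the 8-slot `mapping`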

def _map_targets_to_values(values, targets, idx0, idx1):
    """
    Map `targets` values to `values` positions.

    Please refer to the function `_map` for a complete description.
    """

Reviewer: this looks like a take, but really not sure what you are doing.

    n_t = len(targets)

    mapping_count = np.zeros(n_t, np.int64)
    missing_count = np.zeros(n_t, np.int64)

    if n_t == 0:
        return np.empty(0, np.int64), np.empty(0, np.int64)

    _count(values, targets, idx0, idx1, mapping_count, missing_count)

    np.cumsum(mapping_count, out=mapping_count)
    np.cumsum(missing_count, out=missing_count)

    mapping = np.empty(mapping_count[-1], np.int64)
    missing = np.empty(missing_count[-1], np.int64)

    mapping_count[1:] = mapping_count[:-1]
    mapping_count[0] = 0
    missing_count -= 1

    _map(values, targets, idx0, idx1, mapping_count, missing_count, mapping,
         missing)

    return mapping, missing
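
As a sanity check, the pair returned by `_map_targets_to_values` for the docstring example matches what the existing hash-based path already produces through the public API, which is the behaviour the sorted fast path has to reproduce. A sketch using only public pandas calls:

    import numpy as np
    import pandas as pd

    values = np.array([5, 4, 5, 3, 3, 5, 1])
    targets = np.array([3, 5, 1, 2, 1])

    # For each target, the positions of its matches in `values` (-1 if absent),
    # plus the positions in `targets` that were not found at all.
    indexer, missing = pd.Index(values).get_indexer_non_unique(targets)
    print(np.asarray(indexer))  # [ 3  4  0  2  5  6 -1  6]
    print(np.asarray(missing))  # [3]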

cdef inline is_definitely_invalid_key(object val):
    if PyTuple_Check(val):

@@ -371,6 +555,16 @@ cdef class IndexEngine:

        return result[0:count], missing[0:count_missing]

    def get_indexer_non_unique_orderable(self, ndarray targets,
                                         int64_t[:] idx0,
                                         int64_t[:] idx1):

Reviewer: this is a massive amount of added code, what exactly are you doing?

Author: Notice that I have removed the code regarding lists. Most of the code consists of two functions:

Reviewer: you can add 1 function in cython, anything else is WAY too complicated.

        cdef:
            ndarray values

        self._ensure_mapping_populated()
        values = np.array(self._get_index_values(), copy=False)
        return _map_targets_to_values(values, targets, idx0, idx1)

cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
    cdef:

@@ -2530,10 +2530,23 @@ def get_indexer_non_unique(self, target):

        if self.is_all_dates:
            self = Index(self.asi8)
            tgt_values = target.asi8
            src_values = self.asi8
        else:
            tgt_values = target._values
            src_values = self._values

Reviewer: this is quite messy, what are you trying to do here?

Author: Agreed! Just factorised that.

Reviewer: you are adding much more code. Where exactly does this routine fail? The only scenario I could see is with actual mixed types (which I am happy to raise on when you have a non-unique index). Further, it looks like you are duplicating lots of functionality that already exists; see pandas/core/algorithms.py w.r.t. counting / sorting / factorizing.

Author: I didn't find routines in pandas/core/algorithms.py that I could use. I have now described the functions in index.pyx, so I hope it is now clearer. Yes, I was having problems with mixed types. For example, with

    v = np.array([1, 'danilo'], object)

the comparison v[0] < v[1] raises a `TypeError`.
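
For concreteness, a minimal reproduction of the failure mode discussed above, assuming Python 3, where comparisons across unrelated types raise:

    import numpy as np

    v = np.array([1, 'danilo'], dtype=object)
    try:
        np.argsort(v, kind='mergesort')  # the same call _order_it attempts
    except TypeError as exc:
        print('not orderable:', exc)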

        is_mono0 = self.is_monotonic_increasing
        is_mono1 = target.is_monotonic_increasing
        (orderable, idx0, idx1) = _order_them(is_mono0, src_values, is_mono1,
                                              tgt_values)

Reviewer: if you insist on ordering, then simply order an unordered index, get the results and take the original. this is so much complexity.

        e = self._engine
        if orderable:
            indexer, missing = e.get_indexer_non_unique_orderable(tgt_values,
                                                                  idx0, idx1)
        else:
            indexer, missing = e.get_indexer_non_unique(tgt_values)

        return Index(indexer), missing

    def get_indexer_for(self, target, **kwargs):

@@ -3875,3 +3888,43 @@ def _trim_front(strings):

def _validate_join_method(method):
    if method not in ['left', 'right', 'inner', 'outer']:
        raise ValueError('do not recognize join method %s' % method)


def _order_it(is_mono, x):
    """Tries to sort a sequence."""
    if is_mono:
        return (True, np.arange(len(x), dtype=np.int64))
    try:
        indices = np.argsort(x, kind='mergesort')
    except TypeError:
        return (False, None)
    return (True, indices)

Reviewer: do ALL of this in the cython function. you are needlessly splitting things up.


def _order_them(x_is_mono, x, y_is_mono, y):
    """
    Tries to sort `x` and `y`, checking whether they are jointly orderable.
    """
    (xorderable, xindices) = _order_it(x_is_mono, x)
    (yorderable, yindices) = _order_it(y_is_mono, y)
    ok = xorderable and yorderable and _are_orderable(x, y)
    if ok:
        xindices = np.asarray(xindices, dtype=np.int64)
        yindices = np.asarray(yindices, dtype=np.int64)
    return (ok, xindices, yindices)
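
When both sides are sortable, the permutations handed to the Cython helpers are simply stable argsorts (or `np.arange` when a side is already monotonic). An illustrative check with plain NumPy for the docstring example:

    import numpy as np

    values = np.array([5, 4, 5, 3, 3, 5, 1])
    targets = np.array([3, 5, 1, 2, 1])

    idx0 = np.argsort(values, kind='mergesort')   # [6 3 4 1 0 2 5]
    idx1 = np.argsort(targets, kind='mergesort')  # [2 4 3 0 1]
    print(values[idx0])   # [1 3 3 4 5 5 5]
    print(targets[idx1])  # [1 1 2 3 5]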

def _are_orderable(x, y):
    """Are `x` and `y` jointly orderable?

    We assume that `x` and `y` are indeed independently orderable. In this
    case, they are also jointly orderable if their first elements are
    orderable.
    """
    if len(x) > 0 and len(y) > 0:
        try:
            x[0] < y[0]
            x[0] > y[0]
        except TypeError:
            return False
    return True

Reviewer: this is exactly value_counts.
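
For reference, the per-target counts that `_count` computes can indeed be recovered from the existing value-counting machinery, as the reviewer suggests. A rough sketch of that point (the `map`/`fillna` steps are illustrative, not the patch's code, and a missing target shows up as 0 here instead of reserving a slot):

    import numpy as np
    import pandas as pd

    values = np.array([5, 4, 5, 3, 3, 5, 1])
    targets = np.array([3, 5, 1, 2, 1])

    counts = pd.Series(values).value_counts()           # {5: 3, 3: 2, 4: 1, 1: 1}
    per_target = pd.Series(targets).map(counts)         # NaN where a target is missing
    print(per_target.fillna(0).astype(int).tolist())    # [2, 3, 1, 0, 1]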