PERF: fast non-unique indexing #15468

Closed · wants to merge 4 commits
194 changes: 194 additions & 0 deletions pandas/index.pyx
@@ -44,6 +44,190 @@ PyDateTime_IMPORT
cdef extern from "Python.h":
int PySlice_Check(object)

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.initializedcheck(False)
cdef _count(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
int64_t[:] mapping_count, int64_t[:] missing_count):
"""
Compute, for each `targets` value, the number of times it occurs in `values`.

Contributor:
this is exactly value_counts

We also want an indication for the case in which a `targets` value is not
found in `values`.

Parameters
----------
values : Sequence of comparable values.
targets : Sequence of comparable values.
idx0 : np.argsort on `values`.
idx1 : np.argsort on `targets`.
mapping_count : Output; for each `targets` element, the number of matching positions in `values`.
missing_count : Output; for each `targets` element, 1 if it is not found in `values`, 0 otherwise.

Examples
--------
Let

.. code-block:: python

>>> values = np.array([5, 4, 5, 3, 3, 5, 1])
>>> targets = np.array([3, 5, 1, 2, 1])

be two integer arrays. The first element of `targets` occurs in `values` at
two positions, and the second one at three positions. On the other hand, the
fourth one is not found in `values` at all; it still gets a count of 1,
reserving the slot that `_map` later fills with -1. The resulting counts are
`mapping_count = np.array([2, 3, 1, 1, 1])`. Moreover, the resulting
`missing_count` will be `np.array([0, 0, 0, 1, 0])`.
"""

cdef:
int64_t n_v = values.shape[0]
int64_t n_t = targets.shape[0]
int64_t i = 0
int64_t j = 0

while i < n_v and j < n_t:

val0 = values[idx0[i]]
val1 = targets[idx1[j]]

if val0 == val1:

while i < n_v and values[idx0[i]] == val1:
i += 1
mapping_count[idx1[j]] += 1

j += 1
while j < n_t and val0 == targets[idx1[j]]:
mapping_count[idx1[j]] = mapping_count[idx1[j-1]]
j += 1

elif val0 > val1:

mapping_count[idx1[j]] += 1
missing_count[idx1[j]] = 1
j += 1

else:
i += 1

while j < n_t:
mapping_count[idx1[j]] += 1
missing_count[idx1[j]] = 1
j += 1
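
For reference, here is a rough pure-NumPy sketch (illustrative only, not part of the patch) of the counts `_count` produces for the docstring example above; it scans `values` once per target instead of doing the single sorted merge the Cython code performs:

import numpy as np

values = np.array([5, 4, 5, 3, 3, 5, 1])
targets = np.array([3, 5, 1, 2, 1])

mapping_count = np.zeros(len(targets), np.int64)
missing_count = np.zeros(len(targets), np.int64)
for j, t in enumerate(targets):
    hits = int((values == t).sum())
    # a missing target still reserves one slot for the -1 placeholder
    mapping_count[j] = hits if hits else 1
    missing_count[j] = 0 if hits else 1

# mapping_count -> array([2, 3, 1, 1, 1])
# missing_count -> array([0, 0, 0, 1, 0])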

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.initializedcheck(False)
cdef _map(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
int64_t[:] start_mapping, int64_t[:] start_missing,
int64_t[:] mapping, int64_t[:] missing):
Contributor:
this signature is WAY too complicated.

"""
Map `targets` values to `values` positions.

We also want to know the indices of `targets` whose values are not found in
`values`.

Parameters
----------
values : Sequence of comparable values.
targets : Sequence of comparable values.
idx0 : np.argsort on `values`.
idx1 : np.argsort on `targets`.
start_mapping : For each `targets` element, the starting offset of its block in `mapping`.
start_missing : For each `targets` element, the starting offset of its slot in `missing`.
mapping : `values` indices of `targets` values found in `values`.
missing : `targets` indices of `targets` values not found in `values`.

Examples
--------
Let

.. code-block:: python

>>> values = np.array([5, 4, 5, 3, 3, 5, 1])
>>> targets = np.array([3, 5, 1, 2, 1])

be two integer arrays. The first block of `mapping` corresponds to the
first element of `targets` and will have size `2`. The second block of
`mapping` corresponds to the second element of `targets` and will have size
`3`. The resulting map will be
`mapping = np.array([3, 4, 0, 2, 5, 6, -1, 6])`. Moreover, the resulting
`missing` will be `np.array([3])`.
"""

cdef:
int64_t n_v = values.shape[0]
int64_t n_t = targets.shape[0]
int64_t i = 0
int64_t j = 0
int64_t c

while i < n_v and j < n_t:

val0 = values[idx0[i]]
val1 = targets[idx1[j]]

if val0 == val1:

c = 0
while i < n_v and values[idx0[i]] == val1:
mapping[start_mapping[idx1[j]] + c] = idx0[i]
i += 1
c += 1

j += 1
while j < n_t and val0 == targets[idx1[j]]:
for ii in range(c):
mapping[start_mapping[idx1[j]] + ii] = \
mapping[start_mapping[idx1[j-1]] + ii]
j += 1

elif val0 > val1:

mapping[start_mapping[idx1[j]]] = -1
missing[start_missing[idx1[j]]] = idx1[j]
j += 1

else:
i += 1

while j < n_t:

mapping[start_mapping[idx1[j]]] = -1
missing[start_missing[idx1[j]]] = idx1[j]
j += 1
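
Similarly, a naive sketch (illustrative only, not part of the patch) of the block layout `_map` builds for the same example; the real code fills the blocks during one sorted merge rather than scanning `values` per target:

import numpy as np

values = np.array([5, 4, 5, 3, 3, 5, 1])
targets = np.array([3, 5, 1, 2, 1])

blocks, missing = [], []
for j, t in enumerate(targets):
    hits = np.flatnonzero(values == t)
    if hits.size:
        blocks.append(hits)            # positions of t inside `values`
    else:
        blocks.append(np.array([-1]))  # placeholder block for a missing target
        missing.append(j)

mapping = np.concatenate(blocks)
# mapping -> array([3, 4, 0, 2, 5, 6, -1, 6]); missing -> [3]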

def _map_targets_to_values(values, targets, idx0, idx1):
"""
Map `targets` values to `values` positions.

Contributor:
this looks like a take, but really not sure what you are doing

Please refer to the functions `_count` and `_map` for a complete description.
"""
n_t = len(targets)

mapping_count = np.zeros(n_t, np.int64)
missing_count = np.zeros(n_t, np.int64)

if n_t == 0:
return np.empty(0, np.int64), np.empty(0, np.int64)

_count(values, targets, idx0, idx1, mapping_count, missing_count)

np.cumsum(mapping_count, out=mapping_count)
np.cumsum(missing_count, out=missing_count)

mapping = np.empty(mapping_count[-1], np.int64)
missing = np.empty(missing_count[-1], np.int64)

mapping_count[1:] = mapping_count[:-1]
mapping_count[0] = 0
missing_count -= 1

_map(values, targets, idx0, idx1, mapping_count, missing_count, mapping,
missing)

return mapping, missing
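
The cumulative sums above turn the per-target counts into block offsets; a small worked sketch (illustrative only), using the counts from the `_count` example:

import numpy as np

mapping_count = np.array([2, 3, 1, 1, 1], np.int64)  # per-target counts
np.cumsum(mapping_count, out=mapping_count)          # -> [2, 5, 6, 7, 8]
total = mapping_count[-1]                            # 8 slots allocated for `mapping`
mapping_count[1:] = mapping_count[:-1]               # shift right ...
mapping_count[0] = 0                                 # ... -> [0, 2, 5, 6, 7], the block starts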

cdef inline is_definitely_invalid_key(object val):
if PyTuple_Check(val):
@@ -371,6 +555,16 @@ cdef class IndexEngine:

return result[0:count], missing[0:count_missing]

def get_indexer_non_unique_orderable(self, ndarray targets,
int64_t[:] idx0,
int64_t[:] idx1):
Contributor:
this is a massive amount of added code, what exactly are you doing?

Contributor (author):
Notice that I have removed the code regarding lists. Most of the code consists of two functions: _count and _map. They implement two very specific algorithms that I couldn't find in the above-mentioned file.


cdef:
ndarray values

Contributor:
you can add 1 function in cython, anything else is WAY too complicated.

self._ensure_mapping_populated()
values = np.array(self._get_index_values(), copy=False)
return _map_targets_to_values(values, targets, idx0, idx1)
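
Since the method simply delegates to the module-level `_map_targets_to_values` above, its expected inputs can be illustrated with the docstring example (a sketch, not part of the patch; in the real path the argsort indices are computed by `Index.get_indexer_non_unique` in pandas/indexes/base.py):

import numpy as np

values = np.array([5, 4, 5, 3, 3, 5, 1])
targets = np.array([3, 5, 1, 2, 1])
idx0 = np.argsort(values, kind='mergesort').astype(np.int64)
idx1 = np.argsort(targets, kind='mergesort').astype(np.int64)

mapping, missing = _map_targets_to_values(values, targets, idx0, idx1)
# mapping -> array([3, 4, 0, 2, 5, 6, -1, 6]); missing -> array([3])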

cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
cdef:
55 changes: 54 additions & 1 deletion pandas/indexes/base.py
@@ -2530,10 +2530,23 @@ def get_indexer_non_unique(self, target):
if self.is_all_dates:
self = Index(self.asi8)
tgt_values = target.asi8
src_values = self.asi8
else:
tgt_values = target._values
src_values = self._values

Contributor:
this is quite messy, what are you trying to do here?

Contributor (author):
Agreed! Just factorised that.

Contributor:
you are adding much more code. Where exactly does this routine fail? The only scenario I could see is with actual mixed types (which I am happy to raise on when you have a non-unique index).

Further it looks like you are duplicating lots of functionality that already exists, see pandas/core/algorithms.py w.r.t. counting / sorting / factorizing.

Contributor (author):
I didn't find routines in pandas/core/algorithms.py that I could use. I have now described the functions in index.pyx, so I hope it is clearer now.

Yes, I was having problems with mixed types. For example,

v = np.array([1, 'danilo'], object)
v[0] < v[1]

raises a TypeError exception.

is_mono0 = self.is_monotonic_increasing
is_mono1 = target.is_monotonic_increasing
(orderable, idx0, idx1) = _order_them(is_mono0, src_values, is_mono1,
Contributor:
if you insist on ordering, then simply order an unordered index, get the results and take the original. this is so much complexity.

tgt_values)

e = self._engine
if orderable:
indexer, missing = e.get_indexer_non_unique_orderable(tgt_values,
idx0, idx1)
else:
indexer, missing = e.get_indexer_non_unique(tgt_values)

indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
return Index(indexer), missing
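
For context, a sketch of the behaviour this fast path has to reproduce (illustrative; the expected outputs are inferred from the examples in index.pyx, and in this pandas version the indexer comes back wrapped in an Index per the return statement above):

import numpy as np
import pandas as pd

idx = pd.Index([5, 4, 5, 3, 3, 5, 1])
indexer, missing = idx.get_indexer_non_unique([3, 5, 1, 2, 1])
# every matching position is returned per target, -1 marks absent targets:
# expected: indexer -> Index([3, 4, 0, 2, 5, 6, -1, 6]), missing -> array([3])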

def get_indexer_for(self, target, **kwargs):
@@ -3875,3 +3888,43 @@ def _trim_front(strings):
def _validate_join_method(method):
if method not in ['left', 'right', 'inner', 'outer']:
raise ValueError('do not recognize join method %s' % method)


def _order_it(is_mono, x):
"""Tries to sort a sequence."""
if is_mono:
return (True, np.arange(len(x), dtype=np.int64))
try:
indices = np.argsort(x, kind='mergesort')
except TypeError:
return (False, None)
return (True, indices)
Contributor:
do ALL of this in the cython function. you are needlessly splitting things up.



def _order_them(x_is_mono, x, y_is_mono, y):
"""
Tries to sort `x` and `y`, checking whether they are jointly orderable.
"""
(xorderable, xindices) = _order_it(x_is_mono, x)
(yorderable, yindices) = _order_it(y_is_mono, y)
ok = xorderable and yorderable and _are_orderable(x, y)
if ok:
xindices = np.asarray(xindices, dtype=np.int64)
yindices = np.asarray(yindices, dtype=np.int64)
return (ok, xindices, yindices)


def _are_orderable(x, y):
"""Are `x` and `y` jointly orderable?

We assume that `x` and `y` are indeed independently orderable. In this
case, they are also jointly orderable if their first elements can be
compared with each other.
"""
if len(x) > 0 and len(y) > 0:
try:
x[0] < y[0]
x[0] > y[0]
except TypeError:
return False
return True
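
A quick usage sketch of these helpers (illustrative only): homogeneous arrays come back with their stable argsort indices, while a mixed object array makes np.argsort raise TypeError inside `_order_it`, so the caller falls back to the hash-based path:

import numpy as np

values = np.array([5, 4, 5, 3, 3, 5, 1])
targets = np.array([3, 5, 1, 2, 1])
ok, idx0, idx1 = _order_them(False, values, False, targets)
# ok -> True; idx0 and idx1 are mergesort argsort indices of values and targets

mixed = np.array([1, 'danilo'], dtype=object)
ok, _, _ = _order_them(False, mixed, False, targets)
# ok -> False: comparing int and str raises TypeError, which _order_it catches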