PERF: improve MultiIndex get_loc performance #16346

Merged
12 changes: 12 additions & 0 deletions asv_bench/benchmarks/indexing.py
@@ -227,12 +227,24 @@ def time_multiindex_get_indexer(self):
def time_multiindex_large_get_loc(self):
self.mi_large.get_loc((999, 19, 'Z'))

def time_multiindex_large_get_loc_warm(self):
for _ in range(1000):
self.mi_large.get_loc((999, 19, 'Z'))

def time_multiindex_med_get_loc(self):
self.mi_med.get_loc((999, 9, 'A'))

def time_multiindex_med_get_loc_warm(self):
for _ in range(1000):
self.mi_med.get_loc((999, 9, 'A'))

def time_multiindex_string_get_loc(self):
self.mi_small.get_loc((99, 'A', 'A'))

def time_multiindex_small_get_loc_warm(self):
for _ in range(1000):
self.mi_small.get_loc((99, 'A', 'A'))

def time_is_monotonic(self):
self.miint.is_monotonic

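The new *_warm variants repeat the same lookup many times, so the one-time cost of building the MultiIndex hash engine is amortised and the per-call tuple-hashing cost (what this PR targets) dominates. A rough standalone sketch of the pattern being timed follows; the asv setup code (self.mi_large and friends) is not part of this diff, so the index construction below is only an approximation of it.

import string
import numpy as np
import pandas as pd

# approximate stand-in for the benchmark's self.mi_large fixture
mi_large = pd.MultiIndex.from_product(
    [np.arange(1000), np.arange(20), list(string.ascii_letters)],
    names=['one', 'two', 'three'])

mi_large.get_loc((999, 19, 'Z'))        # first call builds the hash engine
for _ in range(1000):                   # "warm" calls mostly measure the
    mi_large.get_loc((999, 19, 'Z'))    # per-lookup cost of hashing the key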
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.20.2.txt
@@ -27,9 +27,10 @@ Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Performance regression fix when indexing with a list-like (:issue:`16285`)
- Performance regression fix for small MultiIndexes (:issuse:`16319`)
- Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`)
- Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`)


.. _whatsnew_0202.bug_fixes:

Bug Fixes
2 changes: 2 additions & 0 deletions pandas/_libs/hashtable.pxd
@@ -38,6 +38,8 @@ cdef class MultiIndexHashTable(HashTable):

cpdef get_item(self, object val)
cpdef set_item(self, object key, Py_ssize_t val)
cdef inline void _check_for_collision(self, Py_ssize_t loc, object label)


cdef class StringHashTable(HashTable):
cdef kh_str_t *table
19 changes: 17 additions & 2 deletions pandas/_libs/hashtable_class_helper.pxi.in
@@ -4,6 +4,9 @@ Template for each `dtype` helper function for hashtable
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""

from lib cimport is_null_datetimelike


#----------------------------------------------------------------------
# VectorData
#----------------------------------------------------------------------
@@ -921,6 +924,19 @@ cdef class MultiIndexHashTable(HashTable):
"hash collision\nlocs:\n{}\n"
"result:\n{}\nmi:\n{}".format(alocs, result, mi))

cdef inline void _check_for_collision(self, Py_ssize_t loc, object label):
# validate that the loc maps to the actual value
# version of _check_for_collisions above for single label (tuple)

result = self.mi[loc]

if not all(l == r or (is_null_datetimelike(l)
and is_null_datetimelike(r))
for l, r in zip(result, label)):
raise AssertionError(
"hash collision\nloc:\n{}\n"
"result:\n{}\nmi:\n{}".format(loc, result, label))

def __contains__(self, object key):
try:
self.get_item(key)
k = kh_get_uint64(self.table, value)
if k != self.table.n_buckets:
loc = self.table.vals[k]
locs = np.array([loc], dtype=np.int64)
self._check_for_collisions(locs, key)
self._check_for_collision(loc, key)
return loc
else:
raise KeyError(key)
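In plain-Python terms, the get_item change above means a single tuple key no longer has to be wrapped in a length-1 array for the collision check. A conceptual sketch of the lookup-plus-verification path, not the actual Cython implementation; table, mi and hash_key_func are illustrative stand-ins for the hash table's attributes:

def get_item_sketch(table, mi, key, hash_key_func):
    # hash the tuple key to a uint64, as MultiIndexHashTable does
    value = hash_key_func(key)
    if value not in table:              # khash lookup in the real code
        raise KeyError(key)
    loc = table[value]
    # collision check: the tuple stored at `loc` must match the requested key
    # (the real _check_for_collision also treats null-like values on both
    # sides as equal; omitted here for brevity)
    if tuple(mi[loc]) != tuple(key):
        raise AssertionError("hash collision at loc {}".format(loc))
    return loc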
4 changes: 2 additions & 2 deletions pandas/core/indexes/multi.py
@@ -748,7 +748,7 @@ def _hashed_indexing_key(self, key):
we need to stringify if we have mixed levels

"""
from pandas.core.util.hashing import hash_tuples
from pandas.core.util.hashing import hash_tuples, hash_tuple

if not isinstance(key, tuple):
return hash_tuples(key)
return k
key = tuple([f(k, stringify)
for k, stringify in zip(key, self._have_mixed_levels)])
return hash_tuples(key)
return hash_tuple(key)

@Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)
def duplicated(self, keep='first'):
56 changes: 55 additions & 1 deletion pandas/core/util/hashing.py
@@ -4,14 +4,17 @@
import itertools

import numpy as np
from pandas._libs import hashing
from pandas._libs import hashing, tslib
from pandas.core.dtypes.generic import (
ABCMultiIndex,
ABCIndexClass,
ABCSeries,
ABCDataFrame)
from pandas.core.dtypes.common import (
is_categorical_dtype, is_list_like)
from pandas.core.dtypes.missing import isnull
from pandas.core.dtypes.cast import infer_dtype_from_scalar


# 16 byte long hashing key
_default_hash_key = '0123456789123456'
@@ -164,6 +167,29 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):
return h


def hash_tuple(val, encoding='utf8', hash_key=None):
"""
Hash a single tuple efficiently

Parameters
----------
val : single tuple
encoding : string, default 'utf8'
hash_key : string key to encode, default to _default_hash_key

Returns
-------
hash

"""
hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key)
for v in val)

h = _combine_hash_arrays(hashes, len(val))[0]

return h
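
hash_tuple hashes each element with _hash_scalar and combines the per-element hashes, so for any tuple it should agree with hashing a one-element list of tuples through the existing hash_tuples. A quick check, mirroring the test_hash_tuple test added further down:

from pandas.core.util.hashing import hash_tuple, hash_tuples

tup = (1, 'one')
assert hash_tuple(tup) == hash_tuples([tup])[0]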


def _hash_categorical(c, encoding, hash_key):
"""
Hash a Categorical by hashing its categories, and then mapping the codes
@@ -276,3 +302,31 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
vals *= np.uint64(0x94d049bb133111eb)
vals ^= vals >> 31
return vals


def _hash_scalar(val, encoding='utf8', hash_key=None):

Contributor:
this is duplicating lots of code. I'm sure there is a way to do this a bit more generically.

Member Author:
Yes, I know, but this was mainly to test.
And from testing it, this is a lot faster than using hash_array (the commented-out part in hash_tuple), so I'm not sure how to solve that. In principle I can put the common parts in helper functions (e.g. the redistributing part), but for most of it that is not possible, as there are slight differences.

Contributor:
What does all of this buy you? (IOW, can you post updated timings?)

Maintaining a separate code path for scalars will cause future issues: these will need to be kept in sync (with the array hashing) if any code changes are made. You can easily share code here, which would make this more palatable.

Member Author:
In [4]: %timeit pd.core.util.hashing.hash_tuple2((999, np.nan, 'E'))
380 µs ± 60.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

In [5]: %timeit pd.core.util.hashing.hash_tuple((999, np.nan, 'E'))
81.8 µs ± 3.53 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)

hash_tuple2 uses hash_array (the commented-out version in the current branch); hash_tuple uses _hash_scalar.
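
For reference, hash_tuple2 refers to the alternative that routes every element through hash_array on a 1-element object array; that commented-out version is not shown in this diff, so the following is only a rough reconstruction of what is being compared:

import numpy as np
from pandas.core.util.hashing import hash_array, _combine_hash_arrays

def hash_tuple2(val, encoding='utf8', hash_key=None):
    # hash each element via a length-1 object ndarray and hash_array,
    # instead of the specialised _hash_scalar used by hash_tuple
    hashes = (hash_array(np.array([v], dtype=object), encoding=encoding,
                         hash_key=hash_key, categorize=False)
              for v in val)
    return _combine_hash_arrays(hashes, len(val))[0]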

Contributor:
OK, it's probably reasonable, but I'm willing to sacrifice some perf to get some shared code (IOW, something between old and new is probably a good compromise versus repeating lots of code).

Member Author:
OK, I pushed a new version that almost fully reuses the hash_array function for the actual hashing; it only has some specific logic before that to convert the scalar to a proper array. This reduces a lot of the code duplication and has only a minor perf impact.

Apart from that, it still has a separate code path to first convert the scalar to an array, which might be a bit brittle, and I agree is certainly not ideal for code maintenance, but using something more general (e.g. infer_dtype) gives a big perf penalty.

"""
Hash scalar value

Returns
-------
1d uint64 numpy array of hash value, of length 1
"""

if isnull(val):
# this is to be consistent with the _hash_categorical implementation
return np.array([np.iinfo(np.uint64).max], dtype='u8')

Contributor:
If you need to handle datetime w/tz directly (IOW, right now we basically ignore it), then I would:

if getattr(val, 'tzinfo', None) is not None:
    val = val.tz_localize(None)

I suppose an option to ignore tz is fine for infer_dtype_from_scalar, but if you add it I would rename, document and test.

Member Author:
Yes, I can certainly do that check here as well.

It is maybe better to keep the custom logic here, as the keyword added to infer_dtype_from_scalar would not be used anywhere else.

Contributor:
Yeah, I think it is better done locally.

if getattr(val, 'tzinfo', None) is not None:
# for tz-aware datetimes, we need the underlying naive UTC value and
# not the tz aware object or pd extension type (as
# infer_dtype_from_scalar would do)
if not isinstance(val, tslib.Timestamp):
Contributor:
see #16372 (for later)

val = tslib.Timestamp(val)
val = val.tz_convert(None)

dtype, val = infer_dtype_from_scalar(val)
vals = np.array([val], dtype=dtype)

return hash_array(vals, hash_key=hash_key, encoding=encoding,
categorize=False)
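
On the tz handling above: tz_convert(None) turns a tz-aware Timestamp into its naive UTC wall time, so a tz-aware value and the equivalent naive UTC instant hash identically. A small illustration (EST is a fixed UTC-05:00 offset):

import pandas as pd

ts = pd.Timestamp("2012-01-01", tz='EST')   # midnight EST
naive_utc = ts.tz_convert(None)             # naive Timestamp in UTC
assert naive_utc == pd.Timestamp("2012-01-01 05:00:00")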
24 changes: 23 additions & 1 deletion pandas/tests/util/test_hashing.py
@@ -1,12 +1,13 @@
import pytest
import datetime

from warnings import catch_warnings
import numpy as np
import pandas as pd

from pandas import DataFrame, Series, Index, MultiIndex
from pandas.util import hash_array, hash_pandas_object
from pandas.core.util.hashing import hash_tuples
from pandas.core.util.hashing import hash_tuples, hash_tuple, _hash_scalar
import pandas.util.testing as tm


@@ -79,6 +80,27 @@ def test_hash_tuples(self):
result = hash_tuples(tups[0])
assert result == expected[0]

def test_hash_tuple(self):
# test equivalence between hash_tuples and hash_tuple
for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'),
('A', pd.Timestamp("2012-01-01"))]:
result = hash_tuple(tup)
expected = hash_tuples([tup])[0]
assert result == expected

def test_hash_scalar(self):
for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"),
pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
datetime.datetime(2012, 1, 1),
pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
pd.Timedelta('1 days'), datetime.timedelta(1),
pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1),
np.nan, pd.NaT, None]:
result = _hash_scalar(val)
expected = hash_array(np.array([val], dtype=object),
categorize=True)
assert result[0] == expected[0]
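
The null cases above (np.nan, pd.NaT, None) rely on the isnull branch in _hash_scalar mapping every missing value to the same uint64 sentinel that _hash_categorical uses; a quick check of that behaviour after this change:

import numpy as np
import pandas as pd
from pandas.core.util.hashing import _hash_scalar

sentinel = np.iinfo(np.uint64).max
for null in [np.nan, pd.NaT, None]:
    assert _hash_scalar(null)[0] == sentinel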

def test_hash_tuples_err(self):

for val in [5, 'foo', pd.Timestamp('20130101')]: