Skip to content

Performance of get_loc on non-unique MultiIndex #19464

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 32 additions & 3 deletions asv_bench/benchmarks/multiindex_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,16 +49,30 @@ class Duplicates(object):
goal_time = 0.2

def setup(self):
size = 65536
size = 6553
arrays = [np.random.randint(0, 8192, size),
np.random.randint(0, 1024, size)]
mask = np.random.rand(size) < 0.1
self.mi_unused_levels = MultiIndex.from_arrays(arrays)
self.mi_unused_levels = self.mi_unused_levels[mask]
self.mi = self.mi_unused_levels.remove_unused_levels()
self.sorted = self.mi.sort_values()
self.key = self.mi[len(self.mi) // 2]
self.partial_key = (self.key[0],)

def time_remove_unused_levels(self):
self.mi_unused_levels.remove_unused_levels()

def time_duplicates_loc(self):
self.mi.get_loc(self.key)

def time_duplicates_partial_loc(self):
self.mi.get_loc(self.partial_key)

def time_duplicates_sorted_loc(self):
self.sorted.get_loc(self.key)

def time_duplicates_sorted_partial_loc(self):
self.sorted.get_loc(self.partial_key)


class Integer(object):

Expand Down Expand Up @@ -91,10 +105,25 @@ def setup(self):
1000 + np.arange(n)]
labels = [np.random.choice(n, (k * n)) for lev in levels]
self.mi = MultiIndex(levels=levels, labels=labels)
self.sorted = self.mi.sort_values()
self.key = self.mi[len(self.mi) // 2]
self.partial_key = (self.key[0], self.key[1])

def time_duplicated(self):
self.mi.duplicated()

def time_duplicated_loc(self):
self.mi.get_loc(self.key)

def time_duplicated_partial_loc(self):
self.mi.get_loc(self.partial_key)

def time_duplicates_sorted_loc(self):
self.sorted.get_loc(self.key)

def time_duplicates_sorted_partial_loc(self):
self.sorted.get_loc(self.partial_key)


class Sortlevel(object):

Expand Down
15 changes: 8 additions & 7 deletions doc/source/advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -535,16 +535,17 @@ they have a ``MultiIndex``:

df.T.sort_index(level=1, axis=1)

Indexing will work even if the data are not sorted, but will be rather
inefficient (and show a ``PerformanceWarning``). It will also
Indexing will work even if the data are not sorted, but partial indexing will
be rather inefficient (and show a ``PerformanceWarning``). It will also
return a copy of the data rather than a view:

.. ipython:: python

dfm = pd.DataFrame({'jim': [0, 0, 1, 1],
'joe': ['x', 'x', 'z', 'y'],
'jolie': np.random.rand(4)})
dfm = dfm.set_index(['jim', 'joe'])
'jolie': list('abcd'),
'values' : np.random.rand(4)})
dfm = dfm.set_index(['jim', 'joe', 'jolie'])
dfm

.. code-block:: ipython
Expand All @@ -553,9 +554,9 @@ return a copy of the data rather than a view:
PerformanceWarning: indexing past lexsort depth may impact performance.

Out[4]:
jolie
jim joe
1 z 0.64094
values
jolie
0.879189 c

.. _advanced.unsorted:

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,7 @@ Performance Improvements
- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`)
- Improved performance of :func:`MultiIndex.get_loc` for non-unique indexes; as a consequence, lookups with a full-depth key no longer emit a ``PerformanceWarning`` (:issue:`19464`)
- Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex`
(i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains`
is likewise much faster (:issue:`21369`, :issue:`21508`)
Expand Down
17 changes: 14 additions & 3 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from pandas.core.dtypes.common import (
ensure_int64,
ensure_platform_int,
is_integer,
is_categorical_dtype,
is_object_dtype,
is_hashable,
Expand Down Expand Up @@ -2197,10 +2198,16 @@ def _maybe_to_slice(loc):
raise KeyError('Key length ({0}) exceeds index depth ({1})'
''.format(keylen, self.nlevels))

if keylen == self.nlevels and self.is_unique:
return self._engine.get_loc(key)
# If the index is monotonic, the code for partial selection or
# non-unique index (below) is more efficient than the following:
if keylen == self.nlevels and not self.is_monotonic:
loc = self._engine.get_loc(key)
if not self.is_unique and is_integer(loc):
# Indexers expect a slice from indexing a non-unique index
loc = slice(loc, loc + 1)
return loc

# -- partial selection or non-unique index
# -- partial selection or non-unique index or monotonic index
# break the key into 2 parts based on the lexsort_depth of the index;
# the first part returns a continuous slice of the index; the 2nd part
# needs linear search within the slice
Expand All @@ -2213,6 +2220,10 @@ def _maybe_to_slice(loc):
raise KeyError(key)

if not follow_key:
# Indexers expect an integer from indexing a key in a unique index
if self.is_unique:
# Breaks if we pass a np.int64. TODO: investigate why
return int(start)
return slice(start, stop)

warnings.warn('indexing past lexsort depth may impact performance.',
Expand Down
10 changes: 1 addition & 9 deletions pandas/tests/frame/test_sort_values_level_as_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import pytest

from pandas import DataFrame, Index
from pandas.errors import PerformanceWarning
from pandas.util import testing as tm
from pandas.util.testing import assert_frame_equal

Expand Down Expand Up @@ -85,14 +84,7 @@ def test_sort_column_level_and_index_label(
ascending=ascending,
axis=1)

if len(levels) > 1:
# Accessing multi-level columns that are not lexsorted raises a
# performance warning
with tm.assert_produces_warning(PerformanceWarning,
check_stacklevel=False):
assert_frame_equal(result, expected)
else:
assert_frame_equal(result, expected)
assert_frame_equal(result, expected)


def test_sort_values_column_index_level_precedence():
Expand Down
11 changes: 6 additions & 5 deletions pandas/tests/indexes/multi/test_sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,12 @@ def test_unsortedindex():

def test_unsortedindex_doc_examples():
# http://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex # noqa
dfm = DataFrame({'jim': [0, 0, 1, 1],
'joe': ['x', 'x', 'z', 'y'],
'jolie': np.random.rand(4)})
dfm = pd.DataFrame({'jim': [0, 0, 1, 1],
'joe': ['x', 'x', 'z', 'y'],
'jolie': list('abcd'),
'values': np.random.rand(4)})

dfm = dfm.set_index(['jim', 'joe'])
dfm = dfm.set_index(['jim', 'joe', 'jolie'])
with tm.assert_produces_warning(PerformanceWarning):
dfm.loc[(1, 'z')]

Expand All @@ -134,7 +135,7 @@ def test_unsortedindex_doc_examples():
dfm.loc[(0, 'y'):(1, 'z')]

assert dfm.index.is_lexsorted()
assert dfm.index.lexsort_depth == 2
assert dfm.index.lexsort_depth == 3


def test_reconstruct_sort():
Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/indexing/test_ix.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from pandas.compat import lrange
from pandas import Series, DataFrame, option_context, MultiIndex
from pandas.util import testing as tm
from pandas.errors import PerformanceWarning


class TestIX(object):
Expand Down Expand Up @@ -187,9 +186,7 @@ def test_ix_general(self):
df = DataFrame(data).set_index(keys=['col', 'year'])
key = 4.0, 2012

# emits a PerformanceWarning, ok
with tm.assert_produces_warning(PerformanceWarning):
tm.assert_frame_equal(df.loc[key], df.iloc[2:])
tm.assert_frame_equal(df.loc[key], df.iloc[2:])

# this is ok
df.sort_index(inplace=True)
Expand Down
4 changes: 0 additions & 4 deletions pandas/tests/indexing/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,10 +366,6 @@ def test_multiindex_perf_warn(self):
'joe': ['x', 'x', 'z', 'y'],
'jolie': np.random.rand(4)}).set_index(['jim', 'joe'])

with tm.assert_produces_warning(PerformanceWarning,
clear=[pd.core.index]):
df.loc[(1, 'z')]

df = df.iloc[[2, 1, 3, 0]]
with tm.assert_produces_warning(PerformanceWarning):
df.loc[(0, )]
Expand Down