From 85f45ffca246ff86c58fef25e1a9d0faacac6500 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Tue, 30 Jan 2018 06:40:39 +0100 Subject: [PATCH 1/3] PERF: MultiIndex._engine.get_loc() handles non-unique fine --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/core/indexes/multi.py | 17 ++++++++++++++--- .../frame/test_sort_values_level_as_str.py | 10 +--------- pandas/tests/indexing/test_ix.py | 5 +---- pandas/tests/indexing/test_multiindex.py | 4 ---- 5 files changed, 17 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 5c15c7b6a742f..aadbfe0f3c4cc 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -501,6 +501,7 @@ Performance Improvements - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`,:issue:`21606`) +- Improved performance of :func:`MultiIndex.get_loc` for non-unique indexes, which as a consequence does not emit a ``PerformanceWarning`` any more (:issue:`19464`) - Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex` (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains` is likewise much faster (:issue:`21369`, :issue:`21508`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 2a97c37449e12..db97439664d9a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -16,6 +16,7 @@ from pandas.core.dtypes.common import ( ensure_int64, ensure_platform_int, + is_integer, is_categorical_dtype, is_object_dtype, is_hashable, @@ -2197,10 +2198,16 @@ def _maybe_to_slice(loc): raise KeyError('Key length ({0}) exceeds index depth ({1})' ''.format(keylen, self.nlevels)) - if keylen == self.nlevels and self.is_unique: - return self._engine.get_loc(key) + # If the index is monotonic, the code for partial selection or + # non-unique index (below) is more efficient than the following: + if keylen == self.nlevels and not self.is_monotonic: + loc = self._engine.get_loc(key) + if not self.is_unique and is_integer(loc): + # Indexers expect a slice from indexing a non-unique index + loc = slice(loc, loc + 1) + return loc - # -- partial selection or non-unique index + # -- partial selection or non-unique index or monotonic index # break the key into 2 parts based on the lexsort_depth of the index; # the first part returns a continuous slice of the index; the 2nd part # needs linear search within the slice @@ -2213,6 +2220,10 @@ def _maybe_to_slice(loc): raise KeyError(key) if not follow_key: + # Indexers expect an integer from indexing a key in a unique index + if self.is_unique: + # Breaks if we pass a np.int64. TODO: investigate why + return int(start) return slice(start, stop) warnings.warn('indexing past lexsort depth may impact performance.', diff --git a/pandas/tests/frame/test_sort_values_level_as_str.py b/pandas/tests/frame/test_sort_values_level_as_str.py index 3b4eadfce81cd..266c0d336d898 100644 --- a/pandas/tests/frame/test_sort_values_level_as_str.py +++ b/pandas/tests/frame/test_sort_values_level_as_str.py @@ -2,7 +2,6 @@ import pytest from pandas import DataFrame, Index -from pandas.errors import PerformanceWarning from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal @@ -85,14 +84,7 @@ def test_sort_column_level_and_index_label( ascending=ascending, axis=1) - if len(levels) > 1: - # Accessing multi-level columns that are not lexsorted raises a - # performance warning - with tm.assert_produces_warning(PerformanceWarning, - check_stacklevel=False): - assert_frame_equal(result, expected) - else: - assert_frame_equal(result, expected) + assert_frame_equal(result, expected) def test_sort_values_column_index_level_precedence(): diff --git a/pandas/tests/indexing/test_ix.py b/pandas/tests/indexing/test_ix.py index c84576c984525..a8fd82ce3098e 100644 --- a/pandas/tests/indexing/test_ix.py +++ b/pandas/tests/indexing/test_ix.py @@ -11,7 +11,6 @@ from pandas.compat import lrange from pandas import Series, DataFrame, option_context, MultiIndex from pandas.util import testing as tm -from pandas.errors import PerformanceWarning class TestIX(object): @@ -187,9 +186,7 @@ def test_ix_general(self): df = DataFrame(data).set_index(keys=['col', 'year']) key = 4.0, 2012 - # emits a PerformanceWarning, ok - with tm.assert_produces_warning(PerformanceWarning): - tm.assert_frame_equal(df.loc[key], df.iloc[2:]) + tm.assert_frame_equal(df.loc[key], df.iloc[2:]) # this is ok df.sort_index(inplace=True) diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py index d2c4c8f5e149b..581aee0dcf971 100644 --- a/pandas/tests/indexing/test_multiindex.py +++ b/pandas/tests/indexing/test_multiindex.py @@ -366,10 +366,6 @@ def test_multiindex_perf_warn(self): 'joe': ['x', 'x', 'z', 'y'], 'jolie': np.random.rand(4)}).set_index(['jim', 'joe']) - with tm.assert_produces_warning(PerformanceWarning, - clear=[pd.core.index]): - df.loc[(1, 'z')] - df = df.iloc[[2, 1, 3, 0]] with tm.assert_produces_warning(PerformanceWarning): df.loc[(0, )] From ddd29ae46eac20830e844967d3c5bbab05dd9aaf Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Tue, 30 Jan 2018 06:51:32 +0100 Subject: [PATCH 2/3] DOC: sorting isn't (and wasn't) a problem for single key indexing --- doc/source/advanced.rst | 15 ++++++++------- pandas/tests/indexes/multi/test_sorting.py | 11 ++++++----- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index e530ece2e12c5..de372eceb5aaa 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -535,16 +535,17 @@ they have a ``MultiIndex``: df.T.sort_index(level=1, axis=1) -Indexing will work even if the data are not sorted, but will be rather -inefficient (and show a ``PerformanceWarning``). It will also +Indexing will work even if the data are not sorted, but partial indexing will +be rather inefficient (and show a ``PerformanceWarning``). It will also return a copy of the data rather than a view: .. ipython:: python dfm = pd.DataFrame({'jim': [0, 0, 1, 1], 'joe': ['x', 'x', 'z', 'y'], - 'jolie': np.random.rand(4)}) - dfm = dfm.set_index(['jim', 'joe']) + 'jolie': list('abcd'), + 'values' : np.random.rand(4)}) + dfm = dfm.set_index(['jim', 'joe', 'jolie']) dfm .. code-block:: ipython @@ -553,9 +554,9 @@ return a copy of the data rather than a view: PerformanceWarning: indexing past lexsort depth may impact performance. Out[4]: - jolie - jim joe - 1 z 0.64094 + values + jolie + 0.879189 c .. _advanced.unsorted: diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index ee29ea1be8aea..4a9831f1d5b80 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -114,11 +114,12 @@ def test_unsortedindex(): def test_unsortedindex_doc_examples(): # http://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex # noqa - dfm = DataFrame({'jim': [0, 0, 1, 1], - 'joe': ['x', 'x', 'z', 'y'], - 'jolie': np.random.rand(4)}) + dfm = pd.DataFrame({'jim': [0, 0, 1, 1], + 'joe': ['x', 'x', 'z', 'y'], + 'jolie': list('abcd'), + 'values': np.random.rand(4)}) - dfm = dfm.set_index(['jim', 'joe']) + dfm = dfm.set_index(['jim', 'joe', 'jolie']) with tm.assert_produces_warning(PerformanceWarning): dfm.loc[(1, 'z')] @@ -134,7 +135,7 @@ def test_unsortedindex_doc_examples(): dfm.loc[(0, 'y'):(1, 'z')] assert dfm.index.is_lexsorted() - assert dfm.index.lexsort_depth == 2 + assert dfm.index.lexsort_depth == 3 def test_reconstruct_sort(): From b72f9c5885ba751f3a90127a859edf452faead1d Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Wed, 31 Jan 2018 08:47:52 +0100 Subject: [PATCH 3/3] TST: asv tests for indexing in non-unique MultiIndex --- asv_bench/benchmarks/multiindex_object.py | 35 +++++++++++++++++++++-- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 0c92214795557..eb4c4e1c5380e 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -49,16 +49,30 @@ class Duplicates(object): goal_time = 0.2 def setup(self): - size = 65536 + size = 6553 arrays = [np.random.randint(0, 8192, size), np.random.randint(0, 1024, size)] - mask = np.random.rand(size) < 0.1 self.mi_unused_levels = MultiIndex.from_arrays(arrays) - self.mi_unused_levels = self.mi_unused_levels[mask] + self.mi = self.mi_unused_levels.remove_unused_levels() + self.sorted = self.mi.sort_values() + self.key = self.mi[len(self.mi) // 2] + self.partial_key = (self.key[0],) def time_remove_unused_levels(self): self.mi_unused_levels.remove_unused_levels() + def time_duplicates_loc(self): + self.mi.get_loc(self.key) + + def time_duplicates_partial_loc(self): + self.mi.get_loc(self.partial_key) + + def time_duplicates_sorted_loc(self): + self.sorted.get_loc(self.key) + + def time_duplicates_sorted_partial_loc(self): + self.sorted.get_loc(self.partial_key) + class Integer(object): @@ -91,10 +105,25 @@ def setup(self): 1000 + np.arange(n)] labels = [np.random.choice(n, (k * n)) for lev in levels] self.mi = MultiIndex(levels=levels, labels=labels) + self.sorted = self.mi.sort_values() + self.key = self.mi[len(self.mi) // 2] + self.partial_key = (self.key[0], self.key[1]) def time_duplicated(self): self.mi.duplicated() + def time_duplicated_loc(self): + self.mi.get_loc(self.key) + + def time_duplicated_partial_loc(self): + self.mi.get_loc(self.partial_key) + + def time_duplicates_sorted_loc(self): + self.sorted.get_loc(self.key) + + def time_duplicates_sorted_partial_loc(self): + self.sorted.get_loc(self.partial_key) + class Sortlevel(object):