From 16177af6589f476cfd26bc272e7d04b14c6aa53d Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Tue, 4 Sep 2018 20:54:17 -0700 Subject: [PATCH 1/3] PERF: improve get_indexer_non_unique on sorted, non-unique indexes Use binary search instead of re-indexing if the iterable key length is small enough --- doc/source/whatsnew/v0.24.0.txt | 2 ++ pandas/_libs/index.pyx | 25 +++++++++++++++++-------- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 6c91b6374b8af..50afa28b2606e 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -604,6 +604,8 @@ Performance Improvements :meth:`~HDFStore.keys`. (i.e. ``x in store`` checks are much faster) (:issue:`21372`) - Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`) +- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`) + .. _whatsnew_0240.docs: diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 562c1ba218141..3f76915655f58 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -294,14 +294,23 @@ cdef class IndexEngine: result = np.empty(n_alloc, dtype=np.int64) missing = np.empty(n_t, dtype=np.int64) - # form the set of the results (like ismember) - members = np.empty(n, dtype=np.uint8) - for i in range(n): - val = values[i] - if val in stargets: - if val not in d: - d[val] = [] - d[val].append(i) + # map each starget to its position in the index + if stargets and len(stargets) < 5 and self.is_monotonic_increasing: + # if there are few enough stargets and the index is monotonically + # increasing, then use binary search for each starget + for starget in stargets: + start = values.searchsorted(starget, side='left') + end = values.searchsorted(starget, side='right') + if start != end: + d[starget] = list(range(start, end)) + else: + # otherwise, map by iterating through all items in the index + for i in range(n): + val = values[i] + if val in stargets: + if val not in d: + d[val] = [] + d[val].append(i) for i in range(n_t): val = targets[i] From 8b704c5bb8899bea1dcff20ffae78eea6477bbed Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Tue, 4 Sep 2018 20:59:13 -0700 Subject: [PATCH 2/3] PERF: parameterize index structure for asv indexing benchmarks Adds benchmarks for non-unique, sorted indices in NumericSeriesIndexing and NonNumericSeriesIndexing classes --- asv_bench/benchmarks/indexing.py | 75 +++++++++++++++++++------------- 1 file changed, 45 insertions(+), 30 deletions(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 739ad6a3d278b..8c6b0cccdff7b 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -11,95 +11,110 @@ class NumericSeriesIndexing(object): goal_time = 0.2 - params = [Int64Index, Float64Index] - param = ['index'] + params = [ + (Int64Index, Float64Index), + ('unique_monotonic_inc', 'nonunique_monotonic_inc'), + ] + param_names = ['index dtype', 'index structure'] - def setup(self, index): + def setup(self, index, index_structure): N = 10**6 - idx = index(range(N)) - self.data = Series(np.random.rand(N), index=idx) + indices = { + 'unique_monotonic_inc': index(range(N)), + 'nonunique_monotonic_inc': index( + list(range(55)) + [54] + list(range(55, N - 1))), + } + self.data = Series(np.random.rand(N), index=indices[index_structure]) self.array = np.arange(10000) self.array_list = self.array.tolist() - def time_getitem_scalar(self, index): + def time_getitem_scalar(self, index, index_structure): self.data[800000] - def time_getitem_slice(self, index): + def time_getitem_slice(self, index, index_structure): self.data[:800000] - def time_getitem_list_like(self, index): + def time_getitem_list_like(self, index, index_structure): self.data[[800000]] - def time_getitem_array(self, index): + def time_getitem_array(self, index, index_structure): self.data[self.array] - def time_getitem_lists(self, index): + def time_getitem_lists(self, index, index_structure): self.data[self.array_list] - def time_iloc_array(self, index): + def time_iloc_array(self, index, index_structure): self.data.iloc[self.array] - def time_iloc_list_like(self, index): + def time_iloc_list_like(self, index, index_structure): self.data.iloc[[800000]] - def time_iloc_scalar(self, index): + def time_iloc_scalar(self, index, index_structure): self.data.iloc[800000] - def time_iloc_slice(self, index): + def time_iloc_slice(self, index, index_structure): self.data.iloc[:800000] - def time_ix_array(self, index): + def time_ix_array(self, index, index_structure): self.data.ix[self.array] - def time_ix_list_like(self, index): + def time_ix_list_like(self, index, index_structure): self.data.ix[[800000]] - def time_ix_scalar(self, index): + def time_ix_scalar(self, index, index_structure): self.data.ix[800000] - def time_ix_slice(self, index): + def time_ix_slice(self, index, index_structure): self.data.ix[:800000] - def time_loc_array(self, index): + def time_loc_array(self, index, index_structure): self.data.loc[self.array] - def time_loc_list_like(self, index): + def time_loc_list_like(self, index, index_structure): self.data.loc[[800000]] - def time_loc_scalar(self, index): + def time_loc_scalar(self, index, index_structure): self.data.loc[800000] - def time_loc_slice(self, index): + def time_loc_slice(self, index, index_structure): self.data.loc[:800000] class NonNumericSeriesIndexing(object): goal_time = 0.2 - params = ['string', 'datetime'] - param_names = ['index'] + params = [ + ('string', 'datetime'), + ('unique_monotonic_inc', 'nonunique_monotonic_inc'), + ] + param_names = ['index dtype', 'index structure'] - def setup(self, index): - N = 10**5 + def setup(self, index, index_structure): + N = 10**6 indexes = {'string': tm.makeStringIndex(N), 'datetime': date_range('1900', periods=N, freq='s')} index = indexes[index] + if index_structure == 'nonunique_monotonic_inc': + index = index.insert(item=index[2], loc=2)[:-1] self.s = Series(np.random.rand(N), index=index) self.lbl = index[80000] - def time_getitem_label_slice(self, index): + def time_getitem_label_slice(self, index, index_structure): self.s[:self.lbl] - def time_getitem_pos_slice(self, index): + def time_getitem_pos_slice(self, index, index_structure): self.s[:80000] - def time_get_value(self, index): + def time_get_value(self, index, index_structure): with warnings.catch_warnings(record=True): self.s.get_value(self.lbl) - def time_getitem_scalar(self, index): + def time_getitem_scalar(self, index, index_structure): self.s[self.lbl] + def time_getitem_list_like(self, index, index_structure): + self.s[[self.lbl]] + class DataFrameStringIndexing(object): From 4ad3006b7fa6b41a0c4d32c05272bb8cde786597 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Wed, 26 Sep 2018 11:17:47 -0700 Subject: [PATCH 3/3] CLN: use underscore in asv param names --- asv_bench/benchmarks/indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 8c6b0cccdff7b..c5b147b152aa6 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -15,7 +15,7 @@ class NumericSeriesIndexing(object): (Int64Index, Float64Index), ('unique_monotonic_inc', 'nonunique_monotonic_inc'), ] - param_names = ['index dtype', 'index structure'] + param_names = ['index_dtype', 'index_structure'] def setup(self, index, index_structure): N = 10**6 @@ -87,7 +87,7 @@ class NonNumericSeriesIndexing(object): ('string', 'datetime'), ('unique_monotonic_inc', 'nonunique_monotonic_inc'), ] - param_names = ['index dtype', 'index structure'] + param_names = ['index_dtype', 'index_structure'] def setup(self, index, index_structure): N = 10**6