Skip to content

Commit 4a4fe0b

Browse files
committed
Merge pull request pandas-dev#10290 from jreback/mi_perf
PERF: improved performance of multiindex slicing
2 parents ad37b5d + 2874420 commit 4a4fe0b

File tree

10 files changed

+156
-40
lines changed

10 files changed

+156
-40
lines changed

doc/source/whatsnew/v0.17.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ Performance Improvements
6363

6464
- 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`)
6565
- 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)
66+
- Significantly improved performance of indexing ``MultiIndex`` with slicers (:issue:`10287`)
67+
- Improved performance of ``Series.isin`` for datetimelike/integer Series (:issue:`10287`)
6668

6769
.. _whatsnew_0170.bug_fixes:
6870

pandas/core/common.py

+4
Original file line numberDiff line numberDiff line change
@@ -2497,6 +2497,10 @@ def is_integer_dtype(arr_or_dtype):
24972497
return (issubclass(tipo, np.integer) and
24982498
not issubclass(tipo, (np.datetime64, np.timedelta64)))
24992499

2500+
def is_int64_dtype(arr_or_dtype):
    """
    Report whether an array or dtype resolves to a 64-bit integer type.

    Parameters
    ----------
    arr_or_dtype : object
        passed unchanged to ``_get_dtype_type`` to obtain a type object

    Returns
    -------
    bool
        True when the resolved type is ``np.int64`` or a subclass of it
    """
    return issubclass(_get_dtype_type(arr_or_dtype), np.int64)
2503+
25002504

25012505
def is_int_or_datetime_dtype(arr_or_dtype):
25022506
tipo = _get_dtype_type(arr_or_dtype)

pandas/core/index.py

+73-35
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ class Index(IndexOpsMixin, PandasObject):
105105
_is_numeric_dtype = False
106106

107107
_engine_type = _index.ObjectEngine
108+
_isin_type = lib.ismember
108109

109110
def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False,
110111
tupleize_cols=True, **kwargs):
@@ -1838,7 +1839,7 @@ def isin(self, values, level=None):
18381839
value_set = set(values)
18391840
if level is not None:
18401841
self._validate_index_level(level)
1841-
return lib.ismember(np.array(self), value_set)
1842+
return self._isin_type(np.array(self), value_set)
18421843

18431844
def _can_reindex(self, indexer):
18441845
"""
@@ -3379,6 +3380,7 @@ class Int64Index(NumericIndex):
33793380
_outer_indexer = _algos.outer_join_indexer_int64
33803381

33813382
_engine_type = _index.Int64Engine
3383+
_isin_type = lib.ismember_int64
33823384

33833385
def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs):
33843386

@@ -5235,13 +5237,39 @@ def partial_selection(key, indexer=None):
52355237
indexer = self._get_level_indexer(key, level=level)
52365238
return indexer, maybe_droplevels(indexer, [level], drop_level)
52375239

5238-
def _get_level_indexer(self, key, level=0):
5239-
# return a boolean indexer or a slice showing where the key is
5240+
def _get_level_indexer(self, key, level=0, indexer=None):
5241+
# return an indexer, boolean array or a slice showing where the key is
52405242
# in the totality of values
5243+
# if the indexer is provided, then use this
52415244

52425245
level_index = self.levels[level]
52435246
labels = self.labels[level]
52445247

5248+
def convert_indexer(start, stop, step, indexer=indexer, labels=labels):
    """
    Locate the entries of ``labels`` whose code lies in
    ``np.arange(start, stop, step)``.

    Parameters
    ----------
    start, stop, step : forwarded to ``np.arange``
    indexer : optional, defaults to the enclosing scope's ``indexer``
        locations in ``labels`` that have already been selected
    labels : defaults to the enclosing scope's ``labels``

    Returns
    -------
    When ``indexer`` is given and its length differs from ``labels``:
    an array of the locations taken from ``indexer`` whose label is in
    the range. Otherwise: a boolean array with one entry per label.
    """
    r = np.arange(start, stop, step)
    if indexer is not None and len(indexer) != len(labels):

        # we have an indexer which maps the locations in the labels that
        # we have already selected (and is not an indexer for the entire
        # set); examining the entire labels set would be wasteful, so we
        # only look at the locations that are in this set.
        # the only magic here is that the matching positions (relative to
        # the selected subset) are mapped back to the original locations
        from pandas import Series
        mapper = Series(indexer)
        result = Series(Index(labels.take(indexer)).isin(r).nonzero()[0])
        m = result.map(mapper).values

    else:
        # labels normally contain repeated codes, so we must NOT pass
        # assume_unique=True: numpy only guarantees correct results with
        # that flag when both inputs are unique
        m = np.in1d(labels, r)

    return m
5272+
52455273
if isinstance(key, slice):
52465274
# handle a slice, returning a slice if we can
52475275
# otherwise a boolean indexer
@@ -5267,17 +5295,13 @@ def _get_level_indexer(self, key, level=0):
52675295
# a partial date slicer on a DatetimeIndex generates a slice
52685296
# note that the stop ALREADY includes the stopped point (if
52695297
# it was a string sliced)
5270-
m = np.zeros(len(labels),dtype=bool)
5271-
m[np.in1d(labels,np.arange(start.start,stop.stop,step))] = True
5272-
return m
5298+
return convert_indexer(start.start,stop.stop,step)
52735299

52745300
elif level > 0 or self.lexsort_depth == 0 or step is not None:
52755301
# need to have like semantics here to right
52765302
# searching as when we are using a slice
52775303
# so include the stop+1 (so we include stop)
5278-
m = np.zeros(len(labels),dtype=bool)
5279-
m[np.in1d(labels,np.arange(start,stop+1,step))] = True
5280-
return m
5304+
return convert_indexer(start,stop+1,step)
52815305
else:
52825306
# sorted, so can return slice object -> view
52835307
i = labels.searchsorted(start, side='left')
@@ -5315,59 +5339,73 @@ def get_locs(self, tup):
53155339
raise KeyError('MultiIndex Slicing requires the index to be fully lexsorted'
53165340
' tuple len ({0}), lexsort depth ({1})'.format(len(tup), self.lexsort_depth))
53175341

5318-
def _convert_indexer(r):
5342+
# indexer
5343+
# this is the list of all values that we want to select
5344+
n = len(self)
5345+
indexer = None
5346+
5347+
def _convert_to_indexer(r):
5348+
# return an indexer
53195349
if isinstance(r, slice):
5320-
m = np.zeros(len(self),dtype=bool)
5350+
m = np.zeros(n,dtype=bool)
53215351
m[r] = True
5322-
return m
5323-
return r
5352+
r = m.nonzero()[0]
5353+
elif is_bool_indexer(r):
5354+
if len(r) != n:
5355+
raise ValueError("cannot index with a boolean indexer that is"
5356+
" not the same length as the index")
5357+
r = r.nonzero()[0]
5358+
return Int64Index(r)
5359+
5360+
def _update_indexer(idxr, indexer=indexer):
    # merge a freshly computed indexer (idxr) into the accumulated one;
    # with nothing accumulated yet, start from every position 0..n-1
    current = Index(np.arange(n)) if indexer is None else indexer
    # idxr of None means this level imposes no restriction
    return current if idxr is None else current & idxr
53245366

5325-
ranges = []
53265367
for i,k in enumerate(tup):
53275368

53285369
if is_bool_indexer(k):
53295370
# a boolean indexer, must be the same length!
53305371
k = np.asarray(k)
5331-
if len(k) != len(self):
5332-
raise ValueError("cannot index with a boolean indexer that is"
5333-
" not the same length as the index")
5334-
ranges.append(k)
5372+
indexer = _update_indexer(_convert_to_indexer(k), indexer=indexer)
5373+
53355374
elif is_list_like(k):
53365375
# a collection of labels to include from this level (these are or'd)
5337-
indexers = []
5376+
indexers = None
53385377
for x in k:
53395378
try:
5340-
indexers.append(_convert_indexer(self._get_level_indexer(x, level=i)))
5379+
idxrs = _convert_to_indexer(self._get_level_indexer(x, level=i, indexer=indexer))
5380+
indexers = idxrs if indexers is None else indexers | idxrs
53415381
except (KeyError):
53425382

53435383
# ignore not founds
53445384
continue
5345-
if len(k):
5346-
ranges.append(reduce(np.logical_or, indexers))
5385+
5386+
if indexers is not None:
5387+
indexer = _update_indexer(indexers, indexer=indexer)
53475388
else:
5348-
ranges.append(np.zeros(self.labels[i].shape, dtype=bool))
5389+
5390+
# no matches we are done
5391+
return Int64Index([]).values
53495392

53505393
elif is_null_slice(k):
53515394
# empty slice
5352-
pass
5395+
indexer = _update_indexer(None, indexer=indexer)
53535396

53545397
elif isinstance(k,slice):
53555398

53565399
# a slice, include BOTH of the labels
5357-
ranges.append(self._get_level_indexer(k,level=i))
5400+
indexer = _update_indexer(_convert_to_indexer(self._get_level_indexer(k,level=i,indexer=indexer)), indexer=indexer)
53585401
else:
53595402
# a single label
5360-
ranges.append(self.get_loc_level(k,level=i,drop_level=False)[0])
5361-
5362-
# identity
5363-
if len(ranges) == 0:
5364-
return slice(0,len(self))
5365-
5366-
elif len(ranges) == 1:
5367-
return ranges[0]
5403+
indexer = _update_indexer(_convert_to_indexer(self.get_loc_level(k,level=i,drop_level=False)[0]), indexer=indexer)
53685404

5369-
# construct a boolean indexer if we have a slice or boolean indexer
5370-
return reduce(np.logical_and,[ _convert_indexer(r) for r in ranges ])
5405+
# empty indexer
5406+
if indexer is None:
5407+
return Int64Index([]).values
5408+
return indexer.values
53715409

53725410
def truncate(self, before=None, after=None):
53735411
"""

pandas/core/indexing.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,7 @@ def can_do_equal_len():
509509

510510
def _align_series(self, indexer, ser):
511511
# indexer to assign Series can be tuple, slice, scalar
512-
if isinstance(indexer, (slice, np.ndarray, list)):
512+
if isinstance(indexer, (slice, np.ndarray, list, Index)):
513513
indexer = tuple([indexer])
514514

515515
if isinstance(indexer, tuple):
@@ -1719,7 +1719,7 @@ def maybe_convert_ix(*args):
17191719

17201720
ixify = True
17211721
for arg in args:
1722-
if not isinstance(arg, (np.ndarray, list, ABCSeries)):
1722+
if not isinstance(arg, (np.ndarray, list, ABCSeries, Index)):
17231723
ixify = False
17241724

17251725
if ixify:

pandas/core/series.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
is_list_like, _values_from_object,
2020
_possibly_cast_to_datetime, _possibly_castable,
2121
_possibly_convert_platform, _try_sort,
22+
is_int64_dtype,
2223
ABCSparseArray, _maybe_match_name,
2324
_coerce_to_dtype, SettingWithCopyError,
2425
_maybe_box_datetimelike, ABCDataFrame,
@@ -2250,17 +2251,22 @@ def isin(self, values):
22502251

22512252
# may need i8 conversion for proper membership testing
22522253
comps = _values_from_object(self)
2254+
f = lib.ismember
22532255
if com.is_datetime64_dtype(self):
22542256
from pandas.tseries.tools import to_datetime
22552257
values = Series(to_datetime(values)).values.view('i8')
22562258
comps = comps.view('i8')
2259+
f = lib.ismember_int64
22572260
elif com.is_timedelta64_dtype(self):
22582261
from pandas.tseries.timedeltas import to_timedelta
22592262
values = Series(to_timedelta(values)).values.view('i8')
22602263
comps = comps.view('i8')
2264+
f = lib.ismember_int64
2265+
elif is_int64_dtype(self):
2266+
f = lib.ismember_int64
22612267

22622268
value_set = set(values)
2263-
result = lib.ismember(comps, value_set)
2269+
result = f(comps, value_set)
22642270
return self._constructor(result, index=self.index).__finalize__(self)
22652271

22662272
def between(self, left, right, inclusive=True):

pandas/lib.pyx

+25
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,31 @@ def ismember(ndarray arr, set values):
156156

157157
return result.view(np.bool_)
158158

159+
def ismember_int64(ndarray[int64_t] arr, set values):
    '''
    Check, element by element, whether the values of an int64 array
    are members of a set.

    Parameters
    ----------
    arr : ndarray of int64
    values : set

    Returns
    -------
    ismember : ndarray (boolean dtype)
        same length as ``arr``; entry ``i`` is True when ``arr[i]``
        is in ``values``
    '''
    cdef:
        Py_ssize_t i, n
        ndarray[uint8_t] result

    n = len(arr)
    # fill a uint8 buffer and reinterpret it as bool on return
    result = np.empty(n, dtype=np.uint8)
    for i in range(n):
        result[i] = arr[i] in values

    return result.view(np.bool_)
183+
159184
#----------------------------------------------------------------------
160185
# datetime / io related
161186

pandas/tests/test_indexing.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -2312,6 +2312,7 @@ def f():
23122312
index=pd.MultiIndex.from_product([['A','B','C'],['foo']],
23132313
names=['one','two'])
23142314
).sortlevel()
2315+
23152316
result = s.loc[idx[:,['foo']]]
23162317
assert_series_equal(result,expected)
23172318
result = s.loc[idx[:,['foo','bah']]]
@@ -2323,9 +2324,9 @@ def f():
23232324
df = DataFrame(np.random.randn(5, 6), index=range(5), columns=multi_index)
23242325
df = df.sortlevel(0, axis=1)
23252326

2327+
expected = DataFrame(index=range(5),columns=multi_index.reindex([])[0])
23262328
result1 = df.loc[:, ([], slice(None))]
23272329
result2 = df.loc[:, (['foo'], [])]
2328-
expected = DataFrame(index=range(5),columns=multi_index.reindex([])[0])
23292330
assert_frame_equal(result1, expected)
23302331
assert_frame_equal(result2, expected)
23312332

pandas/tseries/base.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -449,7 +449,7 @@ def isin(self, values):
449449
return self.asobject.isin(values)
450450

451451
value_set = set(values.asi8)
452-
return lib.ismember(self.asi8, value_set)
452+
return lib.ismember_int64(self.asi8, value_set)
453453

454454
def shift(self, n, freq=None):
455455
"""

vb_suite/indexing.py

+30
Original file line numberDiff line numberDiff line change
@@ -235,3 +235,33 @@
235235
series_ix_slice = Benchmark("s.ix[:800000]", setup)
236236
series_ix_list_like = Benchmark("s.ix[[800000]]", setup)
237237
series_ix_array = Benchmark("s.ix[np.arange(10000)]", setup)
238+
239+
240+
# multi-index slicing
241+
setup = common_setup + """
242+
np.random.seed(1234)
243+
idx=pd.IndexSlice
244+
n=100000
245+
mdt = pandas.DataFrame()
246+
mdt['A'] = np.random.choice(range(10000,45000,1000), n)
247+
mdt['B'] = np.random.choice(range(10,400), n)
248+
mdt['C'] = np.random.choice(range(1,150), n)
249+
mdt['D'] = np.random.choice(range(10000,45000), n)
250+
mdt['x'] = np.random.choice(range(400), n)
251+
mdt['y'] = np.random.choice(range(25), n)
252+
253+
254+
test_A = 25000
255+
test_B = 25
256+
test_C = 40
257+
test_D = 35000
258+
259+
eps_A = 5000
260+
eps_B = 5
261+
eps_C = 5
262+
eps_D = 5000
263+
mdt2 = mdt.set_index(['A','B','C','D']).sortlevel()
264+
"""
265+
266+
multiindex_slicers = Benchmark('mdt2.loc[idx[test_A-eps_A:test_A+eps_A,test_B-eps_B:test_B+eps_B,test_C-eps_C:test_C+eps_C,test_D-eps_D:test_D+eps_D],:]', setup,
267+
start_date=datetime(2015, 1, 1))

vb_suite/series_methods.py

+10
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
setup = common_setup + """
88
s1 = Series(np.random.randn(10000))
99
s2 = Series(np.random.randint(1, 10, 10000))
10+
s3 = Series(np.random.randint(1, 10, 100000)).astype('int64')
11+
values = [1,2]
12+
s4 = s3.astype('object')
1013
"""
1114

1215
series_nlargest1 = Benchmark('s1.nlargest(3, take_last=True);'
@@ -27,3 +30,10 @@
2730
's2.nsmallest(3, take_last=False)',
2831
setup,
2932
start_date=datetime(2014, 1, 25))
33+
34+
series_isin_int64 = Benchmark('s3.isin(values)',
35+
setup,
36+
start_date=datetime(2014, 1, 25))
37+
series_isin_object = Benchmark('s4.isin(values)',
38+
setup,
39+
start_date=datetime(2014, 1, 25))

0 commit comments

Comments
 (0)