Skip to content

Commit 2874420

Browse files
committed
PERF: fix _get_level_indexer to accept an intermediate indexer result
1 parent b069253 commit 2874420

File tree

10 files changed

+127
-40
lines changed

10 files changed

+127
-40
lines changed

doc/source/whatsnew/v0.17.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ Performance Improvements
6363

6464
- 4x improvement in ``timedelta`` string parsing (:issue:`6755`)
6565
- 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)
66+
- Significantly improved performance of indexing ``MultiIndex`` with slicers (:issue:`10287`)
67+
- Improved performance of ``Series.isin`` for datetimelike/integer Series (:issue:`10287`)
6668

6769
.. _whatsnew_0170.bug_fixes:
6870

pandas/core/common.py

+4
Original file line numberDiff line numberDiff line change
@@ -2497,6 +2497,10 @@ def is_integer_dtype(arr_or_dtype):
24972497
return (issubclass(tipo, np.integer) and
24982498
not issubclass(tipo, (np.datetime64, np.timedelta64)))
24992499

2500+
def is_int64_dtype(arr_or_dtype):
2501+
tipo = _get_dtype_type(arr_or_dtype)
2502+
return issubclass(tipo, np.int64)
2503+
25002504

25012505
def is_int_or_datetime_dtype(arr_or_dtype):
25022506
tipo = _get_dtype_type(arr_or_dtype)

pandas/core/index.py

+73-35
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ class Index(IndexOpsMixin, PandasObject):
105105
_is_numeric_dtype = False
106106

107107
_engine_type = _index.ObjectEngine
108+
_isin_type = lib.ismember
108109

109110
def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False,
110111
tupleize_cols=True, **kwargs):
@@ -1838,7 +1839,7 @@ def isin(self, values, level=None):
18381839
value_set = set(values)
18391840
if level is not None:
18401841
self._validate_index_level(level)
1841-
return lib.ismember(np.array(self), value_set)
1842+
return self._isin_type(np.array(self), value_set)
18421843

18431844
def _can_reindex(self, indexer):
18441845
"""
@@ -3381,6 +3382,7 @@ class Int64Index(NumericIndex):
33813382
_outer_indexer = _algos.outer_join_indexer_int64
33823383

33833384
_engine_type = _index.Int64Engine
3385+
_isin_type = lib.ismember_int64
33843386

33853387
def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs):
33863388

@@ -5237,13 +5239,39 @@ def partial_selection(key, indexer=None):
52375239
indexer = self._get_level_indexer(key, level=level)
52385240
return indexer, maybe_droplevels(indexer, [level], drop_level)
52395241

5240-
def _get_level_indexer(self, key, level=0):
5241-
# return a boolean indexer or a slice showing where the key is
5242+
def _get_level_indexer(self, key, level=0, indexer=None):
5243+
# return an indexer, boolean array or a slice showing where the key is
52425244
# in the totality of values
5245+
# if the indexer is provided, then use this
52435246

52445247
level_index = self.levels[level]
52455248
labels = self.labels[level]
52465249

5250+
def convert_indexer(start, stop, step, indexer=indexer, labels=labels):
5251+
# given the inputs and the labels/indexer, compute an indexer set
5252+
# if we have a provided indexer, then this need not consider
5253+
# the entire labels set
5254+
5255+
r = np.arange(start,stop,step)
5256+
if indexer is not None and len(indexer) != len(labels):
5257+
5258+
# we have an indexer which maps the locations in the labels that we
5259+
# have already selected (and is not an indexer for the entire set)
5260+
# otherwise this is wasteful
5261+
# so we only need to examine locations that are in this set
5262+
# the only magic here is that the results are the mappings to the
5263+
# set that we have selected
5264+
from pandas import Series
5265+
mapper = Series(indexer)
5266+
result = Series(Index(labels.take(indexer)).isin(r).nonzero()[0])
5267+
m = result.map(mapper).values
5268+
5269+
else:
5270+
m = np.zeros(len(labels),dtype=bool)
5271+
m[np.in1d(labels,r,assume_unique=True)] = True
5272+
5273+
return m
5274+
52475275
if isinstance(key, slice):
52485276
# handle a slice, returning a slice if we can
52495277
# otherwise a boolean indexer
@@ -5269,17 +5297,13 @@ def _get_level_indexer(self, key, level=0):
52695297
# a partial date slicer on a DatetimeIndex generates a slice
52705298
# note that the stop ALREADY includes the stopped point (if
52715299
# it was a string sliced)
5272-
m = np.zeros(len(labels),dtype=bool)
5273-
m[np.in1d(labels,np.arange(start.start,stop.stop,step))] = True
5274-
return m
5300+
return convert_indexer(start.start,stop.stop,step)
52755301

52765302
elif level > 0 or self.lexsort_depth == 0 or step is not None:
52775303
# need right-search-like semantics here,
52785304
# the same as when we are using a slice,
52795305
# so use stop+1 (so that stop itself is included)
5280-
m = np.zeros(len(labels),dtype=bool)
5281-
m[np.in1d(labels,np.arange(start,stop+1,step))] = True
5282-
return m
5306+
return convert_indexer(start,stop+1,step)
52835307
else:
52845308
# sorted, so can return slice object -> view
52855309
i = labels.searchsorted(start, side='left')
@@ -5317,59 +5341,73 @@ def get_locs(self, tup):
53175341
raise KeyError('MultiIndex Slicing requires the index to be fully lexsorted'
53185342
' tuple len ({0}), lexsort depth ({1})'.format(len(tup), self.lexsort_depth))
53195343

5320-
def _convert_indexer(r):
5344+
# indexer
5345+
# this is the list of all values that we want to select
5346+
n = len(self)
5347+
indexer = None
5348+
5349+
def _convert_to_indexer(r):
5350+
# return an indexer
53215351
if isinstance(r, slice):
5322-
m = np.zeros(len(self),dtype=bool)
5352+
m = np.zeros(n,dtype=bool)
53235353
m[r] = True
5324-
return m
5325-
return r
5354+
r = m.nonzero()[0]
5355+
elif is_bool_indexer(r):
5356+
if len(r) != n:
5357+
raise ValueError("cannot index with a boolean indexer that is"
5358+
" not the same length as the index")
5359+
r = r.nonzero()[0]
5360+
return Int64Index(r)
5361+
5362+
def _update_indexer(idxr, indexer=indexer):
5363+
if indexer is None:
5364+
indexer = Index(np.arange(n))
5365+
if idxr is None:
5366+
return indexer
5367+
return indexer & idxr
53265368

5327-
ranges = []
53285369
for i,k in enumerate(tup):
53295370

53305371
if is_bool_indexer(k):
53315372
# a boolean indexer, must be the same length!
53325373
k = np.asarray(k)
5333-
if len(k) != len(self):
5334-
raise ValueError("cannot index with a boolean indexer that is"
5335-
" not the same length as the index")
5336-
ranges.append(k)
5374+
indexer = _update_indexer(_convert_to_indexer(k), indexer=indexer)
5375+
53375376
elif is_list_like(k):
53385377
# a collection of labels to include from this level (these are or'd)
5339-
indexers = []
5378+
indexers = None
53405379
for x in k:
53415380
try:
5342-
indexers.append(_convert_indexer(self._get_level_indexer(x, level=i)))
5381+
idxrs = _convert_to_indexer(self._get_level_indexer(x, level=i, indexer=indexer))
5382+
indexers = idxrs if indexers is None else indexers | idxrs
53435383
except (KeyError):
53445384

53455385
# ignore not founds
53465386
continue
5347-
if len(k):
5348-
ranges.append(reduce(np.logical_or, indexers))
5387+
5388+
if indexers is not None:
5389+
indexer = _update_indexer(indexers, indexer=indexer)
53495390
else:
5350-
ranges.append(np.zeros(self.labels[i].shape, dtype=bool))
5391+
5392+
# no matches, so we are done
5393+
return Int64Index([]).values
53515394

53525395
elif is_null_slice(k):
53535396
# null slice (selects everything on this level), so the indexer is not narrowed
5354-
pass
5397+
indexer = _update_indexer(None, indexer=indexer)
53555398

53565399
elif isinstance(k,slice):
53575400

53585401
# a slice, include BOTH of the labels
5359-
ranges.append(self._get_level_indexer(k,level=i))
5402+
indexer = _update_indexer(_convert_to_indexer(self._get_level_indexer(k,level=i,indexer=indexer)), indexer=indexer)
53605403
else:
53615404
# a single label
5362-
ranges.append(self.get_loc_level(k,level=i,drop_level=False)[0])
5363-
5364-
# identity
5365-
if len(ranges) == 0:
5366-
return slice(0,len(self))
5367-
5368-
elif len(ranges) == 1:
5369-
return ranges[0]
5405+
indexer = _update_indexer(_convert_to_indexer(self.get_loc_level(k,level=i,drop_level=False)[0]), indexer=indexer)
53705406

5371-
# construct a boolean indexer if we have a slice or boolean indexer
5372-
return reduce(np.logical_and,[ _convert_indexer(r) for r in ranges ])
5407+
# empty indexer
5408+
if indexer is None:
5409+
return Int64Index([]).values
5410+
return indexer.values
53735411

53745412
def truncate(self, before=None, after=None):
53755413
"""

pandas/core/indexing.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,7 @@ def can_do_equal_len():
509509

510510
def _align_series(self, indexer, ser):
511511
# indexer to assign Series can be tuple, slice, scalar
512-
if isinstance(indexer, (slice, np.ndarray, list)):
512+
if isinstance(indexer, (slice, np.ndarray, list, Index)):
513513
indexer = tuple([indexer])
514514

515515
if isinstance(indexer, tuple):
@@ -1719,7 +1719,7 @@ def maybe_convert_ix(*args):
17191719

17201720
ixify = True
17211721
for arg in args:
1722-
if not isinstance(arg, (np.ndarray, list, ABCSeries)):
1722+
if not isinstance(arg, (np.ndarray, list, ABCSeries, Index)):
17231723
ixify = False
17241724

17251725
if ixify:

pandas/core/series.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
is_list_like, _values_from_object,
2020
_possibly_cast_to_datetime, _possibly_castable,
2121
_possibly_convert_platform, _try_sort,
22+
is_int64_dtype,
2223
ABCSparseArray, _maybe_match_name,
2324
_coerce_to_dtype, SettingWithCopyError,
2425
_maybe_box_datetimelike, ABCDataFrame,
@@ -2250,17 +2251,22 @@ def isin(self, values):
22502251

22512252
# may need i8 conversion for proper membership testing
22522253
comps = _values_from_object(self)
2254+
f = lib.ismember
22532255
if com.is_datetime64_dtype(self):
22542256
from pandas.tseries.tools import to_datetime
22552257
values = Series(to_datetime(values)).values.view('i8')
22562258
comps = comps.view('i8')
2259+
f = lib.ismember_int64
22572260
elif com.is_timedelta64_dtype(self):
22582261
from pandas.tseries.timedeltas import to_timedelta
22592262
values = Series(to_timedelta(values)).values.view('i8')
22602263
comps = comps.view('i8')
2264+
f = lib.ismember_int64
2265+
elif is_int64_dtype(self):
2266+
f = lib.ismember_int64
22612267

22622268
value_set = set(values)
2263-
result = lib.ismember(comps, value_set)
2269+
result = f(comps, value_set)
22642270
return self._constructor(result, index=self.index).__finalize__(self)
22652271

22662272
def between(self, left, right, inclusive=True):

pandas/lib.pyx

+25
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,31 @@ def ismember(ndarray arr, set values):
156156

157157
return result.view(np.bool_)
158158

159+
def ismember_int64(ndarray[int64_t] arr, set values):
160+
'''
161+
Checks whether each element of ``arr`` is a member of ``values``
162+
163+
Parameters
164+
----------
165+
arr : ndarray of int64
166+
values : set
167+
168+
Returns
169+
-------
170+
ismember : ndarray (boolean dtype)
171+
'''
172+
cdef:
173+
Py_ssize_t i, n
174+
ndarray[uint8_t] result
175+
int64_t v
176+
177+
n = len(arr)
178+
result = np.empty(n, dtype=np.uint8)
179+
for i in range(n):
180+
result[i] = arr[i] in values
181+
182+
return result.view(np.bool_)
183+
159184
#----------------------------------------------------------------------
160185
# datetime / io related
161186

pandas/tests/test_indexing.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -2293,6 +2293,7 @@ def f():
22932293
index=pd.MultiIndex.from_product([['A','B','C'],['foo']],
22942294
names=['one','two'])
22952295
).sortlevel()
2296+
22962297
result = s.loc[idx[:,['foo']]]
22972298
assert_series_equal(result,expected)
22982299
result = s.loc[idx[:,['foo','bah']]]
@@ -2304,9 +2305,9 @@ def f():
23042305
df = DataFrame(np.random.randn(5, 6), index=range(5), columns=multi_index)
23052306
df = df.sortlevel(0, axis=1)
23062307

2308+
expected = DataFrame(index=range(5),columns=multi_index.reindex([])[0])
23072309
result1 = df.loc[:, ([], slice(None))]
23082310
result2 = df.loc[:, (['foo'], [])]
2309-
expected = DataFrame(index=range(5),columns=multi_index.reindex([])[0])
23102311
assert_frame_equal(result1, expected)
23112312
assert_frame_equal(result2, expected)
23122313

pandas/tseries/base.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -449,7 +449,7 @@ def isin(self, values):
449449
return self.asobject.isin(values)
450450

451451
value_set = set(values.asi8)
452-
return lib.ismember(self.asi8, value_set)
452+
return lib.ismember_int64(self.asi8, value_set)
453453

454454
def shift(self, n, freq=None):
455455
"""

vb_suite/indexing.py

+1
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@
239239

240240
# multi-index slicing
241241
setup = common_setup + """
242+
np.random.seed(1234)
242243
idx=pd.IndexSlice
243244
n=100000
244245
mdt = pandas.DataFrame()

vb_suite/series_methods.py

+10
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
setup = common_setup + """
88
s1 = Series(np.random.randn(10000))
99
s2 = Series(np.random.randint(1, 10, 10000))
10+
s3 = Series(np.random.randint(1, 10, 100000)).astype('int64')
11+
values = [1,2]
12+
s4 = s3.astype('object')
1013
"""
1114

1215
series_nlargest1 = Benchmark('s1.nlargest(3, take_last=True);'
@@ -27,3 +30,10 @@
2730
's2.nsmallest(3, take_last=False)',
2831
setup,
2932
start_date=datetime(2014, 1, 25))
33+
34+
series_isin_int64 = Benchmark('s3.isin(values)',
35+
setup,
36+
start_date=datetime(2014, 1, 25))
37+
series_isin_object = Benchmark('s4.isin(values)',
38+
setup,
39+
start_date=datetime(2014, 1, 25))

0 commit comments

Comments
 (0)