Skip to content

Commit b92b043

Browse files
rtlee9 authored and jreback committed
Loc enhancements (#22826)
1 parent 4226d74 commit b92b043

File tree

3 files changed

+64
-38
lines changed

3 files changed

+64
-38
lines changed

asv_bench/benchmarks/indexing.py

+45 -30
Original file line number | Diff line number | Diff line change
@@ -11,95 +11,110 @@
1111
class NumericSeriesIndexing(object):
1212

1313
goal_time = 0.2
14-
params = [Int64Index, Float64Index]
15-
param = ['index']
14+
params = [
15+
(Int64Index, Float64Index),
16+
('unique_monotonic_inc', 'nonunique_monotonic_inc'),
17+
]
18+
param_names = ['index_dtype', 'index_structure']
1619

17-
def setup(self, index):
20+
def setup(self, index, index_structure):
1821
N = 10**6
19-
idx = index(range(N))
20-
self.data = Series(np.random.rand(N), index=idx)
22+
indices = {
23+
'unique_monotonic_inc': index(range(N)),
24+
'nonunique_monotonic_inc': index(
25+
list(range(55)) + [54] + list(range(55, N - 1))),
26+
}
27+
self.data = Series(np.random.rand(N), index=indices[index_structure])
2128
self.array = np.arange(10000)
2229
self.array_list = self.array.tolist()
2330

24-
def time_getitem_scalar(self, index):
31+
def time_getitem_scalar(self, index, index_structure):
2532
self.data[800000]
2633

27-
def time_getitem_slice(self, index):
34+
def time_getitem_slice(self, index, index_structure):
2835
self.data[:800000]
2936

30-
def time_getitem_list_like(self, index):
37+
def time_getitem_list_like(self, index, index_structure):
3138
self.data[[800000]]
3239

33-
def time_getitem_array(self, index):
40+
def time_getitem_array(self, index, index_structure):
3441
self.data[self.array]
3542

36-
def time_getitem_lists(self, index):
43+
def time_getitem_lists(self, index, index_structure):
3744
self.data[self.array_list]
3845

39-
def time_iloc_array(self, index):
46+
def time_iloc_array(self, index, index_structure):
4047
self.data.iloc[self.array]
4148

42-
def time_iloc_list_like(self, index):
49+
def time_iloc_list_like(self, index, index_structure):
4350
self.data.iloc[[800000]]
4451

45-
def time_iloc_scalar(self, index):
52+
def time_iloc_scalar(self, index, index_structure):
4653
self.data.iloc[800000]
4754

48-
def time_iloc_slice(self, index):
55+
def time_iloc_slice(self, index, index_structure):
4956
self.data.iloc[:800000]
5057

51-
def time_ix_array(self, index):
58+
def time_ix_array(self, index, index_structure):
5259
self.data.ix[self.array]
5360

54-
def time_ix_list_like(self, index):
61+
def time_ix_list_like(self, index, index_structure):
5562
self.data.ix[[800000]]
5663

57-
def time_ix_scalar(self, index):
64+
def time_ix_scalar(self, index, index_structure):
5865
self.data.ix[800000]
5966

60-
def time_ix_slice(self, index):
67+
def time_ix_slice(self, index, index_structure):
6168
self.data.ix[:800000]
6269

63-
def time_loc_array(self, index):
70+
def time_loc_array(self, index, index_structure):
6471
self.data.loc[self.array]
6572

66-
def time_loc_list_like(self, index):
73+
def time_loc_list_like(self, index, index_structure):
6774
self.data.loc[[800000]]
6875

69-
def time_loc_scalar(self, index):
76+
def time_loc_scalar(self, index, index_structure):
7077
self.data.loc[800000]
7178

72-
def time_loc_slice(self, index):
79+
def time_loc_slice(self, index, index_structure):
7380
self.data.loc[:800000]
7481

7582

7683
class NonNumericSeriesIndexing(object):
7784

7885
goal_time = 0.2
79-
params = ['string', 'datetime']
80-
param_names = ['index']
86+
params = [
87+
('string', 'datetime'),
88+
('unique_monotonic_inc', 'nonunique_monotonic_inc'),
89+
]
90+
param_names = ['index_dtype', 'index_structure']
8191

82-
def setup(self, index):
83-
N = 10**5
92+
def setup(self, index, index_structure):
93+
N = 10**6
8494
indexes = {'string': tm.makeStringIndex(N),
8595
'datetime': date_range('1900', periods=N, freq='s')}
8696
index = indexes[index]
97+
if index_structure == 'nonunique_monotonic_inc':
98+
index = index.insert(item=index[2], loc=2)[:-1]
8799
self.s = Series(np.random.rand(N), index=index)
88100
self.lbl = index[80000]
89101

90-
def time_getitem_label_slice(self, index):
102+
def time_getitem_label_slice(self, index, index_structure):
91103
self.s[:self.lbl]
92104

93-
def time_getitem_pos_slice(self, index):
105+
def time_getitem_pos_slice(self, index, index_structure):
94106
self.s[:80000]
95107

96-
def time_get_value(self, index):
108+
def time_get_value(self, index, index_structure):
97109
with warnings.catch_warnings(record=True):
98110
self.s.get_value(self.lbl)
99111

100-
def time_getitem_scalar(self, index):
112+
def time_getitem_scalar(self, index, index_structure):
101113
self.s[self.lbl]
102114

115+
def time_getitem_list_like(self, index, index_structure):
116+
self.s[[self.lbl]]
117+
103118

104119
class DataFrameStringIndexing(object):
105120

doc/source/whatsnew/v0.24.0.txt

+2
Original file line number | Diff line number | Diff line change
@@ -610,6 +610,8 @@ Performance Improvements
610610
:meth:`~HDFStore.keys`. (i.e. ``x in store`` checks are much faster)
611611
(:issue:`21372`)
612612
- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`)
613+
- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`)
614+
613615

614616
.. _whatsnew_0240.docs:
615617

pandas/_libs/index.pyx

+17 -8
Original file line number | Diff line number | Diff line change
@@ -294,14 +294,23 @@ cdef class IndexEngine:
294294
result = np.empty(n_alloc, dtype=np.int64)
295295
missing = np.empty(n_t, dtype=np.int64)
296296

297-
# form the set of the results (like ismember)
298-
members = np.empty(n, dtype=np.uint8)
299-
for i in range(n):
300-
val = values[i]
301-
if val in stargets:
302-
if val not in d:
303-
d[val] = []
304-
d[val].append(i)
297+
# map each starget to its position in the index
298+
if stargets and len(stargets) < 5 and self.is_monotonic_increasing:
299+
# if there are few enough stargets and the index is monotonically
300+
# increasing, then use binary search for each starget
301+
for starget in stargets:
302+
start = values.searchsorted(starget, side='left')
303+
end = values.searchsorted(starget, side='right')
304+
if start != end:
305+
d[starget] = list(range(start, end))
306+
else:
307+
# otherwise, map by iterating through all items in the index
308+
for i in range(n):
309+
val = values[i]
310+
if val in stargets:
311+
if val not in d:
312+
d[val] = []
313+
d[val].append(i)
305314

306315
for i in range(n_t):
307316
val = targets[i]

0 commit comments

Comments
 (0)