
Loc enhancements #22826

Merged (3 commits, Oct 1, 2018)
75 changes: 45 additions & 30 deletions asv_bench/benchmarks/indexing.py
@@ -11,95 +11,110 @@
class NumericSeriesIndexing(object):

goal_time = 0.2
params = [Int64Index, Float64Index]
param = ['index']
params = [
(Int64Index, Float64Index),
('unique_monotonic_inc', 'nonunique_monotonic_inc'),
]
param_names = ['index_dtype', 'index_structure']

def setup(self, index):
def setup(self, index, index_structure):
N = 10**6
idx = index(range(N))
self.data = Series(np.random.rand(N), index=idx)
indices = {
'unique_monotonic_inc': index(range(N)),
'nonunique_monotonic_inc': index(
list(range(55)) + [54] + list(range(55, N - 1))),
}
self.data = Series(np.random.rand(N), index=indices[index_structure])
self.array = np.arange(10000)
self.array_list = self.array.tolist()

def time_getitem_scalar(self, index):
def time_getitem_scalar(self, index, index_structure):
self.data[800000]

def time_getitem_slice(self, index):
def time_getitem_slice(self, index, index_structure):
self.data[:800000]

def time_getitem_list_like(self, index):
def time_getitem_list_like(self, index, index_structure):
self.data[[800000]]

def time_getitem_array(self, index):
def time_getitem_array(self, index, index_structure):
self.data[self.array]

def time_getitem_lists(self, index):
def time_getitem_lists(self, index, index_structure):
self.data[self.array_list]

def time_iloc_array(self, index):
def time_iloc_array(self, index, index_structure):
self.data.iloc[self.array]

def time_iloc_list_like(self, index):
def time_iloc_list_like(self, index, index_structure):
self.data.iloc[[800000]]

def time_iloc_scalar(self, index):
def time_iloc_scalar(self, index, index_structure):
self.data.iloc[800000]

def time_iloc_slice(self, index):
def time_iloc_slice(self, index, index_structure):
self.data.iloc[:800000]

def time_ix_array(self, index):
def time_ix_array(self, index, index_structure):
self.data.ix[self.array]

def time_ix_list_like(self, index):
def time_ix_list_like(self, index, index_structure):
self.data.ix[[800000]]

def time_ix_scalar(self, index):
def time_ix_scalar(self, index, index_structure):
self.data.ix[800000]

def time_ix_slice(self, index):
def time_ix_slice(self, index, index_structure):
self.data.ix[:800000]

def time_loc_array(self, index):
def time_loc_array(self, index, index_structure):
self.data.loc[self.array]

def time_loc_list_like(self, index):
def time_loc_list_like(self, index, index_structure):
self.data.loc[[800000]]

def time_loc_scalar(self, index):
def time_loc_scalar(self, index, index_structure):
self.data.loc[800000]

def time_loc_slice(self, index):
def time_loc_slice(self, index, index_structure):
self.data.loc[:800000]


class NonNumericSeriesIndexing(object):

goal_time = 0.2
params = ['string', 'datetime']
param_names = ['index']
params = [
('string', 'datetime'),
('unique_monotonic_inc', 'nonunique_monotonic_inc'),
]
param_names = ['index_dtype', 'index_structure']

def setup(self, index):
N = 10**5
def setup(self, index, index_structure):
N = 10**6
indexes = {'string': tm.makeStringIndex(N),
'datetime': date_range('1900', periods=N, freq='s')}
index = indexes[index]
if index_structure == 'nonunique_monotonic_inc':
index = index.insert(item=index[2], loc=2)[:-1]
self.s = Series(np.random.rand(N), index=index)
self.lbl = index[80000]

def time_getitem_label_slice(self, index):
def time_getitem_label_slice(self, index, index_structure):
self.s[:self.lbl]

def time_getitem_pos_slice(self, index):
def time_getitem_pos_slice(self, index, index_structure):
self.s[:80000]

def time_get_value(self, index):
def time_get_value(self, index, index_structure):
with warnings.catch_warnings(record=True):
self.s.get_value(self.lbl)

def time_getitem_scalar(self, index):
def time_getitem_scalar(self, index, index_structure):
self.s[self.lbl]

def time_getitem_list_like(self, index, index_structure):
self.s[[self.lbl]]


class DataFrameStringIndexing(object):

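As a side note, a minimal sketch (not part of the benchmark suite) of what the two index_structure cases above look like; pd.Index is used here for brevity in place of the Int64Index/Float64Index constructors parametrized above:

import numpy as np
import pandas as pd

N = 10**6

# 'unique_monotonic_inc': strictly increasing, no duplicates
unique_idx = pd.Index(range(N))

# 'nonunique_monotonic_inc': still monotonically increasing, but 54 appears twice
nonunique_idx = pd.Index(list(range(55)) + [54] + list(range(55, N - 1)))

assert unique_idx.is_unique and unique_idx.is_monotonic_increasing
assert not nonunique_idx.is_unique and nonunique_idx.is_monotonic_increasing

# list-like lookups against the non-unique index go through
# IndexEngine.get_indexer_non_unique, the code path changed in this PR
s = pd.Series(np.random.rand(N), index=nonunique_idx)
s.loc[[54]]  # two rows, both labelled 54
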
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
@@ -604,6 +604,8 @@ Performance Improvements
:meth:`~HDFStore.keys`. (i.e. ``x in store`` checks are much faster)
(:issue:`21372`)
- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`)
- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`)


.. _whatsnew_0240.docs:

25 changes: 17 additions & 8 deletions pandas/_libs/index.pyx
@@ -294,14 +294,23 @@ cdef class IndexEngine:
result = np.empty(n_alloc, dtype=np.int64)
missing = np.empty(n_t, dtype=np.int64)

# form the set of the results (like ismember)
members = np.empty(n, dtype=np.uint8)
for i in range(n):
val = values[i]
if val in stargets:
if val not in d:
d[val] = []
d[val].append(i)
# map each starget to its position in the index
Contributor:

what if you drop the len(stargets) < 5 and just use it if its monotonic_increasing? does the small case actually make any difference here?

Contributor Author:

If you drop the len(stargets) < 5, then we'd be running a binary search against the index for each item in a potentially large set of targets -- runtime should be O(m log n) where m is the number of items in the set and n is the length of the index. Presumably, this would be slower when m is large enough compared to the current behavior which is to run through each item in the index and check if it is in the set of targets, which should be O(n) assuming constant time checks for whether an item is in the set of targets. Thoughts?
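
A rough pure-Python sketch of the two strategies described above (linear scan versus per-target binary search); the helper names are illustrative and this is not the actual Cython implementation:

import numpy as np

def map_targets_linear(values, stargets):
    # O(n): walk the whole index once, checking membership in the target set
    d = {}
    for i, val in enumerate(values):
        if val in stargets:
            d.setdefault(val, []).append(i)
    return d

def map_targets_binary_search(values, stargets):
    # O(m log n): values must be sorted; binary-search each target's range
    d = {}
    for starget in stargets:
        start = values.searchsorted(starget, side='left')
        end = values.searchsorted(starget, side='right')
        if start != end:
            d[starget] = list(range(start, end))
    return d

values = np.array([0, 1, 2, 2, 3, 5, 5, 5, 9])  # sorted, non-unique
stargets = {2, 5}
assert map_targets_linear(values, stargets) == map_targets_binary_search(values, stargets)
# both give {2: [2, 3], 5: [5, 6, 7]}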

Contributor:

can you add a case where this is true in the asv's and compare?

Contributor Author:

yes, in each of the following asv's stargets is either length one or length two

     [ab9dbd64]       [b704c5bb]
     <master>         <loc-enhancements>
-         383±4ms          211±3ms     0.55  indexing.NonNumericSeriesIndexing.time_getitem_list_like('datetime', 'nonunique_monotonic_inc')
-        59.0±2ms         11.9±1ms     0.20  indexing.CategoricalIndexIndexing.time_get_indexer_list('monotonic_incr')
-      69.4±0.6ms          445±3μs     0.01  indexing.NumericSeriesIndexing.time_getitem_list_like(<class 'pandas.core.indexes.numeric.Int64Index'>, 'nonunique_monotonic_inc')
-      66.3±0.3ms          423±1μs     0.01  indexing.NumericSeriesIndexing.time_getitem_list_like(<class 'pandas.core.indexes.numeric.Float64Index'>, 'nonunique_monotonic_inc')
-      66.1±0.6ms          320±2μs     0.00  indexing.NumericSeriesIndexing.time_ix_list_like(<class 'pandas.core.indexes.numeric.Float64Index'>, 'nonunique_monotonic_inc')
-      69.2±0.4ms          330±3μs     0.00  indexing.NumericSeriesIndexing.time_ix_list_like(<class 'pandas.core.indexes.numeric.Int64Index'>, 'nonunique_monotonic_inc')
-      65.7±0.3ms          286±3μs     0.00  indexing.NumericSeriesIndexing.time_loc_list_like(<class 'pandas.core.indexes.numeric.Float64Index'>, 'nonunique_monotonic_inc')
-      69.3±0.5ms          295±2μs     0.00  indexing.NumericSeriesIndexing.time_loc_list_like(<class 'pandas.core.indexes.numeric.Int64Index'>, 'nonunique_monotonic_inc')

SOME BENCHMARKS HAVE CHANGED SIGNIFICANTLY.

if stargets and len(stargets) < 5 and self.is_monotonic_increasing:
# if there are few enough stargets and the index is monotonically
# increasing, then use binary search for each starget
for starget in stargets:
start = values.searchsorted(starget, side='left')
end = values.searchsorted(starget, side='right')
if start != end:
d[starget] = list(range(start, end))
else:
# otherwise, map by iterating through all items in the index
for i in range(n):
val = values[i]
if val in stargets:
if val not in d:
d[val] = []
d[val].append(i)

for i in range(n_t):
val = targets[i]
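
For reference, a small example of what this code path computes at the Index level (the expected outputs below are my reading of the semantics and worth verifying interactively); the fast path added above applies when the index is monotonically increasing and only a handful of distinct targets are requested:

import pandas as pd

idx = pd.Index([0, 1, 2, 2, 3, 5, 5, 5, 9])  # sorted, non-unique
indexer, missing = idx.get_indexer_non_unique([2, 5, 7])

# indexer: positions of each target, in target order, -1 where not found
# expected: [2, 3, 5, 6, 7, -1]
# missing: positions within the target of labels not present in the index
# expected: [2]  (the label 7)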