Skip to content

Commit 968a4f7

Browse files
committed
PERF: use uniqueness_check from monotonic check when possible
closes #14266
1 parent a7469cf commit 968a4f7

File tree

5 files changed

+89
-55
lines changed

5 files changed

+89
-55
lines changed

doc/source/whatsnew/v0.19.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1404,6 +1404,7 @@ Performance Improvements
14041404
- Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`)
14051405
- Improved performance of hashing ``Period`` (:issue:`12817`)
14061406
- Improved performance of ``factorize`` of datetime with timezone (:issue:`13750`)
1407+
- Improved performance by lazily creating indexing hashtables on larger Indexes (:issue:`14266`)
14071408

14081409

14091410
.. _whatsnew_0190.bug_fixes:
@@ -1422,7 +1423,6 @@ Bug Fixes
14221423
- Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`)
14231424
- Bug in ``Categorical.from_codes()`` where an unhelpful error was raised when an invalid ``ordered`` parameter was passed in (:issue:`14058`)
14241425
- Bug in ``Series`` construction from a tuple of integers on windows not returning default dtype (int64) (:issue:`13646`)
1425-
14261426
- Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`)
14271427
- Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`)
14281428

pandas/index.pyx

+24-5
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ cdef class IndexEngine:
8282

8383
cdef:
8484
bint unique, monotonic_inc, monotonic_dec
85-
bint initialized, monotonic_check
85+
bint initialized, monotonic_check, unique_check
8686

8787
def __init__(self, vgetter, n):
8888
self.vgetter = vgetter
@@ -91,6 +91,7 @@ cdef class IndexEngine:
9191

9292
self.initialized = 0
9393
self.monotonic_check = 0
94+
self.unique_check = 0
9495

9596
self.unique = 0
9697
self.monotonic_inc = 0
@@ -177,8 +178,8 @@ cdef class IndexEngine:
177178
return left
178179
else:
179180
return slice(left, right)
180-
else:
181-
return self._maybe_get_bool_indexer(val)
181+
182+
return self._maybe_get_bool_indexer(val)
182183

183184
cdef _maybe_get_bool_indexer(self, object val):
184185
cdef:
@@ -215,6 +216,7 @@ cdef class IndexEngine:
215216
if not self.initialized:
216217
self.initialize()
217218

219+
self.unique_check = 1
218220
return self.unique == 1
219221

220222
property is_monotonic_increasing:
@@ -234,16 +236,24 @@ cdef class IndexEngine:
234236
return self.monotonic_dec == 1
235237

236238
cdef inline _do_monotonic_check(self):
239+
cdef object is_unique
237240
try:
238241
values = self._get_index_values()
239-
self.monotonic_inc, self.monotonic_dec = \
242+
self.monotonic_inc, self.monotonic_dec, is_unique = \
240243
self._call_monotonic(values)
241244
except TypeError:
242245
self.monotonic_inc = 0
243246
self.monotonic_dec = 0
247+
is_unique = 0
244248

245249
self.monotonic_check = 1
246250

251+
# we can only be sure of uniqueness if is_unique=1
252+
if is_unique:
253+
self.initialized = 1
254+
self.unique = 1
255+
self.unique_check = 1
256+
247257
cdef _get_index_values(self):
248258
return self.vgetter()
249259

@@ -257,6 +267,10 @@ cdef class IndexEngine:
257267
hash(val)
258268

259269
cdef inline _ensure_mapping_populated(self):
270+
# need to reset if we have previously
271+
# set the ``initialized`` flag from the monotonic check
272+
if self.unique_check:
273+
self.initialized = 0
260274
if not self.initialized:
261275
self.initialize()
262276

@@ -274,6 +288,12 @@ cdef class IndexEngine:
274288
def clear_mapping(self):
275289
self.mapping = None
276290
self.initialized = 0
291+
self.monotonic_check = 0
292+
self.unique_check = 0
293+
294+
self.unique = 0
295+
self.monotonic_inc = 0
296+
self.monotonic_dec = 0
277297

278298
def get_indexer(self, values):
279299
self._ensure_mapping_populated()
@@ -537,7 +557,6 @@ cdef class DatetimeEngine(Int64Engine):
537557
raise TypeError
538558

539559
# Welcome to the spaghetti factory
540-
541560
if self.over_size_threshold and self.is_monotonic_increasing:
542561
if not self.is_unique:
543562
val = _to_i8(val)

0 commit comments

Comments
 (0)