Skip to content

Commit 3c96442

Browse files
committed
PERF: use uniqueness_check from monotonic check when possible
closes #14266 Author: Jeff Reback <[email protected]> Closes #14270 from jreback/memory and squashes the following commits: 968a4f7 [Jeff Reback] PERF: use uniqueness_check from monotonic check when possible
1 parent dfb6373 commit 3c96442

File tree

5 files changed

+89
-55
lines changed

5 files changed

+89
-55
lines changed

doc/source/whatsnew/v0.19.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1405,6 +1405,7 @@ Performance Improvements
14051405
- Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`)
14061406
- Improved performance of hashing ``Period`` (:issue:`12817`)
14071407
- Improved performance of ``factorize`` of datetime with timezone (:issue:`13750`)
1408+
- Improved performance by lazily creating indexing hashtables on larger Indexes (:issue:`14266`)
14081409

14091410

14101411
.. _whatsnew_0190.bug_fixes:
@@ -1423,7 +1424,6 @@ Bug Fixes
14231424
- Bug in selection from a ``HDFStore`` with a fixed format: specifying ``start`` and/or ``stop`` will now return the selected range (:issue:`8287`)
14241425
- Bug in ``Categorical.from_codes()`` where an unhelpful error was raised when an invalid ``ordered`` parameter was passed in (:issue:`14058`)
14251426
- Bug in ``Series`` construction from a tuple of integers on Windows not returning default dtype (int64) (:issue:`13646`)
1426-
14271427
- Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`)
14281428
- Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`)
14291429

pandas/index.pyx

+24-5
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ cdef class IndexEngine:
8282

8383
cdef:
8484
bint unique, monotonic_inc, monotonic_dec
85-
bint initialized, monotonic_check
85+
bint initialized, monotonic_check, unique_check
8686

8787
def __init__(self, vgetter, n):
8888
self.vgetter = vgetter
@@ -91,6 +91,7 @@ cdef class IndexEngine:
9191

9292
self.initialized = 0
9393
self.monotonic_check = 0
94+
self.unique_check = 0
9495

9596
self.unique = 0
9697
self.monotonic_inc = 0
@@ -177,8 +178,8 @@ cdef class IndexEngine:
177178
return left
178179
else:
179180
return slice(left, right)
180-
else:
181-
return self._maybe_get_bool_indexer(val)
181+
182+
return self._maybe_get_bool_indexer(val)
182183

183184
cdef _maybe_get_bool_indexer(self, object val):
184185
cdef:
@@ -215,6 +216,7 @@ cdef class IndexEngine:
215216
if not self.initialized:
216217
self.initialize()
217218

219+
self.unique_check = 1
218220
return self.unique == 1
219221

220222
property is_monotonic_increasing:
@@ -234,16 +236,24 @@ cdef class IndexEngine:
234236
return self.monotonic_dec == 1
235237

236238
cdef inline _do_monotonic_check(self):
239+
cdef object is_unique
237240
try:
238241
values = self._get_index_values()
239-
self.monotonic_inc, self.monotonic_dec = \
242+
self.monotonic_inc, self.monotonic_dec, is_unique = \
240243
self._call_monotonic(values)
241244
except TypeError:
242245
self.monotonic_inc = 0
243246
self.monotonic_dec = 0
247+
is_unique = 0
244248

245249
self.monotonic_check = 1
246250

251+
# we can only be sure of uniqueness if is_unique=1
252+
if is_unique:
253+
self.initialized = 1
254+
self.unique = 1
255+
self.unique_check = 1
256+
247257
cdef _get_index_values(self):
248258
return self.vgetter()
249259

@@ -257,6 +267,10 @@ cdef class IndexEngine:
257267
hash(val)
258268

259269
cdef inline _ensure_mapping_populated(self):
270+
# need to reset if we have previously
271+
# set the initialized flag from the monotonic checks
272+
if self.unique_check:
273+
self.initialized = 0
260274
if not self.initialized:
261275
self.initialize()
262276

@@ -274,6 +288,12 @@ cdef class IndexEngine:
274288
def clear_mapping(self):
275289
self.mapping = None
276290
self.initialized = 0
291+
self.monotonic_check = 0
292+
self.unique_check = 0
293+
294+
self.unique = 0
295+
self.monotonic_inc = 0
296+
self.monotonic_dec = 0
277297

278298
def get_indexer(self, values):
279299
self._ensure_mapping_populated()
@@ -537,7 +557,6 @@ cdef class DatetimeEngine(Int64Engine):
537557
raise TypeError
538558

539559
# Welcome to the spaghetti factory
540-
541560
if self.over_size_threshold and self.is_monotonic_increasing:
542561
if not self.is_unique:
543562
val = _to_i8(val)

0 commit comments

Comments
 (0)