Skip to content

Commit 07c83ee

Browse files
committed
PERF: fix getitem unique_check / initialization issue
Closes pandas-dev#14930. Author: Jeff Reback <[email protected]>. Closes pandas-dev#14933 from jreback/perf and squashes the following commits: dc32b39 [Jeff Reback] PERF: fix getitem unique_check / initialization issue
1 parent 0a7cd97 commit 07c83ee

File tree

3 files changed

+40
-34
lines changed

3 files changed

+40
-34
lines changed

asv_bench/benchmarks/frame_methods.py

+7
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ class Iteration(object):
6868
def setup(self):
6969
self.df = DataFrame(randn(10000, 1000))
7070
self.df2 = DataFrame(np.random.randn(50000, 10))
71+
self.df3 = pd.DataFrame(np.random.randn(1000,5000),
72+
columns=['C'+str(c) for c in range(5000)])
7173

7274
def f(self):
7375
if hasattr(self.df, '_item_cache'):
@@ -85,6 +87,11 @@ def time_iteritems(self):
8587
def time_iteritems_cached(self):
8688
self.g()
8789

90+
def time_iteritems_indexing(self):
91+
df = self.df3
92+
for col in df:
93+
df[col]
94+
8895
def time_itertuples(self):
8996
for row in self.df2.itertuples():
9097
pass

doc/source/whatsnew/v0.19.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ Performance Improvements
2323

2424
- Improved performance of ``.replace()`` (:issue:`12745`)
2525
- Improved performance of ``PeriodIndex`` (:issue:`14822`)
26+
- Fixed performance regression in indexing with getitem (:issue:`14930`)
2627
- Improved performance of ``Series`` creation with a datetime index and dictionary data (:issue:`14894`)
2728

2829
.. _whatsnew_0192.enhancements.other:

pandas/index.pyx

+32-34
Original file line numberDiff line numberDiff line change
@@ -82,20 +82,13 @@ cdef class IndexEngine:
8282

8383
cdef:
8484
bint unique, monotonic_inc, monotonic_dec
85-
bint initialized, monotonic_check, unique_check
85+
bint need_monotonic_check, need_unique_check
8686

8787
def __init__(self, vgetter, n):
8888
self.vgetter = vgetter
8989

9090
self.over_size_threshold = n >= _SIZE_CUTOFF
91-
92-
self.initialized = 0
93-
self.monotonic_check = 0
94-
self.unique_check = 0
95-
96-
self.unique = 0
97-
self.monotonic_inc = 0
98-
self.monotonic_dec = 0
91+
self.clear_mapping()
9992

10093
def __contains__(self, object val):
10194
self._ensure_mapping_populated()
@@ -213,24 +206,28 @@ cdef class IndexEngine:
213206
property is_unique:
214207

215208
def __get__(self):
216-
if not self.initialized:
217-
self.initialize()
209+
if self.need_unique_check:
210+
self._do_unique_check()
218211

219-
self.unique_check = 1
220212
return self.unique == 1
221213

214+
cdef inline _do_unique_check(self):
215+
216+
# this is de facto the same as ensuring the mapping is populated
217+
self._ensure_mapping_populated()
218+
222219
property is_monotonic_increasing:
223220

224221
def __get__(self):
225-
if not self.monotonic_check:
222+
if self.need_monotonic_check:
226223
self._do_monotonic_check()
227224

228225
return self.monotonic_inc == 1
229226

230227
property is_monotonic_decreasing:
231228

232229
def __get__(self):
233-
if not self.monotonic_check:
230+
if self.need_monotonic_check:
234231
self._do_monotonic_check()
235232

236233
return self.monotonic_dec == 1
@@ -246,13 +243,12 @@ cdef class IndexEngine:
246243
self.monotonic_dec = 0
247244
is_unique = 0
248245

249-
self.monotonic_check = 1
246+
self.need_monotonic_check = 0
250247

251248
# we can only be sure of uniqueness if is_unique=1
252249
if is_unique:
253-
self.initialized = 1
254250
self.unique = 1
255-
self.unique_check = 1
251+
self.need_unique_check = 0
256252

257253
cdef _get_index_values(self):
258254
return self.vgetter()
@@ -266,30 +262,32 @@ cdef class IndexEngine:
266262
cdef _check_type(self, object val):
267263
hash(val)
268264

265+
property is_mapping_populated:
266+
267+
def __get__(self):
268+
return self.mapping is not None
269+
269270
cdef inline _ensure_mapping_populated(self):
270-
# need to reset if we have previously
271-
# set the initialized from monotonic checks
272-
if self.unique_check:
273-
self.initialized = 0
274-
if not self.initialized:
275-
self.initialize()
276-
277-
cdef initialize(self):
278-
values = self._get_index_values()
271+
# this populates the mapping
272+
# if it's not already populated
273+
# also satisfies the need_unique_check
279274

280-
self.mapping = self._make_hash_table(len(values))
281-
self.mapping.map_locations(values)
275+
if not self.is_mapping_populated:
282276

283-
if len(self.mapping) == len(values):
284-
self.unique = 1
277+
values = self._get_index_values()
278+
279+
self.mapping = self._make_hash_table(len(values))
280+
self.mapping.map_locations(values)
281+
282+
if len(self.mapping) == len(values):
283+
self.unique = 1
285284

286-
self.initialized = 1
285+
self.need_unique_check = 0
287286

288287
def clear_mapping(self):
289288
self.mapping = None
290-
self.initialized = 0
291-
self.monotonic_check = 0
292-
self.unique_check = 0
289+
self.need_monotonic_check = 1
290+
self.need_unique_check = 1
293291

294292
self.unique = 0
295293
self.monotonic_inc = 0

0 commit comments

Comments
 (0)