Skip to content

PERF: fix getitem unique_check / initialization issue #14933

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ class Iteration(object):
def setup(self):
self.df = DataFrame(randn(10000, 1000))
self.df2 = DataFrame(np.random.randn(50000, 10))
self.df3 = pd.DataFrame(np.random.randn(1000,5000),
columns=['C'+str(c) for c in range(5000)])

def f(self):
if hasattr(self.df, '_item_cache'):
Expand All @@ -85,6 +87,11 @@ def time_iteritems(self):
def time_iteritems_cached(self):
self.g()

def time_iteritems_indexing(self):
df = self.df3
for col in df:
df[col]

def time_itertuples(self):
for row in self.df2.itertuples():
pass
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.19.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Performance Improvements

- Improved performance of ``.replace()`` (:issue:`12745`)
- Improved performance of ``PeriodIndex`` (:issue:`14822`)
- Performance regression in indexing with getitem (:issue:`14930`)
- Improved performance ``Series`` creation with a datetime index and dictionary data (:issue:`14894`)

.. _whatsnew_0192.enhancements.other:
Expand Down
66 changes: 32 additions & 34 deletions pandas/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -82,20 +82,13 @@ cdef class IndexEngine:

cdef:
bint unique, monotonic_inc, monotonic_dec
bint initialized, monotonic_check, unique_check
bint need_monotonic_check, need_unique_check

def __init__(self, vgetter, n):
self.vgetter = vgetter

self.over_size_threshold = n >= _SIZE_CUTOFF

self.initialized = 0
self.monotonic_check = 0
self.unique_check = 0

self.unique = 0
self.monotonic_inc = 0
self.monotonic_dec = 0
self.clear_mapping()

def __contains__(self, object val):
self._ensure_mapping_populated()
Expand Down Expand Up @@ -213,24 +206,28 @@ cdef class IndexEngine:
property is_unique:

def __get__(self):
if not self.initialized:
self.initialize()
if self.need_unique_check:
self._do_unique_check()

self.unique_check = 1
return self.unique == 1

cdef inline _do_unique_check(self):

# this de-facto the same
self._ensure_mapping_populated()

property is_monotonic_increasing:

def __get__(self):
if not self.monotonic_check:
if self.need_monotonic_check:
self._do_monotonic_check()

return self.monotonic_inc == 1

property is_monotonic_decreasing:

def __get__(self):
if not self.monotonic_check:
if self.need_monotonic_check:
self._do_monotonic_check()

return self.monotonic_dec == 1
Expand All @@ -246,13 +243,12 @@ cdef class IndexEngine:
self.monotonic_dec = 0
is_unique = 0

self.monotonic_check = 1
self.need_monotonic_check = 0

# we can only be sure of uniqueness if is_unique=1
if is_unique:
self.initialized = 1
self.unique = 1
self.unique_check = 1
self.need_unique_check = 0

cdef _get_index_values(self):
return self.vgetter()
Expand All @@ -266,30 +262,32 @@ cdef class IndexEngine:
cdef _check_type(self, object val):
hash(val)

property is_mapping_populated:

def __get__(self):
return self.mapping is not None

cdef inline _ensure_mapping_populated(self):
# need to reset if we have previously
# set the initialized from monotonic checks
if self.unique_check:
self.initialized = 0
if not self.initialized:
self.initialize()

cdef initialize(self):
values = self._get_index_values()
# this populates the mapping
# if its not already populated
# also satisfies the need_unique_check

self.mapping = self._make_hash_table(len(values))
self.mapping.map_locations(values)
if not self.is_mapping_populated:

if len(self.mapping) == len(values):
self.unique = 1
values = self._get_index_values()

self.mapping = self._make_hash_table(len(values))
self.mapping.map_locations(values)

if len(self.mapping) == len(values):
self.unique = 1

self.initialized = 1
self.need_unique_check = 0

def clear_mapping(self):
self.mapping = None
self.initialized = 0
self.monotonic_check = 0
self.unique_check = 0
self.need_monotonic_check = 1
self.need_unique_check = 1

self.unique = 0
self.monotonic_inc = 0
Expand Down