Skip to content

Commit a25ffc3

Browse files
committed
PERF: fix iteritems performance regression in unique case. #2336
1 parent 1037fc0 commit a25ffc3

File tree

5 files changed

+33
-38
lines changed

5 files changed

+33
-38
lines changed

pandas/core/algorithms.py

+7 -7
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,13 @@ def unique(values):
5656
return _hashtable_algo(f, values.dtype)
5757

5858

59-
def count(values, uniques=None):
60-
f = lambda htype, caster: _count_generic(values, htype, caster)
59+
# def count(values, uniques=None):
60+
# f = lambda htype, caster: _count_generic(values, htype, caster)
6161

62-
if uniques is not None:
63-
raise NotImplementedError
64-
else:
65-
return _hashtable_algo(f, values.dtype)
62+
# if uniques is not None:
63+
# raise NotImplementedError
64+
# else:
65+
# return _hashtable_algo(f, values.dtype)
6666

6767

6868
def _hashtable_algo(f, dtype):
@@ -82,7 +82,7 @@ def _count_generic(values, table_type, type_caster):
8282

8383
values = type_caster(values)
8484
table = table_type(min(len(values), 1000000))
85-
uniques, labels, counts = table.factorize(values)
85+
uniques, labels = table.factorize(values)
8686

8787
return Series(counts, index=uniques)
8888

pandas/core/frame.py

+8 -11
Original file line numberDiff line numberDiff line change
@@ -681,8 +681,12 @@ def keys(self):
681681

682682
def iteritems(self):
683683
"""Iterator over (column, series) pairs"""
684-
for i, k in enumerate(self.columns):
685-
yield k, self.icol(i)
684+
if self.columns.is_unique and hasattr(self, '_item_cache'):
685+
for k in self.columns:
686+
yield k, self._get_item_cache(k)
687+
else:
688+
for i, k in enumerate(self.columns):
689+
yield k, self.icol(i)
686690

687691
def iterrows(self):
688692
"""
@@ -1829,15 +1833,8 @@ def icol(self, i):
18291833
return self.ix[:, i]
18301834

18311835
values = self._data.iget(i)
1832-
if hasattr(self,'default_fill_value'):
1833-
s = self._col_klass.from_array(values, index=self.index,
1834-
name=label,
1835-
fill_value= self.default_fill_value)
1836-
else:
1837-
s = self._col_klass.from_array(values, index=self.index,
1838-
name=label)
1839-
1840-
return s
1836+
return self._col_klass.from_array(values, index=self.index,
1837+
name=label)
18411838

18421839
def _ixs(self, i, axis=0):
18431840
if axis == 0:

pandas/sparse/frame.py

+4
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,10 @@ def icol(self, i):
377377
return self.ix[:, i]
378378

379379
return self[label]
380+
# values = self._data.iget(i)
381+
# return self._col_klass.from_array(
382+
# values, index=self.index, name=label,
383+
# fill_value= self.default_fill_value)
380384

381385
@Appender(DataFrame.get_value.__doc__, indents=0)
382386
def get_value(self, index, col):

pandas/src/hashtable.pyx

-18
Original file line numberDiff line numberDiff line change
@@ -444,24 +444,6 @@ cdef class Int64HashTable(HashTable):
444444

445445
return locs
446446

447-
def lookup_i4(self, ndarray[int64_t] values):
448-
cdef:
449-
Py_ssize_t i, n = len(values)
450-
int ret = 0
451-
int64_t val
452-
khiter_t k
453-
ndarray[int64_t] locs = np.empty(n, dtype=np.int64)
454-
455-
for i in range(n):
456-
val = values[i]
457-
k = kh_get_int64(self.table, val)
458-
if k != self.table.n_buckets:
459-
locs[i] = self.table.vals[k]
460-
else:
461-
locs[i] = -1
462-
463-
return locs
464-
465447
def factorize(self, ndarray[object] values):
466448
reverse = {}
467449
labels = self.get_labels(values, reverse, 0)

vb_suite/frame_methods.py

+14 -2
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,20 @@
7171

7272
setup = common_setup + """
7373
df = DataFrame(randn(10000, 100))
74+
def f():
75+
if hasattr(df, '_item_cache'):
76+
df._item_cache.clear()
77+
for name, col in df.iteritems():
78+
pass
79+
80+
def g():
81+
for name, col in df.iteritems():
82+
pass
7483
"""
7584

7685
# as far back as the earliest test currently in the suite
77-
frame_iteritems = Benchmark('for name,col in df.iteritems(): pass', setup,
78-
start_date=datetime(2010, 6, 1))
86+
frame_iteritems = Benchmark('f()', setup,
87+
start_date=datetime(2010, 6, 1))
88+
89+
frame_iteritems_cached = Benchmark('g()', setup,
90+
start_date=datetime(2010, 6, 1))

0 commit comments

Comments
 (0)