Skip to content

Commit fc4c580

Browse files
mitar authored and Pingviinituutti committed
ENH: Implemented lazy iteration (pandas-dev#20796)
1 parent 15eea06 commit fc4c580

File tree

4 files changed

+76
-8
lines changed

4 files changed

+76
-8
lines changed

asv_bench/benchmarks/frame_methods.py

+62-1
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ def setup(self):
103103
self.df2 = DataFrame(np.random.randn(N * 50, 10))
104104
self.df3 = DataFrame(np.random.randn(N, 5 * N),
105105
columns=['C' + str(c) for c in range(N * 5)])
106+
self.df4 = DataFrame(np.random.randn(N * 1000, 10))
106107

107108
def time_iteritems(self):
108109
# (monitor no-copying behaviour)
@@ -119,10 +120,70 @@ def time_iteritems_indexing(self):
119120
for col in self.df3:
120121
self.df3[col]
121122

123+
def time_itertuples_start(self):
124+
self.df4.itertuples()
125+
126+
def time_itertuples_read_first(self):
127+
next(self.df4.itertuples())
128+
122129
def time_itertuples(self):
123-
for row in self.df2.itertuples():
130+
for row in self.df4.itertuples():
131+
pass
132+
133+
def time_itertuples_to_list(self):
134+
list(self.df4.itertuples())
135+
136+
def mem_itertuples_start(self):
137+
return self.df4.itertuples()
138+
139+
def peakmem_itertuples_start(self):
140+
self.df4.itertuples()
141+
142+
def mem_itertuples_read_first(self):
143+
return next(self.df4.itertuples())
144+
145+
def peakmem_itertuples(self):
146+
for row in self.df4.itertuples():
147+
pass
148+
149+
def mem_itertuples_to_list(self):
150+
return list(self.df4.itertuples())
151+
152+
def peakmem_itertuples_to_list(self):
153+
list(self.df4.itertuples())
154+
155+
def time_itertuples_raw_start(self):
156+
self.df4.itertuples(index=False, name=None)
157+
158+
def time_itertuples_raw_read_first(self):
159+
next(self.df4.itertuples(index=False, name=None))
160+
161+
def time_itertuples_raw_tuples(self):
162+
for row in self.df4.itertuples(index=False, name=None):
124163
pass
125164

165+
def time_itertuples_raw_tuples_to_list(self):
166+
list(self.df4.itertuples(index=False, name=None))
167+
168+
def mem_itertuples_raw_start(self):
169+
return self.df4.itertuples(index=False, name=None)
170+
171+
def peakmem_itertuples_raw_start(self):
172+
self.df4.itertuples(index=False, name=None)
173+
174+
def peakmem_itertuples_raw_read_first(self):
175+
next(self.df4.itertuples(index=False, name=None))
176+
177+
def peakmem_itertuples_raw(self):
178+
for row in self.df4.itertuples(index=False, name=None):
179+
pass
180+
181+
def mem_itertuples_raw_to_list(self):
182+
return list(self.df4.itertuples(index=False, name=None))
183+
184+
def peakmem_itertuples_raw_to_list(self):
185+
list(self.df4.itertuples(index=False, name=None))
186+
126187
def time_iterrows(self):
127188
for row in self.df.iterrows():
128189
pass

doc/source/whatsnew/v0.24.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -1251,6 +1251,8 @@ Performance Improvements
12511251
- Fixed a performance regression on Windows with Python 3.7 of :func:`read_csv` (:issue:`23516`)
12521252
- Improved performance of :class:`Categorical` constructor for ``Series`` objects (:issue:`23814`)
12531253
- Improved performance of :meth:`~DataFrame.where` for Categorical data (:issue:`24077`)
1254+
- Improved performance of iterating over a :class:`Series`. Using :meth:`DataFrame.itertuples` now creates iterators
1255+
without internally allocating lists of all elements (:issue:`20783`)
12541256

12551257
.. _whatsnew_0240.docs:
12561258

pandas/core/base.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
import pandas._libs.lib as lib
1010
import pandas.compat as compat
11-
from pandas.compat import PYPY, OrderedDict, builtins
11+
from pandas.compat import PYPY, OrderedDict, builtins, map, range
1212
from pandas.compat.numpy import function as nv
1313
from pandas.errors import AbstractMethodError
1414
from pandas.util._decorators import Appender, Substitution, cache_readonly
@@ -1072,7 +1072,13 @@ def __iter__(self):
10721072
(for str, int, float) or a pandas scalar
10731073
(for Timestamp/Timedelta/Interval/Period)
10741074
"""
1075-
return iter(self.tolist())
1075+
# We are explicitly making element iterators.
1076+
if is_datetimelike(self._values):
1077+
return map(com.maybe_box_datetimelike, self._values)
1078+
elif is_extension_array_dtype(self._values):
1079+
return iter(self._values)
1080+
else:
1081+
return map(self._values.item, range(self._values.size))
10761082

10771083
@cache_readonly
10781084
def hasnans(self):

pandas/core/frame.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -898,10 +898,10 @@ def itertuples(self, index=True, name="Pandas"):
898898
Animal(Index='hawk', num_legs=2, num_wings=2)
899899
"""
900900
arrays = []
901-
fields = []
901+
fields = list(self.columns)
902902
if index:
903903
arrays.append(self.index)
904-
fields.append("Index")
904+
fields.insert(0, "Index")
905905

906906
# use integer indexing because of possible duplicate column names
907907
arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
@@ -911,10 +911,9 @@ def itertuples(self, index=True, name="Pandas"):
911911
if name is not None and len(self.columns) + index < 256:
912912
# `rename` is unsupported in Python 2.6
913913
try:
914-
itertuple = collections.namedtuple(name,
915-
fields + list(self.columns),
916-
rename=True)
914+
itertuple = collections.namedtuple(name, fields, rename=True)
917915
return map(itertuple._make, zip(*arrays))
916+
918917
except Exception:
919918
pass
920919

0 commit comments

Comments
 (0)