Skip to content

Commit 320f9d4

Browse files
yrlihuan authored and jreback committed
BUG: DatetimeIndex.__iter__ creates a temp array of Timestamp (GH7683)
PERF: DatetimeIndex.__iter__ now uses ints_to_pydatetime with boxing
tslib: remove code dup in ints_to_pydatetime
PERF: add inline to create_timestamp_from_*
1 parent e6a5b6c commit 320f9d4

File tree

7 files changed

+104
-32
lines changed

7 files changed

+104
-32
lines changed

doc/source/v0.15.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ Enhancements
150150
Performance
151151
~~~~~~~~~~~
152152

153+
- Performance improvements in ``DatetimeIndex.__iter__`` to allow faster iteration (:issue:`7683`)
153154

154155

155156

pandas/core/base.py

+3
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,9 @@ def _ops_compat(self, name, op_accessor):
390390
is_year_start = _field_accessor('is_year_start', "Logical indicating if first day of year (defined by frequency)")
391391
is_year_end = _field_accessor('is_year_end', "Logical indicating if last day of year (defined by frequency)")
392392

393+
def __iter__(self):
    """
    Return a lazy iterator over the boxed values.

    Each integer in ``self.asi8`` is passed through ``self._box_func``
    as the iterator is consumed.
    """
    raw_values = self.asi8
    return (self._box_func(raw) for raw in raw_values)
395+
393396
@property
394397
def _box_func(self):
395398
"""

pandas/tseries/index.py

+21-3
Original file line numberDiff line numberDiff line change
@@ -1093,6 +1093,27 @@ def __array_finalize__(self, obj):
10931093
self.name = getattr(obj, 'name', None)
10941094
self._reset_identity()
10951095

1096+
def __iter__(self):
    """
    Iterate over the index, yielding one boxed value per element.

    Yields
    ------
    object
        The elements returned by ``tslib.ints_to_pydatetime`` (called with
        ``box=True`` and this index's ``tz`` and ``offset``) for the
        ``asi8`` values, in index order.
    """
    # Convert in chunks of 10k for efficiency: only one chunk of boxed
    # objects exists at a time, and a caller that stops iterating early
    # only pays for the chunks it actually consumed.
    data = self.asi8
    length = len(self)
    chunksize = 10000

    # Stepping by ``chunksize`` visits exactly the non-empty chunks, so no
    # trailing empty slice is converted when ``length`` is a multiple of
    # ``chunksize`` (or zero), and no float division is involved.
    for start_i in range(0, length, chunksize):
        end_i = min(start_i + chunksize, length)
        converted = tslib.ints_to_pydatetime(data[start_i:end_i], tz=self.tz,
                                             offset=self.offset, box=True)
        for v in converted:
            yield v
1116+
10961117
def _wrap_union_result(self, other, result):
10971118
name = self.name if self.name == other.name else None
10981119
if self.tz != other.tz:
@@ -1476,9 +1497,6 @@ def normalize(self):
14761497
return DatetimeIndex(new_values, freq='infer', name=self.name,
14771498
tz=self.tz)
14781499

1479-
def __iter__(self):
1480-
return iter(self.asobject)
1481-
14821500
def searchsorted(self, key, side='left'):
14831501
if isinstance(key, np.ndarray):
14841502
key = np.array(key, dtype=_NS_DTYPE, copy=False)

pandas/tseries/period.py

-4
Original file line numberDiff line numberDiff line change
@@ -738,10 +738,6 @@ def astype(self, dtype):
738738
return Index(self.values, dtype)
739739
raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype)
740740

741-
def __iter__(self):
742-
for val in self.values:
743-
yield Period(ordinal=val, freq=self.freq)
744-
745741
def searchsorted(self, key, side='left'):
746742
if isinstance(key, compat.string_types):
747743
key = Period(key, freq=self.freq).ordinal

pandas/tseries/tests/test_timezones.py

-1
Original file line numberDiff line numberDiff line change
@@ -1027,7 +1027,6 @@ def test_intersection(self):
10271027

10281028
def test_timestamp_equality_different_timezones(self):
10291029
utc_range = date_range('1/1/2000', periods=20, tz='UTC')
1030-
10311030
eastern_range = utc_range.tz_convert('US/Eastern')
10321031
berlin_range = utc_range.tz_convert('Europe/Berlin')
10331032

pandas/tslib.pyx

+54-24
Original file line numberDiff line numberDiff line change
@@ -74,61 +74,90 @@ try:
7474
except NameError: # py3
7575
basestring = str
7676

77-
def ints_to_pydatetime(ndarray[int64_t] arr, tz=None):
77+
cdef inline object create_timestamp_from_ts(int64_t value, pandas_datetimestruct dts, object tz, object offset):
    # Build a Timestamp from an i8 value and its already-decomposed fields.
    # The instance is allocated through _Timestamp.__new__ with Timestamp as
    # the class, and the remaining attributes are assigned directly.
    cdef _Timestamp boxed = _Timestamp.__new__(Timestamp, dts.year, dts.month,
                                               dts.day, dts.hour, dts.min,
                                               dts.sec, dts.us, tz)

    boxed.value = value
    boxed.offset = offset
    # The nanosecond field is derived from the struct's picosecond field.
    boxed.nanosecond = dts.ps / 1000

    return boxed
88+
89+
cdef inline object create_datetime_from_ts(int64_t value, pandas_datetimestruct dts, object tz, object offset):
    # Build a plain datetime from the decomposed fields in ``dts``.
    # ``value`` and ``offset`` are not used here; the parameter list matches
    # create_timestamp_from_ts so that either function can be assigned to
    # the same ``func_create`` function pointer in ints_to_pydatetime.
    return datetime(dts.year, dts.month, dts.day,
                    dts.hour, dts.min, dts.sec, dts.us,
                    tz)
92+
93+
def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, offset=None, box=False):
94+
# convert an i8 repr to an ndarray of datetimes or Timestamp (if box == True)
95+
7896
cdef:
7997
Py_ssize_t i, n = len(arr)
8098
pandas_datetimestruct dts
99+
object dt
100+
int64_t value
81101
ndarray[object] result = np.empty(n, dtype=object)
102+
object (*func_create)(int64_t, pandas_datetimestruct, object, object)
103+
104+
if box and util.is_string_object(offset):
105+
from pandas.tseries.frequencies import to_offset
106+
offset = to_offset(offset)
107+
108+
if box:
109+
func_create = create_timestamp_from_ts
110+
else:
111+
func_create = create_datetime_from_ts
82112

83113
if tz is not None:
84114
if _is_utc(tz):
85115
for i in range(n):
86-
if arr[i] == iNaT:
87-
result[i] = np.nan
116+
value = arr[i]
117+
if value == iNaT:
118+
result[i] = NaT
88119
else:
89-
pandas_datetime_to_datetimestruct(arr[i], PANDAS_FR_ns, &dts)
90-
result[i] = datetime(dts.year, dts.month, dts.day, dts.hour,
91-
dts.min, dts.sec, dts.us, tz)
120+
pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts)
121+
result[i] = func_create(value, dts, tz, offset)
92122
elif _is_tzlocal(tz) or _is_fixed_offset(tz):
93123
for i in range(n):
94-
if arr[i] == iNaT:
95-
result[i] = np.nan
124+
value = arr[i]
125+
if value == iNaT:
126+
result[i] = NaT
96127
else:
97-
pandas_datetime_to_datetimestruct(arr[i], PANDAS_FR_ns, &dts)
98-
dt = datetime(dts.year, dts.month, dts.day, dts.hour,
99-
dts.min, dts.sec, dts.us, tz)
128+
pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts)
129+
dt = func_create(value, dts, tz, offset)
100130
result[i] = dt + tz.utcoffset(dt)
101131
else:
102132
trans = _get_transitions(tz)
103133
deltas = _get_deltas(tz)
104134
for i in range(n):
105135

106-
if arr[i] == iNaT:
107-
result[i] = np.nan
136+
value = arr[i]
137+
if value == iNaT:
138+
result[i] = NaT
108139
else:
109140

110141
# Adjust datetime64 timestamp, recompute datetimestruct
111-
pos = trans.searchsorted(arr[i], side='right') - 1
142+
pos = trans.searchsorted(value, side='right') - 1
112143
if _treat_tz_as_pytz(tz):
113144
# find right representation of dst etc in pytz timezone
114145
new_tz = tz._tzinfos[tz._transition_info[pos]]
115146
else:
116147
# no zone-name change for dateutil tzs - dst etc represented in single object.
117148
new_tz = tz
118149

119-
pandas_datetime_to_datetimestruct(arr[i] + deltas[pos],
120-
PANDAS_FR_ns, &dts)
121-
result[i] = datetime(dts.year, dts.month, dts.day, dts.hour,
122-
dts.min, dts.sec, dts.us,
123-
new_tz)
150+
pandas_datetime_to_datetimestruct(value + deltas[pos], PANDAS_FR_ns, &dts)
151+
result[i] = func_create(value, dts, new_tz, offset)
124152
else:
125153
for i in range(n):
126-
if arr[i] == iNaT:
127-
result[i] = np.nan
154+
155+
value = arr[i]
156+
if value == iNaT:
157+
result[i] = NaT
128158
else:
129-
pandas_datetime_to_datetimestruct(arr[i], PANDAS_FR_ns, &dts)
130-
result[i] = datetime(dts.year, dts.month, dts.day, dts.hour,
131-
dts.min, dts.sec, dts.us)
159+
pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts)
160+
result[i] = func_create(value, dts, None, offset)
132161

133162
return result
134163

@@ -183,6 +212,7 @@ class Timestamp(_Timestamp):
183212
def utcnow(cls):
184213
return cls.now('UTC')
185214

215+
186216
def __new__(cls, object ts_input, object offset=None, tz=None, unit=None):
187217
cdef _TSObject ts
188218
cdef _Timestamp ts_base

vb_suite/timeseries.py

+25
Original file line numberDiff line numberDiff line change
@@ -333,3 +333,28 @@ def date_range(start=None, end=None, periods=None, freq=None):
333333

334334
timeseries_is_month_start = Benchmark('rng.is_month_start', setup,
335335
start_date=datetime(2014, 4, 1))
336+
337+
#----------------------------------------------------------------------
338+
# iterate over DatetimeIndex/PeriodIndex
339+
setup = common_setup + """
340+
N = 1000000
341+
M = 10000
342+
idx1 = date_range(start='20140101', freq='T', periods=N)
343+
idx2 = period_range(start='20140101', freq='T', periods=N)
344+
345+
def iter_n(iterable, n=None):
346+
i = 0
347+
for _ in iterable:
348+
i += 1
349+
if n is not None and i > n:
350+
break
351+
"""
352+
353+
timeseries_iter_datetimeindex = Benchmark('iter_n(idx1)', setup)
354+
355+
timeseries_iter_periodindex = Benchmark('iter_n(idx2)', setup)
356+
357+
timeseries_iter_datetimeindex_preexit = Benchmark('iter_n(idx1, M)', setup)
358+
359+
timeseries_iter_periodindex_preexit = Benchmark('iter_n(idx2, M)', setup)
360+

0 commit comments

Comments
 (0)