Skip to content

PERF: improve perf of index iteration (GH7683) #7720

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 16, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ Enhancements
Performance
~~~~~~~~~~~

- Performance improvements in ``DatetimeIndex.__iter__`` to allow faster iteration (:issue:`7683`)



Expand Down
3 changes: 3 additions & 0 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,9 @@ def _ops_compat(self, name, op_accessor):
is_year_start = _field_accessor('is_year_start', "Logical indicating if first day of year (defined by frequency)")
is_year_end = _field_accessor('is_year_end', "Logical indicating if last day of year (defined by frequency)")

def __iter__(self):
    """
    Return an iterator over the boxed values.

    Each element of ``self.asi8`` is passed through ``self._box_func``
    as the iterator is consumed.
    """
    raw_values = self.asi8
    return (self._box_func(raw) for raw in raw_values)

@property
def _box_func(self):
"""
Expand Down
24 changes: 21 additions & 3 deletions pandas/tseries/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1093,6 +1093,27 @@ def __array_finalize__(self, obj):
self.name = getattr(obj, 'name', None)
self._reset_identity()

def __iter__(self):
    """
    Return an iterator over the boxed values

    Yields
    ------
    Timestamp
        One boxed value per element of the index, produced by
        ``tslib.ints_to_pydatetime(..., box=True)`` using this index's
        ``tz`` and ``offset``.
    """

    # convert in chunks of 10k for efficiency
    data = self.asi8
    length = len(self)
    chunksize = 10000

    # Step over the chunk start positions directly.  This avoids the
    # float division of ``int(length / chunksize) + 1`` and the extra,
    # empty conversion call that formula scheduled whenever the length
    # was 0 or an exact multiple of ``chunksize``.
    for start_i in range(0, length, chunksize):
        end_i = min(start_i + chunksize, length)
        converted = tslib.ints_to_pydatetime(data[start_i:end_i],
                                             tz=self.tz,
                                             offset=self.offset,
                                             box=True)
        for v in converted:
            yield v

def _wrap_union_result(self, other, result):
name = self.name if self.name == other.name else None
if self.tz != other.tz:
Expand Down Expand Up @@ -1476,9 +1497,6 @@ def normalize(self):
return DatetimeIndex(new_values, freq='infer', name=self.name,
tz=self.tz)

def __iter__(self):
    """Return an iterator over ``self.asobject``."""
    boxed = self.asobject
    return iter(boxed)

def searchsorted(self, key, side='left'):
if isinstance(key, np.ndarray):
key = np.array(key, dtype=_NS_DTYPE, copy=False)
Expand Down
4 changes: 0 additions & 4 deletions pandas/tseries/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -738,10 +738,6 @@ def astype(self, dtype):
return Index(self.values, dtype)
raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype)

def __iter__(self):
    """
    Yield one ``Period`` per entry of ``self.values``.

    Each entry is used as the ``ordinal`` of the new ``Period`` and
    ``self.freq`` is passed as its ``freq``.
    """
    for ordinal in self.values:
        yield Period(ordinal=ordinal, freq=self.freq)

def searchsorted(self, key, side='left'):
if isinstance(key, compat.string_types):
key = Period(key, freq=self.freq).ordinal
Expand Down
1 change: 0 additions & 1 deletion pandas/tseries/tests/test_timezones.py
Original file line number Diff line number Diff line change
Expand Up @@ -1027,7 +1027,6 @@ def test_intersection(self):

def test_timestamp_equality_different_timezones(self):
utc_range = date_range('1/1/2000', periods=20, tz='UTC')

eastern_range = utc_range.tz_convert('US/Eastern')
berlin_range = utc_range.tz_convert('Europe/Berlin')

Expand Down
78 changes: 54 additions & 24 deletions pandas/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -74,61 +74,90 @@ try:
except NameError: # py3
basestring = str

def ints_to_pydatetime(ndarray[int64_t] arr, tz=None):
cdef inline object create_timestamp_from_ts(int64_t value, pandas_datetimestruct dts, object tz, object offset):
    # Build a Timestamp from the broken-down fields in ``dts``: the
    # year..microsecond fields and ``tz`` go to ``_Timestamp.__new__``,
    # then ``value``, ``offset`` and the nanosecond part (derived from
    # ``dts.ps``) are stored on the new object.
    cdef _Timestamp stamp = _Timestamp.__new__(Timestamp,
                                               dts.year, dts.month, dts.day,
                                               dts.hour, dts.min, dts.sec,
                                               dts.us, tz)

    stamp.value = value
    stamp.offset = offset
    stamp.nanosecond = dts.ps / 1000

    return stamp

cdef inline object create_datetime_from_ts(int64_t value, pandas_datetimestruct dts, object tz, object offset):
    # Build a plain ``datetime`` from the broken-down fields in ``dts``.
    # ``value`` and ``offset`` are not used in this body; the parameter
    # list mirrors ``create_timestamp_from_ts``.
    cdef object dt
    dt = datetime(dts.year, dts.month, dts.day,
                  dts.hour, dts.min, dts.sec,
                  dts.us, tz)
    return dt

def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, offset=None, box=False):
# convert an i8 repr to an ndarray of datetimes or Timestamp (if box == True)

cdef:
Py_ssize_t i, n = len(arr)
pandas_datetimestruct dts
object dt
int64_t value
ndarray[object] result = np.empty(n, dtype=object)
object (*func_create)(int64_t, pandas_datetimestruct, object, object)

if box and util.is_string_object(offset):
from pandas.tseries.frequencies import to_offset
offset = to_offset(offset)

if box:
func_create = create_timestamp_from_ts
else:
func_create = create_datetime_from_ts

if tz is not None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a lot of duplicated code under this if/else clause. Could the branches be combined?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are not complete duplicates.
This is how Cython goes, though.
You could write an inline function to do it.
Why don't you give it a try!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You make such a good case. OK then!
Though I don't know how this works. How can I pull this change and continue to work on it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

like this:

git checkout -b jreback-iter origin/master
git pull https://github.com/jreback/pandas.git iter

Then you will have a local branch jreback-iter where you can make changes and such.
Then you can push up this branch; it will be local to you. Just ping me and I'll pick up your commits.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if _is_utc(tz):
for i in range(n):
if arr[i] == iNaT:
result[i] = np.nan
value = arr[i]
if value == iNaT:
result[i] = NaT
else:
pandas_datetime_to_datetimestruct(arr[i], PANDAS_FR_ns, &dts)
result[i] = datetime(dts.year, dts.month, dts.day, dts.hour,
dts.min, dts.sec, dts.us, tz)
pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts)
result[i] = func_create(value, dts, tz, offset)
elif _is_tzlocal(tz) or _is_fixed_offset(tz):
for i in range(n):
if arr[i] == iNaT:
result[i] = np.nan
value = arr[i]
if value == iNaT:
result[i] = NaT
else:
pandas_datetime_to_datetimestruct(arr[i], PANDAS_FR_ns, &dts)
dt = datetime(dts.year, dts.month, dts.day, dts.hour,
dts.min, dts.sec, dts.us, tz)
pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts)
dt = func_create(value, dts, tz, offset)
result[i] = dt + tz.utcoffset(dt)
else:
trans = _get_transitions(tz)
deltas = _get_deltas(tz)
for i in range(n):

if arr[i] == iNaT:
result[i] = np.nan
value = arr[i]
if value == iNaT:
result[i] = NaT
else:

# Adjust datetime64 timestamp, recompute datetimestruct
pos = trans.searchsorted(arr[i], side='right') - 1
pos = trans.searchsorted(value, side='right') - 1
if _treat_tz_as_pytz(tz):
# find right representation of dst etc in pytz timezone
new_tz = tz._tzinfos[tz._transition_info[pos]]
else:
# no zone-name change for dateutil tzs - dst etc represented in single object.
new_tz = tz

pandas_datetime_to_datetimestruct(arr[i] + deltas[pos],
PANDAS_FR_ns, &dts)
result[i] = datetime(dts.year, dts.month, dts.day, dts.hour,
dts.min, dts.sec, dts.us,
new_tz)
pandas_datetime_to_datetimestruct(value + deltas[pos], PANDAS_FR_ns, &dts)
result[i] = func_create(value, dts, new_tz, offset)
else:
for i in range(n):
if arr[i] == iNaT:
result[i] = np.nan

value = arr[i]
if value == iNaT:
result[i] = NaT
else:
pandas_datetime_to_datetimestruct(arr[i], PANDAS_FR_ns, &dts)
result[i] = datetime(dts.year, dts.month, dts.day, dts.hour,
dts.min, dts.sec, dts.us)
pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts)
result[i] = func_create(value, dts, None, offset)

return result

Expand Down Expand Up @@ -183,6 +212,7 @@ class Timestamp(_Timestamp):
def utcnow(cls):
return cls.now('UTC')


def __new__(cls, object ts_input, object offset=None, tz=None, unit=None):
cdef _TSObject ts
cdef _Timestamp ts_base
Expand Down
25 changes: 25 additions & 0 deletions vb_suite/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,3 +333,28 @@ def date_range(start=None, end=None, periods=None, freq=None):

timeseries_is_month_start = Benchmark('rng.is_month_start', setup,
start_date=datetime(2014, 4, 1))

#----------------------------------------------------------------------
# iterate over DatetimeIndex/PeriodIndex
#
# The setup snippet creates idx1 (via date_range) and idx2 (via
# period_range), each with N periods at freq='T', and defines iter_n:
# a helper that walks an iterable while counting items and, when n is
# given, breaks as soon as the count exceeds n.
setup = common_setup + """
N = 1000000
M = 10000
idx1 = date_range(start='20140101', freq='T', periods=N)
idx2 = period_range(start='20140101', freq='T', periods=N)

def iter_n(iterable, n=None):
i = 0
for _ in iterable:
i += 1
if n is not None and i > n:
break
"""

# Full pass over every element of idx1 (n is left as None).
timeseries_iter_datetimeindex = Benchmark('iter_n(idx1)', setup)

# Full pass over every element of idx2 (n is left as None).
timeseries_iter_periodindex = Benchmark('iter_n(idx2)', setup)

# Early exit: iteration over idx1 stops once the count exceeds M.
timeseries_iter_datetimeindex_preexit = Benchmark('iter_n(idx1, M)', setup)

# Early exit: iteration over idx2 stops once the count exceeds M.
timeseries_iter_periodindex_preexit = Benchmark('iter_n(idx2, M)', setup)