Skip to content

PERF: performance regression in Series.asof #14476

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 26, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 51 additions & 30 deletions asv_bench/benchmarks/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,56 +284,77 @@ class timeseries_asof(object):
goal_time = 0.2

def setup(self):
self.N = 100000
self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
if hasattr(Series, 'convert'):
Series.resample = Series.convert
self.ts = Series(np.random.randn(self.N), index=self.rng)
self.N = 10000
self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s')
self.ts = Series(np.random.randn(self.N), index=self.rng)
self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s')
self.ts = Series(np.random.randn(self.N), index=self.rng)
self.ts2 = self.ts.copy()
self.ts2[250:5000] = np.nan
self.ts3 = self.ts.copy()
self.ts3[-5000:] = np.nan

def time_timeseries_asof(self):
# test speed of pre-computing NAs.
def time_asof_list(self):
self.ts.asof(self.dates)

# should be roughly the same as above.
def time_asof_nan_list(self):
self.ts2.asof(self.dates)

class timeseries_asof_nan(object):
goal_time = 0.2
# test speed of the code path for a scalar index
# without *while* loop
def time_asof_single(self):
self.ts.asof(self.dates[0])

def setup(self):
self.N = 100000
self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
if hasattr(Series, 'convert'):
Series.resample = Series.convert
self.ts = Series(np.random.randn(self.N), index=self.rng)
self.N = 10000
self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s')
self.ts = Series(np.random.randn(self.N), index=self.rng)
self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s')
self.ts[250:5000] = np.nan
# test speed of the code path for a scalar index
# before the start. should be the same as above.
def time_asof_single_early(self):
self.ts.asof(self.dates[0] - dt.timedelta(10))

def time_timeseries_asof_nan(self):
self.ts.asof(self.dates)
# test the speed of the code path for a scalar index
# with a long *while* loop. should still be much
# faster than pre-computing all the NAs.
def time_asof_nan_single(self):
self.ts3.asof(self.dates[-1])


class timeseries_asof_single(object):
class timeseries_dataframe_asof(object):
goal_time = 0.2

def setup(self):
self.N = 100000
self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
if hasattr(Series, 'convert'):
Series.resample = Series.convert
self.ts = Series(np.random.randn(self.N), index=self.rng)
self.N = 10000
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are too small; make it 100000.

Put NaNs at the beginning and another set at the end.

self.M = 100
self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s')
self.ts = Series(np.random.randn(self.N), index=self.rng)
self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s')
self.ts = DataFrame(np.random.randn(self.N, self.M), index=self.rng)
self.ts2 = self.ts.copy()
self.ts2.iloc[250:5000] = np.nan
self.ts3 = self.ts.copy()
self.ts3.iloc[-5000:] = np.nan

# test speed of pre-computing NAs.
def time_asof_list(self):
self.ts.asof(self.dates)

def time_timeseries_asof_single(self):
# should be roughly the same as above.
def time_asof_nan_list(self):
self.ts2.asof(self.dates)

# test speed of the code path for a scalar index
# with pre-computing all NAs.
def time_asof_single(self):
self.ts.asof(self.dates[0])

# should be roughly the same as above.
def time_asof_nan_single(self):
self.ts3.asof(self.dates[-1])

# test speed of the code path for a scalar index
# before the start. should be without the cost of
# pre-computing all the NAs.
def time_asof_single_early(self):
self.ts.asof(self.dates[0] - dt.timedelta(10))


class timeseries_custom_bday_apply(object):
goal_time = 0.2
Expand Down
5 changes: 3 additions & 2 deletions doc/source/whatsnew/v0.19.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@ Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Fixed performance regression in factorization of ``Period`` data (:issue:`14338`)
- Improved Performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`)

- Improved performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`)
- Improved performance in ``Series.asof(where)`` when ``where`` is a scalar (:issue:`14461`)
- Improved performance in ``DataFrame.asof(where)`` when ``where`` is a scalar (:issue:`14461`)



Expand Down
34 changes: 22 additions & 12 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3735,10 +3735,10 @@ def asof(self, where, subset=None):
if not self.index.is_monotonic:
raise ValueError("asof requires a sorted index")

if isinstance(self, ABCSeries):
is_series = isinstance(self, ABCSeries)
if is_series:
if subset is not None:
raise ValueError("subset is not valid for Series")
nulls = self.isnull()
elif self.ndim > 2:
raise NotImplementedError("asof is not implemented "
"for {type}".format(type(self)))
Expand All @@ -3747,9 +3747,9 @@ def asof(self, where, subset=None):
subset = self.columns
if not is_list_like(subset):
subset = [subset]
nulls = self[subset].isnull().any(1)

if not is_list_like(where):
is_list = is_list_like(where)
if not is_list:
start = self.index[0]
if isinstance(self.index, PeriodIndex):
where = Period(where, freq=self.index.freq).ordinal
Expand All @@ -3758,24 +3758,34 @@ def asof(self, where, subset=None):
if where < start:
return np.nan

loc = self.index.searchsorted(where, side='right')
if loc > 0:
loc -= 1
while nulls[loc] and loc > 0:
loc -= 1
return self.iloc[loc]
# It's always much faster to use a *while* loop here for
# Series than pre-computing all the NAs. However a
# *while* loop is extremely expensive for DataFrame
# so we later pre-compute all the NAs and use the same
# code path whether *where* is a scalar or list.
# See PR: https://github.com/pandas-dev/pandas/pull/14476
if is_series:
loc = self.index.searchsorted(where, side='right')
if loc > 0:
loc -= 1

values = self._values
while loc > 0 and isnull(values[loc]):
loc -= 1
return values[loc]

if not isinstance(where, Index):
where = Index(where)
where = Index(where) if is_list else Index([where])

nulls = self.isnull() if is_series else self[subset].isnull().any(1)
locs = self.index.asof_locs(where, ~(nulls.values))

# mask the missing
missing = locs == -1
data = self.take(locs, is_copy=False)
data.index = where
data.loc[missing] = np.nan
return data
return data if is_list else data.iloc[-1]

# ----------------------------------------------------------------------
# Action Methods
Expand Down