diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index fda6ebb4b437e..8c00924cb07ef 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -284,56 +284,77 @@ class timeseries_asof(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) self.N = 10000 self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s') - self.ts = Series(np.random.randn(self.N), index=self.rng) self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.ts2 = self.ts.copy() + self.ts2[250:5000] = np.nan + self.ts3 = self.ts.copy() + self.ts3[-5000:] = np.nan - def time_timeseries_asof(self): + # test speed of pre-computing NAs. + def time_asof_list(self): self.ts.asof(self.dates) + # should be roughly the same as above. + def time_asof_nan_list(self): + self.ts2.asof(self.dates) -class timeseries_asof_nan(object): - goal_time = 0.2 + # test speed of the code path for a scalar index + # without *while* loop + def time_asof_single(self): + self.ts.asof(self.dates[0]) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 10000 - self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s') - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') - self.ts[250:5000] = np.nan + # test speed of the code path for a scalar index + # before the start. should be the same as above. 
+ def time_asof_single_early(self): + self.ts.asof(self.dates[0] - dt.timedelta(10)) - def time_timeseries_asof_nan(self): - self.ts.asof(self.dates) + # test the speed of the code path for a scalar index + # with a long *while* loop. should still be much + # faster than pre-computing all the NAs. + def time_asof_nan_single(self): + self.ts3.asof(self.dates[-1]) -class timeseries_asof_single(object): +class timeseries_dataframe_asof(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) self.N = 10000 + self.M = 100 self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s') - self.ts = Series(np.random.randn(self.N), index=self.rng) self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') + self.ts = DataFrame(np.random.randn(self.N, self.M), index=self.rng) + self.ts2 = self.ts.copy() + self.ts2.iloc[250:5000] = np.nan + self.ts3 = self.ts.copy() + self.ts3.iloc[-5000:] = np.nan + + # test speed of pre-computing NAs. + def time_asof_list(self): + self.ts.asof(self.dates) - def time_timeseries_asof_single(self): + # should be roughly the same as above. + def time_asof_nan_list(self): + self.ts2.asof(self.dates) + + # test speed of the code path for a scalar index + # with pre-computing all NAs. + def time_asof_single(self): self.ts.asof(self.dates[0]) + # should be roughly the same as above. + def time_asof_nan_single(self): + self.ts3.asof(self.dates[-1]) + + # test speed of the code path for a scalar index + # before the start. should be without the cost of + # pre-computing all the NAs. 
+ def time_asof_single_early(self): + self.ts.asof(self.dates[0] - dt.timedelta(10)) + class timeseries_custom_bday_apply(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 147ff8795eb00..41e0b48ab80e7 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -21,8 +21,9 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Fixed performance regression in factorization of ``Period`` data (:issue:`14338`) -- Improved Performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`) - +- Improved performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`) +- Improved performance in ``Series.asof(where)`` when ``where`` is a scalar (:issue:`14461`) +- Improved performance in ``DataFrame.asof(where)`` when ``where`` is a scalar (:issue:`14461`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 697438df87d4f..037ab900e6150 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3735,10 +3735,10 @@ def asof(self, where, subset=None): if not self.index.is_monotonic: raise ValueError("asof requires a sorted index") - if isinstance(self, ABCSeries): + is_series = isinstance(self, ABCSeries) + if is_series: if subset is not None: raise ValueError("subset is not valid for Series") - nulls = self.isnull() elif self.ndim > 2: raise NotImplementedError("asof is not implemented " "for {type}".format(type(self))) @@ -3747,9 +3747,9 @@ def asof(self, where, subset=None): subset = self.columns if not is_list_like(subset): subset = [subset] - nulls = self[subset].isnull().any(1) - if not is_list_like(where): + is_list = is_list_like(where) + if not is_list: start = self.index[0] if isinstance(self.index, PeriodIndex): where = Period(where, freq=self.index.freq).ordinal @@ -3758,16 +3758,26 @@ def asof(self, where, subset=None): if where < start: return np.nan - loc = self.index.searchsorted(where, side='right') - if loc > 0: - loc -= 1 - while nulls[loc] and loc 
> 0: - loc -= 1 - return self.iloc[loc] + # It's always much faster to use a *while* loop here for + # Series than pre-computing all the NAs. However a + # *while* loop is extremely expensive for DataFrame + # so we later pre-compute all the NAs and use the same + # code path whether *where* is a scalar or list. + # See PR: https://github.com/pandas-dev/pandas/pull/14476 + if is_series: + loc = self.index.searchsorted(where, side='right') + if loc > 0: + loc -= 1 + + values = self._values + while loc > 0 and isnull(values[loc]): + loc -= 1 + return values[loc] if not isinstance(where, Index): - where = Index(where) + where = Index(where) if is_list else Index([where]) + nulls = self.isnull() if is_series else self[subset].isnull().any(1) locs = self.index.asof_locs(where, ~(nulls.values)) # mask the missing @@ -3775,7 +3785,7 @@ def asof(self, where, subset=None): data = self.take(locs, is_copy=False) data.index = where data.loc[missing] = np.nan - return data + return data if is_list else data.iloc[-1] # ---------------------------------------------------------------------- # Action Methods