Skip to content

Commit e3d943d

Browse files
laudney authored and jorisvandenbossche committed
PERF: performance regression in Series.asof (#14476)
* Fix performance regression in Series.asof by avoiding pre-computing nulls and returning value by indexing the underlying ndarray.
1 parent d1d75d7 commit e3d943d

File tree

3 files changed

+76
-44
lines changed

3 files changed

+76
-44
lines changed

asv_bench/benchmarks/timeseries.py

+51-30
Original file line numberDiff line numberDiff line change
@@ -284,56 +284,77 @@ class timeseries_asof(object):
284284
goal_time = 0.2
285285

286286
def setup(self):
287-
self.N = 100000
288-
self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
289-
if hasattr(Series, 'convert'):
290-
Series.resample = Series.convert
291-
self.ts = Series(np.random.randn(self.N), index=self.rng)
292287
self.N = 10000
293288
self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s')
294-
self.ts = Series(np.random.randn(self.N), index=self.rng)
295289
self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s')
290+
self.ts = Series(np.random.randn(self.N), index=self.rng)
291+
self.ts2 = self.ts.copy()
292+
self.ts2[250:5000] = np.nan
293+
self.ts3 = self.ts.copy()
294+
self.ts3[-5000:] = np.nan
296295

297-
def time_timeseries_asof(self):
296+
# test speed of pre-computing NAs.
297+
def time_asof_list(self):
298298
self.ts.asof(self.dates)
299299

300+
# should be roughly the same as above.
301+
def time_asof_nan_list(self):
302+
self.ts2.asof(self.dates)
300303

301-
class timeseries_asof_nan(object):
302-
goal_time = 0.2
304+
# test speed of the code path for a scalar index
305+
# without *while* loop
306+
def time_asof_single(self):
307+
self.ts.asof(self.dates[0])
303308

304-
def setup(self):
305-
self.N = 100000
306-
self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
307-
if hasattr(Series, 'convert'):
308-
Series.resample = Series.convert
309-
self.ts = Series(np.random.randn(self.N), index=self.rng)
310-
self.N = 10000
311-
self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s')
312-
self.ts = Series(np.random.randn(self.N), index=self.rng)
313-
self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s')
314-
self.ts[250:5000] = np.nan
309+
# test speed of the code path for a scalar index
310+
# before the start. should be the same as above.
311+
def time_asof_single_early(self):
312+
self.ts.asof(self.dates[0] - dt.timedelta(10))
315313

316-
def time_timeseries_asof_nan(self):
317-
self.ts.asof(self.dates)
314+
# test the speed of the code path for a scalar index
315+
# with a long *while* loop. should still be much
316+
# faster than pre-computing all the NAs.
317+
def time_asof_nan_single(self):
318+
self.ts3.asof(self.dates[-1])
318319

319320

320-
class timeseries_asof_single(object):
321+
class timeseries_dataframe_asof(object):
321322
goal_time = 0.2
322323

323324
def setup(self):
324-
self.N = 100000
325-
self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
326-
if hasattr(Series, 'convert'):
327-
Series.resample = Series.convert
328-
self.ts = Series(np.random.randn(self.N), index=self.rng)
329325
self.N = 10000
326+
self.M = 100
330327
self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s')
331-
self.ts = Series(np.random.randn(self.N), index=self.rng)
332328
self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s')
329+
self.ts = DataFrame(np.random.randn(self.N, self.M), index=self.rng)
330+
self.ts2 = self.ts.copy()
331+
self.ts2.iloc[250:5000] = np.nan
332+
self.ts3 = self.ts.copy()
333+
self.ts3.iloc[-5000:] = np.nan
334+
335+
# test speed of pre-computing NAs.
336+
def time_asof_list(self):
337+
self.ts.asof(self.dates)
333338

334-
def time_timeseries_asof_single(self):
339+
# should be roughly the same as above.
340+
def time_asof_nan_list(self):
341+
self.ts2.asof(self.dates)
342+
343+
# test speed of the code path for a scalar index
344+
# with pre-computing all NAs.
345+
def time_asof_single(self):
335346
self.ts.asof(self.dates[0])
336347

348+
# should be roughly the same as above.
349+
def time_asof_nan_single(self):
350+
self.ts3.asof(self.dates[-1])
351+
352+
# test speed of the code path for a scalar index
353+
# before the start. should be without the cost of
354+
# pre-computing all the NAs.
355+
def time_asof_single_early(self):
356+
self.ts.asof(self.dates[0] - dt.timedelta(10))
357+
337358

338359
class timeseries_custom_bday_apply(object):
339360
goal_time = 0.2

doc/source/whatsnew/v0.19.1.txt

+3-2
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,9 @@ Performance Improvements
2121
~~~~~~~~~~~~~~~~~~~~~~~~
2222

2323
- Fixed performance regression in factorization of ``Period`` data (:issue:`14338`)
24-
- Improved Performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`)
25-
24+
- Improved performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`)
25+
- Improved performance in ``Series.asof(where)`` when ``where`` is a scalar (:issue:`14461`)
26+
- Improved performance in ``DataFrame.asof(where)`` when ``where`` is a scalar (:issue:`14461`)
2627

2728

2829

pandas/core/generic.py

+22-12
Original file line numberDiff line numberDiff line change
@@ -3735,10 +3735,10 @@ def asof(self, where, subset=None):
37353735
if not self.index.is_monotonic:
37363736
raise ValueError("asof requires a sorted index")
37373737

3738-
if isinstance(self, ABCSeries):
3738+
is_series = isinstance(self, ABCSeries)
3739+
if is_series:
37393740
if subset is not None:
37403741
raise ValueError("subset is not valid for Series")
3741-
nulls = self.isnull()
37423742
elif self.ndim > 2:
37433743
raise NotImplementedError("asof is not implemented "
37443744
"for {type}".format(type(self)))
@@ -3747,9 +3747,9 @@ def asof(self, where, subset=None):
37473747
subset = self.columns
37483748
if not is_list_like(subset):
37493749
subset = [subset]
3750-
nulls = self[subset].isnull().any(1)
37513750

3752-
if not is_list_like(where):
3751+
is_list = is_list_like(where)
3752+
if not is_list:
37533753
start = self.index[0]
37543754
if isinstance(self.index, PeriodIndex):
37553755
where = Period(where, freq=self.index.freq).ordinal
@@ -3758,24 +3758,34 @@ def asof(self, where, subset=None):
37583758
if where < start:
37593759
return np.nan
37603760

3761-
loc = self.index.searchsorted(where, side='right')
3762-
if loc > 0:
3763-
loc -= 1
3764-
while nulls[loc] and loc > 0:
3765-
loc -= 1
3766-
return self.iloc[loc]
3761+
# It's always much faster to use a *while* loop here for
3762+
# Series than pre-computing all the NAs. However a
3763+
# *while* loop is extremely expensive for DataFrame
3764+
# so we later pre-compute all the NAs and use the same
3765+
# code path whether *where* is a scalar or list.
3766+
# See PR: https://github.com/pandas-dev/pandas/pull/14476
3767+
if is_series:
3768+
loc = self.index.searchsorted(where, side='right')
3769+
if loc > 0:
3770+
loc -= 1
3771+
3772+
values = self._values
3773+
while loc > 0 and isnull(values[loc]):
3774+
loc -= 1
3775+
return values[loc]
37673776

37683777
if not isinstance(where, Index):
3769-
where = Index(where)
3778+
where = Index(where) if is_list else Index([where])
37703779

3780+
nulls = self.isnull() if is_series else self[subset].isnull().any(1)
37713781
locs = self.index.asof_locs(where, ~(nulls.values))
37723782

37733783
# mask the missing
37743784
missing = locs == -1
37753785
data = self.take(locs, is_copy=False)
37763786
data.index = where
37773787
data.loc[missing] = np.nan
3778-
return data
3788+
return data if is_list else data.iloc[-1]
37793789

37803790
# ----------------------------------------------------------------------
37813791
# Action Methods

0 commit comments

Comments
 (0)