Benchmark additions for the asv suite under asv_bench/benchmarks.

Files touched:
  binary_ops.py    - dot-product timings (frame.frame, series.series,
                     frame.series) plus the Series they need in setup
  frame_methods.py - new Rename benchmark class
  plotting.py      - Plotting replaced by SeriesPlotting / FramePlotting,
                     both parameterized by plot kind
  reshape.py       - new Cut benchmark class (cut / qcut over four dtypes)
  rolling.py       - new ExpandingMethods and EWMMethods classes
  stat_ops.py      - use_bottleneck parameter on Correlation, new Covariance
  strings.py       - additional .str accessor timings
  timedelta.py     - new TimedeltaIndexing class and the imports it uses

In rolling.py, param_names must name exactly the axes listed in params, in
the same order as the extra arguments of setup/time_*: ExpandingMethods has
three axes (constructor, dtype, method) and EWMMethods has four
(constructor, window, dtype, method).

NOTE(review): this patch was recovered from a copy whose line breaks had been
collapsed. Indentation, continuation-line alignment and blank context lines
were reconstructed from Python syntax and the hunk line counts; verify them
against the target tree (or apply with whitespace tolerance) before merging.

diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py
index dfdebec86d67c..22b8ed80f3d07 100644
--- a/asv_bench/benchmarks/binary_ops.py
+++ b/asv_bench/benchmarks/binary_ops.py
@@ -52,6 +52,8 @@ def setup(self):
                                                    np.iinfo(np.int16).max,
                                                    size=(N, N)))
 
+        self.s = Series(np.random.randn(N))
+
     # Division
 
     def time_frame_float_div(self):
@@ -74,6 +76,17 @@ def time_frame_int_mod(self):
     def time_frame_float_mod(self):
         self.df % self.df2
 
+    # Dot product
+
+    def time_frame_dot(self):
+        self.df.dot(self.df2)
+
+    def time_series_dot(self):
+        self.s.dot(self.s)
+
+    def time_frame_series_dot(self):
+        self.df.dot(self.s)
+
 
 class Timeseries(object):
 
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index b60b45cc29f7d..527a2f129cf37 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -69,6 +69,36 @@ def time_reindex_upcast(self):
         self.df2.reindex(np.random.permutation(range(1200)))
 
 
+class Rename(object):
+
+    def setup(self):
+        N = 10**3
+        self.df = DataFrame(np.random.randn(N * 10, N))
+        self.idx = np.arange(4 * N, 7 * N)
+        self.dict_idx = {k: k for k in self.idx}
+        self.df2 = DataFrame(
+            {c: {0: np.random.randint(0, 2, N).astype(np.bool_),
+                 1: np.random.randint(0, N, N).astype(np.int16),
+                 2: np.random.randint(0, N, N).astype(np.int32),
+                 3: np.random.randint(0, N, N).astype(np.int64)}
+                [np.random.randint(0, 4)] for c in range(N)})
+
+    def time_rename_single(self):
+        self.df.rename({0: 0})
+
+    def time_rename_axis0(self):
+        self.df.rename(self.dict_idx)
+
+    def time_rename_axis1(self):
+        self.df.rename(columns=self.dict_idx)
+
+    def time_rename_both_axes(self):
+        self.df.rename(index=self.dict_idx, columns=self.dict_idx)
+
+    def time_dict_rename_both_axes(self):
+        self.df.rename(index=self.dict_idx, columns=self.dict_idx)
+
+
 class Iteration(object):
 
     def setup(self):
diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py
index 1373d5f0b4258..4f0bbb1690d4b 100644
--- a/asv_bench/benchmarks/plotting.py
+++ b/asv_bench/benchmarks/plotting.py
@@ -8,17 +8,48 @@
 matplotlib.use('Agg')
 
 
-class Plotting(object):
-
-    def setup(self):
-        self.s = Series(np.random.randn(1000000))
-        self.df = DataFrame({'col': self.s})
-
-    def time_series_plot(self):
-        self.s.plot()
-
-    def time_frame_plot(self):
-        self.df.plot()
+class SeriesPlotting(object):
+    params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie']]
+    param_names = ['kind']
+
+    def setup(self, kind):
+        if kind in ['bar', 'barh', 'pie']:
+            n = 100
+        elif kind in ['kde']:
+            n = 10000
+        else:
+            n = 1000000
+
+        self.s = Series(np.random.randn(n))
+        if kind in ['area', 'pie']:
+            self.s = self.s.abs()
+
+    def time_series_plot(self, kind):
+        self.s.plot(kind=kind)
+
+
+class FramePlotting(object):
+    params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie', 'scatter',
+               'hexbin']]
+    param_names = ['kind']
+
+    def setup(self, kind):
+        if kind in ['bar', 'barh', 'pie']:
+            n = 100
+        elif kind in ['kde', 'scatter', 'hexbin']:
+            n = 10000
+        else:
+            n = 1000000
+
+        self.x = Series(np.random.randn(n))
+        self.y = Series(np.random.randn(n))
+        if kind in ['area', 'pie']:
+            self.x = self.x.abs()
+            self.y = self.y.abs()
+        self.df = DataFrame({'x': self.x, 'y': self.y})
+
+    def time_frame_plot(self, kind):
+        self.df.plot(x='x', y='y', kind=kind)
 
 
 class TimeseriesPlotting(object):
diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
index 67fdfb82e72c0..e5c2f54263a3c 100644
--- a/asv_bench/benchmarks/reshape.py
+++ b/asv_bench/benchmarks/reshape.py
@@ -146,4 +146,42 @@ def time_get_dummies_1d_sparse(self):
         pd.get_dummies(self.s, sparse=True)
 
 
+class Cut(object):
+    params = [[4, 10, 1000]]
+    param_names = ['bins']
+
+    def setup(self, bins):
+        N = 10**5
+        self.int_series = pd.Series(np.arange(N).repeat(5))
+        self.float_series = pd.Series(np.random.randn(N).repeat(5))
+        self.timedelta_series = pd.Series(np.random.randint(N, size=N),
+                                          dtype='timedelta64[ns]')
+        self.datetime_series = pd.Series(np.random.randint(N, size=N),
+                                         dtype='datetime64[ns]')
+
+    def time_cut_int(self, bins):
+        pd.cut(self.int_series, bins)
+
+    def time_cut_float(self, bins):
+        pd.cut(self.float_series, bins)
+
+    def time_cut_timedelta(self, bins):
+        pd.cut(self.timedelta_series, bins)
+
+    def time_cut_datetime(self, bins):
+        pd.cut(self.datetime_series, bins)
+
+    def time_qcut_int(self, bins):
+        pd.qcut(self.int_series, bins)
+
+    def time_qcut_float(self, bins):
+        pd.qcut(self.float_series, bins)
+
+    def time_qcut_timedelta(self, bins):
+        pd.qcut(self.timedelta_series, bins)
+
+    def time_qcut_datetime(self, bins):
+        pd.qcut(self.datetime_series, bins)
+
+
 from .pandas_vb_common import setup  # noqa: F401
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
index 86294e33e1e06..659b6591fbd4b 100644
--- a/asv_bench/benchmarks/rolling.py
+++ b/asv_bench/benchmarks/rolling.py
@@ -21,6 +21,42 @@ def time_rolling(self, constructor, window, dtype, method):
         getattr(self.roll, method)()
 
 
+class ExpandingMethods(object):
+
+    sample_time = 0.2
+    params = (['DataFrame', 'Series'],
+              ['int', 'float'],
+              ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt',
+               'sum'])
+    param_names = ['constructor', 'dtype', 'method']
+
+    def setup(self, constructor, dtype, method):
+        N = 10**5
+        arr = (100 * np.random.random(N)).astype(dtype)
+        self.expanding = getattr(pd, constructor)(arr).expanding()
+
+    def time_expanding(self, constructor, dtype, method):
+        getattr(self.expanding, method)()
+
+
+class EWMMethods(object):
+
+    sample_time = 0.2
+    params = (['DataFrame', 'Series'],
+              [10, 1000],
+              ['int', 'float'],
+              ['mean', 'std'])
+    param_names = ['constructor', 'window', 'dtype', 'method']
+
+    def setup(self, constructor, window, dtype, method):
+        N = 10**5
+        arr = (100 * np.random.random(N)).astype(dtype)
+        self.ewm = getattr(pd, constructor)(arr).ewm(halflife=window)
+
+    def time_ewm(self, constructor, window, dtype, method):
+        getattr(self.ewm, method)()
+
+
 class VariableWindowMethods(Methods):
     sample_time = 0.2
     params = (['DataFrame', 'Series'],
diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py
index 5c777c00261e1..66ded52ca35b2 100644
--- a/asv_bench/benchmarks/stat_ops.py
+++ b/asv_bench/benchmarks/stat_ops.py
@@ -96,14 +96,42 @@ def time_average_old(self, constructor, pct):
 
 class Correlation(object):
 
-    params = ['spearman', 'kendall', 'pearson']
-    param_names = ['method']
+    params = [['spearman', 'kendall', 'pearson'], [True, False]]
+    param_names = ['method', 'use_bottleneck']
 
-    def setup(self, method):
+    def setup(self, method, use_bottleneck):
+        try:
+            pd.options.compute.use_bottleneck = use_bottleneck
+        except TypeError:
+            from pandas.core import nanops
+            nanops._USE_BOTTLENECK = use_bottleneck
         self.df = pd.DataFrame(np.random.randn(1000, 30))
+        self.s = pd.Series(np.random.randn(1000))
+        self.s2 = pd.Series(np.random.randn(1000))
 
-    def time_corr(self, method):
+    def time_corr(self, method, use_bottleneck):
         self.df.corr(method=method)
 
+    def time_corr_series(self, method, use_bottleneck):
+        self.s.corr(self.s2, method=method)
+
+
+class Covariance(object):
+
+    params = [[True, False]]
+    param_names = ['use_bottleneck']
+
+    def setup(self, use_bottleneck):
+        try:
+            pd.options.compute.use_bottleneck = use_bottleneck
+        except TypeError:
+            from pandas.core import nanops
+            nanops._USE_BOTTLENECK = use_bottleneck
+        self.s = pd.Series(np.random.randn(100000))
+        self.s2 = pd.Series(np.random.randn(100000))
+
+    def time_cov_series(self, use_bottleneck):
+        self.s.cov(self.s2)
+
 
 from .pandas_vb_common import setup  # noqa: F401
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index d880fb258560d..e9f2727f64e15 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -26,21 +26,42 @@ def time_extract(self):
     def time_findall(self):
         self.s.str.findall('[A-Z]+')
 
+    def time_find(self):
+        self.s.str.find('[A-Z]+')
+
+    def time_rfind(self):
+        self.s.str.rfind('[A-Z]+')
+
     def time_get(self):
         self.s.str.get(0)
 
     def time_len(self):
         self.s.str.len()
 
+    def time_join(self):
+        self.s.str.join(' ')
+
     def time_match(self):
         self.s.str.match('A')
 
+    def time_normalize(self):
+        self.s.str.normalize('NFC')
+
     def time_pad(self):
         self.s.str.pad(100, side='both')
 
+    def time_partition(self):
+        self.s.str.partition('A')
+
+    def time_rpartition(self):
+        self.s.str.rpartition('A')
+
     def time_replace(self):
         self.s.str.replace('A', '\x01\x01')
 
+    def time_translate(self):
+        self.s.str.translate({'A': '\x01\x01'})
+
     def time_slice(self):
         self.s.str.slice(5, 15, 2)
 
@@ -65,6 +86,12 @@ def time_upper(self):
     def time_lower(self):
         self.s.str.lower()
 
+    def time_wrap(self):
+        self.s.str.wrap(10)
+
+    def time_zfill(self):
+        self.s.str.zfill(10)
+
 
 class Repeat(object):
 
@@ -129,6 +156,9 @@ def setup(self, expand):
     def time_split(self, expand):
         self.s.str.split('--', expand=expand)
 
+    def time_rsplit(self, expand):
+        self.s.str.rsplit('--', expand=expand)
+
 
 class Dummies(object):
 
diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py
index 01d53fb9cbbd9..7ee73fb7ac7b6 100644
--- a/asv_bench/benchmarks/timedelta.py
+++ b/asv_bench/benchmarks/timedelta.py
@@ -1,7 +1,8 @@
 import datetime
 
 import numpy as np
-from pandas import Series, timedelta_range, to_timedelta, Timestamp, Timedelta
+from pandas import Series, timedelta_range, to_timedelta, Timestamp, \
+    Timedelta, TimedeltaIndex, DataFrame
 
 
 class TimedeltaConstructor(object):
@@ -116,3 +117,36 @@ def time_timedelta_microseconds(self, series):
 
     def time_timedelta_nanoseconds(self, series):
         series.dt.nanoseconds
+
+
+class TimedeltaIndexing(object):
+
+    def setup(self):
+        self.index = TimedeltaIndex(start='1985', periods=1000, freq='D')
+        self.index2 = TimedeltaIndex(start='1986', periods=1000, freq='D')
+        self.series = Series(range(1000), index=self.index)
+        self.timedelta = self.index[500]
+
+    def time_get_loc(self):
+        self.index.get_loc(self.timedelta)
+
+    def time_shape(self):
+        self.index.shape
+
+    def time_shallow_copy(self):
+        self.index._shallow_copy()
+
+    def time_series_loc(self):
+        self.series.loc[self.timedelta]
+
+    def time_align(self):
+        DataFrame({'a': self.series, 'b': self.series[:500]})
+
+    def time_intersection(self):
+        self.index.intersection(self.index2)
+
+    def time_union(self):
+        self.index.union(self.index2)
+
+    def time_unique(self):
+        self.index.unique()