diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 7dcd7b284d66d..34fb161e5afcb 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -16,63 +16,75 @@ class Factorize(object): - params = [True, False] - param_names = ['sort'] + params = [[True, False], ['int', 'uint', 'float', 'string']] + param_names = ['sort', 'dtype'] - def setup(self, sort): + def setup(self, sort, dtype): N = 10**5 - self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) - self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) - self.string_idx = tm.makeStringIndex(N) + data = {'int': pd.Int64Index(np.arange(N).repeat(5)), + 'uint': pd.UInt64Index(np.arange(N).repeat(5)), + 'float': pd.Float64Index(np.random.randn(N).repeat(5)), + 'string': tm.makeStringIndex(N).repeat(5)} + self.idx = data[dtype] - def time_factorize_int(self, sort): - self.int_idx.factorize(sort=sort) + def time_factorize(self, sort, dtype): + self.idx.factorize(sort=sort) - def time_factorize_float(self, sort): - self.float_idx.factorize(sort=sort) - def time_factorize_string(self, sort): - self.string_idx.factorize(sort=sort) +class FactorizeUnique(object): + params = [[True, False], ['int', 'uint', 'float', 'string']] + param_names = ['sort', 'dtype'] -class Duplicated(object): - - params = ['first', 'last', False] - param_names = ['keep'] - - def setup(self, keep): + def setup(self, sort, dtype): N = 10**5 - self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) - self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) - self.string_idx = tm.makeStringIndex(N) - - def time_duplicated_int(self, keep): - self.int_idx.duplicated(keep=keep) + data = {'int': pd.Int64Index(np.arange(N)), + 'uint': pd.UInt64Index(np.arange(N)), + 'float': pd.Float64Index(np.arange(N)), + 'string': tm.makeStringIndex(N)} + self.idx = data[dtype] + assert self.idx.is_unique - def time_duplicated_float(self, keep): - self.float_idx.duplicated(keep=keep) + def time_factorize(self, sort, dtype): + self.idx.factorize(sort=sort) - def time_duplicated_string(self, keep): - self.string_idx.duplicated(keep=keep) +class Duplicated(object): -class DuplicatedUniqueIndex(object): + params = [['first', 'last', False], ['int', 'uint', 'float', 'string']] + param_names = ['keep', 'dtype'] - def setup(self): + def setup(self, keep, dtype): N = 10**5 - self.idx_int_dup = pd.Int64Index(np.arange(N * 5)) + data = {'int': pd.Int64Index(np.arange(N).repeat(5)), + 'uint': pd.UInt64Index(np.arange(N).repeat(5)), + 'float': pd.Float64Index(np.random.randn(N).repeat(5)), + 'string': tm.makeStringIndex(N).repeat(5)} + self.idx = data[dtype] # cache is_unique - self.idx_int_dup.is_unique + self.idx.is_unique + + def time_duplicated(self, keep, dtype): + self.idx.duplicated(keep=keep) + - def time_duplicated_unique_int(self): - self.idx_int_dup.duplicated() +class DuplicatedUniqueIndex(object): + params = ['int', 'uint', 'float', 'string'] + param_names = ['dtype'] -class Match(object): + def setup(self, dtype): + N = 10**5 + data = {'int': pd.Int64Index(np.arange(N)), + 'uint': pd.UInt64Index(np.arange(N)), + 'float': pd.Float64Index(np.random.randn(N)), + 'string': tm.makeStringIndex(N)} + self.idx = data[dtype] + # cache is_unique + self.idx.is_unique - def setup(self): - self.uniques = tm.makeStringIndex(1000).values - self.all = self.uniques.repeat(10) + def time_duplicated_unique(self, dtype): + self.idx.duplicated() class Hashing(object): @@ -113,4 +125,21 @@ def time_series_dates(self, df): hashing.hash_pandas_object(df['dates']) +class Quantile(object): + params = [[0, 0.5, 1], + ['linear', 'nearest', 'lower', 'higher', 'midpoint'], + ['float', 'int', 'uint']] + param_names = ['quantile', 'interpolation', 'dtype'] + + def setup(self, quantile, interpolation, dtype): + N = 10**5 + data = {'int': np.arange(N), + 'uint': np.arange(N).astype(np.uint64), + 'float': np.random.randn(N)} + self.idx = pd.Series(data[dtype].repeat(5)) + + def time_quantile(self, quantile, interpolation, dtype): + self.idx.quantile(quantile, interpolation=interpolation) + + from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 7318b40efc8fb..e5dab0cb066aa 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -84,7 +84,8 @@ class ValueCounts(object): def setup(self, dropna): n = 5 * 10**5 - arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)] + arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10, + size=n)] self.ts = pd.Series(arr).astype('category') def time_value_counts(self, dropna): @@ -104,13 +105,26 @@ class SetCategories(object): def setup(self): n = 5 * 10**5 - arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)] + arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10, + size=n)] self.ts = pd.Series(arr).astype('category') def time_set_categories(self): self.ts.cat.set_categories(self.ts.cat.categories[::2]) +class RemoveCategories(object): + + def setup(self): + n = 5 * 10**5 + arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10, + size=n)] + self.ts = pd.Series(arr).astype('category') + + def time_remove_categories(self): + self.ts.cat.remove_categories(self.ts.cat.categories[::2]) + + class Rank(object): def setup(self): @@ -159,7 +173,7 @@ def setup(self, dtype): sample_size = 100 arr = [i for i in np.random.randint(0, n // 10, size=n)] if dtype == 'object': - arr = ['s%04d' % i for i in arr] + arr = ['s{:04d}'.format(i) for i in arr] self.sample = np.random.choice(arr, sample_size) self.series = pd.Series(arr).astype('category') @@ -236,4 +250,40 @@ def time_getitem_bool_array(self, index): self.data[self.data == self.cat_scalar] +class Indexing(object): + + def setup(self): + N = 10**5 + self.index = pd.CategoricalIndex(range(N), range(N)) + self.series = pd.Series(range(N), index=self.index).sort_index() + self.category = self.index[500] + + def time_get_loc(self): + self.index.get_loc(self.category) + + def time_shape(self): + self.index.shape + + def time_shallow_copy(self): + self.index._shallow_copy() + + def time_align(self): + pd.DataFrame({'a': self.series, 'b': self.series[:500]}) + + def time_intersection(self): + self.index[:750].intersection(self.index[250:]) + + def time_unique(self): + self.index.unique() + + def time_reindex(self): + self.index.reindex(self.index[:500]) + + def time_reindex_missing(self): + self.index.reindex(['a', 'b', 'c', 'd']) + + def time_sort_values(self): + self.index.sort_values(ascending=False) + + from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index a1cdb00260fc4..6da8287a06d80 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -48,6 +48,7 @@ def setup(self, axis): index=date_range('20130101', periods=N, freq='s')) self.empty_left = [DataFrame(), df] self.empty_right = [df, DataFrame()] + self.mixed_ndims = [df, df.head(N // 2)] def time_concat_series(self, axis): concat(self.series, axis=axis, sort=False) @@ -61,6 +62,9 @@ def time_concat_empty_right(self, axis): def time_concat_empty_left(self, axis): concat(self.empty_left, axis=axis) + def time_concat_mixed_ndims(self, axis): + concat(self.mixed_ndims, axis=axis) + class ConcatPanels(object): @@ -274,8 +278,10 @@ def time_merge_ordered(self): class MergeAsof(object): + params = [['backward', 'forward', 'nearest']] + param_names = ['direction'] - def setup(self): + def setup(self, direction): one_count = 200000 two_count = 1000000 @@ -307,20 +313,23 @@ def setup(self): self.df1e = df1[['time', 'key', 'key2', 'value1']] self.df2e = df2[['time', 'key', 'key2', 'value2']] - def time_on_int(self): - merge_asof(self.df1a, self.df2a, on='time') + def time_on_int(self, direction): + merge_asof(self.df1a, self.df2a, on='time', direction=direction) - def time_on_int32(self): - merge_asof(self.df1d, self.df2d, on='time32') + def time_on_int32(self, direction): + merge_asof(self.df1d, self.df2d, on='time32', direction=direction) - def time_by_object(self): - merge_asof(self.df1b, self.df2b, on='time', by='key') + def time_by_object(self, direction): + merge_asof(self.df1b, self.df2b, on='time', by='key', + direction=direction) - def time_by_int(self): - merge_asof(self.df1c, self.df2c, on='time', by='key2') + def time_by_int(self, direction): + merge_asof(self.df1c, self.df2c, on='time', by='key2', + direction=direction) - def time_multiby(self): - merge_asof(self.df1e, self.df2e, on='time', by=['key', 'key2']) + def time_multiby(self, direction): + merge_asof(self.df1e, self.df2e, on='time', by=['key', 'key2'], + direction=direction) class Align(object): diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index ab5e5fd3bfe10..d479952cbfbf6 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -16,14 +16,17 @@ np.float64, np.int16, np.int8, np.uint16, np.uint8] datetime_dtypes = [np.datetime64, np.timedelta64] string_dtypes = [np.object] -extension_dtypes = [pd.Int8Dtype, pd.Int16Dtype, - pd.Int32Dtype, pd.Int64Dtype, - pd.UInt8Dtype, pd.UInt16Dtype, - pd.UInt32Dtype, pd.UInt64Dtype, - pd.CategoricalDtype, - pd.IntervalDtype, - pd.DatetimeTZDtype('ns', 'UTC'), - pd.PeriodDtype('D')] +try: + extension_dtypes = [pd.Int8Dtype, pd.Int16Dtype, + pd.Int32Dtype, pd.Int64Dtype, + pd.UInt8Dtype, pd.UInt16Dtype, + pd.UInt32Dtype, pd.UInt64Dtype, + pd.CategoricalDtype, + pd.IntervalDtype, + pd.DatetimeTZDtype('ns', 'UTC'), + pd.PeriodDtype('D')] +except AttributeError: + extension_dtypes = [] def setup(*args, **kwargs): diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py index 4f0bbb1690d4b..8a67af0bdabd1 100644 --- a/asv_bench/benchmarks/plotting.py +++ b/asv_bench/benchmarks/plotting.py @@ -74,6 +74,9 @@ def time_plot_regular_compat(self): def time_plot_irregular(self): self.df2.plot() + def time_plot_table(self): + self.df.plot(table=True) + class Misc(object): diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index fb47fa81d8dfd..3080b34024a33 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -1,7 +1,7 @@ import numpy as np import pandas.util.testing as tm -from pandas import (DataFrame, Series, MultiIndex, Index, - date_range) +from pandas import (DataFrame, Series, MultiIndex, Index, date_range, + period_range) from .pandas_vb_common import lib @@ -35,15 +35,15 @@ def time_reindex_multiindex(self): class ReindexMethod(object): - params = ['pad', 'backfill'] - param_names = ['method'] + params = [['pad', 'backfill'], [date_range, period_range]] + param_names = ['method', 'constructor'] - def setup(self, method): + def setup(self, method, constructor): N = 100000 - self.idx = date_range('1/1/2000', periods=N, freq='1min') + self.idx = constructor('1/1/2000', periods=N, freq='1min') self.ts = Series(np.random.randn(N), index=self.idx)[::2] - def time_reindex_method(self, method): + def time_reindex_method(self, method, constructor): self.ts.reindex(self.idx, method=method) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index e5c2f54263a3c..f6ee107ab618e 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -131,6 +131,38 @@ def setup(self): def time_pivot_table(self): self.df.pivot_table(index='key1', columns=['key2', 'key3']) + def time_pivot_table_agg(self): + self.df.pivot_table(index='key1', columns=['key2', 'key3'], + aggfunc=['sum', 'mean']) + + def time_pivot_table_margins(self): + self.df.pivot_table(index='key1', columns=['key2', 'key3'], + margins=True) + + +class Crosstab(object): + + def setup(self): + N = 100000 + fac1 = np.array(['A', 'B', 'C'], dtype='O') + fac2 = np.array(['one', 'two'], dtype='O') + self.ind1 = np.random.randint(0, 3, size=N) + self.ind2 = np.random.randint(0, 2, size=N) + self.vec1 = fac1.take(self.ind1) + self.vec2 = fac2.take(self.ind2) + + def time_crosstab(self): + pd.crosstab(self.vec1, self.vec2) + + def time_crosstab_values(self): + pd.crosstab(self.vec1, self.vec2, values=self.ind1, aggfunc='sum') + + def time_crosstab_normalize(self): + pd.crosstab(self.vec1, self.vec2, normalize=True) + + def time_crosstab_normalize_margins(self): + pd.crosstab(self.vec1, self.vec2, normalize=True, margins=True) + class GetDummies(object): def setup(self): diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 46fb5011cc1a5..5b0981dc10a8a 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -23,7 +23,7 @@ def time_constructor(self, data): class IsIn(object): - params = ['int64', 'object'] + params = ['int64', 'uint64', 'object'] param_names = ['dtype'] def setup(self, dtype): @@ -150,7 +150,7 @@ def time_clip(self): class ValueCounts(object): - params = ['int', 'float', 'object'] + params = ['int', 'uint', 'float', 'object'] param_names = ['dtype'] def setup(self, dtype): diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index dc31d23105845..6efd720d1acdd 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -342,6 +342,15 @@ def time_different_offset(self): to_datetime(self.diff_offset) +class ToDatetimeFormatQuarters(object): + + def setup(self): + self.s = Series(['2Q2005', '2Q05', '2005Q1', '05Q1'] * 10000) + + def time_infer_quarter(self): + to_datetime(self.s) + + class ToDatetimeFormat(object): def setup(self): diff --git a/asv_bench/benchmarks/timestamp.py b/asv_bench/benchmarks/timestamp.py index 4c1d6e8533408..b45ae22650e17 100644 --- a/asv_bench/benchmarks/timestamp.py +++ b/asv_bench/benchmarks/timestamp.py @@ -77,17 +77,20 @@ def time_is_quarter_end(self, tz, freq): self.ts.is_quarter_end def time_is_year_start(self, tz, freq): - self.ts.is_quarter_end + self.ts.is_year_start def time_is_year_end(self, tz, freq): - self.ts.is_quarter_end + self.ts.is_year_end def time_is_leap_year(self, tz, freq): - self.ts.is_quarter_end + self.ts.is_leap_year def time_microsecond(self, tz, freq): self.ts.microsecond + def time_month_name(self, tz, freq): + self.ts.month_name() + class TimestampOps(object): params = [None, 'US/Eastern', pytz.UTC, @@ -117,6 +120,15 @@ def time_tz_localize(self, tz): if self.ts.tz is None: self.ts.tz_localize(tz) + def time_to_julian_date(self, tz): + self.ts.to_julian_date() + + def time_floor(self, tz): + self.ts.floor('5T') + + def time_ceil(self, tz): + self.ts.ceil('5T') + class TimestampAcrossDst(object): def setup(self):