From 9d53edb2b7904d8a710aeaff8ce1a32d8a23853b Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 11 Nov 2023 21:00:08 -0500 Subject: [PATCH 1/2] clean setup methods --- asv_bench/benchmarks/algorithms.py | 100 +++++++++++------------ asv_bench/benchmarks/array.py | 6 +- asv_bench/benchmarks/categoricals.py | 10 +-- asv_bench/benchmarks/frame_methods.py | 3 +- asv_bench/benchmarks/gil.py | 18 ++-- asv_bench/benchmarks/groupby.py | 2 +- asv_bench/benchmarks/index_object.py | 4 +- asv_bench/benchmarks/indexing_engines.py | 27 +++--- asv_bench/benchmarks/series_methods.py | 2 +- 9 files changed, 81 insertions(+), 91 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 192f19c36b47d..7b077c5b54e69 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -1,7 +1,6 @@ from importlib import import_module import numpy as np -import pyarrow as pa import pandas as pd @@ -20,9 +19,9 @@ class Factorize: [True, False], [True, False], [ - "int", - "uint", - "float", + "int64", + "uint64", + "float64", "object", "object_str", "datetime64[ns]", @@ -36,28 +35,24 @@ class Factorize: def setup(self, unique, sort, dtype): N = 10**5 - string_index = tm.makeStringIndex(N) - string_arrow = None - if dtype == "string[pyarrow]": - try: - string_arrow = pd.array(string_index, dtype="string[pyarrow]") - except ImportError: - raise NotImplementedError - - data = { - "int": pd.Index(np.arange(N), dtype="int64"), - "uint": pd.Index(np.arange(N), dtype="uint64"), - "float": pd.Index(np.random.randn(N), dtype="float64"), - "object_str": string_index, - "object": pd.Index(np.arange(N), dtype="object"), - "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N), - "datetime64[ns, tz]": pd.date_range( - "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" - ), - "Int64": pd.array(np.arange(N), dtype="Int64"), - "boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"), - "string[pyarrow]": string_arrow, - }[dtype] + + if dtype in ["int64", "uint64", "Int64", "object"]: + data = pd.Index(np.arange(N), dtype=dtype) + elif dtype == "float64": + data = pd.Index(np.random.randn(N), dtype=dtype) + elif dtype == "boolean": + data = pd.array(np.random.randint(0, 2, N), dtype=dtype) + elif dtype == "datetime64[ns]": + data = pd.date_range("2011-01-01", freq="H", periods=N) + elif dtype == "datetime64[ns, tz]": + data = pd.date_range("2011-01-01", freq="H", periods=N, tz="Asia/Tokyo") + elif dtype == "object_str": + data = tm.makeStringIndex(N) + elif dtype == "string[pyarrow]": + data = pd.array(tm.makeStringIndex(N), dtype="string[pyarrow]") + else: + raise NotImplementedError + if not unique: data = data.repeat(5) self.data = data @@ -74,9 +69,9 @@ class Duplicated: [True, False], ["first", "last", False], [ - "int", - "uint", - "float", + "int64", + "uint64", + "float64", "string", "datetime64[ns]", "datetime64[ns, tz]", @@ -88,22 +83,20 @@ class Duplicated: def setup(self, unique, keep, dtype): N = 10**5 - data = { - "int": pd.Index(np.arange(N), dtype="int64"), - "uint": pd.Index(np.arange(N), dtype="uint64"), - "float": pd.Index(np.random.randn(N), dtype="float64"), - "string": tm.makeStringIndex(N), - "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N), - "datetime64[ns, tz]": pd.date_range( - "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" - ), - "timestamp[ms][pyarrow]": pd.Index( - np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms")) - ), - "duration[s][pyarrow]": pd.Index( - np.arange(N), dtype=pd.ArrowDtype(pa.duration("s")) - ), - }[dtype] + if dtype in ["int64", "uint64"]: + data = pd.Index(np.arange(N), dtype=dtype) + elif dtype == "float": + data = pd.Index(np.random.randn(N), dtype="float64") + elif dtype == "string": + data = tm.makeStringIndex(N) + elif dtype == "datetime64[ns]": + data = pd.date_range("2011-01-01", freq="H", periods=N) + elif dtype == "datetime64[ns, tz]": + data = pd.date_range("2011-01-01", freq="H", periods=N, tz="Asia/Tokyo") + elif dtype in ["timestamp[ms][pyarrow]", "duration[s][pyarrow]"]: + data = pd.Index(np.arange(N), dtype=dtype) + else: + raise NotImplementedError if not unique: data = data.repeat(5) self.idx = data @@ -181,21 +174,22 @@ class Quantile: params = [ [0, 0.5, 1], ["linear", "nearest", "lower", "higher", "midpoint"], - ["float", "int", "uint"], + ["float64", "int64", "uint64"], ] param_names = ["quantile", "interpolation", "dtype"] def setup(self, quantile, interpolation, dtype): N = 10**5 - data = { - "int": np.arange(N), - "uint": np.arange(N).astype(np.uint64), - "float": np.random.randn(N), - } - self.idx = pd.Series(data[dtype].repeat(5)) + if dtype in ["int64", "uint64"]: + data = np.arange(N, dtype=dtype) + elif dtype == "float64": + data = np.random.randn(N) + else: + raise NotImplementedError + self.ser = pd.Series(data.repeat(5)) def time_quantile(self, quantile, interpolation, dtype): - self.idx.quantile(quantile, interpolation=interpolation) + self.ser.quantile(quantile, interpolation=interpolation) class SortIntegerArray: diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index 0229cf15fbfb8..aefc6e6d3c307 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -31,9 +31,9 @@ def time_from_float_array(self): class IntegerArray: def setup(self): N = 250_000 - self.values_integer = np.array([1, 0, 1, 0] * N) - self.data = np.array([1, 2, 3, 4] * N, dtype="int64") - self.mask = np.array([False, False, True, False] * N) + self.values_integer = np.tile(np.array([1, 0, 1, 0]), N) + self.data = np.tile(np.array([1, 2, 3, 4], dtype="int64"), N) + self.mask = np.tile(np.array([False, False, True, False]), N) def time_constructor(self): pd.arrays.IntegerArray(self.data, self.mask) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 84d4a28d675d5..7e70db5681850 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -260,18 +260,16 @@ class CategoricalSlicing: def setup(self, index): N = 10**6 categories = ["a", "b", "c"] - values = [0] * N + [1] * N + [2] * N if index == "monotonic_incr": - self.data = pd.Categorical.from_codes(values, categories=categories) + codes = np.repeat([0, 1, 2], N) elif index == "monotonic_decr": - self.data = pd.Categorical.from_codes( - list(reversed(values)), categories=categories - ) + codes = np.repeat([2, 1, 0], N) elif index == "non_monotonic": - self.data = pd.Categorical.from_codes([0, 1, 2] * N, categories=categories) + codes = np.tile([0, 1, 2], N) else: raise ValueError(f"Invalid index param: {index}") + self.data = pd.Categorical.from_codes(codes, categories=categories) self.scalar = 10000 self.list = list(range(10000)) self.cat_scalar = "b" diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index e56fbf1d8c32f..c0e024203d231 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -640,7 +640,8 @@ def time_frame_nunique(self): class SeriesNuniqueWithNan: def setup(self): - self.ser = Series(100000 * (100 * [np.nan] + list(range(100)))).astype(float) + values = 100 * [np.nan] + list(range(100)) + self.ser = Series(np.tile(values, 10000), dtype=float) def time_series_nunique_nan(self): self.ser.nunique() diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 4993ffd2c47d0..2cf7752d7c3e4 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -272,18 +272,20 @@ class ParallelReadCSV(BaseIO): def setup(self, dtype): rows = 10000 cols = 50 - data = { - "float": DataFrame(np.random.randn(rows, cols)), - "datetime": DataFrame( + if dtype == "float": + df = DataFrame(np.random.randn(rows, cols)) + elif dtype == "datetime": + df = DataFrame( np.random.randn(rows, cols), index=date_range("1/1/2000", periods=rows) - ), - "object": DataFrame( + ) + elif dtype == "object": + df = DataFrame( "foo", index=range(rows), columns=["object%03d" for _ in range(5)] - ), - } + ) + else: + raise NotImplementedError self.fname = f"__test_{dtype}__.csv" - df = data[dtype] df.to_csv(self.fname) @test_parallel(num_threads=2) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 54c240e84243a..1916247e8c3f1 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -713,7 +713,7 @@ def setup(self, dtype, tie_method): if dtype == "datetime64": data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype) else: - data = np.array([1] * N, dtype=dtype) + data = np.ones(N, dtype=dtype) self.df = DataFrame({"values": data, "key": ["foo"] * N}) def time_rank_ties(self, dtype, tie_method): diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 2d8014570466e..7e33223260e0f 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -161,9 +161,7 @@ def setup(self, dtype): self.sorted = self.idx.sort_values() half = N // 2 self.non_unique = self.idx[:half].append(self.idx[:half]) - self.non_unique_sorted = ( - self.sorted[:half].append(self.sorted[:half]).sort_values() - ) + self.non_unique_sorted = self.sorted[:half].repeat(2) self.key = self.sorted[N // 4] def time_boolean_array(self, dtype): diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index 6585a4be78dc6..fd3d0f0b9cf2e 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -71,14 +71,12 @@ def setup(self, engine_and_dtype, index_type, unique, N): if unique: arr = np.arange(N * 3, dtype=dtype) else: - values = list([1] * N + [2] * N + [3] * N) - arr = np.array(values, dtype=dtype) + arr = np.array([1, 2, 3], dtype=dtype).repeat(N) elif index_type == "monotonic_decr": if unique: arr = np.arange(N * 3, dtype=dtype)[::-1] else: - values = list([1] * N + [2] * N + [3] * N) - arr = np.array(values, dtype=dtype)[::-1] + arr = np.array([3, 2, 1], dtype=dtype).repeat(N) else: assert index_type == "non_monotonic" if unique: @@ -86,7 +84,7 @@ def setup(self, engine_and_dtype, index_type, unique, N): arr[:N] = np.arange(N * 2, N * 3, dtype=dtype) arr[N:] = np.arange(N * 2, dtype=dtype) else: - arr = np.array([1, 2, 3] * N, dtype=dtype) + arr = np.array([1, 2, 3], dtype=dtype).repeat(N) self.data = engine(arr) # code belows avoids populating the mapping etc. while timing. @@ -115,30 +113,29 @@ class MaskedNumericEngineIndexing: def setup(self, engine_and_dtype, index_type, unique, N): engine, dtype = engine_and_dtype + dtype = dtype.lower() if index_type == "monotonic_incr": if unique: - arr = np.arange(N * 3, dtype=dtype.lower()) + arr = np.arange(N * 3, dtype=dtype) else: - values = list([1] * N + [2] * N + [3] * N) - arr = np.array(values, dtype=dtype.lower()) + arr = np.array([1, 2, 3], dtype=dtype).repeat(N) mask = np.zeros(N * 3, dtype=np.bool_) elif index_type == "monotonic_decr": if unique: - arr = np.arange(N * 3, dtype=dtype.lower())[::-1] + arr = np.arange(N * 3, dtype=dtype)[::-1] else: - values = list([1] * N + [2] * N + [3] * N) - arr = np.array(values, dtype=dtype.lower())[::-1] + arr = np.array([3, 2, 1], dtype=dtype).repeat(N) mask = np.zeros(N * 3, dtype=np.bool_) else: assert index_type == "non_monotonic" if unique: - arr = np.zeros(N * 3, dtype=dtype.lower()) - arr[:N] = np.arange(N * 2, N * 3, dtype=dtype.lower()) - arr[N:] = np.arange(N * 2, dtype=dtype.lower()) + arr = np.zeros(N * 3, dtype=dtype) + arr[:N] = np.arange(N * 2, N * 3, dtype=dtype) + arr[N:] = np.arange(N * 2, dtype=dtype) else: - arr = np.array([1, 2, 3] * N, dtype=dtype.lower()) + arr = np.array([1, 2, 3], dtype=dtype).repeat(N) mask = np.zeros(N * 3, dtype=np.bool_) mask[-1] = True diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index f52f7a4bef37a..ee3f062e5fca3 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -317,7 +317,7 @@ def setup(self, func, N, dtype): if func == "argmax" and dtype in {"Int64", "boolean"}: # Skip argmax for nullable int since this doesn't work yet (GH-24382) raise NotImplementedError - self.s = Series([1] * N, dtype=dtype) + self.s = Series(np.ones(N), dtype=dtype) self.func = getattr(self.s, func) def time_func(self, func, N, dtype): From a2cea9e36c7e677aab68ace3c01abb696844f6e1 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 11 Nov 2023 22:20:58 -0500 Subject: [PATCH 2/2] fix dtype --- asv_bench/benchmarks/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index ea8b1b966c09f..3c78b0a9a60c8 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -85,7 +85,7 @@ def setup(self, unique, keep, dtype): N = 10**5 if dtype in ["int64", "uint64"]: data = pd.Index(np.arange(N), dtype=dtype) - elif dtype == "float": + elif dtype == "float64": data = pd.Index(np.random.randn(N), dtype="float64") elif dtype == "string": data = tm.makeStringIndex(N)