Skip to content

Commit c4f0e6f

Browse files
authored
ASV: clean/simplify setup methods (pandas-dev#55923)
* clean setup methods * fix dtype
1 parent 9a40316 commit c4f0e6f

9 files changed

+81
-91
lines changed

asv_bench/benchmarks/algorithms.py

+47-53
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
from importlib import import_module
22

33
import numpy as np
4-
import pyarrow as pa
54

65
import pandas as pd
76

@@ -20,9 +19,9 @@ class Factorize:
2019
[True, False],
2120
[True, False],
2221
[
23-
"int",
24-
"uint",
25-
"float",
22+
"int64",
23+
"uint64",
24+
"float64",
2625
"object",
2726
"object_str",
2827
"datetime64[ns]",
@@ -36,28 +35,24 @@ class Factorize:
3635

3736
def setup(self, unique, sort, dtype):
3837
N = 10**5
39-
string_index = tm.makeStringIndex(N)
40-
string_arrow = None
41-
if dtype == "string[pyarrow]":
42-
try:
43-
string_arrow = pd.array(string_index, dtype="string[pyarrow]")
44-
except ImportError:
45-
raise NotImplementedError
46-
47-
data = {
48-
"int": pd.Index(np.arange(N), dtype="int64"),
49-
"uint": pd.Index(np.arange(N), dtype="uint64"),
50-
"float": pd.Index(np.random.randn(N), dtype="float64"),
51-
"object_str": string_index,
52-
"object": pd.Index(np.arange(N), dtype="object"),
53-
"datetime64[ns]": pd.date_range("2011-01-01", freq="h", periods=N),
54-
"datetime64[ns, tz]": pd.date_range(
55-
"2011-01-01", freq="h", periods=N, tz="Asia/Tokyo"
56-
),
57-
"Int64": pd.array(np.arange(N), dtype="Int64"),
58-
"boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"),
59-
"string[pyarrow]": string_arrow,
60-
}[dtype]
38+
39+
if dtype in ["int64", "uint64", "Int64", "object"]:
40+
data = pd.Index(np.arange(N), dtype=dtype)
41+
elif dtype == "float64":
42+
data = pd.Index(np.random.randn(N), dtype=dtype)
43+
elif dtype == "boolean":
44+
data = pd.array(np.random.randint(0, 2, N), dtype=dtype)
45+
elif dtype == "datetime64[ns]":
46+
data = pd.date_range("2011-01-01", freq="h", periods=N)
47+
elif dtype == "datetime64[ns, tz]":
48+
data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo")
49+
elif dtype == "object_str":
50+
data = tm.makeStringIndex(N)
51+
elif dtype == "string[pyarrow]":
52+
data = pd.array(tm.makeStringIndex(N), dtype="string[pyarrow]")
53+
else:
54+
raise NotImplementedError
55+
6156
if not unique:
6257
data = data.repeat(5)
6358
self.data = data
@@ -74,9 +69,9 @@ class Duplicated:
7469
[True, False],
7570
["first", "last", False],
7671
[
77-
"int",
78-
"uint",
79-
"float",
72+
"int64",
73+
"uint64",
74+
"float64",
8075
"string",
8176
"datetime64[ns]",
8277
"datetime64[ns, tz]",
@@ -88,22 +83,20 @@ class Duplicated:
8883

8984
def setup(self, unique, keep, dtype):
9085
N = 10**5
91-
data = {
92-
"int": pd.Index(np.arange(N), dtype="int64"),
93-
"uint": pd.Index(np.arange(N), dtype="uint64"),
94-
"float": pd.Index(np.random.randn(N), dtype="float64"),
95-
"string": tm.makeStringIndex(N),
96-
"datetime64[ns]": pd.date_range("2011-01-01", freq="h", periods=N),
97-
"datetime64[ns, tz]": pd.date_range(
98-
"2011-01-01", freq="h", periods=N, tz="Asia/Tokyo"
99-
),
100-
"timestamp[ms][pyarrow]": pd.Index(
101-
np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms"))
102-
),
103-
"duration[s][pyarrow]": pd.Index(
104-
np.arange(N), dtype=pd.ArrowDtype(pa.duration("s"))
105-
),
106-
}[dtype]
86+
if dtype in ["int64", "uint64"]:
87+
data = pd.Index(np.arange(N), dtype=dtype)
88+
elif dtype == "float64":
89+
data = pd.Index(np.random.randn(N), dtype="float64")
90+
elif dtype == "string":
91+
data = tm.makeStringIndex(N)
92+
elif dtype == "datetime64[ns]":
93+
data = pd.date_range("2011-01-01", freq="h", periods=N)
94+
elif dtype == "datetime64[ns, tz]":
95+
data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo")
96+
elif dtype in ["timestamp[ms][pyarrow]", "duration[s][pyarrow]"]:
97+
data = pd.Index(np.arange(N), dtype=dtype)
98+
else:
99+
raise NotImplementedError
107100
if not unique:
108101
data = data.repeat(5)
109102
self.idx = data
@@ -181,21 +174,22 @@ class Quantile:
181174
params = [
182175
[0, 0.5, 1],
183176
["linear", "nearest", "lower", "higher", "midpoint"],
184-
["float", "int", "uint"],
177+
["float64", "int64", "uint64"],
185178
]
186179
param_names = ["quantile", "interpolation", "dtype"]
187180

188181
def setup(self, quantile, interpolation, dtype):
189182
N = 10**5
190-
data = {
191-
"int": np.arange(N),
192-
"uint": np.arange(N).astype(np.uint64),
193-
"float": np.random.randn(N),
194-
}
195-
self.idx = pd.Series(data[dtype].repeat(5))
183+
if dtype in ["int64", "uint64"]:
184+
data = np.arange(N, dtype=dtype)
185+
elif dtype == "float64":
186+
data = np.random.randn(N)
187+
else:
188+
raise NotImplementedError
189+
self.ser = pd.Series(data.repeat(5))
196190

197191
def time_quantile(self, quantile, interpolation, dtype):
198-
self.idx.quantile(quantile, interpolation=interpolation)
192+
self.ser.quantile(quantile, interpolation=interpolation)
199193

200194

201195
class SortIntegerArray:

asv_bench/benchmarks/array.py

+3-3
Original file line number | Diff line number | Diff line change
@@ -31,9 +31,9 @@ def time_from_float_array(self):
3131
class IntegerArray:
3232
def setup(self):
3333
N = 250_000
34-
self.values_integer = np.array([1, 0, 1, 0] * N)
35-
self.data = np.array([1, 2, 3, 4] * N, dtype="int64")
36-
self.mask = np.array([False, False, True, False] * N)
34+
self.values_integer = np.tile(np.array([1, 0, 1, 0]), N)
35+
self.data = np.tile(np.array([1, 2, 3, 4], dtype="int64"), N)
36+
self.mask = np.tile(np.array([False, False, True, False]), N)
3737

3838
def time_constructor(self):
3939
pd.arrays.IntegerArray(self.data, self.mask)

asv_bench/benchmarks/categoricals.py

+4-6
Original file line number | Diff line number | Diff line change
@@ -260,18 +260,16 @@ class CategoricalSlicing:
260260
def setup(self, index):
261261
N = 10**6
262262
categories = ["a", "b", "c"]
263-
values = [0] * N + [1] * N + [2] * N
264263
if index == "monotonic_incr":
265-
self.data = pd.Categorical.from_codes(values, categories=categories)
264+
codes = np.repeat([0, 1, 2], N)
266265
elif index == "monotonic_decr":
267-
self.data = pd.Categorical.from_codes(
268-
list(reversed(values)), categories=categories
269-
)
266+
codes = np.repeat([2, 1, 0], N)
270267
elif index == "non_monotonic":
271-
self.data = pd.Categorical.from_codes([0, 1, 2] * N, categories=categories)
268+
codes = np.tile([0, 1, 2], N)
272269
else:
273270
raise ValueError(f"Invalid index param: {index}")
274271

272+
self.data = pd.Categorical.from_codes(codes, categories=categories)
275273
self.scalar = 10000
276274
self.list = list(range(10000))
277275
self.cat_scalar = "b"

asv_bench/benchmarks/frame_methods.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -640,7 +640,8 @@ def time_frame_nunique(self):
640640

641641
class SeriesNuniqueWithNan:
642642
def setup(self):
643-
self.ser = Series(100000 * (100 * [np.nan] + list(range(100)))).astype(float)
643+
values = 100 * [np.nan] + list(range(100))
644+
self.ser = Series(np.tile(values, 10000), dtype=float)
644645

645646
def time_series_nunique_nan(self):
646647
self.ser.nunique()

asv_bench/benchmarks/gil.py

+10-8
Original file line number | Diff line number | Diff line change
@@ -272,18 +272,20 @@ class ParallelReadCSV(BaseIO):
272272
def setup(self, dtype):
273273
rows = 10000
274274
cols = 50
275-
data = {
276-
"float": DataFrame(np.random.randn(rows, cols)),
277-
"datetime": DataFrame(
275+
if dtype == "float":
276+
df = DataFrame(np.random.randn(rows, cols))
277+
elif dtype == "datetime":
278+
df = DataFrame(
278279
np.random.randn(rows, cols), index=date_range("1/1/2000", periods=rows)
279-
),
280-
"object": DataFrame(
280+
)
281+
elif dtype == "object":
282+
df = DataFrame(
281283
"foo", index=range(rows), columns=["object%03d" for _ in range(5)]
282-
),
283-
}
284+
)
285+
else:
286+
raise NotImplementedError
284287

285288
self.fname = f"__test_{dtype}__.csv"
286-
df = data[dtype]
287289
df.to_csv(self.fname)
288290

289291
@test_parallel(num_threads=2)

asv_bench/benchmarks/groupby.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -713,7 +713,7 @@ def setup(self, dtype, tie_method):
713713
if dtype == "datetime64":
714714
data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype)
715715
else:
716-
data = np.array([1] * N, dtype=dtype)
716+
data = np.ones(N, dtype=dtype)
717717
self.df = DataFrame({"values": data, "key": ["foo"] * N})
718718

719719
def time_rank_ties(self, dtype, tie_method):

asv_bench/benchmarks/index_object.py

+1-3
Original file line number | Diff line number | Diff line change
@@ -161,9 +161,7 @@ def setup(self, dtype):
161161
self.sorted = self.idx.sort_values()
162162
half = N // 2
163163
self.non_unique = self.idx[:half].append(self.idx[:half])
164-
self.non_unique_sorted = (
165-
self.sorted[:half].append(self.sorted[:half]).sort_values()
166-
)
164+
self.non_unique_sorted = self.sorted[:half].repeat(2)
167165
self.key = self.sorted[N // 4]
168166

169167
def time_boolean_array(self, dtype):

asv_bench/benchmarks/indexing_engines.py

+12-15
Original file line number | Diff line number | Diff line change
@@ -71,22 +71,20 @@ def setup(self, engine_and_dtype, index_type, unique, N):
7171
if unique:
7272
arr = np.arange(N * 3, dtype=dtype)
7373
else:
74-
values = list([1] * N + [2] * N + [3] * N)
75-
arr = np.array(values, dtype=dtype)
74+
arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
7675
elif index_type == "monotonic_decr":
7776
if unique:
7877
arr = np.arange(N * 3, dtype=dtype)[::-1]
7978
else:
80-
values = list([1] * N + [2] * N + [3] * N)
81-
arr = np.array(values, dtype=dtype)[::-1]
79+
arr = np.array([3, 2, 1], dtype=dtype).repeat(N)
8280
else:
8381
assert index_type == "non_monotonic"
8482
if unique:
8583
arr = np.empty(N * 3, dtype=dtype)
8684
arr[:N] = np.arange(N * 2, N * 3, dtype=dtype)
8785
arr[N:] = np.arange(N * 2, dtype=dtype)
8886
else:
89-
arr = np.array([1, 2, 3] * N, dtype=dtype)
87+
arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
9088

9189
self.data = engine(arr)
9290
# code below avoids populating the mapping etc. while timing.
@@ -115,30 +113,29 @@ class MaskedNumericEngineIndexing:
115113

116114
def setup(self, engine_and_dtype, index_type, unique, N):
117115
engine, dtype = engine_and_dtype
116+
dtype = dtype.lower()
118117

119118
if index_type == "monotonic_incr":
120119
if unique:
121-
arr = np.arange(N * 3, dtype=dtype.lower())
120+
arr = np.arange(N * 3, dtype=dtype)
122121
else:
123-
values = list([1] * N + [2] * N + [3] * N)
124-
arr = np.array(values, dtype=dtype.lower())
122+
arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
125123
mask = np.zeros(N * 3, dtype=np.bool_)
126124
elif index_type == "monotonic_decr":
127125
if unique:
128-
arr = np.arange(N * 3, dtype=dtype.lower())[::-1]
126+
arr = np.arange(N * 3, dtype=dtype)[::-1]
129127
else:
130-
values = list([1] * N + [2] * N + [3] * N)
131-
arr = np.array(values, dtype=dtype.lower())[::-1]
128+
arr = np.array([3, 2, 1], dtype=dtype).repeat(N)
132129
mask = np.zeros(N * 3, dtype=np.bool_)
133130
else:
134131
assert index_type == "non_monotonic"
135132
if unique:
136-
arr = np.zeros(N * 3, dtype=dtype.lower())
137-
arr[:N] = np.arange(N * 2, N * 3, dtype=dtype.lower())
138-
arr[N:] = np.arange(N * 2, dtype=dtype.lower())
133+
arr = np.zeros(N * 3, dtype=dtype)
134+
arr[:N] = np.arange(N * 2, N * 3, dtype=dtype)
135+
arr[N:] = np.arange(N * 2, dtype=dtype)
139136

140137
else:
141-
arr = np.array([1, 2, 3] * N, dtype=dtype.lower())
138+
arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
142139
mask = np.zeros(N * 3, dtype=np.bool_)
143140
mask[-1] = True
144141

asv_bench/benchmarks/series_methods.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -317,7 +317,7 @@ def setup(self, func, N, dtype):
317317
if func == "argmax" and dtype in {"Int64", "boolean"}:
318318
# Skip argmax for nullable int since this doesn't work yet (GH-24382)
319319
raise NotImplementedError
320-
self.s = Series([1] * N, dtype=dtype)
320+
self.s = Series(np.ones(N), dtype=dtype)
321321
self.func = getattr(self.s, func)
322322

323323
def time_func(self, func, N, dtype):

0 commit comments

Comments
 (0)