
ASV: clean/simplify setup methods #55923


Merged · 3 commits · Nov 14, 2023

100 changes: 47 additions & 53 deletions asv_bench/benchmarks/algorithms.py
@@ -1,7 +1,6 @@
from importlib import import_module

import numpy as np
import pyarrow as pa

import pandas as pd

@@ -20,9 +19,9 @@ class Factorize:
[True, False],
[True, False],
[
"int",
"uint",
"float",
"int64",
"uint64",
"float64",
"object",
"object_str",
"datetime64[ns]",
@@ -36,28 +35,24 @@ class Factorize:

def setup(self, unique, sort, dtype):
N = 10**5
string_index = tm.makeStringIndex(N)
string_arrow = None
if dtype == "string[pyarrow]":
try:
string_arrow = pd.array(string_index, dtype="string[pyarrow]")
except ImportError:
raise NotImplementedError

data = {
"int": pd.Index(np.arange(N), dtype="int64"),
"uint": pd.Index(np.arange(N), dtype="uint64"),
"float": pd.Index(np.random.randn(N), dtype="float64"),
"object_str": string_index,
"object": pd.Index(np.arange(N), dtype="object"),
"datetime64[ns]": pd.date_range("2011-01-01", freq="h", periods=N),
"datetime64[ns, tz]": pd.date_range(
"2011-01-01", freq="h", periods=N, tz="Asia/Tokyo"
),
"Int64": pd.array(np.arange(N), dtype="Int64"),
"boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"),
"string[pyarrow]": string_arrow,
}[dtype]

if dtype in ["int64", "uint64", "Int64", "object"]:
data = pd.Index(np.arange(N), dtype=dtype)
elif dtype == "float64":
data = pd.Index(np.random.randn(N), dtype=dtype)
elif dtype == "boolean":
data = pd.array(np.random.randint(0, 2, N), dtype=dtype)
elif dtype == "datetime64[ns]":
data = pd.date_range("2011-01-01", freq="h", periods=N)
elif dtype == "datetime64[ns, tz]":
data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo")
elif dtype == "object_str":
data = tm.makeStringIndex(N)
elif dtype == "string[pyarrow]":
data = pd.array(tm.makeStringIndex(N), dtype="string[pyarrow]")
else:
raise NotImplementedError

if not unique:
data = data.repeat(5)
self.data = data
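For reference, a minimal standalone sketch of what one branch of the simplified setup produces (N and dtype are illustrative here; the benchmark uses N = 10**5 and parametrizes dtype):

    import numpy as np
    import pandas as pd

    N = 10  # illustrative; the benchmark uses 10**5
    dtype = "int64"
    data = pd.Index(np.arange(N), dtype=dtype)
    # the non-unique variant repeats each element 5 times
    data = data.repeat(5)
    assert len(data) == 5 * N and data.nunique() == N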
@@ -74,9 +69,9 @@ class Duplicated:
[True, False],
["first", "last", False],
[
"int",
"uint",
"float",
"int64",
"uint64",
"float64",
"string",
"datetime64[ns]",
"datetime64[ns, tz]",
@@ -88,22 +83,20 @@ class Duplicated:

def setup(self, unique, keep, dtype):
N = 10**5
data = {
"int": pd.Index(np.arange(N), dtype="int64"),
"uint": pd.Index(np.arange(N), dtype="uint64"),
"float": pd.Index(np.random.randn(N), dtype="float64"),
"string": tm.makeStringIndex(N),
"datetime64[ns]": pd.date_range("2011-01-01", freq="h", periods=N),
"datetime64[ns, tz]": pd.date_range(
"2011-01-01", freq="h", periods=N, tz="Asia/Tokyo"
),
"timestamp[ms][pyarrow]": pd.Index(
np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms"))
),
"duration[s][pyarrow]": pd.Index(
np.arange(N), dtype=pd.ArrowDtype(pa.duration("s"))
),
}[dtype]
if dtype in ["int64", "uint64"]:
data = pd.Index(np.arange(N), dtype=dtype)
elif dtype == "float64":
data = pd.Index(np.random.randn(N), dtype="float64")
elif dtype == "string":
data = tm.makeStringIndex(N)
elif dtype == "datetime64[ns]":
data = pd.date_range("2011-01-01", freq="h", periods=N)
elif dtype == "datetime64[ns, tz]":
data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo")
elif dtype in ["timestamp[ms][pyarrow]", "duration[s][pyarrow]"]:
data = pd.Index(np.arange(N), dtype=dtype)
else:
raise NotImplementedError
if not unique:
data = data.repeat(5)
self.idx = data
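A small sketch (assuming pyarrow is installed) of why the pyarrow-backed branches can reuse pd.Index(np.arange(N), dtype=dtype): the dtype strings are aliases that pandas resolves to ArrowDtype:

    import numpy as np
    import pandas as pd

    # "timestamp[ms][pyarrow]" resolves to pd.ArrowDtype(pa.timestamp("ms"))
    idx = pd.Index(np.arange(4), dtype="timestamp[ms][pyarrow]")
    assert str(idx.dtype) == "timestamp[ms][pyarrow]"
    assert not idx.duplicated().any()  # unique input, so nothing is flagged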
@@ -181,21 +174,22 @@ class Quantile:
params = [
[0, 0.5, 1],
["linear", "nearest", "lower", "higher", "midpoint"],
["float", "int", "uint"],
["float64", "int64", "uint64"],
]
param_names = ["quantile", "interpolation", "dtype"]

def setup(self, quantile, interpolation, dtype):
N = 10**5
data = {
"int": np.arange(N),
"uint": np.arange(N).astype(np.uint64),
"float": np.random.randn(N),
}
self.idx = pd.Series(data[dtype].repeat(5))
if dtype in ["int64", "uint64"]:
data = np.arange(N, dtype=dtype)
elif dtype == "float64":
data = np.random.randn(N)
else:
raise NotImplementedError
self.ser = pd.Series(data.repeat(5))

def time_quantile(self, quantile, interpolation, dtype):
self.idx.quantile(quantile, interpolation=interpolation)
self.ser.quantile(quantile, interpolation=interpolation)
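A quick standalone illustration of the renamed attribute and the dtype= argument to np.arange that replaces the old .astype(np.uint64) call (sizes are illustrative):

    import numpy as np
    import pandas as pd

    N = 10  # illustrative; the benchmark uses 10**5
    assert np.array_equal(np.arange(N, dtype="uint64"), np.arange(N).astype(np.uint64))
    ser = pd.Series(np.arange(N, dtype="uint64").repeat(5))
    ser.quantile(0.5, interpolation="nearest")  # the operation time_quantile measures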


class SortIntegerArray:
6 changes: 3 additions & 3 deletions asv_bench/benchmarks/array.py
@@ -31,9 +31,9 @@ def time_from_float_array(self):
class IntegerArray:
def setup(self):
N = 250_000
self.values_integer = np.array([1, 0, 1, 0] * N)
self.data = np.array([1, 2, 3, 4] * N, dtype="int64")
self.mask = np.array([False, False, True, False] * N)
self.values_integer = np.tile(np.array([1, 0, 1, 0]), N)
self.data = np.tile(np.array([1, 2, 3, 4], dtype="int64"), N)
self.mask = np.tile(np.array([False, False, True, False]), N)

def time_constructor(self):
pd.arrays.IntegerArray(self.data, self.mask)
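np.tile repeats the whole pattern, so it is a drop-in replacement for the old list-multiplication idiom; a minimal check with an illustrative repeat count:

    import numpy as np

    N = 3  # illustrative; the benchmark uses 250_000
    assert np.array_equal(np.tile(np.array([1, 0, 1, 0]), N), np.array([1, 0, 1, 0] * N))
    assert np.array_equal(
        np.tile(np.array([1, 2, 3, 4], dtype="int64"), N),
        np.array([1, 2, 3, 4] * N, dtype="int64"),
    )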
10 changes: 4 additions & 6 deletions asv_bench/benchmarks/categoricals.py
@@ -260,18 +260,16 @@ class CategoricalSlicing:
def setup(self, index):
N = 10**6
categories = ["a", "b", "c"]
values = [0] * N + [1] * N + [2] * N
if index == "monotonic_incr":
self.data = pd.Categorical.from_codes(values, categories=categories)
codes = np.repeat([0, 1, 2], N)
elif index == "monotonic_decr":
self.data = pd.Categorical.from_codes(
list(reversed(values)), categories=categories
)
codes = np.repeat([2, 1, 0], N)
elif index == "non_monotonic":
self.data = pd.Categorical.from_codes([0, 1, 2] * N, categories=categories)
codes = np.tile([0, 1, 2], N)
else:
raise ValueError(f"Invalid index param: {index}")

self.data = pd.Categorical.from_codes(codes, categories=categories)
self.scalar = 10000
self.list = list(range(10000))
self.cat_scalar = "b"
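A small sketch (illustrative N) of the single from_codes call fed by np.repeat / np.tile, which replaces the three separate constructions above:

    import numpy as np
    import pandas as pd

    N = 2  # illustrative; the benchmark uses 10**6
    categories = ["a", "b", "c"]
    # monotonic-increasing codes, formerly written as [0] * N + [1] * N + [2] * N
    cat = pd.Categorical.from_codes(np.repeat([0, 1, 2], N), categories=categories)
    assert list(cat) == ["a", "a", "b", "b", "c", "c"]
    # non-monotonic codes, formerly written as [0, 1, 2] * N
    cat = pd.Categorical.from_codes(np.tile([0, 1, 2], N), categories=categories)
    assert list(cat) == ["a", "b", "c", "a", "b", "c"]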
3 changes: 2 additions & 1 deletion asv_bench/benchmarks/frame_methods.py
@@ -640,7 +640,8 @@ def time_frame_nunique(self):

class SeriesNuniqueWithNan:
def setup(self):
self.ser = Series(100000 * (100 * [np.nan] + list(range(100)))).astype(float)
values = 100 * [np.nan] + list(range(100))
self.ser = Series(np.tile(values, 10000), dtype=float)

def time_series_nunique_nan(self):
self.ser.nunique()
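For reference, a small sketch (with a much smaller tile count than the benchmark) showing that tiling the NaN-plus-range pattern keeps a float Series whose nunique skips the NaNs:

    import numpy as np
    from pandas import Series

    values = 100 * [np.nan] + list(range(100))
    ser = Series(np.tile(values, 3), dtype=float)  # 3 tiles here; the benchmark uses 10000
    assert ser.isna().sum() == 300
    assert ser.nunique() == 100  # nunique drops NaN by default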
18 changes: 10 additions & 8 deletions asv_bench/benchmarks/gil.py
@@ -272,18 +272,20 @@ class ParallelReadCSV(BaseIO):
def setup(self, dtype):
rows = 10000
cols = 50
data = {
"float": DataFrame(np.random.randn(rows, cols)),
"datetime": DataFrame(
if dtype == "float":
df = DataFrame(np.random.randn(rows, cols))
elif dtype == "datetime":
df = DataFrame(
np.random.randn(rows, cols), index=date_range("1/1/2000", periods=rows)
),
"object": DataFrame(
)
elif dtype == "object":
df = DataFrame(
"foo", index=range(rows), columns=["object%03d" for _ in range(5)]
),
}
)
else:
raise NotImplementedError

self.fname = f"__test_{dtype}__.csv"
df = data[dtype]
df.to_csv(self.fname)
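A minimal sketch of the float branch's round trip (the sizes and file name here are illustrative, not the benchmark's):

    import os

    import numpy as np
    from pandas import DataFrame, read_csv

    df = DataFrame(np.random.randn(5, 3))   # the benchmark writes 10000 x 50
    fname = "__example_float__.csv"         # hypothetical throwaway file name
    df.to_csv(fname)
    assert read_csv(fname, index_col=0).shape == df.shape
    os.remove(fname)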

@test_parallel(num_threads=2)
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/groupby.py
@@ -713,7 +713,7 @@ def setup(self, dtype, tie_method):
if dtype == "datetime64":
data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype)
else:
data = np.array([1] * N, dtype=dtype)
data = np.ones(N, dtype=dtype)
self.df = DataFrame({"values": data, "key": ["foo"] * N})
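np.ones(N, dtype=dtype) builds the same constant column as the old np.array([1] * N, dtype=dtype) without materializing a Python list; a quick check with illustrative values:

    import numpy as np

    N = 4  # illustrative
    for dtype in ["int64", "float64"]:
        assert np.array_equal(np.ones(N, dtype=dtype), np.array([1] * N, dtype=dtype))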

def time_rank_ties(self, dtype, tie_method):
4 changes: 1 addition & 3 deletions asv_bench/benchmarks/index_object.py
@@ -161,9 +161,7 @@ def setup(self, dtype):
self.sorted = self.idx.sort_values()
half = N // 2
self.non_unique = self.idx[:half].append(self.idx[:half])
self.non_unique_sorted = (
self.sorted[:half].append(self.sorted[:half]).sort_values()
)
self.non_unique_sorted = self.sorted[:half].repeat(2)
self.key = self.sorted[N // 4]
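Repeating a sorted Index element-wise is already sorted, so the extra append/sort_values round trip can be dropped; a small check with illustrative values:

    import pandas as pd

    srt = pd.Index([1, 3, 5, 7])  # illustrative stand-in for self.sorted[:half]
    old = srt.append(srt).sort_values()
    new = srt.repeat(2)
    assert old.equals(new)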

def time_boolean_array(self, dtype):
27 changes: 12 additions & 15 deletions asv_bench/benchmarks/indexing_engines.py
@@ -71,22 +71,20 @@ def setup(self, engine_and_dtype, index_type, unique, N):
if unique:
arr = np.arange(N * 3, dtype=dtype)
else:
values = list([1] * N + [2] * N + [3] * N)
arr = np.array(values, dtype=dtype)
arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
elif index_type == "monotonic_decr":
if unique:
arr = np.arange(N * 3, dtype=dtype)[::-1]
else:
values = list([1] * N + [2] * N + [3] * N)
arr = np.array(values, dtype=dtype)[::-1]
arr = np.array([3, 2, 1], dtype=dtype).repeat(N)
else:
assert index_type == "non_monotonic"
if unique:
arr = np.empty(N * 3, dtype=dtype)
arr[:N] = np.arange(N * 2, N * 3, dtype=dtype)
arr[N:] = np.arange(N * 2, dtype=dtype)
else:
arr = np.array([1, 2, 3] * N, dtype=dtype)
arr = np.array([1, 2, 3], dtype=dtype).repeat(N)

self.data = engine(arr)
# code below avoids populating the mapping etc. while timing.
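.repeat(N) builds the same monotonic blocks as the old concatenated-list construction; a quick check with an illustrative N and dtype:

    import numpy as np

    N = 4  # illustrative; the benchmark parametrizes N
    dtype = "int64"
    assert np.array_equal(
        np.array([1, 2, 3], dtype=dtype).repeat(N),
        np.array([1] * N + [2] * N + [3] * N, dtype=dtype),
    )
    assert np.array_equal(
        np.array([3, 2, 1], dtype=dtype).repeat(N),
        np.array([1] * N + [2] * N + [3] * N, dtype=dtype)[::-1],
    )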
@@ -115,30 +113,29 @@ class MaskedNumericEngineIndexing:

def setup(self, engine_and_dtype, index_type, unique, N):
engine, dtype = engine_and_dtype
dtype = dtype.lower()

if index_type == "monotonic_incr":
if unique:
arr = np.arange(N * 3, dtype=dtype.lower())
arr = np.arange(N * 3, dtype=dtype)
else:
values = list([1] * N + [2] * N + [3] * N)
arr = np.array(values, dtype=dtype.lower())
arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
mask = np.zeros(N * 3, dtype=np.bool_)
elif index_type == "monotonic_decr":
if unique:
arr = np.arange(N * 3, dtype=dtype.lower())[::-1]
arr = np.arange(N * 3, dtype=dtype)[::-1]
else:
values = list([1] * N + [2] * N + [3] * N)
arr = np.array(values, dtype=dtype.lower())[::-1]
arr = np.array([3, 2, 1], dtype=dtype).repeat(N)
mask = np.zeros(N * 3, dtype=np.bool_)
else:
assert index_type == "non_monotonic"
if unique:
arr = np.zeros(N * 3, dtype=dtype.lower())
arr[:N] = np.arange(N * 2, N * 3, dtype=dtype.lower())
arr[N:] = np.arange(N * 2, dtype=dtype.lower())
arr = np.zeros(N * 3, dtype=dtype)
arr[:N] = np.arange(N * 2, N * 3, dtype=dtype)
arr[N:] = np.arange(N * 2, dtype=dtype)

else:
arr = np.array([1, 2, 3] * N, dtype=dtype.lower())
arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
mask = np.zeros(N * 3, dtype=np.bool_)
mask[-1] = True
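A small sketch of why dtype.lower() is hoisted to the top of setup: the masked engines are parametrized with nullable-style dtype names (for example "Int64", an assumption here), while the backing numpy array needs the lowercase numpy name:

    import numpy as np

    dtype = "Int64"  # illustrative masked-engine parameter
    arr = np.arange(6, dtype=dtype.lower())
    mask = np.zeros(6, dtype=np.bool_)
    assert arr.dtype == np.dtype("int64")
    assert not mask.any()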

2 changes: 1 addition & 1 deletion asv_bench/benchmarks/series_methods.py
@@ -317,7 +317,7 @@ def setup(self, func, N, dtype):
if func == "argmax" and dtype in {"Int64", "boolean"}:
# Skip argmax for nullable int since this doesn't work yet (GH-24382)
raise NotImplementedError
self.s = Series([1] * N, dtype=dtype)
self.s = Series(np.ones(N), dtype=dtype)
self.func = getattr(self.s, func)
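Series(np.ones(N), dtype=dtype) builds the same all-ones Series as the old list-based constructor, including for the nullable Int64 dtype used by this benchmark; a quick check with an illustrative N:

    import numpy as np
    from pandas import Series

    N = 5  # illustrative; the benchmark parametrizes N
    for dtype in ["int64", "float64", "Int64"]:
        assert Series(np.ones(N), dtype=dtype).equals(Series([1] * N, dtype=dtype))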

def time_func(self, func, N, dtype):