ASV: clean/simplify setup methods (pandas-dev#55923)

lukemanley · web-flow · commit c4f0e6fce4b9 · 2023-11-13T17:08:15.000-08:00
* clean setup methods

* fix dtype
diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
@@ -1,7 +1,6 @@
 from importlib import import_module
 
 import numpy as np
-import pyarrow as pa
 
 import pandas as pd
 
@@ -20,9 +19,9 @@ class Factorize:
         [True, False],
         [True, False],
         [
-            "int",
-            "uint",
-            "float",
+            "int64",
+            "uint64",
+            "float64",
             "object",
             "object_str",
             "datetime64[ns]",
@@ -36,28 +35,24 @@ class Factorize:
 
     def setup(self, unique, sort, dtype):
         N = 10**5
-        string_index = tm.makeStringIndex(N)
-        string_arrow = None
-        if dtype == "string[pyarrow]":
-            try:
-                string_arrow = pd.array(string_index, dtype="string[pyarrow]")
-            except ImportError:
-                raise NotImplementedError
-
-        data = {
-            "int": pd.Index(np.arange(N), dtype="int64"),
-            "uint": pd.Index(np.arange(N), dtype="uint64"),
-            "float": pd.Index(np.random.randn(N), dtype="float64"),
-            "object_str": string_index,
-            "object": pd.Index(np.arange(N), dtype="object"),
-            "datetime64[ns]": pd.date_range("2011-01-01", freq="h", periods=N),
-            "datetime64[ns, tz]": pd.date_range(
-                "2011-01-01", freq="h", periods=N, tz="Asia/Tokyo"
-            ),
-            "Int64": pd.array(np.arange(N), dtype="Int64"),
-            "boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"),
-            "string[pyarrow]": string_arrow,
-        }[dtype]
+
+        if dtype in ["int64", "uint64", "Int64", "object"]:
+            data = pd.Index(np.arange(N), dtype=dtype)
+        elif dtype == "float64":
+            data = pd.Index(np.random.randn(N), dtype=dtype)
+        elif dtype == "boolean":
+            data = pd.array(np.random.randint(0, 2, N), dtype=dtype)
+        elif dtype == "datetime64[ns]":
+            data = pd.date_range("2011-01-01", freq="h", periods=N)
+        elif dtype == "datetime64[ns, tz]":
+            data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo")
+        elif dtype == "object_str":
+            data = tm.makeStringIndex(N)
+        elif dtype == "string[pyarrow]":
+            data = pd.array(tm.makeStringIndex(N), dtype="string[pyarrow]")
+        else:
+            raise NotImplementedError
+
         if not unique:
             data = data.repeat(5)
         self.data = data
@@ -74,9 +69,9 @@ class Duplicated:
         [True, False],
         ["first", "last", False],
         [
-            "int",
-            "uint",
-            "float",
+            "int64",
+            "uint64",
+            "float64",
             "string",
             "datetime64[ns]",
             "datetime64[ns, tz]",
@@ -88,22 +83,20 @@ class Duplicated:
 
     def setup(self, unique, keep, dtype):
         N = 10**5
-        data = {
-            "int": pd.Index(np.arange(N), dtype="int64"),
-            "uint": pd.Index(np.arange(N), dtype="uint64"),
-            "float": pd.Index(np.random.randn(N), dtype="float64"),
-            "string": tm.makeStringIndex(N),
-            "datetime64[ns]": pd.date_range("2011-01-01", freq="h", periods=N),
-            "datetime64[ns, tz]": pd.date_range(
-                "2011-01-01", freq="h", periods=N, tz="Asia/Tokyo"
-            ),
-            "timestamp[ms][pyarrow]": pd.Index(
-                np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms"))
-            ),
-            "duration[s][pyarrow]": pd.Index(
-                np.arange(N), dtype=pd.ArrowDtype(pa.duration("s"))
-            ),
-        }[dtype]
+        if dtype in ["int64", "uint64"]:
+            data = pd.Index(np.arange(N), dtype=dtype)
+        elif dtype == "float64":
+            data = pd.Index(np.random.randn(N), dtype="float64")
+        elif dtype == "string":
+            data = tm.makeStringIndex(N)
+        elif dtype == "datetime64[ns]":
+            data = pd.date_range("2011-01-01", freq="h", periods=N)
+        elif dtype == "datetime64[ns, tz]":
+            data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo")
+        elif dtype in ["timestamp[ms][pyarrow]", "duration[s][pyarrow]"]:
+            data = pd.Index(np.arange(N), dtype=dtype)
+        else:
+            raise NotImplementedError
         if not unique:
             data = data.repeat(5)
         self.idx = data
@@ -181,21 +174,22 @@ class Quantile:
     params = [
         [0, 0.5, 1],
         ["linear", "nearest", "lower", "higher", "midpoint"],
-        ["float", "int", "uint"],
+        ["float64", "int64", "uint64"],
     ]
     param_names = ["quantile", "interpolation", "dtype"]
 
     def setup(self, quantile, interpolation, dtype):
         N = 10**5
-        data = {
-            "int": np.arange(N),
-            "uint": np.arange(N).astype(np.uint64),
-            "float": np.random.randn(N),
-        }
-        self.idx = pd.Series(data[dtype].repeat(5))
+        if dtype in ["int64", "uint64"]:
+            data = np.arange(N, dtype=dtype)
+        elif dtype == "float64":
+            data = np.random.randn(N)
+        else:
+            raise NotImplementedError
+        self.ser = pd.Series(data.repeat(5))
 
     def time_quantile(self, quantile, interpolation, dtype):
-        self.idx.quantile(quantile, interpolation=interpolation)
+        self.ser.quantile(quantile, interpolation=interpolation)
 
 
 class SortIntegerArray:
diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
@@ -31,9 +31,9 @@ def time_from_float_array(self):
 class IntegerArray:
     def setup(self):
         N = 250_000
-        self.values_integer = np.array([1, 0, 1, 0] * N)
-        self.data = np.array([1, 2, 3, 4] * N, dtype="int64")
-        self.mask = np.array([False, False, True, False] * N)
+        self.values_integer = np.tile(np.array([1, 0, 1, 0]), N)
+        self.data = np.tile(np.array([1, 2, 3, 4], dtype="int64"), N)
+        self.mask = np.tile(np.array([False, False, True, False]), N)
 
     def time_constructor(self):
         pd.arrays.IntegerArray(self.data, self.mask)
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
@@ -260,18 +260,16 @@ class CategoricalSlicing:
     def setup(self, index):
         N = 10**6
         categories = ["a", "b", "c"]
-        values = [0] * N + [1] * N + [2] * N
         if index == "monotonic_incr":
-            self.data = pd.Categorical.from_codes(values, categories=categories)
+            codes = np.repeat([0, 1, 2], N)
         elif index == "monotonic_decr":
-            self.data = pd.Categorical.from_codes(
-                list(reversed(values)), categories=categories
-            )
+            codes = np.repeat([2, 1, 0], N)
         elif index == "non_monotonic":
-            self.data = pd.Categorical.from_codes([0, 1, 2] * N, categories=categories)
+            codes = np.tile([0, 1, 2], N)
         else:
             raise ValueError(f"Invalid index param: {index}")
 
+        self.data = pd.Categorical.from_codes(codes, categories=categories)
         self.scalar = 10000
         self.list = list(range(10000))
         self.cat_scalar = "b"
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
@@ -640,7 +640,8 @@ def time_frame_nunique(self):
 
 class SeriesNuniqueWithNan:
     def setup(self):
-        self.ser = Series(100000 * (100 * [np.nan] + list(range(100)))).astype(float)
+        values = 100 * [np.nan] + list(range(100))
+        self.ser = Series(np.tile(values, 10000), dtype=float)
 
     def time_series_nunique_nan(self):
         self.ser.nunique()
diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py
@@ -272,18 +272,20 @@ class ParallelReadCSV(BaseIO):
     def setup(self, dtype):
         rows = 10000
         cols = 50
-        data = {
-            "float": DataFrame(np.random.randn(rows, cols)),
-            "datetime": DataFrame(
+        if dtype == "float":
+            df = DataFrame(np.random.randn(rows, cols))
+        elif dtype == "datetime":
+            df = DataFrame(
                 np.random.randn(rows, cols), index=date_range("1/1/2000", periods=rows)
-            ),
-            "object": DataFrame(
+            )
+        elif dtype == "object":
+            df = DataFrame(
                 "foo", index=range(rows), columns=["object%03d" for _ in range(5)]
-            ),
-        }
+            )
+        else:
+            raise NotImplementedError
 
         self.fname = f"__test_{dtype}__.csv"
-        df = data[dtype]
         df.to_csv(self.fname)
 
         @test_parallel(num_threads=2)
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -713,7 +713,7 @@ def setup(self, dtype, tie_method):
         if dtype == "datetime64":
             data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype)
         else:
-            data = np.array([1] * N, dtype=dtype)
+            data = np.ones(N, dtype=dtype)
         self.df = DataFrame({"values": data, "key": ["foo"] * N})
 
     def time_rank_ties(self, dtype, tie_method):
diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py
@@ -161,9 +161,7 @@ def setup(self, dtype):
         self.sorted = self.idx.sort_values()
         half = N // 2
         self.non_unique = self.idx[:half].append(self.idx[:half])
-        self.non_unique_sorted = (
-            self.sorted[:half].append(self.sorted[:half]).sort_values()
-        )
+        self.non_unique_sorted = self.sorted[:half].repeat(2)
         self.key = self.sorted[N // 4]
 
     def time_boolean_array(self, dtype):
diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py
@@ -71,22 +71,20 @@ def setup(self, engine_and_dtype, index_type, unique, N):
             if unique:
                 arr = np.arange(N * 3, dtype=dtype)
             else:
-                values = list([1] * N + [2] * N + [3] * N)
-                arr = np.array(values, dtype=dtype)
+                arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
         elif index_type == "monotonic_decr":
             if unique:
                 arr = np.arange(N * 3, dtype=dtype)[::-1]
             else:
-                values = list([1] * N + [2] * N + [3] * N)
-                arr = np.array(values, dtype=dtype)[::-1]
+                arr = np.array([3, 2, 1], dtype=dtype).repeat(N)
         else:
             assert index_type == "non_monotonic"
             if unique:
                 arr = np.empty(N * 3, dtype=dtype)
                 arr[:N] = np.arange(N * 2, N * 3, dtype=dtype)
                 arr[N:] = np.arange(N * 2, dtype=dtype)
             else:
-                arr = np.array([1, 2, 3] * N, dtype=dtype)
+                arr = np.tile(np.array([1, 2, 3], dtype=dtype), N)  # keep unsorted
 
         self.data = engine(arr)
         # code belows avoids populating the mapping etc. while timing.
@@ -115,30 +113,29 @@ class MaskedNumericEngineIndexing:
 
     def setup(self, engine_and_dtype, index_type, unique, N):
         engine, dtype = engine_and_dtype
+        dtype = dtype.lower()
 
         if index_type == "monotonic_incr":
             if unique:
-                arr = np.arange(N * 3, dtype=dtype.lower())
+                arr = np.arange(N * 3, dtype=dtype)
             else:
-                values = list([1] * N + [2] * N + [3] * N)
-                arr = np.array(values, dtype=dtype.lower())
+                arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
             mask = np.zeros(N * 3, dtype=np.bool_)
         elif index_type == "monotonic_decr":
             if unique:
-                arr = np.arange(N * 3, dtype=dtype.lower())[::-1]
+                arr = np.arange(N * 3, dtype=dtype)[::-1]
             else:
-                values = list([1] * N + [2] * N + [3] * N)
-                arr = np.array(values, dtype=dtype.lower())[::-1]
+                arr = np.array([3, 2, 1], dtype=dtype).repeat(N)
             mask = np.zeros(N * 3, dtype=np.bool_)
         else:
             assert index_type == "non_monotonic"
             if unique:
-                arr = np.zeros(N * 3, dtype=dtype.lower())
-                arr[:N] = np.arange(N * 2, N * 3, dtype=dtype.lower())
-                arr[N:] = np.arange(N * 2, dtype=dtype.lower())
+                arr = np.zeros(N * 3, dtype=dtype)
+                arr[:N] = np.arange(N * 2, N * 3, dtype=dtype)
+                arr[N:] = np.arange(N * 2, dtype=dtype)
 
             else:
-                arr = np.array([1, 2, 3] * N, dtype=dtype.lower())
+                arr = np.tile(np.array([1, 2, 3], dtype=dtype), N)  # keep unsorted
             mask = np.zeros(N * 3, dtype=np.bool_)
             mask[-1] = True
 
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
@@ -317,7 +317,7 @@ def setup(self, func, N, dtype):
         if func == "argmax" and dtype in {"Int64", "boolean"}:
             # Skip argmax for nullable int since this doesn't work yet (GH-24382)
             raise NotImplementedError
-        self.s = Series([1] * N, dtype=dtype)
+        self.s = Series(np.ones(N), dtype=dtype)
         self.func = getattr(self.s, func)
 
     def time_func(self, func, N, dtype):