TST: Parameterize & make tests more performant (#55830)

mroeschke · web-flow · commit 8c52003df4e0 · 2023-11-06T17:16:58.000-08:00
* Use _SIZE_CUTOFF in test_indexer_caching

* Use _SIZE_CUTOFF in test_loc_getitem_large_series

* Parameterize test_align_fill_method

* Mark numba engine as single_cpu

* Reduce data size for test_concat_copies

* Use _SIZE_CUTOFF in test_large_mi_contains

* Parameterize test_empty_field_eof_mem_access_bug (split out of test_empty_field_eof)

* Parameterize test_round_trip_current

* Remove str call

* Parameterize test_align_fill_method over limit
diff --git a/pandas/tests/apply/conftest.py b/pandas/tests/apply/conftest.py
@@ -18,7 +18,7 @@ def int_frame_const_col():
     return df
 
 
-@pytest.fixture(params=["python", "numba"])
+@pytest.fixture(params=["python", pytest.param("numba", marks=pytest.mark.single_cpu)])
 def engine(request):
     if request.param == "numba":
         pytest.importorskip("numba")
diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py
@@ -392,27 +392,57 @@ def test_missing_axis_specification_exception(self):
         with pytest.raises(ValueError, match=r"axis=0 or 1"):
             df.align(series)
 
-    def _check_align(self, a, b, axis, fill_axis, how, method, limit=None):
+    @pytest.mark.parametrize("method", ["pad", "bfill"])
+    @pytest.mark.parametrize("axis", [0, 1, None])
+    @pytest.mark.parametrize("fill_axis", [0, 1])
+    @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
+    @pytest.mark.parametrize(
+        "left_slice",
+        [
+            [slice(4), slice(10)],
+            [slice(0), slice(0)],
+        ],
+    )
+    @pytest.mark.parametrize(
+        "right_slice",
+        [
+            [slice(2, None), slice(6, None)],
+            [slice(0), slice(0)],
+        ],
+    )
+    @pytest.mark.parametrize("limit", [1, None])
+    def test_align_fill_method(
+        self, how, method, axis, fill_axis, float_frame, left_slice, right_slice, limit
+    ):
+        frame = float_frame
+        left = frame.iloc[left_slice[0], left_slice[1]]
+        right = frame.iloc[right_slice[0], right_slice[1]]
+
         msg = (
             "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
             "are deprecated"
         )
 
         with tm.assert_produces_warning(FutureWarning, match=msg):
-            aa, ab = a.align(
-                b, axis=axis, join=how, method=method, limit=limit, fill_axis=fill_axis
+            aa, ab = left.align(
+                right,
+                axis=axis,
+                join=how,
+                method=method,
+                limit=limit,
+                fill_axis=fill_axis,
             )
 
         join_index, join_columns = None, None
 
-        ea, eb = a, b
+        ea, eb = left, right
         if axis is None or axis == 0:
-            join_index = a.index.join(b.index, how=how)
+            join_index = left.index.join(right.index, how=how)
             ea = ea.reindex(index=join_index)
             eb = eb.reindex(index=join_index)
 
         if axis is None or axis == 1:
-            join_columns = a.columns.join(b.columns, how=how)
+            join_columns = left.columns.join(right.columns, how=how)
             ea = ea.reindex(columns=join_columns)
             eb = eb.reindex(columns=join_columns)
 
@@ -424,42 +454,6 @@ def _check_align(self, a, b, axis, fill_axis, how, method, limit=None):
         tm.assert_frame_equal(aa, ea)
         tm.assert_frame_equal(ab, eb)
 
-    @pytest.mark.parametrize("meth", ["pad", "bfill"])
-    @pytest.mark.parametrize("ax", [0, 1, None])
-    @pytest.mark.parametrize("fax", [0, 1])
-    @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
-    def test_align_fill_method(self, how, meth, ax, fax, float_frame):
-        df = float_frame
-        self._check_align_fill(df, how, meth, ax, fax)
-
-    def _check_align_fill(self, frame, kind, meth, ax, fax):
-        left = frame.iloc[0:4, :10]
-        right = frame.iloc[2:, 6:]
-        empty = frame.iloc[:0, :0]
-
-        self._check_align(left, right, axis=ax, fill_axis=fax, how=kind, method=meth)
-        self._check_align(
-            left, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1
-        )
-
-        # empty left
-        self._check_align(empty, right, axis=ax, fill_axis=fax, how=kind, method=meth)
-        self._check_align(
-            empty, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1
-        )
-
-        # empty right
-        self._check_align(left, empty, axis=ax, fill_axis=fax, how=kind, method=meth)
-        self._check_align(
-            left, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1
-        )
-
-        # both empty
-        self._check_align(empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth)
-        self._check_align(
-            empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1
-        )
-
     def test_align_series_check_copy(self):
         # GH#
         df = DataFrame({0: [1, 2]})
diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pytest
 
+from pandas._libs import index as libindex
 from pandas.errors import (
     InvalidIndexError,
     PerformanceWarning,
@@ -843,11 +844,12 @@ def test_contains_td64_level(self):
         assert "element_not_exit" not in idx
         assert "0 day 09:30:00" in idx
 
-    @pytest.mark.slow
-    def test_large_mi_contains(self):
+    def test_large_mi_contains(self, monkeypatch):
         # GH#10645
-        result = MultiIndex.from_arrays([range(10**6), range(10**6)])
-        assert (10**6, 0) not in result
+        with monkeypatch.context():
+            monkeypatch.setattr(libindex, "_SIZE_CUTOFF", 10)
+            result = MultiIndex.from_arrays([range(10), range(10)])
+            assert (10, 0) not in result
 
 
 def test_timestamp_multiindex_indexer():
diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pytest
 
+from pandas._libs import index as libindex
 from pandas.compat import IS64
 
 import pandas as pd
@@ -72,15 +73,18 @@ def test_getitem_non_matching(self, series_with_interval_index, indexer_sl):
         with pytest.raises(KeyError, match=r"\[-1\] not in index"):
             indexer_sl(ser)[[-1, 3]]
 
-    @pytest.mark.slow
-    def test_loc_getitem_large_series(self):
-        ser = Series(
-            np.arange(1000000), index=IntervalIndex.from_breaks(np.arange(1000001))
-        )
-
-        result1 = ser.loc[:80000]
-        result2 = ser.loc[0:80000]
-        result3 = ser.loc[0:80000:1]
+    def test_loc_getitem_large_series(self, monkeypatch):
+        size_cutoff = 20
+        with monkeypatch.context():
+            monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
+            ser = Series(
+                np.arange(size_cutoff),
+                index=IntervalIndex.from_breaks(np.arange(size_cutoff + 1)),
+            )
+
+            result1 = ser.loc[:8]
+            result2 = ser.loc[0:8]
+            result3 = ser.loc[0:8:1]
         tm.assert_series_equal(result1, result2)
         tm.assert_series_equal(result1, result3)
 
diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pytest
 
+from pandas._libs import index as libindex
 from pandas.errors import SettingWithCopyError
 import pandas.util._test_decorators as td
 
@@ -69,15 +70,16 @@ def test_cache_updating(using_copy_on_write):
     assert result == 2
 
 
-@pytest.mark.slow
-def test_indexer_caching():
+def test_indexer_caching(monkeypatch):
     # GH5727
     # make sure that indexers are in the _internal_names_set
-    n = 1000001
-    index = MultiIndex.from_arrays([np.arange(n), np.arange(n)])
-    ser = Series(np.zeros(n), index=index)
+    size_cutoff = 20
+    with monkeypatch.context():
+        monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
+        index = MultiIndex.from_arrays([np.arange(size_cutoff), np.arange(size_cutoff)])
+        s = Series(np.zeros(size_cutoff), index=index)
 
-    # setitem
-    expected = Series(np.ones(n), index=index)
-    ser[ser == 0] = 1
-    tm.assert_series_equal(ser, expected)
+        # setitem
+        s[s == 0] = 1
+    expected = Series(np.ones(size_cutoff), index=index)
+    tm.assert_series_equal(s, expected)
diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py
@@ -124,7 +124,7 @@ def _create_sp_frame():
     return DataFrame(data, index=dates).apply(SparseArray)
 
 
-def create_data():
+def create_pickle_data():
     """create the pickle data"""
     data = {
         "A": [0.0, 1.0, 2.0, 3.0, np.nan],
@@ -282,12 +282,6 @@ def create_data():
     }
 
 
-def create_pickle_data():
-    data = create_data()
-
-    return data
-
-
 def platform_name():
     return "_".join(
         [
diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py
@@ -298,6 +298,8 @@ def test_empty_field_eof(self):
         }
         assert_array_dicts_equal(result, expected)
 
+    @pytest.mark.parametrize("repeat", range(10))
+    def test_empty_field_eof_mem_access_bug(self, repeat):
         # GH5664
         a = DataFrame([["b"], [np.nan]], columns=["a"], index=["a", "c"])
         b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1])
@@ -312,21 +314,20 @@ def test_empty_field_eof(self):
             index=[0, 5, 7, 12],
         )
 
-        for _ in range(100):
-            df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c")
-            tm.assert_frame_equal(df, a)
+        df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c")
+        tm.assert_frame_equal(df, a)
 
-            df = read_csv(
-                StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c"
-            )
-            tm.assert_frame_equal(df, b)
+        df = read_csv(
+            StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c"
+        )
+        tm.assert_frame_equal(df, b)
 
-            df = read_csv(
-                StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"),
-                names=list("abcd"),
-                engine="c",
-            )
-            tm.assert_frame_equal(df, c)
+        df = read_csv(
+            StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"),
+            names=list("abcd"),
+            engine="c",
+        )
+        tm.assert_frame_equal(df, c)
 
     def test_empty_csv_input(self):
         # GH14867
diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py
@@ -10,6 +10,8 @@
 
 3. Move the created pickle to "data/legacy_pickle/<version>" directory.
 """
+from __future__ import annotations
+
 from array import array
 import bz2
 import datetime
@@ -22,6 +24,7 @@
 import pickle
 import shutil
 import tarfile
+from typing import Any
 import uuid
 import zipfile
 
@@ -52,12 +55,6 @@
 )
 
 
-@pytest.fixture
-def current_pickle_data():
-    # our current version pickle data
-    return create_pickle_data()
-
-
 # ---------------------
 # comparison functions
 # ---------------------
@@ -173,6 +170,15 @@ def python_unpickler(path):
         return pickle.load(fh)
 
 
+def flatten(data: dict) -> list[tuple[str, Any]]:
+    """Flatten create_pickle_data"""
+    return [
+        (typ, example)
+        for typ, examples in data.items()
+        for example in examples.values()
+    ]
+
+
 @pytest.mark.parametrize(
     "pickle_writer",
     [
@@ -190,29 +196,27 @@ def python_unpickler(path):
     ],
 )
 @pytest.mark.parametrize("writer", [pd.to_pickle, python_pickler])
-def test_round_trip_current(current_pickle_data, pickle_writer, writer):
-    data = current_pickle_data
-    for typ, dv in data.items():
-        for dt, expected in dv.items():
-            with tm.ensure_clean() as path:
-                # test writing with each pickler
-                pickle_writer(expected, path)
-
-                # test reading with each unpickler
-                result = pd.read_pickle(path)
-                compare_element(result, expected, typ)
-
-                result = python_unpickler(path)
-                compare_element(result, expected, typ)
-
-                # and the same for file objects (GH 35679)
-                with open(path, mode="wb") as handle:
-                    writer(expected, path)
-                    handle.seek(0)  # shouldn't close file handle
-                with open(path, mode="rb") as handle:
-                    result = pd.read_pickle(handle)
-                    handle.seek(0)  # shouldn't close file handle
-                compare_element(result, expected, typ)
+@pytest.mark.parametrize("typ, expected", flatten(create_pickle_data()))
+def test_round_trip_current(typ, expected, pickle_writer, writer):
+    with tm.ensure_clean() as path:
+        # test writing with each pickler
+        pickle_writer(expected, path)
+
+        # test reading with each unpickler
+        result = pd.read_pickle(path)
+        compare_element(result, expected, typ)
+
+        result = python_unpickler(path)
+        compare_element(result, expected, typ)
+
+        # and the same for file objects (GH 35679)
+        with open(path, mode="wb") as handle:
+            writer(expected, path)
+            handle.seek(0)  # shouldn't close file handle
+        with open(path, mode="rb") as handle:
+            result = pd.read_pickle(handle)
+            handle.seek(0)  # shouldn't close file handle
+        compare_element(result, expected, typ)
 
 
 def test_pickle_path_pathlib():
diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py
@@ -197,7 +197,7 @@ def test_concat_duplicates_in_index_with_keys(self):
     @pytest.mark.parametrize("axis", [0, 1])
     def test_concat_copies(self, axis, order, ignore_index, using_copy_on_write):
         # based on asv ConcatDataFrames
-        df = DataFrame(np.zeros((10000, 200), dtype=np.float32, order=order))
+        df = DataFrame(np.zeros((10, 5), dtype=np.float32, order=order))
 
         res = concat([df] * 5, axis=axis, ignore_index=ignore_index, copy=True)