TST: Refactor some slow tests (pandas-dev#53784)

mroeschke · im-vinicius · commit c3187528f25f · 2023-07-08T12:28:09.000+02:00
* Clean up single-use method

* Clean plotting test

* Improve test_series_groupby_nunique

* Address more slow tests

* Undo changes
diff --git a/pandas/tests/frame/methods/test_duplicated.py b/pandas/tests/frame/methods/test_duplicated.py
@@ -1,4 +1,5 @@
 import re
+import sys
 
 import numpy as np
 import pytest
@@ -21,14 +22,17 @@ def test_duplicated_with_misspelled_column_name(subset):
         df.duplicated(subset)
 
 
-@pytest.mark.slow
-def test_duplicated_do_not_fail_on_wide_dataframes():
+def test_duplicated_implemented_no_recursion():
     # gh-21524
-    # Given the wide dataframe with a lot of columns
-    # with different (important!) values
-    data = {f"col_{i:02d}": np.random.randint(0, 1000, 30000) for i in range(100)}
-    df = DataFrame(data).T
-    result = df.duplicated()
+    # Ensure duplicated isn't implemented using recursion that
+    # can fail on wide frames
+    df = DataFrame(np.random.randint(0, 1000, (10, 1000)))
+    rec_limit = sys.getrecursionlimit()
+    try:
+        sys.setrecursionlimit(100)
+        result = df.duplicated()
+    finally:
+        sys.setrecursionlimit(rec_limit)
 
     # Then duplicates produce the bool Series as a result and don't fail during
     # calculation. Actual values doesn't matter here, though usually it's all
diff --git a/pandas/tests/groupby/test_nunique.py b/pandas/tests/groupby/test_nunique.py
@@ -17,51 +17,43 @@
 
 
 @pytest.mark.slow
-@pytest.mark.parametrize("n", 10 ** np.arange(2, 6))
-@pytest.mark.parametrize("m", [10, 100, 1000])
 @pytest.mark.parametrize("sort", [False, True])
 @pytest.mark.parametrize("dropna", [False, True])
-def test_series_groupby_nunique(n, m, sort, dropna):
-    def check_nunique(df, keys, as_index=True):
-        original_df = df.copy()
-        gr = df.groupby(keys, as_index=as_index, sort=sort)
-        left = gr["julie"].nunique(dropna=dropna)
-
-        gr = df.groupby(keys, as_index=as_index, sort=sort)
-        right = gr["julie"].apply(Series.nunique, dropna=dropna)
-        if not as_index:
-            right = right.reset_index(drop=True)
-
-        if as_index:
-            tm.assert_series_equal(left, right, check_names=False)
-        else:
-            tm.assert_frame_equal(left, right, check_names=False)
-        tm.assert_frame_equal(df, original_df)
-
+@pytest.mark.parametrize("as_index", [True, False])
+@pytest.mark.parametrize("with_nan", [True, False])
+@pytest.mark.parametrize("keys", [["joe"], ["joe", "jim"]])
+def test_series_groupby_nunique(sort, dropna, as_index, with_nan, keys):
+    n = 100
+    m = 10
     days = date_range("2015-08-23", periods=10)
-
-    frame = DataFrame(
+    df = DataFrame(
         {
             "jim": np.random.choice(list(ascii_lowercase), n),
             "joe": np.random.choice(days, n),
             "julie": np.random.randint(0, m, n),
         }
     )
-
-    check_nunique(frame, ["jim"])
-    check_nunique(frame, ["jim", "joe"])
-
-    frame = frame.astype({"julie": float})  # Explicit cast to avoid implicit cast below
-    frame.loc[1::17, "jim"] = None
-    frame.loc[3::37, "joe"] = None
-    frame.loc[7::19, "julie"] = None
-    frame.loc[8::19, "julie"] = None
-    frame.loc[9::19, "julie"] = None
-
-    check_nunique(frame, ["jim"])
-    check_nunique(frame, ["jim", "joe"])
-    check_nunique(frame, ["jim"], as_index=False)
-    check_nunique(frame, ["jim", "joe"], as_index=False)
+    if with_nan:
+        df = df.astype({"julie": float})  # Explicit cast to avoid implicit cast below
+        df.loc[1::17, "jim"] = None
+        df.loc[3::37, "joe"] = None
+        df.loc[7::19, "julie"] = None
+        df.loc[8::19, "julie"] = None
+        df.loc[9::19, "julie"] = None
+    original_df = df.copy()
+    gr = df.groupby(keys, as_index=as_index, sort=sort)
+    left = gr["julie"].nunique(dropna=dropna)
+
+    gr = df.groupby(keys, as_index=as_index, sort=sort)
+    right = gr["julie"].apply(Series.nunique, dropna=dropna)
+    if not as_index:
+        right = right.reset_index(drop=True)
+
+    if as_index:
+        tm.assert_series_equal(left, right, check_names=False)
+    else:
+        tm.assert_frame_equal(left, right, check_names=False)
+    tm.assert_frame_equal(df, original_df)
 
 
 def test_nunique():
diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py
@@ -11,7 +11,6 @@
     IS64,
     PYPY,
 )
-import pandas.util._test_decorators as td
 
 from pandas import (
     CategoricalIndex,
@@ -504,9 +503,10 @@ def test_memory_usage_empty_no_warning():
     tm.assert_series_equal(result, expected)
 
 
-@td.skip_if_no("numba")
+@pytest.mark.single_cpu
 def test_info_compute_numba():
     # GH#51922
+    pytest.importorskip("numba")
     df = DataFrame([[1, 2], [3, 4]])
 
     with option_context("compute.use_numba", True):
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -1831,9 +1831,10 @@ def test_encoding_latin1_118(self, datapath):
         # will block pytests skip mechanism from triggering (failing the test)
         # if the path is not present
         path = datapath("io", "data", "stata", "stata1_encoding_118.dta")
-        with tm.assert_produces_warning(UnicodeWarning) as w:
+        with tm.assert_produces_warning(UnicodeWarning, filter_level="once") as w:
             encoded = read_stata(path)
-            assert len(w) == 151
+            # with filter_level="always", produces 151 warnings which can be slow
+            assert len(w) == 1
             assert w[0].message.args[0] == msg
 
         expected = DataFrame([["Düsseldorf"]] * 151, columns=["kreis1849"])
diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py
@@ -1782,39 +1782,33 @@ def _check(axes):
             _check_visible(ax.get_xticklabels(minor=True), visible=True)
 
     @td.skip_if_no_scipy
-    def test_memory_leak(self):
+    @pytest.mark.parametrize("kind", plotting.PlotAccessor._all_kinds)
+    def test_memory_leak(self, kind):
         """Check that every plot type gets properly collected."""
-        results = {}
-        for kind in plotting.PlotAccessor._all_kinds:
-            args = {}
-            if kind in ["hexbin", "scatter", "pie"]:
-                df = DataFrame(
-                    {
-                        "A": np.random.uniform(size=20),
-                        "B": np.random.uniform(size=20),
-                        "C": np.arange(20) + np.random.uniform(size=20),
-                    }
-                )
-                args = {"x": "A", "y": "B"}
-            elif kind == "area":
-                df = tm.makeTimeDataFrame().abs()
-            else:
-                df = tm.makeTimeDataFrame()
-
-            # Use a weakref so we can see if the object gets collected without
-            # also preventing it from being collected
-            results[kind] = weakref.proxy(df.plot(kind=kind, **args))
+        args = {}
+        if kind in ["hexbin", "scatter", "pie"]:
+            df = DataFrame(
+                {
+                    "A": np.random.uniform(size=20),
+                    "B": np.random.uniform(size=20),
+                    "C": np.arange(20) + np.random.uniform(size=20),
+                }
+            )
+            args = {"x": "A", "y": "B"}
+        elif kind == "area":
+            df = tm.makeTimeDataFrame().abs()
+        else:
+            df = tm.makeTimeDataFrame()
+
+        # Use a weakref so we can see if the object gets collected without
+        # also preventing it from being collected
+        ref = weakref.ref(df.plot(kind=kind, **args))
 
         # have matplotlib delete all the figures
         tm.close()
         # force a garbage collection
         gc.collect()
-        msg = "weakly-referenced object no longer exists"
-        for result_value in results.values():
-            # check that every plot was collected
-            with pytest.raises(ReferenceError, match=msg):
-                # need to actually access something to get an error
-                result_value.lines
+        assert ref() is None
 
     def test_df_gridspec_patterns(self):
         # GH 10819
diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py
@@ -392,38 +392,51 @@ def test_grouped_box_return_type(self, hist_df):
             result, None, expected_keys=["height", "weight", "category"]
         )
 
+    @pytest.mark.slow
+    def test_grouped_box_return_type_groupby(self, hist_df):
+        df = hist_df
         # now for groupby
         result = df.groupby("gender").boxplot(return_type="dict")
         _check_box_return_type(result, "dict", expected_keys=["Male", "Female"])
 
-        columns2 = "X B C D A G Y N Q O".split()
-        df2 = DataFrame(np.random.randn(50, 10), columns=columns2)
-        categories2 = "A B C D E F G H I J".split()
-        df2["category"] = categories2 * 5
+    @pytest.mark.slow
+    @pytest.mark.parametrize("return_type", ["dict", "axes", "both"])
+    def test_grouped_box_return_type_arg(self, hist_df, return_type):
+        df = hist_df
 
-        for t in ["dict", "axes", "both"]:
-            returned = df.groupby("classroom").boxplot(return_type=t)
-            _check_box_return_type(returned, t, expected_keys=["A", "B", "C"])
+        returned = df.groupby("classroom").boxplot(return_type=return_type)
+        _check_box_return_type(returned, return_type, expected_keys=["A", "B", "C"])
 
-            returned = df.boxplot(by="classroom", return_type=t)
-            _check_box_return_type(
-                returned, t, expected_keys=["height", "weight", "category"]
-            )
+        returned = df.boxplot(by="classroom", return_type=return_type)
+        _check_box_return_type(
+            returned, return_type, expected_keys=["height", "weight", "category"]
+        )
 
-            returned = df2.groupby("category").boxplot(return_type=t)
-            _check_box_return_type(returned, t, expected_keys=categories2)
+    @pytest.mark.slow
+    @pytest.mark.parametrize("return_type", ["dict", "axes", "both"])
+    def test_grouped_box_return_type_arg_duplcate_cats(self, return_type):
+        columns2 = "X B C D A".split()
+        df2 = DataFrame(np.random.randn(6, 5), columns=columns2)
+        categories2 = "A B".split()
+        df2["category"] = categories2 * 3
+
+        returned = df2.groupby("category").boxplot(return_type=return_type)
+        _check_box_return_type(returned, return_type, expected_keys=categories2)
 
-            returned = df2.boxplot(by="category", return_type=t)
-            _check_box_return_type(returned, t, expected_keys=columns2)
+        returned = df2.boxplot(by="category", return_type=return_type)
+        _check_box_return_type(returned, return_type, expected_keys=columns2)
 
     @pytest.mark.slow
-    def test_grouped_box_layout(self, hist_df):
+    def test_grouped_box_layout_too_small(self, hist_df):
         df = hist_df
 
         msg = "Layout of 1x1 must be larger than required size 2"
         with pytest.raises(ValueError, match=msg):
             df.boxplot(column=["weight", "height"], by=df.gender, layout=(1, 1))
 
+    @pytest.mark.slow
+    def test_grouped_box_layout_needs_by(self, hist_df):
+        df = hist_df
         msg = "The 'layout' keyword is not supported when 'by' is None"
         with pytest.raises(ValueError, match=msg):
             df.boxplot(
@@ -432,79 +445,84 @@ def test_grouped_box_layout(self, hist_df):
                 return_type="dict",
             )
 
+    @pytest.mark.slow
+    def test_grouped_box_layout_positive_layout(self, hist_df):
+        df = hist_df
         msg = "At least one dimension of layout must be positive"
         with pytest.raises(ValueError, match=msg):
             df.boxplot(column=["weight", "height"], by=df.gender, layout=(-1, -1))
 
-        # _check_plot_works adds an ax so catch warning. see GH #13188
-        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
-            _check_plot_works(
-                df.groupby("gender").boxplot, column="height", return_type="dict"
-            )
-        _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=2, layout=(1, 2))
-
-        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
-            _check_plot_works(
-                df.groupby("category").boxplot, column="height", return_type="dict"
-            )
-        _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=4, layout=(2, 2))
-
-        # GH 6769
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "gb_key, axes_num, rows",
+        [["gender", 2, 1], ["category", 4, 2], ["classroom", 3, 2]],
+    )
+    def test_grouped_box_layout_positive_layout_axes(
+        self, hist_df, gb_key, axes_num, rows
+    ):
+        df = hist_df
+        # _check_plot_works adds an ax so catch warning. see GH #13188 GH 6769
         with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
             _check_plot_works(
-                df.groupby("classroom").boxplot, column="height", return_type="dict"
+                df.groupby(gb_key).boxplot, column="height", return_type="dict"
             )
-        _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(2, 2))
+        _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=axes_num, layout=(rows, 2))
 
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "col, visible", [["height", False], ["weight", True], ["category", True]]
+    )
+    def test_grouped_box_layout_visible(self, hist_df, col, visible):
+        df = hist_df
         # GH 5897
         axes = df.boxplot(
             column=["height", "weight", "category"], by="gender", return_type="axes"
         )
         _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(2, 2))
-        for ax in [axes["height"]]:
-            _check_visible(ax.get_xticklabels(), visible=False)
-            _check_visible([ax.xaxis.get_label()], visible=False)
-        for ax in [axes["weight"], axes["category"]]:
-            _check_visible(ax.get_xticklabels())
-            _check_visible([ax.xaxis.get_label()])
+        ax = axes[col]
+        _check_visible(ax.get_xticklabels(), visible=visible)
+        _check_visible([ax.xaxis.get_label()], visible=visible)
 
+    @pytest.mark.slow
+    def test_grouped_box_layout_shape(self, hist_df):
+        df = hist_df
         df.groupby("classroom").boxplot(
             column=["height", "weight", "category"], return_type="dict"
         )
         _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(2, 2))
 
+    @pytest.mark.slow
+    @pytest.mark.parametrize("cols", [2, -1])
+    def test_grouped_box_layout_works(self, hist_df, cols):
+        df = hist_df
         with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
             _check_plot_works(
                 df.groupby("category").boxplot,
                 column="height",
-                layout=(3, 2),
-                return_type="dict",
-            )
-        _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=4, layout=(3, 2))
-        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
-            _check_plot_works(
-                df.groupby("category").boxplot,
-                column="height",
-                layout=(3, -1),
+                layout=(3, cols),
                 return_type="dict",
             )
         _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=4, layout=(3, 2))
 
-        df.boxplot(column=["height", "weight", "category"], by="gender", layout=(4, 1))
-        _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(4, 1))
-
-        df.boxplot(column=["height", "weight", "category"], by="gender", layout=(-1, 1))
-        _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(3, 1))
-
-        df.groupby("classroom").boxplot(
-            column=["height", "weight", "category"], layout=(1, 4), return_type="dict"
+    @pytest.mark.slow
+    @pytest.mark.parametrize("rows, res", [[4, 4], [-1, 3]])
+    def test_grouped_box_layout_axes_shape_rows(self, hist_df, rows, res):
+        df = hist_df
+        df.boxplot(
+            column=["height", "weight", "category"], by="gender", layout=(rows, 1)
         )
-        _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(1, 4))
+        _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(res, 1))
 
+    @pytest.mark.slow
+    @pytest.mark.parametrize("cols, res", [[4, 4], [-1, 3]])
+    def test_grouped_box_layout_axes_shape_cols_groupby(self, hist_df, cols, res):
+        df = hist_df
         df.groupby("classroom").boxplot(
-            column=["height", "weight", "category"], layout=(1, -1), return_type="dict"
+            column=["height", "weight", "category"],
+            layout=(1, cols),
+            return_type="dict",
         )
-        _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(1, 3))
+        _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(1, res))
 
     @pytest.mark.slow
     def test_grouped_box_multiple_axes(self, hist_df):
diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py