Skip to content

Commit c318752

Browse files
mroeschke authored and im-vinicius committed
TST: Refactor some slow tests (pandas-dev#53784)
* Cleanup single used method
* Clean plotting test
* Improve test_series_groupby_nunique
* Address more slow tests
* Undo changes
1 parent 1418b11 commit c318752

File tree

7 files changed

+199
-183
lines changed

7 files changed

+199
-183
lines changed

pandas/tests/frame/methods/test_duplicated.py

+11-7
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import re
2+
import sys
23

34
import numpy as np
45
import pytest
@@ -21,14 +22,17 @@ def test_duplicated_with_misspelled_column_name(subset):
2122
df.duplicated(subset)
2223

2324

24-
@pytest.mark.slow
25-
def test_duplicated_do_not_fail_on_wide_dataframes():
25+
def test_duplicated_implemented_no_recursion():
2626
# gh-21524
27-
# Given the wide dataframe with a lot of columns
28-
# with different (important!) values
29-
data = {f"col_{i:02d}": np.random.randint(0, 1000, 30000) for i in range(100)}
30-
df = DataFrame(data).T
31-
result = df.duplicated()
27+
# Ensure duplicated isn't implemented using recursion that
28+
# can fail on wide frames
29+
df = DataFrame(np.random.randint(0, 1000, (10, 1000)))
30+
rec_limit = sys.getrecursionlimit()
31+
try:
32+
sys.setrecursionlimit(100)
33+
result = df.duplicated()
34+
finally:
35+
sys.setrecursionlimit(rec_limit)
3236

3337
# Then duplicates produce the bool Series as a result and don't fail during
3438
# calculation. Actual values don't matter here, though usually it's all

pandas/tests/groupby/test_nunique.py

+28-36
Original file line numberDiff line numberDiff line change
@@ -17,51 +17,43 @@
1717

1818

1919
@pytest.mark.slow
20-
@pytest.mark.parametrize("n", 10 ** np.arange(2, 6))
21-
@pytest.mark.parametrize("m", [10, 100, 1000])
2220
@pytest.mark.parametrize("sort", [False, True])
2321
@pytest.mark.parametrize("dropna", [False, True])
24-
def test_series_groupby_nunique(n, m, sort, dropna):
25-
def check_nunique(df, keys, as_index=True):
26-
original_df = df.copy()
27-
gr = df.groupby(keys, as_index=as_index, sort=sort)
28-
left = gr["julie"].nunique(dropna=dropna)
29-
30-
gr = df.groupby(keys, as_index=as_index, sort=sort)
31-
right = gr["julie"].apply(Series.nunique, dropna=dropna)
32-
if not as_index:
33-
right = right.reset_index(drop=True)
34-
35-
if as_index:
36-
tm.assert_series_equal(left, right, check_names=False)
37-
else:
38-
tm.assert_frame_equal(left, right, check_names=False)
39-
tm.assert_frame_equal(df, original_df)
40-
22+
@pytest.mark.parametrize("as_index", [True, False])
23+
@pytest.mark.parametrize("with_nan", [True, False])
24+
@pytest.mark.parametrize("keys", [["joe"], ["joe", "jim"]])
25+
def test_series_groupby_nunique(sort, dropna, as_index, with_nan, keys):
26+
n = 100
27+
m = 10
4128
days = date_range("2015-08-23", periods=10)
42-
43-
frame = DataFrame(
29+
df = DataFrame(
4430
{
4531
"jim": np.random.choice(list(ascii_lowercase), n),
4632
"joe": np.random.choice(days, n),
4733
"julie": np.random.randint(0, m, n),
4834
}
4935
)
50-
51-
check_nunique(frame, ["jim"])
52-
check_nunique(frame, ["jim", "joe"])
53-
54-
frame = frame.astype({"julie": float}) # Explicit cast to avoid implicit cast below
55-
frame.loc[1::17, "jim"] = None
56-
frame.loc[3::37, "joe"] = None
57-
frame.loc[7::19, "julie"] = None
58-
frame.loc[8::19, "julie"] = None
59-
frame.loc[9::19, "julie"] = None
60-
61-
check_nunique(frame, ["jim"])
62-
check_nunique(frame, ["jim", "joe"])
63-
check_nunique(frame, ["jim"], as_index=False)
64-
check_nunique(frame, ["jim", "joe"], as_index=False)
36+
if with_nan:
37+
df = df.astype({"julie": float}) # Explicit cast to avoid implicit cast below
38+
df.loc[1::17, "jim"] = None
39+
df.loc[3::37, "joe"] = None
40+
df.loc[7::19, "julie"] = None
41+
df.loc[8::19, "julie"] = None
42+
df.loc[9::19, "julie"] = None
43+
original_df = df.copy()
44+
gr = df.groupby(keys, as_index=as_index, sort=sort)
45+
left = gr["julie"].nunique(dropna=dropna)
46+
47+
gr = df.groupby(keys, as_index=as_index, sort=sort)
48+
right = gr["julie"].apply(Series.nunique, dropna=dropna)
49+
if not as_index:
50+
right = right.reset_index(drop=True)
51+
52+
if as_index:
53+
tm.assert_series_equal(left, right, check_names=False)
54+
else:
55+
tm.assert_frame_equal(left, right, check_names=False)
56+
tm.assert_frame_equal(df, original_df)
6557

6658

6759
def test_nunique():

pandas/tests/io/formats/test_info.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
IS64,
1212
PYPY,
1313
)
14-
import pandas.util._test_decorators as td
1514

1615
from pandas import (
1716
CategoricalIndex,
@@ -504,9 +503,10 @@ def test_memory_usage_empty_no_warning():
504503
tm.assert_series_equal(result, expected)
505504

506505

507-
@td.skip_if_no("numba")
506+
@pytest.mark.single_cpu
508507
def test_info_compute_numba():
509508
# GH#51922
509+
pytest.importorskip("numba")
510510
df = DataFrame([[1, 2], [3, 4]])
511511

512512
with option_context("compute.use_numba", True):

pandas/tests/io/test_stata.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -1831,9 +1831,10 @@ def test_encoding_latin1_118(self, datapath):
18311831
# will block pytests skip mechanism from triggering (failing the test)
18321832
# if the path is not present
18331833
path = datapath("io", "data", "stata", "stata1_encoding_118.dta")
1834-
with tm.assert_produces_warning(UnicodeWarning) as w:
1834+
with tm.assert_produces_warning(UnicodeWarning, filter_level="once") as w:
18351835
encoded = read_stata(path)
1836-
assert len(w) == 151
1836+
# with filter_level="always", produces 151 warnings which can be slow
1837+
assert len(w) == 1
18371838
assert w[0].message.args[0] == msg
18381839

18391840
expected = DataFrame([["Düsseldorf"]] * 151, columns=["kreis1849"])

pandas/tests/plotting/frame/test_frame.py

+21-27
Original file line numberDiff line numberDiff line change
@@ -1782,39 +1782,33 @@ def _check(axes):
17821782
_check_visible(ax.get_xticklabels(minor=True), visible=True)
17831783

17841784
@td.skip_if_no_scipy
1785-
def test_memory_leak(self):
1785+
@pytest.mark.parametrize("kind", plotting.PlotAccessor._all_kinds)
1786+
def test_memory_leak(self, kind):
17861787
"""Check that every plot type gets properly collected."""
1787-
results = {}
1788-
for kind in plotting.PlotAccessor._all_kinds:
1789-
args = {}
1790-
if kind in ["hexbin", "scatter", "pie"]:
1791-
df = DataFrame(
1792-
{
1793-
"A": np.random.uniform(size=20),
1794-
"B": np.random.uniform(size=20),
1795-
"C": np.arange(20) + np.random.uniform(size=20),
1796-
}
1797-
)
1798-
args = {"x": "A", "y": "B"}
1799-
elif kind == "area":
1800-
df = tm.makeTimeDataFrame().abs()
1801-
else:
1802-
df = tm.makeTimeDataFrame()
1803-
1804-
# Use a weakref so we can see if the object gets collected without
1805-
# also preventing it from being collected
1806-
results[kind] = weakref.proxy(df.plot(kind=kind, **args))
1788+
args = {}
1789+
if kind in ["hexbin", "scatter", "pie"]:
1790+
df = DataFrame(
1791+
{
1792+
"A": np.random.uniform(size=20),
1793+
"B": np.random.uniform(size=20),
1794+
"C": np.arange(20) + np.random.uniform(size=20),
1795+
}
1796+
)
1797+
args = {"x": "A", "y": "B"}
1798+
elif kind == "area":
1799+
df = tm.makeTimeDataFrame().abs()
1800+
else:
1801+
df = tm.makeTimeDataFrame()
1802+
1803+
# Use a weakref so we can see if the object gets collected without
1804+
# also preventing it from being collected
1805+
ref = weakref.ref(df.plot(kind=kind, **args))
18071806

18081807
# have matplotlib delete all the figures
18091808
tm.close()
18101809
# force a garbage collection
18111810
gc.collect()
1812-
msg = "weakly-referenced object no longer exists"
1813-
for result_value in results.values():
1814-
# check that every plot was collected
1815-
with pytest.raises(ReferenceError, match=msg):
1816-
# need to actually access something to get an error
1817-
result_value.lines
1811+
assert ref() is None
18181812

18191813
def test_df_gridspec_patterns(self):
18201814
# GH 10819

pandas/tests/plotting/test_boxplot_method.py

+76-58
Original file line numberDiff line numberDiff line change
@@ -392,38 +392,51 @@ def test_grouped_box_return_type(self, hist_df):
392392
result, None, expected_keys=["height", "weight", "category"]
393393
)
394394

395+
@pytest.mark.slow
396+
def test_grouped_box_return_type_groupby(self, hist_df):
397+
df = hist_df
395398
# now for groupby
396399
result = df.groupby("gender").boxplot(return_type="dict")
397400
_check_box_return_type(result, "dict", expected_keys=["Male", "Female"])
398401

399-
columns2 = "X B C D A G Y N Q O".split()
400-
df2 = DataFrame(np.random.randn(50, 10), columns=columns2)
401-
categories2 = "A B C D E F G H I J".split()
402-
df2["category"] = categories2 * 5
402+
@pytest.mark.slow
403+
@pytest.mark.parametrize("return_type", ["dict", "axes", "both"])
404+
def test_grouped_box_return_type_arg(self, hist_df, return_type):
405+
df = hist_df
403406

404-
for t in ["dict", "axes", "both"]:
405-
returned = df.groupby("classroom").boxplot(return_type=t)
406-
_check_box_return_type(returned, t, expected_keys=["A", "B", "C"])
407+
returned = df.groupby("classroom").boxplot(return_type=return_type)
408+
_check_box_return_type(returned, return_type, expected_keys=["A", "B", "C"])
407409

408-
returned = df.boxplot(by="classroom", return_type=t)
409-
_check_box_return_type(
410-
returned, t, expected_keys=["height", "weight", "category"]
411-
)
410+
returned = df.boxplot(by="classroom", return_type=return_type)
411+
_check_box_return_type(
412+
returned, return_type, expected_keys=["height", "weight", "category"]
413+
)
412414

413-
returned = df2.groupby("category").boxplot(return_type=t)
414-
_check_box_return_type(returned, t, expected_keys=categories2)
415+
@pytest.mark.slow
416+
@pytest.mark.parametrize("return_type", ["dict", "axes", "both"])
417+
def test_grouped_box_return_type_arg_duplcate_cats(self, return_type):
418+
columns2 = "X B C D A".split()
419+
df2 = DataFrame(np.random.randn(6, 5), columns=columns2)
420+
categories2 = "A B".split()
421+
df2["category"] = categories2 * 3
422+
423+
returned = df2.groupby("category").boxplot(return_type=return_type)
424+
_check_box_return_type(returned, return_type, expected_keys=categories2)
415425

416-
returned = df2.boxplot(by="category", return_type=t)
417-
_check_box_return_type(returned, t, expected_keys=columns2)
426+
returned = df2.boxplot(by="category", return_type=return_type)
427+
_check_box_return_type(returned, return_type, expected_keys=columns2)
418428

419429
@pytest.mark.slow
420-
def test_grouped_box_layout(self, hist_df):
430+
def test_grouped_box_layout_too_small(self, hist_df):
421431
df = hist_df
422432

423433
msg = "Layout of 1x1 must be larger than required size 2"
424434
with pytest.raises(ValueError, match=msg):
425435
df.boxplot(column=["weight", "height"], by=df.gender, layout=(1, 1))
426436

437+
@pytest.mark.slow
438+
def test_grouped_box_layout_needs_by(self, hist_df):
439+
df = hist_df
427440
msg = "The 'layout' keyword is not supported when 'by' is None"
428441
with pytest.raises(ValueError, match=msg):
429442
df.boxplot(
@@ -432,79 +445,84 @@ def test_grouped_box_layout(self, hist_df):
432445
return_type="dict",
433446
)
434447

448+
@pytest.mark.slow
449+
def test_grouped_box_layout_positive_layout(self, hist_df):
450+
df = hist_df
435451
msg = "At least one dimension of layout must be positive"
436452
with pytest.raises(ValueError, match=msg):
437453
df.boxplot(column=["weight", "height"], by=df.gender, layout=(-1, -1))
438454

439-
# _check_plot_works adds an ax so catch warning. see GH #13188
440-
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
441-
_check_plot_works(
442-
df.groupby("gender").boxplot, column="height", return_type="dict"
443-
)
444-
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=2, layout=(1, 2))
445-
446-
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
447-
_check_plot_works(
448-
df.groupby("category").boxplot, column="height", return_type="dict"
449-
)
450-
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=4, layout=(2, 2))
451-
452-
# GH 6769
455+
@pytest.mark.slow
456+
@pytest.mark.parametrize(
457+
"gb_key, axes_num, rows",
458+
[["gender", 2, 1], ["category", 4, 2], ["classroom", 3, 2]],
459+
)
460+
def test_grouped_box_layout_positive_layout_axes(
461+
self, hist_df, gb_key, axes_num, rows
462+
):
463+
df = hist_df
464+
# _check_plot_works adds an ax so catch warning. See GH #13188, GH #6769
453465
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
454466
_check_plot_works(
455-
df.groupby("classroom").boxplot, column="height", return_type="dict"
467+
df.groupby(gb_key).boxplot, column="height", return_type="dict"
456468
)
457-
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(2, 2))
469+
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=axes_num, layout=(rows, 2))
458470

471+
@pytest.mark.slow
472+
@pytest.mark.parametrize(
473+
"col, visible", [["height", False], ["weight", True], ["category", True]]
474+
)
475+
def test_grouped_box_layout_visible(self, hist_df, col, visible):
476+
df = hist_df
459477
# GH 5897
460478
axes = df.boxplot(
461479
column=["height", "weight", "category"], by="gender", return_type="axes"
462480
)
463481
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(2, 2))
464-
for ax in [axes["height"]]:
465-
_check_visible(ax.get_xticklabels(), visible=False)
466-
_check_visible([ax.xaxis.get_label()], visible=False)
467-
for ax in [axes["weight"], axes["category"]]:
468-
_check_visible(ax.get_xticklabels())
469-
_check_visible([ax.xaxis.get_label()])
482+
ax = axes[col]
483+
_check_visible(ax.get_xticklabels(), visible=visible)
484+
_check_visible([ax.xaxis.get_label()], visible=visible)
470485

486+
@pytest.mark.slow
487+
def test_grouped_box_layout_shape(self, hist_df):
488+
df = hist_df
471489
df.groupby("classroom").boxplot(
472490
column=["height", "weight", "category"], return_type="dict"
473491
)
474492
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(2, 2))
475493

494+
@pytest.mark.slow
495+
@pytest.mark.parametrize("cols", [2, -1])
496+
def test_grouped_box_layout_works(self, hist_df, cols):
497+
df = hist_df
476498
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
477499
_check_plot_works(
478500
df.groupby("category").boxplot,
479501
column="height",
480-
layout=(3, 2),
481-
return_type="dict",
482-
)
483-
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=4, layout=(3, 2))
484-
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
485-
_check_plot_works(
486-
df.groupby("category").boxplot,
487-
column="height",
488-
layout=(3, -1),
502+
layout=(3, cols),
489503
return_type="dict",
490504
)
491505
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=4, layout=(3, 2))
492506

493-
df.boxplot(column=["height", "weight", "category"], by="gender", layout=(4, 1))
494-
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(4, 1))
495-
496-
df.boxplot(column=["height", "weight", "category"], by="gender", layout=(-1, 1))
497-
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(3, 1))
498-
499-
df.groupby("classroom").boxplot(
500-
column=["height", "weight", "category"], layout=(1, 4), return_type="dict"
507+
@pytest.mark.slow
508+
@pytest.mark.parametrize("rows, res", [[4, 4], [-1, 3]])
509+
def test_grouped_box_layout_axes_shape_rows(self, hist_df, rows, res):
510+
df = hist_df
511+
df.boxplot(
512+
column=["height", "weight", "category"], by="gender", layout=(rows, 1)
501513
)
502-
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(1, 4))
514+
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(res, 1))
503515

516+
@pytest.mark.slow
517+
@pytest.mark.parametrize("cols, res", [[4, 4], [-1, 3]])
518+
def test_grouped_box_layout_axes_shape_cols_groupby(self, hist_df, cols, res):
519+
df = hist_df
504520
df.groupby("classroom").boxplot(
505-
column=["height", "weight", "category"], layout=(1, -1), return_type="dict"
521+
column=["height", "weight", "category"],
522+
layout=(1, cols),
523+
return_type="dict",
506524
)
507-
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(1, 3))
525+
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(1, res))
508526

509527
@pytest.mark.slow
510528
def test_grouped_box_multiple_axes(self, hist_df):

0 commit comments

Comments
 (0)