
Commit adae693

TST: Make old tests more performant (#55746)
* Use zeros instead of random data
* Mark numba apply tests as single cpu
* Parameterize and make test_grow_boundary_at_cap more specific
* Parameterize and test fewer values in test_precise_conversion
* Parameterize and mark test_parse_trim_buffers as slow
* Reduce resample size of test_nearest_upsample_with_limit
* Use start_caching_at for test_bad_date_parse
* Parameterize test_series_groupby_value_counts
* Monkeypatch magic number in test_isin_large_series_mixed_dtypes_and_nan
* Use _SIZE_CUTOFF for test_loc_setitem_with_expansion_large_dataframe
* Use switch_numexpr_min_elements for test_floordiv_axis0_numexpr_path
* Remove redundant test
1 parent 8425c97 commit adae693
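Most of these changes share one pattern: rather than building inputs large enough to cross an internal size threshold (`_SIZE_CUTOFF`, `_MIN_ELEMENTS`, `_MINIMUM_COMP_ARR_LEN`, `start_caching_at`), the tests now monkeypatch the threshold down so a tiny input exercises the same code path. A minimal sketch of the pattern, using a stand-in namespace instead of a real pandas module:

```python
import types

# Stand-in for a library module that switches algorithms above a
# module-level cutoff (hypothetical; pandas uses constants such as
# pandas._libs.index._SIZE_CUTOFF).
mylib = types.SimpleNamespace(_SIZE_CUTOFF=1_000_000)


def takes_large_path(n: int) -> bool:
    # Pretend implementation: pick the "large input" branch at the cutoff.
    return n >= mylib._SIZE_CUTOFF


def test_large_path_with_tiny_input(monkeypatch):
    # Shrink the threshold instead of growing the data; the same branch
    # now runs with 5 elements instead of 1_000_000.
    monkeypatch.setattr(mylib, "_SIZE_CUTOFF", 5)
    assert takes_large_path(5)
```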

9 files changed: +75 -104

pandas/tests/apply/test_numba.py

+1 -1

@@ -9,7 +9,7 @@
 )
 import pandas._testing as tm
 
-pytestmark = td.skip_if_no("numba")
+pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu]
 
 
 def test_numba_vs_python_noop(float_frame, apply_axis):

pandas/tests/frame/test_arithmetic.py

+1 -31

@@ -22,15 +22,13 @@
 )
 import pandas._testing as tm
 from pandas.core.computation import expressions as expr
-from pandas.core.computation.expressions import _MIN_ELEMENTS
 from pandas.tests.frame.common import (
     _check_mixed_float,
     _check_mixed_int,
 )
-from pandas.util.version import Version
 
 
-@pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"])
+@pytest.fixture(autouse=True, params=[0, 100], ids=["numexpr", "python"])
 def switch_numexpr_min_elements(request, monkeypatch):
     with monkeypatch.context() as m:
         m.setattr(expr, "_MIN_ELEMENTS", request.param)
@@ -499,34 +497,6 @@ def test_floordiv_axis0(self):
         result2 = df.floordiv(ser.values, axis=0)
         tm.assert_frame_equal(result2, expected)
 
-    @pytest.mark.parametrize("opname", ["floordiv", "pow"])
-    def test_floordiv_axis0_numexpr_path(self, opname, request):
-        # case that goes through numexpr and has to fall back to masked_arith_op
-        ne = pytest.importorskip("numexpr")
-        if (
-            Version(ne.__version__) >= Version("2.8.7")
-            and opname == "pow"
-            and "python" in request.node.callspec.id
-        ):
-            request.applymarker(
-                pytest.mark.xfail(reason="https://github.com/pydata/numexpr/issues/454")
-            )
-
-        op = getattr(operator, opname)
-
-        arr = np.arange(_MIN_ELEMENTS + 100).reshape(_MIN_ELEMENTS // 100 + 1, -1) * 100
-        df = DataFrame(arr)
-        df["C"] = 1.0
-
-        ser = df[0]
-        result = getattr(df, opname)(ser, axis=0)
-
-        expected = DataFrame({col: op(df[col], ser) for col in df.columns})
-        tm.assert_frame_equal(result, expected)
-
-        result2 = getattr(df, opname)(ser.values, axis=0)
-        tm.assert_frame_equal(result2, expected)
-
     def test_df_add_td64_columnwise(self):
         # GH 22534 Check that column-wise addition broadcasts correctly
         dti = pd.date_range("2016-01-01", periods=10)
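The removed `test_floordiv_axis0_numexpr_path` was redundant because the module's autouse, parametrized fixture already runs every test in the file, including `test_floordiv_axis0`, under both `_MIN_ELEMENTS` settings. A toy version of that fixture shape, with a hypothetical stand-in module:

```python
import types

import pytest

mymod = types.SimpleNamespace(_MIN_ELEMENTS=1_000_000)  # hypothetical module


@pytest.fixture(autouse=True, params=[0, 100], ids=["numexpr", "python"])
def switch_min_elements(request, monkeypatch):
    # autouse + params: every test in the module is collected once per
    # value, covering both the accelerated and the plain-Python paths.
    monkeypatch.setattr(mymod, "_MIN_ELEMENTS", request.param)


def test_sees_both_values():
    # Collected twice: once with _MIN_ELEMENTS == 0, once with 100.
    assert mymod._MIN_ELEMENTS in (0, 100)
```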

pandas/tests/frame/test_stack_unstack.py

+1 -1

@@ -2186,7 +2186,7 @@ def __init__(self, *args, **kwargs) -> None:
         with monkeypatch.context() as m:
             m.setattr(reshape_lib, "_Unstacker", MockUnstacker)
             df = DataFrame(
-                np.random.default_rng(2).standard_normal((2**16, 2)),
+                np.zeros((2**16, 2)),
                 index=[np.arange(2**16), np.arange(2**16)],
             )
             msg = "The following operation may generate"

pandas/tests/groupby/methods/test_value_counts.py

+18 -17

@@ -4,7 +4,6 @@
 and proper parameter handling
 """
 
-from itertools import product
 
 import numpy as np
 import pytest
@@ -46,7 +45,6 @@ def tests_value_counts_index_names_category_column():
     tm.assert_series_equal(result, expected)
 
 
-# our starting frame
 def seed_df(seed_nans, n, m):
     days = date_range("2015-08-24", periods=10)
 
@@ -70,29 +68,32 @@ def seed_df(seed_nans, n, m):
     return frame
 
 
-# create input df, keys, and the bins
-binned = []
-ids = []
-for seed_nans in [True, False]:
-    for n, m in product((100, 1000), (5, 20)):
-        df = seed_df(seed_nans, n, m)
-        bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2)
-        keys = "1st", "2nd", ["1st", "2nd"]
-        for k, b in product(keys, bins):
-            binned.append((df, k, b, n, m))
-            ids.append(f"{k}-{n}-{m}")
-
-
 @pytest.mark.slow
-@pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids)
+@pytest.mark.parametrize("seed_nans", [True, False])
+@pytest.mark.parametrize("num_rows", [10, 50])
+@pytest.mark.parametrize("max_int", [5, 20])
+@pytest.mark.parametrize("keys", ["1st", "2nd", ["1st", "2nd"]], ids=repr)
+@pytest.mark.parametrize("bins", [None, [0, 5]], ids=repr)
 @pytest.mark.parametrize("isort", [True, False])
 @pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")])
 @pytest.mark.parametrize("sort", [True, False])
 @pytest.mark.parametrize("ascending", [True, False])
 @pytest.mark.parametrize("dropna", [True, False])
 def test_series_groupby_value_counts(
-    df, keys, bins, n, m, isort, normalize, name, sort, ascending, dropna
+    seed_nans,
+    num_rows,
+    max_int,
+    keys,
+    bins,
+    isort,
+    normalize,
+    name,
+    sort,
+    ascending,
+    dropna,
 ):
+    df = seed_df(seed_nans, num_rows, max_int)
+
     def rebuild_index(df):
         arr = list(map(df.index.get_level_values, range(df.index.nlevels)))
         df.index = MultiIndex.from_arrays(arr, names=df.index.names)
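Stacked `@pytest.mark.parametrize` marks produce the same cross product the deleted module-level loops assembled by hand, but the cases are generated at collection time and `seed_df` now runs inside each test rather than at import. A toy illustration of the mechanism (names are illustrative, not from pandas):

```python
import pytest


@pytest.mark.parametrize("num_rows", [10, 50])
@pytest.mark.parametrize("seed_nans", [True, False])
def test_collects_cross_product(seed_nans, num_rows):
    # Stacking two 2-valued parametrize marks yields 4 collected cases,
    # with no data built at module import time.
    assert seed_nans in (True, False)
    assert num_rows in (10, 50)
```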

pandas/tests/indexing/test_loc.py

+8 -5

@@ -12,6 +12,7 @@
 import numpy as np
 import pytest
 
+from pandas._libs import index as libindex
 from pandas.errors import IndexingError
 import pandas.util._test_decorators as td
 
@@ -1974,12 +1975,14 @@ def test_loc_drops_level(self):
 
 
 class TestLocSetitemWithExpansion:
-    @pytest.mark.slow
-    def test_loc_setitem_with_expansion_large_dataframe(self):
+    def test_loc_setitem_with_expansion_large_dataframe(self, monkeypatch):
         # GH#10692
-        result = DataFrame({"x": range(10**6)}, dtype="int64")
-        result.loc[len(result)] = len(result) + 1
-        expected = DataFrame({"x": range(10**6 + 1)}, dtype="int64")
+        size_cutoff = 50
+        with monkeypatch.context():
+            monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
+            result = DataFrame({"x": range(size_cutoff)}, dtype="int64")
+            result.loc[size_cutoff] = size_cutoff
+            expected = DataFrame({"x": range(size_cutoff + 1)}, dtype="int64")
         tm.assert_frame_equal(result, expected)
 
     def test_loc_setitem_empty_series(self):

pandas/tests/io/parser/test_c_parser_only.py

+33 -41

@@ -147,7 +147,9 @@ def test_unsupported_dtype(c_parser_only, match, kwargs):
 
 @td.skip_if_32bit
 @pytest.mark.slow
-def test_precise_conversion(c_parser_only):
+# test numbers between 1 and 2
+@pytest.mark.parametrize("num", np.linspace(1.0, 2.0, num=21))
+def test_precise_conversion(c_parser_only, num):
     parser = c_parser_only
 
     normal_errors = []
@@ -156,27 +158,23 @@ def test_precise_conversion(c_parser_only):
     def error(val: float, actual_val: Decimal) -> Decimal:
         return abs(Decimal(f"{val:.100}") - actual_val)
 
-    # test numbers between 1 and 2
-    for num in np.linspace(1.0, 2.0, num=500):
-        # 25 decimal digits of precision
-        text = f"a\n{num:.25}"
+    # 25 decimal digits of precision
+    text = f"a\n{num:.25}"
 
-        normal_val = float(
-            parser.read_csv(StringIO(text), float_precision="legacy")["a"][0]
-        )
-        precise_val = float(
-            parser.read_csv(StringIO(text), float_precision="high")["a"][0]
-        )
-        roundtrip_val = float(
-            parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
-        )
-        actual_val = Decimal(text[2:])
+    normal_val = float(
+        parser.read_csv(StringIO(text), float_precision="legacy")["a"][0]
+    )
+    precise_val = float(parser.read_csv(StringIO(text), float_precision="high")["a"][0])
+    roundtrip_val = float(
+        parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
+    )
+    actual_val = Decimal(text[2:])
 
-        normal_errors.append(error(normal_val, actual_val))
-        precise_errors.append(error(precise_val, actual_val))
+    normal_errors.append(error(normal_val, actual_val))
+    precise_errors.append(error(precise_val, actual_val))
 
-        # round-trip should match float()
-        assert roundtrip_val == float(text[2:])
+    # round-trip should match float()
+    assert roundtrip_val == float(text[2:])
 
     assert sum(precise_errors) <= sum(normal_errors)
     assert max(precise_errors) <= max(normal_errors)
@@ -287,7 +285,8 @@ def test_tokenize_CR_with_quoting(c_parser_only):
 
 
 @pytest.mark.slow
-def test_grow_boundary_at_cap(c_parser_only):
+@pytest.mark.parametrize("count", [3 * 2**n for n in range(6)])
+def test_grow_boundary_at_cap(c_parser_only, count):
     # See gh-12494
     #
     # Cause of error was that the C parser
@@ -296,19 +295,18 @@ def test_grow_boundary_at_cap(c_parser_only):
     # to capacity, which would later cause a
     # buffer overflow error when checking the
     # EOF terminator of the CSV stream.
+    # 3 * 2^n commas was observed to break the parser
     parser = c_parser_only
 
-    def test_empty_header_read(count):
-        with StringIO("," * count) as s:
-            expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)])
-            df = parser.read_csv(s)
-            tm.assert_frame_equal(df, expected)
-
-    for cnt in range(1, 101):
-        test_empty_header_read(cnt)
+    with StringIO("," * count) as s:
+        expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)])
+        df = parser.read_csv(s)
+        tm.assert_frame_equal(df, expected)
 
 
-def test_parse_trim_buffers(c_parser_only):
+@pytest.mark.slow
+@pytest.mark.parametrize("encoding", [None, "utf-8"])
+def test_parse_trim_buffers(c_parser_only, encoding):
     # This test is part of a bugfix for gh-13703. It attempts to
     # to stress the system memory allocator, to cause it to move the
     # stream buffer and either let the OS reclaim the region, or let
@@ -319,6 +317,9 @@ def test_parse_trim_buffers(c_parser_only):
     # times it fails due to memory corruption, which causes the
     # loaded DataFrame to differ from the expected one.
 
+    # Also force 'utf-8' encoding, so that `_string_convert` would take
+    # a different execution branch.
+
     parser = c_parser_only
 
     # Generate a large mixed-type CSV file on-the-fly (one record is
@@ -374,25 +375,16 @@
     )
 
     # Iterate over the CSV file in chunks of `chunksize` lines
-    with parser.read_csv(
-        StringIO(csv_data), header=None, dtype=object, chunksize=chunksize
-    ) as chunks_:
-        result = concat(chunks_, axis=0, ignore_index=True)
-
-    # Check for data corruption if there was no segfault
-    tm.assert_frame_equal(result, expected)
-
-    # This extra test was added to replicate the fault in gh-5291.
-    # Force 'utf-8' encoding, so that `_string_convert` would take
-    # a different execution branch.
     with parser.read_csv(
         StringIO(csv_data),
         header=None,
         dtype=object,
         chunksize=chunksize,
-        encoding="utf_8",
+        encoding=encoding,
    ) as chunks_:
         result = concat(chunks_, axis=0, ignore_index=True)
+
+    # Check for data corruption if there was no segfault
     tm.assert_frame_equal(result, expected)
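On the `test_grow_boundary_at_cap` change: the new `count` values appear to pin the buffer-growth boundaries that the old `range(1, 101)` loop was sweeping past one comma at a time. A quick check of what the expression produces:

```python
# The parametrized counts land on the 3 * 2**n growth boundaries noted in
# the diff's comment, instead of brute-forcing every count from 1 to 100.
counts = [3 * 2**n for n in range(6)]
assert counts == [3, 6, 12, 24, 48, 96]
```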
pandas/tests/io/parser/test_parse_dates.py

+2 -1

@@ -32,6 +32,7 @@
 import pandas._testing as tm
 from pandas._testing._hypothesis import DATETIME_NO_TZ
 from pandas.core.indexes.datetimes import date_range
+from pandas.core.tools.datetimes import start_caching_at
 
 from pandas.io.parsers import read_csv
 
@@ -1285,7 +1286,7 @@ def test_bad_date_parse(all_parsers, cache_dates, value):
     # if we have an invalid date make sure that we handle this with
     # and w/o the cache properly
     parser = all_parsers
-    s = StringIO((f"{value},\n") * 50000)
+    s = StringIO((f"{value},\n") * (start_caching_at + 1))
 
     warn = None
     msg = "Passing a BlockManager to DataFrame"

pandas/tests/resample/test_datetime_index.py

+1 -1

@@ -516,7 +516,7 @@ def test_upsample_with_limit(unit):
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.parametrize("freq", ["5D", "10h", "5Min", "10s"])
+@pytest.mark.parametrize("freq", ["1D", "10h", "5Min", "10s"])
 @pytest.mark.parametrize("rule", ["Y", "3ME", "15D", "30h", "15Min", "30s"])
 def test_nearest_upsample_with_limit(tz_aware_fixture, freq, rule, unit):
     # GH 33939

pandas/tests/series/methods/test_isin.py

+10 -6

@@ -7,6 +7,7 @@
     date_range,
 )
 import pandas._testing as tm
+from pandas.core import algorithms
 from pandas.core.arrays import PeriodArray
 
 
@@ -197,13 +198,16 @@ def test_isin_masked_types(self, dtype, data, values, expected):
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.slow
-def test_isin_large_series_mixed_dtypes_and_nan():
+def test_isin_large_series_mixed_dtypes_and_nan(monkeypatch):
     # https://github.com/pandas-dev/pandas/issues/37094
-    # combination of object dtype for the values and > 1_000_000 elements
-    ser = Series([1, 2, np.nan] * 1_000_000)
-    result = ser.isin({"foo", "bar"})
-    expected = Series([False] * 3 * 1_000_000)
+    # combination of object dtype for the values
+    # and > _MINIMUM_COMP_ARR_LEN elements
+    min_isin_comp = 5
+    ser = Series([1, 2, np.nan] * min_isin_comp)
+    with monkeypatch.context() as m:
+        m.setattr(algorithms, "_MINIMUM_COMP_ARR_LEN", min_isin_comp)
+        result = ser.isin({"foo", "bar"})
+        expected = Series([False] * 3 * min_isin_comp)
     tm.assert_series_equal(result, expected)