Skip to content

TST: Make old tests more performant #55746

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Oct 29, 2023
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandas/tests/apply/test_numba.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
)
import pandas._testing as tm

pytestmark = td.skip_if_no("numba")
pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu]


def test_numba_vs_python_noop(float_frame, apply_axis):
Expand Down
14 changes: 10 additions & 4 deletions pandas/tests/frame/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,14 @@
)
import pandas._testing as tm
from pandas.core.computation import expressions as expr
from pandas.core.computation.expressions import _MIN_ELEMENTS
from pandas.tests.frame.common import (
_check_mixed_float,
_check_mixed_int,
)
from pandas.util.version import Version


@pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"])
@pytest.fixture(autouse=True, params=[0, 100], ids=["numexpr", "python"])
def switch_numexpr_min_elements(request, monkeypatch):
with monkeypatch.context() as m:
m.setattr(expr, "_MIN_ELEMENTS", request.param)
Expand Down Expand Up @@ -500,7 +499,9 @@ def test_floordiv_axis0(self):
tm.assert_frame_equal(result2, expected)

@pytest.mark.parametrize("opname", ["floordiv", "pow"])
def test_floordiv_axis0_numexpr_path(self, opname, request):
def test_floordiv_axis0_numexpr_path(
self, opname, request, switch_numexpr_min_elements
):
# case that goes through numexpr and has to fall back to masked_arith_op
ne = pytest.importorskip("numexpr")
if (
Expand All @@ -514,7 +515,12 @@ def test_floordiv_axis0_numexpr_path(self, opname, request):

op = getattr(operator, opname)

arr = np.arange(_MIN_ELEMENTS + 100).reshape(_MIN_ELEMENTS // 100 + 1, -1) * 100
arr = (
np.arange(switch_numexpr_min_elements + 10).reshape(
switch_numexpr_min_elements // 10 + 1, -1
)
* 100
)
df = DataFrame(arr)
df["C"] = 1.0

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -2186,7 +2186,7 @@ def __init__(self, *args, **kwargs) -> None:
with monkeypatch.context() as m:
m.setattr(reshape_lib, "_Unstacker", MockUnstacker)
df = DataFrame(
np.random.default_rng(2).standard_normal((2**16, 2)),
np.zeros((2**16, 2)),
index=[np.arange(2**16), np.arange(2**16)],
)
msg = "The following operation may generate"
Expand Down
35 changes: 18 additions & 17 deletions pandas/tests/groupby/methods/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
and proper parameter handling
"""

from itertools import product

import numpy as np
import pytest
Expand Down Expand Up @@ -46,7 +45,6 @@ def tests_value_counts_index_names_category_column():
tm.assert_series_equal(result, expected)


# our starting frame
def seed_df(seed_nans, n, m):
days = date_range("2015-08-24", periods=10)

Expand All @@ -70,29 +68,32 @@ def seed_df(seed_nans, n, m):
return frame


# create input df, keys, and the bins
binned = []
ids = []
for seed_nans in [True, False]:
for n, m in product((100, 1000), (5, 20)):
df = seed_df(seed_nans, n, m)
bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2)
keys = "1st", "2nd", ["1st", "2nd"]
for k, b in product(keys, bins):
binned.append((df, k, b, n, m))
ids.append(f"{k}-{n}-{m}")


@pytest.mark.slow
@pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice!

@pytest.mark.parametrize("seed_nans", [True, False])
@pytest.mark.parametrize("num_rows", [10, 50])
@pytest.mark.parametrize("max_int", [5, 20])
@pytest.mark.parametrize("keys", ["1st", "2nd", ["1st", "2nd"]], ids=repr)
@pytest.mark.parametrize("bins", [None, [0, 5]], ids=repr)
@pytest.mark.parametrize("isort", [True, False])
@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")])
@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("dropna", [True, False])
def test_series_groupby_value_counts(
df, keys, bins, n, m, isort, normalize, name, sort, ascending, dropna
seed_nans,
num_rows,
max_int,
keys,
bins,
isort,
normalize,
name,
sort,
ascending,
dropna,
):
df = seed_df(seed_nans, num_rows, max_int)

def rebuild_index(df):
arr = list(map(df.index.get_level_values, range(df.index.nlevels)))
df.index = MultiIndex.from_arrays(arr, names=df.index.names)
Expand Down
13 changes: 8 additions & 5 deletions pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import numpy as np
import pytest

from pandas._libs import index as libindex
from pandas.errors import IndexingError
import pandas.util._test_decorators as td

Expand Down Expand Up @@ -1974,12 +1975,14 @@ def test_loc_drops_level(self):


class TestLocSetitemWithExpansion:
@pytest.mark.slow
def test_loc_setitem_with_expansion_large_dataframe(self):
def test_loc_setitem_with_expansion_large_dataframe(self, monkeypatch):
# GH#10692
result = DataFrame({"x": range(10**6)}, dtype="int64")
result.loc[len(result)] = len(result) + 1
expected = DataFrame({"x": range(10**6 + 1)}, dtype="int64")
size_cutoff = 50
with monkeypatch.context():
monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, looking at this, I'd actually expect us to cdef _SIZE_CUTOFF for a tiny perf boost. No harm in doing this until then, I guess.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As long as there's an accessible Python version to override this too, SGTM.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think they're mutually exclusive, but I'm fine with doing this for the short-to-medium term.

result = DataFrame({"x": range(size_cutoff)}, dtype="int64")
result.loc[size_cutoff] = size_cutoff
expected = DataFrame({"x": range(size_cutoff + 1)}, dtype="int64")
tm.assert_frame_equal(result, expected)

def test_loc_setitem_empty_series(self):
Expand Down
74 changes: 33 additions & 41 deletions pandas/tests/io/parser/test_c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,9 @@ def test_unsupported_dtype(c_parser_only, match, kwargs):

@td.skip_if_32bit
@pytest.mark.slow
def test_precise_conversion(c_parser_only):
# test numbers between 1 and 2
@pytest.mark.parametrize("num", np.linspace(1.0, 2.0, num=21))
def test_precise_conversion(c_parser_only, num):
parser = c_parser_only

normal_errors = []
Expand All @@ -156,27 +158,23 @@ def test_precise_conversion(c_parser_only):
def error(val: float, actual_val: Decimal) -> Decimal:
return abs(Decimal(f"{val:.100}") - actual_val)

# test numbers between 1 and 2
for num in np.linspace(1.0, 2.0, num=500):
# 25 decimal digits of precision
text = f"a\n{num:.25}"
# 25 decimal digits of precision
text = f"a\n{num:.25}"

normal_val = float(
parser.read_csv(StringIO(text), float_precision="legacy")["a"][0]
)
precise_val = float(
parser.read_csv(StringIO(text), float_precision="high")["a"][0]
)
roundtrip_val = float(
parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
)
actual_val = Decimal(text[2:])
normal_val = float(
parser.read_csv(StringIO(text), float_precision="legacy")["a"][0]
)
precise_val = float(parser.read_csv(StringIO(text), float_precision="high")["a"][0])
roundtrip_val = float(
parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
)
actual_val = Decimal(text[2:])

normal_errors.append(error(normal_val, actual_val))
precise_errors.append(error(precise_val, actual_val))
normal_errors.append(error(normal_val, actual_val))
precise_errors.append(error(precise_val, actual_val))

# round-trip should match float()
assert roundtrip_val == float(text[2:])
# round-trip should match float()
assert roundtrip_val == float(text[2:])

assert sum(precise_errors) <= sum(normal_errors)
assert max(precise_errors) <= max(normal_errors)
Expand Down Expand Up @@ -287,7 +285,8 @@ def test_tokenize_CR_with_quoting(c_parser_only):


@pytest.mark.slow
def test_grow_boundary_at_cap(c_parser_only):
@pytest.mark.parametrize("count", [3 * 2**n for n in range(6)])
def test_grow_boundary_at_cap(c_parser_only, count):
# See gh-12494
#
# Cause of error was that the C parser
Expand All @@ -296,19 +295,18 @@ def test_grow_boundary_at_cap(c_parser_only):
# to capacity, which would later cause a
# buffer overflow error when checking the
# EOF terminator of the CSV stream.
# 3 * 2^n commas was observed to break the parser
parser = c_parser_only

def test_empty_header_read(count):
with StringIO("," * count) as s:
expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)])
df = parser.read_csv(s)
tm.assert_frame_equal(df, expected)

for cnt in range(1, 101):
test_empty_header_read(cnt)
with StringIO("," * count) as s:
expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)])
df = parser.read_csv(s)
tm.assert_frame_equal(df, expected)


def test_parse_trim_buffers(c_parser_only):
@pytest.mark.slow
@pytest.mark.parametrize("encoding", [None, "utf-8"])
def test_parse_trim_buffers(c_parser_only, encoding):
# This test is part of a bugfix for gh-13703. It attempts to
# stress the system memory allocator, to cause it to move the
# stream buffer and either let the OS reclaim the region, or let
Expand All @@ -319,6 +317,9 @@ def test_parse_trim_buffers(c_parser_only):
# times it fails due to memory corruption, which causes the
# loaded DataFrame to differ from the expected one.

# Also force 'utf-8' encoding, so that `_string_convert` would take
# a different execution branch.

parser = c_parser_only

# Generate a large mixed-type CSV file on-the-fly (one record is
Expand Down Expand Up @@ -374,25 +375,16 @@ def test_parse_trim_buffers(c_parser_only):
)

# Iterate over the CSV file in chunks of `chunksize` lines
with parser.read_csv(
StringIO(csv_data), header=None, dtype=object, chunksize=chunksize
) as chunks_:
result = concat(chunks_, axis=0, ignore_index=True)

# Check for data corruption if there was no segfault
tm.assert_frame_equal(result, expected)

# This extra test was added to replicate the fault in gh-5291.
# Force 'utf-8' encoding, so that `_string_convert` would take
# a different execution branch.
with parser.read_csv(
StringIO(csv_data),
header=None,
dtype=object,
chunksize=chunksize,
encoding="utf_8",
encoding=encoding,
) as chunks_:
result = concat(chunks_, axis=0, ignore_index=True)

# Check for data corruption if there was no segfault
tm.assert_frame_equal(result, expected)


Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/io/parser/test_parse_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import pandas._testing as tm
from pandas._testing._hypothesis import DATETIME_NO_TZ
from pandas.core.indexes.datetimes import date_range
from pandas.core.tools.datetimes import start_caching_at

from pandas.io.parsers import read_csv

Expand Down Expand Up @@ -1285,7 +1286,7 @@ def test_bad_date_parse(all_parsers, cache_dates, value):
# if we have an invalid date make sure that we handle this with
# and w/o the cache properly
parser = all_parsers
s = StringIO((f"{value},\n") * 50000)
s = StringIO((f"{value},\n") * (start_caching_at + 1))

warn = None
msg = "Passing a BlockManager to DataFrame"
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/resample/test_datetime_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,7 @@ def test_upsample_with_limit(unit):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("freq", ["5D", "10h", "5Min", "10s"])
@pytest.mark.parametrize("freq", ["1D", "10h", "5Min", "10s"])
@pytest.mark.parametrize("rule", ["Y", "3ME", "15D", "30h", "15Min", "30s"])
def test_nearest_upsample_with_limit(tz_aware_fixture, freq, rule, unit):
# GH 33939
Expand Down
16 changes: 10 additions & 6 deletions pandas/tests/series/methods/test_isin.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
date_range,
)
import pandas._testing as tm
from pandas.core import algorithms
from pandas.core.arrays import PeriodArray


Expand Down Expand Up @@ -197,13 +198,16 @@ def test_isin_masked_types(self, dtype, data, values, expected):
tm.assert_series_equal(result, expected)


@pytest.mark.slow
def test_isin_large_series_mixed_dtypes_and_nan():
def test_isin_large_series_mixed_dtypes_and_nan(monkeypatch):
# https://github.com/pandas-dev/pandas/issues/37094
# combination of object dtype for the values and > 1_000_000 elements
ser = Series([1, 2, np.nan] * 1_000_000)
result = ser.isin({"foo", "bar"})
expected = Series([False] * 3 * 1_000_000)
# combination of object dtype for the values
# and > _MINIMUM_COMP_ARR_LEN elements
min_isin_comp = 5
ser = Series([1, 2, np.nan] * min_isin_comp)
with monkeypatch.context() as m:
m.setattr(algorithms, "_MINIMUM_COMP_ARR_LEN", min_isin_comp)
result = ser.isin({"foo", "bar"})
expected = Series([False] * 3 * min_isin_comp)
tm.assert_series_equal(result, expected)


Expand Down