Skip to content

TST: Parameterize & make tests more performant #55830

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Nov 7, 2023
Merged
2 changes: 1 addition & 1 deletion pandas/tests/apply/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def int_frame_const_col():
return df


@pytest.fixture(params=["python", "numba"])
@pytest.fixture(params=["python", pytest.param("numba", marks=pytest.mark.single_cpu)])
def engine(request):
if request.param == "numba":
pytest.importorskip("numba")
Expand Down
78 changes: 36 additions & 42 deletions pandas/tests/frame/methods/test_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,27 +392,57 @@ def test_missing_axis_specification_exception(self):
with pytest.raises(ValueError, match=r"axis=0 or 1"):
df.align(series)

def _check_align(self, a, b, axis, fill_axis, how, method, limit=None):
@pytest.mark.parametrize("method", ["pad", "bfill"])
@pytest.mark.parametrize("axis", [0, 1, None])
@pytest.mark.parametrize("fill_axis", [0, 1])
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize(
"left_slice",
[
[slice(4), slice(10)],
[slice(0), slice(0)],
],
)
@pytest.mark.parametrize(
"right_slice",
[
[slice(2, None), slice(6, None)],
[slice(0), slice(0)],
],
)
@pytest.mark.parametrize("limit", [1, None])
def test_align_fill_method(
self, how, method, axis, fill_axis, float_frame, left_slice, right_slice, limit
):
frame = float_frame
left = frame.iloc[left_slice[0], left_slice[1]]
right = frame.iloc[right_slice[0], right_slice[1]]

msg = (
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
"are deprecated"
)

with tm.assert_produces_warning(FutureWarning, match=msg):
aa, ab = a.align(
b, axis=axis, join=how, method=method, limit=limit, fill_axis=fill_axis
aa, ab = left.align(
right,
axis=axis,
join=how,
method=method,
limit=limit,
fill_axis=fill_axis,
)

join_index, join_columns = None, None

ea, eb = a, b
ea, eb = left, right
if axis is None or axis == 0:
join_index = a.index.join(b.index, how=how)
join_index = left.index.join(right.index, how=how)
ea = ea.reindex(index=join_index)
eb = eb.reindex(index=join_index)

if axis is None or axis == 1:
join_columns = a.columns.join(b.columns, how=how)
join_columns = left.columns.join(right.columns, how=how)
ea = ea.reindex(columns=join_columns)
eb = eb.reindex(columns=join_columns)

Expand All @@ -424,42 +454,6 @@ def _check_align(self, a, b, axis, fill_axis, how, method, limit=None):
tm.assert_frame_equal(aa, ea)
tm.assert_frame_equal(ab, eb)

@pytest.mark.parametrize("meth", ["pad", "bfill"])
@pytest.mark.parametrize("ax", [0, 1, None])
@pytest.mark.parametrize("fax", [0, 1])
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
def test_align_fill_method(self, how, meth, ax, fax, float_frame):
df = float_frame
self._check_align_fill(df, how, meth, ax, fax)

def _check_align_fill(self, frame, kind, meth, ax, fax):
left = frame.iloc[0:4, :10]
right = frame.iloc[2:, 6:]
empty = frame.iloc[:0, :0]

self._check_align(left, right, axis=ax, fill_axis=fax, how=kind, method=meth)
self._check_align(
left, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1
)

# empty left
self._check_align(empty, right, axis=ax, fill_axis=fax, how=kind, method=meth)
self._check_align(
empty, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1
)

# empty right
self._check_align(left, empty, axis=ax, fill_axis=fax, how=kind, method=meth)
self._check_align(
left, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1
)

# both empty
self._check_align(empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth)
self._check_align(
empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1
)

def test_align_series_check_copy(self):
# GH#
df = DataFrame({0: [1, 2]})
Expand Down
10 changes: 6 additions & 4 deletions pandas/tests/indexes/multi/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np
import pytest

from pandas._libs import index as libindex
from pandas.errors import (
InvalidIndexError,
PerformanceWarning,
Expand Down Expand Up @@ -843,11 +844,12 @@ def test_contains_td64_level(self):
assert "element_not_exit" not in idx
assert "0 day 09:30:00" in idx

@pytest.mark.slow
def test_large_mi_contains(self):
def test_large_mi_contains(self, monkeypatch):
# GH#10645
result = MultiIndex.from_arrays([range(10**6), range(10**6)])
assert (10**6, 0) not in result
with monkeypatch.context():
monkeypatch.setattr(libindex, "_SIZE_CUTOFF", 10)
result = MultiIndex.from_arrays([range(10), range(10)])
assert (10, 0) not in result


def test_timestamp_multiindex_indexer():
Expand Down
22 changes: 13 additions & 9 deletions pandas/tests/indexing/interval/test_interval.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pytest

from pandas._libs import index as libindex
from pandas.compat import IS64

import pandas as pd
Expand Down Expand Up @@ -72,15 +73,18 @@ def test_getitem_non_matching(self, series_with_interval_index, indexer_sl):
with pytest.raises(KeyError, match=r"\[-1\] not in index"):
indexer_sl(ser)[[-1, 3]]

@pytest.mark.slow
def test_loc_getitem_large_series(self):
ser = Series(
np.arange(1000000), index=IntervalIndex.from_breaks(np.arange(1000001))
)

result1 = ser.loc[:80000]
result2 = ser.loc[0:80000]
result3 = ser.loc[0:80000:1]
def test_loc_getitem_large_series(self, monkeypatch):
    """Label slices on an IntervalIndex agree regardless of slice spelling.

    ``.loc[:8]``, ``.loc[0:8]`` and ``.loc[0:8:1]`` must all return the same
    result on a Series indexed by an IntervalIndex.
    """
    # NOTE(review): _SIZE_CUTOFF is lowered so the Series counts as "large"
    # for the indexing engine without building a million-row fixture —
    # presumably exercising the large-index code path; confirm against
    # pandas._libs.index.
    size_cutoff = 20
    with monkeypatch.context():
        monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
        # Series must be created *after* the cutoff is patched so its
        # engine sees the reduced threshold.
        ser = Series(
            np.arange(size_cutoff),
            index=IntervalIndex.from_breaks(np.arange(size_cutoff + 1)),
        )

        result1 = ser.loc[:8]
        result2 = ser.loc[0:8]
        result3 = ser.loc[0:8:1]
    tm.assert_series_equal(result1, result2)
    tm.assert_series_equal(result1, result3)

Expand Down
20 changes: 11 additions & 9 deletions pandas/tests/indexing/multiindex/test_chaining_and_caching.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pytest

from pandas._libs import index as libindex
from pandas.errors import SettingWithCopyError
import pandas.util._test_decorators as td

Expand Down Expand Up @@ -69,15 +70,16 @@ def test_cache_updating(using_copy_on_write):
assert result == 2


@pytest.mark.slow
def test_indexer_caching():
def test_indexer_caching(monkeypatch):
# GH5727
# make sure that indexers are in the _internal_names_set
n = 1000001
index = MultiIndex.from_arrays([np.arange(n), np.arange(n)])
ser = Series(np.zeros(n), index=index)
size_cutoff = 20
with monkeypatch.context():
monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
index = MultiIndex.from_arrays([np.arange(size_cutoff), np.arange(size_cutoff)])
s = Series(np.zeros(size_cutoff), index=index)

# setitem
expected = Series(np.ones(n), index=index)
ser[ser == 0] = 1
tm.assert_series_equal(ser, expected)
# setitem
s[s == 0] = 1
expected = Series(np.ones(size_cutoff), index=index)
tm.assert_series_equal(s, expected)
8 changes: 1 addition & 7 deletions pandas/tests/io/generate_legacy_storage_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def _create_sp_frame():
return DataFrame(data, index=dates).apply(SparseArray)


def create_data():
def create_pickle_data():
"""create the pickle data"""
data = {
"A": [0.0, 1.0, 2.0, 3.0, np.nan],
Expand Down Expand Up @@ -282,12 +282,6 @@ def create_data():
}


def create_pickle_data():
data = create_data()

return data


def platform_name():
return "_".join(
[
Expand Down
27 changes: 14 additions & 13 deletions pandas/tests/io/parser/test_textreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,8 @@ def test_empty_field_eof(self):
}
assert_array_dicts_equal(result, expected)

@pytest.mark.parametrize("repeat", range(10))
def test_empty_field_eof_mem_access_bug(self, repeat):
# GH5664
a = DataFrame([["b"], [np.nan]], columns=["a"], index=["a", "c"])
b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1])
Expand All @@ -312,21 +314,20 @@ def test_empty_field_eof(self):
index=[0, 5, 7, 12],
)

for _ in range(100):
df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c")
tm.assert_frame_equal(df, a)
df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c")
tm.assert_frame_equal(df, a)

df = read_csv(
StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c"
)
tm.assert_frame_equal(df, b)
df = read_csv(
StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c"
)
tm.assert_frame_equal(df, b)

df = read_csv(
StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"),
names=list("abcd"),
engine="c",
)
tm.assert_frame_equal(df, c)
df = read_csv(
StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"),
names=list("abcd"),
engine="c",
)
tm.assert_frame_equal(df, c)

def test_empty_csv_input(self):
# GH14867
Expand Down
62 changes: 33 additions & 29 deletions pandas/tests/io/test_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

3. Move the created pickle to "data/legacy_pickle/<version>" directory.
"""
from __future__ import annotations

from array import array
import bz2
import datetime
Expand All @@ -22,6 +24,7 @@
import pickle
import shutil
import tarfile
from typing import Any
import uuid
import zipfile

Expand Down Expand Up @@ -52,12 +55,6 @@
)


@pytest.fixture
def current_pickle_data():
# our current version pickle data
return create_pickle_data()


# ---------------------
# comparison functions
# ---------------------
Expand Down Expand Up @@ -173,6 +170,15 @@ def python_unpickler(path):
return pickle.load(fh)


def flatten(data: dict) -> list[tuple[str, Any]]:
    """Flatten the nested create_pickle_data mapping into (type, example) pairs.

    ``data`` maps a type label (e.g. "frame") to a dict of named example
    objects; each example is paired with its type label so the result can be
    fed to ``pytest.mark.parametrize``.
    """
    pairs: list[tuple[str, Any]] = []
    for typ, examples in data.items():
        for example in examples.values():
            pairs.append((typ, example))
    return pairs


@pytest.mark.parametrize(
"pickle_writer",
[
Expand All @@ -190,29 +196,27 @@ def python_unpickler(path):
],
)
@pytest.mark.parametrize("writer", [pd.to_pickle, python_pickler])
def test_round_trip_current(current_pickle_data, pickle_writer, writer):
data = current_pickle_data
for typ, dv in data.items():
for dt, expected in dv.items():
with tm.ensure_clean() as path:
# test writing with each pickler
pickle_writer(expected, path)

# test reading with each unpickler
result = pd.read_pickle(path)
compare_element(result, expected, typ)

result = python_unpickler(path)
compare_element(result, expected, typ)

# and the same for file objects (GH 35679)
with open(path, mode="wb") as handle:
writer(expected, path)
handle.seek(0) # shouldn't close file handle
with open(path, mode="rb") as handle:
result = pd.read_pickle(handle)
handle.seek(0) # shouldn't close file handle
compare_element(result, expected, typ)
@pytest.mark.parametrize("typ, expected", flatten(create_pickle_data()))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how big is the memory footprint of create_pickle_data? i think doing this outside the test means it sticks around during the whole collection/runtime

Copy link
Member Author

@mroeschke mroeschke Nov 6, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doing a recursive getsizeof of the dict yields 426965 bytes, so it doesn't seem like too much. Additionally, IIUC, before this change the data was being returned from the current_pickle_data fixture, which caches and keeps the data around after first use

def test_round_trip_current(typ, expected, pickle_writer, writer):
    """Round-trip each example object through every pickler/unpickler pairing.

    ``typ``/``expected`` come from flattening ``create_pickle_data()`` (see the
    parametrization above); ``pickle_writer`` and ``writer`` are parametrized
    writer callables. The object must compare equal after being written and
    read back via both pandas and the plain-pickle code paths, including
    through an already-open file handle (GH 35679).
    """
    with tm.ensure_clean() as path:
        # test writing with each pickler
        pickle_writer(expected, path)

        # test reading with each unpickler
        result = pd.read_pickle(path)
        compare_element(result, expected, typ)

        result = python_unpickler(path)
        compare_element(result, expected, typ)

        # and the same for file objects (GH 35679)
        with open(path, mode="wb") as handle:
            writer(expected, path)
            handle.seek(0)  # shouldn't close file handle
        with open(path, mode="rb") as handle:
            result = pd.read_pickle(handle)
            handle.seek(0)  # shouldn't close file handle
            compare_element(result, expected, typ)


def test_pickle_path_pathlib():
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/reshape/concat/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ def test_concat_duplicates_in_index_with_keys(self):
@pytest.mark.parametrize("axis", [0, 1])
def test_concat_copies(self, axis, order, ignore_index, using_copy_on_write):
# based on asv ConcatDataFrames
df = DataFrame(np.zeros((10000, 200), dtype=np.float32, order=order))
df = DataFrame(np.zeros((10, 5), dtype=np.float32, order=order))

res = concat([df] * 5, axis=axis, ignore_index=ignore_index, copy=True)

Expand Down