Skip to content

TST: parametrize and improve test performance #55962

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Nov 16, 2023
30 changes: 14 additions & 16 deletions pandas/tests/frame/test_iteration.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import datetime

import numpy as np
import pytest

from pandas.compat import (
IS64,
Expand Down Expand Up @@ -91,6 +92,7 @@ def test_itertuples(self, float_frame):
expected = float_frame.iloc[i, :].reset_index(drop=True)
tm.assert_series_equal(ser, expected)

def test_itertuples_index_false(self):
df = DataFrame(
{"floats": np.random.default_rng(2).standard_normal(5), "ints": range(5)},
columns=["floats", "ints"],
Expand All @@ -99,6 +101,7 @@ def test_itertuples(self, float_frame):
for tup in df.itertuples(index=False):
assert isinstance(tup[1], int)

def test_itertuples_duplicate_cols(self):
df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]})
dfaa = df[["a", "a"]]

Expand All @@ -111,32 +114,27 @@ def test_itertuples(self, float_frame):
== "[(0, 1, 4), (1, 2, 5), (2, 3, 6)]"
)

def test_itertuples_tuple_name(self):
df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]})
tup = next(df.itertuples(name="TestName"))
assert tup._fields == ("Index", "a", "b")
assert (tup.Index, tup.a, tup.b) == tup
assert type(tup).__name__ == "TestName"

df.columns = ["def", "return"]
def test_itertuples_disallowed_col_labels(self):
df = DataFrame(data={"def": [1, 2, 3], "return": [4, 5, 6]})
tup2 = next(df.itertuples(name="TestName"))
assert tup2 == (0, 1, 4)
assert tup2._fields == ("Index", "_1", "_2")

df3 = DataFrame({"f" + str(i): [i] for i in range(1024)})
# will raise SyntaxError if trying to create namedtuple
tup3 = next(df3.itertuples())
assert isinstance(tup3, tuple)
assert hasattr(tup3, "_fields")

@pytest.mark.parametrize("limit", [254, 255, 1024])
@pytest.mark.parametrize("index", [True, False])
def test_itertuples_py2_3_field_limit_namedtuple(self, limit, index):
    # GH#28282: itertuples must keep producing namedtuple-like rows even
    # when the column count crosses the historical 254/255 namedtuple
    # field limits, and well beyond (1024 columns).
    df = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(limit)}])
    result = next(df.itertuples(index=index))
    assert isinstance(result, tuple)
    # _fields is the namedtuple marker; a plain-tuple fallback lacks it.
    assert hasattr(result, "_fields")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this a kludgy way of checking for a NamedTuple?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think so, yes (these are also fairly old tests)


def test_sequence_like_with_categorical(self):
# GH#7839
Expand Down
13 changes: 7 additions & 6 deletions pandas/tests/indexes/datetimes/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import numpy as np
import pytest

from pandas._libs import index as libindex
from pandas.compat.numpy import np_long

import pandas as pd
Expand Down Expand Up @@ -425,17 +426,17 @@ def test_get_loc_time_obj(self):
expected = np.array([])
tm.assert_numpy_array_equal(result, expected, check_dtype=False)

def test_get_loc_time_obj2(self):
@pytest.mark.parametrize("offset", [-10, 10])
def test_get_loc_time_obj2(self, monkeypatch, offset):
# GH#8667

from pandas._libs.index import _SIZE_CUTOFF

ns = _SIZE_CUTOFF + np.array([-100, 100], dtype=np.int64)
size_cutoff = 50
n = size_cutoff + offset
key = time(15, 11, 30)
start = key.hour * 3600 + key.minute * 60 + key.second
step = 24 * 3600

for n in ns:
with monkeypatch.context():
monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
idx = date_range("2014-11-26", periods=n, freq="s")
ts = pd.Series(np.random.default_rng(2).standard_normal(n), index=idx)
locs = np.arange(start, n, step, dtype=np.intp)
Expand Down
25 changes: 12 additions & 13 deletions pandas/tests/indexing/multiindex/test_multiindex.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import pytest

import pandas._libs.index as _index
import pandas._libs.index as libindex
from pandas.errors import PerformanceWarning

import pandas as pd
Expand Down Expand Up @@ -33,20 +33,19 @@ def test_multiindex_perf_warn(self):
with tm.assert_produces_warning(PerformanceWarning):
df.loc[(0,)]

@pytest.mark.parametrize("offset", [-5, 5])
def test_indexing_over_hashtable_size_cutoff(self, monkeypatch, offset):
    # Exercise MultiIndex scalar lookup just below and just above the
    # engine's hashtable size cutoff; patch the cutoff down so the test
    # stays small and fast instead of building a 10k+-row index.
    size_cutoff = 20
    n = size_cutoff + offset
    with monkeypatch.context():
        monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
        s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n))))

        # hai it works!
        assert s[("a", 5)] == 5
        assert s[("a", 6)] == 6
        assert s[("a", 7)] == 7

def test_multi_nan_indexing(self):
# GH 3588
Expand Down
11 changes: 5 additions & 6 deletions pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1833,15 +1833,14 @@ def test_encoding_latin1_118(self, datapath):
@pytest.mark.slow
def test_stata_119(self, datapath):
    # Gzipped since contains 32,999 variables and uncompressed is 20MiB
    # Just validate that the reader reports correct number of variables
    # to avoid high peak memory
    with gzip.open(
        datapath("io", "data", "stata", "stata1_119.dta.gz"), "rb"
    ) as gz:
        with StataReader(gz) as reader:
            reader._ensure_open()
            assert reader._nvar == 32999

@pytest.mark.parametrize("version", [118, 119, None])
def test_utf8_writer(self, version):
Expand Down
Loading