Skip to content

Commit 7e89905

Browse files
authored
TST: parametrize and improve test performance (pandas-dev#55962)
* Parametrize itertuples test
* Refactor test_agg_misc
* Speed up test_stata_119
* Use monkeypatch in test_get_loc_time_obj2
* Use monkeypatch in test_indexing_over_hashtable_size_cutoff
* Parametrize
1 parent 03c1663 commit 7e89905

File tree

5 files changed: +224 -196 lines changed

pandas/tests/frame/test_iteration.py

+14-16
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
import datetime
22

33
import numpy as np
4+
import pytest
45

56
from pandas.compat import (
67
IS64,
@@ -91,6 +92,7 @@ def test_itertuples(self, float_frame):
9192
expected = float_frame.iloc[i, :].reset_index(drop=True)
9293
tm.assert_series_equal(ser, expected)
9394

95+
def test_itertuples_index_false(self):
9496
df = DataFrame(
9597
{"floats": np.random.default_rng(2).standard_normal(5), "ints": range(5)},
9698
columns=["floats", "ints"],
@@ -99,6 +101,7 @@ def test_itertuples(self, float_frame):
99101
for tup in df.itertuples(index=False):
100102
assert isinstance(tup[1], int)
101103

104+
def test_itertuples_duplicate_cols(self):
102105
df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]})
103106
dfaa = df[["a", "a"]]
104107

@@ -111,32 +114,27 @@ def test_itertuples(self, float_frame):
111114
== "[(0, 1, 4), (1, 2, 5), (2, 3, 6)]"
112115
)
113116

117+
def test_itertuples_tuple_name(self):
118+
df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]})
114119
tup = next(df.itertuples(name="TestName"))
115120
assert tup._fields == ("Index", "a", "b")
116121
assert (tup.Index, tup.a, tup.b) == tup
117122
assert type(tup).__name__ == "TestName"
118123

119-
df.columns = ["def", "return"]
124+
def test_itertuples_disallowed_col_labels(self):
125+
df = DataFrame(data={"def": [1, 2, 3], "return": [4, 5, 6]})
120126
tup2 = next(df.itertuples(name="TestName"))
121127
assert tup2 == (0, 1, 4)
122128
assert tup2._fields == ("Index", "_1", "_2")
123129

124-
df3 = DataFrame({"f" + str(i): [i] for i in range(1024)})
125-
# will raise SyntaxError if trying to create namedtuple
126-
tup3 = next(df3.itertuples())
127-
assert isinstance(tup3, tuple)
128-
assert hasattr(tup3, "_fields")
129-
130+
@pytest.mark.parametrize("limit", [254, 255, 1024])
131+
@pytest.mark.parametrize("index", [True, False])
132+
def test_itertuples_py2_3_field_limit_namedtuple(self, limit, index):
130133
# GH#28282
131-
df_254_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(254)}])
132-
result_254_columns = next(df_254_columns.itertuples(index=False))
133-
assert isinstance(result_254_columns, tuple)
134-
assert hasattr(result_254_columns, "_fields")
135-
136-
df_255_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(255)}])
137-
result_255_columns = next(df_255_columns.itertuples(index=False))
138-
assert isinstance(result_255_columns, tuple)
139-
assert hasattr(result_255_columns, "_fields")
134+
df = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(limit)}])
135+
result = next(df.itertuples(index=index))
136+
assert isinstance(result, tuple)
137+
assert hasattr(result, "_fields")
140138

141139
def test_sequence_like_with_categorical(self):
142140
# GH#7839

pandas/tests/indexes/datetimes/test_indexing.py

+7-6
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
import numpy as np
99
import pytest
1010

11+
from pandas._libs import index as libindex
1112
from pandas.compat.numpy import np_long
1213

1314
import pandas as pd
@@ -425,17 +426,17 @@ def test_get_loc_time_obj(self):
425426
expected = np.array([])
426427
tm.assert_numpy_array_equal(result, expected, check_dtype=False)
427428

428-
def test_get_loc_time_obj2(self):
429+
@pytest.mark.parametrize("offset", [-10, 10])
430+
def test_get_loc_time_obj2(self, monkeypatch, offset):
429431
# GH#8667
430-
431-
from pandas._libs.index import _SIZE_CUTOFF
432-
433-
ns = _SIZE_CUTOFF + np.array([-100, 100], dtype=np.int64)
432+
size_cutoff = 50
433+
n = size_cutoff + offset
434434
key = time(15, 11, 30)
435435
start = key.hour * 3600 + key.minute * 60 + key.second
436436
step = 24 * 3600
437437

438-
for n in ns:
438+
with monkeypatch.context():
439+
monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
439440
idx = date_range("2014-11-26", periods=n, freq="s")
440441
ts = pd.Series(np.random.default_rng(2).standard_normal(n), index=idx)
441442
locs = np.arange(start, n, step, dtype=np.intp)

pandas/tests/indexing/multiindex/test_multiindex.py

+12-13
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import numpy as np
22
import pytest
33

4-
import pandas._libs.index as _index
4+
import pandas._libs.index as libindex
55
from pandas.errors import PerformanceWarning
66

77
import pandas as pd
@@ -33,20 +33,19 @@ def test_multiindex_perf_warn(self):
3333
with tm.assert_produces_warning(PerformanceWarning):
3434
df.loc[(0,)]
3535

36-
def test_indexing_over_hashtable_size_cutoff(self):
37-
n = 10000
36+
@pytest.mark.parametrize("offset", [-5, 5])
37+
def test_indexing_over_hashtable_size_cutoff(self, monkeypatch, offset):
38+
size_cutoff = 20
39+
n = size_cutoff + offset
3840

39-
old_cutoff = _index._SIZE_CUTOFF
40-
_index._SIZE_CUTOFF = 20000
41+
with monkeypatch.context():
42+
monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
43+
s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n))))
4144

42-
s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n))))
43-
44-
# hai it works!
45-
assert s[("a", 5)] == 5
46-
assert s[("a", 6)] == 6
47-
assert s[("a", 7)] == 7
48-
49-
_index._SIZE_CUTOFF = old_cutoff
45+
# hai it works!
46+
assert s[("a", 5)] == 5
47+
assert s[("a", 6)] == 6
48+
assert s[("a", 7)] == 7
5049

5150
def test_multi_nan_indexing(self):
5251
# GH 3588

pandas/tests/io/test_stata.py

+5-6
Original file line number | Diff line number | Diff line change
@@ -1833,15 +1833,14 @@ def test_encoding_latin1_118(self, datapath):
18331833
@pytest.mark.slow
18341834
def test_stata_119(self, datapath):
18351835
# Gzipped since contains 32,999 variables and uncompressed is 20MiB
1836+
# Just validate that the reader reports correct number of variables
1837+
# to avoid high peak memory
18361838
with gzip.open(
18371839
datapath("io", "data", "stata", "stata1_119.dta.gz"), "rb"
18381840
) as gz:
1839-
df = read_stata(gz)
1840-
assert df.shape == (1, 32999)
1841-
assert df.iloc[0, 6] == "A" * 3000
1842-
assert df.iloc[0, 7] == 3.14
1843-
assert df.iloc[0, -1] == 1
1844-
assert df.iloc[0, 0] == pd.Timestamp(datetime(2012, 12, 21, 21, 12, 21))
1841+
with StataReader(gz) as reader:
1842+
reader._ensure_open()
1843+
assert reader._nvar == 32999
18451844

18461845
@pytest.mark.parametrize("version", [118, 119, None])
18471846
def test_utf8_writer(self, version):

0 commit comments

Comments
 (0)