TST: parametrize and improve test performance (pandas-dev#55962)

mroeschke · web-flow · commit 7e89905a7097 · 2023-11-16T09:37:18.000-08:00
* Parametrize itertuples test

* Refactor test_agg_misc

* Speed up test_stata_119

* Use monkeypatch in test_get_loc_time_obj2

* Use a monkeypatch in test_indexing_over_hashtable_size_cutoff

* Parametrize
diff --git a/pandas/tests/frame/test_iteration.py b/pandas/tests/frame/test_iteration.py
@@ -1,6 +1,7 @@
 import datetime
 
 import numpy as np
+import pytest
 
 from pandas.compat import (
     IS64,
@@ -91,6 +92,7 @@ def test_itertuples(self, float_frame):
             expected = float_frame.iloc[i, :].reset_index(drop=True)
             tm.assert_series_equal(ser, expected)
 
+    def test_itertuples_index_false(self):
         df = DataFrame(
             {"floats": np.random.default_rng(2).standard_normal(5), "ints": range(5)},
             columns=["floats", "ints"],
@@ -99,6 +101,7 @@ def test_itertuples(self, float_frame):
         for tup in df.itertuples(index=False):
             assert isinstance(tup[1], int)
 
+    def test_itertuples_duplicate_cols(self):
         df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]})
         dfaa = df[["a", "a"]]
 
@@ -111,32 +114,27 @@ def test_itertuples(self, float_frame):
                 == "[(0, 1, 4), (1, 2, 5), (2, 3, 6)]"
             )
 
+    def test_itertuples_tuple_name(self):
+        df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]})
         tup = next(df.itertuples(name="TestName"))
         assert tup._fields == ("Index", "a", "b")
         assert (tup.Index, tup.a, tup.b) == tup
         assert type(tup).__name__ == "TestName"
 
-        df.columns = ["def", "return"]
+    def test_itertuples_disallowed_col_labels(self):
+        df = DataFrame(data={"def": [1, 2, 3], "return": [4, 5, 6]})
         tup2 = next(df.itertuples(name="TestName"))
         assert tup2 == (0, 1, 4)
         assert tup2._fields == ("Index", "_1", "_2")
 
-        df3 = DataFrame({"f" + str(i): [i] for i in range(1024)})
-        # will raise SyntaxError if trying to create namedtuple
-        tup3 = next(df3.itertuples())
-        assert isinstance(tup3, tuple)
-        assert hasattr(tup3, "_fields")
-
+    @pytest.mark.parametrize("limit", [254, 255, 1024])
+    @pytest.mark.parametrize("index", [True, False])
+    def test_itertuples_py2_3_field_limit_namedtuple(self, limit, index):
         # GH#28282
-        df_254_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(254)}])
-        result_254_columns = next(df_254_columns.itertuples(index=False))
-        assert isinstance(result_254_columns, tuple)
-        assert hasattr(result_254_columns, "_fields")
-
-        df_255_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(255)}])
-        result_255_columns = next(df_255_columns.itertuples(index=False))
-        assert isinstance(result_255_columns, tuple)
-        assert hasattr(result_255_columns, "_fields")
+        df = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(limit)}])
+        result = next(df.itertuples(index=index))
+        assert isinstance(result, tuple)
+        assert hasattr(result, "_fields")
 
     def test_sequence_like_with_categorical(self):
         # GH#7839
diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py
@@ -8,6 +8,7 @@
 import numpy as np
 import pytest
 
+from pandas._libs import index as libindex
 from pandas.compat.numpy import np_long
 
 import pandas as pd
@@ -425,17 +426,17 @@ def test_get_loc_time_obj(self):
         expected = np.array([])
         tm.assert_numpy_array_equal(result, expected, check_dtype=False)
 
-    def test_get_loc_time_obj2(self):
+    @pytest.mark.parametrize("offset", [-10, 10])
+    def test_get_loc_time_obj2(self, monkeypatch, offset):
         # GH#8667
-
-        from pandas._libs.index import _SIZE_CUTOFF
-
-        ns = _SIZE_CUTOFF + np.array([-100, 100], dtype=np.int64)
+        size_cutoff = 50
+        n = size_cutoff + offset
         key = time(15, 11, 30)
         start = key.hour * 3600 + key.minute * 60 + key.second
         step = 24 * 3600
 
-        for n in ns:
+        with monkeypatch.context():
+            monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
             idx = date_range("2014-11-26", periods=n, freq="s")
             ts = pd.Series(np.random.default_rng(2).standard_normal(n), index=idx)
             locs = np.arange(start, n, step, dtype=np.intp)
diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest
 
-import pandas._libs.index as _index
+import pandas._libs.index as libindex
 from pandas.errors import PerformanceWarning
 
 import pandas as pd
@@ -33,20 +33,19 @@ def test_multiindex_perf_warn(self):
         with tm.assert_produces_warning(PerformanceWarning):
             df.loc[(0,)]
 
-    def test_indexing_over_hashtable_size_cutoff(self):
-        n = 10000
+    @pytest.mark.parametrize("offset", [-5, 5])
+    def test_indexing_over_hashtable_size_cutoff(self, monkeypatch, offset):
+        size_cutoff = 20
+        n = size_cutoff + offset
 
-        old_cutoff = _index._SIZE_CUTOFF
-        _index._SIZE_CUTOFF = 20000
+        with monkeypatch.context():
+            monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
+            s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n))))
 
-        s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n))))
-
-        # hai it works!
-        assert s[("a", 5)] == 5
-        assert s[("a", 6)] == 6
-        assert s[("a", 7)] == 7
-
-        _index._SIZE_CUTOFF = old_cutoff
+            # hai it works!
+            assert s[("a", 5)] == 5
+            assert s[("a", 6)] == 6
+            assert s[("a", 7)] == 7
 
     def test_multi_nan_indexing(self):
         # GH 3588
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -1833,15 +1833,14 @@ def test_encoding_latin1_118(self, datapath):
     @pytest.mark.slow
     def test_stata_119(self, datapath):
         # Gzipped since contains 32,999 variables and uncompressed is 20MiB
+        # Just validate that the reader reports correct number of variables
+        # to avoid high peak memory
         with gzip.open(
             datapath("io", "data", "stata", "stata1_119.dta.gz"), "rb"
         ) as gz:
-            df = read_stata(gz)
-        assert df.shape == (1, 32999)
-        assert df.iloc[0, 6] == "A" * 3000
-        assert df.iloc[0, 7] == 3.14
-        assert df.iloc[0, -1] == 1
-        assert df.iloc[0, 0] == pd.Timestamp(datetime(2012, 12, 21, 21, 12, 21))
+            with StataReader(gz) as reader:
+                reader._ensure_open()
+                assert reader._nvar == 32999
 
     @pytest.mark.parametrize("version", [118, 119, None])
     def test_utf8_writer(self, version):
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py