diff --git a/pandas/tests/frame/test_iteration.py b/pandas/tests/frame/test_iteration.py
index 216fd5c2f70c5..7374a8ea6aa77 100644
--- a/pandas/tests/frame/test_iteration.py
+++ b/pandas/tests/frame/test_iteration.py
@@ -1,6 +1,7 @@
 import datetime
 
 import numpy as np
+import pytest
 
 from pandas.compat import (
     IS64,
@@ -91,6 +92,7 @@ def test_itertuples(self, float_frame):
             expected = float_frame.iloc[i, :].reset_index(drop=True)
             tm.assert_series_equal(ser, expected)
 
+    def test_itertuples_index_false(self):
         df = DataFrame(
             {"floats": np.random.default_rng(2).standard_normal(5), "ints": range(5)},
             columns=["floats", "ints"],
@@ -99,6 +101,7 @@ def test_itertuples(self, float_frame):
         for tup in df.itertuples(index=False):
             assert isinstance(tup[1], int)
 
+    def test_itertuples_duplicate_cols(self):
         df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]})
         dfaa = df[["a", "a"]]
 
@@ -111,32 +114,27 @@ def test_itertuples(self, float_frame):
             == "[(0, 1, 4), (1, 2, 5), (2, 3, 6)]"
         )
 
+    def test_itertuples_tuple_name(self):
+        df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]})
         tup = next(df.itertuples(name="TestName"))
         assert tup._fields == ("Index", "a", "b")
         assert (tup.Index, tup.a, tup.b) == tup
         assert type(tup).__name__ == "TestName"
 
-        df.columns = ["def", "return"]
+    def test_itertuples_disallowed_col_labels(self):
+        df = DataFrame(data={"def": [1, 2, 3], "return": [4, 5, 6]})
         tup2 = next(df.itertuples(name="TestName"))
         assert tup2 == (0, 1, 4)
         assert tup2._fields == ("Index", "_1", "_2")
 
-        df3 = DataFrame({"f" + str(i): [i] for i in range(1024)})
-        # will raise SyntaxError if trying to create namedtuple
-        tup3 = next(df3.itertuples())
-        assert isinstance(tup3, tuple)
-        assert hasattr(tup3, "_fields")
-
+    @pytest.mark.parametrize("limit", [254, 255, 1024])
+    @pytest.mark.parametrize("index", [True, False])
+    def test_itertuples_py2_3_field_limit_namedtuple(self, limit, index):
         # GH#28282
-        df_254_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(254)}])
-        result_254_columns = next(df_254_columns.itertuples(index=False))
-        assert isinstance(result_254_columns, tuple)
-        assert hasattr(result_254_columns, "_fields")
-
-        df_255_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(255)}])
-        result_255_columns = next(df_255_columns.itertuples(index=False))
-        assert isinstance(result_255_columns, tuple)
-        assert hasattr(result_255_columns, "_fields")
+        df = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(limit)}])
+        result = next(df.itertuples(index=index))
+        assert isinstance(result, tuple)
+        assert hasattr(result, "_fields")
 
     def test_sequence_like_with_categorical(self):
         # GH#7839
diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py
index da11920ac1cba..73de33607ca0b 100644
--- a/pandas/tests/indexes/datetimes/test_indexing.py
+++ b/pandas/tests/indexes/datetimes/test_indexing.py
@@ -8,6 +8,7 @@
 import numpy as np
 import pytest
 
+from pandas._libs import index as libindex
 from pandas.compat.numpy import np_long
 
 import pandas as pd
@@ -425,17 +426,17 @@ def test_get_loc_time_obj(self):
         expected = np.array([])
         tm.assert_numpy_array_equal(result, expected, check_dtype=False)
 
-    def test_get_loc_time_obj2(self):
+    @pytest.mark.parametrize("offset", [-10, 10])
+    def test_get_loc_time_obj2(self, monkeypatch, offset):
         # GH#8667
-
-        from pandas._libs.index import _SIZE_CUTOFF
-
-        ns = _SIZE_CUTOFF + np.array([-100, 100], dtype=np.int64)
+        size_cutoff = 50
+        n = size_cutoff + offset
         key = time(15, 11, 30)
         start = key.hour * 3600 + key.minute * 60 + key.second
         step = 24 * 3600
 
-        for n in ns:
+        with monkeypatch.context():
+            monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
             idx = date_range("2014-11-26", periods=n, freq="s")
             ts = pd.Series(np.random.default_rng(2).standard_normal(n), index=idx)
             locs = np.arange(start, n, step, dtype=np.intp)
diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py
index 3d2ed1d168040..36cc8316ea5ff 100644
--- a/pandas/tests/indexing/multiindex/test_multiindex.py
+++ b/pandas/tests/indexing/multiindex/test_multiindex.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest
 
-import pandas._libs.index as _index
+import pandas._libs.index as libindex
 from pandas.errors import PerformanceWarning
 
 import pandas as pd
@@ -33,20 +33,19 @@ def test_multiindex_perf_warn(self):
         with tm.assert_produces_warning(PerformanceWarning):
             df.loc[(0,)]
 
-    def test_indexing_over_hashtable_size_cutoff(self):
-        n = 10000
+    @pytest.mark.parametrize("offset", [-5, 5])
+    def test_indexing_over_hashtable_size_cutoff(self, monkeypatch, offset):
+        size_cutoff = 20
+        n = size_cutoff + offset
 
-        old_cutoff = _index._SIZE_CUTOFF
-        _index._SIZE_CUTOFF = 20000
+        with monkeypatch.context():
+            monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
+            s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n))))
 
-        s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n))))
-
-        # hai it works!
-        assert s[("a", 5)] == 5
-        assert s[("a", 6)] == 6
-        assert s[("a", 7)] == 7
-
-        _index._SIZE_CUTOFF = old_cutoff
+            # hai it works!
+            assert s[("a", 5)] == 5
+            assert s[("a", 6)] == 6
+            assert s[("a", 7)] == 7
 
     def test_multi_nan_indexing(self):
         # GH 3588
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index f7f94af92743e..c8ff42509505a 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -1833,15 +1833,14 @@ def test_encoding_latin1_118(self, datapath):
     @pytest.mark.slow
     def test_stata_119(self, datapath):
         # Gzipped since contains 32,999 variables and uncompressed is 20MiB
+        # Just validate that the reader reports correct number of variables
+        # to avoid high peak memory
         with gzip.open(
             datapath("io", "data", "stata", "stata1_119.dta.gz"), "rb"
         ) as gz:
-            df = read_stata(gz)
-            assert df.shape == (1, 32999)
-            assert df.iloc[0, 6] == "A" * 3000
-            assert df.iloc[0, 7] == 3.14
-            assert df.iloc[0, -1] == 1
-            assert df.iloc[0, 0] == pd.Timestamp(datetime(2012, 12, 21, 21, 12, 21))
+            with StataReader(gz) as reader:
+                reader._ensure_open()
+                assert reader._nvar == 32999
 
     @pytest.mark.parametrize("version", [118, 119, None])
     def test_utf8_writer(self, version):
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index 12618f42c7f07..94e50bc980e0a 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -376,214 +376,245 @@ def test_agg_consistency_int_str_column_mix():
 # `Base` test class
 
 
-def test_agg():
-    # test with all three Resampler apis and TimeGrouper
-
+@pytest.fixture
+def index():
     index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
     index.name = "date"
-    df = DataFrame(
+    return index
+
+
+@pytest.fixture
+def df(index):
+    frame = DataFrame(
         np.random.default_rng(2).random((10, 2)), columns=list("AB"), index=index
     )
-    df_col = df.reset_index()
+    return frame
+
+
+@pytest.fixture
+def df_col(df):
+    return df.reset_index()
+
+
+@pytest.fixture
+def df_mult(df_col, index):
     df_mult = df_col.copy()
     df_mult.index = pd.MultiIndex.from_arrays(
-        [range(10), df.index], names=["index", "date"]
+        [range(10), index], names=["index", "date"]
     )
-    r = df.resample("2D")
-    cases = [
-        r,
-        df_col.resample("2D", on="date"),
-        df_mult.resample("2D", level="date"),
-        df.groupby(pd.Grouper(freq="2D")),
-    ]
+    return df_mult
+
+
+@pytest.fixture
+def a_mean(df):
+    return df.resample("2D")["A"].mean()
+
+
+@pytest.fixture
+def a_std(df):
+    return df.resample("2D")["A"].std()
+
+
+@pytest.fixture
+def a_sum(df):
+    return df.resample("2D")["A"].sum()
+
+
+@pytest.fixture
+def b_mean(df):
+    return df.resample("2D")["B"].mean()
+
+
+@pytest.fixture
+def b_std(df):
+    return df.resample("2D")["B"].std()
+
+
+@pytest.fixture
+def b_sum(df):
+    return df.resample("2D")["B"].sum()
+
+
+@pytest.fixture
+def df_resample(df):
+    return df.resample("2D")
 
-    a_mean = r["A"].mean()
-    a_std = r["A"].std()
-    a_sum = r["A"].sum()
-    b_mean = r["B"].mean()
-    b_std = r["B"].std()
-    b_sum = r["B"].sum()
 
+@pytest.fixture
+def df_col_resample(df_col):
+    return df_col.resample("2D", on="date")
+
+
+@pytest.fixture
+def df_mult_resample(df_mult):
+    return df_mult.resample("2D", level="date")
+
+
+@pytest.fixture
+def df_grouper_resample(df):
+    return df.groupby(pd.Grouper(freq="2D"))
+
+
+@pytest.fixture(
+    params=["df_resample", "df_col_resample", "df_mult_resample", "df_grouper_resample"]
+)
+def cases(request):
+    return request.getfixturevalue(request.param)
+
+
+def test_agg_mixed_column_aggregation(cases, a_mean, a_std, b_mean, b_std, request):
     expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
     expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]])
     msg = "using SeriesGroupBy.[mean|std]"
-    for t in cases:
-        # In case 2, "date" is an index and a column, so get included in the agg
-        if t == cases[2]:
-            date_mean = t["date"].mean()
-            date_std = t["date"].std()
-            exp = pd.concat([date_mean, date_std, expected], axis=1)
-            exp.columns = pd.MultiIndex.from_product(
-                [["date", "A", "B"], ["mean", "std"]]
-            )
-            with tm.assert_produces_warning(FutureWarning, match=msg):
-                result = t.aggregate([np.mean, np.std])
-            tm.assert_frame_equal(result, exp)
-        else:
-            with tm.assert_produces_warning(FutureWarning, match=msg):
-                result = t.aggregate([np.mean, np.std])
-            tm.assert_frame_equal(result, expected)
+    # "date" is an index and a column, so get included in the agg
+    if "df_mult" in request.node.callspec.id:
+        date_mean = cases["date"].mean()
+        date_std = cases["date"].std()
+        expected = pd.concat([date_mean, date_std, expected], axis=1)
+        expected.columns = pd.MultiIndex.from_product(
+            [["date", "A", "B"], ["mean", "std"]]
+        )
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = cases.aggregate([np.mean, np.std])
+    tm.assert_frame_equal(result, expected)
 
-    expected = pd.concat([a_mean, b_std], axis=1)
-    for t in cases:
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result = t.aggregate({"A": np.mean, "B": np.std})
-        tm.assert_frame_equal(result, expected, check_like=True)
-
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result = t.aggregate(A=("A", np.mean), B=("B", np.std))
-        tm.assert_frame_equal(result, expected, check_like=True)
 
+@pytest.mark.parametrize(
+    "agg",
+    [
+        {"func": {"A": np.mean, "B": np.std}},
+        {"A": ("A", np.mean), "B": ("B", np.std)},
+        {"A": NamedAgg("A", np.mean), "B": NamedAgg("B", np.std)},
+    ],
+)
+def test_agg_both_mean_std_named_result(cases, a_mean, b_std, agg):
+    msg = "using SeriesGroupBy.[mean|std]"
+    expected = pd.concat([a_mean, b_std], axis=1)
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = cases.aggregate(**agg)
+    tm.assert_frame_equal(result, expected, check_like=True)
 
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result = t.aggregate(A=NamedAgg("A", np.mean), B=NamedAgg("B", np.std))
-        tm.assert_frame_equal(result, expected, check_like=True)
 
+def test_agg_both_mean_std_dict_of_list(cases, a_mean, a_std):
     expected = pd.concat([a_mean, a_std], axis=1)
     expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std")])
-    for t in cases:
-        result = t.aggregate({"A": ["mean", "std"]})
-        tm.assert_frame_equal(result, expected)
+    result = cases.aggregate({"A": ["mean", "std"]})
+    tm.assert_frame_equal(result, expected)
 
+
+@pytest.mark.parametrize(
+    "agg", [{"func": ["mean", "sum"]}, {"mean": "mean", "sum": "sum"}]
+)
+def test_agg_both_mean_sum(cases, a_mean, a_sum, agg):
     expected = pd.concat([a_mean, a_sum], axis=1)
     expected.columns = ["mean", "sum"]
-    for t in cases:
-        result = t["A"].aggregate(["mean", "sum"])
-        tm.assert_frame_equal(result, expected)
+    result = cases["A"].aggregate(**agg)
+    tm.assert_frame_equal(result, expected)
 
-        result = t["A"].aggregate(mean="mean", sum="sum")
-        tm.assert_frame_equal(result, expected)
 
+@pytest.mark.parametrize(
+    "agg",
+    [
+        {"A": {"mean": "mean", "sum": "sum"}},
+        {
+            "A": {"mean": "mean", "sum": "sum"},
+            "B": {"mean2": "mean", "sum2": "sum"},
+        },
+    ],
+)
+def test_agg_dict_of_dict_specificationerror(cases, agg):
     msg = "nested renamer is not supported"
-    for t in cases:
-        with pytest.raises(pd.errors.SpecificationError, match=msg):
-            t.aggregate({"A": {"mean": "mean", "sum": "sum"}})
+    with pytest.raises(pd.errors.SpecificationError, match=msg):
+        cases.aggregate(agg)
 
-    expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
-    expected.columns = pd.MultiIndex.from_tuples(
-        [("A", "mean"), ("A", "sum"), ("B", "mean2"), ("B", "sum2")]
-    )
-    for t in cases:
-        with pytest.raises(pd.errors.SpecificationError, match=msg):
-            t.aggregate(
-                {
-                    "A": {"mean": "mean", "sum": "sum"},
-                    "B": {"mean2": "mean", "sum2": "sum"},
-                }
-            )
 
+def test_agg_dict_of_lists(cases, a_mean, a_std, b_mean, b_std):
     expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
     expected.columns = pd.MultiIndex.from_tuples(
         [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")]
     )
-    for t in cases:
-        result = t.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]})
-        tm.assert_frame_equal(result, expected, check_like=True)
-
-    expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
-    expected.columns = pd.MultiIndex.from_tuples(
-        [
-            ("r1", "A", "mean"),
-            ("r1", "A", "sum"),
-            ("r2", "B", "mean"),
-            ("r2", "B", "sum"),
-        ]
-    )
+    result = cases.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]})
+    tm.assert_frame_equal(result, expected, check_like=True)
 
 
-def test_agg_misc():
-    # test with all three Resampler apis and TimeGrouper
-
-    index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
-    index.name = "date"
-    df = DataFrame(
-        np.random.default_rng(2).random((10, 2)), columns=list("AB"), index=index
-    )
-    df_col = df.reset_index()
-    df_mult = df_col.copy()
-    df_mult.index = pd.MultiIndex.from_arrays(
-        [range(10), df.index], names=["index", "date"]
-    )
-
-    r = df.resample("2D")
-    cases = [
-        r,
-        df_col.resample("2D", on="date"),
-        df_mult.resample("2D", level="date"),
-        df.groupby(pd.Grouper(freq="2D")),
-    ]
-
+@pytest.mark.parametrize(
+    "agg",
+    [
+        {"func": {"A": np.sum, "B": lambda x: np.std(x, ddof=1)}},
+        {"A": ("A", np.sum), "B": ("B", lambda x: np.std(x, ddof=1))},
+        {"A": NamedAgg("A", np.sum), "B": NamedAgg("B", lambda x: np.std(x, ddof=1))},
+    ],
+)
+def test_agg_with_lambda(cases, agg):
     # passed lambda
     msg = "using SeriesGroupBy.sum"
-    for t in cases:
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result = t.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)})
-        rcustom = t["B"].apply(lambda x: np.std(x, ddof=1))
-        expected = pd.concat([r["A"].sum(), rcustom], axis=1)
-        tm.assert_frame_equal(result, expected, check_like=True)
-
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result = t.agg(A=("A", np.sum), B=("B", lambda x: np.std(x, ddof=1)))
-        tm.assert_frame_equal(result, expected, check_like=True)
-
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result = t.agg(
-                A=NamedAgg("A", np.sum), B=NamedAgg("B", lambda x: np.std(x, ddof=1))
-            )
-        tm.assert_frame_equal(result, expected, check_like=True)
+    rcustom = cases["B"].apply(lambda x: np.std(x, ddof=1))
+    expected = pd.concat([cases["A"].sum(), rcustom], axis=1)
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = cases.agg(**agg)
+    tm.assert_frame_equal(result, expected, check_like=True)
 
-    # agg with renamers
-    expected = pd.concat(
-        [t["A"].sum(), t["B"].sum(), t["A"].mean(), t["B"].mean()], axis=1
-    )
-    expected.columns = pd.MultiIndex.from_tuples(
-        [("result1", "A"), ("result1", "B"), ("result2", "A"), ("result2", "B")]
-    )
 
+@pytest.mark.parametrize(
+    "agg",
+    [
+        {"func": {"result1": np.sum, "result2": np.mean}},
+        {"A": ("result1", np.sum), "B": ("result2", np.mean)},
+        {"A": NamedAgg("result1", np.sum), "B": NamedAgg("result2", np.mean)},
+    ],
+)
+def test_agg_no_column(cases, agg):
     msg = r"Column\(s\) \['result1', 'result2'\] do not exist"
-    for t in cases:
-        with pytest.raises(KeyError, match=msg):
-            t[["A", "B"]].agg({"result1": np.sum, "result2": np.mean})
-
-        with pytest.raises(KeyError, match=msg):
-            t[["A", "B"]].agg(A=("result1", np.sum), B=("result2", np.mean))
+    with pytest.raises(KeyError, match=msg):
+        cases[["A", "B"]].agg(**agg)
 
-        with pytest.raises(KeyError, match=msg):
-            t[["A", "B"]].agg(
-                A=NamedAgg("result1", np.sum), B=NamedAgg("result2", np.mean)
-            )
 
+@pytest.mark.parametrize(
+    "cols, agg",
+    [
+        [None, {"A": ["sum", "std"], "B": ["mean", "std"]}],
+        [
+            [
+                "A",
+                "B",
+            ],
+            {"A": ["sum", "std"], "B": ["mean", "std"]},
+        ],
+    ],
+)
+def test_agg_specificationerror_nested(cases, cols, agg, a_sum, a_std, b_mean, b_std):
     # agg with different hows
-    expected = pd.concat(
-        [t["A"].sum(), t["A"].std(), t["B"].mean(), t["B"].std()], axis=1
-    )
+    # equivalent of using a selection list / or not
+    expected = pd.concat([a_sum, a_std, b_mean, b_std], axis=1)
     expected.columns = pd.MultiIndex.from_tuples(
         [("A", "sum"), ("A", "std"), ("B", "mean"), ("B", "std")]
    )
-    for t in cases:
-        result = t.agg({"A": ["sum", "std"], "B": ["mean", "std"]})
-        tm.assert_frame_equal(result, expected, check_like=True)
+    if cols is not None:
+        obj = cases[cols]
+    else:
+        obj = cases
+
+    result = obj.agg(agg)
+    tm.assert_frame_equal(result, expected, check_like=True)
 
-    # equivalent of using a selection list / or not
-    for t in cases:
-        result = t[["A", "B"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]})
-        tm.assert_frame_equal(result, expected, check_like=True)
 
+@pytest.mark.parametrize(
+    "agg", [{"A": ["sum", "std"]}, {"A": ["sum", "std"], "B": ["mean", "std"]}]
+)
+def test_agg_specificationerror_series(cases, agg):
     msg = "nested renamer is not supported"
     # series like aggs
-    for t in cases:
-        with pytest.raises(pd.errors.SpecificationError, match=msg):
-            t["A"].agg({"A": ["sum", "std"]})
+    with pytest.raises(pd.errors.SpecificationError, match=msg):
+        cases["A"].agg(agg)
 
-        with pytest.raises(pd.errors.SpecificationError, match=msg):
-            t["A"].agg({"A": ["sum", "std"], "B": ["mean", "std"]})
 
+def test_agg_specificationerror_invalid_names(cases):
     # errors
     # invalid names in the agg specification
     msg = r"Column\(s\) \['B'\] do not exist"
-    for t in cases:
-        with pytest.raises(KeyError, match=msg):
-            t[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]})
+    with pytest.raises(KeyError, match=msg):
+        cases[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]})
 
 
 @pytest.mark.parametrize(