From a22dfc190329c30b0fbb623f076aeeb74e86e27b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 7 Oct 2022 14:35:46 -0700 Subject: [PATCH 1/3] CLN/TST: Use fixture instead of setup_method --- pandas/tests/window/test_groupby.py | 37 +++++++++++++++-------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 38ac6bb2e1c09..7282328805c8b 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -1006,14 +1006,15 @@ def test_datelike_on_not_monotonic_within_each_group(self): class TestExpanding: - def setup_method(self): - self.frame = DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) + @pytest.fixture() + def frame(self): + return DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) @pytest.mark.parametrize( "f", ["sum", "mean", "min", "max", "count", "kurt", "skew"] ) - def test_expanding(self, f): - g = self.frame.groupby("A", group_keys=False) + def test_expanding(self, f, frame): + g = frame.groupby("A", group_keys=False) r = g.expanding() result = getattr(r, f)() @@ -1021,13 +1022,13 @@ def test_expanding(self, f): # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 - expected_index = MultiIndex.from_arrays([self.frame["A"], range(40)]) + expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("f", ["std", "var"]) - def test_expanding_ddof(self, f): - g = self.frame.groupby("A", group_keys=False) + def test_expanding_ddof(self, f, frame): + g = frame.groupby("A", group_keys=False) r = g.expanding() result = getattr(r, f)(ddof=0) @@ -1035,15 +1036,15 @@ def test_expanding_ddof(self, f): # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 - expected_index = MultiIndex.from_arrays([self.frame["A"], range(40)]) + expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] ) - def test_expanding_quantile(self, interpolation): - g = self.frame.groupby("A", group_keys=False) + def test_expanding_quantile(self, interpolation, frame): + g = frame.groupby("A", group_keys=False) r = g.expanding() result = r.quantile(0.4, interpolation=interpolation) @@ -1053,19 +1054,19 @@ def test_expanding_quantile(self, interpolation): # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 - expected_index = MultiIndex.from_arrays([self.frame["A"], range(40)]) + expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("f", ["corr", "cov"]) - def test_expanding_corr_cov(self, f): - g = self.frame.groupby("A") + def test_expanding_corr_cov(self, f, frame): + g = frame.groupby("A") r = g.expanding() - result = getattr(r, f)(self.frame) + result = getattr(r, f)(frame) def func_0(x): - return getattr(x.expanding(), f)(self.frame) + return getattr(x.expanding(), f)(frame) expected = g.apply(func_0) # GH 39591: groupby.apply returns 1 instead of nan for windows @@ -1085,8 +1086,8 @@ def func_1(x): expected = g.apply(func_1) tm.assert_series_equal(result, expected) - def 
test_expanding_apply(self, raw): - g = self.frame.groupby("A", group_keys=False) + def test_expanding_apply(self, raw, frame): + g = frame.groupby("A", group_keys=False) r = g.expanding() # reduction @@ -1095,7 +1096,7 @@ def test_expanding_apply(self, raw): # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 - expected_index = MultiIndex.from_arrays([self.frame["A"], range(40)]) + expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index tm.assert_frame_equal(result, expected) From eef1f9fc9de39edb2134c5e68fa6d7bf55cd395d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 7 Oct 2022 14:36:55 -0700 Subject: [PATCH 2/3] Remove () --- pandas/tests/window/test_groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 7282328805c8b..c01bb519875b9 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -1006,7 +1006,7 @@ def test_datelike_on_not_monotonic_within_each_group(self): class TestExpanding: - @pytest.fixture() + @pytest.fixture def frame(self): return DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) From 2b8aaa5b1161066f236a822276e46d5cd817025e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 7 Oct 2022 15:47:20 -0700 Subject: [PATCH 3/3] Refactor more setup_methods --- pandas/tests/frame/test_query_eval.py | 43 +++--- .../json/test_json_table_schema_ext_dtype.py | 53 ++++--- pandas/tests/plotting/test_converter.py | 50 +++--- pandas/tests/reshape/merge/test_join.py | 128 +++++++++------- pandas/tests/reshape/merge/test_merge.py | 90 +++++------ pandas/tests/reshape/test_pivot.py | 143 +++++++++--------- 6 files changed, 275 insertions(+), 232 deletions(-) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 35335c54cd41e..4da57fc177712 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -36,45 +36,48 @@ def skip_if_no_pandas_parser(parser): class TestCompat: - def setup_method(self): - self.df = DataFrame({"A": [1, 2, 3]}) - self.expected1 = self.df[self.df.A > 0] - self.expected2 = self.df.A + 1 + @pytest.fixture + def df(self): + return DataFrame({"A": [1, 2, 3]}) + + @pytest.fixture + def expected1(self, df): + return df[df.A > 0] + + @pytest.fixture + def expected2(self, df): + return df.A + 1 - def test_query_default(self): + def test_query_default(self, df, expected1, expected2): # GH 12749 # this should always work, whether NUMEXPR_INSTALLED or not - df = self.df result = df.query("A>0") - tm.assert_frame_equal(result, self.expected1) + tm.assert_frame_equal(result, expected1) result = df.eval("A+1") - tm.assert_series_equal(result, self.expected2, check_names=False) + tm.assert_series_equal(result, expected2, check_names=False) - def test_query_None(self): + def test_query_None(self, df, expected1, expected2): - df = self.df result = df.query("A>0", engine=None) - tm.assert_frame_equal(result, self.expected1) + tm.assert_frame_equal(result, expected1) result = df.eval("A+1", engine=None) - tm.assert_series_equal(result, self.expected2, check_names=False) + tm.assert_series_equal(result, expected2, check_names=False) - def test_query_python(self): + def test_query_python(self, df, expected1, expected2): - df = self.df result = df.query("A>0", 
engine="python") - tm.assert_frame_equal(result, self.expected1) + tm.assert_frame_equal(result, expected1) result = df.eval("A+1", engine="python") - tm.assert_series_equal(result, self.expected2, check_names=False) + tm.assert_series_equal(result, expected2, check_names=False) - def test_query_numexpr(self): + def test_query_numexpr(self, df, expected1, expected2): - df = self.df if NUMEXPR_INSTALLED: result = df.query("A>0", engine="numexpr") - tm.assert_frame_equal(result, self.expected1) + tm.assert_frame_equal(result, expected1) result = df.eval("A+1", engine="numexpr") - tm.assert_series_equal(result, self.expected2, check_names=False) + tm.assert_series_equal(result, expected2, check_names=False) else: msg = ( r"'numexpr' is not installed or an unsupported version. " diff --git a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py index fbf4006066f6b..ae926173e129b 100644 --- a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py +++ b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py @@ -111,23 +111,36 @@ def test_as_json_table_type_ext_integer_dtype(self): class TestTableOrient: - def setup_method(self): - self.da = DateArray([dt.date(2021, 10, 10)]) - self.dc = DecimalArray([decimal.Decimal(10)]) - self.sa = array(["pandas"], dtype="string") - self.ia = array([10], dtype="Int64") - self.df = DataFrame( + @pytest.fixture + def da(self): + return DateArray([dt.date(2021, 10, 10)]) + + @pytest.fixture + def dc(self): + return DecimalArray([decimal.Decimal(10)]) + + @pytest.fixture + def sa(self): + return array(["pandas"], dtype="string") + + @pytest.fixture + def ia(self): + return array([10], dtype="Int64") + + @pytest.fixture + def df(self, da, dc, sa, ia): + return DataFrame( { - "A": self.da, - "B": self.dc, - "C": self.sa, - "D": self.ia, + "A": da, + "B": dc, + "C": sa, + "D": ia, } ) - def test_build_date_series(self): + def test_build_date_series(self, da): - s = Series(self.da, name="a") + s = Series(da, name="a") s.index.name = "id" result = s.to_json(orient="table", date_format="iso") result = json.loads(result, object_pairs_hook=OrderedDict) @@ -151,9 +164,9 @@ def test_build_date_series(self): assert result == expected - def test_build_decimal_series(self): + def test_build_decimal_series(self, dc): - s = Series(self.dc, name="a") + s = Series(dc, name="a") s.index.name = "id" result = s.to_json(orient="table", date_format="iso") result = json.loads(result, object_pairs_hook=OrderedDict) @@ -177,8 +190,8 @@ def test_build_decimal_series(self): assert result == expected - def test_build_string_series(self): - s = Series(self.sa, name="a") + def test_build_string_series(self, sa): + s = Series(sa, name="a") s.index.name = "id" result = s.to_json(orient="table", date_format="iso") result = json.loads(result, object_pairs_hook=OrderedDict) @@ -202,8 +215,8 @@ def test_build_string_series(self): assert result == expected - def test_build_int64_series(self): - s = Series(self.ia, name="a") + def test_build_int64_series(self, ia): + s = Series(ia, name="a") s.index.name = "id" result = s.to_json(orient="table", date_format="iso") result = json.loads(result, object_pairs_hook=OrderedDict) @@ -227,9 +240,9 @@ def test_build_int64_series(self): assert result == expected - def test_to_json(self): + def test_to_json(self, df): - df = self.df.copy() + df = df.copy() df.index.name = "idx" result = df.to_json(orient="table", date_format="iso") result = json.loads(result, object_pairs_hook=OrderedDict) diff 
--git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index 3ec8f4bd71c2b..9a6fed1afad1f 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -287,66 +287,70 @@ def test_convert_nested(self, dtc): class TestPeriodConverter: - def setup_method(self): - self.pc = converter.PeriodConverter() + @pytest.fixture + def pc(self): + return converter.PeriodConverter() + @pytest.fixture + def axis(self): class Axis: pass - self.axis = Axis() - self.axis.freq = "D" + axis = Axis() + axis.freq = "D" + return axis - def test_convert_accepts_unicode(self): - r1 = self.pc.convert("2012-1-1", None, self.axis) - r2 = self.pc.convert("2012-1-1", None, self.axis) + def test_convert_accepts_unicode(self, pc, axis): + r1 = pc.convert("2012-1-1", None, axis) + r2 = pc.convert("2012-1-1", None, axis) assert r1 == r2 - def test_conversion(self): - rs = self.pc.convert(["2012-1-1"], None, self.axis)[0] + def test_conversion(self, pc, axis): + rs = pc.convert(["2012-1-1"], None, axis)[0] xp = Period("2012-1-1").ordinal assert rs == xp - rs = self.pc.convert("2012-1-1", None, self.axis) + rs = pc.convert("2012-1-1", None, axis) assert rs == xp - rs = self.pc.convert([date(2012, 1, 1)], None, self.axis)[0] + rs = pc.convert([date(2012, 1, 1)], None, axis)[0] assert rs == xp - rs = self.pc.convert(date(2012, 1, 1), None, self.axis) + rs = pc.convert(date(2012, 1, 1), None, axis) assert rs == xp - rs = self.pc.convert([Timestamp("2012-1-1")], None, self.axis)[0] + rs = pc.convert([Timestamp("2012-1-1")], None, axis)[0] assert rs == xp - rs = self.pc.convert(Timestamp("2012-1-1"), None, self.axis) + rs = pc.convert(Timestamp("2012-1-1"), None, axis) assert rs == xp - rs = self.pc.convert("2012-01-01", None, self.axis) + rs = pc.convert("2012-01-01", None, axis) assert rs == xp - rs = self.pc.convert("2012-01-01 00:00:00+0000", None, self.axis) + rs = pc.convert("2012-01-01 00:00:00+0000", None, axis) assert rs == xp - rs = self.pc.convert( + rs = pc.convert( np.array( ["2012-01-01 00:00:00", "2012-01-02 00:00:00"], dtype="datetime64[ns]", ), None, - self.axis, + axis, ) assert rs[0] == xp - def test_integer_passthrough(self): + def test_integer_passthrough(self, pc, axis): # GH9012 - rs = self.pc.convert([0, 1], None, self.axis) + rs = pc.convert([0, 1], None, axis) xp = [0, 1] assert rs == xp - def test_convert_nested(self): + def test_convert_nested(self, pc, axis): data = ["2012-1-1", "2012-1-2"] - r1 = self.pc.convert([data, data], None, self.axis) - r2 = [self.pc.convert(data, None, self.axis) for _ in range(2)] + r1 = pc.convert([data, data], None, axis) + r2 = [pc.convert(data, None, axis) for _ in range(2)] assert r1 == r2 diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 6e87c221426c1..23d7c91ceefae 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -13,82 +13,96 @@ merge, ) import pandas._testing as tm -from pandas.tests.reshape.merge.test_merge import ( - NGROUPS, - N, - get_test_data, -) + + +def get_test_data(ngroups=8, n=50): + unique_groups = list(range(ngroups)) + arr = np.asarray(np.tile(unique_groups, n // ngroups)) + + if len(arr) < n: + arr = np.asarray(list(arr) + unique_groups[: n - len(arr)]) + + np.random.shuffle(arr) + return arr class TestJoin: - def setup_method(self): - # aggregate multiple columns - self.df = DataFrame( + # aggregate multiple columns + @pytest.fixture + def df(self): + df = DataFrame( { 
"key1": get_test_data(), "key2": get_test_data(), - "data1": np.random.randn(N), - "data2": np.random.randn(N), + "data1": np.random.randn(50), + "data2": np.random.randn(50), } ) # exclude a couple keys for fun - self.df = self.df[self.df["key2"] > 1] + df = df[df["key2"] > 1] + return df - self.df2 = DataFrame( + @pytest.fixture + def df2(self): + return DataFrame( { - "key1": get_test_data(n=N // 5), - "key2": get_test_data(ngroups=NGROUPS // 2, n=N // 5), - "value": np.random.randn(N // 5), + "key1": get_test_data(n=10), + "key2": get_test_data(ngroups=4, n=10), + "value": np.random.randn(10), } ) + @pytest.fixture + def target_source(self): index, data = tm.getMixedTypeDict() - self.target = DataFrame(data, index=index) + target = DataFrame(data, index=index) # Join on string value - self.source = DataFrame( + + source = DataFrame( {"MergedA": data["A"], "MergedD": data["D"]}, index=data["C"] ) + return target, source - def test_left_outer_join(self): - joined_key2 = merge(self.df, self.df2, on="key2") - _check_join(self.df, self.df2, joined_key2, ["key2"], how="left") + def test_left_outer_join(self, df, df2): + joined_key2 = merge(df, df2, on="key2") + _check_join(df, df2, joined_key2, ["key2"], how="left") - joined_both = merge(self.df, self.df2) - _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="left") + joined_both = merge(df, df2) + _check_join(df, df2, joined_both, ["key1", "key2"], how="left") - def test_right_outer_join(self): - joined_key2 = merge(self.df, self.df2, on="key2", how="right") - _check_join(self.df, self.df2, joined_key2, ["key2"], how="right") + def test_right_outer_join(self, df, df2): + joined_key2 = merge(df, df2, on="key2", how="right") + _check_join(df, df2, joined_key2, ["key2"], how="right") - joined_both = merge(self.df, self.df2, how="right") - _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="right") + joined_both = merge(df, df2, how="right") + _check_join(df, df2, joined_both, ["key1", "key2"], how="right") - def test_full_outer_join(self): - joined_key2 = merge(self.df, self.df2, on="key2", how="outer") - _check_join(self.df, self.df2, joined_key2, ["key2"], how="outer") + def test_full_outer_join(self, df, df2): + joined_key2 = merge(df, df2, on="key2", how="outer") + _check_join(df, df2, joined_key2, ["key2"], how="outer") - joined_both = merge(self.df, self.df2, how="outer") - _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="outer") + joined_both = merge(df, df2, how="outer") + _check_join(df, df2, joined_both, ["key1", "key2"], how="outer") - def test_inner_join(self): - joined_key2 = merge(self.df, self.df2, on="key2", how="inner") - _check_join(self.df, self.df2, joined_key2, ["key2"], how="inner") + def test_inner_join(self, df, df2): + joined_key2 = merge(df, df2, on="key2", how="inner") + _check_join(df, df2, joined_key2, ["key2"], how="inner") - joined_both = merge(self.df, self.df2, how="inner") - _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="inner") + joined_both = merge(df, df2, how="inner") + _check_join(df, df2, joined_both, ["key1", "key2"], how="inner") - def test_handle_overlap(self): - joined = merge(self.df, self.df2, on="key2", suffixes=(".foo", ".bar")) + def test_handle_overlap(self, df, df2): + joined = merge(df, df2, on="key2", suffixes=(".foo", ".bar")) assert "key1.foo" in joined assert "key1.bar" in joined - def test_handle_overlap_arbitrary_key(self): + def test_handle_overlap_arbitrary_key(self, df, df2): joined = merge( - self.df, - self.df2, + 
df, + df2, left_on="key2", right_on="key1", suffixes=(".foo", ".bar"), @@ -96,9 +110,8 @@ def test_handle_overlap_arbitrary_key(self): assert "key1.foo" in joined assert "key2.bar" in joined - def test_join_on(self): - target = self.target - source = self.source + def test_join_on(self, target_source): + target, source = target_source merged = target.join(source, on="C") tm.assert_series_equal(merged["MergedA"], target["A"], check_names=False) @@ -189,22 +202,24 @@ def test_join_on_fails_with_wrong_object_type(self, wrong_type): with pytest.raises(TypeError, match=msg): merge(df, wrong_type, left_on="a", right_on="a") - def test_join_on_pass_vector(self): - expected = self.target.join(self.source, on="C") + def test_join_on_pass_vector(self, target_source): + target, source = target_source + expected = target.join(source, on="C") del expected["C"] - join_col = self.target.pop("C") - result = self.target.join(self.source, on=join_col) + join_col = target.pop("C") + result = target.join(source, on=join_col) tm.assert_frame_equal(result, expected) - def test_join_with_len0(self): + def test_join_with_len0(self, target_source): # nothing to merge - merged = self.target.join(self.source.reindex([]), on="C") - for col in self.source: + target, source = target_source + merged = target.join(source.reindex([]), on="C") + for col in source: assert col in merged assert merged[col].isna().all() - merged2 = self.target.join(self.source.reindex([]), on="C", how="inner") + merged2 = target.join(source.reindex([]), on="C", how="inner") tm.assert_index_equal(merged2.columns, merged.columns) assert len(merged2) == 0 @@ -230,9 +245,10 @@ def test_join_on_singlekey_list(self): tm.assert_frame_equal(joined, expected) - def test_join_on_series(self): - result = self.target.join(self.source["MergedA"], on="C") - expected = self.target.join(self.source[["MergedA"]], on="C") + def test_join_on_series(self, target_source): + target, source = target_source + result = target.join(source["MergedA"], on="C") + expected = target.join(source[["MergedA"]], on="C") tm.assert_frame_equal(result, expected) def test_join_on_series_buglet(self): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 0c074815450f9..a7125e69f2a27 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -3,7 +3,6 @@ datetime, timedelta, ) -import random import re import numpy as np @@ -41,18 +40,15 @@ merge, ) -N = 50 -NGROUPS = 8 - -def get_test_data(ngroups=NGROUPS, n=N): +def get_test_data(ngroups=8, n=50): unique_groups = list(range(ngroups)) arr = np.asarray(np.tile(unique_groups, n // ngroups)) if len(arr) < n: arr = np.asarray(list(arr) + unique_groups[: n - len(arr)]) - random.shuffle(arr) + np.random.shuffle(arr) return arr @@ -118,32 +114,40 @@ def dfs_for_indicator(): class TestMerge: - def setup_method(self): - # aggregate multiple columns - self.df = DataFrame( + @pytest.fixture + def df(self): + df = DataFrame( { "key1": get_test_data(), "key2": get_test_data(), - "data1": np.random.randn(N), - "data2": np.random.randn(N), + "data1": np.random.randn(50), + "data2": np.random.randn(50), } ) # exclude a couple keys for fun - self.df = self.df[self.df["key2"] > 1] + df = df[df["key2"] > 1] + return df - self.df2 = DataFrame( + @pytest.fixture + def df2(self): + return DataFrame( { - "key1": get_test_data(n=N // 5), - "key2": get_test_data(ngroups=NGROUPS // 2, n=N // 5), - "value": np.random.randn(N // 5), + "key1": get_test_data(n=10), + 
"key2": get_test_data(ngroups=4, n=10), + "value": np.random.randn(10), } ) - self.left = DataFrame( + @pytest.fixture + def left(self): + return DataFrame( {"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)} ) - self.right = DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"]) + + @pytest.fixture + def right(self): + return DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"]) def test_merge_inner_join_empty(self): # GH 15328 @@ -153,9 +157,9 @@ def test_merge_inner_join_empty(self): expected = DataFrame({"a": []}, index=[], dtype="int64") tm.assert_frame_equal(result, expected) - def test_merge_common(self): - joined = merge(self.df, self.df2) - exp = merge(self.df, self.df2, on=["key1", "key2"]) + def test_merge_common(self, df, df2): + joined = merge(df, df2) + exp = merge(df, df2, on=["key1", "key2"]) tm.assert_frame_equal(joined, exp) def test_merge_non_string_columns(self): @@ -170,13 +174,13 @@ def test_merge_non_string_columns(self): result = merge(left, right) tm.assert_frame_equal(expected, result) - def test_merge_index_as_on_arg(self): + def test_merge_index_as_on_arg(self, df, df2): # GH14355 - left = self.df.set_index("key1") - right = self.df2.set_index("key1") + left = df.set_index("key1") + right = df2.set_index("key1") result = merge(left, right, on="key1") - expected = merge(self.df, self.df2, on="key1").set_index("key1") + expected = merge(df, df2, on="key1").set_index("key1") tm.assert_frame_equal(result, expected) def test_merge_index_singlekey_right_vs_left(self): @@ -216,31 +220,31 @@ def test_merge_index_singlekey_inner(self): expected = left.join(right, on="key").loc[result.index] tm.assert_frame_equal(result, expected.loc[:, result.columns]) - def test_merge_misspecified(self): + def test_merge_misspecified(self, df, df2, left, right): msg = "Must pass right_on or right_index=True" with pytest.raises(pd.errors.MergeError, match=msg): - merge(self.left, self.right, left_index=True) + merge(left, right, left_index=True) msg = "Must pass left_on or left_index=True" with pytest.raises(pd.errors.MergeError, match=msg): - merge(self.left, self.right, right_index=True) + merge(left, right, right_index=True) msg = ( 'Can only pass argument "on" OR "left_on" and "right_on", not ' "a combination of both" ) with pytest.raises(pd.errors.MergeError, match=msg): - merge(self.left, self.left, left_on="key", on="key") + merge(left, left, left_on="key", on="key") msg = r"len\(right_on\) must equal len\(left_on\)" with pytest.raises(ValueError, match=msg): - merge(self.df, self.df2, left_on=["key1"], right_on=["key1", "key2"]) + merge(df, df2, left_on=["key1"], right_on=["key1", "key2"]) - def test_index_and_on_parameters_confusion(self): + def test_index_and_on_parameters_confusion(self, df, df2): msg = "right_index parameter must be of type bool, not " with pytest.raises(ValueError, match=msg): merge( - self.df, - self.df2, + df, + df2, how="left", left_index=False, right_index=["key1", "key2"], @@ -248,24 +252,24 @@ def test_index_and_on_parameters_confusion(self): msg = "left_index parameter must be of type bool, not " with pytest.raises(ValueError, match=msg): merge( - self.df, - self.df2, + df, + df2, how="left", left_index=["key1", "key2"], right_index=False, ) with pytest.raises(ValueError, match=msg): merge( - self.df, - self.df2, + df, + df2, how="left", left_index=["key1", "key2"], right_index=["key1", "key2"], ) - def test_merge_overlap(self): - merged = merge(self.left, self.left, on="key") - exp_len = 
(self.left["key"].value_counts() ** 2).sum() + def test_merge_overlap(self, left): + merged = merge(left, left, on="key") + exp_len = (left["key"].value_counts() ** 2).sum() assert len(merged) == exp_len assert "v1_x" in merged assert "v1_y" in merged @@ -671,14 +675,14 @@ def test_merge_nan_right2(self): )[["i1", "i2", "i1_", "i3"]] tm.assert_frame_equal(result, expected) - def test_merge_type(self): + def test_merge_type(self, df, df2): class NotADataFrame(DataFrame): @property def _constructor(self): return NotADataFrame - nad = NotADataFrame(self.df) - result = nad.merge(self.df2, on="key1") + nad = NotADataFrame(df) + result = nad.merge(df2, on="key1") assert isinstance(result, NotADataFrame) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 38727aa20ff4c..8c2c1026d5c82 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -39,8 +39,9 @@ def interval_values(request, closed): class TestPivotTable: - def setup_method(self): - self.data = DataFrame( + @pytest.fixture + def data(self): + return DataFrame( { "A": [ "foo", @@ -87,20 +88,20 @@ def setup_method(self): } ) - def test_pivot_table(self, observed): + def test_pivot_table(self, observed, data): index = ["A", "B"] columns = "C" table = pivot_table( - self.data, values="D", index=index, columns=columns, observed=observed + data, values="D", index=index, columns=columns, observed=observed ) - table2 = self.data.pivot_table( + table2 = data.pivot_table( values="D", index=index, columns=columns, observed=observed ) tm.assert_frame_equal(table, table2) # this works - pivot_table(self.data, values="D", index=index, observed=observed) + pivot_table(data, values="D", index=index, observed=observed) if len(index) > 1: assert table.index.names == tuple(index) @@ -112,7 +113,7 @@ def test_pivot_table(self, observed): else: assert table.columns.name == columns[0] - expected = self.data.groupby(index + [columns])["D"].agg(np.mean).unstack() + expected = data.groupby(index + [columns])["D"].agg(np.mean).unstack() tm.assert_frame_equal(table, expected) def test_pivot_table_categorical_observed_equal(self, observed): @@ -325,21 +326,21 @@ def test_pivot_with_interval_index_margins(self): ) tm.assert_series_equal(result, expected) - def test_pass_array(self): - result = self.data.pivot_table("D", index=self.data.A, columns=self.data.C) - expected = self.data.pivot_table("D", index="A", columns="C") + def test_pass_array(self, data): + result = data.pivot_table("D", index=data.A, columns=data.C) + expected = data.pivot_table("D", index="A", columns="C") tm.assert_frame_equal(result, expected) - def test_pass_function(self): - result = self.data.pivot_table("D", index=lambda x: x // 5, columns=self.data.C) - expected = self.data.pivot_table("D", index=self.data.index // 5, columns="C") + def test_pass_function(self, data): + result = data.pivot_table("D", index=lambda x: x // 5, columns=data.C) + expected = data.pivot_table("D", index=data.index // 5, columns="C") tm.assert_frame_equal(result, expected) - def test_pivot_table_multiple(self): + def test_pivot_table_multiple(self, data): index = ["A", "B"] columns = "C" - table = pivot_table(self.data, index=index, columns=columns) - expected = self.data.groupby(index + [columns]).agg(np.mean).unstack() + table = pivot_table(data, index=index, columns=columns) + expected = data.groupby(index + [columns]).agg(np.mean).unstack() tm.assert_frame_equal(table, expected) def test_pivot_dtypes(self): @@ -434,18 +435,18 @@ def 
test_pivot_no_values(self): ) tm.assert_frame_equal(res, exp) - def test_pivot_multi_values(self): + def test_pivot_multi_values(self, data): result = pivot_table( - self.data, values=["D", "E"], index="A", columns=["B", "C"], fill_value=0 + data, values=["D", "E"], index="A", columns=["B", "C"], fill_value=0 ) expected = pivot_table( - self.data.drop(["F"], axis=1), index="A", columns=["B", "C"], fill_value=0 + data.drop(["F"], axis=1), index="A", columns=["B", "C"], fill_value=0 ) tm.assert_frame_equal(result, expected) - def test_pivot_multi_functions(self): + def test_pivot_multi_functions(self, data): f = lambda func: pivot_table( - self.data, values=["D", "E"], index=["A", "B"], columns="C", aggfunc=func + data, values=["D", "E"], index=["A", "B"], columns="C", aggfunc=func ) result = f([np.mean, np.std]) means = f(np.mean) @@ -455,7 +456,7 @@ def test_pivot_multi_functions(self): # margins not supported?? f = lambda func: pivot_table( - self.data, + data, values=["D", "E"], index=["A", "B"], columns="C", @@ -859,33 +860,39 @@ def test_pivot_with_tuple_of_values(self, method): pd.pivot(df, index="zoo", columns="foo", values=("bar", "baz")) def _check_output( - self, result, values_col, index=["A", "B"], columns=["C"], margins_col="All" + self, + result, + values_col, + data, + index=["A", "B"], + columns=["C"], + margins_col="All", ): col_margins = result.loc[result.index[:-1], margins_col] - expected_col_margins = self.data.groupby(index)[values_col].mean() + expected_col_margins = data.groupby(index)[values_col].mean() tm.assert_series_equal(col_margins, expected_col_margins, check_names=False) assert col_margins.name == margins_col result = result.sort_index() index_margins = result.loc[(margins_col, "")].iloc[:-1] - expected_ix_margins = self.data.groupby(columns)[values_col].mean() + expected_ix_margins = data.groupby(columns)[values_col].mean() tm.assert_series_equal(index_margins, expected_ix_margins, check_names=False) assert index_margins.name == (margins_col, "") grand_total_margins = result.loc[(margins_col, ""), margins_col] - expected_total_margins = self.data[values_col].mean() + expected_total_margins = data[values_col].mean() assert grand_total_margins == expected_total_margins - def test_margins(self): + def test_margins(self, data): # column specified - result = self.data.pivot_table( + result = data.pivot_table( values="D", index=["A", "B"], columns="C", margins=True, aggfunc=np.mean ) - self._check_output(result, "D") + self._check_output(result, "D", data) # Set a different margins_name (not 'All') - result = self.data.pivot_table( + result = data.pivot_table( values="D", index=["A", "B"], columns="C", @@ -893,36 +900,32 @@ def test_margins(self): aggfunc=np.mean, margins_name="Totals", ) - self._check_output(result, "D", margins_col="Totals") + self._check_output(result, "D", data, margins_col="Totals") # no column specified - table = self.data.pivot_table( + table = data.pivot_table( index=["A", "B"], columns="C", margins=True, aggfunc=np.mean ) for value_col in table.columns.levels[0]: - self._check_output(table[value_col], value_col) + self._check_output(table[value_col], value_col, data) - def test_no_col(self): + def test_no_col(self, data): # no col # to help with a buglet - self.data.columns = [k * 2 for k in self.data.columns] + data.columns = [k * 2 for k in data.columns] msg = "The default value of numeric_only" with tm.assert_produces_warning(FutureWarning, match=msg): - table = self.data.pivot_table( - index=["AA", "BB"], margins=True, 
aggfunc=np.mean - ) + table = data.pivot_table(index=["AA", "BB"], margins=True, aggfunc=np.mean) for value_col in table.columns: totals = table.loc[("All", ""), value_col] - assert totals == self.data[value_col].mean() + assert totals == data[value_col].mean() with tm.assert_produces_warning(FutureWarning, match=msg): - table = self.data.pivot_table( - index=["AA", "BB"], margins=True, aggfunc="mean" - ) + table = data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") for item in ["DD", "EE", "FF"]: totals = table.loc[("All", ""), item] - assert totals == self.data[item].mean() + assert totals == data[item].mean() @pytest.mark.parametrize( "columns, aggfunc, values, expected_columns", @@ -982,10 +985,10 @@ def test_margin_with_only_columns_defined( tm.assert_frame_equal(result, expected) - def test_margins_dtype(self): + def test_margins_dtype(self, data): # GH 17013 - df = self.data.copy() + df = data.copy() df[["D", "E", "F"]] = np.arange(len(df) * 3).reshape(len(df), 3).astype("i8") mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")] @@ -1006,7 +1009,7 @@ def test_margins_dtype(self): tm.assert_frame_equal(expected, result) - def test_margins_dtype_len(self): + def test_margins_dtype_len(self, data): mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")] mi = MultiIndex.from_tuples(mi_val, names=("A", "B")) expected = DataFrame( @@ -1014,7 +1017,7 @@ def test_margins_dtype_len(self): ).rename_axis("C", axis=1) expected["All"] = expected["dull"] + expected["shiny"] - result = self.data.pivot_table( + result = data.pivot_table( values="D", index=["A", "B"], columns="C", @@ -1138,48 +1141,48 @@ def test_pivot_columns_lexsorted(self): assert pivoted.columns.is_monotonic_increasing - def test_pivot_complex_aggfunc(self): + def test_pivot_complex_aggfunc(self, data): f = {"D": ["std"], "E": ["sum"]} - expected = self.data.groupby(["A", "B"]).agg(f).unstack("B") - result = self.data.pivot_table(index="A", columns="B", aggfunc=f) + expected = data.groupby(["A", "B"]).agg(f).unstack("B") + result = data.pivot_table(index="A", columns="B", aggfunc=f) tm.assert_frame_equal(result, expected) - def test_margins_no_values_no_cols(self): + def test_margins_no_values_no_cols(self, data): # Regression test on pivot table: no values or cols passed. 
- result = self.data[["A", "B"]].pivot_table( + result = data[["A", "B"]].pivot_table( index=["A", "B"], aggfunc=len, margins=True ) result_list = result.tolist() assert sum(result_list[:-1]) == result_list[-1] - def test_margins_no_values_two_rows(self): + def test_margins_no_values_two_rows(self, data): # Regression test on pivot table: no values passed but rows are a # multi-index - result = self.data[["A", "B", "C"]].pivot_table( + result = data[["A", "B", "C"]].pivot_table( index=["A", "B"], columns="C", aggfunc=len, margins=True ) assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0] - def test_margins_no_values_one_row_one_col(self): + def test_margins_no_values_one_row_one_col(self, data): # Regression test on pivot table: no values passed but row and col # defined - result = self.data[["A", "B"]].pivot_table( + result = data[["A", "B"]].pivot_table( index="A", columns="B", aggfunc=len, margins=True ) assert result.All.tolist() == [4.0, 7.0, 11.0] - def test_margins_no_values_two_row_two_cols(self): + def test_margins_no_values_two_row_two_cols(self, data): # Regression test on pivot table: no values passed but rows and cols # are multi-indexed - self.data["D"] = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"] - result = self.data[["A", "B", "C", "D"]].pivot_table( + data["D"] = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"] + result = data[["A", "B", "C", "D"]].pivot_table( index=["A", "B"], columns=["C", "D"], aggfunc=len, margins=True ) assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0] @pytest.mark.parametrize("margin_name", ["foo", "one", 666, None, ["a", "b"]]) - def test_pivot_table_with_margins_set_margin_name(self, margin_name): + def test_pivot_table_with_margins_set_margin_name(self, margin_name, data): # see gh-3335 msg = ( f'Conflicting name "{margin_name}" in margins|' @@ -1188,7 +1191,7 @@ def test_pivot_table_with_margins_set_margin_name(self, margin_name): with pytest.raises(ValueError, match=msg): # multi-index index pivot_table( - self.data, + data, values="D", index=["A", "B"], columns=["C"], @@ -1198,7 +1201,7 @@ def test_pivot_table_with_margins_set_margin_name(self, margin_name): with pytest.raises(ValueError, match=msg): # multi-index column pivot_table( - self.data, + data, values="D", index=["C"], columns=["A", "B"], @@ -1208,7 +1211,7 @@ def test_pivot_table_with_margins_set_margin_name(self, margin_name): with pytest.raises(ValueError, match=msg): # non-multi-index index/column pivot_table( - self.data, + data, values="D", index=["A"], columns=["B"], @@ -1683,22 +1686,22 @@ def test_monthly(self, i): tm.assert_series_equal(result, subset, check_names=False) assert result.name == i - def test_pivot_table_with_iterator_values(self): + def test_pivot_table_with_iterator_values(self, data): # GH 12017 aggs = {"D": "sum", "E": "mean"} pivot_values_list = pivot_table( - self.data, index=["A"], values=list(aggs.keys()), aggfunc=aggs + data, index=["A"], values=list(aggs.keys()), aggfunc=aggs ) pivot_values_keys = pivot_table( - self.data, index=["A"], values=aggs.keys(), aggfunc=aggs + data, index=["A"], values=aggs.keys(), aggfunc=aggs ) tm.assert_frame_equal(pivot_values_keys, pivot_values_list) agg_values_gen = (value for value in aggs.keys()) pivot_values_gen = pivot_table( - self.data, index=["A"], values=agg_values_gen, aggfunc=aggs + data, index=["A"], values=agg_values_gen, aggfunc=aggs ) tm.assert_frame_equal(pivot_values_gen, pivot_values_list) @@ -2000,14 +2003,14 @@ def test_pivot_string_as_func(self): (["std", "mean"], 
[np.std, np.mean]), ], ) - def test_pivot_string_func_vs_func(self, f, f_numpy): + def test_pivot_string_func_vs_func(self, f, f_numpy, data): # GH #18713 # for consistency purposes msg = "The default value of numeric_only" with tm.assert_produces_warning(FutureWarning, match=msg): - result = pivot_table(self.data, index="A", columns="B", aggfunc=f) - expected = pivot_table(self.data, index="A", columns="B", aggfunc=f_numpy) + result = pivot_table(data, index="A", columns="B", aggfunc=f) + expected = pivot_table(data, index="A", columns="B", aggfunc=f_numpy) tm.assert_frame_equal(result, expected) @pytest.mark.slow
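The pattern applied throughout these patches is the same in every file: state that setup_method built and exposed through self is moved into a pytest fixture that constructs and returns the object, and each test requests it as a parameter. A minimal sketch of the before/after shape, using hypothetical names (TestExampleOld, TestExampleNew, value, test_sum) that are not taken from the patch, only to illustrate the refactor:

    # Before: shared state created in setup_method and read via self
    import pytest
    from pandas import DataFrame

    class TestExampleOld:
        def setup_method(self):
            self.value = DataFrame({"A": [1, 2, 3]})

        def test_sum(self):
            assert self.value["A"].sum() == 6

    # After: the same object is provided by a function-scoped fixture
    # and injected into each test that asks for it by name
    class TestExampleNew:
        @pytest.fixture
        def value(self):
            return DataFrame({"A": [1, 2, 3]})

        def test_sum(self, value):
            assert value["A"].sum() == 6

Because a function-scoped fixture returns a fresh object to every test that requests it, one test can no longer mutate state that leaks into later tests, which is one practical benefit of the fixture form over setup_method.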