diff --git a/doc/source/user_guide/homs_api.rst b/doc/source/user_guide/homs_api.rst
new file mode 100644
index 0000000000000..87525d84be56a
--- /dev/null
+++ b/doc/source/user_guide/homs_api.rst
@@ -0,0 +1,80 @@
+.. _homs:
+
+:orphan:
+
+{{ header }}
+
+***************************
+pandas Higher Order Methods
+***************************
+
+pandas is experimenting with improving the behavior of higher order methods (HOMs). These
+are methods that take a function as an argument, often a user-defined function (UDF).
+The modified methods include the following.
+
+ - :meth:`DataFrame.agg`
+ - :meth:`.DataFrameGroupBy.aggregate`
+
+The goal is to make these methods behave in a more predictable and consistent manner,
+to reduce the complexity of their implementation, and to improve performance where
+possible. This page details the differences between the old and new behaviors and
+provides some context for each change.
+
+A great number of changes are planned. In order to give users a reasonable transition
+path, all changes are gated behind an experimental ``api.use_hom`` option. When it is
+enabled, pandas HOMs are subject to breaking changes without notice. Users can opt
+into the new behavior and provide feedback. Once the improvements have been made, this
+option will no longer be considered experimental. From that point on, breaking changes
+will only happen when preceded by a ``FutureWarning`` and only in a major release of
+pandas. After a period of community feedback, and once the behavior is deemed ready
+for release, pandas will raise a ``FutureWarning`` announcing that the default value
+of this option will become ``True`` in a future version. Even after the default
+becomes ``True``, users can still override it to ``False``. After a sufficient amount
+of time, pandas will remove this option altogether and only the new behavior will
+remain.
+
+``DataFrame.agg`` with list-likes
+---------------------------------
+
+Previously, using ``DataFrame.agg`` with a list-like argument produced a result that
+was transposed relative to the result of providing a single aggregation function.
+
+.. ipython:: python
+
+    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
+
+    df.agg("sum")
+    df.agg(["sum"])
+
+This transpose no longer occurs, making the result more consistent.
+
+.. ipython:: python
+
+    with pd.option_context("api.use_hom", True):
+        result = df.agg(["sum"])
+    result
+
+    with pd.option_context("api.use_hom", True):
+        result = df.agg(["sum", "mean"])
+    result
+
+``DataFrame.groupby(...).agg`` with list-likes
+----------------------------------------------
+
+Previously, using ``DataFrame.groupby(...).agg`` with a list-like argument placed the
+DataFrame's columns in the first level of the resulting hierarchical columns. As a
+result, the columns for each aggregation function were separated from one another,
+which is inconsistent with the result for a single aggregator.
+
+.. ipython:: python
+
+    df.groupby("a").agg("sum")
+    df.groupby("a").agg(["sum", "min"])
+
+Now the levels are swapped, so that the columns for each aggregation function are together.
+
+.. ipython:: python
+
+    with pd.option_context("api.use_hom", True):
+        result = df.groupby("a").agg(["sum", "min"])
+    result
diff --git a/pandas/conftest.py b/pandas/conftest.py
index ba90c9eedb53c..925349a7be634 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1780,3 +1780,11 @@ def using_array_manager(request):
     Fixture to check if the array manager is being used.
""" return pd.options.mode.data_manager == "array" + + +@pytest.fixture +def using_hom_api(request): + """ + Fixture to check if the Higher Order Methods API is being used. + """ + return pd.options.api.use_hom diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 238f1382890c9..25e3e0832f9ec 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -22,7 +22,10 @@ import numpy as np -from pandas._config import option_context +from pandas._config import ( + get_option, + option_context, +) from pandas._libs import lib from pandas._typing import ( @@ -168,7 +171,10 @@ def agg(self) -> DataFrame | Series | None: return self.agg_dict_like() elif is_list_like(arg): # we require a list, but not a 'str' - return self.agg_list_like() + if get_option("api.use_hom"): + return self.hom_list_like("agg") + else: + return self.agg_list_like() if callable(arg): f = com.get_cython_func(arg) @@ -442,6 +448,79 @@ def agg_list_like(self) -> DataFrame | Series: ) return concatenated.reindex(full_ordered_index, copy=False) + def hom_list_single_arg( + self, method: str, a: AggFuncTypeBase, result_dim: int | None + ) -> tuple[int | None, AggFuncTypeBase | None, DataFrame | Series | None]: + result = None + if isinstance(a, (tuple, list)): + # Handle (name, value) pairs + name, a = a + else: + name = com.get_callable_name(a) or a + try: + result = getattr(self.obj, method)(a) + except (TypeError, DataError): + warnings.warn( + f"{name} did not aggregate successfully. If any error is " + "raised this will raise in a future version of pandas. " + "Drop these columns/ops to avoid this warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if result_dim is None: + result_dim = getattr(result, "ndim", 0) + elif getattr(result, "ndim", 0) != result_dim: + raise ValueError("cannot combine transform and aggregation operations") + + return result_dim, name, result + + def hom_list_like(self, method: str) -> DataFrame | Series: + """ + Compute aggregation in the case of a list-like argument. + + Returns + ------- + Result of aggregation. + """ + from pandas.core.reshape.concat import concat + + obj = self.obj + arg = cast(List[AggFuncTypeBase], self.f) + + results = [] + keys = [] + result_dim = None + + for a in arg: + result_dim, name, new_res = self.hom_list_single_arg(method, a, result_dim) + if new_res is not None: + results.append(new_res) + keys.append(name) + + # if we are empty + if not len(results): + raise ValueError("no results") + + try: + concatenated = concat(results, keys=keys, axis=1, sort=False) + except TypeError: + # we are concatting non-NDFrame objects, + # e.g. a list of scalars + from pandas import Series + + result = Series(results, index=keys, name=obj.name) + return result + else: + # Concat uses the first index to determine the final indexing order. + # The union of a shorter first index with the other indices causes + # the index sorting to be different from the order of the aggregating + # functions. Reindex if this is the case. + index_size = concatenated.index.size + full_ordered_index = next( + result.index for result in results if result.index.size == index_size + ) + return concatenated.reindex(full_ordered_index, copy=False) + def agg_dict_like(self) -> DataFrame | Series: """ Compute aggregation in the case of a dict-like argument. 
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index dd106b6dbb63c..0345102bf5402 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -526,6 +526,23 @@ def use_inf_as_na_cb(key): validator=is_one_of_factory(["block", "array"]), ) +use_hom_doc = """ +: boolean + Whether to use the Higher Order Methods implementations. Currently experimental. + Defaults to False. +""" + + +with cf.config_prefix("api"): + cf.register_option( + "use_hom", + # Get the default from an environment variable, if set, otherwise defaults + # to False. This environment variable can be set for testing. + os.environ.get("PANDAS_USE_HOM", "false").lower() == "true", + use_hom_doc, + validator=is_bool, + ) + # user warnings chained_assignment = """ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 949f369849323..612b11214f191 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -26,6 +26,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import reduction as libreduction from pandas._typing import ( ArrayLike, @@ -876,6 +878,8 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) result.columns = columns if result is None: + if get_option("api.use_hom"): + return self._hom_agg(func, args, kwargs) # grouper specific aggregations if self.grouper.nkeys > 1: @@ -926,6 +930,28 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) return result + def _hom_agg(self, func, args, kwargs): + if args or kwargs: + # test_pass_args_kwargs gets here (with and without as_index) + # can't return early + result = self._aggregate_frame(func, *args, **kwargs) + + elif self.axis == 1 and self.grouper.nkeys == 1: + # _aggregate_multiple_funcs does not allow self.axis == 1 + # Note: axis == 1 precludes 'not self.as_index', see __init__ + result = self._aggregate_frame(func) + return result + else: + # test_groupby_as_index_series_scalar gets here + # with 'not self.as_index' + return self._python_agg_general(func, *args, **kwargs) + + if not self.as_index: + self._insert_inaxis_grouper_inplace(result) + result.index = Index(range(len(result))) + + return result + agg = aggregate def _iterate_slices(self) -> Iterable[Series]: diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 98872571ae2bb..a6dc417fbfb8a 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -678,10 +678,12 @@ def test_apply_non_numpy_dtype_category(): tm.assert_frame_equal(result, df) -def test_apply_dup_names_multi_agg(): +def test_apply_dup_names_multi_agg(using_hom_api): # GH 21063 df = DataFrame([[0, 1], [2, 3]], columns=["a", "a"]) expected = DataFrame([[0, 1]], columns=["a", "a"], index=["min"]) + if using_hom_api: + expected = expected.T result = df.agg(["min"]) tm.assert_frame_equal(result, expected) @@ -1064,7 +1066,7 @@ def test_consistency_for_boxed(box, int_frame_const_col): tm.assert_frame_equal(result, expected) -def test_agg_transform(axis, float_frame): +def test_agg_transform(axis, float_frame, using_hom_api): other_axis = 1 if axis in {0, "index"} else 0 with np.errstate(all="ignore"): @@ -1080,29 +1082,50 @@ def test_agg_transform(axis, float_frame): # list-like result = float_frame.apply([np.sqrt], axis=axis) expected = f_sqrt.copy() - if axis in {0, "index"}: - expected.columns = MultiIndex.from_product([float_frame.columns, ["sqrt"]]) + if using_hom_api: + if axis in 
{0, "index"}: + expected.columns = MultiIndex.from_product( + [["sqrt"], float_frame.columns] + ) + else: + expected.index = MultiIndex.from_product([["sqrt"], float_frame.index]) else: - expected.index = MultiIndex.from_product([float_frame.index, ["sqrt"]]) + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product( + [float_frame.columns, ["sqrt"]] + ) + else: + expected.index = MultiIndex.from_product([float_frame.index, ["sqrt"]]) tm.assert_frame_equal(result, expected) # multiple items in list # these are in the order as if we are applying both # functions per series and then concatting result = float_frame.apply([np.abs, np.sqrt], axis=axis) - expected = zip_frames([f_abs, f_sqrt], axis=other_axis) - if axis in {0, "index"}: - expected.columns = MultiIndex.from_product( - [float_frame.columns, ["absolute", "sqrt"]] - ) + if using_hom_api: + expected = pd.concat([f_abs, f_sqrt], axis=other_axis) + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product( + [["absolute", "sqrt"], float_frame.columns] + ) + else: + expected.index = MultiIndex.from_product( + [["absolute", "sqrt"], float_frame.index] + ) else: - expected.index = MultiIndex.from_product( - [float_frame.index, ["absolute", "sqrt"]] - ) + expected = zip_frames([f_abs, f_sqrt], axis=other_axis) + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product( + [float_frame.columns, ["absolute", "sqrt"]] + ) + else: + expected.index = MultiIndex.from_product( + [float_frame.index, ["absolute", "sqrt"]] + ) tm.assert_frame_equal(result, expected) -def test_demo(): +def test_demo(using_hom_api): # demonstration tests df = DataFrame({"A": range(5), "B": 5}) @@ -1110,6 +1133,8 @@ def test_demo(): expected = DataFrame( {"A": [0, 4], "B": [5, 5]}, columns=["A", "B"], index=["min", "max"] ) + if using_hom_api: + expected = expected.T tm.assert_frame_equal(result, expected) @@ -1141,7 +1166,7 @@ def test_agg_with_name_as_column_name(): tm.assert_series_equal(result, expected) -def test_agg_multiple_mixed_no_warning(): +def test_agg_multiple_mixed_no_warning(using_hom_api): # GH 20909 mdf = DataFrame( { @@ -1160,26 +1185,35 @@ def test_agg_multiple_mixed_no_warning(): }, index=["min", "sum"], ) + if using_hom_api: + expected = expected.T + match = "Dropping of nuisance columns" + else: + match = "did not aggregate successfully" # sorted index - with tm.assert_produces_warning( - FutureWarning, match=r"\['D'\] did not aggregate successfully" - ): + with tm.assert_produces_warning(FutureWarning, match=match): result = mdf.agg(["min", "sum"]) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning( - FutureWarning, match=r"\['D'\] did not aggregate successfully" - ): + if using_hom_api: + match = "Dropping of nuisance columns" + else: + match = "did not aggregate successfully" + + with tm.assert_produces_warning(FutureWarning, match=match, check_stacklevel=False): result = mdf[["D", "C", "B", "A"]].agg(["sum", "min"]) # GH40420: the result of .agg should have an index that is sorted # according to the arguments provided to agg. 
- expected = expected[["D", "C", "B", "A"]].reindex(["sum", "min"]) + if using_hom_api: + expected = expected.loc[["D", "C", "B", "A"], ["sum", "min"]] + else: + expected = expected[["D", "C", "B", "A"]].reindex(["sum", "min"]) tm.assert_frame_equal(result, expected) -def test_agg_reduce(axis, float_frame): +def test_agg_reduce(axis, float_frame, using_hom_api): other_axis = 1 if axis in {0, "index"} else 0 name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values() @@ -1194,6 +1228,8 @@ def test_agg_reduce(axis, float_frame): ) expected.columns = ["mean", "max", "sum"] expected = expected.T if axis in {0, "index"} else expected + if using_hom_api: + expected = expected.T result = float_frame.agg(["mean", "max", "sum"], axis=axis) tm.assert_frame_equal(result, expected) @@ -1248,7 +1284,7 @@ def test_agg_reduce(axis, float_frame): tm.assert_frame_equal(result, expected) -def test_nuiscance_columns(): +def test_nuiscance_columns(using_hom_api): # GH 15015 df = DataFrame( @@ -1270,6 +1306,8 @@ def test_nuiscance_columns(): index=["min"], columns=df.columns, ) + if using_hom_api: + expected = expected.T tm.assert_frame_equal(result, expected) with tm.assert_produces_warning(FutureWarning, match="Select only valid"): @@ -1277,18 +1315,22 @@ def test_nuiscance_columns(): expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"]) tm.assert_series_equal(result, expected) - with tm.assert_produces_warning( - FutureWarning, match=r"\['D'\] did not aggregate successfully" - ): + if using_hom_api: + match = "Select only valid" + else: + match = "did not aggregate successfully" + with tm.assert_produces_warning(FutureWarning, match=match): result = df.agg(["sum"]) expected = DataFrame( [[6, 6.0, "foobarbaz"]], index=["sum"], columns=["A", "B", "C"] ) + if using_hom_api: + expected = expected.T tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("how", ["agg", "apply"]) -def test_non_callable_aggregates(how): +def test_non_callable_aggregates(how, using_hom_api): # GH 16405 # 'size' is a property of frame/series @@ -1323,8 +1365,12 @@ def test_non_callable_aggregates(how): } ) - tm.assert_frame_equal(result1, result2, check_like=True) - tm.assert_frame_equal(result2, expected, check_like=True) + if using_hom_api: + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result1, expected.T) + else: + tm.assert_frame_equal(result1, result2, check_like=True) + tm.assert_frame_equal(result2, expected, check_like=True) # Just functional string arg is same as calling df.arg() result = getattr(df, how)("count") @@ -1349,7 +1395,7 @@ def test_size_as_str(how, axis): tm.assert_series_equal(result, expected) -def test_agg_listlike_result(): +def test_agg_listlike_result(using_hom_api): # GH-29587 user defined function returning list-likes df = DataFrame({"A": [2, 2, 3], "B": [1.5, np.nan, 1.5], "C": ["foo", None, "bar"]}) @@ -1361,7 +1407,9 @@ def func(group_col): tm.assert_series_equal(result, expected) result = df.agg([func]) - expected = expected.to_frame("func").T + expected = expected.to_frame("func") + if not using_hom_api: + expected = expected.T tm.assert_frame_equal(result, expected) @@ -1474,14 +1522,20 @@ def test_apply_empty_list_reduce(): tm.assert_series_equal(result, expected) -def test_apply_no_suffix_index(): +def test_apply_no_suffix_index(request, using_hom_api): # GH36189 pdf = DataFrame([[4, 9]] * 3, columns=["A", "B"]) - result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()]) - expected = DataFrame( - {"A": [12, 12, 12], "B": [27, 27, 27]}, 
index=["sum", "", ""] - ) - + result = pdf.apply([np.square, lambda x: x, lambda x: x]) + if using_hom_api: + columns = MultiIndex.from_product( + [["square", "", ""], ["A", "B"]] + ) + expected = DataFrame(3 * [[16, 81, 4, 9, 4, 9]], columns=columns) + else: + columns = MultiIndex.from_product( + [["A", "B"], ["square", "", ""]] + ) + expected = DataFrame(3 * [[16, 4, 4, 81, 9, 9]], columns=columns) tm.assert_frame_equal(result, expected) @@ -1493,7 +1547,7 @@ def test_apply_raw_returns_string(): tm.assert_series_equal(result, expected) -def test_aggregation_func_column_order(): +def test_aggregation_func_column_order(using_hom_api): # GH40420: the result of .agg should have an index that is sorted # according to the arguments provided to agg. df = DataFrame( @@ -1513,18 +1567,28 @@ def foo(s): aggs = ["sum", foo, "count", "min"] with tm.assert_produces_warning( - FutureWarning, match=r"\['item'\] did not aggregate successfully" + FutureWarning, match="did not aggregate successfully" ): result = df.agg(aggs) - expected = DataFrame( - { - "item": ["123456", np.nan, 6, "1"], - "att1": [21.0, 10.5, 6.0, 1.0], - "att2": [18.0, 9.0, 6.0, 0.0], - "att3": [17.0, 8.5, 6.0, 0.0], - }, - index=["sum", "foo", "count", "min"], - ) + if using_hom_api: + expected = DataFrame( + { + "sum": ["123456", 21, 18, 17], + "count": [6, 6, 6, 6], + "min": ["1", 1, 0, 0], + }, + index=["item", "att1", "att2", "att3"], + ) + else: + expected = DataFrame( + { + "item": ["123456", np.nan, 6, "1"], + "att1": [21.0, 10.5, 6.0, 1.0], + "att2": [18.0, 9.0, 6.0, 0.0], + "att3": [17.0, 8.5, 6.0, 0.0], + }, + index=["sum", "foo", "count", "min"], + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 1ea44871eea4d..10aba5a713053 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -374,7 +374,7 @@ def test_agg_multiple_functions_same_name_with_ohlc_present(): tm.assert_frame_equal(result, expected) -def test_multiple_functions_tuples_and_non_tuples(df): +def test_multiple_functions_tuples_and_non_tuples(df, using_hom_api): # #1359 funcs = [("foo", "mean"), "std"] ex_funcs = [("foo", "mean"), ("std", "std")] @@ -383,12 +383,13 @@ def test_multiple_functions_tuples_and_non_tuples(df): expected = df.groupby("A")["C"].agg(ex_funcs) tm.assert_frame_equal(result, expected) + klass = None if using_hom_api else FutureWarning with tm.assert_produces_warning( - FutureWarning, match=r"\['B'\] did not aggregate successfully" + klass, match=r"\['B'\] did not aggregate successfully" ): result = df.groupby("A").agg(funcs) with tm.assert_produces_warning( - FutureWarning, match=r"\['B'\] did not aggregate successfully" + klass, match=r"\['B'\] did not aggregate successfully" ): expected = df.groupby("A").agg(ex_funcs) tm.assert_frame_equal(result, expected) @@ -545,16 +546,22 @@ def test_callable_result_dtype_series(keys, agg_index, input, dtype, method): tm.assert_series_equal(result, expected) -def test_order_aggregate_multiple_funcs(): +def test_order_aggregate_multiple_funcs(using_hom_api): # GH 25692 df = DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]}) - res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) - result = res.columns.levels[1] + if using_hom_api: + # TODO (GH 35725): This will not raise when agg-must-agg is implemented + msg = "Cannot concat indices that do not have the same number of levels" + with pytest.raises(AssertionError, 
match=msg): + df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) + else: + res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) + result = res.columns.levels[1] - expected = Index(["sum", "max", "mean", "ohlc", "min"]) + expected = Index(["sum", "max", "mean", "ohlc", "min"]) - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) @pytest.mark.parametrize("dtype", [np.int64, np.uint64]) @@ -1260,14 +1267,17 @@ def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data): tm.assert_frame_equal(result_df, expected_df) -def test_nonagg_agg(): +def test_nonagg_agg(using_hom_api): # GH 35490 - Single/Multiple agg of non-agg function give same results # TODO: agg should raise for functions that don't aggregate df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 2, 1]}) g = df.groupby("a") result = g.agg(["cumsum"]) - result.columns = result.columns.droplevel(-1) + if using_hom_api: + result.columns = result.columns.droplevel(0) + else: + result.columns = result.columns.droplevel(-1) expected = g.agg("cumsum") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 06044ddd3f4b8..dfb7dfd7350ac 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -24,7 +24,7 @@ from pandas.io.formats.printing import pprint_thing -def test_agg_api(): +def test_agg_api(using_hom_api): # GH 6337 # https://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error # different api for agg when passed custom function with mixed frame @@ -42,16 +42,21 @@ def test_agg_api(): def peak_to_peak(arr): return arr.max() - arr.min() + if using_hom_api: + msg = "Dropping invalid columns" + else: + msg = r"\['key2'\] did not aggregate successfully" + with tm.assert_produces_warning( FutureWarning, - match=r"\['key2'\] did not aggregate successfully", + match=msg, ): expected = grouped.agg([peak_to_peak]) expected.columns = ["data1", "data2"] with tm.assert_produces_warning( FutureWarning, - match=r"\['key2'\] did not aggregate successfully", + match=msg, ): result = grouped.agg(peak_to_peak) tm.assert_frame_equal(result, expected) @@ -176,7 +181,7 @@ def test_aggregate_float64_no_int64(): tm.assert_frame_equal(result, expected) -def test_aggregate_api_consistency(): +def test_aggregate_api_consistency(using_hom_api): # GH 9052 # make sure that the aggregates via dict # are consistent @@ -201,13 +206,21 @@ def test_aggregate_api_consistency(): tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg([np.sum, np.mean]) - expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1) - expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]]) + if using_hom_api: + expected = pd.concat([c_sum, d_sum, c_mean, d_mean], axis=1) + expected.columns = MultiIndex.from_product([["sum", "mean"], ["C", "D"]]) + else: + expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1) + expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]]) tm.assert_frame_equal(result, expected, check_like=True) result = grouped[["D", "C"]].agg([np.sum, np.mean]) - expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1) - expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]]) + if using_hom_api: + expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1) + expected.columns = MultiIndex.from_product([["sum", "mean"], ["D", "C"]]) + else: + expected = 
pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1) + expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]]) tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg({"C": "mean", "D": "sum"}) @@ -371,7 +384,7 @@ def bad(x): tm.assert_frame_equal(result, expected) -def test_agg_consistency(): +def test_agg_consistency(using_hom_api): # agg with ([]) and () not consistent # GH 6715 def P1(a): @@ -393,7 +406,10 @@ def P1(a): g = df.groupby("date") expected = g.agg([P1]) - expected.columns = expected.columns.levels[0] + if using_hom_api: + expected.columns = expected.columns.levels[1] + else: + expected.columns = expected.columns.levels[0] result = g.agg(P1) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 1555e9d02c8ca..034e5099fba7c 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1132,7 +1132,7 @@ def test_groupby_mean_no_overflow(): ], ) @pytest.mark.parametrize("function", ["mean", "median", "var"]) -def test_apply_to_nullable_integer_returns_float(values, function): +def test_apply_to_nullable_integer_returns_float(values, function, using_hom_api): # https://github.com/pandas-dev/pandas/issues/32219 output = 0.5 if function == "var" else 1.5 arr = np.array([output] * 3, dtype=float) @@ -1148,7 +1148,10 @@ def test_apply_to_nullable_integer_returns_float(values, function): tm.assert_frame_equal(result, expected) result = groups.agg([function]) - expected.columns = MultiIndex.from_tuples([("b", function)]) + if using_hom_api: + expected.columns = MultiIndex.from_tuples([(function, "b")]) + else: + expected.columns = MultiIndex.from_tuples([("b", function)]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 10bf1a3ef91f2..1e496ca160ddb 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -533,7 +533,7 @@ def test_multi_key_multiple_functions(df): tm.assert_frame_equal(agged, expected) -def test_frame_multi_key_function_list(): +def test_frame_multi_key_function_list(using_hom_api): data = DataFrame( { "A": [ @@ -583,15 +583,23 @@ def test_frame_multi_key_function_list(): grouped = data.groupby(["A", "B"]) funcs = [np.mean, np.std] + klass = None if using_hom_api else FutureWarning with tm.assert_produces_warning( - FutureWarning, match=r"\['C'\] did not aggregate successfully" + klass, match=r"\['C'\] did not aggregate successfully" ): agged = grouped.agg(funcs) - expected = pd.concat( - [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)], - keys=["D", "E", "F"], - axis=1, - ) + if using_hom_api: + expected = pd.concat( + [grouped.agg(funcs[0]), grouped.agg(funcs[1])], + keys=["mean", "std"], + axis=1, + ) + else: + expected = pd.concat( + [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)], + keys=["D", "E", "F"], + axis=1, + ) assert isinstance(agged.index, MultiIndex) assert isinstance(expected.index, MultiIndex) tm.assert_frame_equal(agged, expected) @@ -2064,7 +2072,7 @@ def test_tuple_correct_keyerror(): df.groupby((7, 8)).mean() -def test_groupby_agg_ohlc_non_first(): +def test_groupby_agg_ohlc_non_first(using_hom_api): # GH 21716 df = DataFrame( [[1], [1]], @@ -2087,9 +2095,14 @@ def test_groupby_agg_ohlc_non_first(): index=date_range("2018-01-01", periods=2, freq="D", name="dti"), ) - result = 
df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"]) - - tm.assert_frame_equal(result, expected) + if using_hom_api: + # TODO (GH 35725): This will not raise when agg-must-agg is implemented + msg = "Cannot concat indices that do not have the same number of levels" + with pytest.raises(AssertionError, match=msg): + df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"]) + else: + result = df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"]) + tm.assert_frame_equal(result, expected) def test_groupby_multiindex_nat(): diff --git a/pandas/tests/resample/test_deprecated.py b/pandas/tests/resample/test_deprecated.py index 126ca05ca1546..7805f1b12b2f7 100644 --- a/pandas/tests/resample/test_deprecated.py +++ b/pandas/tests/resample/test_deprecated.py @@ -80,7 +80,7 @@ def test_deprecating_on_loffset_and_base(): @all_ts @pytest.mark.parametrize("arg", ["mean", {"value": "mean"}, ["mean"]]) -def test_resample_loffset_arg_type(frame, create_index, arg): +def test_resample_loffset_arg_type(frame, create_index, arg, using_hom_api): # GH 13218, 15002 df = frame expected_means = [df.values[i : i + 2].mean() for i in range(0, len(df.values), 2)] @@ -97,7 +97,10 @@ def test_resample_loffset_arg_type(frame, create_index, arg): result_agg = df.resample("2D", loffset="2H").agg(arg) if isinstance(arg, list): - expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) + if using_hom_api: + expected.columns = pd.MultiIndex.from_tuples([("mean", "value")]) + else: + expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) tm.assert_frame_equal(result_agg, expected) @@ -201,7 +204,7 @@ def test_resample_float_base(): @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) @pytest.mark.parametrize("agg_arg", ["mean", {"value": "mean"}, ["mean"]]) -def test_loffset_returns_datetimeindex(frame, kind, agg_arg): +def test_loffset_returns_datetimeindex(frame, kind, agg_arg, using_hom_api): # make sure passing loffset returns DatetimeIndex in all cases # basic method taken from Base.test_resample_loffset_arg_type() df = frame @@ -216,7 +219,10 @@ def test_loffset_returns_datetimeindex(frame, kind, agg_arg): with tm.assert_produces_warning(FutureWarning): result_agg = df.resample("2D", loffset="2H", kind=kind).agg(agg_arg) if isinstance(agg_arg, list): - expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) + if using_hom_api: + expected.columns = pd.MultiIndex.from_tuples([("mean", "value")]) + else: + expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) tm.assert_frame_equal(result_agg, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 86e0411ee3334..d44022cbe5541 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -321,7 +321,7 @@ def test_agg_consistency_int_str_column_mix(): # `Base` test class -def test_agg(): +def test_agg(using_hom_api): # test with all three Resampler apis and TimeGrouper np.random.seed(1234) @@ -348,10 +348,17 @@ def test_agg(): b_std = r["B"].std() b_sum = r["B"].sum() - expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) - expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) + if using_hom_api: + expected = pd.concat([a_mean, b_mean, a_std, b_std], axis=1) + expected.columns = pd.MultiIndex.from_product([["mean", "std"], ["A", "B"]]) + else: + expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) + expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) for t in cases: - 
warn = FutureWarning if t in cases[1:3] else None + if t in cases[1:3] and not using_hom_api: + warn = FutureWarning + else: + warn = None with tm.assert_produces_warning( warn, match=r"\['date'\] did not aggregate successfully", @@ -616,7 +623,7 @@ def test_selection_api_validation(): @pytest.mark.parametrize( "col_name", ["t2", "t2x", "t2q", "T_2M", "t2p", "t2m", "t2m1", "T2M"] ) -def test_agg_with_datetime_index_list_agg_func(col_name): +def test_agg_with_datetime_index_list_agg_func(col_name, using_hom_api): # GH 22660 # The parametrized column names would get converted to dates by our # date parser. Some would result in OutOfBoundsError (ValueError) while @@ -630,11 +637,22 @@ def test_agg_with_datetime_index_list_agg_func(col_name): columns=[col_name], ) result = df.resample("1d").aggregate(["mean"]) - expected = DataFrame( - [47.5, 143.5, 195.5], - index=date_range(start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin"), - columns=pd.MultiIndex(levels=[[col_name], ["mean"]], codes=[[0], [0]]), - ) + if using_hom_api: + expected = DataFrame( + [47.5, 143.5, 195.5], + index=date_range( + start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin" + ), + columns=pd.MultiIndex(levels=[["mean"], [col_name]], codes=[[0], [0]]), + ) + else: + expected = DataFrame( + [47.5, 143.5, 195.5], + index=date_range( + start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin" + ), + columns=pd.MultiIndex(levels=[[col_name], ["mean"]], codes=[[0], [0]]), + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 6c222669c37db..87c6677c72a72 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1903,7 +1903,7 @@ def test_pivot_table_not_series(self): tm.assert_frame_equal(result, expected) - def test_pivot_margins_name_unicode(self): + def test_pivot_margins_name_unicode(self, using_hom_api): # issue #13292 greek = "\u0394\u03bf\u03ba\u03b9\u03bc\u03ae" frame = DataFrame({"foo": [1, 2, 3]}) @@ -1911,8 +1911,14 @@ def test_pivot_margins_name_unicode(self): frame, index=["foo"], aggfunc=len, margins=True, margins_name=greek ) index = Index([1, 2, 3, greek], dtype="object", name="foo") - expected = DataFrame(index=index) - tm.assert_frame_equal(table, expected) + + if using_hom_api: + expected = Series([1, 1, 1, 3], index=index) + expected.index.name = None + tm.assert_series_equal(table, expected) + else: + expected = DataFrame(index=index) + tm.assert_frame_equal(table, expected) def test_pivot_string_as_func(self): # GH #18713 diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index f84a579247630..b12dec7c25b03 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -79,7 +79,7 @@ def test_skip_sum_object_raises(): tm.assert_frame_equal(result, expected) -def test_agg(): +def test_agg(using_hom_api): df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) r = df.rolling(window=3) @@ -90,8 +90,12 @@ def test_agg(): b_std = r["B"].std() result = r.aggregate([np.mean, np.std]) - expected = concat([a_mean, a_std, b_mean, b_std], axis=1) - expected.columns = MultiIndex.from_product([["A", "B"], ["mean", "std"]]) + if using_hom_api: + expected = concat([a_mean, b_mean, a_std, b_std], axis=1) + expected.columns = MultiIndex.from_product([["mean", "std"], ["A", "B"]]) + else: + expected = concat([a_mean, a_std, b_mean, b_std], axis=1) + expected.columns = MultiIndex.from_product([["A", "B"], ["mean", "std"]]) 
tm.assert_frame_equal(result, expected) result = r.aggregate({"A": np.mean, "B": np.std}) @@ -141,13 +145,16 @@ def test_agg_apply(raw): tm.assert_frame_equal(result, expected, check_like=True) -def test_agg_consistency(): +def test_agg_consistency(using_hom_api): df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) r = df.rolling(window=3) result = r.agg([np.sum, np.mean]).columns - expected = MultiIndex.from_product([list("AB"), ["sum", "mean"]]) + if using_hom_api: + expected = MultiIndex.from_product([["sum", "mean"], list("AB")]) + else: + expected = MultiIndex.from_product([list("AB"), ["sum", "mean"]]) tm.assert_index_equal(result, expected) result = r["A"].agg([np.sum, np.mean]).columns
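The column-level swap asserted in the groupby, resample, and window test updates above follows from the same mechanism. A minimal sketch with public API only (not the patched ``aggregate`` itself), showing why the aggregation function now sits in the outer column level for a call like ``r.aggregate([np.mean, np.std])``:

    import pandas as pd

    df = pd.DataFrame({"A": range(5), "B": range(0, 10, 2)})
    r = df.rolling(window=3)

    # Old layout: original columns in the outer level, e.g.
    # ("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std").
    # New layout: one full result per aggregation function, concatenated with
    # the function name as the key, so the function becomes the outer level.
    new_style = pd.concat([r.mean(), r.std()], keys=["mean", "std"], axis=1)
    print(new_style.columns.tolist())
    # [('mean', 'A'), ('mean', 'B'), ('std', 'A'), ('std', 'B')]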