From c2d9dabf98dbba1e86182e8e05065155eddb60a1 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 6 Oct 2023 14:37:58 -0400 Subject: [PATCH 1/3] DEPR: pandas.core for groupby --- pandas/core/common.py | 12 ++++++++++++ pandas/meson.build | 1 + pandas/tests/api/test_api.py | 20 ++++++++++++++++++++ scripts/validate_unwanted_patterns.py | 2 ++ 4 files changed, 35 insertions(+) diff --git a/pandas/core/common.py b/pandas/core/common.py index 8fd8b10c6fc32..1c86706d58f88 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -650,3 +650,15 @@ def fill_missing_names(names: Sequence[Hashable | None]) -> list[Hashable]: list of column names with the None values replaced. """ return [f"level_{i}" if name is None else name for i, name in enumerate(names)] + + +def _depr_core() -> None: + warnings.warn( + "pandas.core is deprecated and has been renamed to " + "pandas._core. Accessing `_core` directly is discouraged as " + "members can change without warning. You should use a public module " + "instead that exports the attribute in question. If you still would " + "like to access an attribute from it, please use pandas._core.", + DeprecationWarning, + stacklevel=3, + ) diff --git a/pandas/meson.build b/pandas/meson.build index 435103a954d86..13923706ec57a 100644 --- a/pandas/meson.build +++ b/pandas/meson.build @@ -26,6 +26,7 @@ subdir('_libs') subdirs_list = [ '_config', + '_core', '_testing', 'api', 'arrays', diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 60bcb97aaa364..bc50726a1b5f9 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -381,3 +381,23 @@ def test_pandas_array_alias(): res = pd.arrays.PandasArray assert res is pd.arrays.NumpyExtensionArray + + +@pytest.mark.parametrize("submodule_name", ["groupby"]) +def test_depr_pandas_core_submodule(submodule_name): + # GH#27522 + + submodule = getattr(pd._core, submodule_name) + warning_msg = "pandas.core is deprecated" + for submodule_member_name in dir(submodule): + if submodule_member_name.startswith("__") and submodule_member_name.endswith( + "__" + ): + continue + submodule_member = getattr(submodule, submodule_member_name) + with tm.assert_produces_warning(DeprecationWarning, match=warning_msg): + core_submodule = __import__( + f"pandas.core.{submodule_name}", fromlist=[submodule_member_name] + ) + with tm.assert_produces_warning(DeprecationWarning, match=warning_msg): + assert submodule_member is getattr(core_submodule, submodule_member_name) diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 6e6251425928d..7ac665ea30746 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -55,6 +55,8 @@ "_iLocIndexer", # TODO(3.0): GH#55043 - remove upon removal of ArrayManager "_get_option", + # GH#27522 + "_depr_core", } From 61becb724e939cbdccadcc8b77a14da9db580a4f Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 15 Nov 2023 20:32:10 -0500 Subject: [PATCH 2/3] Automated changes --- ci/code_checks.sh | 24 +- doc/redirects.csv | 140 +- doc/source/whatsnew/v0.20.0.rst | 2 +- doc/source/whatsnew/v0.25.1.rst | 2 +- doc/source/whatsnew/v1.0.0.rst | 24 +- doc/source/whatsnew/v1.1.0.rst | 6 +- pandas/_core/groupby/__init__.py | 15 + pandas/_core/groupby/base.py | 121 + pandas/_core/groupby/categorical.py | 87 + pandas/_core/groupby/generic.py | 2867 ++++++++ pandas/_core/groupby/groupby.py | 5951 ++++++++++++++++ pandas/_core/groupby/grouper.py | 1072 +++ pandas/_core/groupby/indexing.py | 304 
+ pandas/_core/groupby/numba_.py | 181 + pandas/_core/groupby/ops.py | 1215 ++++ pandas/_typing.py | 10 +- pandas/api/typing/__init__.py | 2 +- pandas/core/api.py | 8 +- pandas/core/apply.py | 8 +- pandas/core/arrays/base.py | 2 +- pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/masked.py | 2 +- pandas/core/frame.py | 4 +- pandas/core/generic.py | 4 +- pandas/core/groupby/__init__.py | 27 +- pandas/core/groupby/base.py | 122 +- pandas/core/groupby/categorical.py | 88 +- pandas/core/groupby/generic.py | 2868 +------- pandas/core/groupby/groupby.py | 5952 +---------------- pandas/core/groupby/grouper.py | 1073 +-- pandas/core/groupby/indexing.py | 305 +- pandas/core/groupby/numba_.py | 182 +- pandas/core/groupby/ops.py | 1216 +--- pandas/core/resample.py | 20 +- pandas/core/reshape/merge.py | 2 +- pandas/core/reshape/pivot.py | 2 +- pandas/core/series.py | 4 +- pandas/core/window/rolling.py | 4 +- pandas/plotting/_core.py | 2 +- pandas/tests/apply/common.py | 2 +- .../tests/groupby/aggregate/test_aggregate.py | 2 +- pandas/tests/groupby/conftest.py | 4 +- pandas/tests/groupby/test_api.py | 6 +- pandas/tests/groupby/test_grouping.py | 2 +- pandas/tests/groupby/test_timegrouper.py | 4 +- pandas/tests/resample/test_base.py | 4 +- pandas/tests/resample/test_datetime_index.py | 2 +- pandas/tests/resample/test_time_grouper.py | 2 +- pandas/tests/window/test_groupby.py | 2 +- pyproject.toml | 10 +- pyright_reportGeneralTypeIssues.json | 8 +- 52 files changed, 12035 insertions(+), 11935 deletions(-) create mode 100644 pandas/_core/groupby/__init__.py create mode 100644 pandas/_core/groupby/base.py create mode 100644 pandas/_core/groupby/categorical.py create mode 100644 pandas/_core/groupby/generic.py create mode 100644 pandas/_core/groupby/groupby.py create mode 100644 pandas/_core/groupby/grouper.py create mode 100644 pandas/_core/groupby/indexing.py create mode 100644 pandas/_core/groupby/numba_.py create mode 100644 pandas/_core/groupby/ops.py diff --git a/ci/code_checks.sh b/ci/code_checks.sh index e91629744463f..2b22da913a067 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -115,18 +115,18 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.DatetimeIndex.day_name \ pandas.core.window.rolling.Rolling.corr \ pandas.Grouper \ - pandas.core.groupby.SeriesGroupBy.apply \ - pandas.core.groupby.DataFrameGroupBy.apply \ - pandas.core.groupby.SeriesGroupBy.transform \ - pandas.core.groupby.SeriesGroupBy.pipe \ - pandas.core.groupby.DataFrameGroupBy.pipe \ - pandas.core.groupby.DataFrameGroupBy.describe \ - pandas.core.groupby.DataFrameGroupBy.idxmax \ - pandas.core.groupby.DataFrameGroupBy.idxmin \ - pandas.core.groupby.DataFrameGroupBy.value_counts \ - pandas.core.groupby.SeriesGroupBy.describe \ - pandas.core.groupby.DataFrameGroupBy.boxplot \ - pandas.core.groupby.DataFrameGroupBy.hist \ + pandas._core.groupby.SeriesGroupBy.apply \ + pandas._core.groupby.DataFrameGroupBy.apply \ + pandas._core.groupby.SeriesGroupBy.transform \ + pandas._core.groupby.SeriesGroupBy.pipe \ + pandas._core.groupby.DataFrameGroupBy.pipe \ + pandas._core.groupby.DataFrameGroupBy.describe \ + pandas._core.groupby.DataFrameGroupBy.idxmax \ + pandas._core.groupby.DataFrameGroupBy.idxmin \ + pandas._core.groupby.DataFrameGroupBy.value_counts \ + pandas._core.groupby.SeriesGroupBy.describe \ + pandas._core.groupby.DataFrameGroupBy.boxplot \ + pandas._core.groupby.DataFrameGroupBy.hist \ pandas.io.formats.style.Styler.map \ 
pandas.io.formats.style.Styler.apply_index \ pandas.io.formats.style.Styler.map_index \ diff --git a/doc/redirects.csv b/doc/redirects.csv index bd60cc6a732bd..ce48a8ccc91af 100644 --- a/doc/redirects.csv +++ b/doc/redirects.csv @@ -164,76 +164,76 @@ generated/pandas.CategoricalIndex.reorder_categories,../reference/api/pandas.Cat generated/pandas.CategoricalIndex.set_categories,../reference/api/pandas.CategoricalIndex.set_categories generated/pandas.Categorical.ordered,../reference/api/pandas.Categorical.ordered generated/pandas.concat,../reference/api/pandas.concat -generated/pandas.core.groupby.DataFrameGroupBy.all,../reference/api/pandas.core.groupby.DataFrameGroupBy.all -generated/pandas.core.groupby.DataFrameGroupBy.any,../reference/api/pandas.core.groupby.DataFrameGroupBy.any -generated/pandas.core.groupby.DataFrameGroupBy.bfill,../reference/api/pandas.core.groupby.DataFrameGroupBy.bfill -generated/pandas.core.groupby.DataFrameGroupBy.boxplot,../reference/api/pandas.core.groupby.DataFrameGroupBy.boxplot -generated/pandas.core.groupby.DataFrameGroupBy.corr,../reference/api/pandas.core.groupby.DataFrameGroupBy.corr -generated/pandas.core.groupby.DataFrameGroupBy.corrwith,../reference/api/pandas.core.groupby.DataFrameGroupBy.corrwith -generated/pandas.core.groupby.DataFrameGroupBy.count,../reference/api/pandas.core.groupby.DataFrameGroupBy.count -generated/pandas.core.groupby.DataFrameGroupBy.cov,../reference/api/pandas.core.groupby.DataFrameGroupBy.cov -generated/pandas.core.groupby.DataFrameGroupBy.cummax,../reference/api/pandas.core.groupby.DataFrameGroupBy.cummax -generated/pandas.core.groupby.DataFrameGroupBy.cummin,../reference/api/pandas.core.groupby.DataFrameGroupBy.cummin -generated/pandas.core.groupby.DataFrameGroupBy.cumprod,../reference/api/pandas.core.groupby.DataFrameGroupBy.cumprod -generated/pandas.core.groupby.DataFrameGroupBy.cumsum,../reference/api/pandas.core.groupby.DataFrameGroupBy.cumsum -generated/pandas.core.groupby.DataFrameGroupBy.describe,../reference/api/pandas.core.groupby.DataFrameGroupBy.describe -generated/pandas.core.groupby.DataFrameGroupBy.diff,../reference/api/pandas.core.groupby.DataFrameGroupBy.diff -generated/pandas.core.groupby.DataFrameGroupBy.ffill,../reference/api/pandas.core.groupby.DataFrameGroupBy.ffill -generated/pandas.core.groupby.DataFrameGroupBy.fillna,../reference/api/pandas.core.groupby.DataFrameGroupBy.fillna -generated/pandas.core.groupby.DataFrameGroupBy.filter,../reference/api/pandas.core.groupby.DataFrameGroupBy.filter -generated/pandas.core.groupby.DataFrameGroupBy.hist,../reference/api/pandas.core.groupby.DataFrameGroupBy.hist -generated/pandas.core.groupby.DataFrameGroupBy.idxmax,../reference/api/pandas.core.groupby.DataFrameGroupBy.idxmax -generated/pandas.core.groupby.DataFrameGroupBy.idxmin,../reference/api/pandas.core.groupby.DataFrameGroupBy.idxmin -generated/pandas.core.groupby.DataFrameGroupBy.pct_change,../reference/api/pandas.core.groupby.DataFrameGroupBy.pct_change -generated/pandas.core.groupby.DataFrameGroupBy.plot,../reference/api/pandas.core.groupby.DataFrameGroupBy.plot -generated/pandas.core.groupby.DataFrameGroupBy.quantile,../reference/api/pandas.core.groupby.DataFrameGroupBy.quantile -generated/pandas.core.groupby.DataFrameGroupBy.rank,../reference/api/pandas.core.groupby.DataFrameGroupBy.rank -generated/pandas.core.groupby.DataFrameGroupBy.resample,../reference/api/pandas.core.groupby.DataFrameGroupBy.resample 
-generated/pandas.core.groupby.DataFrameGroupBy.shift,../reference/api/pandas.core.groupby.DataFrameGroupBy.shift -generated/pandas.core.groupby.DataFrameGroupBy.size,../reference/api/pandas.core.groupby.DataFrameGroupBy.size -generated/pandas.core.groupby.DataFrameGroupBy.skew,../reference/api/pandas.core.groupby.DataFrameGroupBy.skew -generated/pandas.core.groupby.DataFrameGroupBy.take,../reference/api/pandas.core.groupby.DataFrameGroupBy.take -generated/pandas.core.groupby.GroupBy.agg,../reference/api/pandas.core.groupby.GroupBy.agg -generated/pandas.core.groupby.GroupBy.aggregate,../reference/api/pandas.core.groupby.GroupBy.aggregate -generated/pandas.core.groupby.GroupBy.all,../reference/api/pandas.core.groupby.GroupBy.all -generated/pandas.core.groupby.GroupBy.any,../reference/api/pandas.core.groupby.GroupBy.any -generated/pandas.core.groupby.GroupBy.apply,../reference/api/pandas.core.groupby.GroupBy.apply -generated/pandas.core.groupby.GroupBy.bfill,../reference/api/pandas.core.groupby.GroupBy.bfill -generated/pandas.core.groupby.GroupBy.count,../reference/api/pandas.core.groupby.GroupBy.count -generated/pandas.core.groupby.GroupBy.cumcount,../reference/api/pandas.core.groupby.GroupBy.cumcount -generated/pandas.core.groupby.GroupBy.ffill,../reference/api/pandas.core.groupby.GroupBy.ffill -generated/pandas.core.groupby.GroupBy.first,../reference/api/pandas.core.groupby.GroupBy.first -generated/pandas.core.groupby.GroupBy.get_group,../reference/api/pandas.core.groupby.GroupBy.get_group -generated/pandas.core.groupby.GroupBy.groups,../reference/api/pandas.core.groupby.GroupBy.groups -generated/pandas.core.groupby.GroupBy.head,../reference/api/pandas.core.groupby.GroupBy.head -generated/pandas.core.groupby.GroupBy.indices,../reference/api/pandas.core.groupby.GroupBy.indices -generated/pandas.core.groupby.GroupBy.__iter__,../reference/api/pandas.core.groupby.GroupBy.__iter__ -generated/pandas.core.groupby.GroupBy.last,../reference/api/pandas.core.groupby.GroupBy.last -generated/pandas.core.groupby.GroupBy.max,../reference/api/pandas.core.groupby.GroupBy.max -generated/pandas.core.groupby.GroupBy.mean,../reference/api/pandas.core.groupby.GroupBy.mean -generated/pandas.core.groupby.GroupBy.median,../reference/api/pandas.core.groupby.GroupBy.median -generated/pandas.core.groupby.GroupBy.min,../reference/api/pandas.core.groupby.GroupBy.min -generated/pandas.core.groupby.GroupBy.ngroup,../reference/api/pandas.core.groupby.GroupBy.ngroup -generated/pandas.core.groupby.GroupBy.nth,../reference/api/pandas.core.groupby.GroupBy.nth -generated/pandas.core.groupby.GroupBy.ohlc,../reference/api/pandas.core.groupby.GroupBy.ohlc -generated/pandas.core.groupby.GroupBy.pct_change,../reference/api/pandas.core.groupby.GroupBy.pct_change -generated/pandas.core.groupby.GroupBy.pipe,../reference/api/pandas.core.groupby.GroupBy.pipe -generated/pandas.core.groupby.GroupBy.prod,../reference/api/pandas.core.groupby.GroupBy.prod -generated/pandas.core.groupby.GroupBy.rank,../reference/api/pandas.core.groupby.GroupBy.rank -generated/pandas.core.groupby.GroupBy.sem,../reference/api/pandas.core.groupby.GroupBy.sem -generated/pandas.core.groupby.GroupBy.size,../reference/api/pandas.core.groupby.GroupBy.size -generated/pandas.core.groupby.GroupBy.std,../reference/api/pandas.core.groupby.GroupBy.std -generated/pandas.core.groupby.GroupBy.sum,../reference/api/pandas.core.groupby.GroupBy.sum -generated/pandas.core.groupby.GroupBy.tail,../reference/api/pandas.core.groupby.GroupBy.tail 
-generated/pandas.core.groupby.GroupBy.transform,../reference/api/pandas.core.groupby.GroupBy.transform -generated/pandas.core.groupby.GroupBy.var,../reference/api/pandas.core.groupby.GroupBy.var -generated/pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing,../reference/api/pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing -generated/pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing,../reference/api/pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing -generated/pandas.core.groupby.SeriesGroupBy.nlargest,../reference/api/pandas.core.groupby.SeriesGroupBy.nlargest -generated/pandas.core.groupby.SeriesGroupBy.nsmallest,../reference/api/pandas.core.groupby.SeriesGroupBy.nsmallest -generated/pandas.core.groupby.SeriesGroupBy.nunique,../reference/api/pandas.core.groupby.SeriesGroupBy.nunique -generated/pandas.core.groupby.SeriesGroupBy.unique,../reference/api/pandas.core.groupby.SeriesGroupBy.unique -generated/pandas.core.groupby.SeriesGroupBy.value_counts,../reference/api/pandas.core.groupby.SeriesGroupBy.value_counts +generated/pandas._core.groupby.DataFrameGroupBy.all,../reference/api/pandas._core.groupby.DataFrameGroupBy.all +generated/pandas._core.groupby.DataFrameGroupBy.any,../reference/api/pandas._core.groupby.DataFrameGroupBy.any +generated/pandas._core.groupby.DataFrameGroupBy.bfill,../reference/api/pandas._core.groupby.DataFrameGroupBy.bfill +generated/pandas._core.groupby.DataFrameGroupBy.boxplot,../reference/api/pandas._core.groupby.DataFrameGroupBy.boxplot +generated/pandas._core.groupby.DataFrameGroupBy.corr,../reference/api/pandas._core.groupby.DataFrameGroupBy.corr +generated/pandas._core.groupby.DataFrameGroupBy.corrwith,../reference/api/pandas._core.groupby.DataFrameGroupBy.corrwith +generated/pandas._core.groupby.DataFrameGroupBy.count,../reference/api/pandas._core.groupby.DataFrameGroupBy.count +generated/pandas._core.groupby.DataFrameGroupBy.cov,../reference/api/pandas._core.groupby.DataFrameGroupBy.cov +generated/pandas._core.groupby.DataFrameGroupBy.cummax,../reference/api/pandas._core.groupby.DataFrameGroupBy.cummax +generated/pandas._core.groupby.DataFrameGroupBy.cummin,../reference/api/pandas._core.groupby.DataFrameGroupBy.cummin +generated/pandas._core.groupby.DataFrameGroupBy.cumprod,../reference/api/pandas._core.groupby.DataFrameGroupBy.cumprod +generated/pandas._core.groupby.DataFrameGroupBy.cumsum,../reference/api/pandas._core.groupby.DataFrameGroupBy.cumsum +generated/pandas._core.groupby.DataFrameGroupBy.describe,../reference/api/pandas._core.groupby.DataFrameGroupBy.describe +generated/pandas._core.groupby.DataFrameGroupBy.diff,../reference/api/pandas._core.groupby.DataFrameGroupBy.diff +generated/pandas._core.groupby.DataFrameGroupBy.ffill,../reference/api/pandas._core.groupby.DataFrameGroupBy.ffill +generated/pandas._core.groupby.DataFrameGroupBy.fillna,../reference/api/pandas._core.groupby.DataFrameGroupBy.fillna +generated/pandas._core.groupby.DataFrameGroupBy.filter,../reference/api/pandas._core.groupby.DataFrameGroupBy.filter +generated/pandas._core.groupby.DataFrameGroupBy.hist,../reference/api/pandas._core.groupby.DataFrameGroupBy.hist +generated/pandas._core.groupby.DataFrameGroupBy.idxmax,../reference/api/pandas._core.groupby.DataFrameGroupBy.idxmax +generated/pandas._core.groupby.DataFrameGroupBy.idxmin,../reference/api/pandas._core.groupby.DataFrameGroupBy.idxmin +generated/pandas._core.groupby.DataFrameGroupBy.pct_change,../reference/api/pandas._core.groupby.DataFrameGroupBy.pct_change 
+generated/pandas._core.groupby.DataFrameGroupBy.plot,../reference/api/pandas._core.groupby.DataFrameGroupBy.plot +generated/pandas._core.groupby.DataFrameGroupBy.quantile,../reference/api/pandas._core.groupby.DataFrameGroupBy.quantile +generated/pandas._core.groupby.DataFrameGroupBy.rank,../reference/api/pandas._core.groupby.DataFrameGroupBy.rank +generated/pandas._core.groupby.DataFrameGroupBy.resample,../reference/api/pandas._core.groupby.DataFrameGroupBy.resample +generated/pandas._core.groupby.DataFrameGroupBy.shift,../reference/api/pandas._core.groupby.DataFrameGroupBy.shift +generated/pandas._core.groupby.DataFrameGroupBy.size,../reference/api/pandas._core.groupby.DataFrameGroupBy.size +generated/pandas._core.groupby.DataFrameGroupBy.skew,../reference/api/pandas._core.groupby.DataFrameGroupBy.skew +generated/pandas._core.groupby.DataFrameGroupBy.take,../reference/api/pandas._core.groupby.DataFrameGroupBy.take +generated/pandas._core.groupby.GroupBy.agg,../reference/api/pandas._core.groupby.GroupBy.agg +generated/pandas._core.groupby.GroupBy.aggregate,../reference/api/pandas._core.groupby.GroupBy.aggregate +generated/pandas._core.groupby.GroupBy.all,../reference/api/pandas._core.groupby.GroupBy.all +generated/pandas._core.groupby.GroupBy.any,../reference/api/pandas._core.groupby.GroupBy.any +generated/pandas._core.groupby.GroupBy.apply,../reference/api/pandas._core.groupby.GroupBy.apply +generated/pandas._core.groupby.GroupBy.bfill,../reference/api/pandas._core.groupby.GroupBy.bfill +generated/pandas._core.groupby.GroupBy.count,../reference/api/pandas._core.groupby.GroupBy.count +generated/pandas._core.groupby.GroupBy.cumcount,../reference/api/pandas._core.groupby.GroupBy.cumcount +generated/pandas._core.groupby.GroupBy.ffill,../reference/api/pandas._core.groupby.GroupBy.ffill +generated/pandas._core.groupby.GroupBy.first,../reference/api/pandas._core.groupby.GroupBy.first +generated/pandas._core.groupby.GroupBy.get_group,../reference/api/pandas._core.groupby.GroupBy.get_group +generated/pandas._core.groupby.GroupBy.groups,../reference/api/pandas._core.groupby.GroupBy.groups +generated/pandas._core.groupby.GroupBy.head,../reference/api/pandas._core.groupby.GroupBy.head +generated/pandas._core.groupby.GroupBy.indices,../reference/api/pandas._core.groupby.GroupBy.indices +generated/pandas._core.groupby.GroupBy.__iter__,../reference/api/pandas._core.groupby.GroupBy.__iter__ +generated/pandas._core.groupby.GroupBy.last,../reference/api/pandas._core.groupby.GroupBy.last +generated/pandas._core.groupby.GroupBy.max,../reference/api/pandas._core.groupby.GroupBy.max +generated/pandas._core.groupby.GroupBy.mean,../reference/api/pandas._core.groupby.GroupBy.mean +generated/pandas._core.groupby.GroupBy.median,../reference/api/pandas._core.groupby.GroupBy.median +generated/pandas._core.groupby.GroupBy.min,../reference/api/pandas._core.groupby.GroupBy.min +generated/pandas._core.groupby.GroupBy.ngroup,../reference/api/pandas._core.groupby.GroupBy.ngroup +generated/pandas._core.groupby.GroupBy.nth,../reference/api/pandas._core.groupby.GroupBy.nth +generated/pandas._core.groupby.GroupBy.ohlc,../reference/api/pandas._core.groupby.GroupBy.ohlc +generated/pandas._core.groupby.GroupBy.pct_change,../reference/api/pandas._core.groupby.GroupBy.pct_change +generated/pandas._core.groupby.GroupBy.pipe,../reference/api/pandas._core.groupby.GroupBy.pipe +generated/pandas._core.groupby.GroupBy.prod,../reference/api/pandas._core.groupby.GroupBy.prod 
+generated/pandas._core.groupby.GroupBy.rank,../reference/api/pandas._core.groupby.GroupBy.rank +generated/pandas._core.groupby.GroupBy.sem,../reference/api/pandas._core.groupby.GroupBy.sem +generated/pandas._core.groupby.GroupBy.size,../reference/api/pandas._core.groupby.GroupBy.size +generated/pandas._core.groupby.GroupBy.std,../reference/api/pandas._core.groupby.GroupBy.std +generated/pandas._core.groupby.GroupBy.sum,../reference/api/pandas._core.groupby.GroupBy.sum +generated/pandas._core.groupby.GroupBy.tail,../reference/api/pandas._core.groupby.GroupBy.tail +generated/pandas._core.groupby.GroupBy.transform,../reference/api/pandas._core.groupby.GroupBy.transform +generated/pandas._core.groupby.GroupBy.var,../reference/api/pandas._core.groupby.GroupBy.var +generated/pandas._core.groupby.SeriesGroupBy.is_monotonic_decreasing,../reference/api/pandas._core.groupby.SeriesGroupBy.is_monotonic_decreasing +generated/pandas._core.groupby.SeriesGroupBy.is_monotonic_increasing,../reference/api/pandas._core.groupby.SeriesGroupBy.is_monotonic_increasing +generated/pandas._core.groupby.SeriesGroupBy.nlargest,../reference/api/pandas._core.groupby.SeriesGroupBy.nlargest +generated/pandas._core.groupby.SeriesGroupBy.nsmallest,../reference/api/pandas._core.groupby.SeriesGroupBy.nsmallest +generated/pandas._core.groupby.SeriesGroupBy.nunique,../reference/api/pandas._core.groupby.SeriesGroupBy.nunique +generated/pandas._core.groupby.SeriesGroupBy.unique,../reference/api/pandas._core.groupby.SeriesGroupBy.unique +generated/pandas._core.groupby.SeriesGroupBy.value_counts,../reference/api/pandas._core.groupby.SeriesGroupBy.value_counts generated/pandas.core.resample.Resampler.aggregate,../reference/api/pandas.core.resample.Resampler.aggregate generated/pandas.core.resample.Resampler.apply,../reference/api/pandas.core.resample.Resampler.apply generated/pandas.core.resample.Resampler.asfreq,../reference/api/pandas.core.resample.Resampler.asfreq diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 09bf5428d0432..678f0c5dfa621 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -1646,7 +1646,7 @@ Performance improvements - Improved performance when using ``.unstack()`` (:issue:`15503`) - Improved performance of merge/join on ``category`` columns (:issue:`10409`) - Improved performance of ``drop_duplicates()`` on ``bool`` columns (:issue:`12963`) -- Improve performance of ``pd.core.groupby.GroupBy.apply`` when the applied +- Improve performance of ``pd._core.groupby.GroupBy.apply`` when the applied function used the ``.name`` attribute of the group DataFrame (:issue:`15062`). - Improved performance of ``iloc`` indexing with a list or array (:issue:`15504`). 
- Improved performance of ``Series.sort_index()`` with a monotonic index (:issue:`15694`) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 67017d7c9fb29..534bc9f33c9ec 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -85,7 +85,7 @@ Plotting GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Fixed regression in :meth:`pands.core.groupby.DataFrameGroupBy.quantile` raising when multiple quantiles are given (:issue:`27526`) +- Fixed regression in :meth:`pands._core.groupby.DataFrameGroupBy.quantile` raising when multiple quantiles are given (:issue:`27526`) - Bug in :meth:`.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`) - Bug in :meth:`.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`) - Bug in windowing over read-only arrays (:issue:`27766`) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 94a8ee7cd1a5d..1afc30ddca5e7 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -918,7 +918,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. - :meth:`DataFrame.to_excel` and :meth:`Series.to_excel` with non-existent columns will no longer reindex (:issue:`17295`) - Removed the previously deprecated keyword "join_axes" from :func:`concat`; use ``reindex_like`` on the result instead (:issue:`22318`) - Removed the previously deprecated keyword "by" from :meth:`DataFrame.sort_index`, use :meth:`DataFrame.sort_values` instead (:issue:`10726`) -- Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`core.groupby.DataFrameGroupBy.aggregate`, :meth:`core.groupby.SeriesGroupBy.aggregate`, :meth:`core.window.rolling.Rolling.aggregate` (:issue:`18529`) +- Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`_core.groupby.DataFrameGroupBy.aggregate`, :meth:`_core.groupby.SeriesGroupBy.aggregate`, :meth:`core.window.rolling.Rolling.aggregate` (:issue:`18529`) - Passing ``datetime64`` data to :class:`TimedeltaIndex` or ``timedelta64`` data to ``DatetimeIndex`` now raises ``TypeError`` (:issue:`23539`, :issue:`23937`) - Passing ``int64`` values to :class:`DatetimeIndex` and a timezone now interprets the values as nanosecond timestamps in UTC, not wall times in the given timezone (:issue:`24559`) - A tuple passed to :meth:`DataFrame.groupby` is now exclusively treated as a single key (:issue:`18314`) @@ -958,7 +958,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. 
- Removed the previously deprecated ``FrozenNDArray`` class in ``pandas.core.indexes.frozen`` (:issue:`29335`) - Removed the previously deprecated keyword "nthreads" from :func:`read_feather`, use "use_threads" instead (:issue:`23053`) - Removed ``Index.is_lexsorted_for_tuple`` (:issue:`29305`) -- Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`core.groupby.DataFrameGroupBy.aggregate`, :meth:`core.groupby.SeriesGroupBy.aggregate`, :meth:`core.window.rolling.Rolling.aggregate` (:issue:`29608`) +- Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`_core.groupby.DataFrameGroupBy.aggregate`, :meth:`_core.groupby.SeriesGroupBy.aggregate`, :meth:`core.window.rolling.Rolling.aggregate` (:issue:`29608`) - Removed ``Series.valid``; use :meth:`Series.dropna` instead (:issue:`18800`) - Removed ``DataFrame.is_copy``, ``Series.is_copy`` (:issue:`18812`) - Removed ``DataFrame.get_ftype_counts``, ``Series.get_ftype_counts`` (:issue:`18243`) @@ -1053,10 +1053,10 @@ Categorical - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` that would give incorrect results on categorical data (:issue:`26988`) - Bug where calling :meth:`Categorical.min` or :meth:`Categorical.max` on an empty Categorical would raise a numpy exception (:issue:`30227`) - The following methods now also correctly output values for unobserved categories when called through ``groupby(..., observed=False)`` (:issue:`17605`) - * :meth:`core.groupby.SeriesGroupBy.count` - * :meth:`core.groupby.SeriesGroupBy.size` - * :meth:`core.groupby.SeriesGroupBy.nunique` - * :meth:`core.groupby.SeriesGroupBy.nth` + * :meth:`_core.groupby.SeriesGroupBy.count` + * :meth:`_core.groupby.SeriesGroupBy.size` + * :meth:`_core.groupby.SeriesGroupBy.nunique` + * :meth:`_core.groupby.SeriesGroupBy.nth` Datetimelike @@ -1065,14 +1065,14 @@ Datetimelike - Bug in :meth:`Series.dt` property lookups when the underlying data is read-only (:issue:`27529`) - Bug in ``HDFStore.__getitem__`` incorrectly reading tz attribute created in Python 2 (:issue:`26443`) - Bug in :func:`to_datetime` where passing arrays of malformed ``str`` with errors="coerce" could incorrectly lead to raising ``ValueError`` (:issue:`28299`) -- Bug in :meth:`core.groupby.SeriesGroupBy.nunique` where ``NaT`` values were interfering with the count of unique values (:issue:`27951`) +- Bug in :meth:`_core.groupby.SeriesGroupBy.nunique` where ``NaT`` values were interfering with the count of unique values (:issue:`27951`) - Bug in :class:`Timestamp` subtraction when subtracting a :class:`Timestamp` from a ``np.datetime64`` object incorrectly raising ``TypeError`` (:issue:`28286`) - Addition and subtraction of integer or integer-dtype arrays with :class:`Timestamp` will now raise ``NullFrequencyError`` instead of ``ValueError`` (:issue:`28268`) - Bug in :class:`Series` and :class:`DataFrame` with integer dtype failing to raise ``TypeError`` when adding or subtracting a ``np.datetime64`` object (:issue:`28080`) - Bug in :meth:`Series.astype`, :meth:`Index.astype`, and :meth:`DataFrame.astype` failing to handle ``NaT`` when casting to an integer dtype (:issue:`28492`) - Bug in :class:`Week` with ``weekday`` incorrectly raising ``AttributeError`` instead of ``TypeError`` when adding or subtracting an invalid type (:issue:`28530`) - Bug in :class:`DataFrame` arithmetic operations when operating with a :class:`Series` with dtype ``'timedelta64[ns]'`` (:issue:`28049`) -- Bug in 
:func:`core.groupby.generic.SeriesGroupBy.apply` raising ``ValueError`` when a column in the original DataFrame is a datetime and the column labels are not standard integers (:issue:`28247`) +- Bug in :func:`_core.groupby.generic.SeriesGroupBy.apply` raising ``ValueError`` when a column in the original DataFrame is a datetime and the column labels are not standard integers (:issue:`28247`) - Bug in :func:`pandas._config.localization.get_locales` where the ``locales -a`` encodes the locales list as windows-1252 (:issue:`23638`, :issue:`24760`, :issue:`27368`) - Bug in :meth:`Series.var` failing to raise ``TypeError`` when called with ``timedelta64[ns]`` dtype (:issue:`28289`) - Bug in :meth:`DatetimeIndex.strftime` and :meth:`Series.dt.strftime` where ``NaT`` was converted to the string ``'NaT'`` instead of ``np.nan`` (:issue:`29578`) @@ -1215,20 +1215,20 @@ Plotting GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in :meth:`core.groupby.DataFrameGroupBy.apply` only showing output from a single group when function returns an :class:`Index` (:issue:`28652`) +- Bug in :meth:`_core.groupby.DataFrameGroupBy.apply` only showing output from a single group when function returns an :class:`Index` (:issue:`28652`) - Bug in :meth:`DataFrame.groupby` with multiple groups where an ``IndexError`` would be raised if any group contained all NA values (:issue:`20519`) - Bug in :meth:`.Resampler.size` and :meth:`.Resampler.count` returning wrong dtype when used with an empty :class:`Series` or :class:`DataFrame` (:issue:`28427`) - Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue:`28192`) - Bug in :meth:`DataFrame.rolling` not allowing rolling over multi-index levels (:issue:`15584`). - Bug in :meth:`DataFrame.rolling` not allowing rolling on monotonic decreasing time indexes (:issue:`19248`). - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) -- Bug in :meth:`core.groupby.DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) +- Bug in :meth:`_core.groupby.DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`) - Remove error raised due to duplicated input functions in named aggregation in :meth:`DataFrame.groupby` and :meth:`Series.groupby`. Previously error will be raised if the same function is applied on the same column and now it is allowed if new assigned names are different. 
(:issue:`28426`) -- :meth:`core.groupby.SeriesGroupBy.value_counts` will be able to handle the case even when the :class:`Grouper` makes empty groups (:issue:`28479`) +- :meth:`_core.groupby.SeriesGroupBy.value_counts` will be able to handle the case even when the :class:`Grouper` makes empty groups (:issue:`28479`) - Bug in :meth:`core.window.rolling.Rolling.quantile` ignoring ``interpolation`` keyword argument when used within a groupby (:issue:`28779`) - Bug in :meth:`DataFrame.groupby` where ``any``, ``all``, ``nunique`` and transform functions would incorrectly handle duplicate column labels (:issue:`21668`) -- Bug in :meth:`core.groupby.DataFrameGroupBy.agg` with timezone-aware datetime64 column incorrectly casting results to the original dtype (:issue:`29641`) +- Bug in :meth:`_core.groupby.DataFrameGroupBy.agg` with timezone-aware datetime64 column incorrectly casting results to the original dtype (:issue:`29641`) - Bug in :meth:`DataFrame.groupby` when using axis=1 and having a single level columns index (:issue:`30208`) - Bug in :meth:`DataFrame.groupby` when using nunique on axis=1 (:issue:`30253`) - Bug in :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` with multiple list-like q value and integer column names (:issue:`30289`) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 37d021efddf0b..3f9dd6e619a71 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -332,7 +332,7 @@ Other enhancements - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`) - :meth:`.DataFrameGroupBy.groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). - :func:`read_json` now accepts an ``nrows`` parameter. (:issue:`33916`). -- :meth:`DataFrame.hist`, :meth:`Series.hist`, :meth:`core.groupby.DataFrameGroupBy.hist`, and :meth:`core.groupby.SeriesGroupBy.hist` have gained the ``legend`` argument. Set to True to show a legend in the histogram. (:issue:`6279`) +- :meth:`DataFrame.hist`, :meth:`Series.hist`, :meth:`_core.groupby.DataFrameGroupBy.hist`, and :meth:`_core.groupby.SeriesGroupBy.hist` have gained the ``legend`` argument. Set to True to show a legend in the histogram. (:issue:`6279`) - :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example combining a nullable integer column with a numpy integer column will no longer result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`, :issue:`34095`). 
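To make the intended user-facing behaviour of the two patches concrete, here is a minimal, illustrative sketch (not taken from the patch itself). It assumes that pandas/core/groupby/__init__.py, which PATCH 2/3 touches but this excerpt only shows in the diffstat, is rewired so that importing pandas.core.groupby or accessing its attributes routes through the _depr_core() helper added in PATCH 1/3; the exact hook is not visible here. The pd.api.typing names used below are the existing public home for the groupby types (pandas/api/typing/__init__.py is also listed in the diffstat).

import warnings

import pandas as pd

# Discouraged spelling: per the new test in pandas/tests/api/test_api.py, both the
# import of pandas.core.groupby and attribute access on it are expected to emit
# a DeprecationWarning matching "pandas.core is deprecated".
with warnings.catch_warnings():
    warnings.simplefilter("error", DeprecationWarning)
    try:
        from pandas.core.groupby import DataFrameGroupBy  # noqa: F401
    except DeprecationWarning as exc:
        # "pandas.core is deprecated and has been renamed to pandas._core. ..."
        print(exc)

# Preferred spellings: build groupby objects through the public API and, when the
# type itself is needed (e.g. for annotations), take it from pandas.api.typing.
df = pd.DataFrame({"key": [1, 1, 2], "value": [10, 20, 30]})
gb = df.groupby("key")
assert isinstance(gb, pd.api.typing.DataFrameGroupBy)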
@@ -1147,8 +1147,8 @@ GroupBy/resample/rolling - Bug in :meth:`DataFrame.groupby` lost the name of the :class:`Index` when one of the ``agg`` keys referenced an empty list (:issue:`32580`) - Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`) - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`) -- Bug in :meth:`core.groupby.DataFrameGroupBy.quantile` raised ``TypeError`` for non-numeric types rather than dropping the columns (:issue:`27892`) -- Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`) +- Bug in :meth:`_core.groupby.DataFrameGroupBy.quantile` raised ``TypeError`` for non-numeric types rather than dropping the columns (:issue:`27892`) +- Bug in :meth:`_core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`) - Bug in :meth:`DataFrame.groupby` raising an ``AttributeError`` when selecting a column and aggregating with ``as_index=False`` (:issue:`35246`). - Bug in :meth:`DataFrameGroupBy.first` and :meth:`DataFrameGroupBy.last` that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`) diff --git a/pandas/_core/groupby/__init__.py b/pandas/_core/groupby/__init__.py new file mode 100644 index 0000000000000..09107565e7271 --- /dev/null +++ b/pandas/_core/groupby/__init__.py @@ -0,0 +1,15 @@ +from pandas._core.groupby.generic import ( + DataFrameGroupBy, + NamedAgg, + SeriesGroupBy, +) +from pandas._core.groupby.groupby import GroupBy +from pandas._core.groupby.grouper import Grouper + +__all__ = [ + "DataFrameGroupBy", + "NamedAgg", + "SeriesGroupBy", + "GroupBy", + "Grouper", +] diff --git a/pandas/_core/groupby/base.py b/pandas/_core/groupby/base.py new file mode 100644 index 0000000000000..a443597347283 --- /dev/null +++ b/pandas/_core/groupby/base.py @@ -0,0 +1,121 @@ +""" +Provide basic components for groupby. +""" +from __future__ import annotations + +import dataclasses +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Hashable + + +@dataclasses.dataclass(order=True, frozen=True) +class OutputKey: + label: Hashable + position: int + + +# special case to prevent duplicate plots when catching exceptions when +# forwarding methods from NDFrames +plotting_methods = frozenset(["plot", "hist"]) + +# cythonized transformations or canned "agg+broadcast", which do not +# require postprocessing of the result by transform. +cythonized_kernels = frozenset(["cumprod", "cumsum", "shift", "cummin", "cummax"]) + +# List of aggregation/reduction functions. +# These map each group to a single numeric value +reduction_kernels = frozenset( + [ + "all", + "any", + "corrwith", + "count", + "first", + "idxmax", + "idxmin", + "last", + "max", + "mean", + "median", + "min", + "nunique", + "prod", + # as long as `quantile`'s signature accepts only + # a single quantile value, it's a reduction. + # GH#27526 might change that. + "quantile", + "sem", + "size", + "skew", + "std", + "sum", + "var", + ] +) + +# List of transformation functions. +# a transformation is a function that, for each group, +# produces a result that has the same shape as the group. 
+ + +transformation_kernels = frozenset( + [ + "bfill", + "cumcount", + "cummax", + "cummin", + "cumprod", + "cumsum", + "diff", + "ffill", + "fillna", + "ngroup", + "pct_change", + "rank", + "shift", + ] +) + +# these are all the public methods on Grouper which don't belong +# in either of the above lists +groupby_other_methods = frozenset( + [ + "agg", + "aggregate", + "apply", + "boxplot", + # corr and cov return ngroups*ncolumns rows, so they + # are neither a transformation nor a reduction + "corr", + "cov", + "describe", + "dtypes", + "expanding", + "ewm", + "filter", + "get_group", + "groups", + "head", + "hist", + "indices", + "ndim", + "ngroups", + "nth", + "ohlc", + "pipe", + "plot", + "resample", + "rolling", + "tail", + "take", + "transform", + "sample", + "value_counts", + ] +) +# Valid values of `name` for `groupby.transform(name)` +# NOTE: do NOT edit this directly. New additions should be inserted +# into the appropriate list above. +transform_kernel_allowlist = reduction_kernels | transformation_kernels diff --git a/pandas/_core/groupby/categorical.py b/pandas/_core/groupby/categorical.py new file mode 100644 index 0000000000000..6ab98cf4fe55e --- /dev/null +++ b/pandas/_core/groupby/categorical.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +import numpy as np + +from pandas.core.algorithms import unique1d +from pandas.core.arrays.categorical import ( + Categorical, + CategoricalDtype, + recode_for_categories, +) + + +def recode_for_groupby( + c: Categorical, sort: bool, observed: bool +) -> tuple[Categorical, Categorical | None]: + """ + Code the categories to ensure we can groupby for categoricals. + + If observed=True, we return a new Categorical with the observed + categories only. + + If sort=False, return a copy of self, coded with categories as + returned by .unique(), followed by any categories not appearing in + the data. If sort=True, return self. + + This method is needed solely to ensure the categorical index of the + GroupBy result has categories in the order of appearance in the data + (GH-8868). + + Parameters + ---------- + c : Categorical + sort : bool + The value of the sort parameter groupby was called with. + observed : bool + Account only for the observed values + + Returns + ------- + Categorical + If sort=False, the new categories are set to the order of + appearance in codes (unless ordered=True, in which case the + original order is preserved), followed by any unrepresented + categories in the original order. 
+ Categorical or None + If we are observed, return the original categorical, otherwise None + """ + # we only care about observed values + if observed: + # In cases with c.ordered, this is equivalent to + # return c.remove_unused_categories(), c + + unique_codes = unique1d(c.codes) + + take_codes = unique_codes[unique_codes != -1] + if sort: + take_codes = np.sort(take_codes) + + # we recode according to the uniques + categories = c.categories.take(take_codes) + codes = recode_for_categories(c.codes, c.categories, categories) + + # return a new categorical that maps our new codes + # and categories + dtype = CategoricalDtype(categories, ordered=c.ordered) + return Categorical._simple_new(codes, dtype=dtype), c + + # Already sorted according to c.categories; all is fine + if sort: + return c, None + + # sort=False should order groups in as-encountered order (GH-8868) + + # xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories + all_codes = np.arange(c.categories.nunique()) + # GH 38140: exclude nan from indexer for categories + unique_notnan_codes = unique1d(c.codes[c.codes != -1]) + if sort: + unique_notnan_codes = np.sort(unique_notnan_codes) + if len(all_codes) > len(unique_notnan_codes): + # GH 13179: All categories need to be present, even if missing from the data + missing_codes = np.setdiff1d(all_codes, unique_notnan_codes, assume_unique=True) + take_codes = np.concatenate((unique_notnan_codes, missing_codes)) + else: + take_codes = unique_notnan_codes + + return Categorical(c, c.unique().categories.take(take_codes)), None diff --git a/pandas/_core/groupby/generic.py b/pandas/_core/groupby/generic.py new file mode 100644 index 0000000000000..fed3de5f82148 --- /dev/null +++ b/pandas/_core/groupby/generic.py @@ -0,0 +1,2867 @@ +""" +Define the SeriesGroupBy and DataFrameGroupBy +classes that hold the groupby interfaces (and some implementations). + +These are user facing as the result of the ``df.groupby(...)`` operations, +which here returns a DataFrameGroupBy object. 
+""" +from __future__ import annotations + +from collections import abc +from functools import partial +from textwrap import dedent +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, + NamedTuple, + TypeVar, + Union, + cast, +) +import warnings + +import numpy as np + +from pandas._libs import ( + Interval, + lib, +) +from pandas.errors import SpecificationError +from pandas.util._decorators import ( + Appender, + Substitution, + doc, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import ( + ensure_int64, + is_bool, + is_dict_like, + is_integer_dtype, + is_list_like, + is_numeric_dtype, + is_scalar, +) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + IntervalDtype, +) +from pandas.core.dtypes.inference import is_hashable +from pandas.core.dtypes.missing import ( + isna, + notna, +) + +from pandas._core.groupby import ( + base, + ops, +) +from pandas._core.groupby.groupby import ( + GroupBy, + GroupByPlot, + _agg_template_frame, + _agg_template_series, + _apply_docs, + _transform_template, +) +from pandas.core import algorithms +from pandas.core.apply import ( + GroupByApply, + maybe_mangle_lambdas, + reconstruct_func, + validate_func_kwargs, + warn_alias_replacement, +) +import pandas.core.common as com +from pandas.core.frame import DataFrame +from pandas.core.indexes.api import ( + Index, + MultiIndex, + all_indexes_same, + default_index, +) +from pandas.core.series import Series +from pandas.core.util.numba_ import maybe_use_numba + +from pandas.plotting import boxplot_frame_groupby + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Mapping, + Sequence, + ) + + from pandas._typing import ( + ArrayLike, + Axis, + AxisInt, + CorrelationMethod, + FillnaOptions, + IndexLabel, + Manager, + Manager2D, + SingleManager, + TakeIndexer, + ) + + from pandas import Categorical + from pandas.core.generic import NDFrame + +# TODO(typing) the return value on this callable should be any *scalar*. +AggScalar = Union[str, Callable[..., Any]] +# TODO: validate types on ScalarResult and move to _typing +# Blocked from using by https://github.com/python/mypy/issues/1484 +# See note at _mangle_lambda_list +ScalarResult = TypeVar("ScalarResult") + + +class NamedAgg(NamedTuple): + """ + Helper for column specific aggregation with control over output column names. + + Subclass of typing.NamedTuple. + + Parameters + ---------- + column : Hashable + Column label in the DataFrame to apply aggfunc. + aggfunc : function or str + Function to apply to the provided column. If string, the name of a built-in + pandas function. 
+ + Examples + -------- + >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]}) + >>> agg_a = pd.NamedAgg(column="a", aggfunc="min") + >>> agg_1 = pd.NamedAgg(column=1, aggfunc=lambda x: np.mean(x)) + >>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1) + result_a result_1 + key + 1 -1 10.5 + 2 1 12.0 + """ + + column: Hashable + aggfunc: AggScalar + + +class SeriesGroupBy(GroupBy[Series]): + def _wrap_agged_manager(self, mgr: Manager) -> Series: + out = self.obj._constructor_from_mgr(mgr, axes=mgr.axes) + out._name = self.obj.name + return out + + def _get_data_to_aggregate( + self, *, numeric_only: bool = False, name: str | None = None + ) -> SingleManager: + ser = self._obj_with_exclusions + single = ser._mgr + if numeric_only and not is_numeric_dtype(ser.dtype): + # GH#41291 match Series behavior + kwd_name = "numeric_only" + raise TypeError( + f"Cannot use {kwd_name}=True with " + f"{type(self).__name__}.{name} and non-numeric dtypes." + ) + return single + + _agg_examples_doc = dedent( + """ + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4]) + + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).min() + 1 1 + 2 3 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).agg('min') + 1 1 + 2 3 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max']) + min max + 1 1 2 + 2 3 4 + + The output column names can be controlled by passing + the desired column names and aggregations as keyword arguments. + + >>> s.groupby([1, 1, 2, 2]).agg( + ... minimum='min', + ... maximum='max', + ... ) + minimum maximum + 1 1 2 + 2 3 4 + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the aggregating function. + + >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min()) + 1 1.0 + 2 3.0 + dtype: float64 + """ + ) + + @Appender( + _apply_docs["template"].format( + input="series", examples=_apply_docs["series_examples"] + ) + ) + def apply(self, func, *args, **kwargs) -> Series: + return super().apply(func, *args, **kwargs) + + @doc(_agg_template_series, examples=_agg_examples_doc, klass="Series") + def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): + relabeling = func is None + columns = None + if relabeling: + columns, func = validate_func_kwargs(kwargs) + kwargs = {} + + if isinstance(func, str): + if maybe_use_numba(engine) and engine is not None: + # Not all agg functions support numba, only propagate numba kwargs + # if user asks for numba, and engine is not None + # (if engine is None, the called function will handle the case where + # numba is requested via the global option) + kwargs["engine"] = engine + if engine_kwargs is not None: + kwargs["engine_kwargs"] = engine_kwargs + return getattr(self, func)(*args, **kwargs) + + elif isinstance(func, abc.Iterable): + # Catch instances of lists / tuples + # but not the class list / tuple itself. 
+ func = maybe_mangle_lambdas(func) + kwargs["engine"] = engine + kwargs["engine_kwargs"] = engine_kwargs + ret = self._aggregate_multiple_funcs(func, *args, **kwargs) + if relabeling: + # columns is not narrowed by mypy from relabeling flag + assert columns is not None # for mypy + ret.columns = columns + if not self.as_index: + ret = ret.reset_index() + return ret + + else: + cyfunc = com.get_cython_func(func) + if cyfunc and not args and not kwargs: + warn_alias_replacement(self, func, cyfunc) + return getattr(self, cyfunc)() + + if maybe_use_numba(engine): + return self._aggregate_with_numba( + func, *args, engine_kwargs=engine_kwargs, **kwargs + ) + + if self.ngroups == 0: + # e.g. test_evaluate_with_empty_groups without any groups to + # iterate over, we have no output on which to do dtype + # inference. We default to using the existing dtype. + # xref GH#51445 + obj = self._obj_with_exclusions + return self.obj._constructor( + [], + name=self.obj.name, + index=self.grouper.result_index, + dtype=obj.dtype, + ) + + if self.grouper.nkeys > 1: + return self._python_agg_general(func, *args, **kwargs) + + try: + return self._python_agg_general(func, *args, **kwargs) + except KeyError: + # KeyError raised in test_groupby.test_basic is bc the func does + # a dictionary lookup on group.name, but group name is not + # pinned in _python_agg_general, only in _aggregate_named + result = self._aggregate_named(func, *args, **kwargs) + + warnings.warn( + "Pinning the groupby key to each group in " + f"{type(self).__name__}.agg is deprecated, and cases that " + "relied on it will raise in a future version. " + "If your operation requires utilizing the groupby keys, " + "iterate over the groupby object instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + # result is a dict whose keys are the elements of result_index + result = Series(result, index=self.grouper.result_index) + result = self._wrap_aggregated_output(result) + return result + + agg = aggregate + + def _python_agg_general(self, func, *args, **kwargs): + orig_func = func + func = com.is_builtin_func(func) + if orig_func != func: + alias = com._builtin_table_alias[func] + warn_alias_replacement(self, orig_func, alias) + f = lambda x: func(x, *args, **kwargs) + + obj = self._obj_with_exclusions + result = self.grouper.agg_series(obj, f) + res = obj._constructor(result, name=obj.name) + return self._wrap_aggregated_output(res) + + def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame: + if isinstance(arg, dict): + if self.as_index: + # GH 15931 + raise SpecificationError("nested renamer is not supported") + else: + # GH#50684 - This accidentally worked in 1.x + msg = ( + "Passing a dictionary to SeriesGroupBy.agg is deprecated " + "and will raise in a future version of pandas. Pass a list " + "of aggregations instead." 
+ ) + warnings.warn( + message=msg, + category=FutureWarning, + stacklevel=find_stack_level(), + ) + arg = list(arg.items()) + elif any(isinstance(x, (tuple, list)) for x in arg): + arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg] + else: + # list of functions / function names + columns = (com.get_callable_name(f) or f for f in arg) + arg = zip(columns, arg) + + results: dict[base.OutputKey, DataFrame | Series] = {} + with com.temp_setattr(self, "as_index", True): + # Combine results using the index, need to adjust index after + # if as_index=False (GH#50724) + for idx, (name, func) in enumerate(arg): + key = base.OutputKey(label=name, position=idx) + results[key] = self.aggregate(func, *args, **kwargs) + + if any(isinstance(x, DataFrame) for x in results.values()): + from pandas import concat + + res_df = concat( + results.values(), axis=1, keys=[key.label for key in results] + ) + return res_df + + indexed_output = {key.position: val for key, val in results.items()} + output = self.obj._constructor_expanddim(indexed_output, index=None) + output.columns = Index(key.label for key in results) + + return output + + def _wrap_applied_output( + self, + data: Series, + values: list[Any], + not_indexed_same: bool = False, + is_transform: bool = False, + ) -> DataFrame | Series: + """ + Wrap the output of SeriesGroupBy.apply into the expected result. + + Parameters + ---------- + data : Series + Input data for groupby operation. + values : List[Any] + Applied output for each group. + not_indexed_same : bool, default False + Whether the applied outputs are not indexed the same as the group axes. + + Returns + ------- + DataFrame or Series + """ + if len(values) == 0: + # GH #6265 + if is_transform: + # GH#47787 see test_group_on_empty_multiindex + res_index = data.index + else: + res_index = self.grouper.result_index + + return self.obj._constructor( + [], + name=self.obj.name, + index=res_index, + dtype=data.dtype, + ) + assert values is not None + + if isinstance(values[0], dict): + # GH #823 #24880 + index = self.grouper.result_index + res_df = self.obj._constructor_expanddim(values, index=index) + res_df = self._reindex_output(res_df) + # if self.observed is False, + # keep all-NaN rows created while re-indexing + res_ser = res_df.stack(future_stack=True) + res_ser.name = self.obj.name + return res_ser + elif isinstance(values[0], (Series, DataFrame)): + result = self._concat_objects( + values, + not_indexed_same=not_indexed_same, + is_transform=is_transform, + ) + if isinstance(result, Series): + result.name = self.obj.name + if not self.as_index and not_indexed_same: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) + return result + else: + # GH #6265 #24880 + result = self.obj._constructor( + data=values, index=self.grouper.result_index, name=self.obj.name + ) + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) + return self._reindex_output(result) + + def _aggregate_named(self, func, *args, **kwargs): + # Note: this is very similar to _aggregate_series_pure_python, + # but that does not pin group.name + result = {} + initialized = False + + for name, group in self.grouper.get_iterator( + self._obj_with_exclusions, axis=self.axis + ): + # needed for pandas/tests/groupby/test_groupby.py::test_basic_aggregations + object.__setattr__(group, "name", name) + + output = func(group, *args, **kwargs) + output = ops.extract_result(output) + if not initialized: + # We only do 
this validation on the first iteration + ops.check_result_array(output, group.dtype) + initialized = True + result[name] = output + + return result + + __examples_series_doc = dedent( + """ + >>> ser = pd.Series( + ... [390.0, 350.0, 30.0, 20.0], + ... index=["Falcon", "Falcon", "Parrot", "Parrot"], + ... name="Max Speed") + >>> grouped = ser.groupby([1, 1, 2, 2]) + >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) + Falcon 0.707107 + Falcon -0.707107 + Parrot 0.707107 + Parrot -0.707107 + Name: Max Speed, dtype: float64 + + Broadcast result of the transformation + + >>> grouped.transform(lambda x: x.max() - x.min()) + Falcon 40.0 + Falcon 40.0 + Parrot 10.0 + Parrot 10.0 + Name: Max Speed, dtype: float64 + + >>> grouped.transform("mean") + Falcon 370.0 + Falcon 370.0 + Parrot 25.0 + Parrot 25.0 + Name: Max Speed, dtype: float64 + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + for example: + + >>> grouped.transform(lambda x: x.astype(int).max()) + Falcon 390 + Falcon 390 + Parrot 30 + Parrot 30 + Name: Max Speed, dtype: int64 + """ + ) + + @Substitution(klass="Series", example=__examples_series_doc) + @Appender(_transform_template) + def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + return self._transform( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) + + def _cython_transform( + self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs + ): + assert axis == 0 # handled by caller + + obj = self._obj_with_exclusions + + try: + result = self.grouper._cython_operation( + "transform", obj._values, how, axis, **kwargs + ) + except NotImplementedError as err: + # e.g. test_groupby_raises_string + raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err + + return obj._constructor(result, index=self.obj.index, name=obj.name) + + def _transform_general( + self, func: Callable, engine, engine_kwargs, *args, **kwargs + ) -> Series: + """ + Transform with a callable `func`. + """ + if maybe_use_numba(engine): + return self._transform_with_numba( + func, *args, engine_kwargs=engine_kwargs, **kwargs + ) + assert callable(func) + klass = type(self.obj) + + results = [] + for name, group in self.grouper.get_iterator( + self._obj_with_exclusions, axis=self.axis + ): + # this setattr is needed for test_transform_lambda_with_datetimetz + object.__setattr__(group, "name", name) + res = func(group, *args, **kwargs) + + results.append(klass(res, index=group.index)) + + # check for empty "results" to avoid concat ValueError + if results: + from pandas.core.reshape.concat import concat + + concatenated = concat(results) + result = self._set_result_index_ordered(concatenated) + else: + result = self.obj._constructor(dtype=np.float64) + + result.name = self.obj.name + return result + + def filter(self, func, dropna: bool = True, *args, **kwargs): + """ + Filter elements from groups that don't satisfy a criterion. + + Elements from groups are filtered if they do not satisfy the + boolean criterion specified by func. + + Parameters + ---------- + func : function + Criterion to apply to each group. Should return True or False. + dropna : bool + Drop groups that do not pass the filter. True by default; if False, + groups that evaluate False are filled with NaNs. + + Returns + ------- + Series + + Notes + ----- + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. 
See :ref:`gotchas.udf-mutation` + for more details. + + Examples + -------- + >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + ... 'foo', 'bar'], + ... 'B' : [1, 2, 3, 4, 5, 6], + ... 'C' : [2.0, 5., 8., 1., 2., 9.]}) + >>> grouped = df.groupby('A') + >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.) + 1 2 + 3 4 + 5 6 + Name: B, dtype: int64 + """ + if isinstance(func, str): + wrapper = lambda x: getattr(x, func)(*args, **kwargs) + else: + wrapper = lambda x: func(x, *args, **kwargs) + + # Interpret np.nan as False. + def true_and_notna(x) -> bool: + b = wrapper(x) + return notna(b) and b + + try: + indices = [ + self._get_index(name) + for name, group in self.grouper.get_iterator( + self._obj_with_exclusions, axis=self.axis + ) + if true_and_notna(group) + ] + except (ValueError, TypeError) as err: + raise TypeError("the filter must return a boolean result") from err + + filtered = self._apply_filter(indices, dropna) + return filtered + + def nunique(self, dropna: bool = True) -> Series | DataFrame: + """ + Return number of unique elements in the group. + + Returns + ------- + Series + Number of unique values within each group. + + Examples + -------- + For SeriesGroupby: + + >>> lst = ['a', 'a', 'b', 'b'] + >>> ser = pd.Series([1, 2, 3, 3], index=lst) + >>> ser + a 1 + a 2 + b 3 + b 3 + dtype: int64 + >>> ser.groupby(level=0).nunique() + a 2 + b 1 + dtype: int64 + + For Resampler: + + >>> ser = pd.Series([1, 2, 3, 3], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 3 + dtype: int64 + >>> ser.resample('MS').nunique() + 2023-01-01 2 + 2023-02-01 1 + Freq: MS, dtype: int64 + """ + ids, _, _ = self.grouper.group_info + + val = self.obj._values + + codes, _ = algorithms.factorize(val, sort=False) + sorter = np.lexsort((codes, ids)) + codes = codes[sorter] + ids = ids[sorter] + + # group boundaries are where group ids change + # unique observations are where sorted values change + idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] + inc = np.r_[1, codes[1:] != codes[:-1]] + + # 1st item of each group is a new unique observation + mask = codes == -1 + if dropna: + inc[idx] = 1 + inc[mask] = 0 + else: + inc[mask & np.r_[False, mask[:-1]]] = 0 + inc[idx] = 1 + + out = np.add.reduceat(inc, idx).astype("int64", copy=False) + if len(ids): + # NaN/NaT group exists if the head of ids is -1, + # so remove it from res and exclude its index from idx + if ids[0] == -1: + res = out[1:] + idx = idx[np.flatnonzero(idx)] + else: + res = out + else: + res = out[1:] + ri = self.grouper.result_index + + # we might have duplications among the bins + if len(res) != len(ri): + res, out = np.zeros(len(ri), dtype=out.dtype), res + if len(ids) > 0: + # GH#21334s + res[ids[idx]] = out + + result: Series | DataFrame = self.obj._constructor( + res, index=ri, name=self.obj.name + ) + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) + return self._reindex_output(result, fill_value=0) + + @doc(Series.describe) + def describe(self, percentiles=None, include=None, exclude=None) -> Series: + return super().describe( + percentiles=percentiles, include=include, exclude=exclude + ) + + def value_counts( + self, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + bins=None, + dropna: bool = True, + ) -> Series | DataFrame: + name = "proportion" if normalize else "count" + + if bins is None: + result = 
self._value_counts( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna + ) + result.name = name + return result + + from pandas.core.reshape.merge import get_join_indexers + from pandas.core.reshape.tile import cut + + ids, _, _ = self.grouper.group_info + val = self.obj._values + + index_names = self.grouper.names + [self.obj.name] + + if isinstance(val.dtype, CategoricalDtype) or ( + bins is not None and not np.iterable(bins) + ): + # scalar bins cannot be done at top level + # in a backward compatible way + # GH38672 relates to categorical dtype + ser = self.apply( + Series.value_counts, + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + ) + ser.name = name + ser.index.names = index_names + return ser + + # groupby removes null keys from groupings + mask = ids != -1 + ids, val = ids[mask], val[mask] + + lab: Index | np.ndarray + if bins is None: + lab, lev = algorithms.factorize(val, sort=True) + llab = lambda lab, inc: lab[inc] + else: + # lab is a Categorical with categories an IntervalIndex + cat_ser = cut(Series(val, copy=False), bins, include_lowest=True) + cat_obj = cast("Categorical", cat_ser._values) + lev = cat_obj.categories + lab = lev.take( + cat_obj.codes, + allow_fill=True, + fill_value=lev._na_value, + ) + llab = lambda lab, inc: lab[inc]._multiindex.codes[-1] + + if isinstance(lab.dtype, IntervalDtype): + # TODO: should we do this inside II? + lab_interval = cast(Interval, lab) + + sorter = np.lexsort((lab_interval.left, lab_interval.right, ids)) + else: + sorter = np.lexsort((lab, ids)) + + ids, lab = ids[sorter], lab[sorter] + + # group boundaries are where group ids change + idchanges = 1 + np.nonzero(ids[1:] != ids[:-1])[0] + idx = np.r_[0, idchanges] + if not len(ids): + idx = idchanges + + # new values are where sorted labels change + lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1)) + inc = np.r_[True, lchanges] + if not len(val): + inc = lchanges + inc[idx] = True # group boundaries are also new values + out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts + + # num. of times each group should be repeated + rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) + + # multi-index components + codes = self.grouper.reconstructed_codes + codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] + levels = [ping.group_index for ping in self.grouper.groupings] + [lev] + + if dropna: + mask = codes[-1] != -1 + if mask.all(): + dropna = False + else: + out, codes = out[mask], [level_codes[mask] for level_codes in codes] + + if normalize: + out = out.astype("float") + d = np.diff(np.r_[idx, len(ids)]) + if dropna: + m = ids[lab == -1] + np.add.at(d, m, -1) + acc = rep(d)[mask] + else: + acc = rep(d) + out /= acc + + if sort and bins is None: + cat = ids[inc][mask] if dropna else ids[inc] + sorter = np.lexsort((out if ascending else -out, cat)) + out, codes[-1] = out[sorter], codes[-1][sorter] + + if bins is not None: + # for compat. 
with libgroupby.value_counts need to ensure every + # bin is present at every index level, null filled with zeros + diff = np.zeros(len(out), dtype="bool") + for level_codes in codes[:-1]: + diff |= np.r_[True, level_codes[1:] != level_codes[:-1]] + + ncat, nbin = diff.sum(), len(levels[-1]) + + left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)] + + right = [diff.cumsum() - 1, codes[-1]] + + # error: Argument 1 to "get_join_indexers" has incompatible type + # "List[ndarray[Any, Any]]"; expected "List[Union[Union[ExtensionArray, + # ndarray[Any, Any]], Index, Series]] + _, idx = get_join_indexers( + left, right, sort=False, how="left" # type: ignore[arg-type] + ) + out = np.where(idx != -1, out[idx], 0) + + if sort: + sorter = np.lexsort((out if ascending else -out, left[0])) + out, left[-1] = out[sorter], left[-1][sorter] + + # build the multi-index w/ full levels + def build_codes(lev_codes: np.ndarray) -> np.ndarray: + return np.repeat(lev_codes[diff], nbin) + + codes = [build_codes(lev_codes) for lev_codes in codes[:-1]] + codes.append(left[-1]) + + mi = MultiIndex( + levels=levels, codes=codes, names=index_names, verify_integrity=False + ) + + if is_integer_dtype(out.dtype): + out = ensure_int64(out) + result = self.obj._constructor(out, index=mi, name=name) + if not self.as_index: + result = result.reset_index() + return result + + def fillna( + self, + value: object | ArrayLike | None = None, + method: FillnaOptions | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, + inplace: bool = False, + limit: int | None = None, + downcast: dict | None | lib.NoDefault = lib.no_default, + ) -> Series | None: + """ + Fill NA/NaN values using the specified method within groups. + + Parameters + ---------- + value : scalar, dict, Series, or DataFrame + Value to use to fill holes (e.g. 0), alternately a + dict/Series/DataFrame of values specifying which value to use for + each index (for a Series) or column (for a DataFrame). Values not + in the dict/Series/DataFrame will not be filled. This value cannot + be a list. Users wanting to use the ``value`` argument and not ``method`` + should prefer :meth:`.Series.fillna` as this + will produce the same result and be more performant. + method : {{'bfill', 'ffill', None}}, default None + Method to use for filling holes. ``'ffill'`` will propagate + the last valid observation forward within a group. + ``'bfill'`` will use next valid observation to fill the gap. + + .. deprecated:: 2.1.0 + Use obj.ffill or obj.bfill instead. + + axis : {0 or 'index', 1 or 'columns'} + Unused, only for compatibility with :meth:`DataFrameGroupBy.fillna`. + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + inplace : bool, default False + Broken. Do not set to True. + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill within a group. In other words, + if there is a gap with more than this number of consecutive NaNs, + it will only be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + downcast : dict, default is None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). + + .. 
deprecated:: 2.1.0 + + Returns + ------- + Series + Object with missing values filled within groups. + + See Also + -------- + ffill : Forward fill values within a group. + bfill : Backward fill values within a group. + + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['cat', 'cat', 'cat', 'mouse', 'mouse'] + >>> ser = pd.Series([1, None, None, 2, None], index=lst) + >>> ser + cat 1.0 + cat NaN + cat NaN + mouse 2.0 + mouse NaN + dtype: float64 + >>> ser.groupby(level=0).fillna(0, limit=1) + cat 1.0 + cat 0.0 + cat NaN + mouse 2.0 + mouse 0.0 + dtype: float64 + """ + result = self._op_via_apply( + "fillna", + value=value, + method=method, + axis=axis, + inplace=inplace, + limit=limit, + downcast=downcast, + ) + return result + + def take( + self, + indices: TakeIndexer, + axis: Axis | lib.NoDefault = lib.no_default, + **kwargs, + ) -> Series: + """ + Return the elements in the given *positional* indices in each group. + + This means that we are not indexing according to actual values in + the index attribute of the object. We are indexing according to the + actual position of the element in the object. + + If a requested index does not exist for some group, this method will raise. + To get similar behavior that ignores indices that don't exist, see + :meth:`.SeriesGroupBy.nth`. + + Parameters + ---------- + indices : array-like + An array of ints indicating which positions to take in each group. + axis : {0 or 'index', 1 or 'columns', None}, default 0 + The axis on which to select elements. ``0`` means that we are + selecting rows, ``1`` means that we are selecting columns. + For `SeriesGroupBy` this parameter is unused and defaults to 0. + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + **kwargs + For compatibility with :meth:`numpy.take`. Has no effect on the + output. + + Returns + ------- + Series + A Series containing the elements taken from each group. + + See Also + -------- + Series.take : Take elements from a Series along an axis. + Series.loc : Select a subset of a DataFrame by labels. + Series.iloc : Select a subset of a DataFrame by positions. + numpy.take : Take elements from an array along an axis. + SeriesGroupBy.nth : Similar to take, won't raise if indices don't exist. + + Examples + -------- + >>> df = pd.DataFrame([('falcon', 'bird', 389.0), + ... ('parrot', 'bird', 24.0), + ... ('lion', 'mammal', 80.5), + ... ('monkey', 'mammal', np.nan), + ... ('rabbit', 'mammal', 15.0)], + ... columns=['name', 'class', 'max_speed'], + ... index=[4, 3, 2, 1, 0]) + >>> df + name class max_speed + 4 falcon bird 389.0 + 3 parrot bird 24.0 + 2 lion mammal 80.5 + 1 monkey mammal NaN + 0 rabbit mammal 15.0 + >>> gb = df["name"].groupby([1, 1, 2, 2, 2]) + + Take elements at positions 0 and 1 along the axis 0 in each group (default). + + >>> gb.take([0, 1]) + 1 4 falcon + 3 parrot + 2 2 lion + 1 monkey + Name: name, dtype: object + + We may take elements using negative integers for positive indices, + starting from the end of the object, just like with Python lists. + + >>> gb.take([-1, -2]) + 1 3 parrot + 4 falcon + 2 0 rabbit + 1 monkey + Name: name, dtype: object + """ + result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs) + return result + + def skew( + self, + axis: Axis | lib.NoDefault = lib.no_default, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ) -> Series: + """ + Return unbiased skew within groups. + + Normalized by N-1. 
+ + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns', None}, default 0 + Axis for the function to be applied on. + This parameter is only for compatibility with DataFrame and is unused. + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + skipna : bool, default True + Exclude NA/null values when computing the result. + + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series + + See Also + -------- + Series.skew : Return unbiased skew over requested axis. + + Examples + -------- + >>> ser = pd.Series([390., 350., 357., np.nan, 22., 20., 30.], + ... index=['Falcon', 'Falcon', 'Falcon', 'Falcon', + ... 'Parrot', 'Parrot', 'Parrot'], + ... name="Max Speed") + >>> ser + Falcon 390.0 + Falcon 350.0 + Falcon 357.0 + Falcon NaN + Parrot 22.0 + Parrot 20.0 + Parrot 30.0 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).skew() + Falcon 1.525174 + Parrot 1.457863 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).skew(skipna=False) + Falcon NaN + Parrot 1.457863 + Name: Max Speed, dtype: float64 + """ + if axis is lib.no_default: + axis = 0 + + if axis != 0: + result = self._op_via_apply( + "skew", + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + **kwargs, + ) + return result + + def alt(obj): + # This should not be reached since the cython path should raise + # TypeError and not NotImplementedError. + raise TypeError(f"'skew' is not supported for dtype={obj.dtype}") + + return self._cython_agg_general( + "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs + ) + + @property + @doc(Series.plot.__doc__) + def plot(self) -> GroupByPlot: + result = GroupByPlot(self) + return result + + @doc(Series.nlargest.__doc__) + def nlargest( + self, n: int = 5, keep: Literal["first", "last", "all"] = "first" + ) -> Series: + f = partial(Series.nlargest, n=n, keep=keep) + data = self._obj_with_exclusions + # Don't change behavior if result index happens to be the same, i.e. + # already ordered and n >= all group sizes. + result = self._python_apply_general(f, data, not_indexed_same=True) + return result + + @doc(Series.nsmallest.__doc__) + def nsmallest( + self, n: int = 5, keep: Literal["first", "last", "all"] = "first" + ) -> Series: + f = partial(Series.nsmallest, n=n, keep=keep) + data = self._obj_with_exclusions + # Don't change behavior if result index happens to be the same, i.e. + # already ordered and n >= all group sizes. 
+ result = self._python_apply_general(f, data, not_indexed_same=True) + return result + + @doc(Series.idxmin.__doc__) + def idxmin( + self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True + ) -> Series: + return self._idxmax_idxmin("idxmin", axis=axis, skipna=skipna) + + @doc(Series.idxmax.__doc__) + def idxmax( + self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True + ) -> Series: + return self._idxmax_idxmin("idxmax", axis=axis, skipna=skipna) + + @doc(Series.corr.__doc__) + def corr( + self, + other: Series, + method: CorrelationMethod = "pearson", + min_periods: int | None = None, + ) -> Series: + result = self._op_via_apply( + "corr", other=other, method=method, min_periods=min_periods + ) + return result + + @doc(Series.cov.__doc__) + def cov( + self, other: Series, min_periods: int | None = None, ddof: int | None = 1 + ) -> Series: + result = self._op_via_apply( + "cov", other=other, min_periods=min_periods, ddof=ddof + ) + return result + + @property + def is_monotonic_increasing(self) -> Series: + """ + Return whether each group's values are monotonically increasing. + + Returns + ------- + Series + + Examples + -------- + >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot']) + >>> s.groupby(level=0).is_monotonic_increasing + Falcon False + Parrot True + dtype: bool + """ + return self.apply(lambda ser: ser.is_monotonic_increasing) + + @property + def is_monotonic_decreasing(self) -> Series: + """ + Return whether each group's values are monotonically decreasing. + + Returns + ------- + Series + + Examples + -------- + >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot']) + >>> s.groupby(level=0).is_monotonic_decreasing + Falcon True + Parrot False + dtype: bool + """ + return self.apply(lambda ser: ser.is_monotonic_decreasing) + + @doc(Series.hist.__doc__) + def hist( + self, + by=None, + ax=None, + grid: bool = True, + xlabelsize: int | None = None, + xrot: float | None = None, + ylabelsize: int | None = None, + yrot: float | None = None, + figsize: tuple[int, int] | None = None, + bins: int | Sequence[int] = 10, + backend: str | None = None, + legend: bool = False, + **kwargs, + ): + result = self._op_via_apply( + "hist", + by=by, + ax=ax, + grid=grid, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + figsize=figsize, + bins=bins, + backend=backend, + legend=legend, + **kwargs, + ) + return result + + @property + @doc(Series.dtype.__doc__) + def dtype(self) -> Series: + return self.apply(lambda ser: ser.dtype) + + def unique(self) -> Series: + """ + Return unique values for each group. + + It returns unique values for each of the grouped values. Returned in + order of appearance. Hash table-based unique, therefore does NOT sort. + + Returns + ------- + Series + Unique values for each of the grouped values. + + See Also + -------- + Series.unique : Return unique values of Series object. + + Examples + -------- + >>> df = pd.DataFrame([('Chihuahua', 'dog', 6.1), + ... ('Beagle', 'dog', 15.2), + ... ('Chihuahua', 'dog', 6.9), + ... ('Persian', 'cat', 9.2), + ... ('Chihuahua', 'dog', 7), + ... ('Persian', 'cat', 8.8)], + ... 
columns=['breed', 'animal', 'height_in']) + >>> df + breed animal height_in + 0 Chihuahua dog 6.1 + 1 Beagle dog 15.2 + 2 Chihuahua dog 6.9 + 3 Persian cat 9.2 + 4 Chihuahua dog 7.0 + 5 Persian cat 8.8 + >>> ser = df.groupby('animal')['breed'].unique() + >>> ser + animal + cat [Persian] + dog [Chihuahua, Beagle] + Name: breed, dtype: object + """ + result = self._op_via_apply("unique") + return result + + +class DataFrameGroupBy(GroupBy[DataFrame]): + _agg_examples_doc = dedent( + """ + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "A": [1, 1, 2, 2], + ... "B": [1, 2, 3, 4], + ... "C": [0.362838, 0.227877, 1.267767, -0.562860], + ... } + ... ) + + >>> df + A B C + 0 1 1 0.362838 + 1 1 2 0.227877 + 2 2 3 1.267767 + 3 2 4 -0.562860 + + The aggregation is for each column. + + >>> df.groupby('A').agg('min') + B C + A + 1 1 0.227877 + 2 3 -0.562860 + + Multiple aggregations + + >>> df.groupby('A').agg(['min', 'max']) + B C + min max min max + A + 1 1 2 0.227877 0.362838 + 2 3 4 -0.562860 1.267767 + + Select a column for aggregation + + >>> df.groupby('A').B.agg(['min', 'max']) + min max + A + 1 1 2 + 2 3 4 + + User-defined function for aggregation + + >>> df.groupby('A').agg(lambda x: sum(x) + 2) + B C + A + 1 5 2.590715 + 2 9 2.704907 + + Different aggregations per column + + >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'}) + B C + min max sum + A + 1 1 2 0.590715 + 2 3 4 0.704907 + + To control the output names with different aggregations per column, + pandas supports "named aggregation" + + >>> df.groupby("A").agg( + ... b_min=pd.NamedAgg(column="B", aggfunc="min"), + ... c_sum=pd.NamedAgg(column="C", aggfunc="sum")) + b_min c_sum + A + 1 1 0.590715 + 2 3 0.704907 + + - The keywords are the *output* column names + - The values are tuples whose first element is the column to select + and the second element is the aggregation to apply to that column. + Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields + ``['column', 'aggfunc']`` to make it clearer what the arguments are. + As usual, the aggregation can be a callable or a string alias. + + See :ref:`groupby.aggregate.named` for more. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the aggregating function. 
+ + >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min()) + B + A + 1 1.0 + 2 3.0 + """ + ) + + @doc(_agg_template_frame, examples=_agg_examples_doc, klass="DataFrame") + def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): + relabeling, func, columns, order = reconstruct_func(func, **kwargs) + func = maybe_mangle_lambdas(func) + + if maybe_use_numba(engine): + # Not all agg functions support numba, only propagate numba kwargs + # if user asks for numba + kwargs["engine"] = engine + kwargs["engine_kwargs"] = engine_kwargs + + op = GroupByApply(self, func, args=args, kwargs=kwargs) + result = op.agg() + if not is_dict_like(func) and result is not None: + # GH #52849 + if not self.as_index and is_list_like(func): + return result.reset_index() + else: + return result + elif relabeling: + # this should be the only (non-raising) case with relabeling + # used reordered index of columns + result = cast(DataFrame, result) + result = result.iloc[:, order] + result = cast(DataFrame, result) + # error: Incompatible types in assignment (expression has type + # "Optional[List[str]]", variable has type + # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]], + # Index, Series], Sequence[Any]]") + result.columns = columns # type: ignore[assignment] + + if result is None: + # Remove the kwargs we inserted + # (already stored in engine, engine_kwargs arguments) + if "engine" in kwargs: + del kwargs["engine"] + del kwargs["engine_kwargs"] + # at this point func is not a str, list-like, dict-like, + # or a known callable(e.g. sum) + if maybe_use_numba(engine): + return self._aggregate_with_numba( + func, *args, engine_kwargs=engine_kwargs, **kwargs + ) + # grouper specific aggregations + if self.grouper.nkeys > 1: + # test_groupby_as_index_series_scalar gets here with 'not self.as_index' + return self._python_agg_general(func, *args, **kwargs) + elif args or kwargs: + # test_pass_args_kwargs gets here (with and without as_index) + # can't return early + result = self._aggregate_frame(func, *args, **kwargs) + + elif self.axis == 1: + # _aggregate_multiple_funcs does not allow self.axis == 1 + # Note: axis == 1 precludes 'not self.as_index', see __init__ + result = self._aggregate_frame(func) + return result + + else: + # try to treat as if we are passing a list + gba = GroupByApply(self, [func], args=(), kwargs={}) + try: + result = gba.agg() + + except ValueError as err: + if "No objects to concatenate" not in str(err): + raise + # _aggregate_frame can fail with e.g. func=Series.mode, + # where it expects 1D values but would be getting 2D values + # In other tests, using aggregate_frame instead of GroupByApply + # would give correct values but incorrect dtypes + # object vs float64 in test_cython_agg_empty_buckets + # float64 vs int64 in test_category_order_apply + result = self._aggregate_frame(func) + + else: + # GH#32040, GH#35246 + # e.g. test_groupby_as_index_select_column_sum_empty_df + result = cast(DataFrame, result) + result.columns = self._obj_with_exclusions.columns.copy() + + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) + + return result + + agg = aggregate + + def _python_agg_general(self, func, *args, **kwargs): + orig_func = func + func = com.is_builtin_func(func) + if orig_func != func: + alias = com._builtin_table_alias[func] + warn_alias_replacement(self, orig_func, alias) + f = lambda x: func(x, *args, **kwargs) + + if self.ngroups == 0: + # e.g. 
test_evaluate_with_empty_groups different path gets different + # result dtype in empty case. + return self._python_apply_general(f, self._selected_obj, is_agg=True) + + obj = self._obj_with_exclusions + if self.axis == 1: + obj = obj.T + + if not len(obj.columns): + # e.g. test_margins_no_values_no_cols + return self._python_apply_general(f, self._selected_obj) + + output: dict[int, ArrayLike] = {} + for idx, (name, ser) in enumerate(obj.items()): + result = self.grouper.agg_series(ser, f) + output[idx] = result + + res = self.obj._constructor(output) + res.columns = obj.columns.copy(deep=False) + return self._wrap_aggregated_output(res) + + def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: + if self.grouper.nkeys != 1: + raise AssertionError("Number of keys must be 1") + + obj = self._obj_with_exclusions + + result: dict[Hashable, NDFrame | np.ndarray] = {} + for name, grp_df in self.grouper.get_iterator(obj, self.axis): + fres = func(grp_df, *args, **kwargs) + result[name] = fres + + result_index = self.grouper.result_index + other_ax = obj.axes[1 - self.axis] + out = self.obj._constructor(result, index=other_ax, columns=result_index) + if self.axis == 0: + out = out.T + + return out + + def _wrap_applied_output( + self, + data: DataFrame, + values: list, + not_indexed_same: bool = False, + is_transform: bool = False, + ): + if len(values) == 0: + if is_transform: + # GH#47787 see test_group_on_empty_multiindex + res_index = data.index + else: + res_index = self.grouper.result_index + + result = self.obj._constructor(index=res_index, columns=data.columns) + result = result.astype(data.dtypes, copy=False) + return result + + # GH12824 + # using values[0] here breaks test_groupby_apply_none_first + first_not_none = next(com.not_none(*values), None) + + if first_not_none is None: + # GH9684 - All values are None, return an empty frame. + return self.obj._constructor() + elif isinstance(first_not_none, DataFrame): + return self._concat_objects( + values, + not_indexed_same=not_indexed_same, + is_transform=is_transform, + ) + + key_index = self.grouper.result_index if self.as_index else None + + if isinstance(first_not_none, (np.ndarray, Index)): + # GH#1738: values is list of arrays of unequal lengths + # fall through to the outer else clause + # TODO: sure this is right? 
we used to do this + # after raising AttributeError above + # GH 18930 + if not is_hashable(self._selection): + # error: Need type annotation for "name" + name = tuple(self._selection) # type: ignore[var-annotated, arg-type] + else: + # error: Incompatible types in assignment + # (expression has type "Hashable", variable + # has type "Tuple[Any, ...]") + name = self._selection # type: ignore[assignment] + return self.obj._constructor_sliced(values, index=key_index, name=name) + elif not isinstance(first_not_none, Series): + # values are not series or array-like but scalars + # self._selection not passed through to Series as the + # result should not take the name of original selection + # of columns + if self.as_index: + return self.obj._constructor_sliced(values, index=key_index) + else: + result = self.obj._constructor(values, columns=[self._selection]) + result = self._insert_inaxis_grouper(result) + return result + else: + # values are Series + return self._wrap_applied_output_series( + values, + not_indexed_same, + first_not_none, + key_index, + is_transform, + ) + + def _wrap_applied_output_series( + self, + values: list[Series], + not_indexed_same: bool, + first_not_none, + key_index: Index | None, + is_transform: bool, + ) -> DataFrame | Series: + kwargs = first_not_none._construct_axes_dict() + backup = Series(**kwargs) + values = [x if (x is not None) else backup for x in values] + + all_indexed_same = all_indexes_same(x.index for x in values) + + if not all_indexed_same: + # GH 8467 + return self._concat_objects( + values, + not_indexed_same=True, + is_transform=is_transform, + ) + + # Combine values + # vstack+constructor is faster than concat and handles MI-columns + stacked_values = np.vstack([np.asarray(v) for v in values]) + + if self.axis == 0: + index = key_index + columns = first_not_none.index.copy() + if columns.name is None: + # GH6124 - propagate name of Series when it's consistent + names = {v.name for v in values} + if len(names) == 1: + columns.name = next(iter(names)) + else: + index = first_not_none.index + columns = key_index + stacked_values = stacked_values.T + + if stacked_values.dtype == object: + # We'll have the DataFrame constructor do inference + stacked_values = stacked_values.tolist() + result = self.obj._constructor(stacked_values, index=index, columns=columns) + + if not self.as_index: + result = self._insert_inaxis_grouper(result) + + return self._reindex_output(result) + + def _cython_transform( + self, + how: str, + numeric_only: bool = False, + axis: AxisInt = 0, + **kwargs, + ) -> DataFrame: + assert axis == 0 # handled by caller + + # With self.axis == 0, we have multi-block tests + # e.g. test_rank_min_int, test_cython_transform_frame + # test_transform_numeric_ret + # With self.axis == 1, _get_data_to_aggregate does a transpose + # so we always have a single block. 
+ mgr: Manager2D = self._get_data_to_aggregate( + numeric_only=numeric_only, name=how + ) + + def arr_func(bvalues: ArrayLike) -> ArrayLike: + return self.grouper._cython_operation( + "transform", bvalues, how, 1, **kwargs + ) + + # We could use `mgr.apply` here and not have to set_axis, but + # we would have to do shape gymnastics for ArrayManager compat + res_mgr = mgr.grouped_reduce(arr_func) + res_mgr.set_axis(1, mgr.axes[1]) + + res_df = self.obj._constructor_from_mgr(res_mgr, axes=res_mgr.axes) + res_df = self._maybe_transpose_result(res_df) + return res_df + + def _transform_general(self, func, engine, engine_kwargs, *args, **kwargs): + if maybe_use_numba(engine): + return self._transform_with_numba( + func, *args, engine_kwargs=engine_kwargs, **kwargs + ) + from pandas.core.reshape.concat import concat + + applied = [] + obj = self._obj_with_exclusions + gen = self.grouper.get_iterator(obj, axis=self.axis) + fast_path, slow_path = self._define_paths(func, *args, **kwargs) + + # Determine whether to use slow or fast path by evaluating on the first group. + # Need to handle the case of an empty generator and process the result so that + # it does not need to be computed again. + try: + name, group = next(gen) + except StopIteration: + pass + else: + # 2023-02-27 No tests broken by disabling this pinning + object.__setattr__(group, "name", name) + try: + path, res = self._choose_path(fast_path, slow_path, group) + except ValueError as err: + # e.g. test_transform_with_non_scalar_group + msg = "transform must return a scalar value for each group" + raise ValueError(msg) from err + if group.size > 0: + res = _wrap_transform_general_frame(self.obj, group, res) + applied.append(res) + + # Compute and process with the remaining groups + for name, group in gen: + if group.size == 0: + continue + # 2023-02-27 No tests broken by disabling this pinning + object.__setattr__(group, "name", name) + res = path(group) + + res = _wrap_transform_general_frame(self.obj, group, res) + applied.append(res) + + concat_index = obj.columns if self.axis == 0 else obj.index + other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 + concatenated = concat(applied, axis=self.axis, verify_integrity=False) + concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False) + return self._set_result_index_ordered(concatenated) + + __examples_dataframe_doc = dedent( + """ + >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + ... 'foo', 'bar'], + ... 'B' : ['one', 'one', 'two', 'three', + ... 'two', 'two'], + ... 'C' : [1, 5, 5, 2, 5, 5], + ... 'D' : [2.0, 5., 8., 1., 2., 9.]}) + >>> grouped = df.groupby('A')[['C', 'D']] + >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) + C D + 0 -1.154701 -0.577350 + 1 0.577350 0.000000 + 2 0.577350 1.154701 + 3 -1.154701 -1.000000 + 4 0.577350 -0.577350 + 5 0.577350 1.000000 + + Broadcast result of the transformation + + >>> grouped.transform(lambda x: x.max() - x.min()) + C D + 0 4.0 6.0 + 1 3.0 8.0 + 2 4.0 6.0 + 3 3.0 8.0 + 4 4.0 6.0 + 5 3.0 8.0 + + >>> grouped.transform("mean") + C D + 0 3.666667 4.0 + 1 4.000000 5.0 + 2 3.666667 4.0 + 3 4.000000 5.0 + 4 3.666667 4.0 + 5 4.000000 5.0 + + .. 
versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + for example: + + >>> grouped.transform(lambda x: x.astype(int).max()) + C D + 0 5 8 + 1 5 9 + 2 5 8 + 3 5 9 + 4 5 8 + 5 5 9 + """ + ) + + @Substitution(klass="DataFrame", example=__examples_dataframe_doc) + @Appender(_transform_template) + def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + return self._transform( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) + + def _define_paths(self, func, *args, **kwargs): + if isinstance(func, str): + fast_path = lambda group: getattr(group, func)(*args, **kwargs) + slow_path = lambda group: group.apply( + lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis + ) + else: + fast_path = lambda group: func(group, *args, **kwargs) + slow_path = lambda group: group.apply( + lambda x: func(x, *args, **kwargs), axis=self.axis + ) + return fast_path, slow_path + + def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame): + path = slow_path + res = slow_path(group) + + if self.ngroups == 1: + # no need to evaluate multiple paths when only + # a single group exists + return path, res + + # if we make it here, test if we can use the fast path + try: + res_fast = fast_path(group) + except AssertionError: + raise # pragma: no cover + except Exception: + # GH#29631 For user-defined function, we can't predict what may be + # raised; see test_transform.test_transform_fastpath_raises + return path, res + + # verify fast path returns either: + # a DataFrame with columns equal to group.columns + # OR a Series with index equal to group.columns + if isinstance(res_fast, DataFrame): + if not res_fast.columns.equals(group.columns): + return path, res + elif isinstance(res_fast, Series): + if not res_fast.index.equals(group.columns): + return path, res + else: + return path, res + + if res_fast.equals(res): + path = fast_path + + return path, res + + def filter(self, func, dropna: bool = True, *args, **kwargs): + """ + Filter elements from groups that don't satisfy a criterion. + + Elements from groups are filtered if they do not satisfy the + boolean criterion specified by func. + + Parameters + ---------- + func : function + Criterion to apply to each group. Should return True or False. + dropna : bool + Drop groups that do not pass the filter. True by default; if False, + groups that evaluate False are filled with NaNs. + + Returns + ------- + DataFrame + + Notes + ----- + Each subframe is endowed the attribute 'name' in case you need to know + which group you are working on. + + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + Examples + -------- + >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + ... 'foo', 'bar'], + ... 'B' : [1, 2, 3, 4, 5, 6], + ... 'C' : [2.0, 5., 8., 1., 2., 9.]}) + >>> grouped = df.groupby('A') + >>> grouped.filter(lambda x: x['B'].mean() > 3.) + A B C + 1 bar 2 5.0 + 3 bar 4 1.0 + 5 bar 6 9.0 + """ + indices = [] + + obj = self._selected_obj + gen = self.grouper.get_iterator(obj, axis=self.axis) + + for name, group in gen: + # 2023-02-27 no tests are broken this pinning, but it is documented in the + # docstring above. 
+ object.__setattr__(group, "name", name) + + res = func(group, *args, **kwargs) + + try: + res = res.squeeze() + except AttributeError: # allow e.g., scalars and frames to pass + pass + + # interpret the result of the filter + if is_bool(res) or (is_scalar(res) and isna(res)): + if notna(res) and res: + indices.append(self._get_index(name)) + else: + # non scalars aren't allowed + raise TypeError( + f"filter function returned a {type(res).__name__}, " + "but expected a scalar bool" + ) + + return self._apply_filter(indices, dropna) + + def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy: + if self.axis == 1: + # GH 37725 + raise ValueError("Cannot subset columns when using axis=1") + # per GH 23566 + if isinstance(key, tuple) and len(key) > 1: + # if len == 1, then it becomes a SeriesGroupBy and this is actually + # valid syntax, so don't raise + raise ValueError( + "Cannot subset columns with a tuple with more than one element. " + "Use a list instead." + ) + return super().__getitem__(key) + + def _gotitem(self, key, ndim: int, subset=None): + """ + sub-classes to define + return a sliced object + + Parameters + ---------- + key : string / list of selections + ndim : {1, 2} + requested ndim of result + subset : object, default None + subset to act on + """ + if ndim == 2: + if subset is None: + subset = self.obj + return DataFrameGroupBy( + subset, + self.keys, + axis=self.axis, + level=self.level, + grouper=self.grouper, + exclusions=self.exclusions, + selection=key, + as_index=self.as_index, + sort=self.sort, + group_keys=self.group_keys, + observed=self.observed, + dropna=self.dropna, + ) + elif ndim == 1: + if subset is None: + subset = self.obj[key] + return SeriesGroupBy( + subset, + self.keys, + level=self.level, + grouper=self.grouper, + exclusions=self.exclusions, + selection=key, + as_index=self.as_index, + sort=self.sort, + group_keys=self.group_keys, + observed=self.observed, + dropna=self.dropna, + ) + + raise AssertionError("invalid ndim for _gotitem") + + def _get_data_to_aggregate( + self, *, numeric_only: bool = False, name: str | None = None + ) -> Manager2D: + obj = self._obj_with_exclusions + if self.axis == 1: + mgr = obj.T._mgr + else: + mgr = obj._mgr + + if numeric_only: + mgr = mgr.get_numeric_data(copy=False) + return mgr + + def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: + return self.obj._constructor_from_mgr(mgr, axes=mgr.axes) + + def _apply_to_column_groupbys(self, func) -> DataFrame: + from pandas.core.reshape.concat import concat + + obj = self._obj_with_exclusions + columns = obj.columns + sgbs = [ + SeriesGroupBy( + obj.iloc[:, i], + selection=colname, + grouper=self.grouper, + exclusions=self.exclusions, + observed=self.observed, + ) + for i, colname in enumerate(obj.columns) + ] + results = [func(sgb) for sgb in sgbs] + + if not len(results): + # concat would raise + res_df = DataFrame([], columns=columns, index=self.grouper.result_index) + else: + res_df = concat(results, keys=columns, axis=1) + + if not self.as_index: + res_df.index = default_index(len(res_df)) + res_df = self._insert_inaxis_grouper(res_df) + return res_df + + def nunique(self, dropna: bool = True) -> DataFrame: + """ + Return DataFrame with counts of unique elements in each position. + + Parameters + ---------- + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + nunique: DataFrame + + Examples + -------- + >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', + ... 'ham', 'ham'], + ... 
'value1': [1, 5, 5, 2, 5, 5], + ... 'value2': list('abbaxy')}) + >>> df + id value1 value2 + 0 spam 1 a + 1 egg 5 b + 2 egg 5 b + 3 spam 2 a + 4 ham 5 x + 5 ham 5 y + + >>> df.groupby('id').nunique() + value1 value2 + id + egg 1 1 + ham 1 2 + spam 2 1 + + Check for rows with the same id but conflicting values: + + >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any()) + id value1 value2 + 0 spam 1 a + 3 spam 2 a + 4 ham 5 x + 5 ham 5 y + """ + + if self.axis != 0: + # see test_groupby_crash_on_nunique + return self._python_apply_general( + lambda sgb: sgb.nunique(dropna), self._obj_with_exclusions, is_agg=True + ) + + return self._apply_to_column_groupbys(lambda sgb: sgb.nunique(dropna)) + + def idxmax( + self, + axis: Axis | None | lib.NoDefault = lib.no_default, + skipna: bool = True, + numeric_only: bool = False, + ) -> DataFrame: + """ + Return index of first occurrence of maximum over requested axis. + + NA/null values are excluded. + + Parameters + ---------- + axis : {{0 or 'index', 1 or 'columns'}}, default None + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + If axis is not provided, grouper's axis is used. + + .. versionchanged:: 2.0.0 + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series + Indexes of maxima along the specified axis. + + Raises + ------ + ValueError + * If the row/column is empty + + See Also + -------- + Series.idxmax : Return index of the maximum element. + + Notes + ----- + This method is the DataFrame version of ``ndarray.argmax``. + + Examples + -------- + Consider a dataset containing food consumption in Argentina. + + >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], + ... 'co2_emissions': [37.2, 19.66, 1712]}, + ... index=['Pork', 'Wheat Products', 'Beef']) + + >>> df + consumption co2_emissions + Pork 10.51 37.20 + Wheat Products 103.11 19.66 + Beef 55.48 1712.00 + + By default, it returns the index for the maximum value in each column. + + >>> df.idxmax() + consumption Wheat Products + co2_emissions Beef + dtype: object + + To return the index for the maximum value in each row, use ``axis="columns"``. + + >>> df.idxmax(axis="columns") + Pork co2_emissions + Wheat Products consumption + Beef co2_emissions + dtype: object + """ + return self._idxmax_idxmin( + "idxmax", axis=axis, numeric_only=numeric_only, skipna=skipna + ) + + def idxmin( + self, + axis: Axis | None | lib.NoDefault = lib.no_default, + skipna: bool = True, + numeric_only: bool = False, + ) -> DataFrame: + """ + Return index of first occurrence of minimum over requested axis. + + NA/null values are excluded. + + Parameters + ---------- + axis : {{0 or 'index', 1 or 'columns'}}, default None + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + If axis is not provided, grouper's axis is used. + + .. versionchanged:: 2.0.0 + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. 
versionadded:: 1.5.0 + + Returns + ------- + Series + Indexes of minima along the specified axis. + + Raises + ------ + ValueError + * If the row/column is empty + + See Also + -------- + Series.idxmin : Return index of the minimum element. + + Notes + ----- + This method is the DataFrame version of ``ndarray.argmin``. + + Examples + -------- + Consider a dataset containing food consumption in Argentina. + + >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], + ... 'co2_emissions': [37.2, 19.66, 1712]}, + ... index=['Pork', 'Wheat Products', 'Beef']) + + >>> df + consumption co2_emissions + Pork 10.51 37.20 + Wheat Products 103.11 19.66 + Beef 55.48 1712.00 + + By default, it returns the index for the minimum value in each column. + + >>> df.idxmin() + consumption Pork + co2_emissions Wheat Products + dtype: object + + To return the index for the minimum value in each row, use ``axis="columns"``. + + >>> df.idxmin(axis="columns") + Pork consumption + Wheat Products co2_emissions + Beef consumption + dtype: object + """ + return self._idxmax_idxmin( + "idxmin", axis=axis, numeric_only=numeric_only, skipna=skipna + ) + + boxplot = boxplot_frame_groupby + + def value_counts( + self, + subset: Sequence[Hashable] | None = None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ) -> DataFrame | Series: + """ + Return a Series or DataFrame containing counts of unique rows. + + .. versionadded:: 1.4.0 + + Parameters + ---------- + subset : list-like, optional + Columns to use when counting unique combinations. + normalize : bool, default False + Return proportions rather than frequencies. + sort : bool, default True + Sort by frequencies. + ascending : bool, default False + Sort in ascending order. + dropna : bool, default True + Don't include counts of rows that contain NA values. + + Returns + ------- + Series or DataFrame + Series if the groupby as_index is True, otherwise DataFrame. + + See Also + -------- + Series.value_counts: Equivalent method on Series. + DataFrame.value_counts: Equivalent method on DataFrame. + SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy. + + Notes + ----- + - If the groupby as_index is True then the returned Series will have a + MultiIndex with one level per input column. + - If the groupby as_index is False then the returned DataFrame will have an + additional column with the value_counts. The column is labelled 'count' or + 'proportion', depending on the ``normalize`` parameter. + + By default, rows that contain any NA values are omitted from + the result. + + By default, the result will be in descending order so that the + first element of each group is the most frequently-occurring row. + + Examples + -------- + >>> df = pd.DataFrame({ + ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], + ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], + ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] + ... 
}) + + >>> df + gender education country + 0 male low US + 1 male medium FR + 2 female high US + 3 male low FR + 4 female high FR + 5 male low FR + + >>> df.groupby('gender').value_counts() + gender education country + female high FR 1 + US 1 + male low FR 2 + US 1 + medium FR 1 + Name: count, dtype: int64 + + >>> df.groupby('gender').value_counts(ascending=True) + gender education country + female high FR 1 + US 1 + male low US 1 + medium FR 1 + low FR 2 + Name: count, dtype: int64 + + >>> df.groupby('gender').value_counts(normalize=True) + gender education country + female high FR 0.50 + US 0.50 + male low FR 0.50 + US 0.25 + medium FR 0.25 + Name: proportion, dtype: float64 + + >>> df.groupby('gender', as_index=False).value_counts() + gender education country count + 0 female high FR 1 + 1 female high US 1 + 2 male low FR 2 + 3 male low US 1 + 4 male medium FR 1 + + >>> df.groupby('gender', as_index=False).value_counts(normalize=True) + gender education country proportion + 0 female high FR 0.50 + 1 female high US 0.50 + 2 male low FR 0.50 + 3 male low US 0.25 + 4 male medium FR 0.25 + """ + return self._value_counts(subset, normalize, sort, ascending, dropna) + + def fillna( + self, + value: Hashable | Mapping | Series | DataFrame | None = None, + method: FillnaOptions | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, + inplace: bool = False, + limit: int | None = None, + downcast=lib.no_default, + ) -> DataFrame | None: + """ + Fill NA/NaN values using the specified method within groups. + + Parameters + ---------- + value : scalar, dict, Series, or DataFrame + Value to use to fill holes (e.g. 0), alternately a + dict/Series/DataFrame of values specifying which value to use for + each index (for a Series) or column (for a DataFrame). Values not + in the dict/Series/DataFrame will not be filled. This value cannot + be a list. Users wanting to use the ``value`` argument and not ``method`` + should prefer :meth:`.DataFrame.fillna` as this + will produce the same result and be more performant. + method : {{'bfill', 'ffill', None}}, default None + Method to use for filling holes. ``'ffill'`` will propagate + the last valid observation forward within a group. + ``'bfill'`` will use next valid observation to fill the gap. + axis : {0 or 'index', 1 or 'columns'} + Axis along which to fill missing values. When the :class:`DataFrameGroupBy` + ``axis`` argument is ``0``, using ``axis=1`` here will produce + the same results as :meth:`.DataFrame.fillna`. When the + :class:`DataFrameGroupBy` ``axis`` argument is ``1``, using ``axis=0`` + or ``axis=1`` here will produce the same results. + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + inplace : bool, default False + Broken. Do not set to True. + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill within a group. In other words, + if there is a gap with more than this number of consecutive NaNs, + it will only be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + downcast : dict, default is None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). + + .. 
deprecated:: 2.1.0 + + Returns + ------- + DataFrame + Object with missing values filled. + + See Also + -------- + ffill : Forward fill values within a group. + bfill : Backward fill values within a group. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "key": [0, 0, 1, 1, 1], + ... "A": [np.nan, 2, np.nan, 3, np.nan], + ... "B": [2, 3, np.nan, np.nan, np.nan], + ... "C": [np.nan, np.nan, 2, np.nan, np.nan], + ... } + ... ) + >>> df + key A B C + 0 0 NaN 2.0 NaN + 1 0 2.0 3.0 NaN + 2 1 NaN NaN 2.0 + 3 1 3.0 NaN NaN + 4 1 NaN NaN NaN + + Propagate non-null values forward or backward within each group along columns. + + >>> df.groupby("key").fillna(method="ffill") + A B C + 0 NaN 2.0 NaN + 1 2.0 3.0 NaN + 2 NaN NaN 2.0 + 3 3.0 NaN 2.0 + 4 3.0 NaN 2.0 + + >>> df.groupby("key").fillna(method="bfill") + A B C + 0 2.0 2.0 NaN + 1 2.0 3.0 NaN + 2 3.0 NaN 2.0 + 3 3.0 NaN NaN + 4 NaN NaN NaN + + Propagate non-null values forward or backward within each group along rows. + + >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna(method="ffill").T + key A B C + 0 0.0 0.0 2.0 2.0 + 1 0.0 2.0 3.0 3.0 + 2 1.0 1.0 NaN 2.0 + 3 1.0 3.0 NaN NaN + 4 1.0 1.0 NaN NaN + + >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna(method="bfill").T + key A B C + 0 0.0 NaN 2.0 NaN + 1 0.0 2.0 3.0 NaN + 2 1.0 NaN 2.0 2.0 + 3 1.0 3.0 NaN NaN + 4 1.0 NaN NaN NaN + + Only replace the first NaN element within a group along rows. + + >>> df.groupby("key").fillna(method="ffill", limit=1) + A B C + 0 NaN 2.0 NaN + 1 2.0 3.0 NaN + 2 NaN NaN 2.0 + 3 3.0 NaN 2.0 + 4 3.0 NaN NaN + """ + if method is not None: + warnings.warn( + f"{type(self).__name__}.fillna with 'method' is deprecated and " + "will raise in a future version. Use obj.ffill() or obj.bfill() " + "instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + result = self._op_via_apply( + "fillna", + value=value, + method=method, + axis=axis, + inplace=inplace, + limit=limit, + downcast=downcast, + ) + return result + + def take( + self, + indices: TakeIndexer, + axis: Axis | None | lib.NoDefault = lib.no_default, + **kwargs, + ) -> DataFrame: + """ + Return the elements in the given *positional* indices in each group. + + This means that we are not indexing according to actual values in + the index attribute of the object. We are indexing according to the + actual position of the element in the object. + + If a requested index does not exist for some group, this method will raise. + To get similar behavior that ignores indices that don't exist, see + :meth:`.DataFrameGroupBy.nth`. + + Parameters + ---------- + indices : array-like + An array of ints indicating which positions to take. + axis : {0 or 'index', 1 or 'columns', None}, default 0 + The axis on which to select elements. ``0`` means that we are + selecting rows, ``1`` means that we are selecting columns. + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + **kwargs + For compatibility with :meth:`numpy.take`. Has no effect on the + output. + + Returns + ------- + DataFrame + An DataFrame containing the elements taken from each group. + + See Also + -------- + DataFrame.take : Take elements from a Series along an axis. + DataFrame.loc : Select a subset of a DataFrame by labels. + DataFrame.iloc : Select a subset of a DataFrame by positions. + numpy.take : Take elements from an array along an axis. + + Examples + -------- + >>> df = pd.DataFrame([('falcon', 'bird', 389.0), + ... ('parrot', 'bird', 24.0), + ... 
('lion', 'mammal', 80.5), + ... ('monkey', 'mammal', np.nan), + ... ('rabbit', 'mammal', 15.0)], + ... columns=['name', 'class', 'max_speed'], + ... index=[4, 3, 2, 1, 0]) + >>> df + name class max_speed + 4 falcon bird 389.0 + 3 parrot bird 24.0 + 2 lion mammal 80.5 + 1 monkey mammal NaN + 0 rabbit mammal 15.0 + >>> gb = df.groupby([1, 1, 2, 2, 2]) + + Take elements at positions 0 and 1 along the axis 0 (default). + + Note how the indices selected in the result do not correspond to + our input indices 0 and 1. That's because we are selecting the 0th + and 1st rows, not rows whose indices equal 0 and 1. + + >>> gb.take([0, 1]) + name class max_speed + 1 4 falcon bird 389.0 + 3 parrot bird 24.0 + 2 2 lion mammal 80.5 + 1 monkey mammal NaN + + The order of the specified indices influences the order in the result. + Here, the order is swapped from the previous example. + + >>> gb.take([1, 0]) + name class max_speed + 1 3 parrot bird 24.0 + 4 falcon bird 389.0 + 2 1 monkey mammal NaN + 2 lion mammal 80.5 + + Take elements at indices 1 and 2 along the axis 1 (column selection). + + We may take elements using negative integers for positive indices, + starting from the end of the object, just like with Python lists. + + >>> gb.take([-1, -2]) + name class max_speed + 1 3 parrot bird 24.0 + 4 falcon bird 389.0 + 2 0 rabbit mammal 15.0 + 1 monkey mammal NaN + """ + result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs) + return result + + def skew( + self, + axis: Axis | None | lib.NoDefault = lib.no_default, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ) -> DataFrame: + """ + Return unbiased skew within groups. + + Normalized by N-1. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns', None}, default 0 + Axis for the function to be applied on. + + Specifying ``axis=None`` will apply the aggregation across both axes. + + .. versionadded:: 2.0.0 + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + skipna : bool, default True + Exclude NA/null values when computing the result. + + numeric_only : bool, default False + Include only float, int, boolean columns. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + DataFrame + + See Also + -------- + DataFrame.skew : Return unbiased skew over requested axis. + + Examples + -------- + >>> arrays = [['falcon', 'parrot', 'cockatoo', 'kiwi', + ... 'lion', 'monkey', 'rabbit'], + ... ['bird', 'bird', 'bird', 'bird', + ... 'mammal', 'mammal', 'mammal']] + >>> index = pd.MultiIndex.from_arrays(arrays, names=('name', 'class')) + >>> df = pd.DataFrame({'max_speed': [389.0, 24.0, 70.0, np.nan, + ... 80.5, 21.5, 15.0]}, + ... index=index) + >>> df + max_speed + name class + falcon bird 389.0 + parrot bird 24.0 + cockatoo bird 70.0 + kiwi bird NaN + lion mammal 80.5 + monkey mammal 21.5 + rabbit mammal 15.0 + >>> gb = df.groupby(["class"]) + >>> gb.skew() + max_speed + class + bird 1.628296 + mammal 1.669046 + >>> gb.skew(skipna=False) + max_speed + class + bird NaN + mammal 1.669046 + """ + if axis is lib.no_default: + axis = 0 + + if axis != 0: + result = self._op_via_apply( + "skew", + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + **kwargs, + ) + return result + + def alt(obj): + # This should not be reached since the cython path should raise + # TypeError and not NotImplementedError. 
+ raise TypeError(f"'skew' is not supported for dtype={obj.dtype}") + + return self._cython_agg_general( + "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs + ) + + @property + @doc(DataFrame.plot.__doc__) + def plot(self) -> GroupByPlot: + result = GroupByPlot(self) + return result + + @doc(DataFrame.corr.__doc__) + def corr( + self, + method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson", + min_periods: int = 1, + numeric_only: bool = False, + ) -> DataFrame: + result = self._op_via_apply( + "corr", method=method, min_periods=min_periods, numeric_only=numeric_only + ) + return result + + @doc(DataFrame.cov.__doc__) + def cov( + self, + min_periods: int | None = None, + ddof: int | None = 1, + numeric_only: bool = False, + ) -> DataFrame: + result = self._op_via_apply( + "cov", min_periods=min_periods, ddof=ddof, numeric_only=numeric_only + ) + return result + + @doc(DataFrame.hist.__doc__) + def hist( + self, + column: IndexLabel | None = None, + by=None, + grid: bool = True, + xlabelsize: int | None = None, + xrot: float | None = None, + ylabelsize: int | None = None, + yrot: float | None = None, + ax=None, + sharex: bool = False, + sharey: bool = False, + figsize: tuple[int, int] | None = None, + layout: tuple[int, int] | None = None, + bins: int | Sequence[int] = 10, + backend: str | None = None, + legend: bool = False, + **kwargs, + ): + result = self._op_via_apply( + "hist", + column=column, + by=by, + grid=grid, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + ax=ax, + sharex=sharex, + sharey=sharey, + figsize=figsize, + layout=layout, + bins=bins, + backend=backend, + legend=legend, + **kwargs, + ) + return result + + @property + @doc(DataFrame.dtypes.__doc__) + def dtypes(self) -> Series: + # GH#51045 + warnings.warn( + f"{type(self).__name__}.dtypes is deprecated and will be removed in " + "a future version. 
Check the dtypes on the base object instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + + # error: Incompatible return value type (got "DataFrame", expected "Series") + return self._python_apply_general( # type: ignore[return-value] + lambda df: df.dtypes, self._selected_obj + ) + + @doc(DataFrame.corrwith.__doc__) + def corrwith( + self, + other: DataFrame | Series, + axis: Axis | lib.NoDefault = lib.no_default, + drop: bool = False, + method: CorrelationMethod = "pearson", + numeric_only: bool = False, + ) -> DataFrame: + result = self._op_via_apply( + "corrwith", + other=other, + axis=axis, + drop=drop, + method=method, + numeric_only=numeric_only, + ) + return result + + +def _wrap_transform_general_frame( + obj: DataFrame, group: DataFrame, res: DataFrame | Series +) -> DataFrame: + from pandas import concat + + if isinstance(res, Series): + # we need to broadcast across the + # other dimension; this will preserve dtypes + # GH14457 + if res.index.is_(obj.index): + res_frame = concat([res] * len(group.columns), axis=1) + res_frame.columns = group.columns + res_frame.index = group.index + else: + res_frame = obj._constructor( + np.tile(res.values, (len(group.index), 1)), + columns=group.columns, + index=group.index, + ) + assert isinstance(res_frame, DataFrame) + return res_frame + elif isinstance(res, DataFrame) and not res.index.is_(group.index): + return res._align_frame(group)[0] + else: + return res diff --git a/pandas/_core/groupby/groupby.py b/pandas/_core/groupby/groupby.py new file mode 100644 index 0000000000000..67f0a6e0abc9b --- /dev/null +++ b/pandas/_core/groupby/groupby.py @@ -0,0 +1,5951 @@ +""" +Provide the groupby split-apply-combine paradigm. Define the GroupBy +class providing the base-class of operations. + +The SeriesGroupBy and DataFrameGroupBy sub-class +(defined in pandas._core.groupby.generic) +expose these user-facing objects to provide specific functionality. 
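+
+A minimal illustration of the split-apply-combine flow these classes expose,
+using only stable public API (shown here purely for orientation):
+
+>>> import pandas as pd
+>>> df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})
+>>> df.groupby("key")["val"].sum()  # split on "key", apply sum, combine
+key
+a    3
+b    3
+Name: val, dtype: int64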
+""" +from __future__ import annotations + +from collections.abc import ( + Hashable, + Iterator, + Mapping, + Sequence, +) +import datetime +from functools import ( + partial, + wraps, +) +import inspect +from textwrap import dedent +from typing import ( + TYPE_CHECKING, + Callable, + Literal, + TypeVar, + Union, + cast, + final, +) +import warnings + +import numpy as np + +from pandas._config.config import option_context + +from pandas._libs import ( + Timestamp, + lib, +) +from pandas._libs.algos import rank_1d +import pandas._libs.groupby as libgroupby +from pandas._libs.missing import NA +from pandas._typing import ( + AnyArrayLike, + ArrayLike, + Axis, + AxisInt, + DtypeObj, + FillnaOptions, + IndexLabel, + NDFrameT, + PositionalIndexer, + RandomState, + Scalar, + T, + npt, +) +from pandas.compat.numpy import function as nv +from pandas.errors import ( + AbstractMethodError, + DataError, +) +from pandas.util._decorators import ( + Appender, + Substitution, + cache_readonly, + doc, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.cast import ( + coerce_indexer_dtype, + ensure_dtype_can_hold_na, +) +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_float_dtype, + is_hashable, + is_integer, + is_integer_dtype, + is_list_like, + is_numeric_dtype, + is_object_dtype, + is_scalar, + needs_i8_conversion, +) +from pandas.core.dtypes.missing import ( + isna, + na_value_for_dtype, + notna, +) + +from pandas._core.groupby import ( + base, + numba_, + ops, +) +from pandas._core.groupby.grouper import get_grouper +from pandas._core.groupby.indexing import ( + GroupByIndexingMixin, + GroupByNthSelector, +) +from pandas.core import ( + algorithms, + sample, +) +from pandas.core._numba import executor +from pandas.core.apply import warn_alias_replacement +from pandas.core.arrays import ( + ArrowExtensionArray, + BaseMaskedArray, + Categorical, + ExtensionArray, + FloatingArray, + IntegerArray, + SparseArray, +) +from pandas.core.arrays.string_ import StringDtype +from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, +) +from pandas.core.base import ( + PandasObject, + SelectionMixin, +) +import pandas.core.common as com +from pandas.core.frame import DataFrame +from pandas.core.generic import NDFrame +from pandas.core.indexes.api import ( + CategoricalIndex, + Index, + MultiIndex, + RangeIndex, + default_index, +) +from pandas.core.internals.blocks import ensure_block_shape +from pandas.core.series import Series +from pandas.core.sorting import get_group_index_sorter +from pandas.core.util.numba_ import ( + get_jit_arguments, + maybe_use_numba, +) + +if TYPE_CHECKING: + from typing import Any + + from pandas.core.resample import Resampler + from pandas.core.window import ( + ExpandingGroupby, + ExponentialMovingWindowGroupby, + RollingGroupby, + ) + +_common_see_also = """ + See Also + -------- + Series.%(name)s : Apply a function %(name)s to a Series. + DataFrame.%(name)s : Apply a function %(name)s + to each row or column of a DataFrame. +""" + +_apply_docs = { + "template": """ + Apply function ``func`` group-wise and combine the results together. + + The function passed to ``apply`` must take a {input} as its first + argument and return a DataFrame, Series or scalar. ``apply`` will + then take care of combining the results back together into a single + dataframe or series. ``apply`` is therefore a highly flexible + grouping method. 
+ + While ``apply`` is a very flexible method, its downside is that + using it can be quite a bit slower than using more specific methods + like ``agg`` or ``transform``. Pandas offers a wide range of method that will + be much faster than using ``apply`` for their specific purposes, so try to + use them before reaching for ``apply``. + + Parameters + ---------- + func : callable + A callable that takes a {input} as its first argument, and + returns a dataframe, a series or a scalar. In addition the + callable may take positional and keyword arguments. + include_groups : bool, default True + When True, will attempt to apply ``func`` to the groupings in + the case that they are columns of the DataFrame. If this raises a + TypeError, the result will be computed with the groupings excluded. + When False, the groupings will be excluded when applying ``func``. + + .. versionadded:: 2.2.0 + + .. deprecated:: 2.2.0 + + Setting include_groups to True is deprecated. Only the value + False will be allowed in a future version of pandas. + + args, kwargs : tuple and dict + Optional positional and keyword arguments to pass to ``func``. + + Returns + ------- + Series or DataFrame + + See Also + -------- + pipe : Apply function to the full GroupBy object instead of to each + group. + aggregate : Apply aggregate function to the GroupBy object. + transform : Apply function column-by-column to the GroupBy object. + Series.apply : Apply a function to a Series. + DataFrame.apply : Apply a function to each row or column of a DataFrame. + + Notes + ----- + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. + + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + Examples + -------- + {examples} + """, + "dataframe_examples": """ + >>> df = pd.DataFrame({'A': 'a a b'.split(), + ... 'B': [1,2,3], + ... 'C': [4,6,5]}) + >>> g1 = df.groupby('A', group_keys=False) + >>> g2 = df.groupby('A', group_keys=True) + + Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only + differ in their ``group_keys`` argument. Calling `apply` in various ways, + we can get different grouping results: + + Example 1: below the function passed to `apply` takes a DataFrame as + its argument and returns a DataFrame. `apply` combines the result for + each group together into a new DataFrame: + + >>> g1[['B', 'C']].apply(lambda x: x / x.sum()) + B C + 0 0.333333 0.4 + 1 0.666667 0.6 + 2 1.000000 1.0 + + In the above, the groups are not part of the index. We can have them included + by using ``g2`` where ``group_keys=True``: + + >>> g2[['B', 'C']].apply(lambda x: x / x.sum()) + B C + A + a 0 0.333333 0.4 + 1 0.666667 0.6 + b 2 1.000000 1.0 + + Example 2: The function passed to `apply` takes a DataFrame as + its argument and returns a Series. `apply` combines the result for + each group together into a new DataFrame. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``. + + >>> g1[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min()) + B C + A + a 1.0 2.0 + b 0.0 0.0 + + >>> g2[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min()) + B C + A + a 1.0 2.0 + b 0.0 0.0 + + The ``group_keys`` argument has no effect here because the result is not + like-indexed (i.e. :ref:`a transform `) when compared + to the input. 
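+
+    The ``include_groups`` argument controls whether the grouping column ``A``
+    is part of what ``func`` sees. A small illustration (with the current
+    default of ``include_groups=True`` the same call would see three columns
+    and emit a deprecation warning):
+
+    >>> g2.apply(lambda x: x.shape[1], include_groups=False)
+    A
+    a    2
+    b    2
+    dtype: int64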
+ + Example 3: The function passed to `apply` takes a DataFrame as + its argument and returns a scalar. `apply` combines the result for + each group together into a Series, including setting the index as + appropriate: + + >>> g1.apply(lambda x: x.C.max() - x.B.min(), include_groups=False) + A + a 5 + b 2 + dtype: int64""", + "series_examples": """ + >>> s = pd.Series([0, 1, 2], index='a a b'.split()) + >>> g1 = s.groupby(s.index, group_keys=False) + >>> g2 = s.groupby(s.index, group_keys=True) + + From ``s`` above we can see that ``g`` has two groups, ``a`` and ``b``. + Notice that ``g1`` have ``g2`` have two groups, ``a`` and ``b``, and only + differ in their ``group_keys`` argument. Calling `apply` in various ways, + we can get different grouping results: + + Example 1: The function passed to `apply` takes a Series as + its argument and returns a Series. `apply` combines the result for + each group together into a new Series. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``. + + >>> g1.apply(lambda x: x*2 if x.name == 'a' else x/2) + a 0.0 + a 2.0 + b 1.0 + dtype: float64 + + In the above, the groups are not part of the index. We can have them included + by using ``g2`` where ``group_keys=True``: + + >>> g2.apply(lambda x: x*2 if x.name == 'a' else x/2) + a a 0.0 + a 2.0 + b b 1.0 + dtype: float64 + + Example 2: The function passed to `apply` takes a Series as + its argument and returns a scalar. `apply` combines the result for + each group together into a Series, including setting the index as + appropriate: + + >>> g1.apply(lambda x: x.max() - x.min()) + a 1 + b 0 + dtype: int64 + + The ``group_keys`` argument has no effect here because the result is not + like-indexed (i.e. :ref:`a transform `) when compared + to the input. + + >>> g2.apply(lambda x: x.max() - x.min()) + a 1 + b 0 + dtype: int64""", +} + +_groupby_agg_method_template = """ +Compute {fname} of group values. + +Parameters +---------- +numeric_only : bool, default {no} + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. + +min_count : int, default {mc} + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + +Returns +------- +Series or DataFrame + Computed {fname} of values within each group. + +Examples +-------- +{example} +""" + +_groupby_agg_method_engine_template = """ +Compute {fname} of group values. + +Parameters +---------- +numeric_only : bool, default {no} + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. + +min_count : int, default {mc} + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + +engine : str, default None {e} + * ``'cython'`` : Runs rolling apply through C-extensions from cython. + * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. + Only available when ``raw`` is set to ``True``. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` + +engine_kwargs : dict, default None {ek} + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. 
The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be + applied to both the ``func`` and the ``apply`` groupby aggregation. + +Returns +------- +Series or DataFrame + Computed {fname} of values within each group. + +Examples +-------- +{example} +""" + +_pipe_template = """ +Apply a ``func`` with arguments to this %(klass)s object and return its result. + +Use `.pipe` when you want to improve readability by chaining together +functions that expect Series, DataFrames, GroupBy or Resampler objects. +Instead of writing + +>>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c) # doctest: +SKIP + +You can write + +>>> (df.groupby('group') +... .pipe(f) +... .pipe(g, arg1=a) +... .pipe(h, arg2=b, arg3=c)) # doctest: +SKIP + +which is much more readable. + +Parameters +---------- +func : callable or tuple of (callable, str) + Function to apply to this %(klass)s object or, alternatively, + a `(callable, data_keyword)` tuple where `data_keyword` is a + string indicating the keyword of `callable` that expects the + %(klass)s object. +args : iterable, optional + Positional arguments passed into `func`. +kwargs : dict, optional + A dictionary of keyword arguments passed into `func`. + +Returns +------- +the return type of `func`. + +See Also +-------- +Series.pipe : Apply a function with arguments to a series. +DataFrame.pipe: Apply a function with arguments to a dataframe. +apply : Apply function to each group instead of to the + full %(klass)s object. + +Notes +----- +See more `here +`_ + +Examples +-------- +%(examples)s +""" + +_transform_template = """ +Call function producing a same-indexed %(klass)s on each group. + +Returns a %(klass)s having the same indexes as the original object +filled with the transformed values. + +Parameters +---------- +f : function, str + Function to apply to each group. See the Notes section below for requirements. + + Accepted inputs are: + + - String + - Python function + - Numba JIT function with ``engine='numba'`` specified. + + Only passing a single function is supported with this engine. + If the ``'numba'`` engine is chosen, the function must be + a user defined function with ``values`` and ``index`` as the + first and second arguments respectively in the function signature. + Each group's index will be passed to the user defined function + and optionally available for use. + + If a string is chosen, then it needs to be the name + of the groupby method you want to use. +*args + Positional arguments to pass to func. +engine : str, default None + * ``'cython'`` : Runs the function through C-extensions from cython. + * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba`` + +engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to the function + +**kwargs + Keyword arguments to be passed into func. + +Returns +------- +%(klass)s + +See Also +-------- +%(klass)s.groupby.apply : Apply function ``func`` group-wise and combine + the results together. 
+%(klass)s.groupby.aggregate : Aggregate using one or more + operations over the specified axis. +%(klass)s.transform : Call ``func`` on self producing a %(klass)s with the + same axis shape as self. + +Notes +----- +Each group is endowed the attribute 'name' in case you need to know +which group you are working on. + +The current implementation imposes three requirements on f: + +* f must return a value that either has the same shape as the input + subframe or can be broadcast to the shape of the input subframe. + For example, if `f` returns a scalar it will be broadcast to have the + same shape as the input subframe. +* if this is a DataFrame, f must support application column-by-column + in the subframe. If f also supports application to the entire subframe, + then a fast path is used starting from the second chunk. +* f must not mutate groups. Mutation is not supported and may + produce unexpected results. See :ref:`gotchas.udf-mutation` for more details. + +When using ``engine='numba'``, there will be no "fall back" behavior internally. +The group data and group index will be passed as numpy arrays to the JITed +user defined function, and no alternative execution attempts will be tried. + +.. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. + +.. versionchanged:: 2.0.0 + + When using ``.transform`` on a grouped DataFrame and the transformation function + returns a DataFrame, pandas now aligns the result's index + with the input's index. You can call ``.to_numpy()`` on the + result of the transformation function to avoid alignment. + +Examples +-------- +%(example)s""" + +_agg_template_series = """ +Aggregate using one or more operations over the specified axis. + +Parameters +---------- +func : function, str, list, dict or None + Function to use for aggregating the data. If a function, must either + work when passed a {klass} or when passed to {klass}.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - None, in which case ``**kwargs`` are used with Named Aggregation. Here the + output has one column for each element in ``**kwargs``. The name of the + column is keyword, whereas the value determines the aggregation used to compute + the values in the column. + + Can also accept a Numba JIT function with + ``engine='numba'`` specified. Only passing a single function is supported + with this engine. + + If the ``'numba'`` engine is chosen, the function must be + a user defined function with ``values`` and ``index`` as the + first and second arguments respectively in the function signature. + Each group's index will be passed to the user defined function + and optionally available for use. + + .. deprecated:: 2.1.0 + + Passing a dictionary is deprecated and will raise in a future version + of pandas. Pass a list of aggregations instead. +*args + Positional arguments to pass to func. +engine : str, default None + * ``'cython'`` : Runs the function through C-extensions from cython. + * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` + +engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. 
The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be + applied to the function + +**kwargs + * If ``func`` is None, ``**kwargs`` are used to define the output names and + aggregations via Named Aggregation. See ``func`` entry. + * Otherwise, keyword arguments to be passed into func. + +Returns +------- +{klass} + +See Also +-------- +{klass}.groupby.apply : Apply function func group-wise + and combine the results together. +{klass}.groupby.transform : Transforms the Series on each group + based on the given function. +{klass}.aggregate : Aggregate using one or more + operations over the specified axis. + +Notes +----- +When using ``engine='numba'``, there will be no "fall back" behavior internally. +The group data and group index will be passed as numpy arrays to the JITed +user defined function, and no alternative execution attempts will be tried. + +Functions that mutate the passed object can produce unexpected +behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` +for more details. + +.. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. +{examples}""" + +_agg_template_frame = """ +Aggregate using one or more operations over the specified axis. + +Parameters +---------- +func : function, str, list, dict or None + Function to use for aggregating the data. If a function, must either + work when passed a {klass} or when passed to {klass}.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - dict of axis labels -> functions, function names or list of such. + - None, in which case ``**kwargs`` are used with Named Aggregation. Here the + output has one column for each element in ``**kwargs``. The name of the + column is keyword, whereas the value determines the aggregation used to compute + the values in the column. + + Can also accept a Numba JIT function with + ``engine='numba'`` specified. Only passing a single function is supported + with this engine. + + If the ``'numba'`` engine is chosen, the function must be + a user defined function with ``values`` and ``index`` as the + first and second arguments respectively in the function signature. + Each group's index will be passed to the user defined function + and optionally available for use. + +*args + Positional arguments to pass to func. +engine : str, default None + * ``'cython'`` : Runs the function through C-extensions from cython. + * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` + +engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be + applied to the function + +**kwargs + * If ``func`` is None, ``**kwargs`` are used to define the output names and + aggregations via Named Aggregation. See ``func`` entry. + * Otherwise, keyword arguments to be passed into func. + +Returns +------- +{klass} + +See Also +-------- +{klass}.groupby.apply : Apply function func group-wise + and combine the results together. 
+{klass}.groupby.transform : Transforms the Series on each group + based on the given function. +{klass}.aggregate : Aggregate using one or more + operations over the specified axis. + +Notes +----- +When using ``engine='numba'``, there will be no "fall back" behavior internally. +The group data and group index will be passed as numpy arrays to the JITed +user defined function, and no alternative execution attempts will be tried. + +Functions that mutate the passed object can produce unexpected +behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` +for more details. + +.. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. +{examples}""" + + +@final +class GroupByPlot(PandasObject): + """ + Class implementing the .plot attribute for groupby objects. + """ + + def __init__(self, groupby: GroupBy) -> None: + self._groupby = groupby + + def __call__(self, *args, **kwargs): + def f(self): + return self.plot(*args, **kwargs) + + f.__name__ = "plot" + return self._groupby._python_apply_general(f, self._groupby._selected_obj) + + def __getattr__(self, name: str): + def attr(*args, **kwargs): + def f(self): + return getattr(self.plot, name)(*args, **kwargs) + + return self._groupby._python_apply_general(f, self._groupby._selected_obj) + + return attr + + +_KeysArgType = Union[ + Hashable, + list[Hashable], + Callable[[Hashable], Hashable], + list[Callable[[Hashable], Hashable]], + Mapping[Hashable, Hashable], +] + + +class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): + _hidden_attrs = PandasObject._hidden_attrs | { + "as_index", + "axis", + "dropna", + "exclusions", + "grouper", + "group_keys", + "keys", + "level", + "obj", + "observed", + "sort", + } + + axis: AxisInt + grouper: ops.BaseGrouper + keys: _KeysArgType | None = None + level: IndexLabel | None = None + group_keys: bool + + @final + def __len__(self) -> int: + return len(self.groups) + + @final + def __repr__(self) -> str: + # TODO: Better repr for GroupBy object + return object.__repr__(self) + + @final + @property + def groups(self) -> dict[Hashable, np.ndarray]: + """ + Dict {group name -> group labels}. + + Examples + -------- + + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([1, 2, 3], index=lst) + >>> ser + a 1 + a 2 + b 3 + dtype: int64 + >>> ser.groupby(level=0).groups + {'a': ['a', 'a'], 'b': ['b']} + + For DataFrameGroupBy: + + >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"]) + >>> df + a b c + 0 1 2 3 + 1 1 5 6 + 2 7 8 9 + >>> df.groupby(by=["a"]).groups + {1: [0, 1], 7: [2]} + + For Resampler: + + >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> ser.resample('MS').groups + {Timestamp('2023-01-01 00:00:00'): 2, Timestamp('2023-02-01 00:00:00'): 4} + """ + return self.grouper.groups + + @final + @property + def ngroups(self) -> int: + return self.grouper.ngroups + + @final + @property + def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: + """ + Dict {group name -> group indices}. 
+ + Examples + -------- + + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([1, 2, 3], index=lst) + >>> ser + a 1 + a 2 + b 3 + dtype: int64 + >>> ser.groupby(level=0).indices + {'a': array([0, 1]), 'b': array([2])} + + For DataFrameGroupBy: + + >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["owl", "toucan", "eagle"]) + >>> df + a b c + owl 1 2 3 + toucan 1 5 6 + eagle 7 8 9 + >>> df.groupby(by=["a"]).indices + {1: array([0, 1]), 7: array([2])} + + For Resampler: + + >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> ser.resample('MS').indices + defaultdict(, {Timestamp('2023-01-01 00:00:00'): [0, 1], + Timestamp('2023-02-01 00:00:00'): [2, 3]}) + """ + return self.grouper.indices + + @final + def _get_indices(self, names): + """ + Safe get multiple indices, translate keys for + datelike to underlying repr. + """ + + def get_converter(s): + # possibly convert to the actual key types + # in the indices, could be a Timestamp or a np.datetime64 + if isinstance(s, datetime.datetime): + return lambda key: Timestamp(key) + elif isinstance(s, np.datetime64): + return lambda key: Timestamp(key).asm8 + else: + return lambda key: key + + if len(names) == 0: + return [] + + if len(self.indices) > 0: + index_sample = next(iter(self.indices)) + else: + index_sample = None # Dummy sample + + name_sample = names[0] + if isinstance(index_sample, tuple): + if not isinstance(name_sample, tuple): + msg = "must supply a tuple to get_group with multiple grouping keys" + raise ValueError(msg) + if not len(name_sample) == len(index_sample): + try: + # If the original grouper was a tuple + return [self.indices[name] for name in names] + except KeyError as err: + # turns out it wasn't a tuple + msg = ( + "must supply a same-length tuple to get_group " + "with multiple grouping keys" + ) + raise ValueError(msg) from err + + converters = [get_converter(s) for s in index_sample] + names = (tuple(f(n) for f, n in zip(converters, name)) for name in names) + + else: + converter = get_converter(index_sample) + names = (converter(name) for name in names) + + return [self.indices.get(name, []) for name in names] + + @final + def _get_index(self, name): + """ + Safe get index, translate keys for datelike to underlying repr. + """ + return self._get_indices([name])[0] + + @final + @cache_readonly + def _selected_obj(self): + # Note: _selected_obj is always just `self.obj` for SeriesGroupBy + if isinstance(self.obj, Series): + return self.obj + + if self._selection is not None: + if is_hashable(self._selection): + # i.e. a single key, so selecting it will return a Series. + # In this case, _obj_with_exclusions would wrap the key + # in a list and return a single-column DataFrame. + return self.obj[self._selection] + + # Otherwise _selection is equivalent to _selection_list, so + # _selected_obj matches _obj_with_exclusions, so we can reuse + # that and avoid making a copy. 
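+            # Illustration (editorial): for df.groupby("key")["a"], _selection is
+            # the hashable "a" and _selected_obj is the Series df["a"]; for
+            # df.groupby("key")[["a"]], _selection is the list ["a"] and this
+            # branch returns the same single-column frame _obj_with_exclusions
+            # would produce, so no copy is needed.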
+ return self._obj_with_exclusions + + return self.obj + + @final + def _dir_additions(self) -> set[str]: + return self.obj._dir_additions() + + @Substitution( + klass="GroupBy", + examples=dedent( + """\ + >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]}) + >>> df + A B + 0 a 1 + 1 b 2 + 2 a 3 + 3 b 4 + + To get the difference between each groups maximum and minimum value in one + pass, you can do + + >>> df.groupby('A').pipe(lambda x: x.max() - x.min()) + B + A + a 2 + b 2""" + ), + ) + @Appender(_pipe_template) + def pipe( + self, + func: Callable[..., T] | tuple[Callable[..., T], str], + *args, + **kwargs, + ) -> T: + return com.pipe(self, func, *args, **kwargs) + + @final + def get_group(self, name, obj=None) -> DataFrame | Series: + """ + Construct DataFrame from group with provided name. + + Parameters + ---------- + name : object + The name of the group to get as a DataFrame. + obj : DataFrame, default None + The DataFrame to take the DataFrame out of. If + it is None, the object groupby was called on will + be used. + + .. deprecated:: 2.1.0 + The obj is deprecated and will be removed in a future version. + Do ``df.iloc[gb.indices.get(name)]`` + instead of ``gb.get_group(name, obj=df)``. + + Returns + ------- + same type as obj + + Examples + -------- + + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([1, 2, 3], index=lst) + >>> ser + a 1 + a 2 + b 3 + dtype: int64 + >>> ser.groupby(level=0).get_group("a") + a 1 + a 2 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["owl", "toucan", "eagle"]) + >>> df + a b c + owl 1 2 3 + toucan 1 5 6 + eagle 7 8 9 + >>> df.groupby(by=["a"]).get_group((1,)) + a b c + owl 1 2 3 + toucan 1 5 6 + + For Resampler: + + >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> ser.resample('MS').get_group('2023-01-01') + 2023-01-01 1 + 2023-01-15 2 + dtype: int64 + """ + keys = self.keys + level = self.level + # mypy doesn't recognize level/keys as being sized when passed to len + if (is_list_like(level) and len(level) == 1) or ( # type: ignore[arg-type] + is_list_like(keys) and len(keys) == 1 # type: ignore[arg-type] + ): + # GH#25971 + if isinstance(name, tuple) and len(name) == 1: + # Allow users to pass tuples of length 1 to silence warning + name = name[0] + elif not isinstance(name, tuple): + warnings.warn( + "When grouping with a length-1 list-like, " + "you will need to pass a length-1 tuple to get_group in a future " + "version of pandas. Pass `(name,)` instead of `name` to silence " + "this warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + inds = self._get_index(name) + if not len(inds): + raise KeyError(name) + + if obj is None: + indexer = inds if self.axis == 0 else (slice(None), inds) + return self._selected_obj.iloc[indexer] + else: + warnings.warn( + "obj is deprecated and will be removed in a future version. " + "Do ``df.iloc[gb.indices.get(name)]`` " + "instead of ``gb.get_group(name, obj=df)``.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return obj._take_with_is_copy(inds, axis=self.axis) + + @final + def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: + """ + Groupby iterator. 
+ + Returns + ------- + Generator yielding sequence of (name, subsetted object) + for each group + + Examples + -------- + + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([1, 2, 3], index=lst) + >>> ser + a 1 + a 2 + b 3 + dtype: int64 + >>> for x, y in ser.groupby(level=0): + ... print(f'{x}\\n{y}\\n') + a + a 1 + a 2 + dtype: int64 + b + b 3 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"]) + >>> df + a b c + 0 1 2 3 + 1 1 5 6 + 2 7 8 9 + >>> for x, y in df.groupby(by=["a"]): + ... print(f'{x}\\n{y}\\n') + (1,) + a b c + 0 1 2 3 + 1 1 5 6 + (7,) + a b c + 2 7 8 9 + + For Resampler: + + >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> for x, y in ser.resample('MS'): + ... print(f'{x}\\n{y}\\n') + 2023-01-01 00:00:00 + 2023-01-01 1 + 2023-01-15 2 + dtype: int64 + 2023-02-01 00:00:00 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + """ + keys = self.keys + level = self.level + result = self.grouper.get_iterator(self._selected_obj, axis=self.axis) + # error: Argument 1 to "len" has incompatible type "Hashable"; expected "Sized" + if is_list_like(level) and len(level) == 1: # type: ignore[arg-type] + # GH 51583 + warnings.warn( + "Creating a Groupby object with a length-1 list-like " + "level parameter will yield indexes as tuples in a future version. " + "To keep indexes as scalars, create Groupby objects with " + "a scalar level parameter instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if isinstance(keys, list) and len(keys) == 1: + # GH#42795 - when keys is a list, return tuples even when length is 1 + result = (((key,), group) for key, group in result) + return result + + +# To track operations that expand dimensions, like ohlc +OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame) + + +class GroupBy(BaseGroupBy[NDFrameT]): + """ + Class for grouping and aggregating relational data. + + See aggregate, transform, and apply functions on this object. + + It's easiest to use obj.groupby(...) to use GroupBy, but you can also do: + + :: + + grouped = groupby(obj, ...) + + Parameters + ---------- + obj : pandas object + axis : int, default 0 + level : int, default None + Level of MultiIndex + groupings : list of Grouping objects + Most users should ignore this + exclusions : array-like, optional + List of columns to exclude + name : str + Most users should ignore this + + Returns + ------- + **Attributes** + groups : dict + {group name -> group labels} + len(grouped) : int + Number of groups + + Notes + ----- + After grouping, see aggregate, apply, and transform functions. Here are + some other brief notes about usage. When grouping by multiple groups, the + result index will be a MultiIndex (hierarchical) by default. + + Iteration produces (key, group) tuples, i.e. chunking the data by group. So + you can write code like: + + :: + + grouped = obj.groupby(keys, axis=axis) + for key, group in grouped: + # do something with the data + + Function calls on GroupBy, if not specially implemented, "dispatch" to the + grouped data. So if you group a DataFrame and wish to invoke the std() + method on each group, you can simply do: + + :: + + df.groupby(mapper).std() + + rather than + + :: + + df.groupby(mapper).aggregate(np.std) + + You can pass arguments to these "wrapped" functions, too. 
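+
+    For example, forwarding a keyword argument to one of them:
+
+    ::
+
+        df.groupby(mapper).std(ddof=0)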
+ + See the online documentation for full exposition on these topics and much + more + """ + + grouper: ops.BaseGrouper + as_index: bool + + @final + def __init__( + self, + obj: NDFrameT, + keys: _KeysArgType | None = None, + axis: Axis = 0, + level: IndexLabel | None = None, + grouper: ops.BaseGrouper | None = None, + exclusions: frozenset[Hashable] | None = None, + selection: IndexLabel | None = None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + observed: bool | lib.NoDefault = lib.no_default, + dropna: bool = True, + ) -> None: + self._selection = selection + + assert isinstance(obj, NDFrame), type(obj) + + self.level = level + + if not as_index: + if axis != 0: + raise ValueError("as_index=False only valid for axis=0") + + self.as_index = as_index + self.keys = keys + self.sort = sort + self.group_keys = group_keys + self.dropna = dropna + + if grouper is None: + grouper, exclusions, obj = get_grouper( + obj, + keys, + axis=axis, + level=level, + sort=sort, + observed=False if observed is lib.no_default else observed, + dropna=self.dropna, + ) + + if observed is lib.no_default: + if any(ping._passed_categorical for ping in grouper.groupings): + warnings.warn( + "The default of observed=False is deprecated and will be changed " + "to True in a future version of pandas. Pass observed=False to " + "retain current behavior or observed=True to adopt the future " + "default and silence this warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + observed = False + self.observed = observed + + self.obj = obj + self.axis = obj._get_axis_number(axis) + self.grouper = grouper + self.exclusions = frozenset(exclusions) if exclusions else frozenset() + + def __getattr__(self, attr: str): + if attr in self._internal_names_set: + return object.__getattribute__(self, attr) + if attr in self.obj: + return self[attr] + + raise AttributeError( + f"'{type(self).__name__}' object has no attribute '{attr}'" + ) + + @final + def _deprecate_axis(self, axis: int, name: str) -> None: + if axis == 1: + warnings.warn( + f"{type(self).__name__}.{name} with axis=1 is deprecated and " + "will be removed in a future version. Operate on the un-grouped " + "DataFrame instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + warnings.warn( + f"The 'axis' keyword in {type(self).__name__}.{name} is deprecated " + "and will be removed in a future version. 
" + "Call without passing 'axis' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + @final + def _op_via_apply(self, name: str, *args, **kwargs): + """Compute the result of an operation by using GroupBy's apply.""" + f = getattr(type(self._obj_with_exclusions), name) + sig = inspect.signature(f) + + if "axis" in kwargs and kwargs["axis"] is not lib.no_default: + axis = self.obj._get_axis_number(kwargs["axis"]) + self._deprecate_axis(axis, name) + elif "axis" in kwargs: + # exclude skew here because that was already defaulting to lib.no_default + # before this deprecation was instituted + if name == "skew": + pass + elif name == "fillna": + # maintain the behavior from before the deprecation + kwargs["axis"] = None + else: + kwargs["axis"] = 0 + + # a little trickery for aggregation functions that need an axis + # argument + if "axis" in sig.parameters: + if kwargs.get("axis", None) is None or kwargs.get("axis") is lib.no_default: + kwargs["axis"] = self.axis + + def curried(x): + return f(x, *args, **kwargs) + + # preserve the name so we can detect it when calling plot methods, + # to avoid duplicates + curried.__name__ = name + + # special case otherwise extra plots are created when catching the + # exception below + if name in base.plotting_methods: + return self._python_apply_general(curried, self._selected_obj) + + is_transform = name in base.transformation_kernels + result = self._python_apply_general( + curried, + self._obj_with_exclusions, + is_transform=is_transform, + not_indexed_same=not is_transform, + ) + + if self.grouper.has_dropped_na and is_transform: + # result will have dropped rows due to nans, fill with null + # and ensure index is ordered same as the input + result = self._set_result_index_ordered(result) + return result + + # ----------------------------------------------------------------- + # Dispatch/Wrapping + + @final + def _concat_objects( + self, + values, + not_indexed_same: bool = False, + is_transform: bool = False, + ): + from pandas.core.reshape.concat import concat + + if self.group_keys and not is_transform: + if self.as_index: + # possible MI return case + group_keys = self.grouper.result_index + group_levels = self.grouper.levels + group_names = self.grouper.names + + result = concat( + values, + axis=self.axis, + keys=group_keys, + levels=group_levels, + names=group_names, + sort=False, + ) + else: + # GH5610, returns a MI, with the first level being a + # range index + keys = list(range(len(values))) + result = concat(values, axis=self.axis, keys=keys) + + elif not not_indexed_same: + result = concat(values, axis=self.axis) + + ax = self._selected_obj._get_axis(self.axis) + if self.dropna: + labels = self.grouper.group_info[0] + mask = labels != -1 + ax = ax[mask] + + # this is a very unfortunate situation + # we can't use reindex to restore the original order + # when the ax has duplicates + # so we resort to this + # GH 14776, 30667 + # TODO: can we reuse e.g. _reindex_non_unique? + if ax.has_duplicates and not result.axes[self.axis].equals(ax): + # e.g. 
test_category_order_transformer + target = algorithms.unique1d(ax._values) + indexer, _ = result.index.get_indexer_non_unique(target) + result = result.take(indexer, axis=self.axis) + else: + result = result.reindex(ax, axis=self.axis, copy=False) + + else: + result = concat(values, axis=self.axis) + + if self.obj.ndim == 1: + name = self.obj.name + elif is_hashable(self._selection): + name = self._selection + else: + name = None + + if isinstance(result, Series) and name is not None: + result.name = name + + return result + + @final + def _set_result_index_ordered( + self, result: OutputFrameOrSeries + ) -> OutputFrameOrSeries: + # set the result index on the passed values object and + # return the new object, xref 8046 + + obj_axis = self.obj._get_axis(self.axis) + + if self.grouper.is_monotonic and not self.grouper.has_dropped_na: + # shortcut if we have an already ordered grouper + result = result.set_axis(obj_axis, axis=self.axis, copy=False) + return result + + # row order is scrambled => sort the rows by position in original index + original_positions = Index(self.grouper.result_ilocs()) + result = result.set_axis(original_positions, axis=self.axis, copy=False) + result = result.sort_index(axis=self.axis) + if self.grouper.has_dropped_na: + # Add back in any missing rows due to dropna - index here is integral + # with values referring to the row of the input so can use RangeIndex + result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis) + result = result.set_axis(obj_axis, axis=self.axis, copy=False) + + return result + + @final + def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame: + if isinstance(result, Series): + result = result.to_frame() + + # zip in reverse so we can always insert at loc 0 + columns = result.columns + for name, lev, in_axis in zip( + reversed(self.grouper.names), + reversed(self.grouper.get_group_levels()), + reversed([grp.in_axis for grp in self.grouper.groupings]), + ): + # GH #28549 + # When using .apply(-), name will be in columns already + if name not in columns: + if in_axis: + result.insert(0, name, lev) + else: + msg = ( + "A grouping was used that is not in the columns of the " + "DataFrame and so was excluded from the result. This grouping " + "will be included in a future version of pandas. Add the " + "grouping as a column of the DataFrame to silence this warning." + ) + warnings.warn( + message=msg, + category=FutureWarning, + stacklevel=find_stack_level(), + ) + + return result + + @final + def _maybe_transpose_result(self, result: NDFrameT) -> NDFrameT: + if self.axis == 1: + # Only relevant for DataFrameGroupBy, no-op for SeriesGroupBy + result = result.T + if result.index.equals(self.obj.index): + # Retain e.g. DatetimeIndex/TimedeltaIndex freq + # e.g. test_groupby_crash_on_nunique + result.index = self.obj.index.copy() + return result + + @final + def _wrap_aggregated_output( + self, + result: Series | DataFrame, + qs: npt.NDArray[np.float64] | None = None, + ): + """ + Wraps the output of GroupBy aggregations into the expected result. 
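+
+        In user-facing terms: with the default ``as_index=True`` the group keys
+        become the result's (possibly Multi-) index, while ``as_index=False``
+        re-inserts them as columns and uses a default integer index, e.g.
+
+        ::
+
+            df.groupby("a").sum()                  # keys form the index
+            df.groupby("a", as_index=False).sum()  # keys come back as column "a"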
+ + Parameters + ---------- + result : Series, DataFrame + + Returns + ------- + Series or DataFrame + """ + # ATM we do not get here for SeriesGroupBy; when we do, we will + # need to require that result.name already match self.obj.name + + if not self.as_index: + # `not self.as_index` is only relevant for DataFrameGroupBy, + # enforced in __init__ + result = self._insert_inaxis_grouper(result) + result = result._consolidate() + index = Index(range(self.grouper.ngroups)) + + else: + index = self.grouper.result_index + + if qs is not None: + # We get here with len(qs) != 1 and not self.as_index + # in test_pass_args_kwargs + index = _insert_quantile_level(index, qs) + + result.index = index + + # error: Argument 1 to "_maybe_transpose_result" of "GroupBy" has + # incompatible type "Union[Series, DataFrame]"; expected "NDFrameT" + res = self._maybe_transpose_result(result) # type: ignore[arg-type] + return self._reindex_output(res, qs=qs) + + def _wrap_applied_output( + self, + data, + values: list, + not_indexed_same: bool = False, + is_transform: bool = False, + ): + raise AbstractMethodError(self) + + # ----------------------------------------------------------------- + # numba + + @final + def _numba_prep(self, data: DataFrame): + ids, _, ngroups = self.grouper.group_info + sorted_index = self.grouper._sort_idx + sorted_ids = self.grouper._sorted_ids + + sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() + # GH 46867 + index_data = data.index + if isinstance(index_data, MultiIndex): + if len(self.grouper.groupings) > 1: + raise NotImplementedError( + "Grouping with more than 1 grouping labels and " + "a MultiIndex is not supported with engine='numba'" + ) + group_key = self.grouper.groupings[0].name + index_data = index_data.get_level_values(group_key) + sorted_index_data = index_data.take(sorted_index).to_numpy() + + starts, ends = lib.generate_slices(sorted_ids, ngroups) + return ( + starts, + ends, + sorted_index_data, + sorted_data, + ) + + def _numba_agg_general( + self, + func: Callable, + dtype_mapping: dict[np.dtype, Any], + engine_kwargs: dict[str, bool] | None, + **aggregator_kwargs, + ): + """ + Perform groupby with a standard numerical aggregation function (e.g. mean) + with Numba. + """ + if not self.as_index: + raise NotImplementedError( + "as_index=False is not supported. Use .reset_index() instead." + ) + if self.axis == 1: + raise NotImplementedError("axis=1 is not supported.") + + data = self._obj_with_exclusions + df = data if data.ndim == 2 else data.to_frame() + + aggregator = executor.generate_shared_aggregator( + func, + dtype_mapping, + True, # is_grouped_kernel + **get_jit_arguments(engine_kwargs), + ) + # Pass group ids to kernel directly if it can handle it + # (This is faster since it doesn't require a sort) + ids, _, _ = self.grouper.group_info + ngroups = self.grouper.ngroups + + res_mgr = df._mgr.apply( + aggregator, labels=ids, ngroups=ngroups, **aggregator_kwargs + ) + res_mgr.axes[1] = self.grouper.result_index + result = df._constructor_from_mgr(res_mgr, axes=res_mgr.axes) + + if data.ndim == 1: + result = result.squeeze("columns") + result.name = data.name + else: + result.columns = data.columns + return result + + @final + def _transform_with_numba(self, func, *args, engine_kwargs=None, **kwargs): + """ + Perform groupby transform routine with the numba engine. 
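+
+        The user-facing entry point for this path looks like the following
+        (requires the optional ``numba`` dependency; as described in the
+        transform template, the UDF must take ``values`` and ``index`` as its
+        first two arguments)::
+
+            df.groupby("key").transform(
+                lambda values, index: values + 1, engine="numba"
+            )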
+ + This routine mimics the data splitting routine of the DataSplitter class + to generate the indices of each group in the sorted data and then passes the + data and indices into a Numba jitted function. + """ + data = self._obj_with_exclusions + df = data if data.ndim == 2 else data.to_frame() + + starts, ends, sorted_index, sorted_data = self._numba_prep(df) + numba_.validate_udf(func) + numba_transform_func = numba_.generate_numba_transform_func( + func, **get_jit_arguments(engine_kwargs, kwargs) + ) + result = numba_transform_func( + sorted_data, + sorted_index, + starts, + ends, + len(df.columns), + *args, + ) + # result values needs to be resorted to their original positions since we + # evaluated the data sorted by group + result = result.take(np.argsort(sorted_index), axis=0) + index = data.index + if data.ndim == 1: + result_kwargs = {"name": data.name} + result = result.ravel() + else: + result_kwargs = {"columns": data.columns} + return data._constructor(result, index=index, **result_kwargs) + + @final + def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): + """ + Perform groupby aggregation routine with the numba engine. + + This routine mimics the data splitting routine of the DataSplitter class + to generate the indices of each group in the sorted data and then passes the + data and indices into a Numba jitted function. + """ + data = self._obj_with_exclusions + df = data if data.ndim == 2 else data.to_frame() + + starts, ends, sorted_index, sorted_data = self._numba_prep(df) + numba_.validate_udf(func) + numba_agg_func = numba_.generate_numba_agg_func( + func, **get_jit_arguments(engine_kwargs, kwargs) + ) + result = numba_agg_func( + sorted_data, + sorted_index, + starts, + ends, + len(df.columns), + *args, + ) + index = self.grouper.result_index + if data.ndim == 1: + result_kwargs = {"name": data.name} + result = result.ravel() + else: + result_kwargs = {"columns": data.columns} + res = data._constructor(result, index=index, **result_kwargs) + if not self.as_index: + res = self._insert_inaxis_grouper(res) + res.index = default_index(len(res)) + return res + + # ----------------------------------------------------------------- + # apply/agg/transform + + @Appender( + _apply_docs["template"].format( + input="dataframe", examples=_apply_docs["dataframe_examples"] + ) + ) + def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: + orig_func = func + func = com.is_builtin_func(func) + if orig_func != func: + alias = com._builtin_table_alias[orig_func] + warn_alias_replacement(self, orig_func, alias) + + if isinstance(func, str): + if hasattr(self, func): + res = getattr(self, func) + if callable(res): + return res(*args, **kwargs) + elif args or kwargs: + raise ValueError(f"Cannot pass arguments to property {func}") + return res + + else: + raise TypeError(f"apply func should be callable, not '{func}'") + + elif args or kwargs: + if callable(func): + + @wraps(func) + def f(g): + return func(g, *args, **kwargs) + + else: + raise ValueError( + "func must be a callable if args or kwargs are supplied" + ) + else: + f = func + + if not include_groups: + return self._python_apply_general(f, self._obj_with_exclusions) + + # ignore SettingWithCopy here in case the user mutates + with option_context("mode.chained_assignment", None): + try: + result = self._python_apply_general(f, self._selected_obj) + if ( + not isinstance(self.obj, Series) + and self._selection is None + and self._selected_obj.shape != self._obj_with_exclusions.shape + 
): + warnings.warn( + message=_apply_groupings_depr.format( + type(self).__name__, "apply" + ), + category=FutureWarning, + stacklevel=find_stack_level(), + ) + except TypeError: + # gh-20949 + # try again, with .apply acting as a filtering + # operation, by excluding the grouping column + # This would normally not be triggered + # except if the udf is trying an operation that + # fails on *some* columns, e.g. a numeric operation + # on a string grouper column + + return self._python_apply_general(f, self._obj_with_exclusions) + + return result + + @final + def _python_apply_general( + self, + f: Callable, + data: DataFrame | Series, + not_indexed_same: bool | None = None, + is_transform: bool = False, + is_agg: bool = False, + ) -> NDFrameT: + """ + Apply function f in python space + + Parameters + ---------- + f : callable + Function to apply + data : Series or DataFrame + Data to apply f to + not_indexed_same: bool, optional + When specified, overrides the value of not_indexed_same. Apply behaves + differently when the result index is equal to the input index, but + this can be coincidental leading to value-dependent behavior. + is_transform : bool, default False + Indicator for whether the function is actually a transform + and should not have group keys prepended. + is_agg : bool, default False + Indicator for whether the function is an aggregation. When the + result is empty, we don't want to warn for this case. + See _GroupBy._python_agg_general. + + Returns + ------- + Series or DataFrame + data after applying f + """ + values, mutated = self.grouper.apply_groupwise(f, data, self.axis) + if not_indexed_same is None: + not_indexed_same = mutated + + return self._wrap_applied_output( + data, + values, + not_indexed_same, + is_transform, + ) + + @final + def _agg_general( + self, + numeric_only: bool = False, + min_count: int = -1, + *, + alias: str, + npfunc: Callable | None = None, + **kwargs, + ): + result = self._cython_agg_general( + how=alias, + alt=npfunc, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + return result.__finalize__(self.obj, method="groupby") + + def _agg_py_fallback( + self, how: str, values: ArrayLike, ndim: int, alt: Callable + ) -> ArrayLike: + """ + Fallback to pure-python aggregation if _cython_operation raises + NotImplementedError. + """ + # We get here with a) EADtypes and b) object dtype + assert alt is not None + + if values.ndim == 1: + # For DataFrameGroupBy we only get here with ExtensionArray + ser = Series(values, copy=False) + else: + # We only get here with values.dtype == object + df = DataFrame(values.T, dtype=values.dtype) + # bc we split object blocks in grouped_reduce, we have only 1 col + # otherwise we'd have to worry about block-splitting GH#39329 + assert df.shape[1] == 1 + # Avoid call to self.values that can occur in DataFrame + # reductions; see GH#28949 + ser = df.iloc[:, 0] + + # We do not get here with UDFs, so we know that our dtype + # should always be preserved by the implemented aggregations + # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype? 
+ try: + res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True) + except Exception as err: + msg = f"agg function failed [how->{how},dtype->{ser.dtype}]" + # preserve the kind of exception that raised + raise type(err)(msg) from err + + if ser.dtype == object: + res_values = res_values.astype(object, copy=False) + + # If we are DataFrameGroupBy and went through a SeriesGroupByPath + # then we need to reshape + # GH#32223 includes case with IntegerArray values, ndarray res_values + # test_groupby_duplicate_columns with object dtype values + return ensure_block_shape(res_values, ndim=ndim) + + @final + def _cython_agg_general( + self, + how: str, + alt: Callable | None = None, + numeric_only: bool = False, + min_count: int = -1, + **kwargs, + ): + # Note: we never get here with how="ohlc" for DataFrameGroupBy; + # that goes through SeriesGroupBy + + data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how) + + def array_func(values: ArrayLike) -> ArrayLike: + try: + result = self.grouper._cython_operation( + "aggregate", + values, + how, + axis=data.ndim - 1, + min_count=min_count, + **kwargs, + ) + except NotImplementedError: + # generally if we have numeric_only=False + # and non-applicable functions + # try to python agg + # TODO: shouldn't min_count matter? + # TODO: avoid special casing SparseArray here + if how in ["any", "all"] and isinstance(values, SparseArray): + pass + elif alt is None or how in ["any", "all", "std", "sem"]: + raise # TODO: re-raise as TypeError? should not be reached + else: + return result + + assert alt is not None + result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt) + return result + + new_mgr = data.grouped_reduce(array_func) + res = self._wrap_agged_manager(new_mgr) + if how in ["idxmin", "idxmax"]: + res = self._wrap_idxmax_idxmin(res) + out = self._wrap_aggregated_output(res) + if self.axis == 1: + out = out.infer_objects(copy=False) + return out + + def _cython_transform( + self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs + ): + raise AbstractMethodError(self) + + @final + def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + # optimized transforms + orig_func = func + func = com.get_cython_func(func) or func + if orig_func != func: + warn_alias_replacement(self, orig_func, func) + + if not isinstance(func, str): + return self._transform_general(func, engine, engine_kwargs, *args, **kwargs) + + elif func not in base.transform_kernel_allowlist: + msg = f"'{func}' is not a valid function name for transform(name)" + raise ValueError(msg) + elif func in base.cythonized_kernels or func in base.transformation_kernels: + # cythonized transform or canned "agg+broadcast" + if engine is not None: + kwargs["engine"] = engine + kwargs["engine_kwargs"] = engine_kwargs + return getattr(self, func)(*args, **kwargs) + + else: + # i.e. func in base.reduction_kernels + + # GH#30918 Use _transform_fast only when we know func is an aggregation + # If func is a reduction, we need to broadcast the + # result to the whole group. Compute func result + # and deal with possible broadcasting below. 
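+            # Illustration (editorial): e.g. df.groupby("key")["x"].transform("sum")
+            # computes one sum per group and then broadcasts that value back to
+            # every row of its group, so the output stays aligned with the input.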
+ with com.temp_setattr(self, "as_index", True): + # GH#49834 - result needs groups in the index for + # _wrap_transform_fast_result + if func in ["idxmin", "idxmax"]: + func = cast(Literal["idxmin", "idxmax"], func) + result = self._idxmax_idxmin(func, True, *args, **kwargs) + else: + if engine is not None: + kwargs["engine"] = engine + kwargs["engine_kwargs"] = engine_kwargs + result = getattr(self, func)(*args, **kwargs) + + return self._wrap_transform_fast_result(result) + + @final + def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: + """ + Fast transform path for aggregations. + """ + obj = self._obj_with_exclusions + + # for each col, reshape to size of original frame by take operation + ids, _, _ = self.grouper.group_info + result = result.reindex(self.grouper.result_index, axis=self.axis, copy=False) + + if self.obj.ndim == 1: + # i.e. SeriesGroupBy + out = algorithms.take_nd(result._values, ids) + output = obj._constructor(out, index=obj.index, name=obj.name) + else: + # `.size()` gives Series output on DataFrame input, need axis 0 + axis = 0 if result.ndim == 1 else self.axis + # GH#46209 + # Don't convert indices: negative indices need to give rise + # to null values in the result + new_ax = result.axes[axis].take(ids) + output = result._reindex_with_indexers( + {axis: (new_ax, ids)}, allow_dups=True, copy=False + ) + output = output.set_axis(obj._get_axis(self.axis), axis=axis) + return output + + # ----------------------------------------------------------------- + # Utilities + + @final + def _apply_filter(self, indices, dropna): + if len(indices) == 0: + indices = np.array([], dtype="int64") + else: + indices = np.sort(np.concatenate(indices)) + if dropna: + filtered = self._selected_obj.take(indices, axis=self.axis) + else: + mask = np.empty(len(self._selected_obj.index), dtype=bool) + mask.fill(False) + mask[indices.astype(int)] = True + # mask fails to broadcast when passed to where; broadcast manually. + mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T + filtered = self._selected_obj.where(mask) # Fill with NaNs. + return filtered + + @final + def _cumcount_array(self, ascending: bool = True) -> np.ndarray: + """ + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from length of group - 1 to 0. 
+ + Notes + ----- + this is currently implementing sort=False + (though the default is sort=True) for groupby in general + """ + ids, _, ngroups = self.grouper.group_info + sorter = get_group_index_sorter(ids, ngroups) + ids, count = ids[sorter], len(ids) + + if count == 0: + return np.empty(0, dtype=np.int64) + + run = np.r_[True, ids[:-1] != ids[1:]] + rep = np.diff(np.r_[np.nonzero(run)[0], count]) + out = (~run).cumsum() + + if ascending: + out -= np.repeat(out[run], rep) + else: + out = np.repeat(out[np.r_[run[1:], True]], rep) - out + + if self.grouper.has_dropped_na: + out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False)) + else: + out = out.astype(np.int64, copy=False) + + rev = np.empty(count, dtype=np.intp) + rev[sorter] = np.arange(count, dtype=np.intp) + return out[rev] + + # ----------------------------------------------------------------- + + @final + @property + def _obj_1d_constructor(self) -> Callable: + # GH28330 preserve subclassed Series/DataFrames + if isinstance(self.obj, DataFrame): + return self.obj._constructor_sliced + assert isinstance(self.obj, Series) + return self.obj._constructor + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def any(self, skipna: bool = True) -> NDFrameT: + """ + Return True if any value in the group is truthful, else False. + + Parameters + ---------- + skipna : bool, default True + Flag to ignore nan values during truth testing. + + Returns + ------- + Series or DataFrame + DataFrame or Series of boolean values, where a value is True if any element + is True within its respective group, False otherwise. + %(see_also)s + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([1, 2, 0], index=lst) + >>> ser + a 1 + a 2 + b 0 + dtype: int64 + >>> ser.groupby(level=0).any() + a True + b False + dtype: bool + + For DataFrameGroupBy: + + >>> data = [[1, 0, 3], [1, 0, 6], [7, 1, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["ostrich", "penguin", "parrot"]) + >>> df + a b c + ostrich 1 0 3 + penguin 1 0 6 + parrot 7 1 9 + >>> df.groupby(by=["a"]).any() + b c + a + 1 False True + 7 True True + """ + return self._cython_agg_general( + "any", + alt=lambda x: Series(x).any(skipna=skipna), + skipna=skipna, + ) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def all(self, skipna: bool = True) -> NDFrameT: + """ + Return True if all values in the group are truthful, else False. + + Parameters + ---------- + skipna : bool, default True + Flag to ignore nan values during truth testing. + + Returns + ------- + Series or DataFrame + DataFrame or Series of boolean values, where a value is True if all elements + are True within its respective group, False otherwise. + %(see_also)s + Examples + -------- + + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([1, 2, 0], index=lst) + >>> ser + a 1 + a 2 + b 0 + dtype: int64 + >>> ser.groupby(level=0).all() + a True + b False + dtype: bool + + For DataFrameGroupBy: + + >>> data = [[1, 0, 3], [1, 5, 6], [7, 8, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... 
index=["ostrich", "penguin", "parrot"]) + >>> df + a b c + ostrich 1 0 3 + penguin 1 5 6 + parrot 7 8 9 + >>> df.groupby(by=["a"]).all() + b c + a + 1 False True + 7 True True + """ + return self._cython_agg_general( + "all", + alt=lambda x: Series(x).all(skipna=skipna), + skipna=skipna, + ) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def count(self) -> NDFrameT: + """ + Compute count of group, excluding missing values. + + Returns + ------- + Series or DataFrame + Count of values within each group. + %(see_also)s + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([1, 2, np.nan], index=lst) + >>> ser + a 1.0 + a 2.0 + b NaN + dtype: float64 + >>> ser.groupby(level=0).count() + a 2 + b 0 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, np.nan, 3], [1, np.nan, 6], [7, 8, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["cow", "horse", "bull"]) + >>> df + a b c + cow 1 NaN 3 + horse 1 NaN 6 + bull 7 8.0 9 + >>> df.groupby("a").count() + b c + a + 1 0 2 + 7 1 1 + + For Resampler: + + >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> ser.resample('MS').count() + 2023-01-01 2 + 2023-02-01 2 + Freq: MS, dtype: int64 + """ + data = self._get_data_to_aggregate() + ids, _, ngroups = self.grouper.group_info + mask = ids != -1 + + is_series = data.ndim == 1 + + def hfunc(bvalues: ArrayLike) -> ArrayLike: + # TODO(EA2D): reshape would not be necessary with 2D EAs + if bvalues.ndim == 1: + # EA + masked = mask & ~isna(bvalues).reshape(1, -1) + else: + masked = mask & ~isna(bvalues) + + counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups) + if isinstance(bvalues, BaseMaskedArray): + return IntegerArray( + counted[0], mask=np.zeros(counted.shape[1], dtype=np.bool_) + ) + elif isinstance(bvalues, ArrowExtensionArray) and not isinstance( + bvalues.dtype, StringDtype + ): + return type(bvalues)._from_sequence(counted[0]) + if is_series: + assert counted.ndim == 2 + assert counted.shape[0] == 1 + return counted[0] + return counted + + new_mgr = data.grouped_reduce(hfunc) + new_obj = self._wrap_agged_manager(new_mgr) + + # If we are grouping on categoricals we want unobserved categories to + # return zero, rather than the default of NaN which the reindexing in + # _wrap_aggregated_output() returns. GH 35028 + # e.g. test_dataframe_groupby_on_2_categoricals_when_observed_is_false + with com.temp_setattr(self, "observed", True): + result = self._wrap_aggregated_output(new_obj) + + return self._reindex_output(result, fill_value=0) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def mean( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + """ + Compute mean of groups, excluding missing values. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None`` and defaults to ``False``. + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. 
versionadded:: 1.4.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` + + .. versionadded:: 1.4.0 + + Returns + ------- + pandas.Series or pandas.DataFrame + %(see_also)s + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], + ... 'B': [np.nan, 2, 3, 4, 5], + ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C']) + + Groupby one column and return the mean of the remaining columns in + each group. + + >>> df.groupby('A').mean() + B C + A + 1 3.0 1.333333 + 2 4.0 1.500000 + + Groupby two columns and return the mean of the remaining column. + + >>> df.groupby(['A', 'B']).mean() + C + A B + 1 2.0 2.0 + 4.0 1.0 + 2 3.0 1.0 + 5.0 2.0 + + Groupby one column and return the mean of only particular column in + the group. + + >>> df.groupby('A')['B'].mean() + A + 1 3.0 + 2 4.0 + Name: B, dtype: float64 + """ + + if maybe_use_numba(engine): + from pandas.core._numba.kernels import grouped_mean + + return self._numba_agg_general( + grouped_mean, + executor.float_dtype_mapping, + engine_kwargs, + min_periods=0, + ) + else: + result = self._cython_agg_general( + "mean", + alt=lambda x: Series(x).mean(numeric_only=numeric_only), + numeric_only=numeric_only, + ) + return result.__finalize__(self.obj, method="groupby") + + @final + def median(self, numeric_only: bool = False) -> NDFrameT: + """ + Compute median of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None`` and defaults to False. + + Returns + ------- + Series or DataFrame + Median of values within each group. + + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) + >>> ser + a 7 + a 2 + a 8 + b 4 + b 3 + b 3 + dtype: int64 + >>> ser.groupby(level=0).median() + a 7.0 + b 3.0 + dtype: float64 + + For DataFrameGroupBy: + + >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} + >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', + ... 'mouse', 'mouse', 'mouse', 'mouse']) + >>> df + a b + dog 1 1 + dog 3 4 + dog 5 8 + mouse 7 4 + mouse 7 4 + mouse 8 2 + mouse 3 1 + >>> df.groupby(level=0).median() + a b + dog 3.0 4.0 + mouse 7.0 3.0 + + For Resampler: + + >>> ser = pd.Series([1, 2, 3, 3, 4, 5], + ... index=pd.DatetimeIndex(['2023-01-01', + ... '2023-01-10', + ... '2023-01-15', + ... '2023-02-01', + ... '2023-02-10', + ... '2023-02-15'])) + >>> ser.resample('MS').median() + 2023-01-01 2.0 + 2023-02-01 4.0 + Freq: MS, dtype: float64 + """ + result = self._cython_agg_general( + "median", + alt=lambda x: Series(x).median(numeric_only=numeric_only), + numeric_only=numeric_only, + ) + return result.__finalize__(self.obj, method="groupby") + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def std( + self, + ddof: int = 1, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + numeric_only: bool = False, + ): + """ + Compute standard deviation of groups, excluding missing values. 
+ + For multiple groupings, the result index will be a MultiIndex. + + Parameters + ---------- + ddof : int, default 1 + Degrees of freedom. + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.4.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` + + .. versionadded:: 1.4.0 + + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + + Returns + ------- + Series or DataFrame + Standard deviation of values within each group. + %(see_also)s + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) + >>> ser + a 7 + a 2 + a 8 + b 4 + b 3 + b 3 + dtype: int64 + >>> ser.groupby(level=0).std() + a 3.21455 + b 0.57735 + dtype: float64 + + For DataFrameGroupBy: + + >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} + >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', + ... 'mouse', 'mouse', 'mouse', 'mouse']) + >>> df + a b + dog 1 1 + dog 3 4 + dog 5 8 + mouse 7 4 + mouse 7 4 + mouse 8 2 + mouse 3 1 + >>> df.groupby(level=0).std() + a b + dog 2.000000 3.511885 + mouse 2.217356 1.500000 + """ + if maybe_use_numba(engine): + from pandas.core._numba.kernels import grouped_var + + return np.sqrt( + self._numba_agg_general( + grouped_var, + executor.float_dtype_mapping, + engine_kwargs, + min_periods=0, + ddof=ddof, + ) + ) + else: + return self._cython_agg_general( + "std", + alt=lambda x: Series(x).std(ddof=ddof), + numeric_only=numeric_only, + ddof=ddof, + ) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def var( + self, + ddof: int = 1, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + numeric_only: bool = False, + ): + """ + Compute variance of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex. + + Parameters + ---------- + ddof : int, default 1 + Degrees of freedom. + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.4.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` + + .. versionadded:: 1.4.0 + + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. 
+ + Returns + ------- + Series or DataFrame + Variance of values within each group. + %(see_also)s + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) + >>> ser + a 7 + a 2 + a 8 + b 4 + b 3 + b 3 + dtype: int64 + >>> ser.groupby(level=0).var() + a 10.333333 + b 0.333333 + dtype: float64 + + For DataFrameGroupBy: + + >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} + >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', + ... 'mouse', 'mouse', 'mouse', 'mouse']) + >>> df + a b + dog 1 1 + dog 3 4 + dog 5 8 + mouse 7 4 + mouse 7 4 + mouse 8 2 + mouse 3 1 + >>> df.groupby(level=0).var() + a b + dog 4.000000 12.333333 + mouse 4.916667 2.250000 + """ + if maybe_use_numba(engine): + from pandas.core._numba.kernels import grouped_var + + return self._numba_agg_general( + grouped_var, + executor.float_dtype_mapping, + engine_kwargs, + min_periods=0, + ddof=ddof, + ) + else: + return self._cython_agg_general( + "var", + alt=lambda x: Series(x).var(ddof=ddof), + numeric_only=numeric_only, + ddof=ddof, + ) + + @final + def _value_counts( + self, + subset: Sequence[Hashable] | None = None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ) -> DataFrame | Series: + """ + Shared implementation of value_counts for SeriesGroupBy and DataFrameGroupBy. + + SeriesGroupBy additionally supports a bins argument. See the docstring of + DataFrameGroupBy.value_counts for a description of arguments. + """ + if self.axis == 1: + raise NotImplementedError( + "DataFrameGroupBy.value_counts only handles axis=0" + ) + name = "proportion" if normalize else "count" + + df = self.obj + obj = self._obj_with_exclusions + + in_axis_names = { + grouping.name for grouping in self.grouper.groupings if grouping.in_axis + } + if isinstance(obj, Series): + _name = obj.name + keys = [] if _name in in_axis_names else [obj] + else: + unique_cols = set(obj.columns) + if subset is not None: + subsetted = set(subset) + clashing = subsetted & set(in_axis_names) + if clashing: + raise ValueError( + f"Keys {clashing} in subset cannot be in " + "the groupby column keys." + ) + doesnt_exist = subsetted - unique_cols + if doesnt_exist: + raise ValueError( + f"Keys {doesnt_exist} in subset do not " + f"exist in the DataFrame." 
+ ) + else: + subsetted = unique_cols + + keys = [ + # Can't use .values because the column label needs to be preserved + obj.iloc[:, idx] + for idx, _name in enumerate(obj.columns) + if _name not in in_axis_names and _name in subsetted + ] + + groupings = list(self.grouper.groupings) + for key in keys: + grouper, _, _ = get_grouper( + df, + key=key, + axis=self.axis, + sort=self.sort, + observed=False, + dropna=dropna, + ) + groupings += list(grouper.groupings) + + # Take the size of the overall columns + gb = df.groupby( + groupings, + sort=self.sort, + observed=self.observed, + dropna=self.dropna, + ) + result_series = cast(Series, gb.size()) + result_series.name = name + + # GH-46357 Include non-observed categories + # of non-grouping columns regardless of `observed` + if any( + isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex)) + and not grouping._observed + for grouping in groupings + ): + levels_list = [ping.result_index for ping in groupings] + multi_index, _ = MultiIndex.from_product( + levels_list, names=[ping.name for ping in groupings] + ).sortlevel() + result_series = result_series.reindex(multi_index, fill_value=0) + + if normalize: + # Normalize the results by dividing by the original group sizes. + # We are guaranteed to have the first N levels be the + # user-requested grouping. + levels = list( + range(len(self.grouper.groupings), result_series.index.nlevels) + ) + indexed_group_size = result_series.groupby( + result_series.index.droplevel(levels), + sort=self.sort, + dropna=self.dropna, + # GH#43999 - deprecation of observed=False + observed=False, + ).transform("sum") + result_series /= indexed_group_size + + # Handle groups of non-observed categories + result_series = result_series.fillna(0.0) + + if sort: + # Sort the values and then resort by the main grouping + index_level = range(len(self.grouper.groupings)) + result_series = result_series.sort_values(ascending=ascending).sort_index( + level=index_level, sort_remaining=False + ) + + result: Series | DataFrame + if self.as_index: + result = result_series + else: + # Convert to frame + index = result_series.index + columns = com.fill_missing_names(index.names) + if name in columns: + raise ValueError(f"Column label '{name}' is duplicate of result column") + result_series.name = name + result_series.index = index.set_names(range(len(columns))) + result_frame = result_series.reset_index() + orig_dtype = self.grouper.groupings[0].obj.columns.dtype # type: ignore[union-attr] + cols = Index(columns, dtype=orig_dtype).insert(len(columns), name) + result_frame.columns = cols + result = result_frame + return result.__finalize__(self.obj, method="value_counts") + + @final + def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: + """ + Compute standard error of the mean of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex. + + Parameters + ---------- + ddof : int, default 1 + Degrees of freedom. + + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + + Returns + ------- + Series or DataFrame + Standard error of the mean of values within each group. 
+ + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b', 'b'] + >>> ser = pd.Series([5, 10, 8, 14], index=lst) + >>> ser + a 5 + a 10 + b 8 + b 14 + dtype: int64 + >>> ser.groupby(level=0).sem() + a 2.5 + b 3.0 + dtype: float64 + + For DataFrameGroupBy: + + >>> data = [[1, 12, 11], [1, 15, 2], [2, 5, 8], [2, 6, 12]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["tuna", "salmon", "catfish", "goldfish"]) + >>> df + a b c + tuna 1 12 11 + salmon 1 15 2 + catfish 2 5 8 + goldfish 2 6 12 + >>> df.groupby("a").sem() + b c + a + 1 1.5 4.5 + 2 0.5 2.0 + + For Resampler: + + >>> ser = pd.Series([1, 3, 2, 4, 3, 8], + ... index=pd.DatetimeIndex(['2023-01-01', + ... '2023-01-10', + ... '2023-01-15', + ... '2023-02-01', + ... '2023-02-10', + ... '2023-02-15'])) + >>> ser.resample('MS').sem() + 2023-01-01 0.577350 + 2023-02-01 1.527525 + Freq: MS, dtype: float64 + """ + if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype): + raise TypeError( + f"{type(self).__name__}.sem called with " + f"numeric_only={numeric_only} and dtype {self.obj.dtype}" + ) + return self._cython_agg_general( + "sem", + alt=lambda x: Series(x).sem(ddof=ddof), + numeric_only=numeric_only, + ddof=ddof, + ) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def size(self) -> DataFrame | Series: + """ + Compute group sizes. + + Returns + ------- + DataFrame or Series + Number of rows in each group as a Series if as_index is True + or a DataFrame if as_index is False. + %(see_also)s + Examples + -------- + + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([1, 2, 3], index=lst) + >>> ser + a 1 + a 2 + b 3 + dtype: int64 + >>> ser.groupby(level=0).size() + a 2 + b 1 + dtype: int64 + + >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["owl", "toucan", "eagle"]) + >>> df + a b c + owl 1 2 3 + toucan 1 5 6 + eagle 7 8 9 + >>> df.groupby("a").size() + a + 1 2 + 7 1 + dtype: int64 + + For Resampler: + + >>> ser = pd.Series([1, 2, 3], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + dtype: int64 + >>> ser.resample('MS').size() + 2023-01-01 2 + 2023-02-01 1 + Freq: MS, dtype: int64 + """ + result = self.grouper.size() + dtype_backend: None | Literal["pyarrow", "numpy_nullable"] = None + if isinstance(self.obj, Series): + if isinstance(self.obj.array, ArrowExtensionArray): + if isinstance(self.obj.array, ArrowStringArrayNumpySemantics): + dtype_backend = None + elif isinstance(self.obj.array, ArrowStringArray): + dtype_backend = "numpy_nullable" + else: + dtype_backend = "pyarrow" + elif isinstance(self.obj.array, BaseMaskedArray): + dtype_backend = "numpy_nullable" + # TODO: For DataFrames what if columns are mixed arrow/numpy/masked? + + # GH28330 preserve subclassed Series/DataFrames through calls + if isinstance(self.obj, Series): + result = self._obj_1d_constructor(result, name=self.obj.name) + else: + result = self._obj_1d_constructor(result) + + if dtype_backend is not None: + result = result.convert_dtypes( + infer_objects=False, + convert_string=False, + convert_boolean=False, + convert_floating=False, + dtype_backend=dtype_backend, + ) + + with com.temp_setattr(self, "as_index", True): + # size already has the desired behavior in GH#49519, but this makes the + # as_index=False path of _reindex_output fail on categorical groupers. 
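+            # Illustration: when grouping on a Categorical with observed=False,
+            # categories that never appear in the data are reindexed in with a
+            # size of 0 (fill_value=0) rather than the NaN that reindexing
+            # would otherwise produce.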
+ result = self._reindex_output(result, fill_value=0) + if not self.as_index: + # error: Incompatible types in assignment (expression has + # type "DataFrame", variable has type "Series") + result = result.rename("size").reset_index() # type: ignore[assignment] + return result + + @final + @doc( + _groupby_agg_method_engine_template, + fname="sum", + no=False, + mc=0, + e=None, + ek=None, + example=dedent( + """\ + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b', 'b'] + >>> ser = pd.Series([1, 2, 3, 4], index=lst) + >>> ser + a 1 + a 2 + b 3 + b 4 + dtype: int64 + >>> ser.groupby(level=0).sum() + a 3 + b 7 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["tiger", "leopard", "cheetah", "lion"]) + >>> df + a b c + tiger 1 8 2 + leopard 1 2 5 + cheetah 2 5 8 + lion 2 6 9 + >>> df.groupby("a").sum() + b c + a + 1 10 7 + 2 11 17""" + ), + ) + def sum( + self, + numeric_only: bool = False, + min_count: int = 0, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + if maybe_use_numba(engine): + from pandas.core._numba.kernels import grouped_sum + + return self._numba_agg_general( + grouped_sum, + executor.default_dtype_mapping, + engine_kwargs, + min_periods=min_count, + ) + else: + # If we are grouping on categoricals we want unobserved categories to + # return zero, rather than the default of NaN which the reindexing in + # _agg_general() returns. GH #31422 + with com.temp_setattr(self, "observed", True): + result = self._agg_general( + numeric_only=numeric_only, + min_count=min_count, + alias="sum", + npfunc=np.sum, + ) + + return self._reindex_output(result, fill_value=0) + + @final + @doc( + _groupby_agg_method_template, + fname="prod", + no=False, + mc=0, + example=dedent( + """\ + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b', 'b'] + >>> ser = pd.Series([1, 2, 3, 4], index=lst) + >>> ser + a 1 + a 2 + b 3 + b 4 + dtype: int64 + >>> ser.groupby(level=0).prod() + a 2 + b 12 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["tiger", "leopard", "cheetah", "lion"]) + >>> df + a b c + tiger 1 8 2 + leopard 1 2 5 + cheetah 2 5 8 + lion 2 6 9 + >>> df.groupby("a").prod() + b c + a + 1 16 10 + 2 30 72""" + ), + ) + def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: + return self._agg_general( + numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod + ) + + @final + @doc( + _groupby_agg_method_engine_template, + fname="min", + no=False, + mc=-1, + e=None, + ek=None, + example=dedent( + """\ + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b', 'b'] + >>> ser = pd.Series([1, 2, 3, 4], index=lst) + >>> ser + a 1 + a 2 + b 3 + b 4 + dtype: int64 + >>> ser.groupby(level=0).min() + a 1 + b 3 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... 
index=["tiger", "leopard", "cheetah", "lion"]) + >>> df + a b c + tiger 1 8 2 + leopard 1 2 5 + cheetah 2 5 8 + lion 2 6 9 + >>> df.groupby("a").min() + b c + a + 1 2 2 + 2 5 8""" + ), + ) + def min( + self, + numeric_only: bool = False, + min_count: int = -1, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + if maybe_use_numba(engine): + from pandas.core._numba.kernels import grouped_min_max + + return self._numba_agg_general( + grouped_min_max, + executor.identity_dtype_mapping, + engine_kwargs, + min_periods=min_count, + is_max=False, + ) + else: + return self._agg_general( + numeric_only=numeric_only, + min_count=min_count, + alias="min", + npfunc=np.min, + ) + + @final + @doc( + _groupby_agg_method_engine_template, + fname="max", + no=False, + mc=-1, + e=None, + ek=None, + example=dedent( + """\ + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b', 'b'] + >>> ser = pd.Series([1, 2, 3, 4], index=lst) + >>> ser + a 1 + a 2 + b 3 + b 4 + dtype: int64 + >>> ser.groupby(level=0).max() + a 2 + b 4 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["tiger", "leopard", "cheetah", "lion"]) + >>> df + a b c + tiger 1 8 2 + leopard 1 2 5 + cheetah 2 5 8 + lion 2 6 9 + >>> df.groupby("a").max() + b c + a + 1 8 5 + 2 6 9""" + ), + ) + def max( + self, + numeric_only: bool = False, + min_count: int = -1, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + if maybe_use_numba(engine): + from pandas.core._numba.kernels import grouped_min_max + + return self._numba_agg_general( + grouped_min_max, + executor.identity_dtype_mapping, + engine_kwargs, + min_periods=min_count, + is_max=True, + ) + else: + return self._agg_general( + numeric_only=numeric_only, + min_count=min_count, + alias="max", + npfunc=np.max, + ) + + @final + def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: + """ + Compute the first non-null entry of each column. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + min_count : int, default -1 + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + + Returns + ------- + Series or DataFrame + First non-null of values within each group. + + See Also + -------- + DataFrame.groupby : Apply a function groupby to each row or column of a + DataFrame. + pandas._core.groupby.DataFrameGroupBy.last : Compute the last non-null entry + of each column. + pandas._core.groupby.DataFrameGroupBy.nth : Take the nth row from each group. + + Examples + -------- + >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3], + ... 
D=['3/11/2000', '3/12/2000', '3/13/2000'])) + >>> df['D'] = pd.to_datetime(df['D']) + >>> df.groupby("A").first() + B C D + A + 1 5.0 1 2000-03-11 + 3 6.0 3 2000-03-13 + >>> df.groupby("A").first(min_count=2) + B C D + A + 1 NaN 1.0 2000-03-11 + 3 NaN NaN NaT + >>> df.groupby("A").first(numeric_only=True) + B C + A + 1 5.0 1 + 3 6.0 3 + """ + + def first_compat(obj: NDFrameT, axis: AxisInt = 0): + def first(x: Series): + """Helper function for first item that isn't NA.""" + arr = x.array[notna(x.array)] + if not len(arr): + return x.array.dtype.na_value + return arr[0] + + if isinstance(obj, DataFrame): + return obj.apply(first, axis=axis) + elif isinstance(obj, Series): + return first(obj) + else: # pragma: no cover + raise TypeError(type(obj)) + + return self._agg_general( + numeric_only=numeric_only, + min_count=min_count, + alias="first", + npfunc=first_compat, + ) + + @final + def last(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: + """ + Compute the last non-null entry of each column. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. If None, will attempt to use + everything, then use only numeric data. + min_count : int, default -1 + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + + Returns + ------- + Series or DataFrame + Last non-null of values within each group. + + See Also + -------- + DataFrame.groupby : Apply a function groupby to each row or column of a + DataFrame. + pandas._core.groupby.DataFrameGroupBy.first : Compute the first non-null entry + of each column. + pandas._core.groupby.DataFrameGroupBy.nth : Take the nth row from each group. + + Examples + -------- + >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3])) + >>> df.groupby("A").last() + B C + A + 1 5.0 2 + 3 6.0 3 + """ + + def last_compat(obj: NDFrameT, axis: AxisInt = 0): + def last(x: Series): + """Helper function for last item that isn't NA.""" + arr = x.array[notna(x.array)] + if not len(arr): + return x.array.dtype.na_value + return arr[-1] + + if isinstance(obj, DataFrame): + return obj.apply(last, axis=axis) + elif isinstance(obj, Series): + return last(obj) + else: # pragma: no cover + raise TypeError(type(obj)) + + return self._agg_general( + numeric_only=numeric_only, + min_count=min_count, + alias="last", + npfunc=last_compat, + ) + + @final + def ohlc(self) -> DataFrame: + """ + Compute open, high, low and close values of a group, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex + + Returns + ------- + DataFrame + Open, high, low and close values within each group. + + Examples + -------- + + For SeriesGroupBy: + + >>> lst = ['SPX', 'CAC', 'SPX', 'CAC', 'SPX', 'CAC', 'SPX', 'CAC',] + >>> ser = pd.Series([3.4, 9.0, 7.2, 5.2, 8.8, 9.4, 0.1, 0.5], index=lst) + >>> ser + SPX 3.4 + CAC 9.0 + SPX 7.2 + CAC 5.2 + SPX 8.8 + CAC 9.4 + SPX 0.1 + CAC 0.5 + dtype: float64 + >>> ser.groupby(level=0).ohlc() + open high low close + CAC 9.0 9.4 0.5 0.5 + SPX 3.4 8.8 0.1 0.1 + + For DataFrameGroupBy: + + >>> data = {2022: [1.2, 2.3, 8.9, 4.5, 4.4, 3, 2 , 1], + ... 2023: [3.4, 9.0, 7.2, 5.2, 8.8, 9.4, 8.2, 1.0]} + >>> df = pd.DataFrame(data, index=['SPX', 'CAC', 'SPX', 'CAC', + ... 
'SPX', 'CAC', 'SPX', 'CAC']) + >>> df + 2022 2023 + SPX 1.2 3.4 + CAC 2.3 9.0 + SPX 8.9 7.2 + CAC 4.5 5.2 + SPX 4.4 8.8 + CAC 3.0 9.4 + SPX 2.0 8.2 + CAC 1.0 1.0 + >>> df.groupby(level=0).ohlc() + 2022 2023 + open high low close open high low close + CAC 2.3 4.5 1.0 1.0 9.0 9.4 1.0 1.0 + SPX 1.2 8.9 1.2 2.0 3.4 8.8 3.4 8.2 + + For Resampler: + + >>> ser = pd.Series([1, 3, 2, 4, 3, 5], + ... index=pd.DatetimeIndex(['2023-01-01', + ... '2023-01-10', + ... '2023-01-15', + ... '2023-02-01', + ... '2023-02-10', + ... '2023-02-15'])) + >>> ser.resample('MS').ohlc() + open high low close + 2023-01-01 1 3 1 2 + 2023-02-01 4 5 3 5 + """ + if self.obj.ndim == 1: + obj = self._selected_obj + + is_numeric = is_numeric_dtype(obj.dtype) + if not is_numeric: + raise DataError("No numeric types to aggregate") + + res_values = self.grouper._cython_operation( + "aggregate", obj._values, "ohlc", axis=0, min_count=-1 + ) + + agg_names = ["open", "high", "low", "close"] + result = self.obj._constructor_expanddim( + res_values, index=self.grouper.result_index, columns=agg_names + ) + return self._reindex_output(result) + + result = self._apply_to_column_groupbys(lambda sgb: sgb.ohlc()) + return result + + @doc(DataFrame.describe) + def describe( + self, + percentiles=None, + include=None, + exclude=None, + ) -> NDFrameT: + obj = self._obj_with_exclusions + + if len(obj) == 0: + described = obj.describe( + percentiles=percentiles, include=include, exclude=exclude + ) + if obj.ndim == 1: + result = described + else: + result = described.unstack() + return result.to_frame().T.iloc[:0] + + with com.temp_setattr(self, "as_index", True): + result = self._python_apply_general( + lambda x: x.describe( + percentiles=percentiles, include=include, exclude=exclude + ), + obj, + not_indexed_same=True, + ) + if self.axis == 1: + return result.T + + # GH#49256 - properly handle the grouping column(s) + result = result.unstack() + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) + + return result + + @final + def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resampler: + """ + Provide resampling when using a TimeGrouper. + + Given a grouper, the function resamples it according to a string + "string" -> "frequency". + + See the :ref:`frequency aliases ` + documentation for more details. + + Parameters + ---------- + rule : str or DateOffset + The offset string or object representing target grouper conversion. + *args + Possible arguments are `how`, `fill_method`, `limit`, `kind` and + `on`, and other arguments of `TimeGrouper`. + include_groups : bool, default True + When True, will attempt to include the groupings in the operation in + the case that they are columns of the DataFrame. If this raises a + TypeError, the result will be computed with the groupings excluded. + When False, the groupings will be excluded when applying ``func``. + + .. versionadded:: 2.2.0 + + .. deprecated:: 2.2.0 + + Setting include_groups to True is deprecated. Only the value + False will be allowed in a future version of pandas. + + **kwargs + Possible arguments are `how`, `fill_method`, `limit`, `kind` and + `on`, and other arguments of `TimeGrouper`. + + Returns + ------- + pandas.api.typing.DatetimeIndexResamplerGroupby, + pandas.api.typing.PeriodIndexResamplerGroupby, or + pandas.api.typing.TimedeltaIndexResamplerGroupby + Return a new groupby object, with type depending on the data + being resampled. 
+ + See Also + -------- + Grouper : Specify a frequency to resample with when + grouping by a key. + DatetimeIndex.resample : Frequency conversion and resampling of + time series. + + Examples + -------- + >>> idx = pd.date_range('1/1/2000', periods=4, freq='min') + >>> df = pd.DataFrame(data=4 * [range(2)], + ... index=idx, + ... columns=['a', 'b']) + >>> df.iloc[2, 0] = 5 + >>> df + a b + 2000-01-01 00:00:00 0 1 + 2000-01-01 00:01:00 0 1 + 2000-01-01 00:02:00 5 1 + 2000-01-01 00:03:00 0 1 + + Downsample the DataFrame into 3 minute bins and sum the values of + the timestamps falling into a bin. + + >>> df.groupby('a').resample('3min', include_groups=False).sum() + b + a + 0 2000-01-01 00:00:00 2 + 2000-01-01 00:03:00 1 + 5 2000-01-01 00:00:00 1 + + Upsample the series into 30 second bins. + + >>> df.groupby('a').resample('30s', include_groups=False).sum() + b + a + 0 2000-01-01 00:00:00 1 + 2000-01-01 00:00:30 0 + 2000-01-01 00:01:00 1 + 2000-01-01 00:01:30 0 + 2000-01-01 00:02:00 0 + 2000-01-01 00:02:30 0 + 2000-01-01 00:03:00 1 + 5 2000-01-01 00:02:00 1 + + Resample by month. Values are assigned to the month of the period. + + >>> df.groupby('a').resample('ME', include_groups=False).sum() + b + a + 0 2000-01-31 3 + 5 2000-01-31 1 + + Downsample the series into 3 minute bins as above, but close the right + side of the bin interval. + + >>> ( + ... df.groupby('a') + ... .resample('3min', closed='right', include_groups=False) + ... .sum() + ... ) + b + a + 0 1999-12-31 23:57:00 1 + 2000-01-01 00:00:00 2 + 5 2000-01-01 00:00:00 1 + + Downsample the series into 3 minute bins and close the right side of + the bin interval, but label each bin using the right edge instead of + the left. + + >>> ( + ... df.groupby('a') + ... .resample('3min', closed='right', label='right', include_groups=False) + ... .sum() + ... ) + b + a + 0 2000-01-01 00:00:00 1 + 2000-01-01 00:03:00 2 + 5 2000-01-01 00:03:00 1 + """ + from pandas.core.resample import get_resampler_for_grouping + + # mypy flags that include_groups could be specified via `*args` or `**kwargs` + # GH#54961 would resolve. + return get_resampler_for_grouping( # type: ignore[misc] + self, rule, *args, include_groups=include_groups, **kwargs + ) + + @final + def rolling(self, *args, **kwargs) -> RollingGroupby: + """ + Return a rolling grouper, providing rolling functionality per group. + + Parameters + ---------- + window : int, timedelta, str, offset, or BaseIndexer subclass + Size of the moving window. + + If an integer, the fixed number of observations used for + each window. + + If a timedelta, str, or offset, the time period of each window. Each + window will be a variable sized based on the observations included in + the time-period. This is only valid for datetimelike indexes. + To learn more about the offsets & frequency strings, please see `this link + `__. + + If a BaseIndexer subclass, the window boundaries + based on the defined ``get_window_bounds`` method. Additional rolling + keyword arguments, namely ``min_periods``, ``center``, ``closed`` and + ``step`` will be passed to ``get_window_bounds``. + + min_periods : int, default None + Minimum number of observations in window required to have a value; + otherwise, result is ``np.nan``. + + For a window that is specified by an offset, + ``min_periods`` will default to 1. + + For a window that is specified by an integer, ``min_periods`` will default + to the size of the window. + + center : bool, default False + If False, set the window labels as the right edge of the window index. 
+ + If True, set the window labels as the center of the window index. + + win_type : str, default None + If ``None``, all points are evenly weighted. + + If a string, it must be a valid `scipy.signal window function + `__. + + Certain Scipy window types require additional parameters to be passed + in the aggregation function. The additional parameters must match + the keywords specified in the Scipy window type method signature. + + on : str, optional + For a DataFrame, a column label or Index level on which + to calculate the rolling window, rather than the DataFrame's index. + + Provided integer column is ignored and excluded from result since + an integer index is not used to calculate the rolling window. + + axis : int or str, default 0 + If ``0`` or ``'index'``, roll across the rows. + + If ``1`` or ``'columns'``, roll across the columns. + + For `Series` this parameter is unused and defaults to 0. + + closed : str, default None + If ``'right'``, the first point in the window is excluded from calculations. + + If ``'left'``, the last point in the window is excluded from calculations. + + If ``'both'``, no points in the window are excluded from calculations. + + If ``'neither'``, the first and last points in the window are excluded + from calculations. + + Default ``None`` (``'right'``). + + method : str {'single', 'table'}, default 'single' + Execute the rolling operation per single column or row (``'single'``) + or over the entire object (``'table'``). + + This argument is only implemented when specifying ``engine='numba'`` + in the method call. + + Returns + ------- + pandas.api.typing.RollingGroupby + Return a new grouper with our rolling appended. + + See Also + -------- + Series.rolling : Calling object with Series data. + DataFrame.rolling : Calling object with DataFrames. + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby. + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 1, 2, 2], + ... 'B': [1, 2, 3, 4], + ... 'C': [0.362, 0.227, 1.267, -0.562]}) + >>> df + A B C + 0 1 1 0.362 + 1 1 2 0.227 + 2 2 3 1.267 + 3 2 4 -0.562 + + >>> df.groupby('A').rolling(2).sum() + B C + A + 1 0 NaN NaN + 1 3.0 0.589 + 2 2 NaN NaN + 3 7.0 0.705 + + >>> df.groupby('A').rolling(2, min_periods=1).sum() + B C + A + 1 0 1.0 0.362 + 1 3.0 0.589 + 2 2 3.0 1.267 + 3 7.0 0.705 + + >>> df.groupby('A').rolling(2, on='B').sum() + B C + A + 1 0 1 NaN + 1 2 0.589 + 2 2 3 NaN + 3 4 0.705 + """ + from pandas.core.window import RollingGroupby + + return RollingGroupby( + self._selected_obj, + *args, + _grouper=self.grouper, + _as_index=self.as_index, + **kwargs, + ) + + @final + @Substitution(name="groupby") + @Appender(_common_see_also) + def expanding(self, *args, **kwargs) -> ExpandingGroupby: + """ + Return an expanding grouper, providing expanding + functionality per group. + + Returns + ------- + pandas.api.typing.ExpandingGroupby + """ + from pandas.core.window import ExpandingGroupby + + return ExpandingGroupby( + self._selected_obj, + *args, + _grouper=self.grouper, + **kwargs, + ) + + @final + @Substitution(name="groupby") + @Appender(_common_see_also) + def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby: + """ + Return an ewm grouper, providing ewm functionality per group. 
+ + Returns + ------- + pandas.api.typing.ExponentialMovingWindowGroupby + """ + from pandas.core.window import ExponentialMovingWindowGroupby + + return ExponentialMovingWindowGroupby( + self._selected_obj, + *args, + _grouper=self.grouper, + **kwargs, + ) + + @final + def _fill(self, direction: Literal["ffill", "bfill"], limit: int | None = None): + """ + Shared function for `pad` and `backfill` to call Cython method. + + Parameters + ---------- + direction : {'ffill', 'bfill'} + Direction passed to underlying Cython function. `bfill` will cause + values to be filled backwards. `ffill` and any other values will + default to a forward fill + limit : int, default None + Maximum number of consecutive values to fill. If `None`, this + method will convert to -1 prior to passing to Cython + + Returns + ------- + `Series` or `DataFrame` with filled values + + See Also + -------- + pad : Returns Series with minimum number of char in object. + backfill : Backward fill the missing values in the dataset. + """ + # Need int value for Cython + if limit is None: + limit = -1 + + ids, _, _ = self.grouper.group_info + sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False) + if direction == "bfill": + sorted_labels = sorted_labels[::-1] + + col_func = partial( + libgroupby.group_fillna_indexer, + labels=ids, + sorted_labels=sorted_labels, + limit=limit, + dropna=self.dropna, + ) + + def blk_func(values: ArrayLike) -> ArrayLike: + mask = isna(values) + if values.ndim == 1: + indexer = np.empty(values.shape, dtype=np.intp) + col_func(out=indexer, mask=mask) + return algorithms.take_nd(values, indexer) + + else: + # We broadcast algorithms.take_nd analogous to + # np.take_along_axis + if isinstance(values, np.ndarray): + dtype = values.dtype + if self.grouper.has_dropped_na: + # dropped null groups give rise to nan in the result + dtype = ensure_dtype_can_hold_na(values.dtype) + out = np.empty(values.shape, dtype=dtype) + else: + # Note: we only get here with backfill/pad, + # so if we have a dtype that cannot hold NAs, + # then there will be no -1s in indexer, so we can use + # the original dtype (no need to ensure_dtype_can_hold_na) + out = type(values)._empty(values.shape, dtype=values.dtype) + + for i, value_element in enumerate(values): + # call group_fillna_indexer column-wise + indexer = np.empty(values.shape[1], dtype=np.intp) + col_func(out=indexer, mask=mask[i]) + out[i, :] = algorithms.take_nd(value_element, indexer) + return out + + mgr = self._get_data_to_aggregate() + res_mgr = mgr.apply(blk_func) + + new_obj = self._wrap_agged_manager(res_mgr) + + if self.axis == 1: + # Only relevant for DataFrameGroupBy + new_obj = new_obj.T + new_obj.columns = self.obj.columns + + new_obj.index = self.obj.index + return new_obj + + @final + @Substitution(name="groupby") + def ffill(self, limit: int | None = None): + """ + Forward fill the values. + + Parameters + ---------- + limit : int, optional + Limit of how many values to fill. + + Returns + ------- + Series or DataFrame + Object with missing values filled. + + See Also + -------- + Series.ffill: Returns Series with minimum number of char in object. + DataFrame.ffill: Object with missing values filled or None if inplace=True. + Series.fillna: Fill NaN values of a Series. + DataFrame.fillna: Fill NaN values of a DataFrame. 
+ + Examples + -------- + + For SeriesGroupBy: + + >>> key = [0, 0, 1, 1] + >>> ser = pd.Series([np.nan, 2, 3, np.nan], index=key) + >>> ser + 0 NaN + 0 2.0 + 1 3.0 + 1 NaN + dtype: float64 + >>> ser.groupby(level=0).ffill() + 0 NaN + 0 2.0 + 1 3.0 + 1 3.0 + dtype: float64 + + For DataFrameGroupBy: + + >>> df = pd.DataFrame( + ... { + ... "key": [0, 0, 1, 1, 1], + ... "A": [np.nan, 2, np.nan, 3, np.nan], + ... "B": [2, 3, np.nan, np.nan, np.nan], + ... "C": [np.nan, np.nan, 2, np.nan, np.nan], + ... } + ... ) + >>> df + key A B C + 0 0 NaN 2.0 NaN + 1 0 2.0 3.0 NaN + 2 1 NaN NaN 2.0 + 3 1 3.0 NaN NaN + 4 1 NaN NaN NaN + + Propagate non-null values forward or backward within each group along columns. + + >>> df.groupby("key").ffill() + A B C + 0 NaN 2.0 NaN + 1 2.0 3.0 NaN + 2 NaN NaN 2.0 + 3 3.0 NaN 2.0 + 4 3.0 NaN 2.0 + + Propagate non-null values forward or backward within each group along rows. + + >>> df.T.groupby(np.array([0, 0, 1, 1])).ffill().T + key A B C + 0 0.0 0.0 2.0 2.0 + 1 0.0 2.0 3.0 3.0 + 2 1.0 1.0 NaN 2.0 + 3 1.0 3.0 NaN NaN + 4 1.0 1.0 NaN NaN + + Only replace the first NaN element within a group along rows. + + >>> df.groupby("key").ffill(limit=1) + A B C + 0 NaN 2.0 NaN + 1 2.0 3.0 NaN + 2 NaN NaN 2.0 + 3 3.0 NaN 2.0 + 4 3.0 NaN NaN + """ + return self._fill("ffill", limit=limit) + + @final + @Substitution(name="groupby") + def bfill(self, limit: int | None = None): + """ + Backward fill the values. + + Parameters + ---------- + limit : int, optional + Limit of how many values to fill. + + Returns + ------- + Series or DataFrame + Object with missing values filled. + + See Also + -------- + Series.bfill : Backward fill the missing values in the dataset. + DataFrame.bfill: Backward fill the missing values in the dataset. + Series.fillna: Fill NaN values of a Series. + DataFrame.fillna: Fill NaN values of a DataFrame. + + Examples + -------- + + With Series: + + >>> index = ['Falcon', 'Falcon', 'Parrot', 'Parrot', 'Parrot'] + >>> s = pd.Series([None, 1, None, None, 3], index=index) + >>> s + Falcon NaN + Falcon 1.0 + Parrot NaN + Parrot NaN + Parrot 3.0 + dtype: float64 + >>> s.groupby(level=0).bfill() + Falcon 1.0 + Falcon 1.0 + Parrot 3.0 + Parrot 3.0 + Parrot 3.0 + dtype: float64 + >>> s.groupby(level=0).bfill(limit=1) + Falcon 1.0 + Falcon 1.0 + Parrot NaN + Parrot 3.0 + Parrot 3.0 + dtype: float64 + + With DataFrame: + + >>> df = pd.DataFrame({'A': [1, None, None, None, 4], + ... 'B': [None, None, 5, None, 7]}, index=index) + >>> df + A B + Falcon 1.0 NaN + Falcon NaN NaN + Parrot NaN 5.0 + Parrot NaN NaN + Parrot 4.0 7.0 + >>> df.groupby(level=0).bfill() + A B + Falcon 1.0 NaN + Falcon NaN NaN + Parrot 4.0 5.0 + Parrot 4.0 7.0 + Parrot 4.0 7.0 + >>> df.groupby(level=0).bfill(limit=1) + A B + Falcon 1.0 NaN + Falcon NaN NaN + Parrot NaN 5.0 + Parrot 4.0 7.0 + Parrot 4.0 7.0 + """ + return self._fill("bfill", limit=limit) + + @final + @property + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def nth(self) -> GroupByNthSelector: + """ + Take the nth row from each group if n is an int, otherwise a subset of rows. + + Can be either a call or an index. dropna is not available with index notation. + Index notation accepts a comma separated list of integers and slices. + + If dropna, will take the nth non-null row, dropna is either + 'all' or 'any'; this is equivalent to calling dropna(how=dropna) + before the groupby. 
+ + Parameters + ---------- + n : int, slice or list of ints and slices + A single nth value for the row or a list of nth values or slices. + + .. versionchanged:: 1.4.0 + Added slice and lists containing slices. + Added index notation. + + dropna : {'any', 'all', None}, default None + Apply the specified dropna operation before counting which row is + the nth row. Only supported if n is an int. + + Returns + ------- + Series or DataFrame + N-th value within each group. + %(see_also)s + Examples + -------- + + >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], + ... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B']) + >>> g = df.groupby('A') + >>> g.nth(0) + A B + 0 1 NaN + 2 2 3.0 + >>> g.nth(1) + A B + 1 1 2.0 + 4 2 5.0 + >>> g.nth(-1) + A B + 3 1 4.0 + 4 2 5.0 + >>> g.nth([0, 1]) + A B + 0 1 NaN + 1 1 2.0 + 2 2 3.0 + 4 2 5.0 + >>> g.nth(slice(None, -1)) + A B + 0 1 NaN + 1 1 2.0 + 2 2 3.0 + + Index notation may also be used + + >>> g.nth[0, 1] + A B + 0 1 NaN + 1 1 2.0 + 2 2 3.0 + 4 2 5.0 + >>> g.nth[:-1] + A B + 0 1 NaN + 1 1 2.0 + 2 2 3.0 + + Specifying `dropna` allows ignoring ``NaN`` values + + >>> g.nth(0, dropna='any') + A B + 1 1 2.0 + 2 2 3.0 + + When the specified ``n`` is larger than any of the groups, an + empty DataFrame is returned + + >>> g.nth(3, dropna='any') + Empty DataFrame + Columns: [A, B] + Index: [] + """ + return GroupByNthSelector(self) + + def _nth( + self, + n: PositionalIndexer | tuple, + dropna: Literal["any", "all", None] = None, + ) -> NDFrameT: + if not dropna: + mask = self._make_mask_from_positional_indexer(n) + + ids, _, _ = self.grouper.group_info + + # Drop NA values in grouping + mask = mask & (ids != -1) + + out = self._mask_selected_obj(mask) + return out + + # dropna is truthy + if not is_integer(n): + raise ValueError("dropna option only supported for an integer argument") + + if dropna not in ["any", "all"]: + # Note: when agg-ing picker doesn't raise this, just returns NaN + raise ValueError( + "For a DataFrame or Series groupby.nth, dropna must be " + "either None, 'any' or 'all', " + f"(was passed {dropna})." + ) + + # old behaviour, but with all and any support for DataFrames. + # modified in GH 7559 to have better perf + n = cast(int, n) + dropped = self._selected_obj.dropna(how=dropna, axis=self.axis) + + # get a new grouper for our dropped obj + grouper: np.ndarray | Index | ops.BaseGrouper + if len(dropped) == len(self._selected_obj): + # Nothing was dropped, can use the same grouper + grouper = self.grouper + else: + # we don't have the grouper info available + # (e.g. we have selected out + # a column that is not in the current object) + axis = self.grouper.axis + grouper = self.grouper.codes_info[axis.isin(dropped.index)] + if self.grouper.has_dropped_na: + # Null groups need to still be encoded as -1 when passed to groupby + nulls = grouper == -1 + # error: No overload variant of "where" matches argument types + # "Any", "NAType", "Any" + values = np.where(nulls, NA, grouper) # type: ignore[call-overload] + grouper = Index(values, dtype="Int64") + + if self.axis == 1: + grb = dropped.T.groupby(grouper, as_index=self.as_index, sort=self.sort) + else: + grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort) + return grb.nth(n) + + @final + def quantile( + self, + q: float | AnyArrayLike = 0.5, + interpolation: str = "linear", + numeric_only: bool = False, + ): + """ + Return group values at the given quantile, a la numpy.percentile. 
+ + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + Value(s) between 0 and 1 providing the quantile(s) to compute. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + Method to use when the desired quantile falls between two points. + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + + Returns + ------- + Series or DataFrame + Return type determined by caller of GroupBy object. + + See Also + -------- + Series.quantile : Similar method for Series. + DataFrame.quantile : Similar method for DataFrame. + numpy.percentile : NumPy method to compute qth percentile. + + Examples + -------- + >>> df = pd.DataFrame([ + ... ['a', 1], ['a', 2], ['a', 3], + ... ['b', 1], ['b', 3], ['b', 5] + ... ], columns=['key', 'val']) + >>> df.groupby('key').quantile() + val + key + a 2.0 + b 3.0 + """ + mgr = self._get_data_to_aggregate(numeric_only=numeric_only, name="quantile") + obj = self._wrap_agged_manager(mgr) + if self.axis == 1: + splitter = self.grouper._get_splitter(obj.T, axis=self.axis) + sdata = splitter._sorted_data.T + else: + splitter = self.grouper._get_splitter(obj, axis=self.axis) + sdata = splitter._sorted_data + + starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups) + + def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]: + if is_object_dtype(vals.dtype): + raise TypeError( + "'quantile' cannot be performed against 'object' dtypes!" + ) + + inference: DtypeObj | None = None + if isinstance(vals, BaseMaskedArray) and is_numeric_dtype(vals.dtype): + out = vals.to_numpy(dtype=float, na_value=np.nan) + inference = vals.dtype + elif is_integer_dtype(vals.dtype): + if isinstance(vals, ExtensionArray): + out = vals.to_numpy(dtype=float, na_value=np.nan) + else: + out = vals + inference = np.dtype(np.int64) + elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray): + out = vals.to_numpy(dtype=float, na_value=np.nan) + elif is_bool_dtype(vals.dtype): + # GH#51424 deprecate to match Series/DataFrame behavior + warnings.warn( + f"Allowing bool dtype in {type(self).__name__}.quantile is " + "deprecated and will raise in a future version, matching " + "the Series/DataFrame behavior. Cast to uint8 dtype before " + "calling quantile instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + out = np.asarray(vals) + elif needs_i8_conversion(vals.dtype): + inference = vals.dtype + # In this case we need to delay the casting until after the + # np.lexsort below. 
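+                # (Illustrative context: datetime64/timedelta64 values are
+                # handed back unconverted here; blk_func views them as "i8"
+                # for the cython kernel and post_processor restores the
+                # original dtype via _from_backing_data.)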
+ # error: Incompatible return value type (got + # "Tuple[Union[ExtensionArray, ndarray[Any, Any]], Union[Any, + # ExtensionDtype]]", expected "Tuple[ndarray[Any, Any], + # Optional[Union[dtype[Any], ExtensionDtype]]]") + return vals, inference # type: ignore[return-value] + elif isinstance(vals, ExtensionArray) and is_float_dtype(vals.dtype): + inference = np.dtype(np.float64) + out = vals.to_numpy(dtype=float, na_value=np.nan) + else: + out = np.asarray(vals) + + return out, inference + + def post_processor( + vals: np.ndarray, + inference: DtypeObj | None, + result_mask: np.ndarray | None, + orig_vals: ArrayLike, + ) -> ArrayLike: + if inference: + # Check for edge case + if isinstance(orig_vals, BaseMaskedArray): + assert result_mask is not None # for mypy + + if interpolation in {"linear", "midpoint"} and not is_float_dtype( + orig_vals + ): + return FloatingArray(vals, result_mask) + else: + # Item "ExtensionDtype" of "Union[ExtensionDtype, str, + # dtype[Any], Type[object]]" has no attribute "numpy_dtype" + # [union-attr] + with warnings.catch_warnings(): + # vals.astype with nan can warn with numpy >1.24 + warnings.filterwarnings("ignore", category=RuntimeWarning) + return type(orig_vals)( + vals.astype( + inference.numpy_dtype # type: ignore[union-attr] + ), + result_mask, + ) + + elif not ( + is_integer_dtype(inference) + and interpolation in {"linear", "midpoint"} + ): + if needs_i8_conversion(inference): + # error: Item "ExtensionArray" of "Union[ExtensionArray, + # ndarray[Any, Any]]" has no attribute "_ndarray" + vals = vals.astype("i8").view( + orig_vals._ndarray.dtype # type: ignore[union-attr] + ) + # error: Item "ExtensionArray" of "Union[ExtensionArray, + # ndarray[Any, Any]]" has no attribute "_from_backing_data" + return orig_vals._from_backing_data( # type: ignore[union-attr] + vals + ) + + assert isinstance(inference, np.dtype) # for mypy + return vals.astype(inference) + + return vals + + qs = np.array(q, dtype=np.float64) + pass_qs: np.ndarray | None = qs + if is_scalar(q): + qs = np.array([q], dtype=np.float64) + pass_qs = None + + ids, _, ngroups = self.grouper.group_info + nqs = len(qs) + + func = partial( + libgroupby.group_quantile, + labels=ids, + qs=qs, + interpolation=interpolation, + starts=starts, + ends=ends, + ) + + def blk_func(values: ArrayLike) -> ArrayLike: + orig_vals = values + if isinstance(values, BaseMaskedArray): + mask = values._mask + result_mask = np.zeros((ngroups, nqs), dtype=np.bool_) + else: + mask = isna(values) + result_mask = None + + is_datetimelike = needs_i8_conversion(values.dtype) + + vals, inference = pre_processor(values) + + ncols = 1 + if vals.ndim == 2: + ncols = vals.shape[0] + + out = np.empty((ncols, ngroups, nqs), dtype=np.float64) + + if is_datetimelike: + vals = vals.view("i8") + + if vals.ndim == 1: + # EA is always 1d + func( + out[0], + values=vals, + mask=mask, + result_mask=result_mask, + is_datetimelike=is_datetimelike, + ) + else: + for i in range(ncols): + func( + out[i], + values=vals[i], + mask=mask[i], + result_mask=None, + is_datetimelike=is_datetimelike, + ) + + if vals.ndim == 1: + out = out.ravel("K") + if result_mask is not None: + result_mask = result_mask.ravel("K") + else: + out = out.reshape(ncols, ngroups * nqs) + + return post_processor(out, inference, result_mask, orig_vals) + + res_mgr = sdata._mgr.grouped_reduce(blk_func) + + res = self._wrap_agged_manager(res_mgr) + return self._wrap_aggregated_output(res, qs=pass_qs) + + @final + @Substitution(name="groupby") + def ngroup(self, ascending: 
bool = True): + """ + Number each group from 0 to the number of groups - 1. + + This is the enumerative complement of cumcount. Note that the + numbers given to the groups match the order in which the groups + would be seen when iterating over the groupby object, not the + order they are first observed. + + Groups with missing keys (where `pd.isna()` is True) will be labeled with `NaN` + and will be skipped from the count. + + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from number of group - 1 to 0. + + Returns + ------- + Series + Unique numbers for each group. + + See Also + -------- + .cumcount : Number the rows in each group. + + Examples + -------- + >>> df = pd.DataFrame({"color": ["red", None, "red", "blue", "blue", "red"]}) + >>> df + color + 0 red + 1 None + 2 red + 3 blue + 4 blue + 5 red + >>> df.groupby("color").ngroup() + 0 1.0 + 1 NaN + 2 1.0 + 3 0.0 + 4 0.0 + 5 1.0 + dtype: float64 + >>> df.groupby("color", dropna=False).ngroup() + 0 1 + 1 2 + 2 1 + 3 0 + 4 0 + 5 1 + dtype: int64 + >>> df.groupby("color", dropna=False).ngroup(ascending=False) + 0 1 + 1 0 + 2 1 + 3 2 + 4 2 + 5 1 + dtype: int64 + """ + obj = self._obj_with_exclusions + index = obj._get_axis(self.axis) + comp_ids = self.grouper.group_info[0] + + dtype: type + if self.grouper.has_dropped_na: + comp_ids = np.where(comp_ids == -1, np.nan, comp_ids) + dtype = np.float64 + else: + dtype = np.int64 + + if any(ping._passed_categorical for ping in self.grouper.groupings): + # comp_ids reflect non-observed groups, we need only observed + comp_ids = rank_1d(comp_ids, ties_method="dense") - 1 + + result = self._obj_1d_constructor(comp_ids, index, dtype=dtype) + if not ascending: + result = self.ngroups - 1 - result + return result + + @final + @Substitution(name="groupby") + def cumcount(self, ascending: bool = True): + """ + Number each item in each group from 0 to the length of that group - 1. + + Essentially this is equivalent to + + .. code-block:: python + + self.apply(lambda x: pd.Series(np.arange(len(x)), x.index)) + + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from length of group - 1 to 0. + + Returns + ------- + Series + Sequence number of each element within each group. + + See Also + -------- + .ngroup : Number the groups themselves. + + Examples + -------- + >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']], + ... columns=['A']) + >>> df + A + 0 a + 1 a + 2 a + 3 b + 4 b + 5 a + >>> df.groupby('A').cumcount() + 0 0 + 1 1 + 2 2 + 3 0 + 4 1 + 5 3 + dtype: int64 + >>> df.groupby('A').cumcount(ascending=False) + 0 3 + 1 2 + 2 1 + 3 1 + 4 0 + 5 0 + dtype: int64 + """ + index = self._obj_with_exclusions._get_axis(self.axis) + cumcounts = self._cumcount_array(ascending=ascending) + return self._obj_1d_constructor(cumcounts, index) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def rank( + self, + method: str = "average", + ascending: bool = True, + na_option: str = "keep", + pct: bool = False, + axis: AxisInt | lib.NoDefault = lib.no_default, + ) -> NDFrameT: + """ + Provide the rank of values within each group. + + Parameters + ---------- + method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' + * average: average rank of group. + * min: lowest rank in group. + * max: highest rank in group. + * first: ranks assigned in order they appear in the array. + * dense: like 'min', but rank always increases by 1 between groups. 
+ ascending : bool, default True + False for ranks by high (1) to low (N). + na_option : {'keep', 'top', 'bottom'}, default 'keep' + * keep: leave NA values where they are. + * top: smallest rank if ascending. + * bottom: smallest rank if descending. + pct : bool, default False + Compute percentage rank of data within each group. + axis : int, default 0 + The axis of the object over which to compute the rank. + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + Returns + ------- + DataFrame with ranking of values within each group + %(see_also)s + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], + ... "value": [2, 4, 2, 3, 5, 1, 2, 4, 1, 5], + ... } + ... ) + >>> df + group value + 0 a 2 + 1 a 4 + 2 a 2 + 3 a 3 + 4 a 5 + 5 b 1 + 6 b 2 + 7 b 4 + 8 b 1 + 9 b 5 + >>> for method in ['average', 'min', 'max', 'dense', 'first']: + ... df[f'{method}_rank'] = df.groupby('group')['value'].rank(method) + >>> df + group value average_rank min_rank max_rank dense_rank first_rank + 0 a 2 1.5 1.0 2.0 1.0 1.0 + 1 a 4 4.0 4.0 4.0 3.0 4.0 + 2 a 2 1.5 1.0 2.0 1.0 2.0 + 3 a 3 3.0 3.0 3.0 2.0 3.0 + 4 a 5 5.0 5.0 5.0 4.0 5.0 + 5 b 1 1.5 1.0 2.0 1.0 1.0 + 6 b 2 3.0 3.0 3.0 2.0 3.0 + 7 b 4 4.0 4.0 4.0 3.0 4.0 + 8 b 1 1.5 1.0 2.0 1.0 2.0 + 9 b 5 5.0 5.0 5.0 4.0 5.0 + """ + if na_option not in {"keep", "top", "bottom"}: + msg = "na_option must be one of 'keep', 'top', or 'bottom'" + raise ValueError(msg) + + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "rank") + else: + axis = 0 + + kwargs = { + "ties_method": method, + "ascending": ascending, + "na_option": na_option, + "pct": pct, + } + if axis != 0: + # DataFrame uses different keyword name + kwargs["method"] = kwargs.pop("ties_method") + f = lambda x: x.rank(axis=axis, numeric_only=False, **kwargs) + result = self._python_apply_general( + f, self._selected_obj, is_transform=True + ) + return result + + return self._cython_transform( + "rank", + numeric_only=False, + axis=axis, + **kwargs, + ) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def cumprod( + self, axis: Axis | lib.NoDefault = lib.no_default, *args, **kwargs + ) -> NDFrameT: + """ + Cumulative product for each group. + + Returns + ------- + Series or DataFrame + %(see_also)s + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([6, 2, 0], index=lst) + >>> ser + a 6 + a 2 + b 0 + dtype: int64 + >>> ser.groupby(level=0).cumprod() + a 6 + a 12 + b 0 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 8, 2], [1, 2, 5], [2, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... 
index=["cow", "horse", "bull"]) + >>> df + a b c + cow 1 8 2 + horse 1 2 5 + bull 2 6 9 + >>> df.groupby("a").groups + {1: ['cow', 'horse'], 2: ['bull']} + >>> df.groupby("a").cumprod() + b c + cow 8 2 + horse 16 10 + bull 6 9 + """ + nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"]) + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "cumprod") + else: + axis = 0 + + if axis != 0: + f = lambda x: x.cumprod(axis=axis, **kwargs) + return self._python_apply_general(f, self._selected_obj, is_transform=True) + + return self._cython_transform("cumprod", **kwargs) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def cumsum( + self, axis: Axis | lib.NoDefault = lib.no_default, *args, **kwargs + ) -> NDFrameT: + """ + Cumulative sum for each group. + + Returns + ------- + Series or DataFrame + %(see_also)s + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([6, 2, 0], index=lst) + >>> ser + a 6 + a 2 + b 0 + dtype: int64 + >>> ser.groupby(level=0).cumsum() + a 6 + a 8 + b 0 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 8, 2], [1, 2, 5], [2, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["fox", "gorilla", "lion"]) + >>> df + a b c + fox 1 8 2 + gorilla 1 2 5 + lion 2 6 9 + >>> df.groupby("a").groups + {1: ['fox', 'gorilla'], 2: ['lion']} + >>> df.groupby("a").cumsum() + b c + fox 8 2 + gorilla 10 7 + lion 6 9 + """ + nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"]) + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "cumsum") + else: + axis = 0 + + if axis != 0: + f = lambda x: x.cumsum(axis=axis, **kwargs) + return self._python_apply_general(f, self._selected_obj, is_transform=True) + + return self._cython_transform("cumsum", **kwargs) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def cummin( + self, + axis: AxisInt | lib.NoDefault = lib.no_default, + numeric_only: bool = False, + **kwargs, + ) -> NDFrameT: + """ + Cumulative min for each group. + + Returns + ------- + Series or DataFrame + %(see_also)s + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> ser = pd.Series([1, 6, 2, 3, 0, 4], index=lst) + >>> ser + a 1 + a 6 + a 2 + b 3 + b 0 + b 4 + dtype: int64 + >>> ser.groupby(level=0).cummin() + a 1 + a 1 + a 1 + b 3 + b 0 + b 0 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 0, 2], [1, 1, 5], [6, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... 
index=["snake", "rabbit", "turtle"]) + >>> df + a b c + snake 1 0 2 + rabbit 1 1 5 + turtle 6 6 9 + >>> df.groupby("a").groups + {1: ['snake', 'rabbit'], 6: ['turtle']} + >>> df.groupby("a").cummin() + b c + snake 0 2 + rabbit 0 2 + turtle 6 9 + """ + skipna = kwargs.get("skipna", True) + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "cummin") + else: + axis = 0 + + if axis != 0: + f = lambda x: np.minimum.accumulate(x, axis) + obj = self._selected_obj + if numeric_only: + obj = obj._get_numeric_data() + return self._python_apply_general(f, obj, is_transform=True) + + return self._cython_transform( + "cummin", numeric_only=numeric_only, skipna=skipna + ) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def cummax( + self, + axis: AxisInt | lib.NoDefault = lib.no_default, + numeric_only: bool = False, + **kwargs, + ) -> NDFrameT: + """ + Cumulative max for each group. + + Returns + ------- + Series or DataFrame + %(see_also)s + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> ser = pd.Series([1, 6, 2, 3, 1, 4], index=lst) + >>> ser + a 1 + a 6 + a 2 + b 3 + b 1 + b 4 + dtype: int64 + >>> ser.groupby(level=0).cummax() + a 1 + a 6 + a 6 + b 3 + b 3 + b 4 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 8, 2], [1, 1, 0], [2, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["cow", "horse", "bull"]) + >>> df + a b c + cow 1 8 2 + horse 1 1 0 + bull 2 6 9 + >>> df.groupby("a").groups + {1: ['cow', 'horse'], 2: ['bull']} + >>> df.groupby("a").cummax() + b c + cow 8 2 + horse 8 2 + bull 6 9 + """ + skipna = kwargs.get("skipna", True) + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "cummax") + else: + axis = 0 + + if axis != 0: + f = lambda x: np.maximum.accumulate(x, axis) + obj = self._selected_obj + if numeric_only: + obj = obj._get_numeric_data() + return self._python_apply_general(f, obj, is_transform=True) + + return self._cython_transform( + "cummax", numeric_only=numeric_only, skipna=skipna + ) + + @final + @Substitution(name="groupby") + def shift( + self, + periods: int | Sequence[int] = 1, + freq=None, + axis: Axis | lib.NoDefault = lib.no_default, + fill_value=lib.no_default, + suffix: str | None = None, + ): + """ + Shift each group by periods observations. + + If freq is passed, the index will be increased using the periods and the freq. + + Parameters + ---------- + periods : int | Sequence[int], default 1 + Number of periods to shift. If a list of values, shift each group by + each period. + freq : str, optional + Frequency string. + axis : axis to shift, default 0 + Shift direction. + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + fill_value : optional + The scalar value to use for newly introduced missing values. + + .. versionchanged:: 2.1.0 + Will raise a ``ValueError`` if ``freq`` is provided too. + + suffix : str, optional + A string to add to each shifted column if there are multiple periods. + Ignored otherwise. + + Returns + ------- + Series or DataFrame + Object shifted within each group. + + See Also + -------- + Index.shift : Shift values of Index. 
+ + Examples + -------- + + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b', 'b'] + >>> ser = pd.Series([1, 2, 3, 4], index=lst) + >>> ser + a 1 + a 2 + b 3 + b 4 + dtype: int64 + >>> ser.groupby(level=0).shift(1) + a NaN + a 1.0 + b NaN + b 3.0 + dtype: float64 + + For DataFrameGroupBy: + + >>> data = [[1, 2, 3], [1, 5, 6], [2, 5, 8], [2, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["tuna", "salmon", "catfish", "goldfish"]) + >>> df + a b c + tuna 1 2 3 + salmon 1 5 6 + catfish 2 5 8 + goldfish 2 6 9 + >>> df.groupby("a").shift(1) + b c + tuna NaN NaN + salmon 2.0 3.0 + catfish NaN NaN + goldfish 5.0 8.0 + """ + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "shift") + else: + axis = 0 + + if is_list_like(periods): + if axis == 1: + raise ValueError( + "If `periods` contains multiple shifts, `axis` cannot be 1." + ) + periods = cast(Sequence, periods) + if len(periods) == 0: + raise ValueError("If `periods` is an iterable, it cannot be empty.") + from pandas.core.reshape.concat import concat + + add_suffix = True + else: + if not is_integer(periods): + raise TypeError( + f"Periods must be integer, but {periods} is {type(periods)}." + ) + if suffix: + raise ValueError("Cannot specify `suffix` if `periods` is an int.") + periods = [cast(int, periods)] + add_suffix = False + + shifted_dataframes = [] + for period in periods: + if not is_integer(period): + raise TypeError( + f"Periods must be integer, but {period} is {type(period)}." + ) + period = cast(int, period) + if freq is not None or axis != 0: + f = lambda x: x.shift( + period, freq, axis, fill_value # pylint: disable=cell-var-from-loop + ) + shifted = self._python_apply_general( + f, self._selected_obj, is_transform=True + ) + else: + if fill_value is lib.no_default: + fill_value = None + ids, _, ngroups = self.grouper.group_info + res_indexer = np.zeros(len(ids), dtype=np.int64) + + libgroupby.group_shift_indexer(res_indexer, ids, ngroups, period) + + obj = self._obj_with_exclusions + + shifted = obj._reindex_with_indexers( + {self.axis: (obj.axes[self.axis], res_indexer)}, + fill_value=fill_value, + allow_dups=True, + ) + + if add_suffix: + if isinstance(shifted, Series): + shifted = cast(NDFrameT, shifted.to_frame()) + shifted = shifted.add_suffix( + f"{suffix}_{period}" if suffix else f"_{period}" + ) + shifted_dataframes.append(cast(Union[Series, DataFrame], shifted)) + + return ( + shifted_dataframes[0] + if len(shifted_dataframes) == 1 + else concat(shifted_dataframes, axis=1) + ) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def diff( + self, periods: int = 1, axis: AxisInt | lib.NoDefault = lib.no_default + ) -> NDFrameT: + """ + First discrete difference of element. + + Calculates the difference of each element compared with another + element in the group (default is element in previous row). + + Parameters + ---------- + periods : int, default 1 + Periods to shift for calculating difference, accepts negative values. + axis : axis to shift, default 0 + Take difference over rows (0) or columns (1). + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + Returns + ------- + Series or DataFrame + First differences. 
+ %(see_also)s + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) + >>> ser + a 7 + a 2 + a 8 + b 4 + b 3 + b 3 + dtype: int64 + >>> ser.groupby(level=0).diff() + a NaN + a -5.0 + a 6.0 + b NaN + b -1.0 + b 0.0 + dtype: float64 + + For DataFrameGroupBy: + + >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} + >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', + ... 'mouse', 'mouse', 'mouse', 'mouse']) + >>> df + a b + dog 1 1 + dog 3 4 + dog 5 8 + mouse 7 4 + mouse 7 4 + mouse 8 2 + mouse 3 1 + >>> df.groupby(level=0).diff() + a b + dog NaN NaN + dog 2.0 3.0 + dog 2.0 4.0 + mouse NaN NaN + mouse 0.0 0.0 + mouse 1.0 -2.0 + mouse -5.0 -1.0 + """ + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "diff") + else: + axis = 0 + + if axis != 0: + return self.apply(lambda x: x.diff(periods=periods, axis=axis)) + + obj = self._obj_with_exclusions + shifted = self.shift(periods=periods) + + # GH45562 - to retain existing behavior and match behavior of Series.diff(), + # int8 and int16 are coerced to float32 rather than float64. + dtypes_to_f32 = ["int8", "int16"] + if obj.ndim == 1: + if obj.dtype in dtypes_to_f32: + shifted = shifted.astype("float32") + else: + to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32] + if len(to_coerce): + shifted = shifted.astype({c: "float32" for c in to_coerce}) + + return obj - shifted + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def pct_change( + self, + periods: int = 1, + fill_method: FillnaOptions | None | lib.NoDefault = lib.no_default, + limit: int | None | lib.NoDefault = lib.no_default, + freq=None, + axis: Axis | lib.NoDefault = lib.no_default, + ): + """ + Calculate pct_change of each value to previous entry in group. + + Returns + ------- + Series or DataFrame + Percentage changes within each group. + %(see_also)s + Examples + -------- + + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b', 'b'] + >>> ser = pd.Series([1, 2, 3, 4], index=lst) + >>> ser + a 1 + a 2 + b 3 + b 4 + dtype: int64 + >>> ser.groupby(level=0).pct_change() + a NaN + a 1.000000 + b NaN + b 0.333333 + dtype: float64 + + For DataFrameGroupBy: + + >>> data = [[1, 2, 3], [1, 5, 6], [2, 5, 8], [2, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["tuna", "salmon", "catfish", "goldfish"]) + >>> df + a b c + tuna 1 2 3 + salmon 1 5 6 + catfish 2 5 8 + goldfish 2 6 9 + >>> df.groupby("a").pct_change() + b c + tuna NaN NaN + salmon 1.5 1.000 + catfish NaN NaN + goldfish 0.2 0.125 + """ + # GH#53491 + if fill_method not in (lib.no_default, None) or limit is not lib.no_default: + warnings.warn( + "The 'fill_method' keyword being not None and the 'limit' keyword in " + f"{type(self).__name__}.pct_change are deprecated and will be removed " + "in a future version. Either fill in any non-leading NA values prior " + "to calling pct_change or specify 'fill_method=None' to not fill NA " + "values.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if fill_method is lib.no_default: + if limit is lib.no_default and any( + grp.isna().values.any() for _, grp in self + ): + warnings.warn( + "The default fill_method='ffill' in " + f"{type(self).__name__}.pct_change is deprecated and will " + "be removed in a future version. 
Either fill in any " + "non-leading NA values prior to calling pct_change or " + "specify 'fill_method=None' to not fill NA values.", + FutureWarning, + stacklevel=find_stack_level(), + ) + fill_method = "ffill" + if limit is lib.no_default: + limit = None + + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "pct_change") + else: + axis = 0 + + # TODO(GH#23918): Remove this conditional for SeriesGroupBy when + # GH#23918 is fixed + if freq is not None or axis != 0: + f = lambda x: x.pct_change( + periods=periods, + fill_method=fill_method, + limit=limit, + freq=freq, + axis=axis, + ) + return self._python_apply_general(f, self._selected_obj, is_transform=True) + + if fill_method is None: # GH30463 + fill_method = "ffill" + limit = 0 + filled = getattr(self, fill_method)(limit=limit) + if self.axis == 0: + fill_grp = filled.groupby(self.grouper.codes, group_keys=self.group_keys) + else: + fill_grp = filled.T.groupby(self.grouper.codes, group_keys=self.group_keys) + shifted = fill_grp.shift(periods=periods, freq=freq) + if self.axis == 1: + shifted = shifted.T + return (filled / shifted) - 1 + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def head(self, n: int = 5) -> NDFrameT: + """ + Return first n rows of each group. + + Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows + from the original DataFrame with original index and order preserved + (``as_index`` flag is ignored). + + Parameters + ---------- + n : int + If positive: number of entries to include from start of each group. + If negative: number of entries to exclude from end of each group. + + Returns + ------- + Series or DataFrame + Subset of original Series or DataFrame as determined by n. + %(see_also)s + Examples + -------- + + >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], + ... columns=['A', 'B']) + >>> df.groupby('A').head(1) + A B + 0 1 2 + 2 5 6 + >>> df.groupby('A').head(-1) + A B + 0 1 2 + """ + mask = self._make_mask_from_positional_indexer(slice(None, n)) + return self._mask_selected_obj(mask) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def tail(self, n: int = 5) -> NDFrameT: + """ + Return last n rows of each group. + + Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows + from the original DataFrame with original index and order preserved + (``as_index`` flag is ignored). + + Parameters + ---------- + n : int + If positive: number of entries to include from end of each group. + If negative: number of entries to exclude from start of each group. + + Returns + ------- + Series or DataFrame + Subset of original Series or DataFrame as determined by n. + %(see_also)s + Examples + -------- + + >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]], + ... columns=['A', 'B']) + >>> df.groupby('A').tail(1) + A B + 1 a 2 + 3 b 2 + >>> df.groupby('A').tail(-1) + A B + 1 a 2 + 3 b 2 + """ + if n: + mask = self._make_mask_from_positional_indexer(slice(-n, None)) + else: + mask = self._make_mask_from_positional_indexer([]) + + return self._mask_selected_obj(mask) + + @final + def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT: + """ + Return _selected_obj with mask applied to the correct axis. + + Parameters + ---------- + mask : np.ndarray[bool] + Boolean mask to apply. + + Returns + ------- + Series or DataFrame + Filtered _selected_obj. 
+ """ + ids = self.grouper.group_info[0] + mask = mask & (ids != -1) + + if self.axis == 0: + return self._selected_obj[mask] + else: + return self._selected_obj.iloc[:, mask] + + @final + def _reindex_output( + self, + output: OutputFrameOrSeries, + fill_value: Scalar = np.nan, + qs: npt.NDArray[np.float64] | None = None, + ) -> OutputFrameOrSeries: + """ + If we have categorical groupers, then we might want to make sure that + we have a fully re-indexed output to the levels. This means expanding + the output space to accommodate all values in the cartesian product of + our groups, regardless of whether they were observed in the data or + not. This will expand the output space if there are missing groups. + + The method returns early without modifying the input if the number of + groupings is less than 2, self.observed == True or none of the groupers + are categorical. + + Parameters + ---------- + output : Series or DataFrame + Object resulting from grouping and applying an operation. + fill_value : scalar, default np.nan + Value to use for unobserved categories if self.observed is False. + qs : np.ndarray[float64] or None, default None + quantile values, only relevant for quantile. + + Returns + ------- + Series or DataFrame + Object (potentially) re-indexed to include all possible groups. + """ + groupings = self.grouper.groupings + if len(groupings) == 1: + return output + + # if we only care about the observed values + # we are done + elif self.observed: + return output + + # reindexing only applies to a Categorical grouper + elif not any( + isinstance(ping.grouping_vector, (Categorical, CategoricalIndex)) + for ping in groupings + ): + return output + + levels_list = [ping.group_index for ping in groupings] + names = self.grouper.names + if qs is not None: + # error: Argument 1 to "append" of "list" has incompatible type + # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index" + levels_list.append(qs) # type: ignore[arg-type] + names = names + [None] + index = MultiIndex.from_product(levels_list, names=names) + if self.sort: + index = index.sort_values() + + if self.as_index: + # Always holds for SeriesGroupBy unless GH#36507 is implemented + d = { + self.obj._get_axis_name(self.axis): index, + "copy": False, + "fill_value": fill_value, + } + return output.reindex(**d) # type: ignore[arg-type] + + # GH 13204 + # Here, the categorical in-axis groupers, which need to be fully + # expanded, are columns in `output`. An idea is to do: + # output = output.set_index(self.grouper.names) + # .reindex(index).reset_index() + # but special care has to be taken because of possible not-in-axis + # groupers. + # So, we manually select and drop the in-axis grouper columns, + # reindex `output`, and then reset the in-axis grouper columns. 
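A minimal sketch (illustration only, not part of the diff) of the user-visible effect of the re-indexing described above, assuming pandas 2.x and at least two categorical groupers so that none of the early-return conditions apply:

    import pandas as pd

    df = pd.DataFrame(
        {
            "cat1": pd.Categorical(["a", "a"], categories=["a", "b"]),
            "cat2": pd.Categorical(["x", "y"], categories=["x", "y"]),
            "value": [1, 2],
        }
    )

    # With two categorical groupers and observed=False, unobserved combinations
    # such as ("b", "x") still appear in the re-indexed result.
    print(df.groupby(["cat1", "cat2"], observed=False)["value"].sum())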
+ + # Select in-axis groupers + in_axis_grps = [ + (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis + ] + if len(in_axis_grps) > 0: + g_nums, g_names = zip(*in_axis_grps) + output = output.drop(labels=list(g_names), axis=1) + + # Set a temp index and reindex (possibly expanding) + output = output.set_index(self.grouper.result_index).reindex( + index, copy=False, fill_value=fill_value + ) + + # Reset in-axis grouper columns + # (using level numbers `g_nums` because level names may not be unique) + if len(in_axis_grps) > 0: + output = output.reset_index(level=g_nums) + + return output.reset_index(drop=True) + + @final + def sample( + self, + n: int | None = None, + frac: float | None = None, + replace: bool = False, + weights: Sequence | Series | None = None, + random_state: RandomState | None = None, + ): + """ + Return a random sample of items from each group. + + You can use `random_state` for reproducibility. + + Parameters + ---------- + n : int, optional + Number of items to return for each group. Cannot be used with + `frac` and must be no larger than the smallest group unless + `replace` is True. Default is one if `frac` is None. + frac : float, optional + Fraction of items to return. Cannot be used with `n`. + replace : bool, default False + Allow or disallow sampling of the same row more than once. + weights : list-like, optional + Default None results in equal probability weighting. + If passed a list-like then values must have the same length as + the underlying DataFrame or Series object and will be used as + sampling probabilities after normalization within each group. + Values must be non-negative with at least one positive element + within each group. + random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional + If int, array-like, or BitGenerator, seed for random number generator. + If np.random.RandomState or np.random.Generator, use as given. + + .. versionchanged:: 1.4.0 + + np.random.Generator objects now accepted + + Returns + ------- + Series or DataFrame + A new object of same type as caller containing items randomly + sampled within each group from the caller object. + + See Also + -------- + DataFrame.sample: Generate random samples from a DataFrame object. + numpy.random.choice: Generate a random sample from a given 1-D numpy + array. + + Examples + -------- + >>> df = pd.DataFrame( + ... {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)} + ... ) + >>> df + a b + 0 red 0 + 1 red 1 + 2 blue 2 + 3 blue 3 + 4 black 4 + 5 black 5 + + Select one row at random for each distinct value in column a. The + `random_state` argument can be used to guarantee reproducibility: + + >>> df.groupby("a").sample(n=1, random_state=1) + a b + 4 black 4 + 2 blue 2 + 1 red 1 + + Set `frac` to sample fixed proportions rather than counts: + + >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2) + 5 5 + 2 2 + 0 0 + Name: b, dtype: int64 + + Control sample probabilities within groups by setting weights: + + >>> df.groupby("a").sample( + ... n=1, + ... weights=[1, 1, 1, 0, 0, 1], + ... random_state=1, + ... 
) + a b + 5 black 5 + 2 blue 2 + 0 red 0 + """ # noqa: E501 + if self._selected_obj.empty: + # GH48459 prevent ValueError when object is empty + return self._selected_obj + size = sample.process_sampling_size(n, frac, replace) + if weights is not None: + weights_arr = sample.preprocess_weights( + self._selected_obj, weights, axis=self.axis + ) + + random_state = com.random_state(random_state) + + group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis) + + sampled_indices = [] + for labels, obj in group_iterator: + grp_indices = self.indices[labels] + group_size = len(grp_indices) + if size is not None: + sample_size = size + else: + assert frac is not None + sample_size = round(frac * group_size) + + grp_sample = sample.sample( + group_size, + size=sample_size, + replace=replace, + weights=None if weights is None else weights_arr[grp_indices], + random_state=random_state, + ) + sampled_indices.append(grp_indices[grp_sample]) + + sampled_indices = np.concatenate(sampled_indices) + return self._selected_obj.take(sampled_indices, axis=self.axis) + + def _idxmax_idxmin( + self, + how: Literal["idxmax", "idxmin"], + ignore_unobserved: bool = False, + axis: Axis | None | lib.NoDefault = lib.no_default, + skipna: bool = True, + numeric_only: bool = False, + ) -> NDFrameT: + """Compute idxmax/idxmin. + + Parameters + ---------- + how : {'idxmin', 'idxmax'} + Whether to compute idxmin or idxmax. + axis : {{0 or 'index', 1 or 'columns'}}, default None + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + If axis is not provided, grouper's axis is used. + numeric_only : bool, default False + Include only float, int, boolean columns. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ignore_unobserved : bool, default False + When True and an unobserved group is encountered, do not raise. This used + for transform where unobserved groups do not play an impact on the result. + + Returns + ------- + Series or DataFrame + idxmax or idxmin for the groupby operation. + """ + if axis is not lib.no_default: + if axis is None: + axis = self.axis + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, how) + else: + axis = self.axis + + if not self.observed and any( + ping._passed_categorical for ping in self.grouper.groupings + ): + expected_len = np.prod( + [len(ping.group_index) for ping in self.grouper.groupings] + ) + if len(self.grouper.groupings) == 1: + result_len = len(self.grouper.groupings[0].grouping_vector.unique()) + else: + # result_index only contains observed groups in this case + result_len = len(self.grouper.result_index) + assert result_len <= expected_len + has_unobserved = result_len < expected_len + + raise_err: bool | np.bool_ = not ignore_unobserved and has_unobserved + # Only raise an error if there are columns to compute; otherwise we return + # an empty DataFrame with an index (possibly including unobserved) but no + # columns + data = self._obj_with_exclusions + if raise_err and isinstance(data, DataFrame): + if numeric_only: + data = data._get_numeric_data() + raise_err = len(data.columns) > 0 + + if raise_err: + raise ValueError( + f"Can't get {how} of an empty group due to unobserved categories. " + "Specify observed=True in groupby instead." + ) + elif not skipna: + if self._obj_with_exclusions.isna().any(axis=None): + warnings.warn( + f"The behavior of {type(self).__name__}.{how} with all-NA " + "values, or any-NA and skipna=False, is deprecated. 
In a future " + "version this will raise ValueError", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if axis == 1: + try: + + def func(df): + method = getattr(df, how) + return method(axis=axis, skipna=skipna, numeric_only=numeric_only) + + func.__name__ = how + result = self._python_apply_general( + func, self._obj_with_exclusions, not_indexed_same=True + ) + except ValueError as err: + name = "argmax" if how == "idxmax" else "argmin" + if f"attempt to get {name} of an empty sequence" in str(err): + raise ValueError( + f"Can't get {how} of an empty group due to unobserved " + "categories. Specify observed=True in groupby instead." + ) from None + raise + return result + + result = self._agg_general( + numeric_only=numeric_only, + min_count=1, + alias=how, + skipna=skipna, + ) + return result + + def _wrap_idxmax_idxmin(self, res: NDFrameT) -> NDFrameT: + index = self.obj._get_axis(self.axis) + if res.size == 0: + result = res.astype(index.dtype) + else: + if isinstance(index, MultiIndex): + index = index.to_flat_index() + values = res._values + assert isinstance(values, np.ndarray) + na_value = na_value_for_dtype(index.dtype, compat=False) + if isinstance(res, Series): + # mypy: expression has type "Series", variable has type "NDFrameT" + result = res._constructor( # type: ignore[assignment] + index.array.take(values, allow_fill=True, fill_value=na_value), + index=res.index, + name=res.name, + ) + else: + data = {} + for k, column_values in enumerate(values.T): + data[k] = index.array.take( + column_values, allow_fill=True, fill_value=na_value + ) + result = self.obj._constructor(data, index=res.index) + result.columns = res.columns + return result + + +@doc(GroupBy) +def get_groupby( + obj: NDFrame, + by: _KeysArgType | None = None, + axis: AxisInt = 0, + grouper: ops.BaseGrouper | None = None, + group_keys: bool = True, +) -> GroupBy: + klass: type[GroupBy] + if isinstance(obj, Series): + from pandas._core.groupby.generic import SeriesGroupBy + + klass = SeriesGroupBy + elif isinstance(obj, DataFrame): + from pandas._core.groupby.generic import DataFrameGroupBy + + klass = DataFrameGroupBy + else: # pragma: no cover + raise TypeError(f"invalid type: {obj}") + + return klass( + obj=obj, + keys=by, + axis=axis, + grouper=grouper, + group_keys=group_keys, + ) + + +def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiIndex: + """ + Insert the sequence 'qs' of quantiles as the inner-most level of a MultiIndex. + + The quantile level in the MultiIndex is a repeated copy of 'qs'. + + Parameters + ---------- + idx : Index + qs : np.ndarray[float64] + + Returns + ------- + MultiIndex + """ + nqs = len(qs) + lev_codes, lev = Index(qs).factorize() + lev_codes = coerce_indexer_dtype(lev_codes, lev) + + if idx._is_multi: + idx = cast(MultiIndex, idx) + levels = list(idx.levels) + [lev] + codes = [np.repeat(x, nqs) for x in idx.codes] + [np.tile(lev_codes, len(idx))] + mi = MultiIndex(levels=levels, codes=codes, names=idx.names + [None]) + else: + nidx = len(idx) + idx_codes = coerce_indexer_dtype(np.arange(nidx), idx) + levels = [idx, lev] + codes = [np.repeat(idx_codes, nqs), np.tile(lev_codes, nidx)] + mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None]) + + return mi + + +# GH#7155 +_apply_groupings_depr = ( + "{}.{} operated on the grouping columns. This behavior is deprecated, " + "and in a future version of pandas the grouping columns will be excluded " + "from the operation. 
Either pass `include_groups=False` to exclude the " + "groupings or explicitly select the grouping columns after groupby to silence " + "this warning." +) diff --git a/pandas/_core/groupby/grouper.py b/pandas/_core/groupby/grouper.py new file mode 100644 index 0000000000000..a86ea4982f91c --- /dev/null +++ b/pandas/_core/groupby/grouper.py @@ -0,0 +1,1072 @@ +""" +Provide user facing operators for doing the split part of the +split-apply-combine paradigm. +""" +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + final, +) +import warnings + +import numpy as np + +from pandas._config import ( + using_copy_on_write, + warn_copy_on_write, +) + +from pandas._libs import lib +from pandas._libs.tslibs import OutOfBoundsDatetime +from pandas.errors import InvalidIndexError +from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import ( + is_list_like, + is_scalar, +) +from pandas.core.dtypes.dtypes import CategoricalDtype + +from pandas._core.groupby import ops +from pandas._core.groupby.categorical import recode_for_groupby +from pandas.core import algorithms +from pandas.core.arrays import ( + Categorical, + ExtensionArray, +) +import pandas.core.common as com +from pandas.core.frame import DataFrame +from pandas.core.indexes.api import ( + CategoricalIndex, + Index, + MultiIndex, +) +from pandas.core.series import Series + +from pandas.io.formats.printing import pprint_thing + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Iterator, + ) + + from pandas._typing import ( + ArrayLike, + Axis, + NDFrameT, + npt, + ) + + from pandas.core.generic import NDFrame + + +class Grouper: + """ + A Grouper allows the user to specify a groupby instruction for an object. + + This specification will select a column via the key parameter, or if the + level and/or axis parameters are given, a level of the index of the target + object. + + If `axis` and/or `level` are passed as keywords to both `Grouper` and + `groupby`, the values passed to `Grouper` take precedence. + + Parameters + ---------- + key : str, defaults to None + Groupby key, which selects the grouping column of the target. + level : name/number, defaults to None + The level for the target index. + freq : str / frequency object, defaults to None + This will groupby the specified frequency if the target selection + (via key or level) is a datetime-like object. For full specification + of available frequencies, please see `here + `_. + axis : str, int, defaults to 0 + Number/name of the axis. + sort : bool, default to False + Whether to sort the resulting labels. + closed : {'left' or 'right'} + Closed end of interval. Only when `freq` parameter is passed. + label : {'left' or 'right'} + Interval boundary to use for labeling. + Only when `freq` parameter is passed. + convention : {'start', 'end', 'e', 's'} + If grouper is PeriodIndex and `freq` parameter is passed. + + origin : Timestamp or str, default 'start_day' + The timestamp on which to adjust the grouping. The timezone of origin must + match the timezone of the index. + If string, must be one of the following: + + - 'epoch': `origin` is 1970-01-01 + - 'start': `origin` is the first value of the timeseries + - 'start_day': `origin` is the first day at midnight of the timeseries + + - 'end': `origin` is the last value of the timeseries + - 'end_day': `origin` is the ceiling midnight of the last day + + .. 
versionadded:: 1.3.0 + + offset : Timedelta or str, default is None + An offset timedelta added to the origin. + + dropna : bool, default True + If True, and if group keys contain NA values, NA values together with + row/column will be dropped. If False, NA values will also be treated as + the key in groups. + + .. versionadded:: 1.2.0 + + Returns + ------- + Grouper or pandas.api.typing.TimeGrouper + A TimeGrouper is returned if ``freq`` is not ``None``. Otherwise, a Grouper + is returned. + + Examples + -------- + ``df.groupby(pd.Grouper(key="Animal"))`` is equivalent to ``df.groupby('Animal')`` + + >>> df = pd.DataFrame( + ... { + ... "Animal": ["Falcon", "Parrot", "Falcon", "Falcon", "Parrot"], + ... "Speed": [100, 5, 200, 300, 15], + ... } + ... ) + >>> df + Animal Speed + 0 Falcon 100 + 1 Parrot 5 + 2 Falcon 200 + 3 Falcon 300 + 4 Parrot 15 + >>> df.groupby(pd.Grouper(key="Animal")).mean() + Speed + Animal + Falcon 200.0 + Parrot 10.0 + + Specify a resample operation on the column 'Publish date' + + >>> df = pd.DataFrame( + ... { + ... "Publish date": [ + ... pd.Timestamp("2000-01-02"), + ... pd.Timestamp("2000-01-02"), + ... pd.Timestamp("2000-01-09"), + ... pd.Timestamp("2000-01-16") + ... ], + ... "ID": [0, 1, 2, 3], + ... "Price": [10, 20, 30, 40] + ... } + ... ) + >>> df + Publish date ID Price + 0 2000-01-02 0 10 + 1 2000-01-02 1 20 + 2 2000-01-09 2 30 + 3 2000-01-16 3 40 + >>> df.groupby(pd.Grouper(key="Publish date", freq="1W")).mean() + ID Price + Publish date + 2000-01-02 0.5 15.0 + 2000-01-09 2.0 30.0 + 2000-01-16 3.0 40.0 + + If you want to adjust the start of the bins based on a fixed timestamp: + + >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' + >>> rng = pd.date_range(start, end, freq='7min') + >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng) + >>> ts + 2000-10-01 23:30:00 0 + 2000-10-01 23:37:00 3 + 2000-10-01 23:44:00 6 + 2000-10-01 23:51:00 9 + 2000-10-01 23:58:00 12 + 2000-10-02 00:05:00 15 + 2000-10-02 00:12:00 18 + 2000-10-02 00:19:00 21 + 2000-10-02 00:26:00 24 + Freq: 7min, dtype: int64 + + >>> ts.groupby(pd.Grouper(freq='17min')).sum() + 2000-10-01 23:14:00 0 + 2000-10-01 23:31:00 9 + 2000-10-01 23:48:00 21 + 2000-10-02 00:05:00 54 + 2000-10-02 00:22:00 24 + Freq: 17min, dtype: int64 + + >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum() + 2000-10-01 23:18:00 0 + 2000-10-01 23:35:00 18 + 2000-10-01 23:52:00 27 + 2000-10-02 00:09:00 39 + 2000-10-02 00:26:00 24 + Freq: 17min, dtype: int64 + + >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum() + 2000-10-01 23:24:00 3 + 2000-10-01 23:41:00 15 + 2000-10-01 23:58:00 45 + 2000-10-02 00:15:00 45 + Freq: 17min, dtype: int64 + + If you want to adjust the start of the bins with an `offset` Timedelta, the two + following lines are equivalent: + + >>> ts.groupby(pd.Grouper(freq='17min', origin='start')).sum() + 2000-10-01 23:30:00 9 + 2000-10-01 23:47:00 21 + 2000-10-02 00:04:00 54 + 2000-10-02 00:21:00 24 + Freq: 17min, dtype: int64 + + >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum() + 2000-10-01 23:30:00 9 + 2000-10-01 23:47:00 21 + 2000-10-02 00:04:00 54 + 2000-10-02 00:21:00 24 + Freq: 17min, dtype: int64 + + To replace the use of the deprecated `base` argument, you can now use `offset`, + in this example it is equivalent to have `base=2`: + + >>> ts.groupby(pd.Grouper(freq='17min', offset='2min')).sum() + 2000-10-01 23:16:00 0 + 2000-10-01 23:33:00 9 + 2000-10-01 23:50:00 36 + 2000-10-02 00:07:00 39 + 2000-10-02 00:24:00 24 + Freq: 17min, dtype: 
int64 + """ + + sort: bool + dropna: bool + _gpr_index: Index | None + _grouper: Index | None + + _attributes: tuple[str, ...] = ("key", "level", "freq", "axis", "sort", "dropna") + + def __new__(cls, *args, **kwargs): + if kwargs.get("freq") is not None: + from pandas.core.resample import TimeGrouper + + cls = TimeGrouper + return super().__new__(cls) + + def __init__( + self, + key=None, + level=None, + freq=None, + axis: Axis | lib.NoDefault = lib.no_default, + sort: bool = False, + dropna: bool = True, + ) -> None: + if type(self) is Grouper: + # i.e. not TimeGrouper + if axis is not lib.no_default: + warnings.warn( + "Grouper axis keyword is deprecated and will be removed in a " + "future version. To group on axis=1, use obj.T.groupby(...) " + "instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + axis = 0 + if axis is lib.no_default: + axis = 0 + + self.key = key + self.level = level + self.freq = freq + self.axis = axis + self.sort = sort + self.dropna = dropna + + self._grouper_deprecated = None + self._indexer_deprecated: npt.NDArray[np.intp] | None = None + self._obj_deprecated = None + self._gpr_index = None + self.binner = None + self._grouper = None + self._indexer: npt.NDArray[np.intp] | None = None + + def _get_grouper( + self, obj: NDFrameT, validate: bool = True + ) -> tuple[ops.BaseGrouper, NDFrameT]: + """ + Parameters + ---------- + obj : Series or DataFrame + validate : bool, default True + if True, validate the grouper + + Returns + ------- + a tuple of grouper, obj (possibly sorted) + """ + obj, _, _ = self._set_grouper(obj) + grouper, _, obj = get_grouper( + obj, + [self.key], + axis=self.axis, + level=self.level, + sort=self.sort, + validate=validate, + dropna=self.dropna, + ) + # Without setting this, subsequent lookups to .groups raise + # error: Incompatible types in assignment (expression has type "BaseGrouper", + # variable has type "None") + self._grouper_deprecated = grouper # type: ignore[assignment] + + return grouper, obj + + @final + def _set_grouper( + self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None + ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]: + """ + given an object and the specifications, setup the internal grouper + for this particular specification + + Parameters + ---------- + obj : Series or DataFrame + sort : bool, default False + whether the resulting grouper should be sorted + gpr_index : Index or None, default None + + Returns + ------- + NDFrame + Index + np.ndarray[np.intp] | None + """ + assert obj is not None + + if self.key is not None and self.level is not None: + raise ValueError("The Grouper cannot specify both a key and a level!") + + # Keep self._grouper value before overriding + if self._grouper is None: + # TODO: What are we assuming about subsequent calls? + self._grouper = gpr_index + self._indexer = self._indexer_deprecated + + # the key must be a valid info item + if self.key is not None: + key = self.key + # The 'on' is already defined + if getattr(gpr_index, "name", None) == key and isinstance(obj, Series): + # Sometimes self._grouper will have been resorted while + # obj has not. In this case there is a mismatch when we + # call self._grouper.take(obj.index) so we need to undo the sorting + # before we call _grouper.take. 
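A hedged illustration of the ``__new__`` dispatch defined above: passing ``freq`` yields a ``TimeGrouper`` rather than a plain ``Grouper``. It assumes pandas 2.1+, where ``TimeGrouper`` is exposed via ``pandas.api.typing`` as noted in the Returns section of the docstring; the key names are taken from the docstring examples.

    import pandas as pd
    from pandas.api.typing import TimeGrouper

    plain = pd.Grouper(key="Animal")
    timed = pd.Grouper(key="Publish date", freq="1W")

    print(type(plain).__name__)            # Grouper
    print(isinstance(timed, TimeGrouper))  # True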
+ assert self._grouper is not None + if self._indexer is not None: + reverse_indexer = self._indexer.argsort() + unsorted_ax = self._grouper.take(reverse_indexer) + ax = unsorted_ax.take(obj.index) + else: + ax = self._grouper.take(obj.index) + else: + if key not in obj._info_axis: + raise KeyError(f"The grouper name {key} is not found") + ax = Index(obj[key], name=key) + + else: + ax = obj._get_axis(self.axis) + if self.level is not None: + level = self.level + + # if a level is given it must be a mi level or + # equivalent to the axis name + if isinstance(ax, MultiIndex): + level = ax._get_level_number(level) + ax = Index(ax._get_level_values(level), name=ax.names[level]) + + else: + if level not in (0, ax.name): + raise ValueError(f"The level {level} is not valid") + + # possibly sort + indexer: npt.NDArray[np.intp] | None = None + if (self.sort or sort) and not ax.is_monotonic_increasing: + # use stable sort to support first, last, nth + # TODO: why does putting na_position="first" fix datetimelike cases? + indexer = self._indexer_deprecated = ax.array.argsort( + kind="mergesort", na_position="first" + ) + ax = ax.take(indexer) + obj = obj.take(indexer, axis=self.axis) + + # error: Incompatible types in assignment (expression has type + # "NDFrameT", variable has type "None") + self._obj_deprecated = obj # type: ignore[assignment] + self._gpr_index = ax + return obj, ax, indexer + + @final + @property + def ax(self) -> Index: + warnings.warn( + f"{type(self).__name__}.ax is deprecated and will be removed in a " + "future version. Use Resampler.ax instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + index = self._gpr_index + if index is None: + raise ValueError("_set_grouper must be called before ax is accessed") + return index + + @final + @property + def indexer(self): + warnings.warn( + f"{type(self).__name__}.indexer is deprecated and will be removed " + "in a future version. Use Resampler.indexer instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self._indexer_deprecated + + @final + @property + def obj(self): + # TODO(3.0): enforcing these deprecations on Grouper should close + # GH#25564, GH#41930 + warnings.warn( + f"{type(self).__name__}.obj is deprecated and will be removed " + "in a future version. Use GroupBy.indexer instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self._obj_deprecated + + @final + @property + def grouper(self): + warnings.warn( + f"{type(self).__name__}.grouper is deprecated and will be removed " + "in a future version. Use GroupBy.grouper instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self._grouper_deprecated + + @final + @property + def groups(self): + warnings.warn( + f"{type(self).__name__}.groups is deprecated and will be removed " + "in a future version. 
Use GroupBy.groups instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + # error: "None" has no attribute "groups" + return self._grouper_deprecated.groups # type: ignore[attr-defined] + + @final + def __repr__(self) -> str: + attrs_list = ( + f"{attr_name}={repr(getattr(self, attr_name))}" + for attr_name in self._attributes + if getattr(self, attr_name) is not None + ) + attrs = ", ".join(attrs_list) + cls_name = type(self).__name__ + return f"{cls_name}({attrs})" + + +@final +class Grouping: + """ + Holds the grouping information for a single key + + Parameters + ---------- + index : Index + grouper : + obj : DataFrame or Series + name : Label + level : + observed : bool, default False + If we are a Categorical, use the observed values + in_axis : if the Grouping is a column in self.obj and hence among + Groupby.exclusions list + dropna : bool, default True + Whether to drop NA groups. + uniques : Array-like, optional + When specified, will be used for unique values. Enables including empty groups + in the result for a BinGrouper. Must not contain duplicates. + + Attributes + ------- + indices : dict + Mapping of {group -> index_list} + codes : ndarray + Group codes + group_index : Index or None + unique groups + groups : dict + Mapping of {group -> label_list} + """ + + _codes: npt.NDArray[np.signedinteger] | None = None + _group_index: Index | None = None + _all_grouper: Categorical | None + _orig_cats: Index | None + _index: Index + + def __init__( + self, + index: Index, + grouper=None, + obj: NDFrame | None = None, + level=None, + sort: bool = True, + observed: bool = False, + in_axis: bool = False, + dropna: bool = True, + uniques: ArrayLike | None = None, + ) -> None: + self.level = level + self._orig_grouper = grouper + grouping_vector = _convert_grouper(index, grouper) + self._all_grouper = None + self._orig_cats = None + self._index = index + self._sort = sort + self.obj = obj + self._observed = observed + self.in_axis = in_axis + self._dropna = dropna + self._uniques = uniques + + # we have a single grouper which may be a myriad of things, + # some of which are dependent on the passing in level + + ilevel = self._ilevel + if ilevel is not None: + # In extant tests, the new self.grouping_vector matches + # `index.get_level_values(ilevel)` whenever + # mapper is None and isinstance(index, MultiIndex) + if isinstance(index, MultiIndex): + index_level = index.get_level_values(ilevel) + else: + index_level = index + + if grouping_vector is None: + grouping_vector = index_level + else: + mapper = grouping_vector + grouping_vector = index_level.map(mapper) + + # a passed Grouper like, directly get the grouper in the same way + # as single grouper groupby, use the group_info to get codes + elif isinstance(grouping_vector, Grouper): + # get the new grouper; we already have disambiguated + # what key/level refer to exactly, don't need to + # check again as we have by this point converted these + # to an actual value (rather than a pd.Grouper) + assert self.obj is not None # for mypy + newgrouper, newobj = grouping_vector._get_grouper(self.obj, validate=False) + self.obj = newobj + + if isinstance(newgrouper, ops.BinGrouper): + # TODO: can we unwrap this and get a tighter typing + # for self.grouping_vector? + grouping_vector = newgrouper + else: + # ops.BaseGrouper + # TODO: 2023-02-03 no test cases with len(newgrouper.groupings) > 1. + # If that were to occur, would we be throwing out information? 
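A small sketch (illustrative only, assuming pandas 2.x) of the level-based branch above, where the grouping vector is taken from the values of the requested index level:

    import pandas as pd

    idx = pd.MultiIndex.from_tuples(
        [("a", 1), ("a", 2), ("b", 1)], names=["letter", "number"]
    )
    ser = pd.Series([10, 20, 30], index=idx)

    # With level=..., the grouping vector is that level of the index
    # (via get_level_values in the branch above): a -> 30, b -> 30.
    print(ser.groupby(level="letter").sum())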
+ # error: Cannot determine type of "grouping_vector" [has-type] + ng = newgrouper.groupings[0].grouping_vector # type: ignore[has-type] + # use Index instead of ndarray so we can recover the name + grouping_vector = Index(ng, name=newgrouper.result_index.name) + + elif not isinstance( + grouping_vector, (Series, Index, ExtensionArray, np.ndarray) + ): + # no level passed + if getattr(grouping_vector, "ndim", 1) != 1: + t = str(type(grouping_vector)) + raise ValueError(f"Grouper for '{t}' not 1-dimensional") + + grouping_vector = index.map(grouping_vector) + + if not ( + hasattr(grouping_vector, "__len__") + and len(grouping_vector) == len(index) + ): + grper = pprint_thing(grouping_vector) + errmsg = ( + "Grouper result violates len(labels) == " + f"len(data)\nresult: {grper}" + ) + raise AssertionError(errmsg) + + if isinstance(grouping_vector, np.ndarray): + if grouping_vector.dtype.kind in "mM": + # if we have a date/time-like grouper, make sure that we have + # Timestamps like + # TODO 2022-10-08 we only have one test that gets here and + # values are already in nanoseconds in that case. + grouping_vector = Series(grouping_vector).to_numpy() + elif isinstance(getattr(grouping_vector, "dtype", None), CategoricalDtype): + # a passed Categorical + self._orig_cats = grouping_vector.categories + grouping_vector, self._all_grouper = recode_for_groupby( + grouping_vector, sort, observed + ) + + self.grouping_vector = grouping_vector + + def __repr__(self) -> str: + return f"Grouping({self.name})" + + def __iter__(self) -> Iterator: + return iter(self.indices) + + @cache_readonly + def _passed_categorical(self) -> bool: + dtype = getattr(self.grouping_vector, "dtype", None) + return isinstance(dtype, CategoricalDtype) + + @cache_readonly + def name(self) -> Hashable: + ilevel = self._ilevel + if ilevel is not None: + return self._index.names[ilevel] + + if isinstance(self._orig_grouper, (Index, Series)): + return self._orig_grouper.name + + elif isinstance(self.grouping_vector, ops.BaseGrouper): + return self.grouping_vector.result_index.name + + elif isinstance(self.grouping_vector, Index): + return self.grouping_vector.name + + # otherwise we have ndarray or ExtensionArray -> no name + return None + + @cache_readonly + def _ilevel(self) -> int | None: + """ + If necessary, converted index level name to index level position. + """ + level = self.level + if level is None: + return None + if not isinstance(level, int): + index = self._index + if level not in index.names: + raise AssertionError(f"Level {level} not in index") + return index.names.index(level) + return level + + @property + def ngroups(self) -> int: + return len(self.group_index) + + @cache_readonly + def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: + # we have a list of groupers + if isinstance(self.grouping_vector, ops.BaseGrouper): + return self.grouping_vector.indices + + values = Categorical(self.grouping_vector) + return values._reverse_indexer() + + @property + def codes(self) -> npt.NDArray[np.signedinteger]: + return self._codes_and_uniques[0] + + @cache_readonly + def group_arraylike(self) -> ArrayLike: + """ + Analogous to result_index, but holding an ArrayLike to ensure + we can retain ExtensionDtypes. 
+ """ + if self._all_grouper is not None: + # retain dtype for categories, including unobserved ones + return self.result_index._values + + elif self._passed_categorical: + return self.group_index._values + + return self._codes_and_uniques[1] + + @cache_readonly + def result_index(self) -> Index: + # result_index retains dtype for categories, including unobserved ones, + # which group_index does not + if self._all_grouper is not None: + group_idx = self.group_index + assert isinstance(group_idx, CategoricalIndex) + cats = self._orig_cats + # set_categories is dynamically added + return group_idx.set_categories(cats) # type: ignore[attr-defined] + return self.group_index + + @cache_readonly + def group_index(self) -> Index: + codes, uniques = self._codes_and_uniques + if not self._dropna and self._passed_categorical: + assert isinstance(uniques, Categorical) + if self._sort and (codes == len(uniques)).any(): + # Add NA value on the end when sorting + uniques = Categorical.from_codes( + np.append(uniques.codes, [-1]), uniques.categories, validate=False + ) + elif len(codes) > 0: + # Need to determine proper placement of NA value when not sorting + cat = self.grouping_vector + na_idx = (cat.codes < 0).argmax() + if cat.codes[na_idx] < 0: + # count number of unique codes that comes before the nan value + na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx]) + new_codes = np.insert(uniques.codes, na_unique_idx, -1) + uniques = Categorical.from_codes( + new_codes, uniques.categories, validate=False + ) + return Index._with_infer(uniques, name=self.name) + + @cache_readonly + def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: + uniques: ArrayLike + if self._passed_categorical: + # we make a CategoricalIndex out of the cat grouper + # preserving the categories / ordered attributes; + # doesn't (yet - GH#46909) handle dropna=False + cat = self.grouping_vector + categories = cat.categories + + if self._observed: + ucodes = algorithms.unique1d(cat.codes) + ucodes = ucodes[ucodes != -1] + if self._sort: + ucodes = np.sort(ucodes) + else: + ucodes = np.arange(len(categories)) + + uniques = Categorical.from_codes( + codes=ucodes, categories=categories, ordered=cat.ordered, validate=False + ) + + codes = cat.codes + if not self._dropna: + na_mask = codes < 0 + if np.any(na_mask): + if self._sort: + # Replace NA codes with `largest code + 1` + na_code = len(categories) + codes = np.where(na_mask, na_code, codes) + else: + # Insert NA code into the codes based on first appearance + # A negative code must exist, no need to check codes[na_idx] < 0 + na_idx = na_mask.argmax() + # count number of unique codes that comes before the nan value + na_code = algorithms.nunique_ints(codes[:na_idx]) + codes = np.where(codes >= na_code, codes + 1, codes) + codes = np.where(na_mask, na_code, codes) + + if not self._observed: + uniques = uniques.reorder_categories(self._orig_cats) + + return codes, uniques + + elif isinstance(self.grouping_vector, ops.BaseGrouper): + # we have a list of groupers + codes = self.grouping_vector.codes_info + uniques = self.grouping_vector.result_index._values + elif self._uniques is not None: + # GH#50486 Code grouping_vector using _uniques; allows + # including uniques that are not present in grouping_vector. 
+ cat = Categorical(self.grouping_vector, categories=self._uniques) + codes = cat.codes + uniques = self._uniques + else: + # GH35667, replace dropna=False with use_na_sentinel=False + # error: Incompatible types in assignment (expression has type "Union[ + # ndarray[Any, Any], Index]", variable has type "Categorical") + codes, uniques = algorithms.factorize( # type: ignore[assignment] + self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna + ) + return codes, uniques + + @cache_readonly + def groups(self) -> dict[Hashable, np.ndarray]: + cats = Categorical.from_codes(self.codes, self.group_index, validate=False) + return self._index.groupby(cats) + + +def get_grouper( + obj: NDFrameT, + key=None, + axis: Axis = 0, + level=None, + sort: bool = True, + observed: bool = False, + validate: bool = True, + dropna: bool = True, +) -> tuple[ops.BaseGrouper, frozenset[Hashable], NDFrameT]: + """ + Create and return a BaseGrouper, which is an internal + mapping of how to create the grouper indexers. + This may be composed of multiple Grouping objects, indicating + multiple groupers + + Groupers are ultimately index mappings. They can originate as: + index mappings, keys to columns, functions, or Groupers + + Groupers enable local references to axis,level,sort, while + the passed in axis, level, and sort are 'global'. + + This routine tries to figure out what the passing in references + are and then creates a Grouping for each one, combined into + a BaseGrouper. + + If observed & we have a categorical grouper, only show the observed + values. + + If validate, then check for key/level overlaps. + + """ + group_axis = obj._get_axis(axis) + + # validate that the passed single level is compatible with the passed + # axis of the object + if level is not None: + # TODO: These if-block and else-block are almost same. + # MultiIndex instance check is removable, but it seems that there are + # some processes only for non-MultiIndex in else-block, + # eg. `obj.index.name != level`. We have to consider carefully whether + # these are applicable for MultiIndex. Even if these are applicable, + # we need to check if it makes no side effect to subsequent processes + # on the outside of this condition. + # (GH 17621) + if isinstance(group_axis, MultiIndex): + if is_list_like(level) and len(level) == 1: + level = level[0] + + if key is None and is_scalar(level): + # Get the level values from group_axis + key = group_axis.get_level_values(level) + level = None + + else: + # allow level to be a length-one list-like object + # (e.g., level=[0]) + # GH 13901 + if is_list_like(level): + nlevels = len(level) + if nlevels == 1: + level = level[0] + elif nlevels == 0: + raise ValueError("No group keys passed!") + else: + raise ValueError("multiple levels only valid with MultiIndex") + + if isinstance(level, str): + if obj._get_axis(axis).name != level: + raise ValueError( + f"level name {level} is not the name " + f"of the {obj._get_axis_name(axis)}" + ) + elif level > 0 or level < -1: + raise ValueError("level > 0 or level < -1 only valid with MultiIndex") + + # NOTE: `group_axis` and `group_axis.get_level_values(level)` + # are same in this section. 
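+ # Since the level has been validated against this (non-Multi) index,
+ # grouping by the level is the same as grouping by the axis itself,
+ # so treat the axis as the key and clear `level`.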
+ level = None + key = group_axis + + # a passed-in Grouper, directly convert + if isinstance(key, Grouper): + grouper, obj = key._get_grouper(obj, validate=False) + if key.key is None: + return grouper, frozenset(), obj + else: + return grouper, frozenset({key.key}), obj + + # already have a BaseGrouper, just return it + elif isinstance(key, ops.BaseGrouper): + return key, frozenset(), obj + + if not isinstance(key, list): + keys = [key] + match_axis_length = False + else: + keys = key + match_axis_length = len(keys) == len(group_axis) + + # what are we after, exactly? + any_callable = any(callable(g) or isinstance(g, dict) for g in keys) + any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys) + any_arraylike = any( + isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys + ) + + # is this an index replacement? + if ( + not any_callable + and not any_arraylike + and not any_groupers + and match_axis_length + and level is None + ): + if isinstance(obj, DataFrame): + all_in_columns_index = all( + g in obj.columns or g in obj.index.names for g in keys + ) + else: + assert isinstance(obj, Series) + all_in_columns_index = all(g in obj.index.names for g in keys) + + if not all_in_columns_index: + keys = [com.asarray_tuplesafe(keys)] + + if isinstance(level, (tuple, list)): + if key is None: + keys = [None] * len(level) + levels = level + else: + levels = [level] * len(keys) + + groupings: list[Grouping] = [] + exclusions: set[Hashable] = set() + + # if the actual grouper should be obj[key] + def is_in_axis(key) -> bool: + if not _is_label_like(key): + if obj.ndim == 1: + return False + + # items -> .columns for DataFrame, .index for Series + items = obj.axes[-1] + try: + items.get_loc(key) + except (KeyError, TypeError, InvalidIndexError): + # TypeError shows up here if we pass e.g. an Index + return False + + return True + + # if the grouper is obj[name] + def is_in_obj(gpr) -> bool: + if not hasattr(gpr, "name"): + return False + if using_copy_on_write() or warn_copy_on_write(): + # For the CoW case, we check the references to determine if the + # series is part of the object + try: + obj_gpr_column = obj[gpr.name] + except (KeyError, IndexError, InvalidIndexError, OutOfBoundsDatetime): + return False + if isinstance(gpr, Series) and isinstance(obj_gpr_column, Series): + return gpr._mgr.references_same_values( # type: ignore[union-attr] + obj_gpr_column._mgr, 0 # type: ignore[arg-type] + ) + return False + try: + return gpr is obj[gpr.name] + except (KeyError, IndexError, InvalidIndexError, OutOfBoundsDatetime): + # IndexError reached in e.g. test_skip_group_keys when we pass + # lambda here + # InvalidIndexError raised on key-types inappropriate for index, + # e.g. 
DatetimeIndex.get_loc(tuple()) + # OutOfBoundsDatetime raised when obj is a Series with DatetimeIndex + # and gpr.name is month str + return False + + for gpr, level in zip(keys, levels): + if is_in_obj(gpr): # df.groupby(df['name']) + in_axis = True + exclusions.add(gpr.name) + + elif is_in_axis(gpr): # df.groupby('name') + if obj.ndim != 1 and gpr in obj: + if validate: + obj._check_label_or_level_ambiguity(gpr, axis=axis) + in_axis, name, gpr = True, gpr, obj[gpr] + if gpr.ndim != 1: + # non-unique columns; raise here to get the name in the + # exception message + raise ValueError(f"Grouper for '{name}' not 1-dimensional") + exclusions.add(name) + elif obj._is_level_reference(gpr, axis=axis): + in_axis, level, gpr = False, gpr, None + else: + raise KeyError(gpr) + elif isinstance(gpr, Grouper) and gpr.key is not None: + # Add key to exclusions + exclusions.add(gpr.key) + in_axis = True + else: + in_axis = False + + # create the Grouping + # allow us to passing the actual Grouping as the gpr + ping = ( + Grouping( + group_axis, + gpr, + obj=obj, + level=level, + sort=sort, + observed=observed, + in_axis=in_axis, + dropna=dropna, + ) + if not isinstance(gpr, Grouping) + else gpr + ) + + groupings.append(ping) + + if len(groupings) == 0 and len(obj): + raise ValueError("No group keys passed!") + if len(groupings) == 0: + groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) + + # create the internals grouper + grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna) + return grouper, frozenset(exclusions), obj + + +def _is_label_like(val) -> bool: + return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val)) + + +def _convert_grouper(axis: Index, grouper): + if isinstance(grouper, dict): + return grouper.get + elif isinstance(grouper, Series): + if grouper.index.equals(axis): + return grouper._values + else: + return grouper.reindex(axis)._values + elif isinstance(grouper, MultiIndex): + return grouper._values + elif isinstance(grouper, (list, tuple, Index, Categorical, np.ndarray)): + if len(grouper) != len(axis): + raise ValueError("Grouper and axis must be same length") + + if isinstance(grouper, (list, tuple)): + grouper = com.asarray_tuplesafe(grouper) + return grouper + else: + return grouper diff --git a/pandas/_core/groupby/indexing.py b/pandas/_core/groupby/indexing.py new file mode 100644 index 0000000000000..9ea763b74aec2 --- /dev/null +++ b/pandas/_core/groupby/indexing.py @@ -0,0 +1,304 @@ +from __future__ import annotations + +from collections.abc import Iterable +from typing import ( + TYPE_CHECKING, + Literal, + cast, +) + +import numpy as np + +from pandas.util._decorators import ( + cache_readonly, + doc, +) + +from pandas.core.dtypes.common import ( + is_integer, + is_list_like, +) + +if TYPE_CHECKING: + from pandas._typing import PositionalIndexer + + from pandas import ( + DataFrame, + Series, + ) + from pandas._core.groupby import groupby + + +class GroupByIndexingMixin: + """ + Mixin for adding ._positional_selector to GroupBy. + """ + + @cache_readonly + def _positional_selector(self) -> GroupByPositionalSelector: + """ + Return positional selection for each group. + + ``groupby._positional_selector[i:j]`` is similar to + ``groupby.apply(lambda x: x.iloc[i:j])`` + but much faster and preserves the original index and order. + + ``_positional_selector[]`` is compatible with and extends :meth:`~GroupBy.head` + and :meth:`~GroupBy.tail`. 
For example: + + - ``head(5)`` + - ``_positional_selector[5:-5]`` + - ``tail(5)`` + + together return all the rows. + + Allowed inputs for the index are: + + - An integer valued iterable, e.g. ``range(2, 4)``. + - A comma separated list of integers and slices, e.g. ``5``, ``2, 4``, ``2:4``. + + The output format is the same as :meth:`~GroupBy.head` and + :meth:`~GroupBy.tail`, namely + a subset of the ``DataFrame`` or ``Series`` with the index and order preserved. + + Returns + ------- + Series + The filtered subset of the original Series. + DataFrame + The filtered subset of the original DataFrame. + + See Also + -------- + DataFrame.iloc : Purely integer-location based indexing for selection by + position. + GroupBy.head : Return first n rows of each group. + GroupBy.tail : Return last n rows of each group. + GroupBy.nth : Take the nth row from each group if n is an int, or a + subset of rows, if n is a list of ints. + + Notes + ----- + - The slice step cannot be negative. + - If the index specification results in overlaps, the item is not duplicated. + - If the index specification changes the order of items, then + they are returned in their original order. + By contrast, ``DataFrame.iloc`` can change the row order. + - ``groupby()`` parameters such as as_index and dropna are ignored. + + The differences between ``_positional_selector[]`` and :meth:`~GroupBy.nth` + with ``as_index=False`` are: + + - Input to ``_positional_selector`` can include + one or more slices whereas ``nth`` + just handles an integer or a list of integers. + - ``_positional_selector`` can accept a slice relative to the + last row of each group. + - ``_positional_selector`` does not have an equivalent to the + ``nth()`` ``dropna`` parameter. + + Examples + -------- + >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], + ... columns=["A", "B"]) + >>> df.groupby("A")._positional_selector[1:2] + A B + 1 a 2 + 4 b 5 + + >>> df.groupby("A")._positional_selector[1, -1] + A B + 1 a 2 + 2 a 3 + 4 b 5 + """ + if TYPE_CHECKING: + # pylint: disable-next=used-before-assignment + groupby_self = cast(groupby.GroupBy, self) + else: + groupby_self = self + + return GroupByPositionalSelector(groupby_self) + + def _make_mask_from_positional_indexer( + self, + arg: PositionalIndexer | tuple, + ) -> np.ndarray: + if is_list_like(arg): + if all(is_integer(i) for i in cast(Iterable, arg)): + mask = self._make_mask_from_list(cast(Iterable[int], arg)) + else: + mask = self._make_mask_from_tuple(cast(tuple, arg)) + + elif isinstance(arg, slice): + mask = self._make_mask_from_slice(arg) + elif is_integer(arg): + mask = self._make_mask_from_int(cast(int, arg)) + else: + raise TypeError( + f"Invalid index {type(arg)}. 
" + "Must be integer, list-like, slice or a tuple of " + "integers and slices" + ) + + if isinstance(mask, bool): + if mask: + mask = self._ascending_count >= 0 + else: + mask = self._ascending_count < 0 + + return cast(np.ndarray, mask) + + def _make_mask_from_int(self, arg: int) -> np.ndarray: + if arg >= 0: + return self._ascending_count == arg + else: + return self._descending_count == (-arg - 1) + + def _make_mask_from_list(self, args: Iterable[int]) -> bool | np.ndarray: + positive = [arg for arg in args if arg >= 0] + negative = [-arg - 1 for arg in args if arg < 0] + + mask: bool | np.ndarray = False + + if positive: + mask |= np.isin(self._ascending_count, positive) + + if negative: + mask |= np.isin(self._descending_count, negative) + + return mask + + def _make_mask_from_tuple(self, args: tuple) -> bool | np.ndarray: + mask: bool | np.ndarray = False + + for arg in args: + if is_integer(arg): + mask |= self._make_mask_from_int(cast(int, arg)) + elif isinstance(arg, slice): + mask |= self._make_mask_from_slice(arg) + else: + raise ValueError( + f"Invalid argument {type(arg)}. Should be int or slice." + ) + + return mask + + def _make_mask_from_slice(self, arg: slice) -> bool | np.ndarray: + start = arg.start + stop = arg.stop + step = arg.step + + if step is not None and step < 0: + raise ValueError(f"Invalid step {step}. Must be non-negative") + + mask: bool | np.ndarray = True + + if step is None: + step = 1 + + if start is None: + if step > 1: + mask &= self._ascending_count % step == 0 + + elif start >= 0: + mask &= self._ascending_count >= start + + if step > 1: + mask &= (self._ascending_count - start) % step == 0 + + else: + mask &= self._descending_count < -start + + offset_array = self._descending_count + start + 1 + limit_array = ( + self._ascending_count + self._descending_count + (start + 1) + ) < 0 + offset_array = np.where(limit_array, self._ascending_count, offset_array) + + mask &= offset_array % step == 0 + + if stop is not None: + if stop >= 0: + mask &= self._ascending_count < stop + else: + mask &= self._descending_count >= -stop + + return mask + + @cache_readonly + def _ascending_count(self) -> np.ndarray: + if TYPE_CHECKING: + groupby_self = cast(groupby.GroupBy, self) + else: + groupby_self = self + + return groupby_self._cumcount_array() + + @cache_readonly + def _descending_count(self) -> np.ndarray: + if TYPE_CHECKING: + groupby_self = cast(groupby.GroupBy, self) + else: + groupby_self = self + + return groupby_self._cumcount_array(ascending=False) + + +@doc(GroupByIndexingMixin._positional_selector) +class GroupByPositionalSelector: + def __init__(self, groupby_object: groupby.GroupBy) -> None: + self.groupby_object = groupby_object + + def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series: + """ + Select by positional index per group. + + Implements GroupBy._positional_selector + + Parameters + ---------- + arg : PositionalIndexer | tuple + Allowed values are: + - int + - int valued iterable such as list or range + - slice with step either None or positive + - tuple of integers and slices + + Returns + ------- + Series + The filtered subset of the original groupby Series. + DataFrame + The filtered subset of the original groupby DataFrame. + + See Also + -------- + DataFrame.iloc : Integer-location based indexing for selection by position. + GroupBy.head : Return first n rows of each group. + GroupBy.tail : Return last n rows of each group. + GroupBy._positional_selector : Return positional selection for each group. 
+ GroupBy.nth : Take the nth row from each group if n is an int, or a + subset of rows, if n is a list of ints. + """ + mask = self.groupby_object._make_mask_from_positional_indexer(arg) + return self.groupby_object._mask_selected_obj(mask) + + +class GroupByNthSelector: + """ + Dynamically substituted for GroupBy.nth to enable both call and index + """ + + def __init__(self, groupby_object: groupby.GroupBy) -> None: + self.groupby_object = groupby_object + + def __call__( + self, + n: PositionalIndexer | tuple, + dropna: Literal["any", "all", None] = None, + ) -> DataFrame | Series: + return self.groupby_object._nth(n, dropna) + + def __getitem__(self, n: PositionalIndexer | tuple) -> DataFrame | Series: + return self.groupby_object._nth(n) diff --git a/pandas/_core/groupby/numba_.py b/pandas/_core/groupby/numba_.py new file mode 100644 index 0000000000000..3b7a58e87603e --- /dev/null +++ b/pandas/_core/groupby/numba_.py @@ -0,0 +1,181 @@ +"""Common utilities for Numba operations with groupby ops""" +from __future__ import annotations + +import functools +import inspect +from typing import ( + TYPE_CHECKING, + Any, + Callable, +) + +import numpy as np + +from pandas.compat._optional import import_optional_dependency + +from pandas.core.util.numba_ import ( + NumbaUtilError, + jit_user_function, +) + +if TYPE_CHECKING: + from pandas._typing import Scalar + + +def validate_udf(func: Callable) -> None: + """ + Validate user defined function for ops when using Numba with groupby ops. + + The first signature arguments should include: + + def f(values, index, ...): + ... + + Parameters + ---------- + func : function, default False + user defined function + + Returns + ------- + None + + Raises + ------ + NumbaUtilError + """ + if not callable(func): + raise NotImplementedError( + "Numba engine can only be used with a single function." + ) + udf_signature = list(inspect.signature(func).parameters.keys()) + expected_args = ["values", "index"] + min_number_args = len(expected_args) + if ( + len(udf_signature) < min_number_args + or udf_signature[:min_number_args] != expected_args + ): + raise NumbaUtilError( + f"The first {min_number_args} arguments to {func.__name__} must be " + f"{expected_args}" + ) + + +@functools.cache +def generate_numba_agg_func( + func: Callable[..., Scalar], + nopython: bool, + nogil: bool, + parallel: bool, +) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, Any], np.ndarray]: + """ + Generate a numba jitted agg function specified by values from engine_kwargs. + + 1. jit the user's function + 2. Return a groupby agg function with the jitted function inline + + Configurations specified in engine_kwargs apply to both the user's + function _AND_ the groupby evaluation loop. 
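+ The generated ``group_agg`` iterates over the group boundaries given by
+ ``begin``/``end`` and calls the jitted user function once per group and
+ output column.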
+ + Parameters + ---------- + func : function + function to be applied to each group and will be JITed + nopython : bool + nopython to be passed into numba.jit + nogil : bool + nogil to be passed into numba.jit + parallel : bool + parallel to be passed into numba.jit + + Returns + ------- + Numba function + """ + numba_func = jit_user_function(func) + if TYPE_CHECKING: + import numba + else: + numba = import_optional_dependency("numba") + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def group_agg( + values: np.ndarray, + index: np.ndarray, + begin: np.ndarray, + end: np.ndarray, + num_columns: int, + *args: Any, + ) -> np.ndarray: + assert len(begin) == len(end) + num_groups = len(begin) + + result = np.empty((num_groups, num_columns)) + for i in numba.prange(num_groups): + group_index = index[begin[i] : end[i]] + for j in numba.prange(num_columns): + group = values[begin[i] : end[i], j] + result[i, j] = numba_func(group, group_index, *args) + return result + + return group_agg + + +@functools.cache +def generate_numba_transform_func( + func: Callable[..., np.ndarray], + nopython: bool, + nogil: bool, + parallel: bool, +) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, Any], np.ndarray]: + """ + Generate a numba jitted transform function specified by values from engine_kwargs. + + 1. jit the user's function + 2. Return a groupby transform function with the jitted function inline + + Configurations specified in engine_kwargs apply to both the user's + function _AND_ the groupby evaluation loop. + + Parameters + ---------- + func : function + function to be applied to each window and will be JITed + nopython : bool + nopython to be passed into numba.jit + nogil : bool + nogil to be passed into numba.jit + parallel : bool + parallel to be passed into numba.jit + + Returns + ------- + Numba function + """ + numba_func = jit_user_function(func) + if TYPE_CHECKING: + import numba + else: + numba = import_optional_dependency("numba") + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def group_transform( + values: np.ndarray, + index: np.ndarray, + begin: np.ndarray, + end: np.ndarray, + num_columns: int, + *args: Any, + ) -> np.ndarray: + assert len(begin) == len(end) + num_groups = len(begin) + + result = np.empty((len(values), num_columns)) + for i in numba.prange(num_groups): + group_index = index[begin[i] : end[i]] + for j in numba.prange(num_columns): + group = values[begin[i] : end[i], j] + result[begin[i] : end[i], j] = numba_func(group, group_index, *args) + return result + + return group_transform diff --git a/pandas/_core/groupby/ops.py b/pandas/_core/groupby/ops.py new file mode 100644 index 0000000000000..fc22c01070f8b --- /dev/null +++ b/pandas/_core/groupby/ops.py @@ -0,0 +1,1215 @@ +""" +Provide classes to perform the groupby aggregate operations. + +These are not exposed to the user and provide implementations of the grouping +operations, primarily in cython. These classes (BaseGrouper and BinGrouper) +are contained *in* the SeriesGroupBy and DataFrameGroupBy objects. 
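+BaseGrouper handles grouping by keys/levels, while BinGrouper handles
+grouping by bin edges (e.g. the groupers created for resampling).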
+""" +from __future__ import annotations + +import collections +import functools +from typing import ( + TYPE_CHECKING, + Callable, + Generic, + final, +) + +import numpy as np + +from pandas._libs import ( + NaT, + lib, +) +import pandas._libs.groupby as libgroupby +from pandas._typing import ( + ArrayLike, + AxisInt, + NDFrameT, + Shape, + npt, +) +from pandas.errors import AbstractMethodError +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.cast import ( + maybe_cast_pointwise_result, + maybe_downcast_to_dtype, +) +from pandas.core.dtypes.common import ( + ensure_float64, + ensure_int64, + ensure_platform_int, + ensure_uint64, + is_1d_only_ea_dtype, +) +from pandas.core.dtypes.missing import ( + isna, + maybe_fill, +) + +from pandas._core.groupby import grouper +from pandas.core.frame import DataFrame +from pandas.core.indexes.api import ( + CategoricalIndex, + Index, + MultiIndex, + ensure_index, +) +from pandas.core.series import Series +from pandas.core.sorting import ( + compress_group_index, + decons_obs_group_ids, + get_flattened_list, + get_group_index, + get_group_index_sorter, + get_indexer_dict, +) + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Iterator, + Sequence, + ) + + from pandas.core.generic import NDFrame + + +def check_result_array(obj, dtype) -> None: + # Our operation is supposed to be an aggregation/reduction. If + # it returns an ndarray, this likely means an invalid operation has + # been passed. See test_apply_without_aggregation, test_agg_must_agg + if isinstance(obj, np.ndarray): + if dtype != object: + # If it is object dtype, the function can be a reduction/aggregation + # and still return an ndarray e.g. test_agg_over_numpy_arrays + raise ValueError("Must produce aggregated value") + + +def extract_result(res): + """ + Extract the result object, it might be a 0-dim ndarray + or a len-1 0-dim, or a scalar + """ + if hasattr(res, "_values"): + # Preserve EA + res = res._values + if res.ndim == 1 and len(res) == 1: + # see test_agg_lambda_with_timezone, test_resampler_grouper.py::test_apply + res = res[0] + return res + + +class WrappedCythonOp: + """ + Dispatch logic for functions defined in _libs.groupby + + Parameters + ---------- + kind: str + Whether the operation is an aggregate or transform. + how: str + Operation name, e.g. "mean". + has_dropped_na: bool + True precisely when dropna=True and the grouper contains a null value. + """ + + # Functions for which we do _not_ attempt to cast the cython result + # back to the original dtype. 
+ cast_blocklist = frozenset( + ["any", "all", "rank", "count", "size", "idxmin", "idxmax"] + ) + + def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: + self.kind = kind + self.how = how + self.has_dropped_na = has_dropped_na + + _CYTHON_FUNCTIONS: dict[str, dict] = { + "aggregate": { + "any": functools.partial(libgroupby.group_any_all, val_test="any"), + "all": functools.partial(libgroupby.group_any_all, val_test="all"), + "sum": "group_sum", + "prod": "group_prod", + "idxmin": functools.partial(libgroupby.group_idxmin_idxmax, name="idxmin"), + "idxmax": functools.partial(libgroupby.group_idxmin_idxmax, name="idxmax"), + "min": "group_min", + "max": "group_max", + "mean": "group_mean", + "median": "group_median_float64", + "var": "group_var", + "std": functools.partial(libgroupby.group_var, name="std"), + "sem": functools.partial(libgroupby.group_var, name="sem"), + "skew": "group_skew", + "first": "group_nth", + "last": "group_last", + "ohlc": "group_ohlc", + }, + "transform": { + "cumprod": "group_cumprod", + "cumsum": "group_cumsum", + "cummin": "group_cummin", + "cummax": "group_cummax", + "rank": "group_rank", + }, + } + + _cython_arity = {"ohlc": 4} # OHLC + + @classmethod + def get_kind_from_how(cls, how: str) -> str: + if how in cls._CYTHON_FUNCTIONS["aggregate"]: + return "aggregate" + return "transform" + + # Note: we make this a classmethod and pass kind+how so that caching + # works at the class level and not the instance level + @classmethod + @functools.cache + def _get_cython_function( + cls, kind: str, how: str, dtype: np.dtype, is_numeric: bool + ): + dtype_str = dtype.name + ftype = cls._CYTHON_FUNCTIONS[kind][how] + + # see if there is a fused-type version of function + # only valid for numeric + if callable(ftype): + f = ftype + else: + f = getattr(libgroupby, ftype) + if is_numeric: + return f + elif dtype == np.dtype(object): + if how in ["median", "cumprod"]: + # no fused types -> no __signatures__ + raise NotImplementedError( + f"function is not implemented for this dtype: " + f"[how->{how},dtype->{dtype_str}]" + ) + elif how in ["std", "sem", "idxmin", "idxmax"]: + # We have a partial object that does not have __signatures__ + return f + elif how == "skew": + # _get_cython_vals will convert to float64 + pass + elif "object" not in f.__signatures__: + # raise NotImplementedError here rather than TypeError later + raise NotImplementedError( + f"function is not implemented for this dtype: " + f"[how->{how},dtype->{dtype_str}]" + ) + return f + else: + raise NotImplementedError( + "This should not be reached. Please report a bug at " + "github.com/pandas-dev/pandas/", + dtype, + ) + + def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: + """ + Cast numeric dtypes to float64 for functions that only support that. 
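+ ``median``, ``std``, ``sem`` and ``skew`` only have float64 kernels, and
+ integer ``mean``/``var`` results can contain NaN, so such inputs are
+ upcast to float64 here.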
+ + Parameters + ---------- + values : np.ndarray + + Returns + ------- + values : np.ndarray + """ + how = self.how + + if how in ["median", "std", "sem", "skew"]: + # median only has a float64 implementation + # We should only get here with is_numeric, as non-numeric cases + # should raise in _get_cython_function + values = ensure_float64(values) + + elif values.dtype.kind in "iu": + if how in ["var", "mean"] or ( + self.kind == "transform" and self.has_dropped_na + ): + # has_dropped_na check need for test_null_group_str_transformer + # result may still include NaN, so we have to cast + values = ensure_float64(values) + + elif how in ["sum", "ohlc", "prod", "cumsum", "cumprod"]: + # Avoid overflow during group op + if values.dtype.kind == "i": + values = ensure_int64(values) + else: + values = ensure_uint64(values) + + return values + + def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape: + how = self.how + kind = self.kind + + arity = self._cython_arity.get(how, 1) + + out_shape: Shape + if how == "ohlc": + out_shape = (ngroups, arity) + elif arity > 1: + raise NotImplementedError( + "arity of more than 1 is not supported for the 'how' argument" + ) + elif kind == "transform": + out_shape = values.shape + else: + out_shape = (ngroups,) + values.shape[1:] + return out_shape + + def _get_out_dtype(self, dtype: np.dtype) -> np.dtype: + how = self.how + + if how == "rank": + out_dtype = "float64" + elif how in ["idxmin", "idxmax"]: + # The Cython implementation only produces the row number; we'll take + # from the index using this in post processing + out_dtype = "intp" + else: + if dtype.kind in "iufcb": + out_dtype = f"{dtype.kind}{dtype.itemsize}" + else: + out_dtype = "object" + return np.dtype(out_dtype) + + def _get_result_dtype(self, dtype: np.dtype) -> np.dtype: + """ + Get the desired dtype of a result based on the + input dtype and how it was computed. + + Parameters + ---------- + dtype : np.dtype + + Returns + ------- + np.dtype + The desired dtype of the result. 
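+ For example, a boolean ``sum`` is returned as int64, while an
+ integer ``mean`` is returned as float64.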
+ """ + how = self.how + + if how in ["sum", "cumsum", "sum", "prod", "cumprod"]: + if dtype == np.dtype(bool): + return np.dtype(np.int64) + elif how in ["mean", "median", "var", "std", "sem"]: + if dtype.kind in "fc": + return dtype + elif dtype.kind in "iub": + return np.dtype(np.float64) + return dtype + + @final + def _cython_op_ndim_compat( + self, + values: np.ndarray, + *, + min_count: int, + ngroups: int, + comp_ids: np.ndarray, + mask: npt.NDArray[np.bool_] | None = None, + result_mask: npt.NDArray[np.bool_] | None = None, + **kwargs, + ) -> np.ndarray: + if values.ndim == 1: + # expand to 2d, dispatch, then squeeze if appropriate + values2d = values[None, :] + if mask is not None: + mask = mask[None, :] + if result_mask is not None: + result_mask = result_mask[None, :] + res = self._call_cython_op( + values2d, + min_count=min_count, + ngroups=ngroups, + comp_ids=comp_ids, + mask=mask, + result_mask=result_mask, + **kwargs, + ) + if res.shape[0] == 1: + return res[0] + + # otherwise we have OHLC + return res.T + + return self._call_cython_op( + values, + min_count=min_count, + ngroups=ngroups, + comp_ids=comp_ids, + mask=mask, + result_mask=result_mask, + **kwargs, + ) + + @final + def _call_cython_op( + self, + values: np.ndarray, # np.ndarray[ndim=2] + *, + min_count: int, + ngroups: int, + comp_ids: np.ndarray, + mask: npt.NDArray[np.bool_] | None, + result_mask: npt.NDArray[np.bool_] | None, + **kwargs, + ) -> np.ndarray: # np.ndarray[ndim=2] + orig_values = values + + dtype = values.dtype + is_numeric = dtype.kind in "iufcb" + + is_datetimelike = dtype.kind in "mM" + + if is_datetimelike: + values = values.view("int64") + is_numeric = True + elif dtype.kind == "b": + values = values.view("uint8") + if values.dtype == "float16": + values = values.astype(np.float32) + + if self.how in ["any", "all"]: + if mask is None: + mask = isna(values) + if dtype == object: + if kwargs["skipna"]: + # GH#37501: don't raise on pd.NA when skipna=True + if mask.any(): + # mask on original values computed separately + values = values.copy() + values[mask] = True + values = values.astype(bool, copy=False).view(np.int8) + is_numeric = True + + values = values.T + if mask is not None: + mask = mask.T + if result_mask is not None: + result_mask = result_mask.T + + out_shape = self._get_output_shape(ngroups, values) + func = self._get_cython_function(self.kind, self.how, values.dtype, is_numeric) + values = self._get_cython_vals(values) + out_dtype = self._get_out_dtype(values.dtype) + + result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) + if self.kind == "aggregate": + counts = np.zeros(ngroups, dtype=np.int64) + if self.how in [ + "idxmin", + "idxmax", + "min", + "max", + "mean", + "last", + "first", + "sum", + ]: + func( + out=result, + counts=counts, + values=values, + labels=comp_ids, + min_count=min_count, + mask=mask, + result_mask=result_mask, + is_datetimelike=is_datetimelike, + ) + elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]: + if self.how in ["std", "sem"]: + kwargs["is_datetimelike"] = is_datetimelike + func( + result, + counts, + values, + comp_ids, + min_count=min_count, + mask=mask, + result_mask=result_mask, + **kwargs, + ) + elif self.how in ["any", "all"]: + func( + out=result, + values=values, + labels=comp_ids, + mask=mask, + result_mask=result_mask, + **kwargs, + ) + result = result.astype(bool, copy=False) + elif self.how in ["skew"]: + func( + out=result, + counts=counts, + values=values, + labels=comp_ids, + mask=mask, + result_mask=result_mask, + 
**kwargs, + ) + if dtype == object: + result = result.astype(object) + + else: + raise NotImplementedError(f"{self.how} is not implemented") + else: + # TODO: min_count + if self.how != "rank": + # TODO: should rank take result_mask? + kwargs["result_mask"] = result_mask + func( + out=result, + values=values, + labels=comp_ids, + ngroups=ngroups, + is_datetimelike=is_datetimelike, + mask=mask, + **kwargs, + ) + + if self.kind == "aggregate" and self.how not in ["idxmin", "idxmax"]: + # i.e. counts is defined. Locations where count None: + if values.ndim > 2: + raise NotImplementedError("number of dimensions is currently limited to 2") + if values.ndim == 2: + assert axis == 1, axis + elif not is_1d_only_ea_dtype(values.dtype): + # Note: it is *not* the case that axis is always 0 for 1-dim values, + # as we can have 1D ExtensionArrays that we need to treat as 2D + assert axis == 0 + + @final + def cython_operation( + self, + *, + values: ArrayLike, + axis: AxisInt, + min_count: int = -1, + comp_ids: np.ndarray, + ngroups: int, + **kwargs, + ) -> ArrayLike: + """ + Call our cython function, with appropriate pre- and post- processing. + """ + self._validate_axis(axis, values) + + if not isinstance(values, np.ndarray): + # i.e. ExtensionArray + return values._groupby_op( + how=self.how, + has_dropped_na=self.has_dropped_na, + min_count=min_count, + ngroups=ngroups, + ids=comp_ids, + **kwargs, + ) + + return self._cython_op_ndim_compat( + values, + min_count=min_count, + ngroups=ngroups, + comp_ids=comp_ids, + mask=None, + **kwargs, + ) + + +class BaseGrouper: + """ + This is an internal Grouper class, which actually holds + the generated groups + + Parameters + ---------- + axis : Index + groupings : Sequence[Grouping] + all the grouping instances to handle in this grouper + for example for grouper list to groupby, need to pass the list + sort : bool, default True + whether this grouper will give sorted result or not + + """ + + axis: Index + + def __init__( + self, + axis: Index, + groupings: Sequence[grouper.Grouping], + sort: bool = True, + dropna: bool = True, + ) -> None: + assert isinstance(axis, Index), axis + + self.axis = axis + self._groupings: list[grouper.Grouping] = list(groupings) + self._sort = sort + self.dropna = dropna + + @property + def groupings(self) -> list[grouper.Grouping]: + return self._groupings + + @property + def shape(self) -> Shape: + return tuple(ping.ngroups for ping in self.groupings) + + def __iter__(self) -> Iterator[Hashable]: + return iter(self.indices) + + @property + def nkeys(self) -> int: + return len(self.groupings) + + def get_iterator( + self, data: NDFrameT, axis: AxisInt = 0 + ) -> Iterator[tuple[Hashable, NDFrameT]]: + """ + Groupby iterator + + Returns + ------- + Generator yielding sequence of (name, subsetted object) + for each group + """ + splitter = self._get_splitter(data, axis=axis) + keys = self.group_keys_seq + yield from zip(keys, splitter) + + @final + def _get_splitter(self, data: NDFrame, axis: AxisInt = 0) -> DataSplitter: + """ + Returns + ------- + Generator yielding subsetted objects + """ + ids, _, ngroups = self.group_info + return _get_splitter( + data, + ids, + ngroups, + sorted_ids=self._sorted_ids, + sort_idx=self._sort_idx, + axis=axis, + ) + + @final + @cache_readonly + def group_keys_seq(self): + if len(self.groupings) == 1: + return self.levels[0] + else: + ids, _, ngroups = self.group_info + + # provide "flattened" iterator for multi-group setting + return get_flattened_list(ids, ngroups, self.levels, self.codes) + + 
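+ # `indices` below maps each group key to the positional locations of its rows.
+ # Illustrative example (hypothetical data): for a column ``a = ["x", "y", "x"]``,
+ # ``df.groupby("a").indices`` resolves here to ``{"x": array([0, 2]), "y": array([1])}``.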
@cache_readonly + def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: + """dict {group name -> group indices}""" + if len(self.groupings) == 1 and isinstance(self.result_index, CategoricalIndex): + # This shows unused categories in indices GH#38642 + return self.groupings[0].indices + codes_list = [ping.codes for ping in self.groupings] + keys = [ping.group_index for ping in self.groupings] + return get_indexer_dict(codes_list, keys) + + @final + def result_ilocs(self) -> npt.NDArray[np.intp]: + """ + Get the original integer locations of result_index in the input. + """ + # Original indices are where group_index would go via sorting. + # But when dropna is true, we need to remove null values while accounting for + # any gaps that then occur because of them. + group_index = get_group_index( + self.codes, self.shape, sort=self._sort, xnull=True + ) + group_index, _ = compress_group_index(group_index, sort=self._sort) + + if self.has_dropped_na: + mask = np.where(group_index >= 0) + # Count how many gaps are caused by previous null values for each position + null_gaps = np.cumsum(group_index == -1)[mask] + group_index = group_index[mask] + + result = get_group_index_sorter(group_index, self.ngroups) + + if self.has_dropped_na: + # Shift by the number of prior null gaps + result += np.take(null_gaps, result) + + return result + + @final + @property + def codes(self) -> list[npt.NDArray[np.signedinteger]]: + return [ping.codes for ping in self.groupings] + + @property + def levels(self) -> list[Index]: + return [ping.group_index for ping in self.groupings] + + @property + def names(self) -> list[Hashable]: + return [ping.name for ping in self.groupings] + + @final + def size(self) -> Series: + """ + Compute group sizes. + """ + ids, _, ngroups = self.group_info + out: np.ndarray | list + if ngroups: + out = np.bincount(ids[ids != -1], minlength=ngroups) + else: + out = [] + return Series(out, index=self.result_index, dtype="int64") + + @cache_readonly + def groups(self) -> dict[Hashable, np.ndarray]: + """dict {group name -> group labels}""" + if len(self.groupings) == 1: + return self.groupings[0].groups + else: + to_groupby = [] + for ping in self.groupings: + gv = ping.grouping_vector + if not isinstance(gv, BaseGrouper): + to_groupby.append(gv) + else: + to_groupby.append(gv.groupings[0].grouping_vector) + index = MultiIndex.from_arrays(to_groupby) + return self.axis.groupby(index) + + @final + @cache_readonly + def is_monotonic(self) -> bool: + # return if my group orderings are monotonic + return Index(self.group_info[0]).is_monotonic_increasing + + @final + @cache_readonly + def has_dropped_na(self) -> bool: + """ + Whether grouper has null value(s) that are dropped. 
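+ A dropped null group shows up as ``-1`` in the group codes, which is
+ what is checked here.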
+ """ + return bool((self.group_info[0] < 0).any()) + + @cache_readonly + def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: + comp_ids, obs_group_ids = self._get_compressed_codes() + + ngroups = len(obs_group_ids) + comp_ids = ensure_platform_int(comp_ids) + + return comp_ids, obs_group_ids, ngroups + + @cache_readonly + def codes_info(self) -> npt.NDArray[np.intp]: + # return the codes of items in original grouped axis + ids, _, _ = self.group_info + return ids + + @final + def _get_compressed_codes( + self, + ) -> tuple[npt.NDArray[np.signedinteger], npt.NDArray[np.intp]]: + # The first returned ndarray may have any signed integer dtype + if len(self.groupings) > 1: + group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) + return compress_group_index(group_index, sort=self._sort) + # FIXME: compress_group_index's second return value is int64, not intp + + ping = self.groupings[0] + return ping.codes, np.arange(len(ping.group_index), dtype=np.intp) + + @final + @cache_readonly + def ngroups(self) -> int: + return len(self.result_index) + + @property + def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]: + codes = self.codes + ids, obs_ids, _ = self.group_info + return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True) + + @cache_readonly + def result_index(self) -> Index: + if len(self.groupings) == 1: + return self.groupings[0].result_index.rename(self.names[0]) + + codes = self.reconstructed_codes + levels = [ping.result_index for ping in self.groupings] + return MultiIndex( + levels=levels, codes=codes, verify_integrity=False, names=self.names + ) + + @final + def get_group_levels(self) -> list[ArrayLike]: + # Note: only called from _insert_inaxis_grouper, which + # is only called for BaseGrouper, never for BinGrouper + if len(self.groupings) == 1: + return [self.groupings[0].group_arraylike] + + name_list = [] + for ping, codes in zip(self.groupings, self.reconstructed_codes): + codes = ensure_platform_int(codes) + levels = ping.group_arraylike.take(codes) + + name_list.append(levels) + + return name_list + + # ------------------------------------------------------------ + # Aggregation functions + + @final + def _cython_operation( + self, + kind: str, + values, + how: str, + axis: AxisInt, + min_count: int = -1, + **kwargs, + ) -> ArrayLike: + """ + Returns the values of a cython operation. + """ + assert kind in ["transform", "aggregate"] + + cy_op = WrappedCythonOp(kind=kind, how=how, has_dropped_na=self.has_dropped_na) + + ids, _, _ = self.group_info + ngroups = self.ngroups + return cy_op.cython_operation( + values=values, + axis=axis, + min_count=min_count, + comp_ids=ids, + ngroups=ngroups, + **kwargs, + ) + + @final + def agg_series( + self, obj: Series, func: Callable, preserve_dtype: bool = False + ) -> ArrayLike: + """ + Parameters + ---------- + obj : Series + func : function taking a Series and returning a scalar-like + preserve_dtype : bool + Whether the aggregation is known to be dtype-preserving. + + Returns + ------- + np.ndarray or ExtensionArray + """ + + if not isinstance(obj._values, np.ndarray): + # we can preserve a little bit more aggressively with EA dtype + # because maybe_cast_pointwise_result will do a try/except + # with _from_sequence. NB we are assuming here that _from_sequence + # is sufficiently strict that it casts appropriately. 
+ preserve_dtype = True + + result = self._aggregate_series_pure_python(obj, func) + + if len(obj) == 0 and len(result) == 0 and isinstance(obj.dtype, ExtensionDtype): + cls = obj.dtype.construct_array_type() + out = cls._from_sequence(result) + + else: + npvalues = lib.maybe_convert_objects(result, try_float=False) + if preserve_dtype: + out = maybe_cast_pointwise_result( + npvalues, obj.dtype, numeric_only=True + ) + else: + out = npvalues + return out + + @final + def _aggregate_series_pure_python( + self, obj: Series, func: Callable + ) -> npt.NDArray[np.object_]: + _, _, ngroups = self.group_info + + result = np.empty(ngroups, dtype="O") + initialized = False + + splitter = self._get_splitter(obj, axis=0) + + for i, group in enumerate(splitter): + res = func(group) + res = extract_result(res) + + if not initialized: + # We only do this validation on the first iteration + check_result_array(res, group.dtype) + initialized = True + + result[i] = res + + return result + + @final + def apply_groupwise( + self, f: Callable, data: DataFrame | Series, axis: AxisInt = 0 + ) -> tuple[list, bool]: + mutated = False + splitter = self._get_splitter(data, axis=axis) + group_keys = self.group_keys_seq + result_values = [] + + # This calls DataSplitter.__iter__ + zipped = zip(group_keys, splitter) + + for key, group in zipped: + # Pinning name is needed for + # test_group_apply_once_per_group, + # test_inconsistent_return_type, test_set_group_name, + # test_group_name_available_in_inference_pass, + # test_groupby_multi_timezone + object.__setattr__(group, "name", key) + + # group might be modified + group_axes = group.axes + res = f(group) + if not mutated and not _is_indexed_like(res, group_axes, axis): + mutated = True + result_values.append(res) + # getattr pattern for __name__ is needed for functools.partial objects + if len(group_keys) == 0 and getattr(f, "__name__", None) in [ + "skew", + "sum", + "prod", + ]: + # If group_keys is empty, then no function calls have been made, + # so we will not have raised even if this is an invalid dtype. + # So do one dummy call here to raise appropriate TypeError. 
+ f(data.iloc[:0]) + + return result_values, mutated + + # ------------------------------------------------------------ + # Methods for sorting subsets of our GroupBy's object + + @final + @cache_readonly + def _sort_idx(self) -> npt.NDArray[np.intp]: + # Counting sort indexer + ids, _, ngroups = self.group_info + return get_group_index_sorter(ids, ngroups) + + @final + @cache_readonly + def _sorted_ids(self) -> npt.NDArray[np.intp]: + ids, _, _ = self.group_info + return ids.take(self._sort_idx) + + +class BinGrouper(BaseGrouper): + """ + This is an internal Grouper class + + Parameters + ---------- + bins : the split index of binlabels to group the item of axis + binlabels : the label list + indexer : np.ndarray[np.intp], optional + the indexer created by Grouper + some groupers (TimeGrouper) will sort its axis and its + group_info is also sorted, so need the indexer to reorder + + Examples + -------- + bins: [2, 4, 6, 8, 10] + binlabels: DatetimeIndex(['2005-01-01', '2005-01-03', + '2005-01-05', '2005-01-07', '2005-01-09'], + dtype='datetime64[ns]', freq='2D') + + the group_info, which contains the label of each item in grouped + axis, the index of label in label list, group number, is + + (array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), array([0, 1, 2, 3, 4]), 5) + + means that, the grouped axis has 10 items, can be grouped into 5 + labels, the first and second items belong to the first label, the + third and forth items belong to the second label, and so on + + """ + + bins: npt.NDArray[np.int64] + binlabels: Index + + def __init__( + self, + bins, + binlabels, + indexer=None, + ) -> None: + self.bins = ensure_int64(bins) + self.binlabels = ensure_index(binlabels) + self.indexer = indexer + + # These lengths must match, otherwise we could call agg_series + # with empty self.bins, which would raise later. 
+ assert len(self.binlabels) == len(self.bins) + + @cache_readonly + def groups(self): + """dict {group name -> group labels}""" + # this is mainly for compat + # GH 3881 + result = { + key: value + for key, value in zip(self.binlabels, self.bins) + if key is not NaT + } + return result + + @property + def nkeys(self) -> int: + # still matches len(self.groupings), but we can hard-code + return 1 + + @cache_readonly + def codes_info(self) -> npt.NDArray[np.intp]: + # return the codes of items in original grouped axis + ids, _, _ = self.group_info + if self.indexer is not None: + sorter = np.lexsort((ids, self.indexer)) + ids = ids[sorter] + return ids + + def get_iterator(self, data: NDFrame, axis: AxisInt = 0): + """ + Groupby iterator + + Returns + ------- + Generator yielding sequence of (name, subsetted object) + for each group + """ + if axis == 0: + slicer = lambda start, edge: data.iloc[start:edge] + else: + slicer = lambda start, edge: data.iloc[:, start:edge] + + length = len(data.axes[axis]) + + start = 0 + for edge, label in zip(self.bins, self.binlabels): + if label is not NaT: + yield label, slicer(start, edge) + start = edge + + if start < length: + yield self.binlabels[-1], slicer(start, None) + + @cache_readonly + def indices(self): + indices = collections.defaultdict(list) + + i = 0 + for label, bin in zip(self.binlabels, self.bins): + if i < bin: + if label is not NaT: + indices[label] = list(range(i, bin)) + i = bin + return indices + + @cache_readonly + def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: + ngroups = self.ngroups + obs_group_ids = np.arange(ngroups, dtype=np.intp) + rep = np.diff(np.r_[0, self.bins]) + + rep = ensure_platform_int(rep) + if ngroups == len(self.bins): + comp_ids = np.repeat(np.arange(ngroups), rep) + else: + comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep) + + return ( + ensure_platform_int(comp_ids), + obs_group_ids, + ngroups, + ) + + @cache_readonly + def reconstructed_codes(self) -> list[np.ndarray]: + # get unique result indices, and prepend 0 as groupby starts from the first + return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]] + + @cache_readonly + def result_index(self) -> Index: + if len(self.binlabels) != 0 and isna(self.binlabels[0]): + return self.binlabels[1:] + + return self.binlabels + + @property + def levels(self) -> list[Index]: + return [self.binlabels] + + @property + def names(self) -> list[Hashable]: + return [self.binlabels.name] + + @property + def groupings(self) -> list[grouper.Grouping]: + lev = self.binlabels + codes = self.group_info[0] + labels = lev.take(codes) + ping = grouper.Grouping( + labels, labels, in_axis=False, level=None, uniques=lev._values + ) + return [ping] + + +def _is_indexed_like(obj, axes, axis: AxisInt) -> bool: + if isinstance(obj, Series): + if len(axes) > 1: + return False + return obj.axes[axis].equals(axes[axis]) + elif isinstance(obj, DataFrame): + return obj.axes[axis].equals(axes[axis]) + + return False + + +# ---------------------------------------------------------------------- +# Splitting / application + + +class DataSplitter(Generic[NDFrameT]): + def __init__( + self, + data: NDFrameT, + labels: npt.NDArray[np.intp], + ngroups: int, + *, + sort_idx: npt.NDArray[np.intp], + sorted_ids: npt.NDArray[np.intp], + axis: AxisInt = 0, + ) -> None: + self.data = data + self.labels = ensure_platform_int(labels) # _should_ already be np.intp + self.ngroups = ngroups + + self._slabels = sorted_ids + self._sort_idx = sort_idx + + 
self.axis = axis + assert isinstance(axis, int), axis + + def __iter__(self) -> Iterator: + sdata = self._sorted_data + + if self.ngroups == 0: + # we are inside a generator, rather than raise StopIteration + # we merely return signal the end + return + + starts, ends = lib.generate_slices(self._slabels, self.ngroups) + + for start, end in zip(starts, ends): + yield self._chop(sdata, slice(start, end)) + + @cache_readonly + def _sorted_data(self) -> NDFrameT: + return self.data.take(self._sort_idx, axis=self.axis) + + def _chop(self, sdata, slice_obj: slice) -> NDFrame: + raise AbstractMethodError(self) + + +class SeriesSplitter(DataSplitter): + def _chop(self, sdata: Series, slice_obj: slice) -> Series: + # fastpath equivalent to `sdata.iloc[slice_obj]` + mgr = sdata._mgr.get_slice(slice_obj) + ser = sdata._constructor_from_mgr(mgr, axes=mgr.axes) + ser._name = sdata.name + return ser.__finalize__(sdata, method="groupby") + + +class FrameSplitter(DataSplitter): + def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: + # Fastpath equivalent to: + # if self.axis == 0: + # return sdata.iloc[slice_obj] + # else: + # return sdata.iloc[:, slice_obj] + mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) + df = sdata._constructor_from_mgr(mgr, axes=mgr.axes) + return df.__finalize__(sdata, method="groupby") + + +def _get_splitter( + data: NDFrame, + labels: npt.NDArray[np.intp], + ngroups: int, + *, + sort_idx: npt.NDArray[np.intp], + sorted_ids: npt.NDArray[np.intp], + axis: AxisInt = 0, +) -> DataSplitter: + if isinstance(data, Series): + klass: type[DataSplitter] = SeriesSplitter + else: + # i.e. DataFrame + klass = FrameSplitter + + return klass( + data, labels, ngroups, sort_idx=sort_idx, sorted_ids=sorted_ids, axis=axis + ) diff --git a/pandas/_typing.py b/pandas/_typing.py index c2d51f63eb2ab..0682ad6be9fa6 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -47,6 +47,11 @@ from pandas.core.dtypes.dtypes import ExtensionDtype from pandas import Interval + from pandas._core.groupby.generic import ( + DataFrameGroupBy, + GroupBy, + SeriesGroupBy, + ) from pandas.arrays import ( DatetimeArray, TimedeltaArray, @@ -54,11 +59,6 @@ from pandas.core.arrays.base import ExtensionArray from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame - from pandas.core.groupby.generic import ( - DataFrameGroupBy, - GroupBy, - SeriesGroupBy, - ) from pandas.core.indexes.base import Index from pandas.core.internals import ( ArrayManager, diff --git a/pandas/api/typing/__init__.py b/pandas/api/typing/__init__.py index 9b5d2cb06b523..b951845f50a55 100644 --- a/pandas/api/typing/__init__.py +++ b/pandas/api/typing/__init__.py @@ -5,7 +5,7 @@ from pandas._libs import NaTType from pandas._libs.missing import NAType -from pandas.core.groupby import ( +from pandas._core.groupby import ( DataFrameGroupBy, SeriesGroupBy, ) diff --git a/pandas/core/api.py b/pandas/core/api.py index 2cfe5ffc0170d..9ad6b9400b7b6 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -20,6 +20,10 @@ notnull, ) +from pandas._core.groupby import ( + Grouper, + NamedAgg, +) from pandas.core.algorithms import ( factorize, unique, @@ -44,10 +48,6 @@ from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import array from pandas.core.flags import Flags -from pandas.core.groupby import ( - Grouper, - NamedAgg, -) from pandas.core.indexes.api import ( CategoricalIndex, DatetimeIndex, diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 
3b79882d3c762..4433662ef52b0 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -70,7 +70,7 @@ Index, Series, ) - from pandas.core.groupby import GroupBy + from pandas._core.groupby import GroupBy from pandas.core.resample import Resampler from pandas.core.window.rolling import BaseWindow @@ -450,7 +450,7 @@ def compute_dict_like( Data for result. When aggregating with a Series, this can contain any Python object. """ - from pandas.core.groupby.generic import ( + from pandas._core.groupby.generic import ( DataFrameGroupBy, SeriesGroupBy, ) @@ -567,7 +567,7 @@ def apply_str(self) -> DataFrame | Series: obj = self.obj - from pandas.core.groupby.generic import ( + from pandas._core.groupby.generic import ( DataFrameGroupBy, SeriesGroupBy, ) @@ -1558,7 +1558,7 @@ def agg_or_apply_list_like( def agg_or_apply_dict_like( self, op_name: Literal["agg", "apply"] ) -> DataFrame | Series: - from pandas.core.groupby.generic import ( + from pandas._core.groupby.generic import ( DataFrameGroupBy, SeriesGroupBy, ) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e61e374009163..f6fec56d9025c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2343,8 +2343,8 @@ def _groupby_op( ------- np.ndarray or ExtensionArray """ + from pandas._core.groupby.ops import WrappedCythonOp from pandas.core.arrays.string_ import StringDtype - from pandas.core.groupby.ops import WrappedCythonOp kind = WrappedCythonOp.get_kind_from_how(how) op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index eec833c600177..d1e4eceb7d667 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2693,7 +2693,7 @@ def _groupby_op( ids: npt.NDArray[np.intp], **kwargs, ): - from pandas.core.groupby.ops import WrappedCythonOp + from pandas._core.groupby.ops import WrappedCythonOp kind = WrappedCythonOp.get_kind_from_how(how) op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 33b2f65340a3b..f4943235b831e 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1671,7 +1671,7 @@ def _groupby_op( # operate on the tz-naive equivalents npvalues = self._ndarray.view("M8[ns]") - from pandas.core.groupby.ops import WrappedCythonOp + from pandas._core.groupby.ops import WrappedCythonOp kind = WrappedCythonOp.get_kind_from_how(how) op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index c8447397c7bfe..c11e78df87234 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1477,7 +1477,7 @@ def _groupby_op( ids: npt.NDArray[np.intp], **kwargs, ): - from pandas.core.groupby.ops import WrappedCythonOp + from pandas._core.groupby.ops import WrappedCythonOp kind = WrappedCythonOp.get_kind_from_how(how) op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c3696be0579b0..beaa83f6b9d6d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -259,7 +259,7 @@ npt, ) - from pandas.core.groupby.generic import DataFrameGroupBy + from pandas._core.groupby.generic import DataFrameGroupBy from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg from pandas.core.internals import SingleDataManager @@ 
-9028,7 +9028,7 @@ def groupby( else: axis = 0 - from pandas.core.groupby.generic import DataFrameGroupBy + from pandas._core.groupby.generic import DataFrameGroupBy if level is None and by is None: raise TypeError("You have to supply one of 'by' and 'level'") diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7918e43b48719..21239303db8ed 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9796,8 +9796,8 @@ def rank( See Also -------- - core.groupby.DataFrameGroupBy.rank : Rank of values within each group. - core.groupby.SeriesGroupBy.rank : Rank of values within each group. + _core.groupby.DataFrameGroupBy.rank : Rank of values within each group. + _core.groupby.SeriesGroupBy.rank : Rank of values within each group. Examples -------- diff --git a/pandas/core/groupby/__init__.py b/pandas/core/groupby/__init__.py index 8248f378e2c1a..875e20e4243b8 100644 --- a/pandas/core/groupby/__init__.py +++ b/pandas/core/groupby/__init__.py @@ -1,15 +1,12 @@ -from pandas.core.groupby.generic import ( - DataFrameGroupBy, - NamedAgg, - SeriesGroupBy, -) -from pandas.core.groupby.groupby import GroupBy -from pandas.core.groupby.grouper import Grouper - -__all__ = [ - "DataFrameGroupBy", - "NamedAgg", - "SeriesGroupBy", - "GroupBy", - "Grouper", -] +from __future__ import annotations + +from typing import Any + +from pandas._core import groupby as groupby_ +from pandas.core.common import _depr_core + + +def __getattr__(attr_name: str) -> Any: + attr = getattr(groupby_, attr_name) + _depr_core() + return attr diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index a443597347283..8def53ba34318 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -1,121 +1,11 @@ -""" -Provide basic components for groupby. -""" from __future__ import annotations -import dataclasses -from typing import TYPE_CHECKING +from pandas._core.groupby import base +from pandas.core.common import _depr_core -if TYPE_CHECKING: - from collections.abc import Hashable +_depr_core() +_globals = globals() -@dataclasses.dataclass(order=True, frozen=True) -class OutputKey: - label: Hashable - position: int - - -# special case to prevent duplicate plots when catching exceptions when -# forwarding methods from NDFrames -plotting_methods = frozenset(["plot", "hist"]) - -# cythonized transformations or canned "agg+broadcast", which do not -# require postprocessing of the result by transform. -cythonized_kernels = frozenset(["cumprod", "cumsum", "shift", "cummin", "cummax"]) - -# List of aggregation/reduction functions. -# These map each group to a single numeric value -reduction_kernels = frozenset( - [ - "all", - "any", - "corrwith", - "count", - "first", - "idxmax", - "idxmin", - "last", - "max", - "mean", - "median", - "min", - "nunique", - "prod", - # as long as `quantile`'s signature accepts only - # a single quantile value, it's a reduction. - # GH#27526 might change that. - "quantile", - "sem", - "size", - "skew", - "std", - "sum", - "var", - ] -) - -# List of transformation functions. -# a transformation is a function that, for each group, -# produces a result that has the same shape as the group. 
- - -transformation_kernels = frozenset( - [ - "bfill", - "cumcount", - "cummax", - "cummin", - "cumprod", - "cumsum", - "diff", - "ffill", - "fillna", - "ngroup", - "pct_change", - "rank", - "shift", - ] -) - -# these are all the public methods on Grouper which don't belong -# in either of the above lists -groupby_other_methods = frozenset( - [ - "agg", - "aggregate", - "apply", - "boxplot", - # corr and cov return ngroups*ncolumns rows, so they - # are neither a transformation nor a reduction - "corr", - "cov", - "describe", - "dtypes", - "expanding", - "ewm", - "filter", - "get_group", - "groups", - "head", - "hist", - "indices", - "ndim", - "ngroups", - "nth", - "ohlc", - "pipe", - "plot", - "resample", - "rolling", - "tail", - "take", - "transform", - "sample", - "value_counts", - ] -) -# Valid values of `name` for `groupby.transform(name)` -# NOTE: do NOT edit this directly. New additions should be inserted -# into the appropriate list above. -transform_kernel_allowlist = reduction_kernels | transformation_kernels +for item in base.__dir__(): + _globals[item] = getattr(base, item) diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 6ab98cf4fe55e..431da48b6ea0e 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -1,87 +1,11 @@ from __future__ import annotations -import numpy as np +from pandas._core.groupby import categorical +from pandas.core.common import _depr_core -from pandas.core.algorithms import unique1d -from pandas.core.arrays.categorical import ( - Categorical, - CategoricalDtype, - recode_for_categories, -) +_depr_core() +_globals = globals() -def recode_for_groupby( - c: Categorical, sort: bool, observed: bool -) -> tuple[Categorical, Categorical | None]: - """ - Code the categories to ensure we can groupby for categoricals. - - If observed=True, we return a new Categorical with the observed - categories only. - - If sort=False, return a copy of self, coded with categories as - returned by .unique(), followed by any categories not appearing in - the data. If sort=True, return self. - - This method is needed solely to ensure the categorical index of the - GroupBy result has categories in the order of appearance in the data - (GH-8868). - - Parameters - ---------- - c : Categorical - sort : bool - The value of the sort parameter groupby was called with. - observed : bool - Account only for the observed values - - Returns - ------- - Categorical - If sort=False, the new categories are set to the order of - appearance in codes (unless ordered=True, in which case the - original order is preserved), followed by any unrepresented - categories in the original order. 
- Categorical or None - If we are observed, return the original categorical, otherwise None - """ - # we only care about observed values - if observed: - # In cases with c.ordered, this is equivalent to - # return c.remove_unused_categories(), c - - unique_codes = unique1d(c.codes) - - take_codes = unique_codes[unique_codes != -1] - if sort: - take_codes = np.sort(take_codes) - - # we recode according to the uniques - categories = c.categories.take(take_codes) - codes = recode_for_categories(c.codes, c.categories, categories) - - # return a new categorical that maps our new codes - # and categories - dtype = CategoricalDtype(categories, ordered=c.ordered) - return Categorical._simple_new(codes, dtype=dtype), c - - # Already sorted according to c.categories; all is fine - if sort: - return c, None - - # sort=False should order groups in as-encountered order (GH-8868) - - # xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories - all_codes = np.arange(c.categories.nunique()) - # GH 38140: exclude nan from indexer for categories - unique_notnan_codes = unique1d(c.codes[c.codes != -1]) - if sort: - unique_notnan_codes = np.sort(unique_notnan_codes) - if len(all_codes) > len(unique_notnan_codes): - # GH 13179: All categories need to be present, even if missing from the data - missing_codes = np.setdiff1d(all_codes, unique_notnan_codes, assume_unique=True) - take_codes = np.concatenate((unique_notnan_codes, missing_codes)) - else: - take_codes = unique_notnan_codes - - return Categorical(c, c.unique().categories.take(take_codes)), None +for item in categorical.__dir__(): + _globals[item] = getattr(categorical, item) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1bdba5a3e71fb..6fc9fa7527ee8 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1,2867 +1,11 @@ -""" -Define the SeriesGroupBy and DataFrameGroupBy -classes that hold the groupby interfaces (and some implementations). - -These are user facing as the result of the ``df.groupby(...)`` operations, -which here returns a DataFrameGroupBy object. 
-""" from __future__ import annotations -from collections import abc -from functools import partial -from textwrap import dedent -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Literal, - NamedTuple, - TypeVar, - Union, - cast, -) -import warnings - -import numpy as np - -from pandas._libs import ( - Interval, - lib, -) -from pandas.errors import SpecificationError -from pandas.util._decorators import ( - Appender, - Substitution, - doc, -) -from pandas.util._exceptions import find_stack_level - -from pandas.core.dtypes.common import ( - ensure_int64, - is_bool, - is_dict_like, - is_integer_dtype, - is_list_like, - is_numeric_dtype, - is_scalar, -) -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - IntervalDtype, -) -from pandas.core.dtypes.inference import is_hashable -from pandas.core.dtypes.missing import ( - isna, - notna, -) - -from pandas.core import algorithms -from pandas.core.apply import ( - GroupByApply, - maybe_mangle_lambdas, - reconstruct_func, - validate_func_kwargs, - warn_alias_replacement, -) -import pandas.core.common as com -from pandas.core.frame import DataFrame -from pandas.core.groupby import ( - base, - ops, -) -from pandas.core.groupby.groupby import ( - GroupBy, - GroupByPlot, - _agg_template_frame, - _agg_template_series, - _apply_docs, - _transform_template, -) -from pandas.core.indexes.api import ( - Index, - MultiIndex, - all_indexes_same, - default_index, -) -from pandas.core.series import Series -from pandas.core.util.numba_ import maybe_use_numba - -from pandas.plotting import boxplot_frame_groupby - -if TYPE_CHECKING: - from collections.abc import ( - Hashable, - Mapping, - Sequence, - ) - - from pandas._typing import ( - ArrayLike, - Axis, - AxisInt, - CorrelationMethod, - FillnaOptions, - IndexLabel, - Manager, - Manager2D, - SingleManager, - TakeIndexer, - ) - - from pandas import Categorical - from pandas.core.generic import NDFrame - -# TODO(typing) the return value on this callable should be any *scalar*. -AggScalar = Union[str, Callable[..., Any]] -# TODO: validate types on ScalarResult and move to _typing -# Blocked from using by https://github.com/python/mypy/issues/1484 -# See note at _mangle_lambda_list -ScalarResult = TypeVar("ScalarResult") - - -class NamedAgg(NamedTuple): - """ - Helper for column specific aggregation with control over output column names. - - Subclass of typing.NamedTuple. - - Parameters - ---------- - column : Hashable - Column label in the DataFrame to apply aggfunc. - aggfunc : function or str - Function to apply to the provided column. If string, the name of a built-in - pandas function. 
- - Examples - -------- - >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]}) - >>> agg_a = pd.NamedAgg(column="a", aggfunc="min") - >>> agg_1 = pd.NamedAgg(column=1, aggfunc=lambda x: np.mean(x)) - >>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1) - result_a result_1 - key - 1 -1 10.5 - 2 1 12.0 - """ - - column: Hashable - aggfunc: AggScalar - - -class SeriesGroupBy(GroupBy[Series]): - def _wrap_agged_manager(self, mgr: Manager) -> Series: - out = self.obj._constructor_from_mgr(mgr, axes=mgr.axes) - out._name = self.obj.name - return out - - def _get_data_to_aggregate( - self, *, numeric_only: bool = False, name: str | None = None - ) -> SingleManager: - ser = self._obj_with_exclusions - single = ser._mgr - if numeric_only and not is_numeric_dtype(ser.dtype): - # GH#41291 match Series behavior - kwd_name = "numeric_only" - raise TypeError( - f"Cannot use {kwd_name}=True with " - f"{type(self).__name__}.{name} and non-numeric dtypes." - ) - return single - - _agg_examples_doc = dedent( - """ - Examples - -------- - >>> s = pd.Series([1, 2, 3, 4]) - - >>> s - 0 1 - 1 2 - 2 3 - 3 4 - dtype: int64 - - >>> s.groupby([1, 1, 2, 2]).min() - 1 1 - 2 3 - dtype: int64 - - >>> s.groupby([1, 1, 2, 2]).agg('min') - 1 1 - 2 3 - dtype: int64 - - >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max']) - min max - 1 1 2 - 2 3 4 - - The output column names can be controlled by passing - the desired column names and aggregations as keyword arguments. - - >>> s.groupby([1, 1, 2, 2]).agg( - ... minimum='min', - ... maximum='max', - ... ) - minimum maximum - 1 1 2 - 2 3 4 - - .. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the aggregating function. - - >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min()) - 1 1.0 - 2 3.0 - dtype: float64 - """ - ) - - @Appender( - _apply_docs["template"].format( - input="series", examples=_apply_docs["series_examples"] - ) - ) - def apply(self, func, *args, **kwargs) -> Series: - return super().apply(func, *args, **kwargs) - - @doc(_agg_template_series, examples=_agg_examples_doc, klass="Series") - def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): - relabeling = func is None - columns = None - if relabeling: - columns, func = validate_func_kwargs(kwargs) - kwargs = {} - - if isinstance(func, str): - if maybe_use_numba(engine) and engine is not None: - # Not all agg functions support numba, only propagate numba kwargs - # if user asks for numba, and engine is not None - # (if engine is None, the called function will handle the case where - # numba is requested via the global option) - kwargs["engine"] = engine - if engine_kwargs is not None: - kwargs["engine_kwargs"] = engine_kwargs - return getattr(self, func)(*args, **kwargs) - - elif isinstance(func, abc.Iterable): - # Catch instances of lists / tuples - # but not the class list / tuple itself. 
- func = maybe_mangle_lambdas(func) - kwargs["engine"] = engine - kwargs["engine_kwargs"] = engine_kwargs - ret = self._aggregate_multiple_funcs(func, *args, **kwargs) - if relabeling: - # columns is not narrowed by mypy from relabeling flag - assert columns is not None # for mypy - ret.columns = columns - if not self.as_index: - ret = ret.reset_index() - return ret - - else: - cyfunc = com.get_cython_func(func) - if cyfunc and not args and not kwargs: - warn_alias_replacement(self, func, cyfunc) - return getattr(self, cyfunc)() - - if maybe_use_numba(engine): - return self._aggregate_with_numba( - func, *args, engine_kwargs=engine_kwargs, **kwargs - ) - - if self.ngroups == 0: - # e.g. test_evaluate_with_empty_groups without any groups to - # iterate over, we have no output on which to do dtype - # inference. We default to using the existing dtype. - # xref GH#51445 - obj = self._obj_with_exclusions - return self.obj._constructor( - [], - name=self.obj.name, - index=self.grouper.result_index, - dtype=obj.dtype, - ) - - if self.grouper.nkeys > 1: - return self._python_agg_general(func, *args, **kwargs) - - try: - return self._python_agg_general(func, *args, **kwargs) - except KeyError: - # KeyError raised in test_groupby.test_basic is bc the func does - # a dictionary lookup on group.name, but group name is not - # pinned in _python_agg_general, only in _aggregate_named - result = self._aggregate_named(func, *args, **kwargs) - - warnings.warn( - "Pinning the groupby key to each group in " - f"{type(self).__name__}.agg is deprecated, and cases that " - "relied on it will raise in a future version. " - "If your operation requires utilizing the groupby keys, " - "iterate over the groupby object instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - # result is a dict whose keys are the elements of result_index - result = Series(result, index=self.grouper.result_index) - result = self._wrap_aggregated_output(result) - return result - - agg = aggregate - - def _python_agg_general(self, func, *args, **kwargs): - orig_func = func - func = com.is_builtin_func(func) - if orig_func != func: - alias = com._builtin_table_alias[func] - warn_alias_replacement(self, orig_func, alias) - f = lambda x: func(x, *args, **kwargs) - - obj = self._obj_with_exclusions - result = self.grouper.agg_series(obj, f) - res = obj._constructor(result, name=obj.name) - return self._wrap_aggregated_output(res) - - def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame: - if isinstance(arg, dict): - if self.as_index: - # GH 15931 - raise SpecificationError("nested renamer is not supported") - else: - # GH#50684 - This accidentally worked in 1.x - msg = ( - "Passing a dictionary to SeriesGroupBy.agg is deprecated " - "and will raise in a future version of pandas. Pass a list " - "of aggregations instead." 
- ) - warnings.warn( - message=msg, - category=FutureWarning, - stacklevel=find_stack_level(), - ) - arg = list(arg.items()) - elif any(isinstance(x, (tuple, list)) for x in arg): - arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg] - else: - # list of functions / function names - columns = (com.get_callable_name(f) or f for f in arg) - arg = zip(columns, arg) - - results: dict[base.OutputKey, DataFrame | Series] = {} - with com.temp_setattr(self, "as_index", True): - # Combine results using the index, need to adjust index after - # if as_index=False (GH#50724) - for idx, (name, func) in enumerate(arg): - key = base.OutputKey(label=name, position=idx) - results[key] = self.aggregate(func, *args, **kwargs) - - if any(isinstance(x, DataFrame) for x in results.values()): - from pandas import concat - - res_df = concat( - results.values(), axis=1, keys=[key.label for key in results] - ) - return res_df - - indexed_output = {key.position: val for key, val in results.items()} - output = self.obj._constructor_expanddim(indexed_output, index=None) - output.columns = Index(key.label for key in results) - - return output - - def _wrap_applied_output( - self, - data: Series, - values: list[Any], - not_indexed_same: bool = False, - is_transform: bool = False, - ) -> DataFrame | Series: - """ - Wrap the output of SeriesGroupBy.apply into the expected result. - - Parameters - ---------- - data : Series - Input data for groupby operation. - values : List[Any] - Applied output for each group. - not_indexed_same : bool, default False - Whether the applied outputs are not indexed the same as the group axes. - - Returns - ------- - DataFrame or Series - """ - if len(values) == 0: - # GH #6265 - if is_transform: - # GH#47787 see test_group_on_empty_multiindex - res_index = data.index - else: - res_index = self.grouper.result_index - - return self.obj._constructor( - [], - name=self.obj.name, - index=res_index, - dtype=data.dtype, - ) - assert values is not None - - if isinstance(values[0], dict): - # GH #823 #24880 - index = self.grouper.result_index - res_df = self.obj._constructor_expanddim(values, index=index) - res_df = self._reindex_output(res_df) - # if self.observed is False, - # keep all-NaN rows created while re-indexing - res_ser = res_df.stack(future_stack=True) - res_ser.name = self.obj.name - return res_ser - elif isinstance(values[0], (Series, DataFrame)): - result = self._concat_objects( - values, - not_indexed_same=not_indexed_same, - is_transform=is_transform, - ) - if isinstance(result, Series): - result.name = self.obj.name - if not self.as_index and not_indexed_same: - result = self._insert_inaxis_grouper(result) - result.index = default_index(len(result)) - return result - else: - # GH #6265 #24880 - result = self.obj._constructor( - data=values, index=self.grouper.result_index, name=self.obj.name - ) - if not self.as_index: - result = self._insert_inaxis_grouper(result) - result.index = default_index(len(result)) - return self._reindex_output(result) - - def _aggregate_named(self, func, *args, **kwargs): - # Note: this is very similar to _aggregate_series_pure_python, - # but that does not pin group.name - result = {} - initialized = False - - for name, group in self.grouper.get_iterator( - self._obj_with_exclusions, axis=self.axis - ): - # needed for pandas/tests/groupby/test_groupby.py::test_basic_aggregations - object.__setattr__(group, "name", name) - - output = func(group, *args, **kwargs) - output = ops.extract_result(output) - if not initialized: - # We only do 
this validation on the first iteration - ops.check_result_array(output, group.dtype) - initialized = True - result[name] = output - - return result - - __examples_series_doc = dedent( - """ - >>> ser = pd.Series( - ... [390.0, 350.0, 30.0, 20.0], - ... index=["Falcon", "Falcon", "Parrot", "Parrot"], - ... name="Max Speed") - >>> grouped = ser.groupby([1, 1, 2, 2]) - >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) - Falcon 0.707107 - Falcon -0.707107 - Parrot 0.707107 - Parrot -0.707107 - Name: Max Speed, dtype: float64 - - Broadcast result of the transformation - - >>> grouped.transform(lambda x: x.max() - x.min()) - Falcon 40.0 - Falcon 40.0 - Parrot 10.0 - Parrot 10.0 - Name: Max Speed, dtype: float64 - - >>> grouped.transform("mean") - Falcon 370.0 - Falcon 370.0 - Parrot 25.0 - Parrot 25.0 - Name: Max Speed, dtype: float64 - - .. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``, - for example: - - >>> grouped.transform(lambda x: x.astype(int).max()) - Falcon 390 - Falcon 390 - Parrot 30 - Parrot 30 - Name: Max Speed, dtype: int64 - """ - ) - - @Substitution(klass="Series", example=__examples_series_doc) - @Appender(_transform_template) - def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): - return self._transform( - func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs - ) - - def _cython_transform( - self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs - ): - assert axis == 0 # handled by caller - - obj = self._obj_with_exclusions - - try: - result = self.grouper._cython_operation( - "transform", obj._values, how, axis, **kwargs - ) - except NotImplementedError as err: - # e.g. test_groupby_raises_string - raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err - - return obj._constructor(result, index=self.obj.index, name=obj.name) - - def _transform_general( - self, func: Callable, engine, engine_kwargs, *args, **kwargs - ) -> Series: - """ - Transform with a callable `func`. - """ - if maybe_use_numba(engine): - return self._transform_with_numba( - func, *args, engine_kwargs=engine_kwargs, **kwargs - ) - assert callable(func) - klass = type(self.obj) - - results = [] - for name, group in self.grouper.get_iterator( - self._obj_with_exclusions, axis=self.axis - ): - # this setattr is needed for test_transform_lambda_with_datetimetz - object.__setattr__(group, "name", name) - res = func(group, *args, **kwargs) - - results.append(klass(res, index=group.index)) - - # check for empty "results" to avoid concat ValueError - if results: - from pandas.core.reshape.concat import concat - - concatenated = concat(results) - result = self._set_result_index_ordered(concatenated) - else: - result = self.obj._constructor(dtype=np.float64) - - result.name = self.obj.name - return result - - def filter(self, func, dropna: bool = True, *args, **kwargs): - """ - Filter elements from groups that don't satisfy a criterion. - - Elements from groups are filtered if they do not satisfy the - boolean criterion specified by func. - - Parameters - ---------- - func : function - Criterion to apply to each group. Should return True or False. - dropna : bool - Drop groups that do not pass the filter. True by default; if False, - groups that evaluate False are filled with NaNs. - - Returns - ------- - Series - - Notes - ----- - Functions that mutate the passed object can produce unexpected - behavior or errors and are not supported. 
See :ref:`gotchas.udf-mutation` - for more details. - - Examples - -------- - >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', - ... 'foo', 'bar'], - ... 'B' : [1, 2, 3, 4, 5, 6], - ... 'C' : [2.0, 5., 8., 1., 2., 9.]}) - >>> grouped = df.groupby('A') - >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.) - 1 2 - 3 4 - 5 6 - Name: B, dtype: int64 - """ - if isinstance(func, str): - wrapper = lambda x: getattr(x, func)(*args, **kwargs) - else: - wrapper = lambda x: func(x, *args, **kwargs) - - # Interpret np.nan as False. - def true_and_notna(x) -> bool: - b = wrapper(x) - return notna(b) and b - - try: - indices = [ - self._get_index(name) - for name, group in self.grouper.get_iterator( - self._obj_with_exclusions, axis=self.axis - ) - if true_and_notna(group) - ] - except (ValueError, TypeError) as err: - raise TypeError("the filter must return a boolean result") from err - - filtered = self._apply_filter(indices, dropna) - return filtered - - def nunique(self, dropna: bool = True) -> Series | DataFrame: - """ - Return number of unique elements in the group. - - Returns - ------- - Series - Number of unique values within each group. - - Examples - -------- - For SeriesGroupby: - - >>> lst = ['a', 'a', 'b', 'b'] - >>> ser = pd.Series([1, 2, 3, 3], index=lst) - >>> ser - a 1 - a 2 - b 3 - b 3 - dtype: int64 - >>> ser.groupby(level=0).nunique() - a 2 - b 1 - dtype: int64 - - For Resampler: - - >>> ser = pd.Series([1, 2, 3, 3], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) - >>> ser - 2023-01-01 1 - 2023-01-15 2 - 2023-02-01 3 - 2023-02-15 3 - dtype: int64 - >>> ser.resample('MS').nunique() - 2023-01-01 2 - 2023-02-01 1 - Freq: MS, dtype: int64 - """ - ids, _, _ = self.grouper.group_info - - val = self.obj._values - - codes, _ = algorithms.factorize(val, sort=False) - sorter = np.lexsort((codes, ids)) - codes = codes[sorter] - ids = ids[sorter] - - # group boundaries are where group ids change - # unique observations are where sorted values change - idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] - inc = np.r_[1, codes[1:] != codes[:-1]] - - # 1st item of each group is a new unique observation - mask = codes == -1 - if dropna: - inc[idx] = 1 - inc[mask] = 0 - else: - inc[mask & np.r_[False, mask[:-1]]] = 0 - inc[idx] = 1 - - out = np.add.reduceat(inc, idx).astype("int64", copy=False) - if len(ids): - # NaN/NaT group exists if the head of ids is -1, - # so remove it from res and exclude its index from idx - if ids[0] == -1: - res = out[1:] - idx = idx[np.flatnonzero(idx)] - else: - res = out - else: - res = out[1:] - ri = self.grouper.result_index - - # we might have duplications among the bins - if len(res) != len(ri): - res, out = np.zeros(len(ri), dtype=out.dtype), res - if len(ids) > 0: - # GH#21334s - res[ids[idx]] = out - - result: Series | DataFrame = self.obj._constructor( - res, index=ri, name=self.obj.name - ) - if not self.as_index: - result = self._insert_inaxis_grouper(result) - result.index = default_index(len(result)) - return self._reindex_output(result, fill_value=0) - - @doc(Series.describe) - def describe(self, percentiles=None, include=None, exclude=None) -> Series: - return super().describe( - percentiles=percentiles, include=include, exclude=exclude - ) - - def value_counts( - self, - normalize: bool = False, - sort: bool = True, - ascending: bool = False, - bins=None, - dropna: bool = True, - ) -> Series | DataFrame: - name = "proportion" if normalize else "count" - - if bins is None: - result = 
self._value_counts( - normalize=normalize, sort=sort, ascending=ascending, dropna=dropna - ) - result.name = name - return result - - from pandas.core.reshape.merge import get_join_indexers - from pandas.core.reshape.tile import cut - - ids, _, _ = self.grouper.group_info - val = self.obj._values - - index_names = self.grouper.names + [self.obj.name] - - if isinstance(val.dtype, CategoricalDtype) or ( - bins is not None and not np.iterable(bins) - ): - # scalar bins cannot be done at top level - # in a backward compatible way - # GH38672 relates to categorical dtype - ser = self.apply( - Series.value_counts, - normalize=normalize, - sort=sort, - ascending=ascending, - bins=bins, - ) - ser.name = name - ser.index.names = index_names - return ser - - # groupby removes null keys from groupings - mask = ids != -1 - ids, val = ids[mask], val[mask] - - lab: Index | np.ndarray - if bins is None: - lab, lev = algorithms.factorize(val, sort=True) - llab = lambda lab, inc: lab[inc] - else: - # lab is a Categorical with categories an IntervalIndex - cat_ser = cut(Series(val, copy=False), bins, include_lowest=True) - cat_obj = cast("Categorical", cat_ser._values) - lev = cat_obj.categories - lab = lev.take( - cat_obj.codes, - allow_fill=True, - fill_value=lev._na_value, - ) - llab = lambda lab, inc: lab[inc]._multiindex.codes[-1] - - if isinstance(lab.dtype, IntervalDtype): - # TODO: should we do this inside II? - lab_interval = cast(Interval, lab) - - sorter = np.lexsort((lab_interval.left, lab_interval.right, ids)) - else: - sorter = np.lexsort((lab, ids)) - - ids, lab = ids[sorter], lab[sorter] - - # group boundaries are where group ids change - idchanges = 1 + np.nonzero(ids[1:] != ids[:-1])[0] - idx = np.r_[0, idchanges] - if not len(ids): - idx = idchanges - - # new values are where sorted labels change - lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1)) - inc = np.r_[True, lchanges] - if not len(val): - inc = lchanges - inc[idx] = True # group boundaries are also new values - out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts - - # num. of times each group should be repeated - rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) - - # multi-index components - codes = self.grouper.reconstructed_codes - codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] - levels = [ping.group_index for ping in self.grouper.groupings] + [lev] - - if dropna: - mask = codes[-1] != -1 - if mask.all(): - dropna = False - else: - out, codes = out[mask], [level_codes[mask] for level_codes in codes] - - if normalize: - out = out.astype("float") - d = np.diff(np.r_[idx, len(ids)]) - if dropna: - m = ids[lab == -1] - np.add.at(d, m, -1) - acc = rep(d)[mask] - else: - acc = rep(d) - out /= acc - - if sort and bins is None: - cat = ids[inc][mask] if dropna else ids[inc] - sorter = np.lexsort((out if ascending else -out, cat)) - out, codes[-1] = out[sorter], codes[-1][sorter] - - if bins is not None: - # for compat. 
with libgroupby.value_counts need to ensure every - # bin is present at every index level, null filled with zeros - diff = np.zeros(len(out), dtype="bool") - for level_codes in codes[:-1]: - diff |= np.r_[True, level_codes[1:] != level_codes[:-1]] - - ncat, nbin = diff.sum(), len(levels[-1]) - - left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)] - - right = [diff.cumsum() - 1, codes[-1]] - - # error: Argument 1 to "get_join_indexers" has incompatible type - # "List[ndarray[Any, Any]]"; expected "List[Union[Union[ExtensionArray, - # ndarray[Any, Any]], Index, Series]] - _, idx = get_join_indexers( - left, right, sort=False, how="left" # type: ignore[arg-type] - ) - out = np.where(idx != -1, out[idx], 0) - - if sort: - sorter = np.lexsort((out if ascending else -out, left[0])) - out, left[-1] = out[sorter], left[-1][sorter] - - # build the multi-index w/ full levels - def build_codes(lev_codes: np.ndarray) -> np.ndarray: - return np.repeat(lev_codes[diff], nbin) - - codes = [build_codes(lev_codes) for lev_codes in codes[:-1]] - codes.append(left[-1]) - - mi = MultiIndex( - levels=levels, codes=codes, names=index_names, verify_integrity=False - ) - - if is_integer_dtype(out.dtype): - out = ensure_int64(out) - result = self.obj._constructor(out, index=mi, name=name) - if not self.as_index: - result = result.reset_index() - return result - - def fillna( - self, - value: object | ArrayLike | None = None, - method: FillnaOptions | None = None, - axis: Axis | None | lib.NoDefault = lib.no_default, - inplace: bool = False, - limit: int | None = None, - downcast: dict | None | lib.NoDefault = lib.no_default, - ) -> Series | None: - """ - Fill NA/NaN values using the specified method within groups. - - Parameters - ---------- - value : scalar, dict, Series, or DataFrame - Value to use to fill holes (e.g. 0), alternately a - dict/Series/DataFrame of values specifying which value to use for - each index (for a Series) or column (for a DataFrame). Values not - in the dict/Series/DataFrame will not be filled. This value cannot - be a list. Users wanting to use the ``value`` argument and not ``method`` - should prefer :meth:`.Series.fillna` as this - will produce the same result and be more performant. - method : {{'bfill', 'ffill', None}}, default None - Method to use for filling holes. ``'ffill'`` will propagate - the last valid observation forward within a group. - ``'bfill'`` will use next valid observation to fill the gap. - - .. deprecated:: 2.1.0 - Use obj.ffill or obj.bfill instead. - - axis : {0 or 'index', 1 or 'columns'} - Unused, only for compatibility with :meth:`DataFrameGroupBy.fillna`. - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - - inplace : bool, default False - Broken. Do not set to True. - limit : int, default None - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill within a group. In other words, - if there is a gap with more than this number of consecutive NaNs, - it will only be partially filled. If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. Must be greater than 0 if not None. - downcast : dict, default is None - A dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible). - - .. 
deprecated:: 2.1.0 - - Returns - ------- - Series - Object with missing values filled within groups. - - See Also - -------- - ffill : Forward fill values within a group. - bfill : Backward fill values within a group. - - Examples - -------- - For SeriesGroupBy: - - >>> lst = ['cat', 'cat', 'cat', 'mouse', 'mouse'] - >>> ser = pd.Series([1, None, None, 2, None], index=lst) - >>> ser - cat 1.0 - cat NaN - cat NaN - mouse 2.0 - mouse NaN - dtype: float64 - >>> ser.groupby(level=0).fillna(0, limit=1) - cat 1.0 - cat 0.0 - cat NaN - mouse 2.0 - mouse 0.0 - dtype: float64 - """ - result = self._op_via_apply( - "fillna", - value=value, - method=method, - axis=axis, - inplace=inplace, - limit=limit, - downcast=downcast, - ) - return result - - def take( - self, - indices: TakeIndexer, - axis: Axis | lib.NoDefault = lib.no_default, - **kwargs, - ) -> Series: - """ - Return the elements in the given *positional* indices in each group. - - This means that we are not indexing according to actual values in - the index attribute of the object. We are indexing according to the - actual position of the element in the object. - - If a requested index does not exist for some group, this method will raise. - To get similar behavior that ignores indices that don't exist, see - :meth:`.SeriesGroupBy.nth`. - - Parameters - ---------- - indices : array-like - An array of ints indicating which positions to take in each group. - axis : {0 or 'index', 1 or 'columns', None}, default 0 - The axis on which to select elements. ``0`` means that we are - selecting rows, ``1`` means that we are selecting columns. - For `SeriesGroupBy` this parameter is unused and defaults to 0. - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - - **kwargs - For compatibility with :meth:`numpy.take`. Has no effect on the - output. - - Returns - ------- - Series - A Series containing the elements taken from each group. - - See Also - -------- - Series.take : Take elements from a Series along an axis. - Series.loc : Select a subset of a DataFrame by labels. - Series.iloc : Select a subset of a DataFrame by positions. - numpy.take : Take elements from an array along an axis. - SeriesGroupBy.nth : Similar to take, won't raise if indices don't exist. - - Examples - -------- - >>> df = pd.DataFrame([('falcon', 'bird', 389.0), - ... ('parrot', 'bird', 24.0), - ... ('lion', 'mammal', 80.5), - ... ('monkey', 'mammal', np.nan), - ... ('rabbit', 'mammal', 15.0)], - ... columns=['name', 'class', 'max_speed'], - ... index=[4, 3, 2, 1, 0]) - >>> df - name class max_speed - 4 falcon bird 389.0 - 3 parrot bird 24.0 - 2 lion mammal 80.5 - 1 monkey mammal NaN - 0 rabbit mammal 15.0 - >>> gb = df["name"].groupby([1, 1, 2, 2, 2]) - - Take elements at positions 0 and 1 along the axis 0 in each group (default). - - >>> gb.take([0, 1]) - 1 4 falcon - 3 parrot - 2 2 lion - 1 monkey - Name: name, dtype: object - - We may take elements using negative integers for positive indices, - starting from the end of the object, just like with Python lists. - - >>> gb.take([-1, -2]) - 1 3 parrot - 4 falcon - 2 0 rabbit - 1 monkey - Name: name, dtype: object - """ - result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs) - return result - - def skew( - self, - axis: Axis | lib.NoDefault = lib.no_default, - skipna: bool = True, - numeric_only: bool = False, - **kwargs, - ) -> Series: - """ - Return unbiased skew within groups. - - Normalized by N-1. 
- - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns', None}, default 0 - Axis for the function to be applied on. - This parameter is only for compatibility with DataFrame and is unused. - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - - skipna : bool, default True - Exclude NA/null values when computing the result. - - numeric_only : bool, default False - Include only float, int, boolean columns. Not implemented for Series. - - **kwargs - Additional keyword arguments to be passed to the function. - - Returns - ------- - Series - - See Also - -------- - Series.skew : Return unbiased skew over requested axis. - - Examples - -------- - >>> ser = pd.Series([390., 350., 357., np.nan, 22., 20., 30.], - ... index=['Falcon', 'Falcon', 'Falcon', 'Falcon', - ... 'Parrot', 'Parrot', 'Parrot'], - ... name="Max Speed") - >>> ser - Falcon 390.0 - Falcon 350.0 - Falcon 357.0 - Falcon NaN - Parrot 22.0 - Parrot 20.0 - Parrot 30.0 - Name: Max Speed, dtype: float64 - >>> ser.groupby(level=0).skew() - Falcon 1.525174 - Parrot 1.457863 - Name: Max Speed, dtype: float64 - >>> ser.groupby(level=0).skew(skipna=False) - Falcon NaN - Parrot 1.457863 - Name: Max Speed, dtype: float64 - """ - if axis is lib.no_default: - axis = 0 - - if axis != 0: - result = self._op_via_apply( - "skew", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - return result - - def alt(obj): - # This should not be reached since the cython path should raise - # TypeError and not NotImplementedError. - raise TypeError(f"'skew' is not supported for dtype={obj.dtype}") - - return self._cython_agg_general( - "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs - ) - - @property - @doc(Series.plot.__doc__) - def plot(self) -> GroupByPlot: - result = GroupByPlot(self) - return result - - @doc(Series.nlargest.__doc__) - def nlargest( - self, n: int = 5, keep: Literal["first", "last", "all"] = "first" - ) -> Series: - f = partial(Series.nlargest, n=n, keep=keep) - data = self._obj_with_exclusions - # Don't change behavior if result index happens to be the same, i.e. - # already ordered and n >= all group sizes. - result = self._python_apply_general(f, data, not_indexed_same=True) - return result - - @doc(Series.nsmallest.__doc__) - def nsmallest( - self, n: int = 5, keep: Literal["first", "last", "all"] = "first" - ) -> Series: - f = partial(Series.nsmallest, n=n, keep=keep) - data = self._obj_with_exclusions - # Don't change behavior if result index happens to be the same, i.e. - # already ordered and n >= all group sizes. 
- result = self._python_apply_general(f, data, not_indexed_same=True) - return result - - @doc(Series.idxmin.__doc__) - def idxmin( - self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True - ) -> Series: - return self._idxmax_idxmin("idxmin", axis=axis, skipna=skipna) - - @doc(Series.idxmax.__doc__) - def idxmax( - self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True - ) -> Series: - return self._idxmax_idxmin("idxmax", axis=axis, skipna=skipna) - - @doc(Series.corr.__doc__) - def corr( - self, - other: Series, - method: CorrelationMethod = "pearson", - min_periods: int | None = None, - ) -> Series: - result = self._op_via_apply( - "corr", other=other, method=method, min_periods=min_periods - ) - return result - - @doc(Series.cov.__doc__) - def cov( - self, other: Series, min_periods: int | None = None, ddof: int | None = 1 - ) -> Series: - result = self._op_via_apply( - "cov", other=other, min_periods=min_periods, ddof=ddof - ) - return result - - @property - def is_monotonic_increasing(self) -> Series: - """ - Return whether each group's values are monotonically increasing. - - Returns - ------- - Series - - Examples - -------- - >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot']) - >>> s.groupby(level=0).is_monotonic_increasing - Falcon False - Parrot True - dtype: bool - """ - return self.apply(lambda ser: ser.is_monotonic_increasing) - - @property - def is_monotonic_decreasing(self) -> Series: - """ - Return whether each group's values are monotonically decreasing. - - Returns - ------- - Series - - Examples - -------- - >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot']) - >>> s.groupby(level=0).is_monotonic_decreasing - Falcon True - Parrot False - dtype: bool - """ - return self.apply(lambda ser: ser.is_monotonic_decreasing) - - @doc(Series.hist.__doc__) - def hist( - self, - by=None, - ax=None, - grid: bool = True, - xlabelsize: int | None = None, - xrot: float | None = None, - ylabelsize: int | None = None, - yrot: float | None = None, - figsize: tuple[int, int] | None = None, - bins: int | Sequence[int] = 10, - backend: str | None = None, - legend: bool = False, - **kwargs, - ): - result = self._op_via_apply( - "hist", - by=by, - ax=ax, - grid=grid, - xlabelsize=xlabelsize, - xrot=xrot, - ylabelsize=ylabelsize, - yrot=yrot, - figsize=figsize, - bins=bins, - backend=backend, - legend=legend, - **kwargs, - ) - return result - - @property - @doc(Series.dtype.__doc__) - def dtype(self) -> Series: - return self.apply(lambda ser: ser.dtype) - - def unique(self) -> Series: - """ - Return unique values for each group. - - It returns unique values for each of the grouped values. Returned in - order of appearance. Hash table-based unique, therefore does NOT sort. - - Returns - ------- - Series - Unique values for each of the grouped values. - - See Also - -------- - Series.unique : Return unique values of Series object. - - Examples - -------- - >>> df = pd.DataFrame([('Chihuahua', 'dog', 6.1), - ... ('Beagle', 'dog', 15.2), - ... ('Chihuahua', 'dog', 6.9), - ... ('Persian', 'cat', 9.2), - ... ('Chihuahua', 'dog', 7), - ... ('Persian', 'cat', 8.8)], - ... 
columns=['breed', 'animal', 'height_in']) - >>> df - breed animal height_in - 0 Chihuahua dog 6.1 - 1 Beagle dog 15.2 - 2 Chihuahua dog 6.9 - 3 Persian cat 9.2 - 4 Chihuahua dog 7.0 - 5 Persian cat 8.8 - >>> ser = df.groupby('animal')['breed'].unique() - >>> ser - animal - cat [Persian] - dog [Chihuahua, Beagle] - Name: breed, dtype: object - """ - result = self._op_via_apply("unique") - return result - - -class DataFrameGroupBy(GroupBy[DataFrame]): - _agg_examples_doc = dedent( - """ - Examples - -------- - >>> df = pd.DataFrame( - ... { - ... "A": [1, 1, 2, 2], - ... "B": [1, 2, 3, 4], - ... "C": [0.362838, 0.227877, 1.267767, -0.562860], - ... } - ... ) - - >>> df - A B C - 0 1 1 0.362838 - 1 1 2 0.227877 - 2 2 3 1.267767 - 3 2 4 -0.562860 - - The aggregation is for each column. - - >>> df.groupby('A').agg('min') - B C - A - 1 1 0.227877 - 2 3 -0.562860 - - Multiple aggregations - - >>> df.groupby('A').agg(['min', 'max']) - B C - min max min max - A - 1 1 2 0.227877 0.362838 - 2 3 4 -0.562860 1.267767 - - Select a column for aggregation - - >>> df.groupby('A').B.agg(['min', 'max']) - min max - A - 1 1 2 - 2 3 4 - - User-defined function for aggregation - - >>> df.groupby('A').agg(lambda x: sum(x) + 2) - B C - A - 1 5 2.590715 - 2 9 2.704907 - - Different aggregations per column - - >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'}) - B C - min max sum - A - 1 1 2 0.590715 - 2 3 4 0.704907 - - To control the output names with different aggregations per column, - pandas supports "named aggregation" - - >>> df.groupby("A").agg( - ... b_min=pd.NamedAgg(column="B", aggfunc="min"), - ... c_sum=pd.NamedAgg(column="C", aggfunc="sum")) - b_min c_sum - A - 1 1 0.590715 - 2 3 0.704907 - - - The keywords are the *output* column names - - The values are tuples whose first element is the column to select - and the second element is the aggregation to apply to that column. - Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields - ``['column', 'aggfunc']`` to make it clearer what the arguments are. - As usual, the aggregation can be a callable or a string alias. - - See :ref:`groupby.aggregate.named` for more. - - .. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the aggregating function. 
- - >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min()) - B - A - 1 1.0 - 2 3.0 - """ - ) - - @doc(_agg_template_frame, examples=_agg_examples_doc, klass="DataFrame") - def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): - relabeling, func, columns, order = reconstruct_func(func, **kwargs) - func = maybe_mangle_lambdas(func) - - if maybe_use_numba(engine): - # Not all agg functions support numba, only propagate numba kwargs - # if user asks for numba - kwargs["engine"] = engine - kwargs["engine_kwargs"] = engine_kwargs - - op = GroupByApply(self, func, args=args, kwargs=kwargs) - result = op.agg() - if not is_dict_like(func) and result is not None: - # GH #52849 - if not self.as_index and is_list_like(func): - return result.reset_index() - else: - return result - elif relabeling: - # this should be the only (non-raising) case with relabeling - # used reordered index of columns - result = cast(DataFrame, result) - result = result.iloc[:, order] - result = cast(DataFrame, result) - # error: Incompatible types in assignment (expression has type - # "Optional[List[str]]", variable has type - # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]], - # Index, Series], Sequence[Any]]") - result.columns = columns # type: ignore[assignment] - - if result is None: - # Remove the kwargs we inserted - # (already stored in engine, engine_kwargs arguments) - if "engine" in kwargs: - del kwargs["engine"] - del kwargs["engine_kwargs"] - # at this point func is not a str, list-like, dict-like, - # or a known callable(e.g. sum) - if maybe_use_numba(engine): - return self._aggregate_with_numba( - func, *args, engine_kwargs=engine_kwargs, **kwargs - ) - # grouper specific aggregations - if self.grouper.nkeys > 1: - # test_groupby_as_index_series_scalar gets here with 'not self.as_index' - return self._python_agg_general(func, *args, **kwargs) - elif args or kwargs: - # test_pass_args_kwargs gets here (with and without as_index) - # can't return early - result = self._aggregate_frame(func, *args, **kwargs) - - elif self.axis == 1: - # _aggregate_multiple_funcs does not allow self.axis == 1 - # Note: axis == 1 precludes 'not self.as_index', see __init__ - result = self._aggregate_frame(func) - return result - - else: - # try to treat as if we are passing a list - gba = GroupByApply(self, [func], args=(), kwargs={}) - try: - result = gba.agg() - - except ValueError as err: - if "No objects to concatenate" not in str(err): - raise - # _aggregate_frame can fail with e.g. func=Series.mode, - # where it expects 1D values but would be getting 2D values - # In other tests, using aggregate_frame instead of GroupByApply - # would give correct values but incorrect dtypes - # object vs float64 in test_cython_agg_empty_buckets - # float64 vs int64 in test_category_order_apply - result = self._aggregate_frame(func) - - else: - # GH#32040, GH#35246 - # e.g. test_groupby_as_index_select_column_sum_empty_df - result = cast(DataFrame, result) - result.columns = self._obj_with_exclusions.columns.copy() - - if not self.as_index: - result = self._insert_inaxis_grouper(result) - result.index = default_index(len(result)) - - return result - - agg = aggregate - - def _python_agg_general(self, func, *args, **kwargs): - orig_func = func - func = com.is_builtin_func(func) - if orig_func != func: - alias = com._builtin_table_alias[func] - warn_alias_replacement(self, orig_func, alias) - f = lambda x: func(x, *args, **kwargs) - - if self.ngroups == 0: - # e.g. 
test_evaluate_with_empty_groups different path gets different - # result dtype in empty case. - return self._python_apply_general(f, self._selected_obj, is_agg=True) - - obj = self._obj_with_exclusions - if self.axis == 1: - obj = obj.T - - if not len(obj.columns): - # e.g. test_margins_no_values_no_cols - return self._python_apply_general(f, self._selected_obj) - - output: dict[int, ArrayLike] = {} - for idx, (name, ser) in enumerate(obj.items()): - result = self.grouper.agg_series(ser, f) - output[idx] = result - - res = self.obj._constructor(output) - res.columns = obj.columns.copy(deep=False) - return self._wrap_aggregated_output(res) - - def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: - if self.grouper.nkeys != 1: - raise AssertionError("Number of keys must be 1") - - obj = self._obj_with_exclusions - - result: dict[Hashable, NDFrame | np.ndarray] = {} - for name, grp_df in self.grouper.get_iterator(obj, self.axis): - fres = func(grp_df, *args, **kwargs) - result[name] = fres - - result_index = self.grouper.result_index - other_ax = obj.axes[1 - self.axis] - out = self.obj._constructor(result, index=other_ax, columns=result_index) - if self.axis == 0: - out = out.T - - return out - - def _wrap_applied_output( - self, - data: DataFrame, - values: list, - not_indexed_same: bool = False, - is_transform: bool = False, - ): - if len(values) == 0: - if is_transform: - # GH#47787 see test_group_on_empty_multiindex - res_index = data.index - else: - res_index = self.grouper.result_index - - result = self.obj._constructor(index=res_index, columns=data.columns) - result = result.astype(data.dtypes, copy=False) - return result - - # GH12824 - # using values[0] here breaks test_groupby_apply_none_first - first_not_none = next(com.not_none(*values), None) - - if first_not_none is None: - # GH9684 - All values are None, return an empty frame. - return self.obj._constructor() - elif isinstance(first_not_none, DataFrame): - return self._concat_objects( - values, - not_indexed_same=not_indexed_same, - is_transform=is_transform, - ) - - key_index = self.grouper.result_index if self.as_index else None - - if isinstance(first_not_none, (np.ndarray, Index)): - # GH#1738: values is list of arrays of unequal lengths - # fall through to the outer else clause - # TODO: sure this is right? 
we used to do this - # after raising AttributeError above - # GH 18930 - if not is_hashable(self._selection): - # error: Need type annotation for "name" - name = tuple(self._selection) # type: ignore[var-annotated, arg-type] - else: - # error: Incompatible types in assignment - # (expression has type "Hashable", variable - # has type "Tuple[Any, ...]") - name = self._selection # type: ignore[assignment] - return self.obj._constructor_sliced(values, index=key_index, name=name) - elif not isinstance(first_not_none, Series): - # values are not series or array-like but scalars - # self._selection not passed through to Series as the - # result should not take the name of original selection - # of columns - if self.as_index: - return self.obj._constructor_sliced(values, index=key_index) - else: - result = self.obj._constructor(values, columns=[self._selection]) - result = self._insert_inaxis_grouper(result) - return result - else: - # values are Series - return self._wrap_applied_output_series( - values, - not_indexed_same, - first_not_none, - key_index, - is_transform, - ) - - def _wrap_applied_output_series( - self, - values: list[Series], - not_indexed_same: bool, - first_not_none, - key_index: Index | None, - is_transform: bool, - ) -> DataFrame | Series: - kwargs = first_not_none._construct_axes_dict() - backup = Series(**kwargs) - values = [x if (x is not None) else backup for x in values] - - all_indexed_same = all_indexes_same(x.index for x in values) - - if not all_indexed_same: - # GH 8467 - return self._concat_objects( - values, - not_indexed_same=True, - is_transform=is_transform, - ) - - # Combine values - # vstack+constructor is faster than concat and handles MI-columns - stacked_values = np.vstack([np.asarray(v) for v in values]) - - if self.axis == 0: - index = key_index - columns = first_not_none.index.copy() - if columns.name is None: - # GH6124 - propagate name of Series when it's consistent - names = {v.name for v in values} - if len(names) == 1: - columns.name = next(iter(names)) - else: - index = first_not_none.index - columns = key_index - stacked_values = stacked_values.T - - if stacked_values.dtype == object: - # We'll have the DataFrame constructor do inference - stacked_values = stacked_values.tolist() - result = self.obj._constructor(stacked_values, index=index, columns=columns) - - if not self.as_index: - result = self._insert_inaxis_grouper(result) - - return self._reindex_output(result) - - def _cython_transform( - self, - how: str, - numeric_only: bool = False, - axis: AxisInt = 0, - **kwargs, - ) -> DataFrame: - assert axis == 0 # handled by caller - - # With self.axis == 0, we have multi-block tests - # e.g. test_rank_min_int, test_cython_transform_frame - # test_transform_numeric_ret - # With self.axis == 1, _get_data_to_aggregate does a transpose - # so we always have a single block. 
- mgr: Manager2D = self._get_data_to_aggregate( - numeric_only=numeric_only, name=how - ) - - def arr_func(bvalues: ArrayLike) -> ArrayLike: - return self.grouper._cython_operation( - "transform", bvalues, how, 1, **kwargs - ) - - # We could use `mgr.apply` here and not have to set_axis, but - # we would have to do shape gymnastics for ArrayManager compat - res_mgr = mgr.grouped_reduce(arr_func) - res_mgr.set_axis(1, mgr.axes[1]) - - res_df = self.obj._constructor_from_mgr(res_mgr, axes=res_mgr.axes) - res_df = self._maybe_transpose_result(res_df) - return res_df - - def _transform_general(self, func, engine, engine_kwargs, *args, **kwargs): - if maybe_use_numba(engine): - return self._transform_with_numba( - func, *args, engine_kwargs=engine_kwargs, **kwargs - ) - from pandas.core.reshape.concat import concat - - applied = [] - obj = self._obj_with_exclusions - gen = self.grouper.get_iterator(obj, axis=self.axis) - fast_path, slow_path = self._define_paths(func, *args, **kwargs) - - # Determine whether to use slow or fast path by evaluating on the first group. - # Need to handle the case of an empty generator and process the result so that - # it does not need to be computed again. - try: - name, group = next(gen) - except StopIteration: - pass - else: - # 2023-02-27 No tests broken by disabling this pinning - object.__setattr__(group, "name", name) - try: - path, res = self._choose_path(fast_path, slow_path, group) - except ValueError as err: - # e.g. test_transform_with_non_scalar_group - msg = "transform must return a scalar value for each group" - raise ValueError(msg) from err - if group.size > 0: - res = _wrap_transform_general_frame(self.obj, group, res) - applied.append(res) - - # Compute and process with the remaining groups - for name, group in gen: - if group.size == 0: - continue - # 2023-02-27 No tests broken by disabling this pinning - object.__setattr__(group, "name", name) - res = path(group) - - res = _wrap_transform_general_frame(self.obj, group, res) - applied.append(res) - - concat_index = obj.columns if self.axis == 0 else obj.index - other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 - concatenated = concat(applied, axis=self.axis, verify_integrity=False) - concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False) - return self._set_result_index_ordered(concatenated) - - __examples_dataframe_doc = dedent( - """ - >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', - ... 'foo', 'bar'], - ... 'B' : ['one', 'one', 'two', 'three', - ... 'two', 'two'], - ... 'C' : [1, 5, 5, 2, 5, 5], - ... 'D' : [2.0, 5., 8., 1., 2., 9.]}) - >>> grouped = df.groupby('A')[['C', 'D']] - >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) - C D - 0 -1.154701 -0.577350 - 1 0.577350 0.000000 - 2 0.577350 1.154701 - 3 -1.154701 -1.000000 - 4 0.577350 -0.577350 - 5 0.577350 1.000000 - - Broadcast result of the transformation - - >>> grouped.transform(lambda x: x.max() - x.min()) - C D - 0 4.0 6.0 - 1 3.0 8.0 - 2 4.0 6.0 - 3 3.0 8.0 - 4 4.0 6.0 - 5 3.0 8.0 - - >>> grouped.transform("mean") - C D - 0 3.666667 4.0 - 1 4.000000 5.0 - 2 3.666667 4.0 - 3 4.000000 5.0 - 4 3.666667 4.0 - 5 4.000000 5.0 - - .. 
versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``, - for example: - - >>> grouped.transform(lambda x: x.astype(int).max()) - C D - 0 5 8 - 1 5 9 - 2 5 8 - 3 5 9 - 4 5 8 - 5 5 9 - """ - ) - - @Substitution(klass="DataFrame", example=__examples_dataframe_doc) - @Appender(_transform_template) - def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): - return self._transform( - func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs - ) - - def _define_paths(self, func, *args, **kwargs): - if isinstance(func, str): - fast_path = lambda group: getattr(group, func)(*args, **kwargs) - slow_path = lambda group: group.apply( - lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis - ) - else: - fast_path = lambda group: func(group, *args, **kwargs) - slow_path = lambda group: group.apply( - lambda x: func(x, *args, **kwargs), axis=self.axis - ) - return fast_path, slow_path - - def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame): - path = slow_path - res = slow_path(group) - - if self.ngroups == 1: - # no need to evaluate multiple paths when only - # a single group exists - return path, res - - # if we make it here, test if we can use the fast path - try: - res_fast = fast_path(group) - except AssertionError: - raise # pragma: no cover - except Exception: - # GH#29631 For user-defined function, we can't predict what may be - # raised; see test_transform.test_transform_fastpath_raises - return path, res - - # verify fast path returns either: - # a DataFrame with columns equal to group.columns - # OR a Series with index equal to group.columns - if isinstance(res_fast, DataFrame): - if not res_fast.columns.equals(group.columns): - return path, res - elif isinstance(res_fast, Series): - if not res_fast.index.equals(group.columns): - return path, res - else: - return path, res - - if res_fast.equals(res): - path = fast_path - - return path, res - - def filter(self, func, dropna: bool = True, *args, **kwargs): - """ - Filter elements from groups that don't satisfy a criterion. - - Elements from groups are filtered if they do not satisfy the - boolean criterion specified by func. - - Parameters - ---------- - func : function - Criterion to apply to each group. Should return True or False. - dropna : bool - Drop groups that do not pass the filter. True by default; if False, - groups that evaluate False are filled with NaNs. - - Returns - ------- - DataFrame - - Notes - ----- - Each subframe is endowed the attribute 'name' in case you need to know - which group you are working on. - - Functions that mutate the passed object can produce unexpected - behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` - for more details. - - Examples - -------- - >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', - ... 'foo', 'bar'], - ... 'B' : [1, 2, 3, 4, 5, 6], - ... 'C' : [2.0, 5., 8., 1., 2., 9.]}) - >>> grouped = df.groupby('A') - >>> grouped.filter(lambda x: x['B'].mean() > 3.) - A B C - 1 bar 2 5.0 - 3 bar 4 1.0 - 5 bar 6 9.0 - """ - indices = [] - - obj = self._selected_obj - gen = self.grouper.get_iterator(obj, axis=self.axis) - - for name, group in gen: - # 2023-02-27 no tests are broken this pinning, but it is documented in the - # docstring above. 
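The fast-path/slow-path probe in ``_choose_path`` above can be hard to follow inside the diff; the following standalone simplification (not the patch's code, only the idea) evaluates both paths on the first group and keeps the fast whole-frame path only when its result agrees with the column-by-column path::

    import pandas as pd

    def choose_path_sketch(func, first_group: pd.DataFrame):
        # Slow path: apply func column by column (works whenever func does).
        slow = first_group.apply(func)
        try:
            # Fast path: call func on the whole group at once.
            fast = func(first_group)
        except Exception:
            return "slow", slow
        # Adopt the fast path only if it reproduces the slow-path result.
        if isinstance(fast, (pd.DataFrame, pd.Series)) and fast.equals(slow):
            return "fast", fast
        return "slow", slow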
- object.__setattr__(group, "name", name) - - res = func(group, *args, **kwargs) - - try: - res = res.squeeze() - except AttributeError: # allow e.g., scalars and frames to pass - pass - - # interpret the result of the filter - if is_bool(res) or (is_scalar(res) and isna(res)): - if notna(res) and res: - indices.append(self._get_index(name)) - else: - # non scalars aren't allowed - raise TypeError( - f"filter function returned a {type(res).__name__}, " - "but expected a scalar bool" - ) - - return self._apply_filter(indices, dropna) - - def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy: - if self.axis == 1: - # GH 37725 - raise ValueError("Cannot subset columns when using axis=1") - # per GH 23566 - if isinstance(key, tuple) and len(key) > 1: - # if len == 1, then it becomes a SeriesGroupBy and this is actually - # valid syntax, so don't raise - raise ValueError( - "Cannot subset columns with a tuple with more than one element. " - "Use a list instead." - ) - return super().__getitem__(key) - - def _gotitem(self, key, ndim: int, subset=None): - """ - sub-classes to define - return a sliced object - - Parameters - ---------- - key : string / list of selections - ndim : {1, 2} - requested ndim of result - subset : object, default None - subset to act on - """ - if ndim == 2: - if subset is None: - subset = self.obj - return DataFrameGroupBy( - subset, - self.keys, - axis=self.axis, - level=self.level, - grouper=self.grouper, - exclusions=self.exclusions, - selection=key, - as_index=self.as_index, - sort=self.sort, - group_keys=self.group_keys, - observed=self.observed, - dropna=self.dropna, - ) - elif ndim == 1: - if subset is None: - subset = self.obj[key] - return SeriesGroupBy( - subset, - self.keys, - level=self.level, - grouper=self.grouper, - exclusions=self.exclusions, - selection=key, - as_index=self.as_index, - sort=self.sort, - group_keys=self.group_keys, - observed=self.observed, - dropna=self.dropna, - ) - - raise AssertionError("invalid ndim for _gotitem") - - def _get_data_to_aggregate( - self, *, numeric_only: bool = False, name: str | None = None - ) -> Manager2D: - obj = self._obj_with_exclusions - if self.axis == 1: - mgr = obj.T._mgr - else: - mgr = obj._mgr - - if numeric_only: - mgr = mgr.get_numeric_data(copy=False) - return mgr - - def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: - return self.obj._constructor_from_mgr(mgr, axes=mgr.axes) - - def _apply_to_column_groupbys(self, func) -> DataFrame: - from pandas.core.reshape.concat import concat - - obj = self._obj_with_exclusions - columns = obj.columns - sgbs = [ - SeriesGroupBy( - obj.iloc[:, i], - selection=colname, - grouper=self.grouper, - exclusions=self.exclusions, - observed=self.observed, - ) - for i, colname in enumerate(obj.columns) - ] - results = [func(sgb) for sgb in sgbs] - - if not len(results): - # concat would raise - res_df = DataFrame([], columns=columns, index=self.grouper.result_index) - else: - res_df = concat(results, keys=columns, axis=1) - - if not self.as_index: - res_df.index = default_index(len(res_df)) - res_df = self._insert_inaxis_grouper(res_df) - return res_df - - def nunique(self, dropna: bool = True) -> DataFrame: - """ - Return DataFrame with counts of unique elements in each position. - - Parameters - ---------- - dropna : bool, default True - Don't include NaN in the counts. - - Returns - ------- - nunique: DataFrame - - Examples - -------- - >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', - ... 'ham', 'ham'], - ... 
'value1': [1, 5, 5, 2, 5, 5], - ... 'value2': list('abbaxy')}) - >>> df - id value1 value2 - 0 spam 1 a - 1 egg 5 b - 2 egg 5 b - 3 spam 2 a - 4 ham 5 x - 5 ham 5 y - - >>> df.groupby('id').nunique() - value1 value2 - id - egg 1 1 - ham 1 2 - spam 2 1 - - Check for rows with the same id but conflicting values: - - >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any()) - id value1 value2 - 0 spam 1 a - 3 spam 2 a - 4 ham 5 x - 5 ham 5 y - """ - - if self.axis != 0: - # see test_groupby_crash_on_nunique - return self._python_apply_general( - lambda sgb: sgb.nunique(dropna), self._obj_with_exclusions, is_agg=True - ) - - return self._apply_to_column_groupbys(lambda sgb: sgb.nunique(dropna)) - - def idxmax( - self, - axis: Axis | None | lib.NoDefault = lib.no_default, - skipna: bool = True, - numeric_only: bool = False, - ) -> DataFrame: - """ - Return index of first occurrence of maximum over requested axis. - - NA/null values are excluded. - - Parameters - ---------- - axis : {{0 or 'index', 1 or 'columns'}}, default None - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - If axis is not provided, grouper's axis is used. - - .. versionchanged:: 2.0.0 - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - numeric_only : bool, default False - Include only `float`, `int` or `boolean` data. - - .. versionadded:: 1.5.0 - - Returns - ------- - Series - Indexes of maxima along the specified axis. - - Raises - ------ - ValueError - * If the row/column is empty - - See Also - -------- - Series.idxmax : Return index of the maximum element. - - Notes - ----- - This method is the DataFrame version of ``ndarray.argmax``. - - Examples - -------- - Consider a dataset containing food consumption in Argentina. - - >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], - ... 'co2_emissions': [37.2, 19.66, 1712]}, - ... index=['Pork', 'Wheat Products', 'Beef']) - - >>> df - consumption co2_emissions - Pork 10.51 37.20 - Wheat Products 103.11 19.66 - Beef 55.48 1712.00 - - By default, it returns the index for the maximum value in each column. - - >>> df.idxmax() - consumption Wheat Products - co2_emissions Beef - dtype: object - - To return the index for the maximum value in each row, use ``axis="columns"``. - - >>> df.idxmax(axis="columns") - Pork co2_emissions - Wheat Products consumption - Beef co2_emissions - dtype: object - """ - return self._idxmax_idxmin( - "idxmax", axis=axis, numeric_only=numeric_only, skipna=skipna - ) - - def idxmin( - self, - axis: Axis | None | lib.NoDefault = lib.no_default, - skipna: bool = True, - numeric_only: bool = False, - ) -> DataFrame: - """ - Return index of first occurrence of minimum over requested axis. - - NA/null values are excluded. - - Parameters - ---------- - axis : {{0 or 'index', 1 or 'columns'}}, default None - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - If axis is not provided, grouper's axis is used. - - .. versionchanged:: 2.0.0 - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - numeric_only : bool, default False - Include only `float`, `int` or `boolean` data. - - .. 
versionadded:: 1.5.0 - - Returns - ------- - Series - Indexes of minima along the specified axis. - - Raises - ------ - ValueError - * If the row/column is empty - - See Also - -------- - Series.idxmin : Return index of the minimum element. - - Notes - ----- - This method is the DataFrame version of ``ndarray.argmin``. - - Examples - -------- - Consider a dataset containing food consumption in Argentina. - - >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], - ... 'co2_emissions': [37.2, 19.66, 1712]}, - ... index=['Pork', 'Wheat Products', 'Beef']) - - >>> df - consumption co2_emissions - Pork 10.51 37.20 - Wheat Products 103.11 19.66 - Beef 55.48 1712.00 - - By default, it returns the index for the minimum value in each column. - - >>> df.idxmin() - consumption Pork - co2_emissions Wheat Products - dtype: object - - To return the index for the minimum value in each row, use ``axis="columns"``. - - >>> df.idxmin(axis="columns") - Pork consumption - Wheat Products co2_emissions - Beef consumption - dtype: object - """ - return self._idxmax_idxmin( - "idxmin", axis=axis, numeric_only=numeric_only, skipna=skipna - ) - - boxplot = boxplot_frame_groupby - - def value_counts( - self, - subset: Sequence[Hashable] | None = None, - normalize: bool = False, - sort: bool = True, - ascending: bool = False, - dropna: bool = True, - ) -> DataFrame | Series: - """ - Return a Series or DataFrame containing counts of unique rows. - - .. versionadded:: 1.4.0 - - Parameters - ---------- - subset : list-like, optional - Columns to use when counting unique combinations. - normalize : bool, default False - Return proportions rather than frequencies. - sort : bool, default True - Sort by frequencies. - ascending : bool, default False - Sort in ascending order. - dropna : bool, default True - Don't include counts of rows that contain NA values. - - Returns - ------- - Series or DataFrame - Series if the groupby as_index is True, otherwise DataFrame. - - See Also - -------- - Series.value_counts: Equivalent method on Series. - DataFrame.value_counts: Equivalent method on DataFrame. - SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy. - - Notes - ----- - - If the groupby as_index is True then the returned Series will have a - MultiIndex with one level per input column. - - If the groupby as_index is False then the returned DataFrame will have an - additional column with the value_counts. The column is labelled 'count' or - 'proportion', depending on the ``normalize`` parameter. - - By default, rows that contain any NA values are omitted from - the result. - - By default, the result will be in descending order so that the - first element of each group is the most frequently-occurring row. - - Examples - -------- - >>> df = pd.DataFrame({ - ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], - ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], - ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] - ... 
}) - - >>> df - gender education country - 0 male low US - 1 male medium FR - 2 female high US - 3 male low FR - 4 female high FR - 5 male low FR - - >>> df.groupby('gender').value_counts() - gender education country - female high FR 1 - US 1 - male low FR 2 - US 1 - medium FR 1 - Name: count, dtype: int64 - - >>> df.groupby('gender').value_counts(ascending=True) - gender education country - female high FR 1 - US 1 - male low US 1 - medium FR 1 - low FR 2 - Name: count, dtype: int64 - - >>> df.groupby('gender').value_counts(normalize=True) - gender education country - female high FR 0.50 - US 0.50 - male low FR 0.50 - US 0.25 - medium FR 0.25 - Name: proportion, dtype: float64 - - >>> df.groupby('gender', as_index=False).value_counts() - gender education country count - 0 female high FR 1 - 1 female high US 1 - 2 male low FR 2 - 3 male low US 1 - 4 male medium FR 1 - - >>> df.groupby('gender', as_index=False).value_counts(normalize=True) - gender education country proportion - 0 female high FR 0.50 - 1 female high US 0.50 - 2 male low FR 0.50 - 3 male low US 0.25 - 4 male medium FR 0.25 - """ - return self._value_counts(subset, normalize, sort, ascending, dropna) - - def fillna( - self, - value: Hashable | Mapping | Series | DataFrame | None = None, - method: FillnaOptions | None = None, - axis: Axis | None | lib.NoDefault = lib.no_default, - inplace: bool = False, - limit: int | None = None, - downcast=lib.no_default, - ) -> DataFrame | None: - """ - Fill NA/NaN values using the specified method within groups. - - Parameters - ---------- - value : scalar, dict, Series, or DataFrame - Value to use to fill holes (e.g. 0), alternately a - dict/Series/DataFrame of values specifying which value to use for - each index (for a Series) or column (for a DataFrame). Values not - in the dict/Series/DataFrame will not be filled. This value cannot - be a list. Users wanting to use the ``value`` argument and not ``method`` - should prefer :meth:`.DataFrame.fillna` as this - will produce the same result and be more performant. - method : {{'bfill', 'ffill', None}}, default None - Method to use for filling holes. ``'ffill'`` will propagate - the last valid observation forward within a group. - ``'bfill'`` will use next valid observation to fill the gap. - axis : {0 or 'index', 1 or 'columns'} - Axis along which to fill missing values. When the :class:`DataFrameGroupBy` - ``axis`` argument is ``0``, using ``axis=1`` here will produce - the same results as :meth:`.DataFrame.fillna`. When the - :class:`DataFrameGroupBy` ``axis`` argument is ``1``, using ``axis=0`` - or ``axis=1`` here will produce the same results. - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - - inplace : bool, default False - Broken. Do not set to True. - limit : int, default None - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill within a group. In other words, - if there is a gap with more than this number of consecutive NaNs, - it will only be partially filled. If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. Must be greater than 0 if not None. - downcast : dict, default is None - A dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible). - - .. 
deprecated:: 2.1.0 - - Returns - ------- - DataFrame - Object with missing values filled. - - See Also - -------- - ffill : Forward fill values within a group. - bfill : Backward fill values within a group. - - Examples - -------- - >>> df = pd.DataFrame( - ... { - ... "key": [0, 0, 1, 1, 1], - ... "A": [np.nan, 2, np.nan, 3, np.nan], - ... "B": [2, 3, np.nan, np.nan, np.nan], - ... "C": [np.nan, np.nan, 2, np.nan, np.nan], - ... } - ... ) - >>> df - key A B C - 0 0 NaN 2.0 NaN - 1 0 2.0 3.0 NaN - 2 1 NaN NaN 2.0 - 3 1 3.0 NaN NaN - 4 1 NaN NaN NaN - - Propagate non-null values forward or backward within each group along columns. - - >>> df.groupby("key").fillna(method="ffill") - A B C - 0 NaN 2.0 NaN - 1 2.0 3.0 NaN - 2 NaN NaN 2.0 - 3 3.0 NaN 2.0 - 4 3.0 NaN 2.0 - - >>> df.groupby("key").fillna(method="bfill") - A B C - 0 2.0 2.0 NaN - 1 2.0 3.0 NaN - 2 3.0 NaN 2.0 - 3 3.0 NaN NaN - 4 NaN NaN NaN - - Propagate non-null values forward or backward within each group along rows. - - >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna(method="ffill").T - key A B C - 0 0.0 0.0 2.0 2.0 - 1 0.0 2.0 3.0 3.0 - 2 1.0 1.0 NaN 2.0 - 3 1.0 3.0 NaN NaN - 4 1.0 1.0 NaN NaN - - >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna(method="bfill").T - key A B C - 0 0.0 NaN 2.0 NaN - 1 0.0 2.0 3.0 NaN - 2 1.0 NaN 2.0 2.0 - 3 1.0 3.0 NaN NaN - 4 1.0 NaN NaN NaN - - Only replace the first NaN element within a group along rows. - - >>> df.groupby("key").fillna(method="ffill", limit=1) - A B C - 0 NaN 2.0 NaN - 1 2.0 3.0 NaN - 2 NaN NaN 2.0 - 3 3.0 NaN 2.0 - 4 3.0 NaN NaN - """ - if method is not None: - warnings.warn( - f"{type(self).__name__}.fillna with 'method' is deprecated and " - "will raise in a future version. Use obj.ffill() or obj.bfill() " - "instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - result = self._op_via_apply( - "fillna", - value=value, - method=method, - axis=axis, - inplace=inplace, - limit=limit, - downcast=downcast, - ) - return result - - def take( - self, - indices: TakeIndexer, - axis: Axis | None | lib.NoDefault = lib.no_default, - **kwargs, - ) -> DataFrame: - """ - Return the elements in the given *positional* indices in each group. - - This means that we are not indexing according to actual values in - the index attribute of the object. We are indexing according to the - actual position of the element in the object. - - If a requested index does not exist for some group, this method will raise. - To get similar behavior that ignores indices that don't exist, see - :meth:`.DataFrameGroupBy.nth`. - - Parameters - ---------- - indices : array-like - An array of ints indicating which positions to take. - axis : {0 or 'index', 1 or 'columns', None}, default 0 - The axis on which to select elements. ``0`` means that we are - selecting rows, ``1`` means that we are selecting columns. - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - - **kwargs - For compatibility with :meth:`numpy.take`. Has no effect on the - output. - - Returns - ------- - DataFrame - An DataFrame containing the elements taken from each group. - - See Also - -------- - DataFrame.take : Take elements from a Series along an axis. - DataFrame.loc : Select a subset of a DataFrame by labels. - DataFrame.iloc : Select a subset of a DataFrame by positions. - numpy.take : Take elements from an array along an axis. - - Examples - -------- - >>> df = pd.DataFrame([('falcon', 'bird', 389.0), - ... ('parrot', 'bird', 24.0), - ... 
('lion', 'mammal', 80.5), - ... ('monkey', 'mammal', np.nan), - ... ('rabbit', 'mammal', 15.0)], - ... columns=['name', 'class', 'max_speed'], - ... index=[4, 3, 2, 1, 0]) - >>> df - name class max_speed - 4 falcon bird 389.0 - 3 parrot bird 24.0 - 2 lion mammal 80.5 - 1 monkey mammal NaN - 0 rabbit mammal 15.0 - >>> gb = df.groupby([1, 1, 2, 2, 2]) - - Take elements at positions 0 and 1 along the axis 0 (default). - - Note how the indices selected in the result do not correspond to - our input indices 0 and 1. That's because we are selecting the 0th - and 1st rows, not rows whose indices equal 0 and 1. - - >>> gb.take([0, 1]) - name class max_speed - 1 4 falcon bird 389.0 - 3 parrot bird 24.0 - 2 2 lion mammal 80.5 - 1 monkey mammal NaN - - The order of the specified indices influences the order in the result. - Here, the order is swapped from the previous example. - - >>> gb.take([1, 0]) - name class max_speed - 1 3 parrot bird 24.0 - 4 falcon bird 389.0 - 2 1 monkey mammal NaN - 2 lion mammal 80.5 - - Take elements at indices 1 and 2 along the axis 1 (column selection). - - We may take elements using negative integers for positive indices, - starting from the end of the object, just like with Python lists. - - >>> gb.take([-1, -2]) - name class max_speed - 1 3 parrot bird 24.0 - 4 falcon bird 389.0 - 2 0 rabbit mammal 15.0 - 1 monkey mammal NaN - """ - result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs) - return result - - def skew( - self, - axis: Axis | None | lib.NoDefault = lib.no_default, - skipna: bool = True, - numeric_only: bool = False, - **kwargs, - ) -> DataFrame: - """ - Return unbiased skew within groups. - - Normalized by N-1. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns', None}, default 0 - Axis for the function to be applied on. - - Specifying ``axis=None`` will apply the aggregation across both axes. - - .. versionadded:: 2.0.0 - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - - skipna : bool, default True - Exclude NA/null values when computing the result. - - numeric_only : bool, default False - Include only float, int, boolean columns. - - **kwargs - Additional keyword arguments to be passed to the function. - - Returns - ------- - DataFrame - - See Also - -------- - DataFrame.skew : Return unbiased skew over requested axis. - - Examples - -------- - >>> arrays = [['falcon', 'parrot', 'cockatoo', 'kiwi', - ... 'lion', 'monkey', 'rabbit'], - ... ['bird', 'bird', 'bird', 'bird', - ... 'mammal', 'mammal', 'mammal']] - >>> index = pd.MultiIndex.from_arrays(arrays, names=('name', 'class')) - >>> df = pd.DataFrame({'max_speed': [389.0, 24.0, 70.0, np.nan, - ... 80.5, 21.5, 15.0]}, - ... index=index) - >>> df - max_speed - name class - falcon bird 389.0 - parrot bird 24.0 - cockatoo bird 70.0 - kiwi bird NaN - lion mammal 80.5 - monkey mammal 21.5 - rabbit mammal 15.0 - >>> gb = df.groupby(["class"]) - >>> gb.skew() - max_speed - class - bird 1.628296 - mammal 1.669046 - >>> gb.skew(skipna=False) - max_speed - class - bird NaN - mammal 1.669046 - """ - if axis is lib.no_default: - axis = 0 - - if axis != 0: - result = self._op_via_apply( - "skew", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - return result - - def alt(obj): - # This should not be reached since the cython path should raise - # TypeError and not NotImplementedError. 
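As a quick illustration of the ``numeric_only`` handling and the ``alt`` fallback above (the frame and names below are mine, and the exact error text may differ): calling ``skew`` on groups that contain object-dtype columns is expected to raise, while ``numeric_only=True`` restricts the computation to numeric columns::

    import pandas as pd

    df = pd.DataFrame({"cls": ["a", "a", "b", "b"],
                       "speed": [1.0, 2.0, 3.0, 10.0],
                       "label": ["x", "y", "x", "y"]})
    gb = df.groupby("cls")

    try:
        gb.skew()                      # object column "label" should raise
    except TypeError as err:
        print(err)

    print(gb.skew(numeric_only=True))  # only the "speed" column is used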
- raise TypeError(f"'skew' is not supported for dtype={obj.dtype}") - - return self._cython_agg_general( - "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs - ) - - @property - @doc(DataFrame.plot.__doc__) - def plot(self) -> GroupByPlot: - result = GroupByPlot(self) - return result - - @doc(DataFrame.corr.__doc__) - def corr( - self, - method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson", - min_periods: int = 1, - numeric_only: bool = False, - ) -> DataFrame: - result = self._op_via_apply( - "corr", method=method, min_periods=min_periods, numeric_only=numeric_only - ) - return result - - @doc(DataFrame.cov.__doc__) - def cov( - self, - min_periods: int | None = None, - ddof: int | None = 1, - numeric_only: bool = False, - ) -> DataFrame: - result = self._op_via_apply( - "cov", min_periods=min_periods, ddof=ddof, numeric_only=numeric_only - ) - return result - - @doc(DataFrame.hist.__doc__) - def hist( - self, - column: IndexLabel | None = None, - by=None, - grid: bool = True, - xlabelsize: int | None = None, - xrot: float | None = None, - ylabelsize: int | None = None, - yrot: float | None = None, - ax=None, - sharex: bool = False, - sharey: bool = False, - figsize: tuple[int, int] | None = None, - layout: tuple[int, int] | None = None, - bins: int | Sequence[int] = 10, - backend: str | None = None, - legend: bool = False, - **kwargs, - ): - result = self._op_via_apply( - "hist", - column=column, - by=by, - grid=grid, - xlabelsize=xlabelsize, - xrot=xrot, - ylabelsize=ylabelsize, - yrot=yrot, - ax=ax, - sharex=sharex, - sharey=sharey, - figsize=figsize, - layout=layout, - bins=bins, - backend=backend, - legend=legend, - **kwargs, - ) - return result - - @property - @doc(DataFrame.dtypes.__doc__) - def dtypes(self) -> Series: - # GH#51045 - warnings.warn( - f"{type(self).__name__}.dtypes is deprecated and will be removed in " - "a future version. 
Check the dtypes on the base object instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - - # error: Incompatible return value type (got "DataFrame", expected "Series") - return self._python_apply_general( # type: ignore[return-value] - lambda df: df.dtypes, self._selected_obj - ) - - @doc(DataFrame.corrwith.__doc__) - def corrwith( - self, - other: DataFrame | Series, - axis: Axis | lib.NoDefault = lib.no_default, - drop: bool = False, - method: CorrelationMethod = "pearson", - numeric_only: bool = False, - ) -> DataFrame: - result = self._op_via_apply( - "corrwith", - other=other, - axis=axis, - drop=drop, - method=method, - numeric_only=numeric_only, - ) - return result +from pandas._core.groupby import generic +from pandas.core.common import _depr_core +_depr_core() -def _wrap_transform_general_frame( - obj: DataFrame, group: DataFrame, res: DataFrame | Series -) -> DataFrame: - from pandas import concat +_globals = globals() - if isinstance(res, Series): - # we need to broadcast across the - # other dimension; this will preserve dtypes - # GH14457 - if res.index.is_(obj.index): - res_frame = concat([res] * len(group.columns), axis=1) - res_frame.columns = group.columns - res_frame.index = group.index - else: - res_frame = obj._constructor( - np.tile(res.values, (len(group.index), 1)), - columns=group.columns, - index=group.index, - ) - assert isinstance(res_frame, DataFrame) - return res_frame - elif isinstance(res, DataFrame) and not res.index.is_(group.index): - return res._align_frame(group)[0] - else: - return res +for item in generic.__dir__(): + _globals[item] = getattr(generic, item) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3412f18a40313..b409cadc12aa3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1,5951 +1,11 @@ -""" -Provide the groupby split-apply-combine paradigm. Define the GroupBy -class providing the base-class of operations. - -The SeriesGroupBy and DataFrameGroupBy sub-class -(defined in pandas.core.groupby.generic) -expose these user-facing objects to provide specific functionality. 
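The ``+`` lines above replace the module body with a re-export shim: import the relocated implementation, emit the deprecation warning at import time, and copy the new module's attributes into the old namespace. A self-contained sketch of the same pattern (the ``oldpkg``/``newpkg`` names are placeholders, not part of the patch)::

    # oldpkg/legacy_module.py -- kept only as a deprecated import location
    import warnings

    from newpkg import real_module  # relocated implementation

    warnings.warn(
        "oldpkg.legacy_module is deprecated; import from newpkg instead.",
        DeprecationWarning,
        stacklevel=2,
    )

    # Re-export so `from oldpkg.legacy_module import X` keeps working.
    _globals = globals()
    for _name in real_module.__dir__():
        _globals[_name] = getattr(real_module, _name)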
-""" from __future__ import annotations -from collections.abc import ( - Hashable, - Iterator, - Mapping, - Sequence, -) -import datetime -from functools import ( - partial, - wraps, -) -import inspect -from textwrap import dedent -from typing import ( - TYPE_CHECKING, - Callable, - Literal, - TypeVar, - Union, - cast, - final, -) -import warnings - -import numpy as np - -from pandas._config.config import option_context - -from pandas._libs import ( - Timestamp, - lib, -) -from pandas._libs.algos import rank_1d -import pandas._libs.groupby as libgroupby -from pandas._libs.missing import NA -from pandas._typing import ( - AnyArrayLike, - ArrayLike, - Axis, - AxisInt, - DtypeObj, - FillnaOptions, - IndexLabel, - NDFrameT, - PositionalIndexer, - RandomState, - Scalar, - T, - npt, -) -from pandas.compat.numpy import function as nv -from pandas.errors import ( - AbstractMethodError, - DataError, -) -from pandas.util._decorators import ( - Appender, - Substitution, - cache_readonly, - doc, -) -from pandas.util._exceptions import find_stack_level - -from pandas.core.dtypes.cast import ( - coerce_indexer_dtype, - ensure_dtype_can_hold_na, -) -from pandas.core.dtypes.common import ( - is_bool_dtype, - is_float_dtype, - is_hashable, - is_integer, - is_integer_dtype, - is_list_like, - is_numeric_dtype, - is_object_dtype, - is_scalar, - needs_i8_conversion, -) -from pandas.core.dtypes.missing import ( - isna, - na_value_for_dtype, - notna, -) - -from pandas.core import ( - algorithms, - sample, -) -from pandas.core._numba import executor -from pandas.core.apply import warn_alias_replacement -from pandas.core.arrays import ( - ArrowExtensionArray, - BaseMaskedArray, - Categorical, - ExtensionArray, - FloatingArray, - IntegerArray, - SparseArray, -) -from pandas.core.arrays.string_ import StringDtype -from pandas.core.arrays.string_arrow import ( - ArrowStringArray, - ArrowStringArrayNumpySemantics, -) -from pandas.core.base import ( - PandasObject, - SelectionMixin, -) -import pandas.core.common as com -from pandas.core.frame import DataFrame -from pandas.core.generic import NDFrame -from pandas.core.groupby import ( - base, - numba_, - ops, -) -from pandas.core.groupby.grouper import get_grouper -from pandas.core.groupby.indexing import ( - GroupByIndexingMixin, - GroupByNthSelector, -) -from pandas.core.indexes.api import ( - CategoricalIndex, - Index, - MultiIndex, - RangeIndex, - default_index, -) -from pandas.core.internals.blocks import ensure_block_shape -from pandas.core.series import Series -from pandas.core.sorting import get_group_index_sorter -from pandas.core.util.numba_ import ( - get_jit_arguments, - maybe_use_numba, -) - -if TYPE_CHECKING: - from typing import Any - - from pandas.core.resample import Resampler - from pandas.core.window import ( - ExpandingGroupby, - ExponentialMovingWindowGroupby, - RollingGroupby, - ) - -_common_see_also = """ - See Also - -------- - Series.%(name)s : Apply a function %(name)s to a Series. - DataFrame.%(name)s : Apply a function %(name)s - to each row or column of a DataFrame. -""" - -_apply_docs = { - "template": """ - Apply function ``func`` group-wise and combine the results together. - - The function passed to ``apply`` must take a {input} as its first - argument and return a DataFrame, Series or scalar. ``apply`` will - then take care of combining the results back together into a single - dataframe or series. ``apply`` is therefore a highly flexible - grouping method. 
- - While ``apply`` is a very flexible method, its downside is that - using it can be quite a bit slower than using more specific methods - like ``agg`` or ``transform``. Pandas offers a wide range of method that will - be much faster than using ``apply`` for their specific purposes, so try to - use them before reaching for ``apply``. - - Parameters - ---------- - func : callable - A callable that takes a {input} as its first argument, and - returns a dataframe, a series or a scalar. In addition the - callable may take positional and keyword arguments. - include_groups : bool, default True - When True, will attempt to apply ``func`` to the groupings in - the case that they are columns of the DataFrame. If this raises a - TypeError, the result will be computed with the groupings excluded. - When False, the groupings will be excluded when applying ``func``. - - .. versionadded:: 2.2.0 - - .. deprecated:: 2.2.0 - - Setting include_groups to True is deprecated. Only the value - False will be allowed in a future version of pandas. - - args, kwargs : tuple and dict - Optional positional and keyword arguments to pass to ``func``. - - Returns - ------- - Series or DataFrame - - See Also - -------- - pipe : Apply function to the full GroupBy object instead of to each - group. - aggregate : Apply aggregate function to the GroupBy object. - transform : Apply function column-by-column to the GroupBy object. - Series.apply : Apply a function to a Series. - DataFrame.apply : Apply a function to each row or column of a DataFrame. - - Notes - ----- - - .. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``, - see the examples below. - - Functions that mutate the passed object can produce unexpected - behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` - for more details. - - Examples - -------- - {examples} - """, - "dataframe_examples": """ - >>> df = pd.DataFrame({'A': 'a a b'.split(), - ... 'B': [1,2,3], - ... 'C': [4,6,5]}) - >>> g1 = df.groupby('A', group_keys=False) - >>> g2 = df.groupby('A', group_keys=True) - - Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only - differ in their ``group_keys`` argument. Calling `apply` in various ways, - we can get different grouping results: - - Example 1: below the function passed to `apply` takes a DataFrame as - its argument and returns a DataFrame. `apply` combines the result for - each group together into a new DataFrame: - - >>> g1[['B', 'C']].apply(lambda x: x / x.sum()) - B C - 0 0.333333 0.4 - 1 0.666667 0.6 - 2 1.000000 1.0 - - In the above, the groups are not part of the index. We can have them included - by using ``g2`` where ``group_keys=True``: - - >>> g2[['B', 'C']].apply(lambda x: x / x.sum()) - B C - A - a 0 0.333333 0.4 - 1 0.666667 0.6 - b 2 1.000000 1.0 - - Example 2: The function passed to `apply` takes a DataFrame as - its argument and returns a Series. `apply` combines the result for - each group together into a new DataFrame. - - .. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``. - - >>> g1[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min()) - B C - A - a 1.0 2.0 - b 0.0 0.0 - - >>> g2[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min()) - B C - A - a 1.0 2.0 - b 0.0 0.0 - - The ``group_keys`` argument has no effect here because the result is not - like-indexed (i.e. :ref:`a transform `) when compared - to the input. 
- - Example 3: The function passed to `apply` takes a DataFrame as - its argument and returns a scalar. `apply` combines the result for - each group together into a Series, including setting the index as - appropriate: - - >>> g1.apply(lambda x: x.C.max() - x.B.min(), include_groups=False) - A - a 5 - b 2 - dtype: int64""", - "series_examples": """ - >>> s = pd.Series([0, 1, 2], index='a a b'.split()) - >>> g1 = s.groupby(s.index, group_keys=False) - >>> g2 = s.groupby(s.index, group_keys=True) - - From ``s`` above we can see that ``g`` has two groups, ``a`` and ``b``. - Notice that ``g1`` have ``g2`` have two groups, ``a`` and ``b``, and only - differ in their ``group_keys`` argument. Calling `apply` in various ways, - we can get different grouping results: - - Example 1: The function passed to `apply` takes a Series as - its argument and returns a Series. `apply` combines the result for - each group together into a new Series. - - .. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``. - - >>> g1.apply(lambda x: x*2 if x.name == 'a' else x/2) - a 0.0 - a 2.0 - b 1.0 - dtype: float64 - - In the above, the groups are not part of the index. We can have them included - by using ``g2`` where ``group_keys=True``: - - >>> g2.apply(lambda x: x*2 if x.name == 'a' else x/2) - a a 0.0 - a 2.0 - b b 1.0 - dtype: float64 - - Example 2: The function passed to `apply` takes a Series as - its argument and returns a scalar. `apply` combines the result for - each group together into a Series, including setting the index as - appropriate: - - >>> g1.apply(lambda x: x.max() - x.min()) - a 1 - b 0 - dtype: int64 - - The ``group_keys`` argument has no effect here because the result is not - like-indexed (i.e. :ref:`a transform `) when compared - to the input. - - >>> g2.apply(lambda x: x.max() - x.min()) - a 1 - b 0 - dtype: int64""", -} - -_groupby_agg_method_template = """ -Compute {fname} of group values. - -Parameters ----------- -numeric_only : bool, default {no} - Include only float, int, boolean columns. - - .. versionchanged:: 2.0.0 - - numeric_only no longer accepts ``None``. - -min_count : int, default {mc} - The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. - -Returns -------- -Series or DataFrame - Computed {fname} of values within each group. - -Examples --------- -{example} -""" - -_groupby_agg_method_engine_template = """ -Compute {fname} of group values. - -Parameters ----------- -numeric_only : bool, default {no} - Include only float, int, boolean columns. - - .. versionchanged:: 2.0.0 - - numeric_only no longer accepts ``None``. - -min_count : int, default {mc} - The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. - -engine : str, default None {e} - * ``'cython'`` : Runs rolling apply through C-extensions from cython. - * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. - Only available when ``raw`` is set to ``True``. - * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - -engine_kwargs : dict, default None {ek} - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. 
The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be - applied to both the ``func`` and the ``apply`` groupby aggregation. - -Returns -------- -Series or DataFrame - Computed {fname} of values within each group. - -Examples --------- -{example} -""" - -_pipe_template = """ -Apply a ``func`` with arguments to this %(klass)s object and return its result. - -Use `.pipe` when you want to improve readability by chaining together -functions that expect Series, DataFrames, GroupBy or Resampler objects. -Instead of writing - ->>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c) # doctest: +SKIP - -You can write - ->>> (df.groupby('group') -... .pipe(f) -... .pipe(g, arg1=a) -... .pipe(h, arg2=b, arg3=c)) # doctest: +SKIP - -which is much more readable. - -Parameters ----------- -func : callable or tuple of (callable, str) - Function to apply to this %(klass)s object or, alternatively, - a `(callable, data_keyword)` tuple where `data_keyword` is a - string indicating the keyword of `callable` that expects the - %(klass)s object. -args : iterable, optional - Positional arguments passed into `func`. -kwargs : dict, optional - A dictionary of keyword arguments passed into `func`. - -Returns -------- -the return type of `func`. - -See Also --------- -Series.pipe : Apply a function with arguments to a series. -DataFrame.pipe: Apply a function with arguments to a dataframe. -apply : Apply function to each group instead of to the - full %(klass)s object. - -Notes ------ -See more `here -`_ - -Examples --------- -%(examples)s -""" - -_transform_template = """ -Call function producing a same-indexed %(klass)s on each group. - -Returns a %(klass)s having the same indexes as the original object -filled with the transformed values. - -Parameters ----------- -f : function, str - Function to apply to each group. See the Notes section below for requirements. - - Accepted inputs are: - - - String - - Python function - - Numba JIT function with ``engine='numba'`` specified. - - Only passing a single function is supported with this engine. - If the ``'numba'`` engine is chosen, the function must be - a user defined function with ``values`` and ``index`` as the - first and second arguments respectively in the function signature. - Each group's index will be passed to the user defined function - and optionally available for use. - - If a string is chosen, then it needs to be the name - of the groupby method you want to use. -*args - Positional arguments to pass to func. -engine : str, default None - * ``'cython'`` : Runs the function through C-extensions from cython. - * ``'numba'`` : Runs the function through JIT compiled code from numba. - * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba`` - -engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be - applied to the function - -**kwargs - Keyword arguments to be passed into func. - -Returns -------- -%(klass)s - -See Also --------- -%(klass)s.groupby.apply : Apply function ``func`` group-wise and combine - the results together. 
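Where the templates above describe ``engine='numba'``, the user-defined function receives the group's values and index as NumPy arrays rather than pandas objects. A hedged example (requires the optional numba dependency; the frame and function names are illustrative)::

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b", "b"],
                       "val": [1.0, 2.0, 3.0, 4.0]})

    def demean(values, index):
        # Signature expected by the numba engine: (values, index) as arrays.
        return values - values.mean()

    out = df.groupby("key")["val"].transform(
        demean,
        engine="numba",
        engine_kwargs={"nopython": True, "nogil": False, "parallel": False},
    )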
-%(klass)s.groupby.aggregate : Aggregate using one or more - operations over the specified axis. -%(klass)s.transform : Call ``func`` on self producing a %(klass)s with the - same axis shape as self. - -Notes ------ -Each group is endowed the attribute 'name' in case you need to know -which group you are working on. - -The current implementation imposes three requirements on f: - -* f must return a value that either has the same shape as the input - subframe or can be broadcast to the shape of the input subframe. - For example, if `f` returns a scalar it will be broadcast to have the - same shape as the input subframe. -* if this is a DataFrame, f must support application column-by-column - in the subframe. If f also supports application to the entire subframe, - then a fast path is used starting from the second chunk. -* f must not mutate groups. Mutation is not supported and may - produce unexpected results. See :ref:`gotchas.udf-mutation` for more details. - -When using ``engine='numba'``, there will be no "fall back" behavior internally. -The group data and group index will be passed as numpy arrays to the JITed -user defined function, and no alternative execution attempts will be tried. - -.. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``, - see the examples below. - -.. versionchanged:: 2.0.0 - - When using ``.transform`` on a grouped DataFrame and the transformation function - returns a DataFrame, pandas now aligns the result's index - with the input's index. You can call ``.to_numpy()`` on the - result of the transformation function to avoid alignment. - -Examples --------- -%(example)s""" - -_agg_template_series = """ -Aggregate using one or more operations over the specified axis. - -Parameters ----------- -func : function, str, list, dict or None - Function to use for aggregating the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` - - None, in which case ``**kwargs`` are used with Named Aggregation. Here the - output has one column for each element in ``**kwargs``. The name of the - column is keyword, whereas the value determines the aggregation used to compute - the values in the column. - - Can also accept a Numba JIT function with - ``engine='numba'`` specified. Only passing a single function is supported - with this engine. - - If the ``'numba'`` engine is chosen, the function must be - a user defined function with ``values`` and ``index`` as the - first and second arguments respectively in the function signature. - Each group's index will be passed to the user defined function - and optionally available for use. - - .. deprecated:: 2.1.0 - - Passing a dictionary is deprecated and will raise in a future version - of pandas. Pass a list of aggregations instead. -*args - Positional arguments to pass to func. -engine : str, default None - * ``'cython'`` : Runs the function through C-extensions from cython. - * ``'numba'`` : Runs the function through JIT compiled code from numba. - * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - -engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. 
The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be - applied to the function - -**kwargs - * If ``func`` is None, ``**kwargs`` are used to define the output names and - aggregations via Named Aggregation. See ``func`` entry. - * Otherwise, keyword arguments to be passed into func. - -Returns -------- -{klass} - -See Also --------- -{klass}.groupby.apply : Apply function func group-wise - and combine the results together. -{klass}.groupby.transform : Transforms the Series on each group - based on the given function. -{klass}.aggregate : Aggregate using one or more - operations over the specified axis. - -Notes ------ -When using ``engine='numba'``, there will be no "fall back" behavior internally. -The group data and group index will be passed as numpy arrays to the JITed -user defined function, and no alternative execution attempts will be tried. - -Functions that mutate the passed object can produce unexpected -behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` -for more details. - -.. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``, - see the examples below. -{examples}""" - -_agg_template_frame = """ -Aggregate using one or more operations over the specified axis. - -Parameters ----------- -func : function, str, list, dict or None - Function to use for aggregating the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` - - dict of axis labels -> functions, function names or list of such. - - None, in which case ``**kwargs`` are used with Named Aggregation. Here the - output has one column for each element in ``**kwargs``. The name of the - column is keyword, whereas the value determines the aggregation used to compute - the values in the column. - - Can also accept a Numba JIT function with - ``engine='numba'`` specified. Only passing a single function is supported - with this engine. - - If the ``'numba'`` engine is chosen, the function must be - a user defined function with ``values`` and ``index`` as the - first and second arguments respectively in the function signature. - Each group's index will be passed to the user defined function - and optionally available for use. - -*args - Positional arguments to pass to func. -engine : str, default None - * ``'cython'`` : Runs the function through C-extensions from cython. - * ``'numba'`` : Runs the function through JIT compiled code from numba. - * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - -engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be - applied to the function - -**kwargs - * If ``func`` is None, ``**kwargs`` are used to define the output names and - aggregations via Named Aggregation. See ``func`` entry. - * Otherwise, keyword arguments to be passed into func. - -Returns -------- -{klass} - -See Also --------- -{klass}.groupby.apply : Apply function func group-wise - and combine the results together. 
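The ``func=None`` / Named Aggregation form described in the aggregation templates above can be illustrated as follows (the frame and column names are mine)::

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})

    # Each keyword names an output column; the value selects the input column
    # and the aggregation used to compute it.
    out = df.groupby("key").agg(
        val_min=pd.NamedAgg(column="val", aggfunc="min"),
        val_max=pd.NamedAgg(column="val", aggfunc="max"),
    )

    # On a SeriesGroupBy the shorter keyword=aggfunc form is enough.
    out_s = df.groupby("key")["val"].agg(smallest="min", largest="max")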
-{klass}.groupby.transform : Transforms the Series on each group - based on the given function. -{klass}.aggregate : Aggregate using one or more - operations over the specified axis. - -Notes ------ -When using ``engine='numba'``, there will be no "fall back" behavior internally. -The group data and group index will be passed as numpy arrays to the JITed -user defined function, and no alternative execution attempts will be tried. - -Functions that mutate the passed object can produce unexpected -behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` -for more details. - -.. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``, - see the examples below. -{examples}""" - - -@final -class GroupByPlot(PandasObject): - """ - Class implementing the .plot attribute for groupby objects. - """ - - def __init__(self, groupby: GroupBy) -> None: - self._groupby = groupby - - def __call__(self, *args, **kwargs): - def f(self): - return self.plot(*args, **kwargs) - - f.__name__ = "plot" - return self._groupby._python_apply_general(f, self._groupby._selected_obj) - - def __getattr__(self, name: str): - def attr(*args, **kwargs): - def f(self): - return getattr(self.plot, name)(*args, **kwargs) - - return self._groupby._python_apply_general(f, self._groupby._selected_obj) - - return attr - - -_KeysArgType = Union[ - Hashable, - list[Hashable], - Callable[[Hashable], Hashable], - list[Callable[[Hashable], Hashable]], - Mapping[Hashable, Hashable], -] - - -class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): - _hidden_attrs = PandasObject._hidden_attrs | { - "as_index", - "axis", - "dropna", - "exclusions", - "grouper", - "group_keys", - "keys", - "level", - "obj", - "observed", - "sort", - } - - axis: AxisInt - grouper: ops.BaseGrouper - keys: _KeysArgType | None = None - level: IndexLabel | None = None - group_keys: bool - - @final - def __len__(self) -> int: - return len(self.groups) - - @final - def __repr__(self) -> str: - # TODO: Better repr for GroupBy object - return object.__repr__(self) - - @final - @property - def groups(self) -> dict[Hashable, np.ndarray]: - """ - Dict {group name -> group labels}. - - Examples - -------- - - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'b'] - >>> ser = pd.Series([1, 2, 3], index=lst) - >>> ser - a 1 - a 2 - b 3 - dtype: int64 - >>> ser.groupby(level=0).groups - {'a': ['a', 'a'], 'b': ['b']} - - For DataFrameGroupBy: - - >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"]) - >>> df - a b c - 0 1 2 3 - 1 1 5 6 - 2 7 8 9 - >>> df.groupby(by=["a"]).groups - {1: [0, 1], 7: [2]} - - For Resampler: - - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) - >>> ser - 2023-01-01 1 - 2023-01-15 2 - 2023-02-01 3 - 2023-02-15 4 - dtype: int64 - >>> ser.resample('MS').groups - {Timestamp('2023-01-01 00:00:00'): 2, Timestamp('2023-02-01 00:00:00'): 4} - """ - return self.grouper.groups - - @final - @property - def ngroups(self) -> int: - return self.grouper.ngroups - - @final - @property - def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: - """ - Dict {group name -> group indices}. 
- - Examples - -------- - - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'b'] - >>> ser = pd.Series([1, 2, 3], index=lst) - >>> ser - a 1 - a 2 - b 3 - dtype: int64 - >>> ser.groupby(level=0).indices - {'a': array([0, 1]), 'b': array([2])} - - For DataFrameGroupBy: - - >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["owl", "toucan", "eagle"]) - >>> df - a b c - owl 1 2 3 - toucan 1 5 6 - eagle 7 8 9 - >>> df.groupby(by=["a"]).indices - {1: array([0, 1]), 7: array([2])} - - For Resampler: - - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) - >>> ser - 2023-01-01 1 - 2023-01-15 2 - 2023-02-01 3 - 2023-02-15 4 - dtype: int64 - >>> ser.resample('MS').indices - defaultdict(, {Timestamp('2023-01-01 00:00:00'): [0, 1], - Timestamp('2023-02-01 00:00:00'): [2, 3]}) - """ - return self.grouper.indices - - @final - def _get_indices(self, names): - """ - Safe get multiple indices, translate keys for - datelike to underlying repr. - """ - - def get_converter(s): - # possibly convert to the actual key types - # in the indices, could be a Timestamp or a np.datetime64 - if isinstance(s, datetime.datetime): - return lambda key: Timestamp(key) - elif isinstance(s, np.datetime64): - return lambda key: Timestamp(key).asm8 - else: - return lambda key: key - - if len(names) == 0: - return [] - - if len(self.indices) > 0: - index_sample = next(iter(self.indices)) - else: - index_sample = None # Dummy sample - - name_sample = names[0] - if isinstance(index_sample, tuple): - if not isinstance(name_sample, tuple): - msg = "must supply a tuple to get_group with multiple grouping keys" - raise ValueError(msg) - if not len(name_sample) == len(index_sample): - try: - # If the original grouper was a tuple - return [self.indices[name] for name in names] - except KeyError as err: - # turns out it wasn't a tuple - msg = ( - "must supply a same-length tuple to get_group " - "with multiple grouping keys" - ) - raise ValueError(msg) from err - - converters = [get_converter(s) for s in index_sample] - names = (tuple(f(n) for f, n in zip(converters, name)) for name in names) - - else: - converter = get_converter(index_sample) - names = (converter(name) for name in names) - - return [self.indices.get(name, []) for name in names] - - @final - def _get_index(self, name): - """ - Safe get index, translate keys for datelike to underlying repr. - """ - return self._get_indices([name])[0] - - @final - @cache_readonly - def _selected_obj(self): - # Note: _selected_obj is always just `self.obj` for SeriesGroupBy - if isinstance(self.obj, Series): - return self.obj - - if self._selection is not None: - if is_hashable(self._selection): - # i.e. a single key, so selecting it will return a Series. - # In this case, _obj_with_exclusions would wrap the key - # in a list and return a single-column DataFrame. - return self.obj[self._selection] - - # Otherwise _selection is equivalent to _selection_list, so - # _selected_obj matches _obj_with_exclusions, so we can reuse - # that and avoid making a copy. 
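The ``indices`` mapping above is also what the deprecation message for ``get_group(name, obj=...)`` further below points users to: a group can be recovered positionally from the original object. A small illustration (the frame is mine)::

    import pandas as pd

    df = pd.DataFrame({"a": [1, 1, 7], "b": [2, 5, 8]})
    gb = df.groupby("a")

    # gb.indices maps each group key to the positional row indices of that
    # group, so df.iloc[...] reproduces what get_group would return.
    rows_for_1 = df.iloc[gb.indices[1]]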
- return self._obj_with_exclusions - - return self.obj - - @final - def _dir_additions(self) -> set[str]: - return self.obj._dir_additions() - - @Substitution( - klass="GroupBy", - examples=dedent( - """\ - >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]}) - >>> df - A B - 0 a 1 - 1 b 2 - 2 a 3 - 3 b 4 - - To get the difference between each groups maximum and minimum value in one - pass, you can do - - >>> df.groupby('A').pipe(lambda x: x.max() - x.min()) - B - A - a 2 - b 2""" - ), - ) - @Appender(_pipe_template) - def pipe( - self, - func: Callable[..., T] | tuple[Callable[..., T], str], - *args, - **kwargs, - ) -> T: - return com.pipe(self, func, *args, **kwargs) - - @final - def get_group(self, name, obj=None) -> DataFrame | Series: - """ - Construct DataFrame from group with provided name. - - Parameters - ---------- - name : object - The name of the group to get as a DataFrame. - obj : DataFrame, default None - The DataFrame to take the DataFrame out of. If - it is None, the object groupby was called on will - be used. - - .. deprecated:: 2.1.0 - The obj is deprecated and will be removed in a future version. - Do ``df.iloc[gb.indices.get(name)]`` - instead of ``gb.get_group(name, obj=df)``. - - Returns - ------- - same type as obj - - Examples - -------- - - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'b'] - >>> ser = pd.Series([1, 2, 3], index=lst) - >>> ser - a 1 - a 2 - b 3 - dtype: int64 - >>> ser.groupby(level=0).get_group("a") - a 1 - a 2 - dtype: int64 - - For DataFrameGroupBy: - - >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["owl", "toucan", "eagle"]) - >>> df - a b c - owl 1 2 3 - toucan 1 5 6 - eagle 7 8 9 - >>> df.groupby(by=["a"]).get_group((1,)) - a b c - owl 1 2 3 - toucan 1 5 6 - - For Resampler: - - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) - >>> ser - 2023-01-01 1 - 2023-01-15 2 - 2023-02-01 3 - 2023-02-15 4 - dtype: int64 - >>> ser.resample('MS').get_group('2023-01-01') - 2023-01-01 1 - 2023-01-15 2 - dtype: int64 - """ - keys = self.keys - level = self.level - # mypy doesn't recognize level/keys as being sized when passed to len - if (is_list_like(level) and len(level) == 1) or ( # type: ignore[arg-type] - is_list_like(keys) and len(keys) == 1 # type: ignore[arg-type] - ): - # GH#25971 - if isinstance(name, tuple) and len(name) == 1: - # Allow users to pass tuples of length 1 to silence warning - name = name[0] - elif not isinstance(name, tuple): - warnings.warn( - "When grouping with a length-1 list-like, " - "you will need to pass a length-1 tuple to get_group in a future " - "version of pandas. Pass `(name,)` instead of `name` to silence " - "this warning.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - inds = self._get_index(name) - if not len(inds): - raise KeyError(name) - - if obj is None: - indexer = inds if self.axis == 0 else (slice(None), inds) - return self._selected_obj.iloc[indexer] - else: - warnings.warn( - "obj is deprecated and will be removed in a future version. " - "Do ``df.iloc[gb.indices.get(name)]`` " - "instead of ``gb.get_group(name, obj=df)``.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return obj._take_with_is_copy(inds, axis=self.axis) - - @final - def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: - """ - Groupby iterator. 
- - Returns - ------- - Generator yielding sequence of (name, subsetted object) - for each group - - Examples - -------- - - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'b'] - >>> ser = pd.Series([1, 2, 3], index=lst) - >>> ser - a 1 - a 2 - b 3 - dtype: int64 - >>> for x, y in ser.groupby(level=0): - ... print(f'{x}\\n{y}\\n') - a - a 1 - a 2 - dtype: int64 - b - b 3 - dtype: int64 - - For DataFrameGroupBy: - - >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"]) - >>> df - a b c - 0 1 2 3 - 1 1 5 6 - 2 7 8 9 - >>> for x, y in df.groupby(by=["a"]): - ... print(f'{x}\\n{y}\\n') - (1,) - a b c - 0 1 2 3 - 1 1 5 6 - (7,) - a b c - 2 7 8 9 - - For Resampler: - - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) - >>> ser - 2023-01-01 1 - 2023-01-15 2 - 2023-02-01 3 - 2023-02-15 4 - dtype: int64 - >>> for x, y in ser.resample('MS'): - ... print(f'{x}\\n{y}\\n') - 2023-01-01 00:00:00 - 2023-01-01 1 - 2023-01-15 2 - dtype: int64 - 2023-02-01 00:00:00 - 2023-02-01 3 - 2023-02-15 4 - dtype: int64 - """ - keys = self.keys - level = self.level - result = self.grouper.get_iterator(self._selected_obj, axis=self.axis) - # error: Argument 1 to "len" has incompatible type "Hashable"; expected "Sized" - if is_list_like(level) and len(level) == 1: # type: ignore[arg-type] - # GH 51583 - warnings.warn( - "Creating a Groupby object with a length-1 list-like " - "level parameter will yield indexes as tuples in a future version. " - "To keep indexes as scalars, create Groupby objects with " - "a scalar level parameter instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if isinstance(keys, list) and len(keys) == 1: - # GH#42795 - when keys is a list, return tuples even when length is 1 - result = (((key,), group) for key, group in result) - return result - - -# To track operations that expand dimensions, like ohlc -OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame) - - -class GroupBy(BaseGroupBy[NDFrameT]): - """ - Class for grouping and aggregating relational data. - - See aggregate, transform, and apply functions on this object. - - It's easiest to use obj.groupby(...) to use GroupBy, but you can also do: - - :: - - grouped = groupby(obj, ...) - - Parameters - ---------- - obj : pandas object - axis : int, default 0 - level : int, default None - Level of MultiIndex - groupings : list of Grouping objects - Most users should ignore this - exclusions : array-like, optional - List of columns to exclude - name : str - Most users should ignore this - - Returns - ------- - **Attributes** - groups : dict - {group name -> group labels} - len(grouped) : int - Number of groups - - Notes - ----- - After grouping, see aggregate, apply, and transform functions. Here are - some other brief notes about usage. When grouping by multiple groups, the - result index will be a MultiIndex (hierarchical) by default. - - Iteration produces (key, group) tuples, i.e. chunking the data by group. So - you can write code like: - - :: - - grouped = obj.groupby(keys, axis=axis) - for key, group in grouped: - # do something with the data - - Function calls on GroupBy, if not specially implemented, "dispatch" to the - grouped data. So if you group a DataFrame and wish to invoke the std() - method on each group, you can simply do: - - :: - - df.groupby(mapper).std() - - rather than - - :: - - df.groupby(mapper).aggregate(np.std) - - You can pass arguments to these "wrapped" functions, too. 
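As a small editorial illustration (not part of the original source; output formatting approximate), arguments pass straight through to the dispatched method:

    >>> df = pd.DataFrame({'A': ['a', 'a', 'b'], 'B': [1.0, 2.0, 4.0]})
    >>> df.groupby('A').std(ddof=0)
         B
    A
    a  0.5
    b  0.0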
- - See the online documentation for full exposition on these topics and much - more - """ - - grouper: ops.BaseGrouper - as_index: bool - - @final - def __init__( - self, - obj: NDFrameT, - keys: _KeysArgType | None = None, - axis: Axis = 0, - level: IndexLabel | None = None, - grouper: ops.BaseGrouper | None = None, - exclusions: frozenset[Hashable] | None = None, - selection: IndexLabel | None = None, - as_index: bool = True, - sort: bool = True, - group_keys: bool = True, - observed: bool | lib.NoDefault = lib.no_default, - dropna: bool = True, - ) -> None: - self._selection = selection - - assert isinstance(obj, NDFrame), type(obj) - - self.level = level - - if not as_index: - if axis != 0: - raise ValueError("as_index=False only valid for axis=0") - - self.as_index = as_index - self.keys = keys - self.sort = sort - self.group_keys = group_keys - self.dropna = dropna - - if grouper is None: - grouper, exclusions, obj = get_grouper( - obj, - keys, - axis=axis, - level=level, - sort=sort, - observed=False if observed is lib.no_default else observed, - dropna=self.dropna, - ) - - if observed is lib.no_default: - if any(ping._passed_categorical for ping in grouper.groupings): - warnings.warn( - "The default of observed=False is deprecated and will be changed " - "to True in a future version of pandas. Pass observed=False to " - "retain current behavior or observed=True to adopt the future " - "default and silence this warning.", - FutureWarning, - stacklevel=find_stack_level(), - ) - observed = False - self.observed = observed - - self.obj = obj - self.axis = obj._get_axis_number(axis) - self.grouper = grouper - self.exclusions = frozenset(exclusions) if exclusions else frozenset() - - def __getattr__(self, attr: str): - if attr in self._internal_names_set: - return object.__getattribute__(self, attr) - if attr in self.obj: - return self[attr] - - raise AttributeError( - f"'{type(self).__name__}' object has no attribute '{attr}'" - ) - - @final - def _deprecate_axis(self, axis: int, name: str) -> None: - if axis == 1: - warnings.warn( - f"{type(self).__name__}.{name} with axis=1 is deprecated and " - "will be removed in a future version. Operate on the un-grouped " - "DataFrame instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - warnings.warn( - f"The 'axis' keyword in {type(self).__name__}.{name} is deprecated " - "and will be removed in a future version. 
" - "Call without passing 'axis' instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - @final - def _op_via_apply(self, name: str, *args, **kwargs): - """Compute the result of an operation by using GroupBy's apply.""" - f = getattr(type(self._obj_with_exclusions), name) - sig = inspect.signature(f) - - if "axis" in kwargs and kwargs["axis"] is not lib.no_default: - axis = self.obj._get_axis_number(kwargs["axis"]) - self._deprecate_axis(axis, name) - elif "axis" in kwargs: - # exclude skew here because that was already defaulting to lib.no_default - # before this deprecation was instituted - if name == "skew": - pass - elif name == "fillna": - # maintain the behavior from before the deprecation - kwargs["axis"] = None - else: - kwargs["axis"] = 0 - - # a little trickery for aggregation functions that need an axis - # argument - if "axis" in sig.parameters: - if kwargs.get("axis", None) is None or kwargs.get("axis") is lib.no_default: - kwargs["axis"] = self.axis - - def curried(x): - return f(x, *args, **kwargs) - - # preserve the name so we can detect it when calling plot methods, - # to avoid duplicates - curried.__name__ = name - - # special case otherwise extra plots are created when catching the - # exception below - if name in base.plotting_methods: - return self._python_apply_general(curried, self._selected_obj) - - is_transform = name in base.transformation_kernels - result = self._python_apply_general( - curried, - self._obj_with_exclusions, - is_transform=is_transform, - not_indexed_same=not is_transform, - ) - - if self.grouper.has_dropped_na and is_transform: - # result will have dropped rows due to nans, fill with null - # and ensure index is ordered same as the input - result = self._set_result_index_ordered(result) - return result - - # ----------------------------------------------------------------- - # Dispatch/Wrapping - - @final - def _concat_objects( - self, - values, - not_indexed_same: bool = False, - is_transform: bool = False, - ): - from pandas.core.reshape.concat import concat - - if self.group_keys and not is_transform: - if self.as_index: - # possible MI return case - group_keys = self.grouper.result_index - group_levels = self.grouper.levels - group_names = self.grouper.names - - result = concat( - values, - axis=self.axis, - keys=group_keys, - levels=group_levels, - names=group_names, - sort=False, - ) - else: - # GH5610, returns a MI, with the first level being a - # range index - keys = list(range(len(values))) - result = concat(values, axis=self.axis, keys=keys) - - elif not not_indexed_same: - result = concat(values, axis=self.axis) - - ax = self._selected_obj._get_axis(self.axis) - if self.dropna: - labels = self.grouper.group_info[0] - mask = labels != -1 - ax = ax[mask] - - # this is a very unfortunate situation - # we can't use reindex to restore the original order - # when the ax has duplicates - # so we resort to this - # GH 14776, 30667 - # TODO: can we reuse e.g. _reindex_non_unique? - if ax.has_duplicates and not result.axes[self.axis].equals(ax): - # e.g. 
test_category_order_transformer - target = algorithms.unique1d(ax._values) - indexer, _ = result.index.get_indexer_non_unique(target) - result = result.take(indexer, axis=self.axis) - else: - result = result.reindex(ax, axis=self.axis, copy=False) - - else: - result = concat(values, axis=self.axis) - - if self.obj.ndim == 1: - name = self.obj.name - elif is_hashable(self._selection): - name = self._selection - else: - name = None - - if isinstance(result, Series) and name is not None: - result.name = name - - return result - - @final - def _set_result_index_ordered( - self, result: OutputFrameOrSeries - ) -> OutputFrameOrSeries: - # set the result index on the passed values object and - # return the new object, xref 8046 - - obj_axis = self.obj._get_axis(self.axis) - - if self.grouper.is_monotonic and not self.grouper.has_dropped_na: - # shortcut if we have an already ordered grouper - result = result.set_axis(obj_axis, axis=self.axis, copy=False) - return result - - # row order is scrambled => sort the rows by position in original index - original_positions = Index(self.grouper.result_ilocs()) - result = result.set_axis(original_positions, axis=self.axis, copy=False) - result = result.sort_index(axis=self.axis) - if self.grouper.has_dropped_na: - # Add back in any missing rows due to dropna - index here is integral - # with values referring to the row of the input so can use RangeIndex - result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis) - result = result.set_axis(obj_axis, axis=self.axis, copy=False) - - return result - - @final - def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame: - if isinstance(result, Series): - result = result.to_frame() - - # zip in reverse so we can always insert at loc 0 - columns = result.columns - for name, lev, in_axis in zip( - reversed(self.grouper.names), - reversed(self.grouper.get_group_levels()), - reversed([grp.in_axis for grp in self.grouper.groupings]), - ): - # GH #28549 - # When using .apply(-), name will be in columns already - if name not in columns: - if in_axis: - result.insert(0, name, lev) - else: - msg = ( - "A grouping was used that is not in the columns of the " - "DataFrame and so was excluded from the result. This grouping " - "will be included in a future version of pandas. Add the " - "grouping as a column of the DataFrame to silence this warning." - ) - warnings.warn( - message=msg, - category=FutureWarning, - stacklevel=find_stack_level(), - ) - - return result - - @final - def _maybe_transpose_result(self, result: NDFrameT) -> NDFrameT: - if self.axis == 1: - # Only relevant for DataFrameGroupBy, no-op for SeriesGroupBy - result = result.T - if result.index.equals(self.obj.index): - # Retain e.g. DatetimeIndex/TimedeltaIndex freq - # e.g. test_groupby_crash_on_nunique - result.index = self.obj.index.copy() - return result - - @final - def _wrap_aggregated_output( - self, - result: Series | DataFrame, - qs: npt.NDArray[np.float64] | None = None, - ): - """ - Wraps the output of GroupBy aggregations into the expected result. 
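For context on the ``as_index=False`` path handled by ``_insert_inaxis_grouper`` above and by the wrapping below, a short editorial doctest with assumed inputs (not from the original source; output approximate): the grouping key is re-inserted as an ordinary column and the result gets a default integer index.

    >>> df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3]})
    >>> df.groupby("A", as_index=False).sum()
       A  B
    0  a  3
    1  b  3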
- - Parameters - ---------- - result : Series, DataFrame - - Returns - ------- - Series or DataFrame - """ - # ATM we do not get here for SeriesGroupBy; when we do, we will - # need to require that result.name already match self.obj.name - - if not self.as_index: - # `not self.as_index` is only relevant for DataFrameGroupBy, - # enforced in __init__ - result = self._insert_inaxis_grouper(result) - result = result._consolidate() - index = Index(range(self.grouper.ngroups)) - - else: - index = self.grouper.result_index - - if qs is not None: - # We get here with len(qs) != 1 and not self.as_index - # in test_pass_args_kwargs - index = _insert_quantile_level(index, qs) - - result.index = index - - # error: Argument 1 to "_maybe_transpose_result" of "GroupBy" has - # incompatible type "Union[Series, DataFrame]"; expected "NDFrameT" - res = self._maybe_transpose_result(result) # type: ignore[arg-type] - return self._reindex_output(res, qs=qs) - - def _wrap_applied_output( - self, - data, - values: list, - not_indexed_same: bool = False, - is_transform: bool = False, - ): - raise AbstractMethodError(self) - - # ----------------------------------------------------------------- - # numba - - @final - def _numba_prep(self, data: DataFrame): - ids, _, ngroups = self.grouper.group_info - sorted_index = self.grouper._sort_idx - sorted_ids = self.grouper._sorted_ids - - sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() - # GH 46867 - index_data = data.index - if isinstance(index_data, MultiIndex): - if len(self.grouper.groupings) > 1: - raise NotImplementedError( - "Grouping with more than 1 grouping labels and " - "a MultiIndex is not supported with engine='numba'" - ) - group_key = self.grouper.groupings[0].name - index_data = index_data.get_level_values(group_key) - sorted_index_data = index_data.take(sorted_index).to_numpy() - - starts, ends = lib.generate_slices(sorted_ids, ngroups) - return ( - starts, - ends, - sorted_index_data, - sorted_data, - ) - - def _numba_agg_general( - self, - func: Callable, - dtype_mapping: dict[np.dtype, Any], - engine_kwargs: dict[str, bool] | None, - **aggregator_kwargs, - ): - """ - Perform groupby with a standard numerical aggregation function (e.g. mean) - with Numba. - """ - if not self.as_index: - raise NotImplementedError( - "as_index=False is not supported. Use .reset_index() instead." - ) - if self.axis == 1: - raise NotImplementedError("axis=1 is not supported.") - - data = self._obj_with_exclusions - df = data if data.ndim == 2 else data.to_frame() - - aggregator = executor.generate_shared_aggregator( - func, - dtype_mapping, - True, # is_grouped_kernel - **get_jit_arguments(engine_kwargs), - ) - # Pass group ids to kernel directly if it can handle it - # (This is faster since it doesn't require a sort) - ids, _, _ = self.grouper.group_info - ngroups = self.grouper.ngroups - - res_mgr = df._mgr.apply( - aggregator, labels=ids, ngroups=ngroups, **aggregator_kwargs - ) - res_mgr.axes[1] = self.grouper.result_index - result = df._constructor_from_mgr(res_mgr, axes=res_mgr.axes) - - if data.ndim == 1: - result = result.squeeze("columns") - result.name = data.name - else: - result.columns = data.columns - return result - - @final - def _transform_with_numba(self, func, *args, engine_kwargs=None, **kwargs): - """ - Perform groupby transform routine with the numba engine. 
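As an editorial sketch of the Numba aggregation path above (not part of the original source; it assumes the optional ``numba`` package is installed and shows approximate output), a standard reduction can be routed through the JIT-compiled kernels via the ``engine`` keyword:

    >>> df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1.0, 2.0, 3.0]})
    >>> df.groupby("A").mean(engine="numba")  # requires the optional numba dependency
         B
    A
    a  1.5
    b  3.0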
- - This routine mimics the data splitting routine of the DataSplitter class - to generate the indices of each group in the sorted data and then passes the - data and indices into a Numba jitted function. - """ - data = self._obj_with_exclusions - df = data if data.ndim == 2 else data.to_frame() - - starts, ends, sorted_index, sorted_data = self._numba_prep(df) - numba_.validate_udf(func) - numba_transform_func = numba_.generate_numba_transform_func( - func, **get_jit_arguments(engine_kwargs, kwargs) - ) - result = numba_transform_func( - sorted_data, - sorted_index, - starts, - ends, - len(df.columns), - *args, - ) - # result values needs to be resorted to their original positions since we - # evaluated the data sorted by group - result = result.take(np.argsort(sorted_index), axis=0) - index = data.index - if data.ndim == 1: - result_kwargs = {"name": data.name} - result = result.ravel() - else: - result_kwargs = {"columns": data.columns} - return data._constructor(result, index=index, **result_kwargs) - - @final - def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): - """ - Perform groupby aggregation routine with the numba engine. - - This routine mimics the data splitting routine of the DataSplitter class - to generate the indices of each group in the sorted data and then passes the - data and indices into a Numba jitted function. - """ - data = self._obj_with_exclusions - df = data if data.ndim == 2 else data.to_frame() - - starts, ends, sorted_index, sorted_data = self._numba_prep(df) - numba_.validate_udf(func) - numba_agg_func = numba_.generate_numba_agg_func( - func, **get_jit_arguments(engine_kwargs, kwargs) - ) - result = numba_agg_func( - sorted_data, - sorted_index, - starts, - ends, - len(df.columns), - *args, - ) - index = self.grouper.result_index - if data.ndim == 1: - result_kwargs = {"name": data.name} - result = result.ravel() - else: - result_kwargs = {"columns": data.columns} - res = data._constructor(result, index=index, **result_kwargs) - if not self.as_index: - res = self._insert_inaxis_grouper(res) - res.index = default_index(len(res)) - return res - - # ----------------------------------------------------------------- - # apply/agg/transform - - @Appender( - _apply_docs["template"].format( - input="dataframe", examples=_apply_docs["dataframe_examples"] - ) - ) - def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: - orig_func = func - func = com.is_builtin_func(func) - if orig_func != func: - alias = com._builtin_table_alias[orig_func] - warn_alias_replacement(self, orig_func, alias) - - if isinstance(func, str): - if hasattr(self, func): - res = getattr(self, func) - if callable(res): - return res(*args, **kwargs) - elif args or kwargs: - raise ValueError(f"Cannot pass arguments to property {func}") - return res - - else: - raise TypeError(f"apply func should be callable, not '{func}'") - - elif args or kwargs: - if callable(func): - - @wraps(func) - def f(g): - return func(g, *args, **kwargs) - - else: - raise ValueError( - "func must be a callable if args or kwargs are supplied" - ) - else: - f = func - - if not include_groups: - return self._python_apply_general(f, self._obj_with_exclusions) - - # ignore SettingWithCopy here in case the user mutates - with option_context("mode.chained_assignment", None): - try: - result = self._python_apply_general(f, self._selected_obj) - if ( - not isinstance(self.obj, Series) - and self._selection is None - and self._selected_obj.shape != self._obj_with_exclusions.shape - 
): - warnings.warn( - message=_apply_groupings_depr.format( - type(self).__name__, "apply" - ), - category=FutureWarning, - stacklevel=find_stack_level(), - ) - except TypeError: - # gh-20949 - # try again, with .apply acting as a filtering - # operation, by excluding the grouping column - # This would normally not be triggered - # except if the udf is trying an operation that - # fails on *some* columns, e.g. a numeric operation - # on a string grouper column - - return self._python_apply_general(f, self._obj_with_exclusions) - - return result - - @final - def _python_apply_general( - self, - f: Callable, - data: DataFrame | Series, - not_indexed_same: bool | None = None, - is_transform: bool = False, - is_agg: bool = False, - ) -> NDFrameT: - """ - Apply function f in python space - - Parameters - ---------- - f : callable - Function to apply - data : Series or DataFrame - Data to apply f to - not_indexed_same: bool, optional - When specified, overrides the value of not_indexed_same. Apply behaves - differently when the result index is equal to the input index, but - this can be coincidental leading to value-dependent behavior. - is_transform : bool, default False - Indicator for whether the function is actually a transform - and should not have group keys prepended. - is_agg : bool, default False - Indicator for whether the function is an aggregation. When the - result is empty, we don't want to warn for this case. - See _GroupBy._python_agg_general. - - Returns - ------- - Series or DataFrame - data after applying f - """ - values, mutated = self.grouper.apply_groupwise(f, data, self.axis) - if not_indexed_same is None: - not_indexed_same = mutated - - return self._wrap_applied_output( - data, - values, - not_indexed_same, - is_transform, - ) - - @final - def _agg_general( - self, - numeric_only: bool = False, - min_count: int = -1, - *, - alias: str, - npfunc: Callable | None = None, - **kwargs, - ): - result = self._cython_agg_general( - how=alias, - alt=npfunc, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - return result.__finalize__(self.obj, method="groupby") - - def _agg_py_fallback( - self, how: str, values: ArrayLike, ndim: int, alt: Callable - ) -> ArrayLike: - """ - Fallback to pure-python aggregation if _cython_operation raises - NotImplementedError. - """ - # We get here with a) EADtypes and b) object dtype - assert alt is not None - - if values.ndim == 1: - # For DataFrameGroupBy we only get here with ExtensionArray - ser = Series(values, copy=False) - else: - # We only get here with values.dtype == object - df = DataFrame(values.T, dtype=values.dtype) - # bc we split object blocks in grouped_reduce, we have only 1 col - # otherwise we'd have to worry about block-splitting GH#39329 - assert df.shape[1] == 1 - # Avoid call to self.values that can occur in DataFrame - # reductions; see GH#28949 - ser = df.iloc[:, 0] - - # We do not get here with UDFs, so we know that our dtype - # should always be preserved by the implemented aggregations - # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype? 
- try: - res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True) - except Exception as err: - msg = f"agg function failed [how->{how},dtype->{ser.dtype}]" - # preserve the kind of exception that raised - raise type(err)(msg) from err - - if ser.dtype == object: - res_values = res_values.astype(object, copy=False) - - # If we are DataFrameGroupBy and went through a SeriesGroupByPath - # then we need to reshape - # GH#32223 includes case with IntegerArray values, ndarray res_values - # test_groupby_duplicate_columns with object dtype values - return ensure_block_shape(res_values, ndim=ndim) - - @final - def _cython_agg_general( - self, - how: str, - alt: Callable | None = None, - numeric_only: bool = False, - min_count: int = -1, - **kwargs, - ): - # Note: we never get here with how="ohlc" for DataFrameGroupBy; - # that goes through SeriesGroupBy - - data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how) - - def array_func(values: ArrayLike) -> ArrayLike: - try: - result = self.grouper._cython_operation( - "aggregate", - values, - how, - axis=data.ndim - 1, - min_count=min_count, - **kwargs, - ) - except NotImplementedError: - # generally if we have numeric_only=False - # and non-applicable functions - # try to python agg - # TODO: shouldn't min_count matter? - # TODO: avoid special casing SparseArray here - if how in ["any", "all"] and isinstance(values, SparseArray): - pass - elif alt is None or how in ["any", "all", "std", "sem"]: - raise # TODO: re-raise as TypeError? should not be reached - else: - return result - - assert alt is not None - result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt) - return result - - new_mgr = data.grouped_reduce(array_func) - res = self._wrap_agged_manager(new_mgr) - if how in ["idxmin", "idxmax"]: - res = self._wrap_idxmax_idxmin(res) - out = self._wrap_aggregated_output(res) - if self.axis == 1: - out = out.infer_objects(copy=False) - return out - - def _cython_transform( - self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs - ): - raise AbstractMethodError(self) - - @final - def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): - # optimized transforms - orig_func = func - func = com.get_cython_func(func) or func - if orig_func != func: - warn_alias_replacement(self, orig_func, func) - - if not isinstance(func, str): - return self._transform_general(func, engine, engine_kwargs, *args, **kwargs) - - elif func not in base.transform_kernel_allowlist: - msg = f"'{func}' is not a valid function name for transform(name)" - raise ValueError(msg) - elif func in base.cythonized_kernels or func in base.transformation_kernels: - # cythonized transform or canned "agg+broadcast" - if engine is not None: - kwargs["engine"] = engine - kwargs["engine_kwargs"] = engine_kwargs - return getattr(self, func)(*args, **kwargs) - - else: - # i.e. func in base.reduction_kernels - - # GH#30918 Use _transform_fast only when we know func is an aggregation - # If func is a reduction, we need to broadcast the - # result to the whole group. Compute func result - # and deal with possible broadcasting below. 
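        # Editorial illustration (not part of the original source) of the
        # broadcast described above: a reduction name passed to transform is
        # aggregated per group and then repeated back onto the original rows.
        #   >>> df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3]})
        #   >>> df.groupby("A")["B"].transform("sum")
        #   0    3
        #   1    3
        #   2    3
        #   Name: B, dtype: int64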
- with com.temp_setattr(self, "as_index", True): - # GH#49834 - result needs groups in the index for - # _wrap_transform_fast_result - if func in ["idxmin", "idxmax"]: - func = cast(Literal["idxmin", "idxmax"], func) - result = self._idxmax_idxmin(func, True, *args, **kwargs) - else: - if engine is not None: - kwargs["engine"] = engine - kwargs["engine_kwargs"] = engine_kwargs - result = getattr(self, func)(*args, **kwargs) - - return self._wrap_transform_fast_result(result) - - @final - def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: - """ - Fast transform path for aggregations. - """ - obj = self._obj_with_exclusions - - # for each col, reshape to size of original frame by take operation - ids, _, _ = self.grouper.group_info - result = result.reindex(self.grouper.result_index, axis=self.axis, copy=False) - - if self.obj.ndim == 1: - # i.e. SeriesGroupBy - out = algorithms.take_nd(result._values, ids) - output = obj._constructor(out, index=obj.index, name=obj.name) - else: - # `.size()` gives Series output on DataFrame input, need axis 0 - axis = 0 if result.ndim == 1 else self.axis - # GH#46209 - # Don't convert indices: negative indices need to give rise - # to null values in the result - new_ax = result.axes[axis].take(ids) - output = result._reindex_with_indexers( - {axis: (new_ax, ids)}, allow_dups=True, copy=False - ) - output = output.set_axis(obj._get_axis(self.axis), axis=axis) - return output - - # ----------------------------------------------------------------- - # Utilities - - @final - def _apply_filter(self, indices, dropna): - if len(indices) == 0: - indices = np.array([], dtype="int64") - else: - indices = np.sort(np.concatenate(indices)) - if dropna: - filtered = self._selected_obj.take(indices, axis=self.axis) - else: - mask = np.empty(len(self._selected_obj.index), dtype=bool) - mask.fill(False) - mask[indices.astype(int)] = True - # mask fails to broadcast when passed to where; broadcast manually. - mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T - filtered = self._selected_obj.where(mask) # Fill with NaNs. - return filtered - - @final - def _cumcount_array(self, ascending: bool = True) -> np.ndarray: - """ - Parameters - ---------- - ascending : bool, default True - If False, number in reverse, from length of group - 1 to 0. 
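As an editorial example of the behavior this helper implements for the public ``cumcount`` (not from the original source; output approximate), each row is numbered within its group, counting down when ``ascending=False``:

    >>> ser = pd.Series([10, 20, 30], index=["a", "a", "b"])
    >>> ser.groupby(level=0).cumcount()
    a    0
    a    1
    b    0
    dtype: int64
    >>> ser.groupby(level=0).cumcount(ascending=False)
    a    1
    a    0
    b    0
    dtype: int64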
- - Notes - ----- - this is currently implementing sort=False - (though the default is sort=True) for groupby in general - """ - ids, _, ngroups = self.grouper.group_info - sorter = get_group_index_sorter(ids, ngroups) - ids, count = ids[sorter], len(ids) - - if count == 0: - return np.empty(0, dtype=np.int64) - - run = np.r_[True, ids[:-1] != ids[1:]] - rep = np.diff(np.r_[np.nonzero(run)[0], count]) - out = (~run).cumsum() - - if ascending: - out -= np.repeat(out[run], rep) - else: - out = np.repeat(out[np.r_[run[1:], True]], rep) - out - - if self.grouper.has_dropped_na: - out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False)) - else: - out = out.astype(np.int64, copy=False) - - rev = np.empty(count, dtype=np.intp) - rev[sorter] = np.arange(count, dtype=np.intp) - return out[rev] - - # ----------------------------------------------------------------- - - @final - @property - def _obj_1d_constructor(self) -> Callable: - # GH28330 preserve subclassed Series/DataFrames - if isinstance(self.obj, DataFrame): - return self.obj._constructor_sliced - assert isinstance(self.obj, Series) - return self.obj._constructor - - @final - @Substitution(name="groupby") - @Substitution(see_also=_common_see_also) - def any(self, skipna: bool = True) -> NDFrameT: - """ - Return True if any value in the group is truthful, else False. - - Parameters - ---------- - skipna : bool, default True - Flag to ignore nan values during truth testing. - - Returns - ------- - Series or DataFrame - DataFrame or Series of boolean values, where a value is True if any element - is True within its respective group, False otherwise. - %(see_also)s - Examples - -------- - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'b'] - >>> ser = pd.Series([1, 2, 0], index=lst) - >>> ser - a 1 - a 2 - b 0 - dtype: int64 - >>> ser.groupby(level=0).any() - a True - b False - dtype: bool - - For DataFrameGroupBy: - - >>> data = [[1, 0, 3], [1, 0, 6], [7, 1, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["ostrich", "penguin", "parrot"]) - >>> df - a b c - ostrich 1 0 3 - penguin 1 0 6 - parrot 7 1 9 - >>> df.groupby(by=["a"]).any() - b c - a - 1 False True - 7 True True - """ - return self._cython_agg_general( - "any", - alt=lambda x: Series(x).any(skipna=skipna), - skipna=skipna, - ) - - @final - @Substitution(name="groupby") - @Substitution(see_also=_common_see_also) - def all(self, skipna: bool = True) -> NDFrameT: - """ - Return True if all values in the group are truthful, else False. - - Parameters - ---------- - skipna : bool, default True - Flag to ignore nan values during truth testing. - - Returns - ------- - Series or DataFrame - DataFrame or Series of boolean values, where a value is True if all elements - are True within its respective group, False otherwise. - %(see_also)s - Examples - -------- - - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'b'] - >>> ser = pd.Series([1, 2, 0], index=lst) - >>> ser - a 1 - a 2 - b 0 - dtype: int64 - >>> ser.groupby(level=0).all() - a True - b False - dtype: bool - - For DataFrameGroupBy: - - >>> data = [[1, 0, 3], [1, 5, 6], [7, 8, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... 
index=["ostrich", "penguin", "parrot"]) - >>> df - a b c - ostrich 1 0 3 - penguin 1 5 6 - parrot 7 8 9 - >>> df.groupby(by=["a"]).all() - b c - a - 1 False True - 7 True True - """ - return self._cython_agg_general( - "all", - alt=lambda x: Series(x).all(skipna=skipna), - skipna=skipna, - ) - - @final - @Substitution(name="groupby") - @Substitution(see_also=_common_see_also) - def count(self) -> NDFrameT: - """ - Compute count of group, excluding missing values. - - Returns - ------- - Series or DataFrame - Count of values within each group. - %(see_also)s - Examples - -------- - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'b'] - >>> ser = pd.Series([1, 2, np.nan], index=lst) - >>> ser - a 1.0 - a 2.0 - b NaN - dtype: float64 - >>> ser.groupby(level=0).count() - a 2 - b 0 - dtype: int64 - - For DataFrameGroupBy: - - >>> data = [[1, np.nan, 3], [1, np.nan, 6], [7, 8, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["cow", "horse", "bull"]) - >>> df - a b c - cow 1 NaN 3 - horse 1 NaN 6 - bull 7 8.0 9 - >>> df.groupby("a").count() - b c - a - 1 0 2 - 7 1 1 - - For Resampler: - - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) - >>> ser - 2023-01-01 1 - 2023-01-15 2 - 2023-02-01 3 - 2023-02-15 4 - dtype: int64 - >>> ser.resample('MS').count() - 2023-01-01 2 - 2023-02-01 2 - Freq: MS, dtype: int64 - """ - data = self._get_data_to_aggregate() - ids, _, ngroups = self.grouper.group_info - mask = ids != -1 - - is_series = data.ndim == 1 - - def hfunc(bvalues: ArrayLike) -> ArrayLike: - # TODO(EA2D): reshape would not be necessary with 2D EAs - if bvalues.ndim == 1: - # EA - masked = mask & ~isna(bvalues).reshape(1, -1) - else: - masked = mask & ~isna(bvalues) - - counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups) - if isinstance(bvalues, BaseMaskedArray): - return IntegerArray( - counted[0], mask=np.zeros(counted.shape[1], dtype=np.bool_) - ) - elif isinstance(bvalues, ArrowExtensionArray) and not isinstance( - bvalues.dtype, StringDtype - ): - return type(bvalues)._from_sequence(counted[0]) - if is_series: - assert counted.ndim == 2 - assert counted.shape[0] == 1 - return counted[0] - return counted - - new_mgr = data.grouped_reduce(hfunc) - new_obj = self._wrap_agged_manager(new_mgr) - - # If we are grouping on categoricals we want unobserved categories to - # return zero, rather than the default of NaN which the reindexing in - # _wrap_aggregated_output() returns. GH 35028 - # e.g. test_dataframe_groupby_on_2_categoricals_when_observed_is_false - with com.temp_setattr(self, "observed", True): - result = self._wrap_aggregated_output(new_obj) - - return self._reindex_output(result, fill_value=0) - - @final - @Substitution(name="groupby") - @Substitution(see_also=_common_see_also) - def mean( - self, - numeric_only: bool = False, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - ): - """ - Compute mean of groups, excluding missing values. - - Parameters - ---------- - numeric_only : bool, default False - Include only float, int, boolean columns. - - .. versionchanged:: 2.0.0 - - numeric_only no longer accepts ``None`` and defaults to ``False``. - - engine : str, default None - * ``'cython'`` : Runs the operation through C-extensions from cython. - * ``'numba'`` : Runs the operation through JIT compiled code from numba. - * ``None`` : Defaults to ``'cython'`` or globally setting - ``compute.use_numba`` - - .. 
versionadded:: 1.4.0 - - engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` - - .. versionadded:: 1.4.0 - - Returns - ------- - pandas.Series or pandas.DataFrame - %(see_also)s - Examples - -------- - >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], - ... 'B': [np.nan, 2, 3, 4, 5], - ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C']) - - Groupby one column and return the mean of the remaining columns in - each group. - - >>> df.groupby('A').mean() - B C - A - 1 3.0 1.333333 - 2 4.0 1.500000 - - Groupby two columns and return the mean of the remaining column. - - >>> df.groupby(['A', 'B']).mean() - C - A B - 1 2.0 2.0 - 4.0 1.0 - 2 3.0 1.0 - 5.0 2.0 - - Groupby one column and return the mean of only particular column in - the group. - - >>> df.groupby('A')['B'].mean() - A - 1 3.0 - 2 4.0 - Name: B, dtype: float64 - """ - - if maybe_use_numba(engine): - from pandas.core._numba.kernels import grouped_mean - - return self._numba_agg_general( - grouped_mean, - executor.float_dtype_mapping, - engine_kwargs, - min_periods=0, - ) - else: - result = self._cython_agg_general( - "mean", - alt=lambda x: Series(x).mean(numeric_only=numeric_only), - numeric_only=numeric_only, - ) - return result.__finalize__(self.obj, method="groupby") - - @final - def median(self, numeric_only: bool = False) -> NDFrameT: - """ - Compute median of groups, excluding missing values. - - For multiple groupings, the result index will be a MultiIndex - - Parameters - ---------- - numeric_only : bool, default False - Include only float, int, boolean columns. - - .. versionchanged:: 2.0.0 - - numeric_only no longer accepts ``None`` and defaults to False. - - Returns - ------- - Series or DataFrame - Median of values within each group. - - Examples - -------- - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] - >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) - >>> ser - a 7 - a 2 - a 8 - b 4 - b 3 - b 3 - dtype: int64 - >>> ser.groupby(level=0).median() - a 7.0 - b 3.0 - dtype: float64 - - For DataFrameGroupBy: - - >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} - >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', - ... 'mouse', 'mouse', 'mouse', 'mouse']) - >>> df - a b - dog 1 1 - dog 3 4 - dog 5 8 - mouse 7 4 - mouse 7 4 - mouse 8 2 - mouse 3 1 - >>> df.groupby(level=0).median() - a b - dog 3.0 4.0 - mouse 7.0 3.0 - - For Resampler: - - >>> ser = pd.Series([1, 2, 3, 3, 4, 5], - ... index=pd.DatetimeIndex(['2023-01-01', - ... '2023-01-10', - ... '2023-01-15', - ... '2023-02-01', - ... '2023-02-10', - ... '2023-02-15'])) - >>> ser.resample('MS').median() - 2023-01-01 2.0 - 2023-02-01 4.0 - Freq: MS, dtype: float64 - """ - result = self._cython_agg_general( - "median", - alt=lambda x: Series(x).median(numeric_only=numeric_only), - numeric_only=numeric_only, - ) - return result.__finalize__(self.obj, method="groupby") - - @final - @Substitution(name="groupby") - @Substitution(see_also=_common_see_also) - def std( - self, - ddof: int = 1, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - numeric_only: bool = False, - ): - """ - Compute standard deviation of groups, excluding missing values. 
- - For multiple groupings, the result index will be a MultiIndex. - - Parameters - ---------- - ddof : int, default 1 - Degrees of freedom. - - engine : str, default None - * ``'cython'`` : Runs the operation through C-extensions from cython. - * ``'numba'`` : Runs the operation through JIT compiled code from numba. - * ``None`` : Defaults to ``'cython'`` or globally setting - ``compute.use_numba`` - - .. versionadded:: 1.4.0 - - engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` - - .. versionadded:: 1.4.0 - - numeric_only : bool, default False - Include only `float`, `int` or `boolean` data. - - .. versionadded:: 1.5.0 - - .. versionchanged:: 2.0.0 - - numeric_only now defaults to ``False``. - - Returns - ------- - Series or DataFrame - Standard deviation of values within each group. - %(see_also)s - Examples - -------- - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] - >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) - >>> ser - a 7 - a 2 - a 8 - b 4 - b 3 - b 3 - dtype: int64 - >>> ser.groupby(level=0).std() - a 3.21455 - b 0.57735 - dtype: float64 - - For DataFrameGroupBy: - - >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} - >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', - ... 'mouse', 'mouse', 'mouse', 'mouse']) - >>> df - a b - dog 1 1 - dog 3 4 - dog 5 8 - mouse 7 4 - mouse 7 4 - mouse 8 2 - mouse 3 1 - >>> df.groupby(level=0).std() - a b - dog 2.000000 3.511885 - mouse 2.217356 1.500000 - """ - if maybe_use_numba(engine): - from pandas.core._numba.kernels import grouped_var - - return np.sqrt( - self._numba_agg_general( - grouped_var, - executor.float_dtype_mapping, - engine_kwargs, - min_periods=0, - ddof=ddof, - ) - ) - else: - return self._cython_agg_general( - "std", - alt=lambda x: Series(x).std(ddof=ddof), - numeric_only=numeric_only, - ddof=ddof, - ) - - @final - @Substitution(name="groupby") - @Substitution(see_also=_common_see_also) - def var( - self, - ddof: int = 1, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - numeric_only: bool = False, - ): - """ - Compute variance of groups, excluding missing values. - - For multiple groupings, the result index will be a MultiIndex. - - Parameters - ---------- - ddof : int, default 1 - Degrees of freedom. - - engine : str, default None - * ``'cython'`` : Runs the operation through C-extensions from cython. - * ``'numba'`` : Runs the operation through JIT compiled code from numba. - * ``None`` : Defaults to ``'cython'`` or globally setting - ``compute.use_numba`` - - .. versionadded:: 1.4.0 - - engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` - - .. versionadded:: 1.4.0 - - numeric_only : bool, default False - Include only `float`, `int` or `boolean` data. - - .. versionadded:: 1.5.0 - - .. versionchanged:: 2.0.0 - - numeric_only now defaults to ``False``. 
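A brief editorial doctest for the ``numeric_only`` keyword (assumed inputs, not part of the original source; output approximate): non-numeric columns are dropped from the result rather than raising.

    >>> df = pd.DataFrame({"A": ["a", "a", "b"],
    ...                    "B": [1.0, 2.0, 4.0],
    ...                    "C": ["x", "y", "z"]})
    >>> df.groupby("A").var(numeric_only=True)
         B
    A
    a  0.5
    b  NaN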
- - Returns - ------- - Series or DataFrame - Variance of values within each group. - %(see_also)s - Examples - -------- - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] - >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) - >>> ser - a 7 - a 2 - a 8 - b 4 - b 3 - b 3 - dtype: int64 - >>> ser.groupby(level=0).var() - a 10.333333 - b 0.333333 - dtype: float64 - - For DataFrameGroupBy: - - >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} - >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', - ... 'mouse', 'mouse', 'mouse', 'mouse']) - >>> df - a b - dog 1 1 - dog 3 4 - dog 5 8 - mouse 7 4 - mouse 7 4 - mouse 8 2 - mouse 3 1 - >>> df.groupby(level=0).var() - a b - dog 4.000000 12.333333 - mouse 4.916667 2.250000 - """ - if maybe_use_numba(engine): - from pandas.core._numba.kernels import grouped_var - - return self._numba_agg_general( - grouped_var, - executor.float_dtype_mapping, - engine_kwargs, - min_periods=0, - ddof=ddof, - ) - else: - return self._cython_agg_general( - "var", - alt=lambda x: Series(x).var(ddof=ddof), - numeric_only=numeric_only, - ddof=ddof, - ) - - @final - def _value_counts( - self, - subset: Sequence[Hashable] | None = None, - normalize: bool = False, - sort: bool = True, - ascending: bool = False, - dropna: bool = True, - ) -> DataFrame | Series: - """ - Shared implementation of value_counts for SeriesGroupBy and DataFrameGroupBy. - - SeriesGroupBy additionally supports a bins argument. See the docstring of - DataFrameGroupBy.value_counts for a description of arguments. - """ - if self.axis == 1: - raise NotImplementedError( - "DataFrameGroupBy.value_counts only handles axis=0" - ) - name = "proportion" if normalize else "count" - - df = self.obj - obj = self._obj_with_exclusions - - in_axis_names = { - grouping.name for grouping in self.grouper.groupings if grouping.in_axis - } - if isinstance(obj, Series): - _name = obj.name - keys = [] if _name in in_axis_names else [obj] - else: - unique_cols = set(obj.columns) - if subset is not None: - subsetted = set(subset) - clashing = subsetted & set(in_axis_names) - if clashing: - raise ValueError( - f"Keys {clashing} in subset cannot be in " - "the groupby column keys." - ) - doesnt_exist = subsetted - unique_cols - if doesnt_exist: - raise ValueError( - f"Keys {doesnt_exist} in subset do not " - f"exist in the DataFrame." 
- ) - else: - subsetted = unique_cols - - keys = [ - # Can't use .values because the column label needs to be preserved - obj.iloc[:, idx] - for idx, _name in enumerate(obj.columns) - if _name not in in_axis_names and _name in subsetted - ] - - groupings = list(self.grouper.groupings) - for key in keys: - grouper, _, _ = get_grouper( - df, - key=key, - axis=self.axis, - sort=self.sort, - observed=False, - dropna=dropna, - ) - groupings += list(grouper.groupings) - - # Take the size of the overall columns - gb = df.groupby( - groupings, - sort=self.sort, - observed=self.observed, - dropna=self.dropna, - ) - result_series = cast(Series, gb.size()) - result_series.name = name - - # GH-46357 Include non-observed categories - # of non-grouping columns regardless of `observed` - if any( - isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex)) - and not grouping._observed - for grouping in groupings - ): - levels_list = [ping.result_index for ping in groupings] - multi_index, _ = MultiIndex.from_product( - levels_list, names=[ping.name for ping in groupings] - ).sortlevel() - result_series = result_series.reindex(multi_index, fill_value=0) - - if normalize: - # Normalize the results by dividing by the original group sizes. - # We are guaranteed to have the first N levels be the - # user-requested grouping. - levels = list( - range(len(self.grouper.groupings), result_series.index.nlevels) - ) - indexed_group_size = result_series.groupby( - result_series.index.droplevel(levels), - sort=self.sort, - dropna=self.dropna, - # GH#43999 - deprecation of observed=False - observed=False, - ).transform("sum") - result_series /= indexed_group_size - - # Handle groups of non-observed categories - result_series = result_series.fillna(0.0) - - if sort: - # Sort the values and then resort by the main grouping - index_level = range(len(self.grouper.groupings)) - result_series = result_series.sort_values(ascending=ascending).sort_index( - level=index_level, sort_remaining=False - ) - - result: Series | DataFrame - if self.as_index: - result = result_series - else: - # Convert to frame - index = result_series.index - columns = com.fill_missing_names(index.names) - if name in columns: - raise ValueError(f"Column label '{name}' is duplicate of result column") - result_series.name = name - result_series.index = index.set_names(range(len(columns))) - result_frame = result_series.reset_index() - orig_dtype = self.grouper.groupings[0].obj.columns.dtype # type: ignore[union-attr] - cols = Index(columns, dtype=orig_dtype).insert(len(columns), name) - result_frame.columns = cols - result = result_frame - return result.__finalize__(self.obj, method="value_counts") - - @final - def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: - """ - Compute standard error of the mean of groups, excluding missing values. - - For multiple groupings, the result index will be a MultiIndex. - - Parameters - ---------- - ddof : int, default 1 - Degrees of freedom. - - numeric_only : bool, default False - Include only `float`, `int` or `boolean` data. - - .. versionadded:: 1.5.0 - - .. versionchanged:: 2.0.0 - - numeric_only now defaults to ``False``. - - Returns - ------- - Series or DataFrame - Standard error of the mean of values within each group. 
- - Examples - -------- - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'b', 'b'] - >>> ser = pd.Series([5, 10, 8, 14], index=lst) - >>> ser - a 5 - a 10 - b 8 - b 14 - dtype: int64 - >>> ser.groupby(level=0).sem() - a 2.5 - b 3.0 - dtype: float64 - - For DataFrameGroupBy: - - >>> data = [[1, 12, 11], [1, 15, 2], [2, 5, 8], [2, 6, 12]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["tuna", "salmon", "catfish", "goldfish"]) - >>> df - a b c - tuna 1 12 11 - salmon 1 15 2 - catfish 2 5 8 - goldfish 2 6 12 - >>> df.groupby("a").sem() - b c - a - 1 1.5 4.5 - 2 0.5 2.0 - - For Resampler: - - >>> ser = pd.Series([1, 3, 2, 4, 3, 8], - ... index=pd.DatetimeIndex(['2023-01-01', - ... '2023-01-10', - ... '2023-01-15', - ... '2023-02-01', - ... '2023-02-10', - ... '2023-02-15'])) - >>> ser.resample('MS').sem() - 2023-01-01 0.577350 - 2023-02-01 1.527525 - Freq: MS, dtype: float64 - """ - if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype): - raise TypeError( - f"{type(self).__name__}.sem called with " - f"numeric_only={numeric_only} and dtype {self.obj.dtype}" - ) - return self._cython_agg_general( - "sem", - alt=lambda x: Series(x).sem(ddof=ddof), - numeric_only=numeric_only, - ddof=ddof, - ) - - @final - @Substitution(name="groupby") - @Substitution(see_also=_common_see_also) - def size(self) -> DataFrame | Series: - """ - Compute group sizes. - - Returns - ------- - DataFrame or Series - Number of rows in each group as a Series if as_index is True - or a DataFrame if as_index is False. - %(see_also)s - Examples - -------- - - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'b'] - >>> ser = pd.Series([1, 2, 3], index=lst) - >>> ser - a 1 - a 2 - b 3 - dtype: int64 - >>> ser.groupby(level=0).size() - a 2 - b 1 - dtype: int64 - - >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["owl", "toucan", "eagle"]) - >>> df - a b c - owl 1 2 3 - toucan 1 5 6 - eagle 7 8 9 - >>> df.groupby("a").size() - a - 1 2 - 7 1 - dtype: int64 - - For Resampler: - - >>> ser = pd.Series([1, 2, 3], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01'])) - >>> ser - 2023-01-01 1 - 2023-01-15 2 - 2023-02-01 3 - dtype: int64 - >>> ser.resample('MS').size() - 2023-01-01 2 - 2023-02-01 1 - Freq: MS, dtype: int64 - """ - result = self.grouper.size() - dtype_backend: None | Literal["pyarrow", "numpy_nullable"] = None - if isinstance(self.obj, Series): - if isinstance(self.obj.array, ArrowExtensionArray): - if isinstance(self.obj.array, ArrowStringArrayNumpySemantics): - dtype_backend = None - elif isinstance(self.obj.array, ArrowStringArray): - dtype_backend = "numpy_nullable" - else: - dtype_backend = "pyarrow" - elif isinstance(self.obj.array, BaseMaskedArray): - dtype_backend = "numpy_nullable" - # TODO: For DataFrames what if columns are mixed arrow/numpy/masked? - - # GH28330 preserve subclassed Series/DataFrames through calls - if isinstance(self.obj, Series): - result = self._obj_1d_constructor(result, name=self.obj.name) - else: - result = self._obj_1d_constructor(result) - - if dtype_backend is not None: - result = result.convert_dtypes( - infer_objects=False, - convert_string=False, - convert_boolean=False, - convert_floating=False, - dtype_backend=dtype_backend, - ) - - with com.temp_setattr(self, "as_index", True): - # size already has the desired behavior in GH#49519, but this makes the - # as_index=False path of _reindex_output fail on categorical groupers. 
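        # Editorial illustration (not from the original source) of the
        # as_index=False handling just below: the per-group sizes come back
        # as a column named "size" next to the grouping column(s).
        #   >>> df = pd.DataFrame({"a": [1, 1, 7]})
        #   >>> df.groupby("a", as_index=False).size()
        #      a  size
        #   0  1     2
        #   1  7     1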
- result = self._reindex_output(result, fill_value=0) - if not self.as_index: - # error: Incompatible types in assignment (expression has - # type "DataFrame", variable has type "Series") - result = result.rename("size").reset_index() # type: ignore[assignment] - return result - - @final - @doc( - _groupby_agg_method_engine_template, - fname="sum", - no=False, - mc=0, - e=None, - ek=None, - example=dedent( - """\ - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'b', 'b'] - >>> ser = pd.Series([1, 2, 3, 4], index=lst) - >>> ser - a 1 - a 2 - b 3 - b 4 - dtype: int64 - >>> ser.groupby(level=0).sum() - a 3 - b 7 - dtype: int64 - - For DataFrameGroupBy: - - >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["tiger", "leopard", "cheetah", "lion"]) - >>> df - a b c - tiger 1 8 2 - leopard 1 2 5 - cheetah 2 5 8 - lion 2 6 9 - >>> df.groupby("a").sum() - b c - a - 1 10 7 - 2 11 17""" - ), - ) - def sum( - self, - numeric_only: bool = False, - min_count: int = 0, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - ): - if maybe_use_numba(engine): - from pandas.core._numba.kernels import grouped_sum - - return self._numba_agg_general( - grouped_sum, - executor.default_dtype_mapping, - engine_kwargs, - min_periods=min_count, - ) - else: - # If we are grouping on categoricals we want unobserved categories to - # return zero, rather than the default of NaN which the reindexing in - # _agg_general() returns. GH #31422 - with com.temp_setattr(self, "observed", True): - result = self._agg_general( - numeric_only=numeric_only, - min_count=min_count, - alias="sum", - npfunc=np.sum, - ) - - return self._reindex_output(result, fill_value=0) - - @final - @doc( - _groupby_agg_method_template, - fname="prod", - no=False, - mc=0, - example=dedent( - """\ - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'b', 'b'] - >>> ser = pd.Series([1, 2, 3, 4], index=lst) - >>> ser - a 1 - a 2 - b 3 - b 4 - dtype: int64 - >>> ser.groupby(level=0).prod() - a 2 - b 12 - dtype: int64 - - For DataFrameGroupBy: - - >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["tiger", "leopard", "cheetah", "lion"]) - >>> df - a b c - tiger 1 8 2 - leopard 1 2 5 - cheetah 2 5 8 - lion 2 6 9 - >>> df.groupby("a").prod() - b c - a - 1 16 10 - 2 30 72""" - ), - ) - def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: - return self._agg_general( - numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod - ) - - @final - @doc( - _groupby_agg_method_engine_template, - fname="min", - no=False, - mc=-1, - e=None, - ek=None, - example=dedent( - """\ - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'b', 'b'] - >>> ser = pd.Series([1, 2, 3, 4], index=lst) - >>> ser - a 1 - a 2 - b 3 - b 4 - dtype: int64 - >>> ser.groupby(level=0).min() - a 1 - b 3 - dtype: int64 - - For DataFrameGroupBy: - - >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... 
index=["tiger", "leopard", "cheetah", "lion"]) - >>> df - a b c - tiger 1 8 2 - leopard 1 2 5 - cheetah 2 5 8 - lion 2 6 9 - >>> df.groupby("a").min() - b c - a - 1 2 2 - 2 5 8""" - ), - ) - def min( - self, - numeric_only: bool = False, - min_count: int = -1, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - ): - if maybe_use_numba(engine): - from pandas.core._numba.kernels import grouped_min_max - - return self._numba_agg_general( - grouped_min_max, - executor.identity_dtype_mapping, - engine_kwargs, - min_periods=min_count, - is_max=False, - ) - else: - return self._agg_general( - numeric_only=numeric_only, - min_count=min_count, - alias="min", - npfunc=np.min, - ) - - @final - @doc( - _groupby_agg_method_engine_template, - fname="max", - no=False, - mc=-1, - e=None, - ek=None, - example=dedent( - """\ - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'b', 'b'] - >>> ser = pd.Series([1, 2, 3, 4], index=lst) - >>> ser - a 1 - a 2 - b 3 - b 4 - dtype: int64 - >>> ser.groupby(level=0).max() - a 2 - b 4 - dtype: int64 - - For DataFrameGroupBy: - - >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["tiger", "leopard", "cheetah", "lion"]) - >>> df - a b c - tiger 1 8 2 - leopard 1 2 5 - cheetah 2 5 8 - lion 2 6 9 - >>> df.groupby("a").max() - b c - a - 1 8 5 - 2 6 9""" - ), - ) - def max( - self, - numeric_only: bool = False, - min_count: int = -1, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - ): - if maybe_use_numba(engine): - from pandas.core._numba.kernels import grouped_min_max - - return self._numba_agg_general( - grouped_min_max, - executor.identity_dtype_mapping, - engine_kwargs, - min_periods=min_count, - is_max=True, - ) - else: - return self._agg_general( - numeric_only=numeric_only, - min_count=min_count, - alias="max", - npfunc=np.max, - ) - - @final - def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: - """ - Compute the first non-null entry of each column. - - Parameters - ---------- - numeric_only : bool, default False - Include only float, int, boolean columns. - min_count : int, default -1 - The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. - - Returns - ------- - Series or DataFrame - First non-null of values within each group. - - See Also - -------- - DataFrame.groupby : Apply a function groupby to each row or column of a - DataFrame. - pandas.core.groupby.DataFrameGroupBy.last : Compute the last non-null entry - of each column. - pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group. - - Examples - -------- - >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3], - ... 
D=['3/11/2000', '3/12/2000', '3/13/2000'])) - >>> df['D'] = pd.to_datetime(df['D']) - >>> df.groupby("A").first() - B C D - A - 1 5.0 1 2000-03-11 - 3 6.0 3 2000-03-13 - >>> df.groupby("A").first(min_count=2) - B C D - A - 1 NaN 1.0 2000-03-11 - 3 NaN NaN NaT - >>> df.groupby("A").first(numeric_only=True) - B C - A - 1 5.0 1 - 3 6.0 3 - """ - - def first_compat(obj: NDFrameT, axis: AxisInt = 0): - def first(x: Series): - """Helper function for first item that isn't NA.""" - arr = x.array[notna(x.array)] - if not len(arr): - return x.array.dtype.na_value - return arr[0] - - if isinstance(obj, DataFrame): - return obj.apply(first, axis=axis) - elif isinstance(obj, Series): - return first(obj) - else: # pragma: no cover - raise TypeError(type(obj)) - - return self._agg_general( - numeric_only=numeric_only, - min_count=min_count, - alias="first", - npfunc=first_compat, - ) - - @final - def last(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: - """ - Compute the last non-null entry of each column. - - Parameters - ---------- - numeric_only : bool, default False - Include only float, int, boolean columns. If None, will attempt to use - everything, then use only numeric data. - min_count : int, default -1 - The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. - - Returns - ------- - Series or DataFrame - Last non-null of values within each group. - - See Also - -------- - DataFrame.groupby : Apply a function groupby to each row or column of a - DataFrame. - pandas.core.groupby.DataFrameGroupBy.first : Compute the first non-null entry - of each column. - pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group. - - Examples - -------- - >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3])) - >>> df.groupby("A").last() - B C - A - 1 5.0 2 - 3 6.0 3 - """ - - def last_compat(obj: NDFrameT, axis: AxisInt = 0): - def last(x: Series): - """Helper function for last item that isn't NA.""" - arr = x.array[notna(x.array)] - if not len(arr): - return x.array.dtype.na_value - return arr[-1] - - if isinstance(obj, DataFrame): - return obj.apply(last, axis=axis) - elif isinstance(obj, Series): - return last(obj) - else: # pragma: no cover - raise TypeError(type(obj)) - - return self._agg_general( - numeric_only=numeric_only, - min_count=min_count, - alias="last", - npfunc=last_compat, - ) - - @final - def ohlc(self) -> DataFrame: - """ - Compute open, high, low and close values of a group, excluding missing values. - - For multiple groupings, the result index will be a MultiIndex - - Returns - ------- - DataFrame - Open, high, low and close values within each group. - - Examples - -------- - - For SeriesGroupBy: - - >>> lst = ['SPX', 'CAC', 'SPX', 'CAC', 'SPX', 'CAC', 'SPX', 'CAC',] - >>> ser = pd.Series([3.4, 9.0, 7.2, 5.2, 8.8, 9.4, 0.1, 0.5], index=lst) - >>> ser - SPX 3.4 - CAC 9.0 - SPX 7.2 - CAC 5.2 - SPX 8.8 - CAC 9.4 - SPX 0.1 - CAC 0.5 - dtype: float64 - >>> ser.groupby(level=0).ohlc() - open high low close - CAC 9.0 9.4 0.5 0.5 - SPX 3.4 8.8 0.1 0.1 - - For DataFrameGroupBy: - - >>> data = {2022: [1.2, 2.3, 8.9, 4.5, 4.4, 3, 2 , 1], - ... 2023: [3.4, 9.0, 7.2, 5.2, 8.8, 9.4, 8.2, 1.0]} - >>> df = pd.DataFrame(data, index=['SPX', 'CAC', 'SPX', 'CAC', - ... 
'SPX', 'CAC', 'SPX', 'CAC']) - >>> df - 2022 2023 - SPX 1.2 3.4 - CAC 2.3 9.0 - SPX 8.9 7.2 - CAC 4.5 5.2 - SPX 4.4 8.8 - CAC 3.0 9.4 - SPX 2.0 8.2 - CAC 1.0 1.0 - >>> df.groupby(level=0).ohlc() - 2022 2023 - open high low close open high low close - CAC 2.3 4.5 1.0 1.0 9.0 9.4 1.0 1.0 - SPX 1.2 8.9 1.2 2.0 3.4 8.8 3.4 8.2 - - For Resampler: - - >>> ser = pd.Series([1, 3, 2, 4, 3, 5], - ... index=pd.DatetimeIndex(['2023-01-01', - ... '2023-01-10', - ... '2023-01-15', - ... '2023-02-01', - ... '2023-02-10', - ... '2023-02-15'])) - >>> ser.resample('MS').ohlc() - open high low close - 2023-01-01 1 3 1 2 - 2023-02-01 4 5 3 5 - """ - if self.obj.ndim == 1: - obj = self._selected_obj - - is_numeric = is_numeric_dtype(obj.dtype) - if not is_numeric: - raise DataError("No numeric types to aggregate") - - res_values = self.grouper._cython_operation( - "aggregate", obj._values, "ohlc", axis=0, min_count=-1 - ) - - agg_names = ["open", "high", "low", "close"] - result = self.obj._constructor_expanddim( - res_values, index=self.grouper.result_index, columns=agg_names - ) - return self._reindex_output(result) - - result = self._apply_to_column_groupbys(lambda sgb: sgb.ohlc()) - return result - - @doc(DataFrame.describe) - def describe( - self, - percentiles=None, - include=None, - exclude=None, - ) -> NDFrameT: - obj = self._obj_with_exclusions - - if len(obj) == 0: - described = obj.describe( - percentiles=percentiles, include=include, exclude=exclude - ) - if obj.ndim == 1: - result = described - else: - result = described.unstack() - return result.to_frame().T.iloc[:0] - - with com.temp_setattr(self, "as_index", True): - result = self._python_apply_general( - lambda x: x.describe( - percentiles=percentiles, include=include, exclude=exclude - ), - obj, - not_indexed_same=True, - ) - if self.axis == 1: - return result.T - - # GH#49256 - properly handle the grouping column(s) - result = result.unstack() - if not self.as_index: - result = self._insert_inaxis_grouper(result) - result.index = default_index(len(result)) - - return result - - @final - def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resampler: - """ - Provide resampling when using a TimeGrouper. - - Given a grouper, the function resamples it according to a string - "string" -> "frequency". - - See the :ref:`frequency aliases ` - documentation for more details. - - Parameters - ---------- - rule : str or DateOffset - The offset string or object representing target grouper conversion. - *args - Possible arguments are `how`, `fill_method`, `limit`, `kind` and - `on`, and other arguments of `TimeGrouper`. - include_groups : bool, default True - When True, will attempt to include the groupings in the operation in - the case that they are columns of the DataFrame. If this raises a - TypeError, the result will be computed with the groupings excluded. - When False, the groupings will be excluded when applying ``func``. - - .. versionadded:: 2.2.0 - - .. deprecated:: 2.2.0 - - Setting include_groups to True is deprecated. Only the value - False will be allowed in a future version of pandas. - - **kwargs - Possible arguments are `how`, `fill_method`, `limit`, `kind` and - `on`, and other arguments of `TimeGrouper`. - - Returns - ------- - pandas.api.typing.DatetimeIndexResamplerGroupby, - pandas.api.typing.PeriodIndexResamplerGroupby, or - pandas.api.typing.TimedeltaIndexResamplerGroupby - Return a new groupby object, with type depending on the data - being resampled. 
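# A minimal sketch of the include_groups keyword documented above, assuming a pandas
# version new enough to accept it (>= 2.2 per the versionadded note); the frame
# mirrors the example used further down in this docstring.
import pandas as pd

idx = pd.date_range("1/1/2000", periods=4, freq="min")
df = pd.DataFrame(data=4 * [range(2)], index=idx, columns=["a", "b"])
df.iloc[2, 0] = 5

# include_groups=False excludes the grouping column "a" from the resampled
# aggregation, which is the forward-compatible spelling; leaving the default True
# triggers the deprecation described above.
df.groupby("a").resample("3min", include_groups=False).sum()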
- - See Also - -------- - Grouper : Specify a frequency to resample with when - grouping by a key. - DatetimeIndex.resample : Frequency conversion and resampling of - time series. - - Examples - -------- - >>> idx = pd.date_range('1/1/2000', periods=4, freq='min') - >>> df = pd.DataFrame(data=4 * [range(2)], - ... index=idx, - ... columns=['a', 'b']) - >>> df.iloc[2, 0] = 5 - >>> df - a b - 2000-01-01 00:00:00 0 1 - 2000-01-01 00:01:00 0 1 - 2000-01-01 00:02:00 5 1 - 2000-01-01 00:03:00 0 1 - - Downsample the DataFrame into 3 minute bins and sum the values of - the timestamps falling into a bin. - - >>> df.groupby('a').resample('3min', include_groups=False).sum() - b - a - 0 2000-01-01 00:00:00 2 - 2000-01-01 00:03:00 1 - 5 2000-01-01 00:00:00 1 - - Upsample the series into 30 second bins. - - >>> df.groupby('a').resample('30s', include_groups=False).sum() - b - a - 0 2000-01-01 00:00:00 1 - 2000-01-01 00:00:30 0 - 2000-01-01 00:01:00 1 - 2000-01-01 00:01:30 0 - 2000-01-01 00:02:00 0 - 2000-01-01 00:02:30 0 - 2000-01-01 00:03:00 1 - 5 2000-01-01 00:02:00 1 - - Resample by month. Values are assigned to the month of the period. - - >>> df.groupby('a').resample('ME', include_groups=False).sum() - b - a - 0 2000-01-31 3 - 5 2000-01-31 1 - - Downsample the series into 3 minute bins as above, but close the right - side of the bin interval. - - >>> ( - ... df.groupby('a') - ... .resample('3min', closed='right', include_groups=False) - ... .sum() - ... ) - b - a - 0 1999-12-31 23:57:00 1 - 2000-01-01 00:00:00 2 - 5 2000-01-01 00:00:00 1 - - Downsample the series into 3 minute bins and close the right side of - the bin interval, but label each bin using the right edge instead of - the left. - - >>> ( - ... df.groupby('a') - ... .resample('3min', closed='right', label='right', include_groups=False) - ... .sum() - ... ) - b - a - 0 2000-01-01 00:00:00 1 - 2000-01-01 00:03:00 2 - 5 2000-01-01 00:03:00 1 - """ - from pandas.core.resample import get_resampler_for_grouping - - # mypy flags that include_groups could be specified via `*args` or `**kwargs` - # GH#54961 would resolve. - return get_resampler_for_grouping( # type: ignore[misc] - self, rule, *args, include_groups=include_groups, **kwargs - ) - - @final - def rolling(self, *args, **kwargs) -> RollingGroupby: - """ - Return a rolling grouper, providing rolling functionality per group. - - Parameters - ---------- - window : int, timedelta, str, offset, or BaseIndexer subclass - Size of the moving window. - - If an integer, the fixed number of observations used for - each window. - - If a timedelta, str, or offset, the time period of each window. Each - window will be a variable sized based on the observations included in - the time-period. This is only valid for datetimelike indexes. - To learn more about the offsets & frequency strings, please see `this link - `__. - - If a BaseIndexer subclass, the window boundaries - based on the defined ``get_window_bounds`` method. Additional rolling - keyword arguments, namely ``min_periods``, ``center``, ``closed`` and - ``step`` will be passed to ``get_window_bounds``. - - min_periods : int, default None - Minimum number of observations in window required to have a value; - otherwise, result is ``np.nan``. - - For a window that is specified by an offset, - ``min_periods`` will default to 1. - - For a window that is specified by an integer, ``min_periods`` will default - to the size of the window. - - center : bool, default False - If False, set the window labels as the right edge of the window index. 
- - If True, set the window labels as the center of the window index. - - win_type : str, default None - If ``None``, all points are evenly weighted. - - If a string, it must be a valid `scipy.signal window function - `__. - - Certain Scipy window types require additional parameters to be passed - in the aggregation function. The additional parameters must match - the keywords specified in the Scipy window type method signature. - - on : str, optional - For a DataFrame, a column label or Index level on which - to calculate the rolling window, rather than the DataFrame's index. - - Provided integer column is ignored and excluded from result since - an integer index is not used to calculate the rolling window. - - axis : int or str, default 0 - If ``0`` or ``'index'``, roll across the rows. - - If ``1`` or ``'columns'``, roll across the columns. - - For `Series` this parameter is unused and defaults to 0. - - closed : str, default None - If ``'right'``, the first point in the window is excluded from calculations. - - If ``'left'``, the last point in the window is excluded from calculations. - - If ``'both'``, no points in the window are excluded from calculations. - - If ``'neither'``, the first and last points in the window are excluded - from calculations. - - Default ``None`` (``'right'``). - - method : str {'single', 'table'}, default 'single' - Execute the rolling operation per single column or row (``'single'``) - or over the entire object (``'table'``). - - This argument is only implemented when specifying ``engine='numba'`` - in the method call. - - Returns - ------- - pandas.api.typing.RollingGroupby - Return a new grouper with our rolling appended. - - See Also - -------- - Series.rolling : Calling object with Series data. - DataFrame.rolling : Calling object with DataFrames. - Series.groupby : Apply a function groupby to a Series. - DataFrame.groupby : Apply a function groupby. - - Examples - -------- - >>> df = pd.DataFrame({'A': [1, 1, 2, 2], - ... 'B': [1, 2, 3, 4], - ... 'C': [0.362, 0.227, 1.267, -0.562]}) - >>> df - A B C - 0 1 1 0.362 - 1 1 2 0.227 - 2 2 3 1.267 - 3 2 4 -0.562 - - >>> df.groupby('A').rolling(2).sum() - B C - A - 1 0 NaN NaN - 1 3.0 0.589 - 2 2 NaN NaN - 3 7.0 0.705 - - >>> df.groupby('A').rolling(2, min_periods=1).sum() - B C - A - 1 0 1.0 0.362 - 1 3.0 0.589 - 2 2 3.0 1.267 - 3 7.0 0.705 - - >>> df.groupby('A').rolling(2, on='B').sum() - B C - A - 1 0 1 NaN - 1 2 0.589 - 2 2 3 NaN - 3 4 0.705 - """ - from pandas.core.window import RollingGroupby - - return RollingGroupby( - self._selected_obj, - *args, - _grouper=self.grouper, - _as_index=self.as_index, - **kwargs, - ) - - @final - @Substitution(name="groupby") - @Appender(_common_see_also) - def expanding(self, *args, **kwargs) -> ExpandingGroupby: - """ - Return an expanding grouper, providing expanding - functionality per group. - - Returns - ------- - pandas.api.typing.ExpandingGroupby - """ - from pandas.core.window import ExpandingGroupby - - return ExpandingGroupby( - self._selected_obj, - *args, - _grouper=self.grouper, - **kwargs, - ) - - @final - @Substitution(name="groupby") - @Appender(_common_see_also) - def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby: - """ - Return an ewm grouper, providing ewm functionality per group. 
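# Neither expanding() nor ewm() above carries an Examples section; a small hedged
# sketch of per-group expanding and exponentially weighted means (only the expanding
# values are spelled out, the ewm line is illustrative):
import pandas as pd

ser = pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "a", "b", "b"])
ser.groupby(level=0).expanding().mean()     # a: 1.0, 1.5    b: 3.0, 3.5
ser.groupby(level=0).ewm(alpha=0.5).mean()  # EW mean computed within each group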
- - Returns - ------- - pandas.api.typing.ExponentialMovingWindowGroupby - """ - from pandas.core.window import ExponentialMovingWindowGroupby - - return ExponentialMovingWindowGroupby( - self._selected_obj, - *args, - _grouper=self.grouper, - **kwargs, - ) - - @final - def _fill(self, direction: Literal["ffill", "bfill"], limit: int | None = None): - """ - Shared function for `pad` and `backfill` to call Cython method. - - Parameters - ---------- - direction : {'ffill', 'bfill'} - Direction passed to underlying Cython function. `bfill` will cause - values to be filled backwards. `ffill` and any other values will - default to a forward fill - limit : int, default None - Maximum number of consecutive values to fill. If `None`, this - method will convert to -1 prior to passing to Cython - - Returns - ------- - `Series` or `DataFrame` with filled values - - See Also - -------- - pad : Returns Series with minimum number of char in object. - backfill : Backward fill the missing values in the dataset. - """ - # Need int value for Cython - if limit is None: - limit = -1 - - ids, _, _ = self.grouper.group_info - sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False) - if direction == "bfill": - sorted_labels = sorted_labels[::-1] - - col_func = partial( - libgroupby.group_fillna_indexer, - labels=ids, - sorted_labels=sorted_labels, - limit=limit, - dropna=self.dropna, - ) - - def blk_func(values: ArrayLike) -> ArrayLike: - mask = isna(values) - if values.ndim == 1: - indexer = np.empty(values.shape, dtype=np.intp) - col_func(out=indexer, mask=mask) - return algorithms.take_nd(values, indexer) - - else: - # We broadcast algorithms.take_nd analogous to - # np.take_along_axis - if isinstance(values, np.ndarray): - dtype = values.dtype - if self.grouper.has_dropped_na: - # dropped null groups give rise to nan in the result - dtype = ensure_dtype_can_hold_na(values.dtype) - out = np.empty(values.shape, dtype=dtype) - else: - # Note: we only get here with backfill/pad, - # so if we have a dtype that cannot hold NAs, - # then there will be no -1s in indexer, so we can use - # the original dtype (no need to ensure_dtype_can_hold_na) - out = type(values)._empty(values.shape, dtype=values.dtype) - - for i, value_element in enumerate(values): - # call group_fillna_indexer column-wise - indexer = np.empty(values.shape[1], dtype=np.intp) - col_func(out=indexer, mask=mask[i]) - out[i, :] = algorithms.take_nd(value_element, indexer) - return out - - mgr = self._get_data_to_aggregate() - res_mgr = mgr.apply(blk_func) - - new_obj = self._wrap_agged_manager(res_mgr) - - if self.axis == 1: - # Only relevant for DataFrameGroupBy - new_obj = new_obj.T - new_obj.columns = self.obj.columns - - new_obj.index = self.obj.index - return new_obj - - @final - @Substitution(name="groupby") - def ffill(self, limit: int | None = None): - """ - Forward fill the values. - - Parameters - ---------- - limit : int, optional - Limit of how many values to fill. - - Returns - ------- - Series or DataFrame - Object with missing values filled. - - See Also - -------- - Series.ffill: Returns Series with minimum number of char in object. - DataFrame.ffill: Object with missing values filled or None if inplace=True. - Series.fillna: Fill NaN values of a Series. - DataFrame.fillna: Fill NaN values of a DataFrame. 
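# Conceptual sketch of the group-wise fill implemented by _fill() above: for this
# purpose the Cython indexer path behaves like forward filling each column within
# each group (the transform spelling below is the slow but readable form).
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": [0, 0, 1, 1], "A": [np.nan, 2.0, np.nan, 3.0]})
fast = df.groupby("key").ffill()                                     # Cython path
slow = df[["A"]].groupby(df["key"]).transform(lambda s: s.ffill())   # readable equivalent
# both leave the leading NaN of each group untouched and carry values forward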
- - Examples - -------- - - For SeriesGroupBy: - - >>> key = [0, 0, 1, 1] - >>> ser = pd.Series([np.nan, 2, 3, np.nan], index=key) - >>> ser - 0 NaN - 0 2.0 - 1 3.0 - 1 NaN - dtype: float64 - >>> ser.groupby(level=0).ffill() - 0 NaN - 0 2.0 - 1 3.0 - 1 3.0 - dtype: float64 - - For DataFrameGroupBy: - - >>> df = pd.DataFrame( - ... { - ... "key": [0, 0, 1, 1, 1], - ... "A": [np.nan, 2, np.nan, 3, np.nan], - ... "B": [2, 3, np.nan, np.nan, np.nan], - ... "C": [np.nan, np.nan, 2, np.nan, np.nan], - ... } - ... ) - >>> df - key A B C - 0 0 NaN 2.0 NaN - 1 0 2.0 3.0 NaN - 2 1 NaN NaN 2.0 - 3 1 3.0 NaN NaN - 4 1 NaN NaN NaN - - Propagate non-null values forward or backward within each group along columns. - - >>> df.groupby("key").ffill() - A B C - 0 NaN 2.0 NaN - 1 2.0 3.0 NaN - 2 NaN NaN 2.0 - 3 3.0 NaN 2.0 - 4 3.0 NaN 2.0 - - Propagate non-null values forward or backward within each group along rows. - - >>> df.T.groupby(np.array([0, 0, 1, 1])).ffill().T - key A B C - 0 0.0 0.0 2.0 2.0 - 1 0.0 2.0 3.0 3.0 - 2 1.0 1.0 NaN 2.0 - 3 1.0 3.0 NaN NaN - 4 1.0 1.0 NaN NaN - - Only replace the first NaN element within a group along rows. - - >>> df.groupby("key").ffill(limit=1) - A B C - 0 NaN 2.0 NaN - 1 2.0 3.0 NaN - 2 NaN NaN 2.0 - 3 3.0 NaN 2.0 - 4 3.0 NaN NaN - """ - return self._fill("ffill", limit=limit) - - @final - @Substitution(name="groupby") - def bfill(self, limit: int | None = None): - """ - Backward fill the values. - - Parameters - ---------- - limit : int, optional - Limit of how many values to fill. - - Returns - ------- - Series or DataFrame - Object with missing values filled. - - See Also - -------- - Series.bfill : Backward fill the missing values in the dataset. - DataFrame.bfill: Backward fill the missing values in the dataset. - Series.fillna: Fill NaN values of a Series. - DataFrame.fillna: Fill NaN values of a DataFrame. - - Examples - -------- - - With Series: - - >>> index = ['Falcon', 'Falcon', 'Parrot', 'Parrot', 'Parrot'] - >>> s = pd.Series([None, 1, None, None, 3], index=index) - >>> s - Falcon NaN - Falcon 1.0 - Parrot NaN - Parrot NaN - Parrot 3.0 - dtype: float64 - >>> s.groupby(level=0).bfill() - Falcon 1.0 - Falcon 1.0 - Parrot 3.0 - Parrot 3.0 - Parrot 3.0 - dtype: float64 - >>> s.groupby(level=0).bfill(limit=1) - Falcon 1.0 - Falcon 1.0 - Parrot NaN - Parrot 3.0 - Parrot 3.0 - dtype: float64 - - With DataFrame: - - >>> df = pd.DataFrame({'A': [1, None, None, None, 4], - ... 'B': [None, None, 5, None, 7]}, index=index) - >>> df - A B - Falcon 1.0 NaN - Falcon NaN NaN - Parrot NaN 5.0 - Parrot NaN NaN - Parrot 4.0 7.0 - >>> df.groupby(level=0).bfill() - A B - Falcon 1.0 NaN - Falcon NaN NaN - Parrot 4.0 5.0 - Parrot 4.0 7.0 - Parrot 4.0 7.0 - >>> df.groupby(level=0).bfill(limit=1) - A B - Falcon 1.0 NaN - Falcon NaN NaN - Parrot NaN 5.0 - Parrot 4.0 7.0 - Parrot 4.0 7.0 - """ - return self._fill("bfill", limit=limit) - - @final - @property - @Substitution(name="groupby") - @Substitution(see_also=_common_see_also) - def nth(self) -> GroupByNthSelector: - """ - Take the nth row from each group if n is an int, otherwise a subset of rows. - - Can be either a call or an index. dropna is not available with index notation. - Index notation accepts a comma separated list of integers and slices. - - If dropna, will take the nth non-null row, dropna is either - 'all' or 'any'; this is equivalent to calling dropna(how=dropna) - before the groupby. 
- - Parameters - ---------- - n : int, slice or list of ints and slices - A single nth value for the row or a list of nth values or slices. - - .. versionchanged:: 1.4.0 - Added slice and lists containing slices. - Added index notation. - - dropna : {'any', 'all', None}, default None - Apply the specified dropna operation before counting which row is - the nth row. Only supported if n is an int. - - Returns - ------- - Series or DataFrame - N-th value within each group. - %(see_also)s - Examples - -------- - - >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], - ... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B']) - >>> g = df.groupby('A') - >>> g.nth(0) - A B - 0 1 NaN - 2 2 3.0 - >>> g.nth(1) - A B - 1 1 2.0 - 4 2 5.0 - >>> g.nth(-1) - A B - 3 1 4.0 - 4 2 5.0 - >>> g.nth([0, 1]) - A B - 0 1 NaN - 1 1 2.0 - 2 2 3.0 - 4 2 5.0 - >>> g.nth(slice(None, -1)) - A B - 0 1 NaN - 1 1 2.0 - 2 2 3.0 - - Index notation may also be used - - >>> g.nth[0, 1] - A B - 0 1 NaN - 1 1 2.0 - 2 2 3.0 - 4 2 5.0 - >>> g.nth[:-1] - A B - 0 1 NaN - 1 1 2.0 - 2 2 3.0 - - Specifying `dropna` allows ignoring ``NaN`` values - - >>> g.nth(0, dropna='any') - A B - 1 1 2.0 - 2 2 3.0 - - When the specified ``n`` is larger than any of the groups, an - empty DataFrame is returned - - >>> g.nth(3, dropna='any') - Empty DataFrame - Columns: [A, B] - Index: [] - """ - return GroupByNthSelector(self) - - def _nth( - self, - n: PositionalIndexer | tuple, - dropna: Literal["any", "all", None] = None, - ) -> NDFrameT: - if not dropna: - mask = self._make_mask_from_positional_indexer(n) - - ids, _, _ = self.grouper.group_info - - # Drop NA values in grouping - mask = mask & (ids != -1) - - out = self._mask_selected_obj(mask) - return out - - # dropna is truthy - if not is_integer(n): - raise ValueError("dropna option only supported for an integer argument") - - if dropna not in ["any", "all"]: - # Note: when agg-ing picker doesn't raise this, just returns NaN - raise ValueError( - "For a DataFrame or Series groupby.nth, dropna must be " - "either None, 'any' or 'all', " - f"(was passed {dropna})." - ) - - # old behaviour, but with all and any support for DataFrames. - # modified in GH 7559 to have better perf - n = cast(int, n) - dropped = self._selected_obj.dropna(how=dropna, axis=self.axis) - - # get a new grouper for our dropped obj - grouper: np.ndarray | Index | ops.BaseGrouper - if len(dropped) == len(self._selected_obj): - # Nothing was dropped, can use the same grouper - grouper = self.grouper - else: - # we don't have the grouper info available - # (e.g. we have selected out - # a column that is not in the current object) - axis = self.grouper.axis - grouper = self.grouper.codes_info[axis.isin(dropped.index)] - if self.grouper.has_dropped_na: - # Null groups need to still be encoded as -1 when passed to groupby - nulls = grouper == -1 - # error: No overload variant of "where" matches argument types - # "Any", "NAType", "Any" - values = np.where(nulls, NA, grouper) # type: ignore[call-overload] - grouper = Index(values, dtype="Int64") - - if self.axis == 1: - grb = dropped.T.groupby(grouper, as_index=self.as_index, sort=self.sort) - else: - grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort) - return grb.nth(n) - - @final - def quantile( - self, - q: float | AnyArrayLike = 0.5, - interpolation: str = "linear", - numeric_only: bool = False, - ): - """ - Return group values at the given quantile, a la numpy.percentile. 
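# Hedged sketch of the interpolation keyword described in the Parameters below: with
# an even number of observations the median falls between two points, and the
# interpolation method decides how that gap is resolved.
import pandas as pd

df = pd.DataFrame({"key": ["a", "a"], "val": [1, 2]})
df.groupby("key")["val"].quantile(0.5)                          # 1.5 (linear, default)
df.groupby("key")["val"].quantile(0.5, interpolation="lower")   # 1   (take the lower point)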
- - Parameters - ---------- - q : float or array-like, default 0.5 (50% quantile) - Value(s) between 0 and 1 providing the quantile(s) to compute. - interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} - Method to use when the desired quantile falls between two points. - numeric_only : bool, default False - Include only `float`, `int` or `boolean` data. - - .. versionadded:: 1.5.0 - - .. versionchanged:: 2.0.0 - - numeric_only now defaults to ``False``. - - Returns - ------- - Series or DataFrame - Return type determined by caller of GroupBy object. - - See Also - -------- - Series.quantile : Similar method for Series. - DataFrame.quantile : Similar method for DataFrame. - numpy.percentile : NumPy method to compute qth percentile. - - Examples - -------- - >>> df = pd.DataFrame([ - ... ['a', 1], ['a', 2], ['a', 3], - ... ['b', 1], ['b', 3], ['b', 5] - ... ], columns=['key', 'val']) - >>> df.groupby('key').quantile() - val - key - a 2.0 - b 3.0 - """ - mgr = self._get_data_to_aggregate(numeric_only=numeric_only, name="quantile") - obj = self._wrap_agged_manager(mgr) - if self.axis == 1: - splitter = self.grouper._get_splitter(obj.T, axis=self.axis) - sdata = splitter._sorted_data.T - else: - splitter = self.grouper._get_splitter(obj, axis=self.axis) - sdata = splitter._sorted_data - - starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups) - - def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]: - if is_object_dtype(vals.dtype): - raise TypeError( - "'quantile' cannot be performed against 'object' dtypes!" - ) - - inference: DtypeObj | None = None - if isinstance(vals, BaseMaskedArray) and is_numeric_dtype(vals.dtype): - out = vals.to_numpy(dtype=float, na_value=np.nan) - inference = vals.dtype - elif is_integer_dtype(vals.dtype): - if isinstance(vals, ExtensionArray): - out = vals.to_numpy(dtype=float, na_value=np.nan) - else: - out = vals - inference = np.dtype(np.int64) - elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray): - out = vals.to_numpy(dtype=float, na_value=np.nan) - elif is_bool_dtype(vals.dtype): - # GH#51424 deprecate to match Series/DataFrame behavior - warnings.warn( - f"Allowing bool dtype in {type(self).__name__}.quantile is " - "deprecated and will raise in a future version, matching " - "the Series/DataFrame behavior. Cast to uint8 dtype before " - "calling quantile instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - out = np.asarray(vals) - elif needs_i8_conversion(vals.dtype): - inference = vals.dtype - # In this case we need to delay the casting until after the - # np.lexsort below. 
- # error: Incompatible return value type (got - # "Tuple[Union[ExtensionArray, ndarray[Any, Any]], Union[Any, - # ExtensionDtype]]", expected "Tuple[ndarray[Any, Any], - # Optional[Union[dtype[Any], ExtensionDtype]]]") - return vals, inference # type: ignore[return-value] - elif isinstance(vals, ExtensionArray) and is_float_dtype(vals.dtype): - inference = np.dtype(np.float64) - out = vals.to_numpy(dtype=float, na_value=np.nan) - else: - out = np.asarray(vals) - - return out, inference - - def post_processor( - vals: np.ndarray, - inference: DtypeObj | None, - result_mask: np.ndarray | None, - orig_vals: ArrayLike, - ) -> ArrayLike: - if inference: - # Check for edge case - if isinstance(orig_vals, BaseMaskedArray): - assert result_mask is not None # for mypy - - if interpolation in {"linear", "midpoint"} and not is_float_dtype( - orig_vals - ): - return FloatingArray(vals, result_mask) - else: - # Item "ExtensionDtype" of "Union[ExtensionDtype, str, - # dtype[Any], Type[object]]" has no attribute "numpy_dtype" - # [union-attr] - with warnings.catch_warnings(): - # vals.astype with nan can warn with numpy >1.24 - warnings.filterwarnings("ignore", category=RuntimeWarning) - return type(orig_vals)( - vals.astype( - inference.numpy_dtype # type: ignore[union-attr] - ), - result_mask, - ) - - elif not ( - is_integer_dtype(inference) - and interpolation in {"linear", "midpoint"} - ): - if needs_i8_conversion(inference): - # error: Item "ExtensionArray" of "Union[ExtensionArray, - # ndarray[Any, Any]]" has no attribute "_ndarray" - vals = vals.astype("i8").view( - orig_vals._ndarray.dtype # type: ignore[union-attr] - ) - # error: Item "ExtensionArray" of "Union[ExtensionArray, - # ndarray[Any, Any]]" has no attribute "_from_backing_data" - return orig_vals._from_backing_data( # type: ignore[union-attr] - vals - ) - - assert isinstance(inference, np.dtype) # for mypy - return vals.astype(inference) - - return vals - - qs = np.array(q, dtype=np.float64) - pass_qs: np.ndarray | None = qs - if is_scalar(q): - qs = np.array([q], dtype=np.float64) - pass_qs = None - - ids, _, ngroups = self.grouper.group_info - nqs = len(qs) - - func = partial( - libgroupby.group_quantile, - labels=ids, - qs=qs, - interpolation=interpolation, - starts=starts, - ends=ends, - ) - - def blk_func(values: ArrayLike) -> ArrayLike: - orig_vals = values - if isinstance(values, BaseMaskedArray): - mask = values._mask - result_mask = np.zeros((ngroups, nqs), dtype=np.bool_) - else: - mask = isna(values) - result_mask = None - - is_datetimelike = needs_i8_conversion(values.dtype) - - vals, inference = pre_processor(values) - - ncols = 1 - if vals.ndim == 2: - ncols = vals.shape[0] - - out = np.empty((ncols, ngroups, nqs), dtype=np.float64) - - if is_datetimelike: - vals = vals.view("i8") - - if vals.ndim == 1: - # EA is always 1d - func( - out[0], - values=vals, - mask=mask, - result_mask=result_mask, - is_datetimelike=is_datetimelike, - ) - else: - for i in range(ncols): - func( - out[i], - values=vals[i], - mask=mask[i], - result_mask=None, - is_datetimelike=is_datetimelike, - ) - - if vals.ndim == 1: - out = out.ravel("K") - if result_mask is not None: - result_mask = result_mask.ravel("K") - else: - out = out.reshape(ncols, ngroups * nqs) - - return post_processor(out, inference, result_mask, orig_vals) - - res_mgr = sdata._mgr.grouped_reduce(blk_func) - - res = self._wrap_agged_manager(res_mgr) - return self._wrap_aggregated_output(res, qs=pass_qs) - - @final - @Substitution(name="groupby") - def ngroup(self, ascending: 
bool = True): - """ - Number each group from 0 to the number of groups - 1. - - This is the enumerative complement of cumcount. Note that the - numbers given to the groups match the order in which the groups - would be seen when iterating over the groupby object, not the - order they are first observed. - - Groups with missing keys (where `pd.isna()` is True) will be labeled with `NaN` - and will be skipped from the count. - - Parameters - ---------- - ascending : bool, default True - If False, number in reverse, from number of group - 1 to 0. - - Returns - ------- - Series - Unique numbers for each group. - - See Also - -------- - .cumcount : Number the rows in each group. - - Examples - -------- - >>> df = pd.DataFrame({"color": ["red", None, "red", "blue", "blue", "red"]}) - >>> df - color - 0 red - 1 None - 2 red - 3 blue - 4 blue - 5 red - >>> df.groupby("color").ngroup() - 0 1.0 - 1 NaN - 2 1.0 - 3 0.0 - 4 0.0 - 5 1.0 - dtype: float64 - >>> df.groupby("color", dropna=False).ngroup() - 0 1 - 1 2 - 2 1 - 3 0 - 4 0 - 5 1 - dtype: int64 - >>> df.groupby("color", dropna=False).ngroup(ascending=False) - 0 1 - 1 0 - 2 1 - 3 2 - 4 2 - 5 1 - dtype: int64 - """ - obj = self._obj_with_exclusions - index = obj._get_axis(self.axis) - comp_ids = self.grouper.group_info[0] - - dtype: type - if self.grouper.has_dropped_na: - comp_ids = np.where(comp_ids == -1, np.nan, comp_ids) - dtype = np.float64 - else: - dtype = np.int64 - - if any(ping._passed_categorical for ping in self.grouper.groupings): - # comp_ids reflect non-observed groups, we need only observed - comp_ids = rank_1d(comp_ids, ties_method="dense") - 1 - - result = self._obj_1d_constructor(comp_ids, index, dtype=dtype) - if not ascending: - result = self.ngroups - 1 - result - return result - - @final - @Substitution(name="groupby") - def cumcount(self, ascending: bool = True): - """ - Number each item in each group from 0 to the length of that group - 1. - - Essentially this is equivalent to - - .. code-block:: python - - self.apply(lambda x: pd.Series(np.arange(len(x)), x.index)) - - Parameters - ---------- - ascending : bool, default True - If False, number in reverse, from length of group - 1 to 0. - - Returns - ------- - Series - Sequence number of each element within each group. - - See Also - -------- - .ngroup : Number the groups themselves. - - Examples - -------- - >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']], - ... columns=['A']) - >>> df - A - 0 a - 1 a - 2 a - 3 b - 4 b - 5 a - >>> df.groupby('A').cumcount() - 0 0 - 1 1 - 2 2 - 3 0 - 4 1 - 5 3 - dtype: int64 - >>> df.groupby('A').cumcount(ascending=False) - 0 3 - 1 2 - 2 1 - 3 1 - 4 0 - 5 0 - dtype: int64 - """ - index = self._obj_with_exclusions._get_axis(self.axis) - cumcounts = self._cumcount_array(ascending=ascending) - return self._obj_1d_constructor(cumcounts, index) - - @final - @Substitution(name="groupby") - @Substitution(see_also=_common_see_also) - def rank( - self, - method: str = "average", - ascending: bool = True, - na_option: str = "keep", - pct: bool = False, - axis: AxisInt | lib.NoDefault = lib.no_default, - ) -> NDFrameT: - """ - Provide the rank of values within each group. - - Parameters - ---------- - method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' - * average: average rank of group. - * min: lowest rank in group. - * max: highest rank in group. - * first: ranks assigned in order they appear in the array. - * dense: like 'min', but rank always increases by 1 between groups. 
- ascending : bool, default True - False for ranks by high (1) to low (N). - na_option : {'keep', 'top', 'bottom'}, default 'keep' - * keep: leave NA values where they are. - * top: smallest rank if ascending. - * bottom: smallest rank if descending. - pct : bool, default False - Compute percentage rank of data within each group. - axis : int, default 0 - The axis of the object over which to compute the rank. - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - - Returns - ------- - DataFrame with ranking of values within each group - %(see_also)s - Examples - -------- - >>> df = pd.DataFrame( - ... { - ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], - ... "value": [2, 4, 2, 3, 5, 1, 2, 4, 1, 5], - ... } - ... ) - >>> df - group value - 0 a 2 - 1 a 4 - 2 a 2 - 3 a 3 - 4 a 5 - 5 b 1 - 6 b 2 - 7 b 4 - 8 b 1 - 9 b 5 - >>> for method in ['average', 'min', 'max', 'dense', 'first']: - ... df[f'{method}_rank'] = df.groupby('group')['value'].rank(method) - >>> df - group value average_rank min_rank max_rank dense_rank first_rank - 0 a 2 1.5 1.0 2.0 1.0 1.0 - 1 a 4 4.0 4.0 4.0 3.0 4.0 - 2 a 2 1.5 1.0 2.0 1.0 2.0 - 3 a 3 3.0 3.0 3.0 2.0 3.0 - 4 a 5 5.0 5.0 5.0 4.0 5.0 - 5 b 1 1.5 1.0 2.0 1.0 1.0 - 6 b 2 3.0 3.0 3.0 2.0 3.0 - 7 b 4 4.0 4.0 4.0 3.0 4.0 - 8 b 1 1.5 1.0 2.0 1.0 2.0 - 9 b 5 5.0 5.0 5.0 4.0 5.0 - """ - if na_option not in {"keep", "top", "bottom"}: - msg = "na_option must be one of 'keep', 'top', or 'bottom'" - raise ValueError(msg) - - if axis is not lib.no_default: - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "rank") - else: - axis = 0 - - kwargs = { - "ties_method": method, - "ascending": ascending, - "na_option": na_option, - "pct": pct, - } - if axis != 0: - # DataFrame uses different keyword name - kwargs["method"] = kwargs.pop("ties_method") - f = lambda x: x.rank(axis=axis, numeric_only=False, **kwargs) - result = self._python_apply_general( - f, self._selected_obj, is_transform=True - ) - return result - - return self._cython_transform( - "rank", - numeric_only=False, - axis=axis, - **kwargs, - ) - - @final - @Substitution(name="groupby") - @Substitution(see_also=_common_see_also) - def cumprod( - self, axis: Axis | lib.NoDefault = lib.no_default, *args, **kwargs - ) -> NDFrameT: - """ - Cumulative product for each group. - - Returns - ------- - Series or DataFrame - %(see_also)s - Examples - -------- - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'b'] - >>> ser = pd.Series([6, 2, 0], index=lst) - >>> ser - a 6 - a 2 - b 0 - dtype: int64 - >>> ser.groupby(level=0).cumprod() - a 6 - a 12 - b 0 - dtype: int64 - - For DataFrameGroupBy: - - >>> data = [[1, 8, 2], [1, 2, 5], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... 
index=["cow", "horse", "bull"]) - >>> df - a b c - cow 1 8 2 - horse 1 2 5 - bull 2 6 9 - >>> df.groupby("a").groups - {1: ['cow', 'horse'], 2: ['bull']} - >>> df.groupby("a").cumprod() - b c - cow 8 2 - horse 16 10 - bull 6 9 - """ - nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"]) - if axis is not lib.no_default: - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "cumprod") - else: - axis = 0 - - if axis != 0: - f = lambda x: x.cumprod(axis=axis, **kwargs) - return self._python_apply_general(f, self._selected_obj, is_transform=True) - - return self._cython_transform("cumprod", **kwargs) - - @final - @Substitution(name="groupby") - @Substitution(see_also=_common_see_also) - def cumsum( - self, axis: Axis | lib.NoDefault = lib.no_default, *args, **kwargs - ) -> NDFrameT: - """ - Cumulative sum for each group. - - Returns - ------- - Series or DataFrame - %(see_also)s - Examples - -------- - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'b'] - >>> ser = pd.Series([6, 2, 0], index=lst) - >>> ser - a 6 - a 2 - b 0 - dtype: int64 - >>> ser.groupby(level=0).cumsum() - a 6 - a 8 - b 0 - dtype: int64 - - For DataFrameGroupBy: - - >>> data = [[1, 8, 2], [1, 2, 5], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["fox", "gorilla", "lion"]) - >>> df - a b c - fox 1 8 2 - gorilla 1 2 5 - lion 2 6 9 - >>> df.groupby("a").groups - {1: ['fox', 'gorilla'], 2: ['lion']} - >>> df.groupby("a").cumsum() - b c - fox 8 2 - gorilla 10 7 - lion 6 9 - """ - nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"]) - if axis is not lib.no_default: - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "cumsum") - else: - axis = 0 - - if axis != 0: - f = lambda x: x.cumsum(axis=axis, **kwargs) - return self._python_apply_general(f, self._selected_obj, is_transform=True) - - return self._cython_transform("cumsum", **kwargs) - - @final - @Substitution(name="groupby") - @Substitution(see_also=_common_see_also) - def cummin( - self, - axis: AxisInt | lib.NoDefault = lib.no_default, - numeric_only: bool = False, - **kwargs, - ) -> NDFrameT: - """ - Cumulative min for each group. - - Returns - ------- - Series or DataFrame - %(see_also)s - Examples - -------- - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] - >>> ser = pd.Series([1, 6, 2, 3, 0, 4], index=lst) - >>> ser - a 1 - a 6 - a 2 - b 3 - b 0 - b 4 - dtype: int64 - >>> ser.groupby(level=0).cummin() - a 1 - a 1 - a 1 - b 3 - b 0 - b 0 - dtype: int64 - - For DataFrameGroupBy: - - >>> data = [[1, 0, 2], [1, 1, 5], [6, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... 
index=["snake", "rabbit", "turtle"]) - >>> df - a b c - snake 1 0 2 - rabbit 1 1 5 - turtle 6 6 9 - >>> df.groupby("a").groups - {1: ['snake', 'rabbit'], 6: ['turtle']} - >>> df.groupby("a").cummin() - b c - snake 0 2 - rabbit 0 2 - turtle 6 9 - """ - skipna = kwargs.get("skipna", True) - if axis is not lib.no_default: - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "cummin") - else: - axis = 0 - - if axis != 0: - f = lambda x: np.minimum.accumulate(x, axis) - obj = self._selected_obj - if numeric_only: - obj = obj._get_numeric_data() - return self._python_apply_general(f, obj, is_transform=True) - - return self._cython_transform( - "cummin", numeric_only=numeric_only, skipna=skipna - ) - - @final - @Substitution(name="groupby") - @Substitution(see_also=_common_see_also) - def cummax( - self, - axis: AxisInt | lib.NoDefault = lib.no_default, - numeric_only: bool = False, - **kwargs, - ) -> NDFrameT: - """ - Cumulative max for each group. - - Returns - ------- - Series or DataFrame - %(see_also)s - Examples - -------- - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] - >>> ser = pd.Series([1, 6, 2, 3, 1, 4], index=lst) - >>> ser - a 1 - a 6 - a 2 - b 3 - b 1 - b 4 - dtype: int64 - >>> ser.groupby(level=0).cummax() - a 1 - a 6 - a 6 - b 3 - b 3 - b 4 - dtype: int64 - - For DataFrameGroupBy: - - >>> data = [[1, 8, 2], [1, 1, 0], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["cow", "horse", "bull"]) - >>> df - a b c - cow 1 8 2 - horse 1 1 0 - bull 2 6 9 - >>> df.groupby("a").groups - {1: ['cow', 'horse'], 2: ['bull']} - >>> df.groupby("a").cummax() - b c - cow 8 2 - horse 8 2 - bull 6 9 - """ - skipna = kwargs.get("skipna", True) - if axis is not lib.no_default: - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "cummax") - else: - axis = 0 - - if axis != 0: - f = lambda x: np.maximum.accumulate(x, axis) - obj = self._selected_obj - if numeric_only: - obj = obj._get_numeric_data() - return self._python_apply_general(f, obj, is_transform=True) - - return self._cython_transform( - "cummax", numeric_only=numeric_only, skipna=skipna - ) - - @final - @Substitution(name="groupby") - def shift( - self, - periods: int | Sequence[int] = 1, - freq=None, - axis: Axis | lib.NoDefault = lib.no_default, - fill_value=lib.no_default, - suffix: str | None = None, - ): - """ - Shift each group by periods observations. - - If freq is passed, the index will be increased using the periods and the freq. - - Parameters - ---------- - periods : int | Sequence[int], default 1 - Number of periods to shift. If a list of values, shift each group by - each period. - freq : str, optional - Frequency string. - axis : axis to shift, default 0 - Shift direction. - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - - fill_value : optional - The scalar value to use for newly introduced missing values. - - .. versionchanged:: 2.1.0 - Will raise a ``ValueError`` if ``freq`` is provided too. - - suffix : str, optional - A string to add to each shifted column if there are multiple periods. - Ignored otherwise. - - Returns - ------- - Series or DataFrame - Object shifted within each group. - - See Also - -------- - Index.shift : Shift values of Index. 
- - Examples - -------- - - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'b', 'b'] - >>> ser = pd.Series([1, 2, 3, 4], index=lst) - >>> ser - a 1 - a 2 - b 3 - b 4 - dtype: int64 - >>> ser.groupby(level=0).shift(1) - a NaN - a 1.0 - b NaN - b 3.0 - dtype: float64 - - For DataFrameGroupBy: - - >>> data = [[1, 2, 3], [1, 5, 6], [2, 5, 8], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["tuna", "salmon", "catfish", "goldfish"]) - >>> df - a b c - tuna 1 2 3 - salmon 1 5 6 - catfish 2 5 8 - goldfish 2 6 9 - >>> df.groupby("a").shift(1) - b c - tuna NaN NaN - salmon 2.0 3.0 - catfish NaN NaN - goldfish 5.0 8.0 - """ - if axis is not lib.no_default: - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "shift") - else: - axis = 0 - - if is_list_like(periods): - if axis == 1: - raise ValueError( - "If `periods` contains multiple shifts, `axis` cannot be 1." - ) - periods = cast(Sequence, periods) - if len(periods) == 0: - raise ValueError("If `periods` is an iterable, it cannot be empty.") - from pandas.core.reshape.concat import concat - - add_suffix = True - else: - if not is_integer(periods): - raise TypeError( - f"Periods must be integer, but {periods} is {type(periods)}." - ) - if suffix: - raise ValueError("Cannot specify `suffix` if `periods` is an int.") - periods = [cast(int, periods)] - add_suffix = False - - shifted_dataframes = [] - for period in periods: - if not is_integer(period): - raise TypeError( - f"Periods must be integer, but {period} is {type(period)}." - ) - period = cast(int, period) - if freq is not None or axis != 0: - f = lambda x: x.shift( - period, freq, axis, fill_value # pylint: disable=cell-var-from-loop - ) - shifted = self._python_apply_general( - f, self._selected_obj, is_transform=True - ) - else: - if fill_value is lib.no_default: - fill_value = None - ids, _, ngroups = self.grouper.group_info - res_indexer = np.zeros(len(ids), dtype=np.int64) - - libgroupby.group_shift_indexer(res_indexer, ids, ngroups, period) - - obj = self._obj_with_exclusions - - shifted = obj._reindex_with_indexers( - {self.axis: (obj.axes[self.axis], res_indexer)}, - fill_value=fill_value, - allow_dups=True, - ) - - if add_suffix: - if isinstance(shifted, Series): - shifted = cast(NDFrameT, shifted.to_frame()) - shifted = shifted.add_suffix( - f"{suffix}_{period}" if suffix else f"_{period}" - ) - shifted_dataframes.append(cast(Union[Series, DataFrame], shifted)) - - return ( - shifted_dataframes[0] - if len(shifted_dataframes) == 1 - else concat(shifted_dataframes, axis=1) - ) - - @final - @Substitution(name="groupby") - @Substitution(see_also=_common_see_also) - def diff( - self, periods: int = 1, axis: AxisInt | lib.NoDefault = lib.no_default - ) -> NDFrameT: - """ - First discrete difference of element. - - Calculates the difference of each element compared with another - element in the group (default is element in previous row). - - Parameters - ---------- - periods : int, default 1 - Periods to shift for calculating difference, accepts negative values. - axis : axis to shift, default 0 - Take difference over rows (0) or columns (1). - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - - Returns - ------- - Series or DataFrame - First differences. 
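# The diff implementation further down reduces to "group values minus the group-wise
# shift"; a minimal sketch of that identity, using the Series from the Examples below:
import pandas as pd

ser = pd.Series([7, 2, 8, 4, 3, 3], index=["a", "a", "a", "b", "b", "b"])
gb = ser.groupby(level=0)
gb.diff()           # NaN, -5.0, 6.0, NaN, -1.0, 0.0
ser - gb.shift(1)   # same values (int8/int16 inputs are additionally coerced to float32)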
- %(see_also)s - Examples - -------- - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] - >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) - >>> ser - a 7 - a 2 - a 8 - b 4 - b 3 - b 3 - dtype: int64 - >>> ser.groupby(level=0).diff() - a NaN - a -5.0 - a 6.0 - b NaN - b -1.0 - b 0.0 - dtype: float64 - - For DataFrameGroupBy: - - >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} - >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', - ... 'mouse', 'mouse', 'mouse', 'mouse']) - >>> df - a b - dog 1 1 - dog 3 4 - dog 5 8 - mouse 7 4 - mouse 7 4 - mouse 8 2 - mouse 3 1 - >>> df.groupby(level=0).diff() - a b - dog NaN NaN - dog 2.0 3.0 - dog 2.0 4.0 - mouse NaN NaN - mouse 0.0 0.0 - mouse 1.0 -2.0 - mouse -5.0 -1.0 - """ - if axis is not lib.no_default: - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "diff") - else: - axis = 0 - - if axis != 0: - return self.apply(lambda x: x.diff(periods=periods, axis=axis)) - - obj = self._obj_with_exclusions - shifted = self.shift(periods=periods) - - # GH45562 - to retain existing behavior and match behavior of Series.diff(), - # int8 and int16 are coerced to float32 rather than float64. - dtypes_to_f32 = ["int8", "int16"] - if obj.ndim == 1: - if obj.dtype in dtypes_to_f32: - shifted = shifted.astype("float32") - else: - to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32] - if len(to_coerce): - shifted = shifted.astype({c: "float32" for c in to_coerce}) - - return obj - shifted - - @final - @Substitution(name="groupby") - @Substitution(see_also=_common_see_also) - def pct_change( - self, - periods: int = 1, - fill_method: FillnaOptions | None | lib.NoDefault = lib.no_default, - limit: int | None | lib.NoDefault = lib.no_default, - freq=None, - axis: Axis | lib.NoDefault = lib.no_default, - ): - """ - Calculate pct_change of each value to previous entry in group. - - Returns - ------- - Series or DataFrame - Percentage changes within each group. - %(see_also)s - Examples - -------- - - For SeriesGroupBy: - - >>> lst = ['a', 'a', 'b', 'b'] - >>> ser = pd.Series([1, 2, 3, 4], index=lst) - >>> ser - a 1 - a 2 - b 3 - b 4 - dtype: int64 - >>> ser.groupby(level=0).pct_change() - a NaN - a 1.000000 - b NaN - b 0.333333 - dtype: float64 - - For DataFrameGroupBy: - - >>> data = [[1, 2, 3], [1, 5, 6], [2, 5, 8], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["tuna", "salmon", "catfish", "goldfish"]) - >>> df - a b c - tuna 1 2 3 - salmon 1 5 6 - catfish 2 5 8 - goldfish 2 6 9 - >>> df.groupby("a").pct_change() - b c - tuna NaN NaN - salmon 1.5 1.000 - catfish NaN NaN - goldfish 0.2 0.125 - """ - # GH#53491 - if fill_method not in (lib.no_default, None) or limit is not lib.no_default: - warnings.warn( - "The 'fill_method' keyword being not None and the 'limit' keyword in " - f"{type(self).__name__}.pct_change are deprecated and will be removed " - "in a future version. Either fill in any non-leading NA values prior " - "to calling pct_change or specify 'fill_method=None' to not fill NA " - "values.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if fill_method is lib.no_default: - if limit is lib.no_default and any( - grp.isna().values.any() for _, grp in self - ): - warnings.warn( - "The default fill_method='ffill' in " - f"{type(self).__name__}.pct_change is deprecated and will " - "be removed in a future version. 
Either fill in any " - "non-leading NA values prior to calling pct_change or " - "specify 'fill_method=None' to not fill NA values.", - FutureWarning, - stacklevel=find_stack_level(), - ) - fill_method = "ffill" - if limit is lib.no_default: - limit = None - - if axis is not lib.no_default: - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "pct_change") - else: - axis = 0 - - # TODO(GH#23918): Remove this conditional for SeriesGroupBy when - # GH#23918 is fixed - if freq is not None or axis != 0: - f = lambda x: x.pct_change( - periods=periods, - fill_method=fill_method, - limit=limit, - freq=freq, - axis=axis, - ) - return self._python_apply_general(f, self._selected_obj, is_transform=True) - - if fill_method is None: # GH30463 - fill_method = "ffill" - limit = 0 - filled = getattr(self, fill_method)(limit=limit) - if self.axis == 0: - fill_grp = filled.groupby(self.grouper.codes, group_keys=self.group_keys) - else: - fill_grp = filled.T.groupby(self.grouper.codes, group_keys=self.group_keys) - shifted = fill_grp.shift(periods=periods, freq=freq) - if self.axis == 1: - shifted = shifted.T - return (filled / shifted) - 1 - - @final - @Substitution(name="groupby") - @Substitution(see_also=_common_see_also) - def head(self, n: int = 5) -> NDFrameT: - """ - Return first n rows of each group. - - Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows - from the original DataFrame with original index and order preserved - (``as_index`` flag is ignored). - - Parameters - ---------- - n : int - If positive: number of entries to include from start of each group. - If negative: number of entries to exclude from end of each group. - - Returns - ------- - Series or DataFrame - Subset of original Series or DataFrame as determined by n. - %(see_also)s - Examples - -------- - - >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], - ... columns=['A', 'B']) - >>> df.groupby('A').head(1) - A B - 0 1 2 - 2 5 6 - >>> df.groupby('A').head(-1) - A B - 0 1 2 - """ - mask = self._make_mask_from_positional_indexer(slice(None, n)) - return self._mask_selected_obj(mask) - - @final - @Substitution(name="groupby") - @Substitution(see_also=_common_see_also) - def tail(self, n: int = 5) -> NDFrameT: - """ - Return last n rows of each group. - - Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows - from the original DataFrame with original index and order preserved - (``as_index`` flag is ignored). - - Parameters - ---------- - n : int - If positive: number of entries to include from end of each group. - If negative: number of entries to exclude from start of each group. - - Returns - ------- - Series or DataFrame - Subset of original Series or DataFrame as determined by n. - %(see_also)s - Examples - -------- - - >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]], - ... columns=['A', 'B']) - >>> df.groupby('A').tail(1) - A B - 1 a 2 - 3 b 2 - >>> df.groupby('A').tail(-1) - A B - 1 a 2 - 3 b 2 - """ - if n: - mask = self._make_mask_from_positional_indexer(slice(-n, None)) - else: - mask = self._make_mask_from_positional_indexer([]) - - return self._mask_selected_obj(mask) - - @final - def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT: - """ - Return _selected_obj with mask applied to the correct axis. - - Parameters - ---------- - mask : np.ndarray[bool] - Boolean mask to apply. - - Returns - ------- - Series or DataFrame - Filtered _selected_obj. 
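# Sketch of the head()/tail() note above ("Similar to .apply(lambda x: x.head(n)),
# but ..."): groupby.head keeps the original index and row order, while the apply
# spelling nests the result under the group keys (and, on recent pandas, may emit the
# grouping-column deprecation warning mentioned at the end of this module).
import pandas as pd

df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
df.groupby("A").head(1)                       # rows 0 and 2, original RangeIndex kept
df.groupby("A").apply(lambda x: x.head(1))    # same rows, nested under the "A" group keys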
- """ - ids = self.grouper.group_info[0] - mask = mask & (ids != -1) - - if self.axis == 0: - return self._selected_obj[mask] - else: - return self._selected_obj.iloc[:, mask] - - @final - def _reindex_output( - self, - output: OutputFrameOrSeries, - fill_value: Scalar = np.nan, - qs: npt.NDArray[np.float64] | None = None, - ) -> OutputFrameOrSeries: - """ - If we have categorical groupers, then we might want to make sure that - we have a fully re-indexed output to the levels. This means expanding - the output space to accommodate all values in the cartesian product of - our groups, regardless of whether they were observed in the data or - not. This will expand the output space if there are missing groups. - - The method returns early without modifying the input if the number of - groupings is less than 2, self.observed == True or none of the groupers - are categorical. - - Parameters - ---------- - output : Series or DataFrame - Object resulting from grouping and applying an operation. - fill_value : scalar, default np.nan - Value to use for unobserved categories if self.observed is False. - qs : np.ndarray[float64] or None, default None - quantile values, only relevant for quantile. - - Returns - ------- - Series or DataFrame - Object (potentially) re-indexed to include all possible groups. - """ - groupings = self.grouper.groupings - if len(groupings) == 1: - return output - - # if we only care about the observed values - # we are done - elif self.observed: - return output - - # reindexing only applies to a Categorical grouper - elif not any( - isinstance(ping.grouping_vector, (Categorical, CategoricalIndex)) - for ping in groupings - ): - return output - - levels_list = [ping.group_index for ping in groupings] - names = self.grouper.names - if qs is not None: - # error: Argument 1 to "append" of "list" has incompatible type - # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index" - levels_list.append(qs) # type: ignore[arg-type] - names = names + [None] - index = MultiIndex.from_product(levels_list, names=names) - if self.sort: - index = index.sort_values() - - if self.as_index: - # Always holds for SeriesGroupBy unless GH#36507 is implemented - d = { - self.obj._get_axis_name(self.axis): index, - "copy": False, - "fill_value": fill_value, - } - return output.reindex(**d) # type: ignore[arg-type] - - # GH 13204 - # Here, the categorical in-axis groupers, which need to be fully - # expanded, are columns in `output`. An idea is to do: - # output = output.set_index(self.grouper.names) - # .reindex(index).reset_index() - # but special care has to be taken because of possible not-in-axis - # groupers. - # So, we manually select and drop the in-axis grouper columns, - # reindex `output`, and then reset the in-axis grouper columns. 
- - # Select in-axis groupers - in_axis_grps = [ - (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis - ] - if len(in_axis_grps) > 0: - g_nums, g_names = zip(*in_axis_grps) - output = output.drop(labels=list(g_names), axis=1) - - # Set a temp index and reindex (possibly expanding) - output = output.set_index(self.grouper.result_index).reindex( - index, copy=False, fill_value=fill_value - ) - - # Reset in-axis grouper columns - # (using level numbers `g_nums` because level names may not be unique) - if len(in_axis_grps) > 0: - output = output.reset_index(level=g_nums) - - return output.reset_index(drop=True) - - @final - def sample( - self, - n: int | None = None, - frac: float | None = None, - replace: bool = False, - weights: Sequence | Series | None = None, - random_state: RandomState | None = None, - ): - """ - Return a random sample of items from each group. - - You can use `random_state` for reproducibility. - - Parameters - ---------- - n : int, optional - Number of items to return for each group. Cannot be used with - `frac` and must be no larger than the smallest group unless - `replace` is True. Default is one if `frac` is None. - frac : float, optional - Fraction of items to return. Cannot be used with `n`. - replace : bool, default False - Allow or disallow sampling of the same row more than once. - weights : list-like, optional - Default None results in equal probability weighting. - If passed a list-like then values must have the same length as - the underlying DataFrame or Series object and will be used as - sampling probabilities after normalization within each group. - Values must be non-negative with at least one positive element - within each group. - random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional - If int, array-like, or BitGenerator, seed for random number generator. - If np.random.RandomState or np.random.Generator, use as given. - - .. versionchanged:: 1.4.0 - - np.random.Generator objects now accepted - - Returns - ------- - Series or DataFrame - A new object of same type as caller containing items randomly - sampled within each group from the caller object. - - See Also - -------- - DataFrame.sample: Generate random samples from a DataFrame object. - numpy.random.choice: Generate a random sample from a given 1-D numpy - array. - - Examples - -------- - >>> df = pd.DataFrame( - ... {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)} - ... ) - >>> df - a b - 0 red 0 - 1 red 1 - 2 blue 2 - 3 blue 3 - 4 black 4 - 5 black 5 - - Select one row at random for each distinct value in column a. The - `random_state` argument can be used to guarantee reproducibility: - - >>> df.groupby("a").sample(n=1, random_state=1) - a b - 4 black 4 - 2 blue 2 - 1 red 1 - - Set `frac` to sample fixed proportions rather than counts: - - >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2) - 5 5 - 2 2 - 0 0 - Name: b, dtype: int64 - - Control sample probabilities within groups by setting weights: - - >>> df.groupby("a").sample( - ... n=1, - ... weights=[1, 1, 1, 0, 0, 1], - ... random_state=1, - ... 
) - a b - 5 black 5 - 2 blue 2 - 0 red 0 - """ # noqa: E501 - if self._selected_obj.empty: - # GH48459 prevent ValueError when object is empty - return self._selected_obj - size = sample.process_sampling_size(n, frac, replace) - if weights is not None: - weights_arr = sample.preprocess_weights( - self._selected_obj, weights, axis=self.axis - ) - - random_state = com.random_state(random_state) - - group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis) - - sampled_indices = [] - for labels, obj in group_iterator: - grp_indices = self.indices[labels] - group_size = len(grp_indices) - if size is not None: - sample_size = size - else: - assert frac is not None - sample_size = round(frac * group_size) - - grp_sample = sample.sample( - group_size, - size=sample_size, - replace=replace, - weights=None if weights is None else weights_arr[grp_indices], - random_state=random_state, - ) - sampled_indices.append(grp_indices[grp_sample]) - - sampled_indices = np.concatenate(sampled_indices) - return self._selected_obj.take(sampled_indices, axis=self.axis) - - def _idxmax_idxmin( - self, - how: Literal["idxmax", "idxmin"], - ignore_unobserved: bool = False, - axis: Axis | None | lib.NoDefault = lib.no_default, - skipna: bool = True, - numeric_only: bool = False, - ) -> NDFrameT: - """Compute idxmax/idxmin. - - Parameters - ---------- - how : {'idxmin', 'idxmax'} - Whether to compute idxmin or idxmax. - axis : {{0 or 'index', 1 or 'columns'}}, default None - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - If axis is not provided, grouper's axis is used. - numeric_only : bool, default False - Include only float, int, boolean columns. - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ignore_unobserved : bool, default False - When True and an unobserved group is encountered, do not raise. This used - for transform where unobserved groups do not play an impact on the result. - - Returns - ------- - Series or DataFrame - idxmax or idxmin for the groupby operation. - """ - if axis is not lib.no_default: - if axis is None: - axis = self.axis - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, how) - else: - axis = self.axis - - if not self.observed and any( - ping._passed_categorical for ping in self.grouper.groupings - ): - expected_len = np.prod( - [len(ping.group_index) for ping in self.grouper.groupings] - ) - if len(self.grouper.groupings) == 1: - result_len = len(self.grouper.groupings[0].grouping_vector.unique()) - else: - # result_index only contains observed groups in this case - result_len = len(self.grouper.result_index) - assert result_len <= expected_len - has_unobserved = result_len < expected_len - - raise_err: bool | np.bool_ = not ignore_unobserved and has_unobserved - # Only raise an error if there are columns to compute; otherwise we return - # an empty DataFrame with an index (possibly including unobserved) but no - # columns - data = self._obj_with_exclusions - if raise_err and isinstance(data, DataFrame): - if numeric_only: - data = data._get_numeric_data() - raise_err = len(data.columns) > 0 - - if raise_err: - raise ValueError( - f"Can't get {how} of an empty group due to unobserved categories. " - "Specify observed=True in groupby instead." - ) - elif not skipna: - if self._obj_with_exclusions.isna().any(axis=None): - warnings.warn( - f"The behavior of {type(self).__name__}.{how} with all-NA " - "values, or any-NA and skipna=False, is deprecated. 
In a future " - "version this will raise ValueError", - FutureWarning, - stacklevel=find_stack_level(), - ) - - if axis == 1: - try: - - def func(df): - method = getattr(df, how) - return method(axis=axis, skipna=skipna, numeric_only=numeric_only) - - func.__name__ = how - result = self._python_apply_general( - func, self._obj_with_exclusions, not_indexed_same=True - ) - except ValueError as err: - name = "argmax" if how == "idxmax" else "argmin" - if f"attempt to get {name} of an empty sequence" in str(err): - raise ValueError( - f"Can't get {how} of an empty group due to unobserved " - "categories. Specify observed=True in groupby instead." - ) from None - raise - return result - - result = self._agg_general( - numeric_only=numeric_only, - min_count=1, - alias=how, - skipna=skipna, - ) - return result - - def _wrap_idxmax_idxmin(self, res: NDFrameT) -> NDFrameT: - index = self.obj._get_axis(self.axis) - if res.size == 0: - result = res.astype(index.dtype) - else: - if isinstance(index, MultiIndex): - index = index.to_flat_index() - values = res._values - assert isinstance(values, np.ndarray) - na_value = na_value_for_dtype(index.dtype, compat=False) - if isinstance(res, Series): - # mypy: expression has type "Series", variable has type "NDFrameT" - result = res._constructor( # type: ignore[assignment] - index.array.take(values, allow_fill=True, fill_value=na_value), - index=res.index, - name=res.name, - ) - else: - data = {} - for k, column_values in enumerate(values.T): - data[k] = index.array.take( - column_values, allow_fill=True, fill_value=na_value - ) - result = self.obj._constructor(data, index=res.index) - result.columns = res.columns - return result - - -@doc(GroupBy) -def get_groupby( - obj: NDFrame, - by: _KeysArgType | None = None, - axis: AxisInt = 0, - grouper: ops.BaseGrouper | None = None, - group_keys: bool = True, -) -> GroupBy: - klass: type[GroupBy] - if isinstance(obj, Series): - from pandas.core.groupby.generic import SeriesGroupBy - - klass = SeriesGroupBy - elif isinstance(obj, DataFrame): - from pandas.core.groupby.generic import DataFrameGroupBy - - klass = DataFrameGroupBy - else: # pragma: no cover - raise TypeError(f"invalid type: {obj}") - - return klass( - obj=obj, - keys=by, - axis=axis, - grouper=grouper, - group_keys=group_keys, - ) - - -def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiIndex: - """ - Insert the sequence 'qs' of quantiles as the inner-most level of a MultiIndex. - - The quantile level in the MultiIndex is a repeated copy of 'qs'. - - Parameters - ---------- - idx : Index - qs : np.ndarray[float64] - - Returns - ------- - MultiIndex - """ - nqs = len(qs) - lev_codes, lev = Index(qs).factorize() - lev_codes = coerce_indexer_dtype(lev_codes, lev) - - if idx._is_multi: - idx = cast(MultiIndex, idx) - levels = list(idx.levels) + [lev] - codes = [np.repeat(x, nqs) for x in idx.codes] + [np.tile(lev_codes, len(idx))] - mi = MultiIndex(levels=levels, codes=codes, names=idx.names + [None]) - else: - nidx = len(idx) - idx_codes = coerce_indexer_dtype(np.arange(nidx), idx) - levels = [idx, lev] - codes = [np.repeat(idx_codes, nqs), np.tile(lev_codes, nidx)] - mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None]) +from pandas._core.groupby import groupby +from pandas.core.common import _depr_core - return mi +_depr_core() +_globals = globals() -# GH#7155 -_apply_groupings_depr = ( - "{}.{} operated on the grouping columns. 
This behavior is deprecated, " - "and in a future version of pandas the grouping columns will be excluded " - "from the operation. Either pass `include_groups=False` to exclude the " - "groupings or explicitly select the grouping columns after groupby to silence " - "this warning." -) +for item in groupby.__dir__(): + _globals[item] = getattr(groupby, item) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index fd0479e17d2bd..715d83fed8239 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -1,1072 +1,11 @@ -""" -Provide user facing operators for doing the split part of the -split-apply-combine paradigm. -""" from __future__ import annotations -from typing import ( - TYPE_CHECKING, - final, -) -import warnings +from pandas._core.groupby import grouper +from pandas.core.common import _depr_core -import numpy as np +_depr_core() -from pandas._config import ( - using_copy_on_write, - warn_copy_on_write, -) +_globals = globals() -from pandas._libs import lib -from pandas._libs.tslibs import OutOfBoundsDatetime -from pandas.errors import InvalidIndexError -from pandas.util._decorators import cache_readonly -from pandas.util._exceptions import find_stack_level - -from pandas.core.dtypes.common import ( - is_list_like, - is_scalar, -) -from pandas.core.dtypes.dtypes import CategoricalDtype - -from pandas.core import algorithms -from pandas.core.arrays import ( - Categorical, - ExtensionArray, -) -import pandas.core.common as com -from pandas.core.frame import DataFrame -from pandas.core.groupby import ops -from pandas.core.groupby.categorical import recode_for_groupby -from pandas.core.indexes.api import ( - CategoricalIndex, - Index, - MultiIndex, -) -from pandas.core.series import Series - -from pandas.io.formats.printing import pprint_thing - -if TYPE_CHECKING: - from collections.abc import ( - Hashable, - Iterator, - ) - - from pandas._typing import ( - ArrayLike, - Axis, - NDFrameT, - npt, - ) - - from pandas.core.generic import NDFrame - - -class Grouper: - """ - A Grouper allows the user to specify a groupby instruction for an object. - - This specification will select a column via the key parameter, or if the - level and/or axis parameters are given, a level of the index of the target - object. - - If `axis` and/or `level` are passed as keywords to both `Grouper` and - `groupby`, the values passed to `Grouper` take precedence. - - Parameters - ---------- - key : str, defaults to None - Groupby key, which selects the grouping column of the target. - level : name/number, defaults to None - The level for the target index. - freq : str / frequency object, defaults to None - This will groupby the specified frequency if the target selection - (via key or level) is a datetime-like object. For full specification - of available frequencies, please see `here - `_. - axis : str, int, defaults to 0 - Number/name of the axis. - sort : bool, default to False - Whether to sort the resulting labels. - closed : {'left' or 'right'} - Closed end of interval. Only when `freq` parameter is passed. - label : {'left' or 'right'} - Interval boundary to use for labeling. - Only when `freq` parameter is passed. - convention : {'start', 'end', 'e', 's'} - If grouper is PeriodIndex and `freq` parameter is passed. - - origin : Timestamp or str, default 'start_day' - The timestamp on which to adjust the grouping. The timezone of origin must - match the timezone of the index. 
- If string, must be one of the following: - - - 'epoch': `origin` is 1970-01-01 - - 'start': `origin` is the first value of the timeseries - - 'start_day': `origin` is the first day at midnight of the timeseries - - - 'end': `origin` is the last value of the timeseries - - 'end_day': `origin` is the ceiling midnight of the last day - - .. versionadded:: 1.3.0 - - offset : Timedelta or str, default is None - An offset timedelta added to the origin. - - dropna : bool, default True - If True, and if group keys contain NA values, NA values together with - row/column will be dropped. If False, NA values will also be treated as - the key in groups. - - .. versionadded:: 1.2.0 - - Returns - ------- - Grouper or pandas.api.typing.TimeGrouper - A TimeGrouper is returned if ``freq`` is not ``None``. Otherwise, a Grouper - is returned. - - Examples - -------- - ``df.groupby(pd.Grouper(key="Animal"))`` is equivalent to ``df.groupby('Animal')`` - - >>> df = pd.DataFrame( - ... { - ... "Animal": ["Falcon", "Parrot", "Falcon", "Falcon", "Parrot"], - ... "Speed": [100, 5, 200, 300, 15], - ... } - ... ) - >>> df - Animal Speed - 0 Falcon 100 - 1 Parrot 5 - 2 Falcon 200 - 3 Falcon 300 - 4 Parrot 15 - >>> df.groupby(pd.Grouper(key="Animal")).mean() - Speed - Animal - Falcon 200.0 - Parrot 10.0 - - Specify a resample operation on the column 'Publish date' - - >>> df = pd.DataFrame( - ... { - ... "Publish date": [ - ... pd.Timestamp("2000-01-02"), - ... pd.Timestamp("2000-01-02"), - ... pd.Timestamp("2000-01-09"), - ... pd.Timestamp("2000-01-16") - ... ], - ... "ID": [0, 1, 2, 3], - ... "Price": [10, 20, 30, 40] - ... } - ... ) - >>> df - Publish date ID Price - 0 2000-01-02 0 10 - 1 2000-01-02 1 20 - 2 2000-01-09 2 30 - 3 2000-01-16 3 40 - >>> df.groupby(pd.Grouper(key="Publish date", freq="1W")).mean() - ID Price - Publish date - 2000-01-02 0.5 15.0 - 2000-01-09 2.0 30.0 - 2000-01-16 3.0 40.0 - - If you want to adjust the start of the bins based on a fixed timestamp: - - >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' - >>> rng = pd.date_range(start, end, freq='7min') - >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng) - >>> ts - 2000-10-01 23:30:00 0 - 2000-10-01 23:37:00 3 - 2000-10-01 23:44:00 6 - 2000-10-01 23:51:00 9 - 2000-10-01 23:58:00 12 - 2000-10-02 00:05:00 15 - 2000-10-02 00:12:00 18 - 2000-10-02 00:19:00 21 - 2000-10-02 00:26:00 24 - Freq: 7min, dtype: int64 - - >>> ts.groupby(pd.Grouper(freq='17min')).sum() - 2000-10-01 23:14:00 0 - 2000-10-01 23:31:00 9 - 2000-10-01 23:48:00 21 - 2000-10-02 00:05:00 54 - 2000-10-02 00:22:00 24 - Freq: 17min, dtype: int64 - - >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum() - 2000-10-01 23:18:00 0 - 2000-10-01 23:35:00 18 - 2000-10-01 23:52:00 27 - 2000-10-02 00:09:00 39 - 2000-10-02 00:26:00 24 - Freq: 17min, dtype: int64 - - >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum() - 2000-10-01 23:24:00 3 - 2000-10-01 23:41:00 15 - 2000-10-01 23:58:00 45 - 2000-10-02 00:15:00 45 - Freq: 17min, dtype: int64 - - If you want to adjust the start of the bins with an `offset` Timedelta, the two - following lines are equivalent: - - >>> ts.groupby(pd.Grouper(freq='17min', origin='start')).sum() - 2000-10-01 23:30:00 9 - 2000-10-01 23:47:00 21 - 2000-10-02 00:04:00 54 - 2000-10-02 00:21:00 24 - Freq: 17min, dtype: int64 - - >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum() - 2000-10-01 23:30:00 9 - 2000-10-01 23:47:00 21 - 2000-10-02 00:04:00 54 - 2000-10-02 00:21:00 24 - Freq: 17min, dtype: int64 - - To 
replace the use of the deprecated `base` argument, you can now use `offset`, - in this example it is equivalent to have `base=2`: - - >>> ts.groupby(pd.Grouper(freq='17min', offset='2min')).sum() - 2000-10-01 23:16:00 0 - 2000-10-01 23:33:00 9 - 2000-10-01 23:50:00 36 - 2000-10-02 00:07:00 39 - 2000-10-02 00:24:00 24 - Freq: 17min, dtype: int64 - """ - - sort: bool - dropna: bool - _gpr_index: Index | None - _grouper: Index | None - - _attributes: tuple[str, ...] = ("key", "level", "freq", "axis", "sort", "dropna") - - def __new__(cls, *args, **kwargs): - if kwargs.get("freq") is not None: - from pandas.core.resample import TimeGrouper - - cls = TimeGrouper - return super().__new__(cls) - - def __init__( - self, - key=None, - level=None, - freq=None, - axis: Axis | lib.NoDefault = lib.no_default, - sort: bool = False, - dropna: bool = True, - ) -> None: - if type(self) is Grouper: - # i.e. not TimeGrouper - if axis is not lib.no_default: - warnings.warn( - "Grouper axis keyword is deprecated and will be removed in a " - "future version. To group on axis=1, use obj.T.groupby(...) " - "instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - axis = 0 - if axis is lib.no_default: - axis = 0 - - self.key = key - self.level = level - self.freq = freq - self.axis = axis - self.sort = sort - self.dropna = dropna - - self._grouper_deprecated = None - self._indexer_deprecated: npt.NDArray[np.intp] | None = None - self._obj_deprecated = None - self._gpr_index = None - self.binner = None - self._grouper = None - self._indexer: npt.NDArray[np.intp] | None = None - - def _get_grouper( - self, obj: NDFrameT, validate: bool = True - ) -> tuple[ops.BaseGrouper, NDFrameT]: - """ - Parameters - ---------- - obj : Series or DataFrame - validate : bool, default True - if True, validate the grouper - - Returns - ------- - a tuple of grouper, obj (possibly sorted) - """ - obj, _, _ = self._set_grouper(obj) - grouper, _, obj = get_grouper( - obj, - [self.key], - axis=self.axis, - level=self.level, - sort=self.sort, - validate=validate, - dropna=self.dropna, - ) - # Without setting this, subsequent lookups to .groups raise - # error: Incompatible types in assignment (expression has type "BaseGrouper", - # variable has type "None") - self._grouper_deprecated = grouper # type: ignore[assignment] - - return grouper, obj - - @final - def _set_grouper( - self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None - ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]: - """ - given an object and the specifications, setup the internal grouper - for this particular specification - - Parameters - ---------- - obj : Series or DataFrame - sort : bool, default False - whether the resulting grouper should be sorted - gpr_index : Index or None, default None - - Returns - ------- - NDFrame - Index - np.ndarray[np.intp] | None - """ - assert obj is not None - - if self.key is not None and self.level is not None: - raise ValueError("The Grouper cannot specify both a key and a level!") - - # Keep self._grouper value before overriding - if self._grouper is None: - # TODO: What are we assuming about subsequent calls? - self._grouper = gpr_index - self._indexer = self._indexer_deprecated - - # the key must be a valid info item - if self.key is not None: - key = self.key - # The 'on' is already defined - if getattr(gpr_index, "name", None) == key and isinstance(obj, Series): - # Sometimes self._grouper will have been resorted while - # obj has not. 
In this case there is a mismatch when we - # call self._grouper.take(obj.index) so we need to undo the sorting - # before we call _grouper.take. - assert self._grouper is not None - if self._indexer is not None: - reverse_indexer = self._indexer.argsort() - unsorted_ax = self._grouper.take(reverse_indexer) - ax = unsorted_ax.take(obj.index) - else: - ax = self._grouper.take(obj.index) - else: - if key not in obj._info_axis: - raise KeyError(f"The grouper name {key} is not found") - ax = Index(obj[key], name=key) - - else: - ax = obj._get_axis(self.axis) - if self.level is not None: - level = self.level - - # if a level is given it must be a mi level or - # equivalent to the axis name - if isinstance(ax, MultiIndex): - level = ax._get_level_number(level) - ax = Index(ax._get_level_values(level), name=ax.names[level]) - - else: - if level not in (0, ax.name): - raise ValueError(f"The level {level} is not valid") - - # possibly sort - indexer: npt.NDArray[np.intp] | None = None - if (self.sort or sort) and not ax.is_monotonic_increasing: - # use stable sort to support first, last, nth - # TODO: why does putting na_position="first" fix datetimelike cases? - indexer = self._indexer_deprecated = ax.array.argsort( - kind="mergesort", na_position="first" - ) - ax = ax.take(indexer) - obj = obj.take(indexer, axis=self.axis) - - # error: Incompatible types in assignment (expression has type - # "NDFrameT", variable has type "None") - self._obj_deprecated = obj # type: ignore[assignment] - self._gpr_index = ax - return obj, ax, indexer - - @final - @property - def ax(self) -> Index: - warnings.warn( - f"{type(self).__name__}.ax is deprecated and will be removed in a " - "future version. Use Resampler.ax instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - index = self._gpr_index - if index is None: - raise ValueError("_set_grouper must be called before ax is accessed") - return index - - @final - @property - def indexer(self): - warnings.warn( - f"{type(self).__name__}.indexer is deprecated and will be removed " - "in a future version. Use Resampler.indexer instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self._indexer_deprecated - - @final - @property - def obj(self): - # TODO(3.0): enforcing these deprecations on Grouper should close - # GH#25564, GH#41930 - warnings.warn( - f"{type(self).__name__}.obj is deprecated and will be removed " - "in a future version. Use GroupBy.indexer instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self._obj_deprecated - - @final - @property - def grouper(self): - warnings.warn( - f"{type(self).__name__}.grouper is deprecated and will be removed " - "in a future version. Use GroupBy.grouper instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self._grouper_deprecated - - @final - @property - def groups(self): - warnings.warn( - f"{type(self).__name__}.groups is deprecated and will be removed " - "in a future version. 
Use GroupBy.groups instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - # error: "None" has no attribute "groups" - return self._grouper_deprecated.groups # type: ignore[attr-defined] - - @final - def __repr__(self) -> str: - attrs_list = ( - f"{attr_name}={repr(getattr(self, attr_name))}" - for attr_name in self._attributes - if getattr(self, attr_name) is not None - ) - attrs = ", ".join(attrs_list) - cls_name = type(self).__name__ - return f"{cls_name}({attrs})" - - -@final -class Grouping: - """ - Holds the grouping information for a single key - - Parameters - ---------- - index : Index - grouper : - obj : DataFrame or Series - name : Label - level : - observed : bool, default False - If we are a Categorical, use the observed values - in_axis : if the Grouping is a column in self.obj and hence among - Groupby.exclusions list - dropna : bool, default True - Whether to drop NA groups. - uniques : Array-like, optional - When specified, will be used for unique values. Enables including empty groups - in the result for a BinGrouper. Must not contain duplicates. - - Attributes - ------- - indices : dict - Mapping of {group -> index_list} - codes : ndarray - Group codes - group_index : Index or None - unique groups - groups : dict - Mapping of {group -> label_list} - """ - - _codes: npt.NDArray[np.signedinteger] | None = None - _group_index: Index | None = None - _all_grouper: Categorical | None - _orig_cats: Index | None - _index: Index - - def __init__( - self, - index: Index, - grouper=None, - obj: NDFrame | None = None, - level=None, - sort: bool = True, - observed: bool = False, - in_axis: bool = False, - dropna: bool = True, - uniques: ArrayLike | None = None, - ) -> None: - self.level = level - self._orig_grouper = grouper - grouping_vector = _convert_grouper(index, grouper) - self._all_grouper = None - self._orig_cats = None - self._index = index - self._sort = sort - self.obj = obj - self._observed = observed - self.in_axis = in_axis - self._dropna = dropna - self._uniques = uniques - - # we have a single grouper which may be a myriad of things, - # some of which are dependent on the passing in level - - ilevel = self._ilevel - if ilevel is not None: - # In extant tests, the new self.grouping_vector matches - # `index.get_level_values(ilevel)` whenever - # mapper is None and isinstance(index, MultiIndex) - if isinstance(index, MultiIndex): - index_level = index.get_level_values(ilevel) - else: - index_level = index - - if grouping_vector is None: - grouping_vector = index_level - else: - mapper = grouping_vector - grouping_vector = index_level.map(mapper) - - # a passed Grouper like, directly get the grouper in the same way - # as single grouper groupby, use the group_info to get codes - elif isinstance(grouping_vector, Grouper): - # get the new grouper; we already have disambiguated - # what key/level refer to exactly, don't need to - # check again as we have by this point converted these - # to an actual value (rather than a pd.Grouper) - assert self.obj is not None # for mypy - newgrouper, newobj = grouping_vector._get_grouper(self.obj, validate=False) - self.obj = newobj - - if isinstance(newgrouper, ops.BinGrouper): - # TODO: can we unwrap this and get a tighter typing - # for self.grouping_vector? - grouping_vector = newgrouper - else: - # ops.BaseGrouper - # TODO: 2023-02-03 no test cases with len(newgrouper.groupings) > 1. - # If that were to occur, would we be throwing out information? 
- # error: Cannot determine type of "grouping_vector" [has-type] - ng = newgrouper.groupings[0].grouping_vector # type: ignore[has-type] - # use Index instead of ndarray so we can recover the name - grouping_vector = Index(ng, name=newgrouper.result_index.name) - - elif not isinstance( - grouping_vector, (Series, Index, ExtensionArray, np.ndarray) - ): - # no level passed - if getattr(grouping_vector, "ndim", 1) != 1: - t = str(type(grouping_vector)) - raise ValueError(f"Grouper for '{t}' not 1-dimensional") - - grouping_vector = index.map(grouping_vector) - - if not ( - hasattr(grouping_vector, "__len__") - and len(grouping_vector) == len(index) - ): - grper = pprint_thing(grouping_vector) - errmsg = ( - "Grouper result violates len(labels) == " - f"len(data)\nresult: {grper}" - ) - raise AssertionError(errmsg) - - if isinstance(grouping_vector, np.ndarray): - if grouping_vector.dtype.kind in "mM": - # if we have a date/time-like grouper, make sure that we have - # Timestamps like - # TODO 2022-10-08 we only have one test that gets here and - # values are already in nanoseconds in that case. - grouping_vector = Series(grouping_vector).to_numpy() - elif isinstance(getattr(grouping_vector, "dtype", None), CategoricalDtype): - # a passed Categorical - self._orig_cats = grouping_vector.categories - grouping_vector, self._all_grouper = recode_for_groupby( - grouping_vector, sort, observed - ) - - self.grouping_vector = grouping_vector - - def __repr__(self) -> str: - return f"Grouping({self.name})" - - def __iter__(self) -> Iterator: - return iter(self.indices) - - @cache_readonly - def _passed_categorical(self) -> bool: - dtype = getattr(self.grouping_vector, "dtype", None) - return isinstance(dtype, CategoricalDtype) - - @cache_readonly - def name(self) -> Hashable: - ilevel = self._ilevel - if ilevel is not None: - return self._index.names[ilevel] - - if isinstance(self._orig_grouper, (Index, Series)): - return self._orig_grouper.name - - elif isinstance(self.grouping_vector, ops.BaseGrouper): - return self.grouping_vector.result_index.name - - elif isinstance(self.grouping_vector, Index): - return self.grouping_vector.name - - # otherwise we have ndarray or ExtensionArray -> no name - return None - - @cache_readonly - def _ilevel(self) -> int | None: - """ - If necessary, converted index level name to index level position. - """ - level = self.level - if level is None: - return None - if not isinstance(level, int): - index = self._index - if level not in index.names: - raise AssertionError(f"Level {level} not in index") - return index.names.index(level) - return level - - @property - def ngroups(self) -> int: - return len(self.group_index) - - @cache_readonly - def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: - # we have a list of groupers - if isinstance(self.grouping_vector, ops.BaseGrouper): - return self.grouping_vector.indices - - values = Categorical(self.grouping_vector) - return values._reverse_indexer() - - @property - def codes(self) -> npt.NDArray[np.signedinteger]: - return self._codes_and_uniques[0] - - @cache_readonly - def group_arraylike(self) -> ArrayLike: - """ - Analogous to result_index, but holding an ArrayLike to ensure - we can retain ExtensionDtypes. 
- """ - if self._all_grouper is not None: - # retain dtype for categories, including unobserved ones - return self.result_index._values - - elif self._passed_categorical: - return self.group_index._values - - return self._codes_and_uniques[1] - - @cache_readonly - def result_index(self) -> Index: - # result_index retains dtype for categories, including unobserved ones, - # which group_index does not - if self._all_grouper is not None: - group_idx = self.group_index - assert isinstance(group_idx, CategoricalIndex) - cats = self._orig_cats - # set_categories is dynamically added - return group_idx.set_categories(cats) # type: ignore[attr-defined] - return self.group_index - - @cache_readonly - def group_index(self) -> Index: - codes, uniques = self._codes_and_uniques - if not self._dropna and self._passed_categorical: - assert isinstance(uniques, Categorical) - if self._sort and (codes == len(uniques)).any(): - # Add NA value on the end when sorting - uniques = Categorical.from_codes( - np.append(uniques.codes, [-1]), uniques.categories, validate=False - ) - elif len(codes) > 0: - # Need to determine proper placement of NA value when not sorting - cat = self.grouping_vector - na_idx = (cat.codes < 0).argmax() - if cat.codes[na_idx] < 0: - # count number of unique codes that comes before the nan value - na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx]) - new_codes = np.insert(uniques.codes, na_unique_idx, -1) - uniques = Categorical.from_codes( - new_codes, uniques.categories, validate=False - ) - return Index._with_infer(uniques, name=self.name) - - @cache_readonly - def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: - uniques: ArrayLike - if self._passed_categorical: - # we make a CategoricalIndex out of the cat grouper - # preserving the categories / ordered attributes; - # doesn't (yet - GH#46909) handle dropna=False - cat = self.grouping_vector - categories = cat.categories - - if self._observed: - ucodes = algorithms.unique1d(cat.codes) - ucodes = ucodes[ucodes != -1] - if self._sort: - ucodes = np.sort(ucodes) - else: - ucodes = np.arange(len(categories)) - - uniques = Categorical.from_codes( - codes=ucodes, categories=categories, ordered=cat.ordered, validate=False - ) - - codes = cat.codes - if not self._dropna: - na_mask = codes < 0 - if np.any(na_mask): - if self._sort: - # Replace NA codes with `largest code + 1` - na_code = len(categories) - codes = np.where(na_mask, na_code, codes) - else: - # Insert NA code into the codes based on first appearance - # A negative code must exist, no need to check codes[na_idx] < 0 - na_idx = na_mask.argmax() - # count number of unique codes that comes before the nan value - na_code = algorithms.nunique_ints(codes[:na_idx]) - codes = np.where(codes >= na_code, codes + 1, codes) - codes = np.where(na_mask, na_code, codes) - - if not self._observed: - uniques = uniques.reorder_categories(self._orig_cats) - - return codes, uniques - - elif isinstance(self.grouping_vector, ops.BaseGrouper): - # we have a list of groupers - codes = self.grouping_vector.codes_info - uniques = self.grouping_vector.result_index._values - elif self._uniques is not None: - # GH#50486 Code grouping_vector using _uniques; allows - # including uniques that are not present in grouping_vector. 
- cat = Categorical(self.grouping_vector, categories=self._uniques) - codes = cat.codes - uniques = self._uniques - else: - # GH35667, replace dropna=False with use_na_sentinel=False - # error: Incompatible types in assignment (expression has type "Union[ - # ndarray[Any, Any], Index]", variable has type "Categorical") - codes, uniques = algorithms.factorize( # type: ignore[assignment] - self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna - ) - return codes, uniques - - @cache_readonly - def groups(self) -> dict[Hashable, np.ndarray]: - cats = Categorical.from_codes(self.codes, self.group_index, validate=False) - return self._index.groupby(cats) - - -def get_grouper( - obj: NDFrameT, - key=None, - axis: Axis = 0, - level=None, - sort: bool = True, - observed: bool = False, - validate: bool = True, - dropna: bool = True, -) -> tuple[ops.BaseGrouper, frozenset[Hashable], NDFrameT]: - """ - Create and return a BaseGrouper, which is an internal - mapping of how to create the grouper indexers. - This may be composed of multiple Grouping objects, indicating - multiple groupers - - Groupers are ultimately index mappings. They can originate as: - index mappings, keys to columns, functions, or Groupers - - Groupers enable local references to axis,level,sort, while - the passed in axis, level, and sort are 'global'. - - This routine tries to figure out what the passing in references - are and then creates a Grouping for each one, combined into - a BaseGrouper. - - If observed & we have a categorical grouper, only show the observed - values. - - If validate, then check for key/level overlaps. - - """ - group_axis = obj._get_axis(axis) - - # validate that the passed single level is compatible with the passed - # axis of the object - if level is not None: - # TODO: These if-block and else-block are almost same. - # MultiIndex instance check is removable, but it seems that there are - # some processes only for non-MultiIndex in else-block, - # eg. `obj.index.name != level`. We have to consider carefully whether - # these are applicable for MultiIndex. Even if these are applicable, - # we need to check if it makes no side effect to subsequent processes - # on the outside of this condition. - # (GH 17621) - if isinstance(group_axis, MultiIndex): - if is_list_like(level) and len(level) == 1: - level = level[0] - - if key is None and is_scalar(level): - # Get the level values from group_axis - key = group_axis.get_level_values(level) - level = None - - else: - # allow level to be a length-one list-like object - # (e.g., level=[0]) - # GH 13901 - if is_list_like(level): - nlevels = len(level) - if nlevels == 1: - level = level[0] - elif nlevels == 0: - raise ValueError("No group keys passed!") - else: - raise ValueError("multiple levels only valid with MultiIndex") - - if isinstance(level, str): - if obj._get_axis(axis).name != level: - raise ValueError( - f"level name {level} is not the name " - f"of the {obj._get_axis_name(axis)}" - ) - elif level > 0 or level < -1: - raise ValueError("level > 0 or level < -1 only valid with MultiIndex") - - # NOTE: `group_axis` and `group_axis.get_level_values(level)` - # are same in this section. 
- level = None - key = group_axis - - # a passed-in Grouper, directly convert - if isinstance(key, Grouper): - grouper, obj = key._get_grouper(obj, validate=False) - if key.key is None: - return grouper, frozenset(), obj - else: - return grouper, frozenset({key.key}), obj - - # already have a BaseGrouper, just return it - elif isinstance(key, ops.BaseGrouper): - return key, frozenset(), obj - - if not isinstance(key, list): - keys = [key] - match_axis_length = False - else: - keys = key - match_axis_length = len(keys) == len(group_axis) - - # what are we after, exactly? - any_callable = any(callable(g) or isinstance(g, dict) for g in keys) - any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys) - any_arraylike = any( - isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys - ) - - # is this an index replacement? - if ( - not any_callable - and not any_arraylike - and not any_groupers - and match_axis_length - and level is None - ): - if isinstance(obj, DataFrame): - all_in_columns_index = all( - g in obj.columns or g in obj.index.names for g in keys - ) - else: - assert isinstance(obj, Series) - all_in_columns_index = all(g in obj.index.names for g in keys) - - if not all_in_columns_index: - keys = [com.asarray_tuplesafe(keys)] - - if isinstance(level, (tuple, list)): - if key is None: - keys = [None] * len(level) - levels = level - else: - levels = [level] * len(keys) - - groupings: list[Grouping] = [] - exclusions: set[Hashable] = set() - - # if the actual grouper should be obj[key] - def is_in_axis(key) -> bool: - if not _is_label_like(key): - if obj.ndim == 1: - return False - - # items -> .columns for DataFrame, .index for Series - items = obj.axes[-1] - try: - items.get_loc(key) - except (KeyError, TypeError, InvalidIndexError): - # TypeError shows up here if we pass e.g. an Index - return False - - return True - - # if the grouper is obj[name] - def is_in_obj(gpr) -> bool: - if not hasattr(gpr, "name"): - return False - if using_copy_on_write() or warn_copy_on_write(): - # For the CoW case, we check the references to determine if the - # series is part of the object - try: - obj_gpr_column = obj[gpr.name] - except (KeyError, IndexError, InvalidIndexError, OutOfBoundsDatetime): - return False - if isinstance(gpr, Series) and isinstance(obj_gpr_column, Series): - return gpr._mgr.references_same_values( # type: ignore[union-attr] - obj_gpr_column._mgr, 0 # type: ignore[arg-type] - ) - return False - try: - return gpr is obj[gpr.name] - except (KeyError, IndexError, InvalidIndexError, OutOfBoundsDatetime): - # IndexError reached in e.g. test_skip_group_keys when we pass - # lambda here - # InvalidIndexError raised on key-types inappropriate for index, - # e.g. 
DatetimeIndex.get_loc(tuple()) - # OutOfBoundsDatetime raised when obj is a Series with DatetimeIndex - # and gpr.name is month str - return False - - for gpr, level in zip(keys, levels): - if is_in_obj(gpr): # df.groupby(df['name']) - in_axis = True - exclusions.add(gpr.name) - - elif is_in_axis(gpr): # df.groupby('name') - if obj.ndim != 1 and gpr in obj: - if validate: - obj._check_label_or_level_ambiguity(gpr, axis=axis) - in_axis, name, gpr = True, gpr, obj[gpr] - if gpr.ndim != 1: - # non-unique columns; raise here to get the name in the - # exception message - raise ValueError(f"Grouper for '{name}' not 1-dimensional") - exclusions.add(name) - elif obj._is_level_reference(gpr, axis=axis): - in_axis, level, gpr = False, gpr, None - else: - raise KeyError(gpr) - elif isinstance(gpr, Grouper) and gpr.key is not None: - # Add key to exclusions - exclusions.add(gpr.key) - in_axis = True - else: - in_axis = False - - # create the Grouping - # allow us to passing the actual Grouping as the gpr - ping = ( - Grouping( - group_axis, - gpr, - obj=obj, - level=level, - sort=sort, - observed=observed, - in_axis=in_axis, - dropna=dropna, - ) - if not isinstance(gpr, Grouping) - else gpr - ) - - groupings.append(ping) - - if len(groupings) == 0 and len(obj): - raise ValueError("No group keys passed!") - if len(groupings) == 0: - groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) - - # create the internals grouper - grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna) - return grouper, frozenset(exclusions), obj - - -def _is_label_like(val) -> bool: - return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val)) - - -def _convert_grouper(axis: Index, grouper): - if isinstance(grouper, dict): - return grouper.get - elif isinstance(grouper, Series): - if grouper.index.equals(axis): - return grouper._values - else: - return grouper.reindex(axis)._values - elif isinstance(grouper, MultiIndex): - return grouper._values - elif isinstance(grouper, (list, tuple, Index, Categorical, np.ndarray)): - if len(grouper) != len(axis): - raise ValueError("Grouper and axis must be same length") - - if isinstance(grouper, (list, tuple)): - grouper = com.asarray_tuplesafe(grouper) - return grouper - else: - return grouper +for item in grouper.__dir__(): + _globals[item] = getattr(grouper, item) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index a3c5ab8edc94e..a49aea4ff09ab 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -1,304 +1,11 @@ from __future__ import annotations -from collections.abc import Iterable -from typing import ( - TYPE_CHECKING, - Literal, - cast, -) +from pandas._core.groupby import indexing +from pandas.core.common import _depr_core -import numpy as np +_depr_core() -from pandas.util._decorators import ( - cache_readonly, - doc, -) +_globals = globals() -from pandas.core.dtypes.common import ( - is_integer, - is_list_like, -) - -if TYPE_CHECKING: - from pandas._typing import PositionalIndexer - - from pandas import ( - DataFrame, - Series, - ) - from pandas.core.groupby import groupby - - -class GroupByIndexingMixin: - """ - Mixin for adding ._positional_selector to GroupBy. - """ - - @cache_readonly - def _positional_selector(self) -> GroupByPositionalSelector: - """ - Return positional selection for each group. 
- - ``groupby._positional_selector[i:j]`` is similar to - ``groupby.apply(lambda x: x.iloc[i:j])`` - but much faster and preserves the original index and order. - - ``_positional_selector[]`` is compatible with and extends :meth:`~GroupBy.head` - and :meth:`~GroupBy.tail`. For example: - - - ``head(5)`` - - ``_positional_selector[5:-5]`` - - ``tail(5)`` - - together return all the rows. - - Allowed inputs for the index are: - - - An integer valued iterable, e.g. ``range(2, 4)``. - - A comma separated list of integers and slices, e.g. ``5``, ``2, 4``, ``2:4``. - - The output format is the same as :meth:`~GroupBy.head` and - :meth:`~GroupBy.tail`, namely - a subset of the ``DataFrame`` or ``Series`` with the index and order preserved. - - Returns - ------- - Series - The filtered subset of the original Series. - DataFrame - The filtered subset of the original DataFrame. - - See Also - -------- - DataFrame.iloc : Purely integer-location based indexing for selection by - position. - GroupBy.head : Return first n rows of each group. - GroupBy.tail : Return last n rows of each group. - GroupBy.nth : Take the nth row from each group if n is an int, or a - subset of rows, if n is a list of ints. - - Notes - ----- - - The slice step cannot be negative. - - If the index specification results in overlaps, the item is not duplicated. - - If the index specification changes the order of items, then - they are returned in their original order. - By contrast, ``DataFrame.iloc`` can change the row order. - - ``groupby()`` parameters such as as_index and dropna are ignored. - - The differences between ``_positional_selector[]`` and :meth:`~GroupBy.nth` - with ``as_index=False`` are: - - - Input to ``_positional_selector`` can include - one or more slices whereas ``nth`` - just handles an integer or a list of integers. - - ``_positional_selector`` can accept a slice relative to the - last row of each group. - - ``_positional_selector`` does not have an equivalent to the - ``nth()`` ``dropna`` parameter. - - Examples - -------- - >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], - ... columns=["A", "B"]) - >>> df.groupby("A")._positional_selector[1:2] - A B - 1 a 2 - 4 b 5 - - >>> df.groupby("A")._positional_selector[1, -1] - A B - 1 a 2 - 2 a 3 - 4 b 5 - """ - if TYPE_CHECKING: - # pylint: disable-next=used-before-assignment - groupby_self = cast(groupby.GroupBy, self) - else: - groupby_self = self - - return GroupByPositionalSelector(groupby_self) - - def _make_mask_from_positional_indexer( - self, - arg: PositionalIndexer | tuple, - ) -> np.ndarray: - if is_list_like(arg): - if all(is_integer(i) for i in cast(Iterable, arg)): - mask = self._make_mask_from_list(cast(Iterable[int], arg)) - else: - mask = self._make_mask_from_tuple(cast(tuple, arg)) - - elif isinstance(arg, slice): - mask = self._make_mask_from_slice(arg) - elif is_integer(arg): - mask = self._make_mask_from_int(cast(int, arg)) - else: - raise TypeError( - f"Invalid index {type(arg)}. 
" - "Must be integer, list-like, slice or a tuple of " - "integers and slices" - ) - - if isinstance(mask, bool): - if mask: - mask = self._ascending_count >= 0 - else: - mask = self._ascending_count < 0 - - return cast(np.ndarray, mask) - - def _make_mask_from_int(self, arg: int) -> np.ndarray: - if arg >= 0: - return self._ascending_count == arg - else: - return self._descending_count == (-arg - 1) - - def _make_mask_from_list(self, args: Iterable[int]) -> bool | np.ndarray: - positive = [arg for arg in args if arg >= 0] - negative = [-arg - 1 for arg in args if arg < 0] - - mask: bool | np.ndarray = False - - if positive: - mask |= np.isin(self._ascending_count, positive) - - if negative: - mask |= np.isin(self._descending_count, negative) - - return mask - - def _make_mask_from_tuple(self, args: tuple) -> bool | np.ndarray: - mask: bool | np.ndarray = False - - for arg in args: - if is_integer(arg): - mask |= self._make_mask_from_int(cast(int, arg)) - elif isinstance(arg, slice): - mask |= self._make_mask_from_slice(arg) - else: - raise ValueError( - f"Invalid argument {type(arg)}. Should be int or slice." - ) - - return mask - - def _make_mask_from_slice(self, arg: slice) -> bool | np.ndarray: - start = arg.start - stop = arg.stop - step = arg.step - - if step is not None and step < 0: - raise ValueError(f"Invalid step {step}. Must be non-negative") - - mask: bool | np.ndarray = True - - if step is None: - step = 1 - - if start is None: - if step > 1: - mask &= self._ascending_count % step == 0 - - elif start >= 0: - mask &= self._ascending_count >= start - - if step > 1: - mask &= (self._ascending_count - start) % step == 0 - - else: - mask &= self._descending_count < -start - - offset_array = self._descending_count + start + 1 - limit_array = ( - self._ascending_count + self._descending_count + (start + 1) - ) < 0 - offset_array = np.where(limit_array, self._ascending_count, offset_array) - - mask &= offset_array % step == 0 - - if stop is not None: - if stop >= 0: - mask &= self._ascending_count < stop - else: - mask &= self._descending_count >= -stop - - return mask - - @cache_readonly - def _ascending_count(self) -> np.ndarray: - if TYPE_CHECKING: - groupby_self = cast(groupby.GroupBy, self) - else: - groupby_self = self - - return groupby_self._cumcount_array() - - @cache_readonly - def _descending_count(self) -> np.ndarray: - if TYPE_CHECKING: - groupby_self = cast(groupby.GroupBy, self) - else: - groupby_self = self - - return groupby_self._cumcount_array(ascending=False) - - -@doc(GroupByIndexingMixin._positional_selector) -class GroupByPositionalSelector: - def __init__(self, groupby_object: groupby.GroupBy) -> None: - self.groupby_object = groupby_object - - def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series: - """ - Select by positional index per group. - - Implements GroupBy._positional_selector - - Parameters - ---------- - arg : PositionalIndexer | tuple - Allowed values are: - - int - - int valued iterable such as list or range - - slice with step either None or positive - - tuple of integers and slices - - Returns - ------- - Series - The filtered subset of the original groupby Series. - DataFrame - The filtered subset of the original groupby DataFrame. - - See Also - -------- - DataFrame.iloc : Integer-location based indexing for selection by position. - GroupBy.head : Return first n rows of each group. - GroupBy.tail : Return last n rows of each group. - GroupBy._positional_selector : Return positional selection for each group. 
- GroupBy.nth : Take the nth row from each group if n is an int, or a - subset of rows, if n is a list of ints. - """ - mask = self.groupby_object._make_mask_from_positional_indexer(arg) - return self.groupby_object._mask_selected_obj(mask) - - -class GroupByNthSelector: - """ - Dynamically substituted for GroupBy.nth to enable both call and index - """ - - def __init__(self, groupby_object: groupby.GroupBy) -> None: - self.groupby_object = groupby_object - - def __call__( - self, - n: PositionalIndexer | tuple, - dropna: Literal["any", "all", None] = None, - ) -> DataFrame | Series: - return self.groupby_object._nth(n, dropna) - - def __getitem__(self, n: PositionalIndexer | tuple) -> DataFrame | Series: - return self.groupby_object._nth(n) +for item in indexing.__dir__(): + _globals[item] = getattr(indexing, item) diff --git a/pandas/core/groupby/numba_.py b/pandas/core/groupby/numba_.py index 3b7a58e87603e..f044f41b07490 100644 --- a/pandas/core/groupby/numba_.py +++ b/pandas/core/groupby/numba_.py @@ -1,181 +1,11 @@ -"""Common utilities for Numba operations with groupby ops""" from __future__ import annotations -import functools -import inspect -from typing import ( - TYPE_CHECKING, - Any, - Callable, -) +from pandas._core.groupby import numba_ +from pandas.core.common import _depr_core -import numpy as np +_depr_core() -from pandas.compat._optional import import_optional_dependency +_globals = globals() -from pandas.core.util.numba_ import ( - NumbaUtilError, - jit_user_function, -) - -if TYPE_CHECKING: - from pandas._typing import Scalar - - -def validate_udf(func: Callable) -> None: - """ - Validate user defined function for ops when using Numba with groupby ops. - - The first signature arguments should include: - - def f(values, index, ...): - ... - - Parameters - ---------- - func : function, default False - user defined function - - Returns - ------- - None - - Raises - ------ - NumbaUtilError - """ - if not callable(func): - raise NotImplementedError( - "Numba engine can only be used with a single function." - ) - udf_signature = list(inspect.signature(func).parameters.keys()) - expected_args = ["values", "index"] - min_number_args = len(expected_args) - if ( - len(udf_signature) < min_number_args - or udf_signature[:min_number_args] != expected_args - ): - raise NumbaUtilError( - f"The first {min_number_args} arguments to {func.__name__} must be " - f"{expected_args}" - ) - - -@functools.cache -def generate_numba_agg_func( - func: Callable[..., Scalar], - nopython: bool, - nogil: bool, - parallel: bool, -) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, Any], np.ndarray]: - """ - Generate a numba jitted agg function specified by values from engine_kwargs. - - 1. jit the user's function - 2. Return a groupby agg function with the jitted function inline - - Configurations specified in engine_kwargs apply to both the user's - function _AND_ the groupby evaluation loop. 
- - Parameters - ---------- - func : function - function to be applied to each group and will be JITed - nopython : bool - nopython to be passed into numba.jit - nogil : bool - nogil to be passed into numba.jit - parallel : bool - parallel to be passed into numba.jit - - Returns - ------- - Numba function - """ - numba_func = jit_user_function(func) - if TYPE_CHECKING: - import numba - else: - numba = import_optional_dependency("numba") - - @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def group_agg( - values: np.ndarray, - index: np.ndarray, - begin: np.ndarray, - end: np.ndarray, - num_columns: int, - *args: Any, - ) -> np.ndarray: - assert len(begin) == len(end) - num_groups = len(begin) - - result = np.empty((num_groups, num_columns)) - for i in numba.prange(num_groups): - group_index = index[begin[i] : end[i]] - for j in numba.prange(num_columns): - group = values[begin[i] : end[i], j] - result[i, j] = numba_func(group, group_index, *args) - return result - - return group_agg - - -@functools.cache -def generate_numba_transform_func( - func: Callable[..., np.ndarray], - nopython: bool, - nogil: bool, - parallel: bool, -) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, Any], np.ndarray]: - """ - Generate a numba jitted transform function specified by values from engine_kwargs. - - 1. jit the user's function - 2. Return a groupby transform function with the jitted function inline - - Configurations specified in engine_kwargs apply to both the user's - function _AND_ the groupby evaluation loop. - - Parameters - ---------- - func : function - function to be applied to each window and will be JITed - nopython : bool - nopython to be passed into numba.jit - nogil : bool - nogil to be passed into numba.jit - parallel : bool - parallel to be passed into numba.jit - - Returns - ------- - Numba function - """ - numba_func = jit_user_function(func) - if TYPE_CHECKING: - import numba - else: - numba = import_optional_dependency("numba") - - @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def group_transform( - values: np.ndarray, - index: np.ndarray, - begin: np.ndarray, - end: np.ndarray, - num_columns: int, - *args: Any, - ) -> np.ndarray: - assert len(begin) == len(end) - num_groups = len(begin) - - result = np.empty((len(values), num_columns)) - for i in numba.prange(num_groups): - group_index = index[begin[i] : end[i]] - for j in numba.prange(num_columns): - group = values[begin[i] : end[i], j] - result[begin[i] : end[i], j] = numba_func(group, group_index, *args) - return result - - return group_transform +for item in numba_.__dir__(): + _globals[item] = getattr(numba_, item) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 466bbac641077..e834a1484afba 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -1,1215 +1,11 @@ -""" -Provide classes to perform the groupby aggregate operations. - -These are not exposed to the user and provide implementations of the grouping -operations, primarily in cython. These classes (BaseGrouper and BinGrouper) -are contained *in* the SeriesGroupBy and DataFrameGroupBy objects. 
-""" from __future__ import annotations -import collections -import functools -from typing import ( - TYPE_CHECKING, - Callable, - Generic, - final, -) - -import numpy as np - -from pandas._libs import ( - NaT, - lib, -) -import pandas._libs.groupby as libgroupby -from pandas._typing import ( - ArrayLike, - AxisInt, - NDFrameT, - Shape, - npt, -) -from pandas.errors import AbstractMethodError -from pandas.util._decorators import cache_readonly - -from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.cast import ( - maybe_cast_pointwise_result, - maybe_downcast_to_dtype, -) -from pandas.core.dtypes.common import ( - ensure_float64, - ensure_int64, - ensure_platform_int, - ensure_uint64, - is_1d_only_ea_dtype, -) -from pandas.core.dtypes.missing import ( - isna, - maybe_fill, -) - -from pandas.core.frame import DataFrame -from pandas.core.groupby import grouper -from pandas.core.indexes.api import ( - CategoricalIndex, - Index, - MultiIndex, - ensure_index, -) -from pandas.core.series import Series -from pandas.core.sorting import ( - compress_group_index, - decons_obs_group_ids, - get_flattened_list, - get_group_index, - get_group_index_sorter, - get_indexer_dict, -) - -if TYPE_CHECKING: - from collections.abc import ( - Hashable, - Iterator, - Sequence, - ) - - from pandas.core.generic import NDFrame - - -def check_result_array(obj, dtype) -> None: - # Our operation is supposed to be an aggregation/reduction. If - # it returns an ndarray, this likely means an invalid operation has - # been passed. See test_apply_without_aggregation, test_agg_must_agg - if isinstance(obj, np.ndarray): - if dtype != object: - # If it is object dtype, the function can be a reduction/aggregation - # and still return an ndarray e.g. test_agg_over_numpy_arrays - raise ValueError("Must produce aggregated value") - - -def extract_result(res): - """ - Extract the result object, it might be a 0-dim ndarray - or a len-1 0-dim, or a scalar - """ - if hasattr(res, "_values"): - # Preserve EA - res = res._values - if res.ndim == 1 and len(res) == 1: - # see test_agg_lambda_with_timezone, test_resampler_grouper.py::test_apply - res = res[0] - return res - - -class WrappedCythonOp: - """ - Dispatch logic for functions defined in _libs.groupby - - Parameters - ---------- - kind: str - Whether the operation is an aggregate or transform. - how: str - Operation name, e.g. "mean". - has_dropped_na: bool - True precisely when dropna=True and the grouper contains a null value. - """ - - # Functions for which we do _not_ attempt to cast the cython result - # back to the original dtype. 
- cast_blocklist = frozenset( - ["any", "all", "rank", "count", "size", "idxmin", "idxmax"] - ) - - def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: - self.kind = kind - self.how = how - self.has_dropped_na = has_dropped_na - - _CYTHON_FUNCTIONS: dict[str, dict] = { - "aggregate": { - "any": functools.partial(libgroupby.group_any_all, val_test="any"), - "all": functools.partial(libgroupby.group_any_all, val_test="all"), - "sum": "group_sum", - "prod": "group_prod", - "idxmin": functools.partial(libgroupby.group_idxmin_idxmax, name="idxmin"), - "idxmax": functools.partial(libgroupby.group_idxmin_idxmax, name="idxmax"), - "min": "group_min", - "max": "group_max", - "mean": "group_mean", - "median": "group_median_float64", - "var": "group_var", - "std": functools.partial(libgroupby.group_var, name="std"), - "sem": functools.partial(libgroupby.group_var, name="sem"), - "skew": "group_skew", - "first": "group_nth", - "last": "group_last", - "ohlc": "group_ohlc", - }, - "transform": { - "cumprod": "group_cumprod", - "cumsum": "group_cumsum", - "cummin": "group_cummin", - "cummax": "group_cummax", - "rank": "group_rank", - }, - } - - _cython_arity = {"ohlc": 4} # OHLC - - @classmethod - def get_kind_from_how(cls, how: str) -> str: - if how in cls._CYTHON_FUNCTIONS["aggregate"]: - return "aggregate" - return "transform" - - # Note: we make this a classmethod and pass kind+how so that caching - # works at the class level and not the instance level - @classmethod - @functools.cache - def _get_cython_function( - cls, kind: str, how: str, dtype: np.dtype, is_numeric: bool - ): - dtype_str = dtype.name - ftype = cls._CYTHON_FUNCTIONS[kind][how] - - # see if there is a fused-type version of function - # only valid for numeric - if callable(ftype): - f = ftype - else: - f = getattr(libgroupby, ftype) - if is_numeric: - return f - elif dtype == np.dtype(object): - if how in ["median", "cumprod"]: - # no fused types -> no __signatures__ - raise NotImplementedError( - f"function is not implemented for this dtype: " - f"[how->{how},dtype->{dtype_str}]" - ) - elif how in ["std", "sem", "idxmin", "idxmax"]: - # We have a partial object that does not have __signatures__ - return f - elif how == "skew": - # _get_cython_vals will convert to float64 - pass - elif "object" not in f.__signatures__: - # raise NotImplementedError here rather than TypeError later - raise NotImplementedError( - f"function is not implemented for this dtype: " - f"[how->{how},dtype->{dtype_str}]" - ) - return f - else: - raise NotImplementedError( - "This should not be reached. Please report a bug at " - "github.com/pandas-dev/pandas/", - dtype, - ) - - def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: - """ - Cast numeric dtypes to float64 for functions that only support that. 
- - Parameters - ---------- - values : np.ndarray - - Returns - ------- - values : np.ndarray - """ - how = self.how - - if how in ["median", "std", "sem", "skew"]: - # median only has a float64 implementation - # We should only get here with is_numeric, as non-numeric cases - # should raise in _get_cython_function - values = ensure_float64(values) - - elif values.dtype.kind in "iu": - if how in ["var", "mean"] or ( - self.kind == "transform" and self.has_dropped_na - ): - # has_dropped_na check need for test_null_group_str_transformer - # result may still include NaN, so we have to cast - values = ensure_float64(values) - - elif how in ["sum", "ohlc", "prod", "cumsum", "cumprod"]: - # Avoid overflow during group op - if values.dtype.kind == "i": - values = ensure_int64(values) - else: - values = ensure_uint64(values) - - return values - - def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape: - how = self.how - kind = self.kind - - arity = self._cython_arity.get(how, 1) - - out_shape: Shape - if how == "ohlc": - out_shape = (ngroups, arity) - elif arity > 1: - raise NotImplementedError( - "arity of more than 1 is not supported for the 'how' argument" - ) - elif kind == "transform": - out_shape = values.shape - else: - out_shape = (ngroups,) + values.shape[1:] - return out_shape - - def _get_out_dtype(self, dtype: np.dtype) -> np.dtype: - how = self.how - - if how == "rank": - out_dtype = "float64" - elif how in ["idxmin", "idxmax"]: - # The Cython implementation only produces the row number; we'll take - # from the index using this in post processing - out_dtype = "intp" - else: - if dtype.kind in "iufcb": - out_dtype = f"{dtype.kind}{dtype.itemsize}" - else: - out_dtype = "object" - return np.dtype(out_dtype) - - def _get_result_dtype(self, dtype: np.dtype) -> np.dtype: - """ - Get the desired dtype of a result based on the - input dtype and how it was computed. - - Parameters - ---------- - dtype : np.dtype - - Returns - ------- - np.dtype - The desired dtype of the result. 
- """ - how = self.how - - if how in ["sum", "cumsum", "sum", "prod", "cumprod"]: - if dtype == np.dtype(bool): - return np.dtype(np.int64) - elif how in ["mean", "median", "var", "std", "sem"]: - if dtype.kind in "fc": - return dtype - elif dtype.kind in "iub": - return np.dtype(np.float64) - return dtype - - @final - def _cython_op_ndim_compat( - self, - values: np.ndarray, - *, - min_count: int, - ngroups: int, - comp_ids: np.ndarray, - mask: npt.NDArray[np.bool_] | None = None, - result_mask: npt.NDArray[np.bool_] | None = None, - **kwargs, - ) -> np.ndarray: - if values.ndim == 1: - # expand to 2d, dispatch, then squeeze if appropriate - values2d = values[None, :] - if mask is not None: - mask = mask[None, :] - if result_mask is not None: - result_mask = result_mask[None, :] - res = self._call_cython_op( - values2d, - min_count=min_count, - ngroups=ngroups, - comp_ids=comp_ids, - mask=mask, - result_mask=result_mask, - **kwargs, - ) - if res.shape[0] == 1: - return res[0] - - # otherwise we have OHLC - return res.T - - return self._call_cython_op( - values, - min_count=min_count, - ngroups=ngroups, - comp_ids=comp_ids, - mask=mask, - result_mask=result_mask, - **kwargs, - ) - - @final - def _call_cython_op( - self, - values: np.ndarray, # np.ndarray[ndim=2] - *, - min_count: int, - ngroups: int, - comp_ids: np.ndarray, - mask: npt.NDArray[np.bool_] | None, - result_mask: npt.NDArray[np.bool_] | None, - **kwargs, - ) -> np.ndarray: # np.ndarray[ndim=2] - orig_values = values - - dtype = values.dtype - is_numeric = dtype.kind in "iufcb" - - is_datetimelike = dtype.kind in "mM" - - if is_datetimelike: - values = values.view("int64") - is_numeric = True - elif dtype.kind == "b": - values = values.view("uint8") - if values.dtype == "float16": - values = values.astype(np.float32) - - if self.how in ["any", "all"]: - if mask is None: - mask = isna(values) - if dtype == object: - if kwargs["skipna"]: - # GH#37501: don't raise on pd.NA when skipna=True - if mask.any(): - # mask on original values computed separately - values = values.copy() - values[mask] = True - values = values.astype(bool, copy=False).view(np.int8) - is_numeric = True - - values = values.T - if mask is not None: - mask = mask.T - if result_mask is not None: - result_mask = result_mask.T - - out_shape = self._get_output_shape(ngroups, values) - func = self._get_cython_function(self.kind, self.how, values.dtype, is_numeric) - values = self._get_cython_vals(values) - out_dtype = self._get_out_dtype(values.dtype) - - result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) - if self.kind == "aggregate": - counts = np.zeros(ngroups, dtype=np.int64) - if self.how in [ - "idxmin", - "idxmax", - "min", - "max", - "mean", - "last", - "first", - "sum", - ]: - func( - out=result, - counts=counts, - values=values, - labels=comp_ids, - min_count=min_count, - mask=mask, - result_mask=result_mask, - is_datetimelike=is_datetimelike, - ) - elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]: - if self.how in ["std", "sem"]: - kwargs["is_datetimelike"] = is_datetimelike - func( - result, - counts, - values, - comp_ids, - min_count=min_count, - mask=mask, - result_mask=result_mask, - **kwargs, - ) - elif self.how in ["any", "all"]: - func( - out=result, - values=values, - labels=comp_ids, - mask=mask, - result_mask=result_mask, - **kwargs, - ) - result = result.astype(bool, copy=False) - elif self.how in ["skew"]: - func( - out=result, - counts=counts, - values=values, - labels=comp_ids, - mask=mask, - result_mask=result_mask, - 
**kwargs, - ) - if dtype == object: - result = result.astype(object) - - else: - raise NotImplementedError(f"{self.how} is not implemented") - else: - # TODO: min_count - if self.how != "rank": - # TODO: should rank take result_mask? - kwargs["result_mask"] = result_mask - func( - out=result, - values=values, - labels=comp_ids, - ngroups=ngroups, - is_datetimelike=is_datetimelike, - mask=mask, - **kwargs, - ) - - if self.kind == "aggregate" and self.how not in ["idxmin", "idxmax"]: - # i.e. counts is defined. Locations where count None: - if values.ndim > 2: - raise NotImplementedError("number of dimensions is currently limited to 2") - if values.ndim == 2: - assert axis == 1, axis - elif not is_1d_only_ea_dtype(values.dtype): - # Note: it is *not* the case that axis is always 0 for 1-dim values, - # as we can have 1D ExtensionArrays that we need to treat as 2D - assert axis == 0 - - @final - def cython_operation( - self, - *, - values: ArrayLike, - axis: AxisInt, - min_count: int = -1, - comp_ids: np.ndarray, - ngroups: int, - **kwargs, - ) -> ArrayLike: - """ - Call our cython function, with appropriate pre- and post- processing. - """ - self._validate_axis(axis, values) - - if not isinstance(values, np.ndarray): - # i.e. ExtensionArray - return values._groupby_op( - how=self.how, - has_dropped_na=self.has_dropped_na, - min_count=min_count, - ngroups=ngroups, - ids=comp_ids, - **kwargs, - ) - - return self._cython_op_ndim_compat( - values, - min_count=min_count, - ngroups=ngroups, - comp_ids=comp_ids, - mask=None, - **kwargs, - ) - - -class BaseGrouper: - """ - This is an internal Grouper class, which actually holds - the generated groups - - Parameters - ---------- - axis : Index - groupings : Sequence[Grouping] - all the grouping instances to handle in this grouper - for example for grouper list to groupby, need to pass the list - sort : bool, default True - whether this grouper will give sorted result or not - - """ - - axis: Index - - def __init__( - self, - axis: Index, - groupings: Sequence[grouper.Grouping], - sort: bool = True, - dropna: bool = True, - ) -> None: - assert isinstance(axis, Index), axis - - self.axis = axis - self._groupings: list[grouper.Grouping] = list(groupings) - self._sort = sort - self.dropna = dropna - - @property - def groupings(self) -> list[grouper.Grouping]: - return self._groupings - - @property - def shape(self) -> Shape: - return tuple(ping.ngroups for ping in self.groupings) - - def __iter__(self) -> Iterator[Hashable]: - return iter(self.indices) - - @property - def nkeys(self) -> int: - return len(self.groupings) - - def get_iterator( - self, data: NDFrameT, axis: AxisInt = 0 - ) -> Iterator[tuple[Hashable, NDFrameT]]: - """ - Groupby iterator - - Returns - ------- - Generator yielding sequence of (name, subsetted object) - for each group - """ - splitter = self._get_splitter(data, axis=axis) - keys = self.group_keys_seq - yield from zip(keys, splitter) - - @final - def _get_splitter(self, data: NDFrame, axis: AxisInt = 0) -> DataSplitter: - """ - Returns - ------- - Generator yielding subsetted objects - """ - ids, _, ngroups = self.group_info - return _get_splitter( - data, - ids, - ngroups, - sorted_ids=self._sorted_ids, - sort_idx=self._sort_idx, - axis=axis, - ) - - @final - @cache_readonly - def group_keys_seq(self): - if len(self.groupings) == 1: - return self.levels[0] - else: - ids, _, ngroups = self.group_info - - # provide "flattened" iterator for multi-group setting - return get_flattened_list(ids, ngroups, self.levels, self.codes) - - 
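For context while reviewing the move: the internals above are easiest to sanity-check through the public groupby API. A minimal sketch with a hypothetical toy frame (not part of this patch) showing the result-dtype mapping applied by _get_result_dtype and the (key, subset) pairs that get_iterator produces:

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "flag": [True, True, False]})
    gb = df.groupby("key")

    # boolean input: "sum" is upcast to int64, "mean" to float64
    print(gb["flag"].sum().dtype)   # int64
    print(gb["flag"].mean().dtype)  # float64

    # BaseGrouper.get_iterator drives plain iteration over the GroupBy object,
    # yielding one (group key, subsetted frame) pair per group
    for name, chunk in gb:
        print(name, len(chunk))

The printed dtypes are int64 and float64 respectively, matching the bool branches of _get_result_dtype, and the loop exercises the same splitting machinery that agg_series and apply_groupwise rely on.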
@cache_readonly - def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: - """dict {group name -> group indices}""" - if len(self.groupings) == 1 and isinstance(self.result_index, CategoricalIndex): - # This shows unused categories in indices GH#38642 - return self.groupings[0].indices - codes_list = [ping.codes for ping in self.groupings] - keys = [ping.group_index for ping in self.groupings] - return get_indexer_dict(codes_list, keys) - - @final - def result_ilocs(self) -> npt.NDArray[np.intp]: - """ - Get the original integer locations of result_index in the input. - """ - # Original indices are where group_index would go via sorting. - # But when dropna is true, we need to remove null values while accounting for - # any gaps that then occur because of them. - group_index = get_group_index( - self.codes, self.shape, sort=self._sort, xnull=True - ) - group_index, _ = compress_group_index(group_index, sort=self._sort) - - if self.has_dropped_na: - mask = np.where(group_index >= 0) - # Count how many gaps are caused by previous null values for each position - null_gaps = np.cumsum(group_index == -1)[mask] - group_index = group_index[mask] - - result = get_group_index_sorter(group_index, self.ngroups) - - if self.has_dropped_na: - # Shift by the number of prior null gaps - result += np.take(null_gaps, result) - - return result - - @final - @property - def codes(self) -> list[npt.NDArray[np.signedinteger]]: - return [ping.codes for ping in self.groupings] - - @property - def levels(self) -> list[Index]: - return [ping.group_index for ping in self.groupings] - - @property - def names(self) -> list[Hashable]: - return [ping.name for ping in self.groupings] - - @final - def size(self) -> Series: - """ - Compute group sizes. - """ - ids, _, ngroups = self.group_info - out: np.ndarray | list - if ngroups: - out = np.bincount(ids[ids != -1], minlength=ngroups) - else: - out = [] - return Series(out, index=self.result_index, dtype="int64") - - @cache_readonly - def groups(self) -> dict[Hashable, np.ndarray]: - """dict {group name -> group labels}""" - if len(self.groupings) == 1: - return self.groupings[0].groups - else: - to_groupby = [] - for ping in self.groupings: - gv = ping.grouping_vector - if not isinstance(gv, BaseGrouper): - to_groupby.append(gv) - else: - to_groupby.append(gv.groupings[0].grouping_vector) - index = MultiIndex.from_arrays(to_groupby) - return self.axis.groupby(index) - - @final - @cache_readonly - def is_monotonic(self) -> bool: - # return if my group orderings are monotonic - return Index(self.group_info[0]).is_monotonic_increasing - - @final - @cache_readonly - def has_dropped_na(self) -> bool: - """ - Whether grouper has null value(s) that are dropped. 
- """ - return bool((self.group_info[0] < 0).any()) - - @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: - comp_ids, obs_group_ids = self._get_compressed_codes() - - ngroups = len(obs_group_ids) - comp_ids = ensure_platform_int(comp_ids) - - return comp_ids, obs_group_ids, ngroups - - @cache_readonly - def codes_info(self) -> npt.NDArray[np.intp]: - # return the codes of items in original grouped axis - ids, _, _ = self.group_info - return ids - - @final - def _get_compressed_codes( - self, - ) -> tuple[npt.NDArray[np.signedinteger], npt.NDArray[np.intp]]: - # The first returned ndarray may have any signed integer dtype - if len(self.groupings) > 1: - group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) - return compress_group_index(group_index, sort=self._sort) - # FIXME: compress_group_index's second return value is int64, not intp - - ping = self.groupings[0] - return ping.codes, np.arange(len(ping.group_index), dtype=np.intp) - - @final - @cache_readonly - def ngroups(self) -> int: - return len(self.result_index) - - @property - def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]: - codes = self.codes - ids, obs_ids, _ = self.group_info - return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True) - - @cache_readonly - def result_index(self) -> Index: - if len(self.groupings) == 1: - return self.groupings[0].result_index.rename(self.names[0]) - - codes = self.reconstructed_codes - levels = [ping.result_index for ping in self.groupings] - return MultiIndex( - levels=levels, codes=codes, verify_integrity=False, names=self.names - ) - - @final - def get_group_levels(self) -> list[ArrayLike]: - # Note: only called from _insert_inaxis_grouper, which - # is only called for BaseGrouper, never for BinGrouper - if len(self.groupings) == 1: - return [self.groupings[0].group_arraylike] - - name_list = [] - for ping, codes in zip(self.groupings, self.reconstructed_codes): - codes = ensure_platform_int(codes) - levels = ping.group_arraylike.take(codes) - - name_list.append(levels) - - return name_list - - # ------------------------------------------------------------ - # Aggregation functions - - @final - def _cython_operation( - self, - kind: str, - values, - how: str, - axis: AxisInt, - min_count: int = -1, - **kwargs, - ) -> ArrayLike: - """ - Returns the values of a cython operation. - """ - assert kind in ["transform", "aggregate"] - - cy_op = WrappedCythonOp(kind=kind, how=how, has_dropped_na=self.has_dropped_na) - - ids, _, _ = self.group_info - ngroups = self.ngroups - return cy_op.cython_operation( - values=values, - axis=axis, - min_count=min_count, - comp_ids=ids, - ngroups=ngroups, - **kwargs, - ) - - @final - def agg_series( - self, obj: Series, func: Callable, preserve_dtype: bool = False - ) -> ArrayLike: - """ - Parameters - ---------- - obj : Series - func : function taking a Series and returning a scalar-like - preserve_dtype : bool - Whether the aggregation is known to be dtype-preserving. - - Returns - ------- - np.ndarray or ExtensionArray - """ - - if not isinstance(obj._values, np.ndarray): - # we can preserve a little bit more aggressively with EA dtype - # because maybe_cast_pointwise_result will do a try/except - # with _from_sequence. NB we are assuming here that _from_sequence - # is sufficiently strict that it casts appropriately. 
- preserve_dtype = True - - result = self._aggregate_series_pure_python(obj, func) - - if len(obj) == 0 and len(result) == 0 and isinstance(obj.dtype, ExtensionDtype): - cls = obj.dtype.construct_array_type() - out = cls._from_sequence(result) - - else: - npvalues = lib.maybe_convert_objects(result, try_float=False) - if preserve_dtype: - out = maybe_cast_pointwise_result( - npvalues, obj.dtype, numeric_only=True - ) - else: - out = npvalues - return out - - @final - def _aggregate_series_pure_python( - self, obj: Series, func: Callable - ) -> npt.NDArray[np.object_]: - _, _, ngroups = self.group_info - - result = np.empty(ngroups, dtype="O") - initialized = False - - splitter = self._get_splitter(obj, axis=0) - - for i, group in enumerate(splitter): - res = func(group) - res = extract_result(res) - - if not initialized: - # We only do this validation on the first iteration - check_result_array(res, group.dtype) - initialized = True - - result[i] = res - - return result - - @final - def apply_groupwise( - self, f: Callable, data: DataFrame | Series, axis: AxisInt = 0 - ) -> tuple[list, bool]: - mutated = False - splitter = self._get_splitter(data, axis=axis) - group_keys = self.group_keys_seq - result_values = [] - - # This calls DataSplitter.__iter__ - zipped = zip(group_keys, splitter) - - for key, group in zipped: - # Pinning name is needed for - # test_group_apply_once_per_group, - # test_inconsistent_return_type, test_set_group_name, - # test_group_name_available_in_inference_pass, - # test_groupby_multi_timezone - object.__setattr__(group, "name", key) - - # group might be modified - group_axes = group.axes - res = f(group) - if not mutated and not _is_indexed_like(res, group_axes, axis): - mutated = True - result_values.append(res) - # getattr pattern for __name__ is needed for functools.partial objects - if len(group_keys) == 0 and getattr(f, "__name__", None) in [ - "skew", - "sum", - "prod", - ]: - # If group_keys is empty, then no function calls have been made, - # so we will not have raised even if this is an invalid dtype. - # So do one dummy call here to raise appropriate TypeError. 
- f(data.iloc[:0]) - - return result_values, mutated - - # ------------------------------------------------------------ - # Methods for sorting subsets of our GroupBy's object - - @final - @cache_readonly - def _sort_idx(self) -> npt.NDArray[np.intp]: - # Counting sort indexer - ids, _, ngroups = self.group_info - return get_group_index_sorter(ids, ngroups) - - @final - @cache_readonly - def _sorted_ids(self) -> npt.NDArray[np.intp]: - ids, _, _ = self.group_info - return ids.take(self._sort_idx) - - -class BinGrouper(BaseGrouper): - """ - This is an internal Grouper class - - Parameters - ---------- - bins : the split index of binlabels to group the item of axis - binlabels : the label list - indexer : np.ndarray[np.intp], optional - the indexer created by Grouper - some groupers (TimeGrouper) will sort its axis and its - group_info is also sorted, so need the indexer to reorder - - Examples - -------- - bins: [2, 4, 6, 8, 10] - binlabels: DatetimeIndex(['2005-01-01', '2005-01-03', - '2005-01-05', '2005-01-07', '2005-01-09'], - dtype='datetime64[ns]', freq='2D') - - the group_info, which contains the label of each item in grouped - axis, the index of label in label list, group number, is - - (array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), array([0, 1, 2, 3, 4]), 5) - - means that, the grouped axis has 10 items, can be grouped into 5 - labels, the first and second items belong to the first label, the - third and forth items belong to the second label, and so on - - """ - - bins: npt.NDArray[np.int64] - binlabels: Index - - def __init__( - self, - bins, - binlabels, - indexer=None, - ) -> None: - self.bins = ensure_int64(bins) - self.binlabels = ensure_index(binlabels) - self.indexer = indexer - - # These lengths must match, otherwise we could call agg_series - # with empty self.bins, which would raise later. 
- assert len(self.binlabels) == len(self.bins) - - @cache_readonly - def groups(self): - """dict {group name -> group labels}""" - # this is mainly for compat - # GH 3881 - result = { - key: value - for key, value in zip(self.binlabels, self.bins) - if key is not NaT - } - return result - - @property - def nkeys(self) -> int: - # still matches len(self.groupings), but we can hard-code - return 1 - - @cache_readonly - def codes_info(self) -> npt.NDArray[np.intp]: - # return the codes of items in original grouped axis - ids, _, _ = self.group_info - if self.indexer is not None: - sorter = np.lexsort((ids, self.indexer)) - ids = ids[sorter] - return ids - - def get_iterator(self, data: NDFrame, axis: AxisInt = 0): - """ - Groupby iterator - - Returns - ------- - Generator yielding sequence of (name, subsetted object) - for each group - """ - if axis == 0: - slicer = lambda start, edge: data.iloc[start:edge] - else: - slicer = lambda start, edge: data.iloc[:, start:edge] - - length = len(data.axes[axis]) - - start = 0 - for edge, label in zip(self.bins, self.binlabels): - if label is not NaT: - yield label, slicer(start, edge) - start = edge - - if start < length: - yield self.binlabels[-1], slicer(start, None) - - @cache_readonly - def indices(self): - indices = collections.defaultdict(list) - - i = 0 - for label, bin in zip(self.binlabels, self.bins): - if i < bin: - if label is not NaT: - indices[label] = list(range(i, bin)) - i = bin - return indices - - @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: - ngroups = self.ngroups - obs_group_ids = np.arange(ngroups, dtype=np.intp) - rep = np.diff(np.r_[0, self.bins]) - - rep = ensure_platform_int(rep) - if ngroups == len(self.bins): - comp_ids = np.repeat(np.arange(ngroups), rep) - else: - comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep) - - return ( - ensure_platform_int(comp_ids), - obs_group_ids, - ngroups, - ) - - @cache_readonly - def reconstructed_codes(self) -> list[np.ndarray]: - # get unique result indices, and prepend 0 as groupby starts from the first - return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]] - - @cache_readonly - def result_index(self) -> Index: - if len(self.binlabels) != 0 and isna(self.binlabels[0]): - return self.binlabels[1:] - - return self.binlabels - - @property - def levels(self) -> list[Index]: - return [self.binlabels] - - @property - def names(self) -> list[Hashable]: - return [self.binlabels.name] - - @property - def groupings(self) -> list[grouper.Grouping]: - lev = self.binlabels - codes = self.group_info[0] - labels = lev.take(codes) - ping = grouper.Grouping( - labels, labels, in_axis=False, level=None, uniques=lev._values - ) - return [ping] - - -def _is_indexed_like(obj, axes, axis: AxisInt) -> bool: - if isinstance(obj, Series): - if len(axes) > 1: - return False - return obj.axes[axis].equals(axes[axis]) - elif isinstance(obj, DataFrame): - return obj.axes[axis].equals(axes[axis]) - - return False - - -# ---------------------------------------------------------------------- -# Splitting / application - - -class DataSplitter(Generic[NDFrameT]): - def __init__( - self, - data: NDFrameT, - labels: npt.NDArray[np.intp], - ngroups: int, - *, - sort_idx: npt.NDArray[np.intp], - sorted_ids: npt.NDArray[np.intp], - axis: AxisInt = 0, - ) -> None: - self.data = data - self.labels = ensure_platform_int(labels) # _should_ already be np.intp - self.ngroups = ngroups - - self._slabels = sorted_ids - self._sort_idx = sort_idx - - 
self.axis = axis - assert isinstance(axis, int), axis - - def __iter__(self) -> Iterator: - sdata = self._sorted_data - - if self.ngroups == 0: - # we are inside a generator, rather than raise StopIteration - # we merely return signal the end - return - - starts, ends = lib.generate_slices(self._slabels, self.ngroups) - - for start, end in zip(starts, ends): - yield self._chop(sdata, slice(start, end)) - - @cache_readonly - def _sorted_data(self) -> NDFrameT: - return self.data.take(self._sort_idx, axis=self.axis) - - def _chop(self, sdata, slice_obj: slice) -> NDFrame: - raise AbstractMethodError(self) - - -class SeriesSplitter(DataSplitter): - def _chop(self, sdata: Series, slice_obj: slice) -> Series: - # fastpath equivalent to `sdata.iloc[slice_obj]` - mgr = sdata._mgr.get_slice(slice_obj) - ser = sdata._constructor_from_mgr(mgr, axes=mgr.axes) - ser._name = sdata.name - return ser.__finalize__(sdata, method="groupby") - - -class FrameSplitter(DataSplitter): - def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: - # Fastpath equivalent to: - # if self.axis == 0: - # return sdata.iloc[slice_obj] - # else: - # return sdata.iloc[:, slice_obj] - mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) - df = sdata._constructor_from_mgr(mgr, axes=mgr.axes) - return df.__finalize__(sdata, method="groupby") +from pandas._core.groupby import ops +from pandas.core.common import _depr_core +_depr_core() -def _get_splitter( - data: NDFrame, - labels: npt.NDArray[np.intp], - ngroups: int, - *, - sort_idx: npt.NDArray[np.intp], - sorted_ids: npt.NDArray[np.intp], - axis: AxisInt = 0, -) -> DataSplitter: - if isinstance(data, Series): - klass: type[DataSplitter] = SeriesSplitter - else: - # i.e. DataFrame - klass = FrameSplitter +_globals = globals() - return klass( - data, labels, ngroups, sort_idx=sort_idx, sorted_ids=sorted_ids, axis=axis - ) +for item in ops.__dir__(): + _globals[item] = getattr(ops, item) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 14cd77ec8559b..e48c9448a5df8 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -43,6 +43,16 @@ ABCSeries, ) +from pandas._core.groupby.generic import SeriesGroupBy +from pandas._core.groupby.groupby import ( + BaseGroupBy, + GroupBy, + _apply_groupings_depr, + _pipe_template, + get_groupby, +) +from pandas._core.groupby.grouper import Grouper +from pandas._core.groupby.ops import BinGrouper import pandas.core.algorithms as algos from pandas.core.apply import ( ResamplerWindowApply, @@ -57,16 +67,6 @@ NDFrame, _shared_docs, ) -from pandas.core.groupby.generic import SeriesGroupBy -from pandas.core.groupby.groupby import ( - BaseGroupBy, - GroupBy, - _apply_groupings_depr, - _pipe_template, - get_groupby, -) -from pandas.core.groupby.grouper import Grouper -from pandas.core.groupby.ops import BinGrouper from pandas.core.indexes.api import MultiIndex from pandas.core.indexes.datetimes import ( DatetimeIndex, diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f8575b1b53908..7711d6e2ab9aa 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -104,7 +104,7 @@ if TYPE_CHECKING: from pandas import DataFrame - from pandas.core import groupby + from pandas._core import groupby from pandas.core.arrays import DatetimeArray from pandas.core.indexes.frozen import FrozenList diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 79354fdd12a2d..83a925eb37aad 100644 --- a/pandas/core/reshape/pivot.py +++ 
b/pandas/core/reshape/pivot.py @@ -31,9 +31,9 @@ ABCSeries, ) +from pandas._core.groupby import Grouper import pandas.core.common as com from pandas.core.frame import _shared_docs -from pandas.core.groupby import Grouper from pandas.core.indexes.api import ( Index, MultiIndex, diff --git a/pandas/core/series.py b/pandas/core/series.py index a021ea7961cc0..83048f76ea233 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -193,8 +193,8 @@ npt, ) + from pandas._core.groupby.generic import SeriesGroupBy from pandas.core.frame import DataFrame - from pandas.core.groupby.generic import SeriesGroupBy __all__ = ["Series"] @@ -2190,7 +2190,7 @@ def groupby( observed: bool | lib.NoDefault = lib.no_default, dropna: bool = True, ) -> SeriesGroupBy: - from pandas.core.groupby.generic import SeriesGroupBy + from pandas._core.groupby.generic import SeriesGroupBy if level is None and by is None: raise TypeError("You have to supply one of 'by' and 'level'") diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index f90863a8ea1ef..bf0f4fec84bd2 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -111,7 +111,7 @@ Series, ) from pandas.core.generic import NDFrame - from pandas.core.groupby.ops import BaseGrouper + from pandas._core.groupby.ops import BaseGrouper from pandas.core.arrays.datetimelike import dtype_to_unit @@ -695,7 +695,7 @@ def __init__( _as_index: bool = True, **kwargs, ) -> None: - from pandas.core.groupby.ops import BaseGrouper + from pandas._core.groupby.ops import BaseGrouper if not isinstance(_grouper, BaseGrouper): raise ValueError("Must pass a BaseGrouper object.") diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index a017787f2dc2d..09174ccf02225 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -41,7 +41,7 @@ DataFrame, Series, ) - from pandas.core.groupby.generic import DataFrameGroupBy + from pandas._core.groupby.generic import DataFrameGroupBy def hist_series( diff --git a/pandas/tests/apply/common.py b/pandas/tests/apply/common.py index b4d153df54059..6d30c0ac3cf91 100644 --- a/pandas/tests/apply/common.py +++ b/pandas/tests/apply/common.py @@ -1,4 +1,4 @@ -from pandas.core.groupby.base import transformation_kernels +from pandas._core.groupby.base import transformation_kernels # There is no Series.cumcount or DataFrame.cumcount series_transform_kernels = [ diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 882f42ff18bdd..359ae530e8b62 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -22,8 +22,8 @@ concat, to_datetime, ) +from pandas._core.groupby.grouper import Grouping import pandas._testing as tm -from pandas.core.groupby.grouper import Grouping def test_groupby_agg_no_extra_calls(): diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 49fa9dc51f0d3..61b4f785d1635 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -2,11 +2,11 @@ import pytest from pandas import DataFrame -import pandas._testing as tm -from pandas.core.groupby.base import ( +from pandas._core.groupby.base import ( reduction_kernels, transformation_kernels, ) +import pandas._testing as tm @pytest.fixture(params=[True, False]) diff --git a/pandas/tests/groupby/test_api.py b/pandas/tests/groupby/test_api.py index 1a030841ba3ab..ed4681449d348 100644 --- a/pandas/tests/groupby/test_api.py +++ 
b/pandas/tests/groupby/test_api.py @@ -13,12 +13,12 @@ DataFrame, Series, ) -from pandas.core.groupby.base import ( +from pandas._core.groupby.base import ( groupby_other_methods, reduction_kernels, transformation_kernels, ) -from pandas.core.groupby.generic import ( +from pandas._core.groupby.generic import ( DataFrameGroupBy, SeriesGroupBy, ) @@ -119,7 +119,7 @@ def test_all_methods_categorized(mframe): Was a new method recently added? Every public method On Grouper must appear in exactly one the -following three lists defined in pandas.core.groupby.base: +following three lists defined in pandas._core.groupby.base: - `reduction_kernels` - `transformation_kernels` - `groupby_other_methods` diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 01768582299eb..3084cd8fb0712 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -20,8 +20,8 @@ Timestamp, date_range, ) +from pandas._core.groupby.grouper import Grouping import pandas._testing as tm -from pandas.core.groupby.grouper import Grouping # selection # -------------------------------- diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index be02c7f79ba01..d883a10670f38 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -21,9 +21,9 @@ date_range, offsets, ) +from pandas._core.groupby.grouper import Grouper +from pandas._core.groupby.ops import BinGrouper import pandas._testing as tm -from pandas.core.groupby.grouper import Grouper -from pandas.core.groupby.ops import BinGrouper @pytest.fixture diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index fee52780585b8..11aadbdb30a4c 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -11,9 +11,9 @@ Series, TimedeltaIndex, ) +from pandas._core.groupby.groupby import DataError +from pandas._core.groupby.grouper import Grouper import pandas._testing as tm -from pandas.core.groupby.groupby import DataError -from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import period_range from pandas.core.indexes.timedeltas import timedelta_range diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 9cf4e68a9c5ec..fa51b63d6953b 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -17,8 +17,8 @@ isna, notna, ) +from pandas._core.groupby.grouper import Grouper import pandas._testing as tm -from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import ( Period, diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 41d34be79bc9c..8145d1b995cba 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -10,8 +10,8 @@ Series, Timestamp, ) +from pandas._core.groupby.grouper import Grouper import pandas._testing as tm -from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index a23c91df5eef6..fdf13b59bf965 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -10,9 +10,9 @@ date_range, to_datetime, ) +from 
pandas._core.groupby.groupby import get_groupby import pandas._testing as tm from pandas.api.indexers import BaseIndexer -from pandas.core.groupby.groupby import get_groupby @pytest.fixture diff --git a/pyproject.toml b/pyproject.toml index 26d52d97b0934..adcb35959de2f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -598,11 +598,11 @@ module = [ "pandas.core.dtypes.generic", # TODO "pandas.core.dtypes.inference", # TODO "pandas.core.dtypes.missing", # TODO - "pandas.core.groupby.categorical", # TODO - "pandas.core.groupby.generic", # TODO - "pandas.core.groupby.grouper", # TODO - "pandas.core.groupby.groupby", # TODO - "pandas.core.groupby.ops", # TODO + "pandas._core.groupby.categorical", # TODO + "pandas._core.groupby.generic", # TODO + "pandas._core.groupby.grouper", # TODO + "pandas._core.groupby.groupby", # TODO + "pandas._core.groupby.ops", # TODO "pandas.core.indexers.*", # TODO "pandas.core.indexes.*", # TODO "pandas.core.interchange.column", # TODO diff --git a/pyright_reportGeneralTypeIssues.json b/pyright_reportGeneralTypeIssues.json index e155b34053069..eb90ce359c4e9 100644 --- a/pyright_reportGeneralTypeIssues.json +++ b/pyright_reportGeneralTypeIssues.json @@ -46,10 +46,10 @@ "pandas/core/dtypes/dtypes.py", "pandas/core/frame.py", "pandas/core/generic.py", - "pandas/core/groupby/generic.py", - "pandas/core/groupby/groupby.py", - "pandas/core/groupby/grouper.py", - "pandas/core/groupby/ops.py", + "pandas/_core/groupby/generic.py", + "pandas/_core/groupby/groupby.py", + "pandas/_core/groupby/grouper.py", + "pandas/_core/groupby/ops.py", "pandas/core/indexers/utils.py", "pandas/core/indexes/base.py", "pandas/core/indexes/category.py", From 2136bb277899f710573fdd71e34f0a4208425c13 Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 15 Nov 2023 20:35:05 -0500 Subject: [PATCH 3/3] Manual changes --- pandas/__init__.py | 4 +++- pandas/_core/__init__.py | 0 pandas/tests/api/test_api.py | 1 + pandas/util/_tester.py | 1 + pyright_reportGeneralTypeIssues.json | 1 + 5 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 pandas/_core/__init__.py diff --git a/pandas/__init__.py b/pandas/__init__.py index 7fab662ed2de4..d8b499282d2d0 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -44,7 +44,7 @@ ) # let init-time option registration happen -import pandas.core.config_init # pyright: ignore[reportUnusedImport] # noqa: F401 +import pandas.core.config_init # pyright: ignore[reportUnusedImport] from pandas.core.api import ( # dtype @@ -365,3 +365,5 @@ "value_counts", "wide_to_long", ] + +import pandas.core.groupby # pyright: ignore[reportUnusedImport] # noqa: F401 diff --git a/pandas/_core/__init__.py b/pandas/_core/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index bc50726a1b5f9..dd462caa75b2c 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -186,6 +186,7 @@ class TestPDApi(Base): # private modules in pandas namespace private_modules = [ "_config", + "_core", "_libs", "_is_numpy_dev", "_pandas_datetime_CAPI", diff --git a/pandas/util/_tester.py b/pandas/util/_tester.py index 7cfddef7ddff8..c3e2379bc9fc8 100644 --- a/pandas/util/_tester.py +++ b/pandas/util/_tester.py @@ -43,6 +43,7 @@ def test(extra_args: list[str] | None = None, run_doctests: bool = False) -> Non "--doctest-modules", "--doctest-cython", f"--ignore={os.path.join(PKG, 'tests')}", + f"--ignore={os.path.join(PKG, 'core', 'groupby')}", ] cmd += [PKG] joined = " ".join(cmd) diff 
--git a/pyright_reportGeneralTypeIssues.json b/pyright_reportGeneralTypeIssues.json index eb90ce359c4e9..b6dd9accb9e45 100644 --- a/pyright_reportGeneralTypeIssues.json +++ b/pyright_reportGeneralTypeIssues.json @@ -10,6 +10,7 @@ "exclude": [ "pandas/tests", + "pandas/core/groupby", "pandas/io/clipboard", "pandas/util/version",
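With the shim modules in place, the legacy pandas.core.groupby paths keep resolving but warn at import time; internal imports now target pandas._core.groupby instead. A small usage sketch, assuming a build of pandas that includes this patch and surfacing DeprecationWarning explicitly so the message is visible regardless of the default warning filters:

    import warnings

    warnings.simplefilter("always", DeprecationWarning)  # surface the warning

    from pandas.core.groupby import ops            # legacy path: still works, but warns
    from pandas._core.groupby import ops as _ops   # new internal location

    # the shim re-exports every attribute, so existing references keep resolving
    assert ops.BaseGrouper is _ops.BaseGrouper

Downstream code should not need either path; the public entry points (DataFrame.groupby, Series.groupby, pandas.Grouper) are unchanged.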