From 11ba53510098618f4c2441e4df7f24b7b80164e3 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Sat, 3 Dec 2022 19:41:10 -0900 Subject: [PATCH 1/4] DOC: Improve groupby().ngroup() explanation for missing groups --- pandas/core/groupby/groupby.py | 37 ++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 659ca228bdcb0..d475aa31e7c4b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3212,6 +3212,9 @@ def ngroup(self, ascending: bool = True): would be seen when iterating over the groupby object, not the order they are first observed. + If a group would be excluded (due to null keys) then that + group is labeled as np.nan. See examples below. + Parameters ---------- ascending : bool, default True @@ -3228,15 +3231,17 @@ def ngroup(self, ascending: bool = True): Examples -------- - >>> df = pd.DataFrame({"A": list("aaabba")}) + >>> df = pd.DataFrame() + >>> df["A"] = ["a", "a", "a", "b", "b", "a"] + >>> df["B"] = ["a", None, "a", "b", "b", "a"] >>> df - A - 0 a - 1 a - 2 a - 3 b - 4 b - 5 a + A B + 0 a a + 1 a None + 2 a a + 3 b b + 4 b b + 5 a a >>> df.groupby('A').ngroup() 0 0 1 0 @@ -3261,6 +3266,22 @@ def ngroup(self, ascending: bool = True): 4 2 5 0 dtype: int64 + >>> df.groupby("B").ngroup() + 0 0.0 + 1 NaN + 2 0.0 + 3 1.0 + 4 1.0 + 5 0.0 + dtype: float64 + >>> df.groupby("B", dropna=False).ngroup() + 0 0 + 1 2 + 2 0 + 3 1 + 4 1 + 5 0 + dtype: int64 """ with self._group_selection_context(): index = self._selected_obj.index From 9466d4d5747f034737dd2ffb546e97ec500abbad Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 30 Dec 2022 15:32:59 -0900 Subject: [PATCH 2/4] DOC: fixup PR suggestions Per comments at https://github.com/pandas-dev/pandas/pull/50049 --- pandas/core/groupby/groupby.py | 50 ++++++++-------------------------- 1 file changed, 12 insertions(+), 38 deletions(-) diff --git a/pandas/core/groupby/groupby.py 
b/pandas/core/groupby/groupby.py index d475aa31e7c4b..82b83da519e1e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3212,8 +3212,8 @@ def ngroup(self, ascending: bool = True): would be seen when iterating over the groupby object, not the order they are first observed. - If a group would be excluded (due to null keys) then that - group is labeled as np.nan. See examples below. + Groups with missing keys (where `pd.isna()` is True) will be labeled with `NaN` + and will be skipped from the count. Parameters ---------- @@ -3231,42 +3231,16 @@ def ngroup(self, ascending: bool = True): Examples -------- - >>> df = pd.DataFrame() - >>> df["A"] = ["a", "a", "a", "b", "b", "a"] - >>> df["B"] = ["a", None, "a", "b", "b", "a"] + >>> df = pd.DataFrame({"color": ["red", None, "red", "blue", "blue", "red"]}) >>> df - A B - 0 a a - 1 a None - 2 a a - 3 b b - 4 b b - 5 a a - >>> df.groupby('A').ngroup() - 0 0 - 1 0 - 2 0 - 3 1 - 4 1 - 5 0 - dtype: int64 - >>> df.groupby('A').ngroup(ascending=False) - 0 1 - 1 1 - 2 1 - 3 0 - 4 0 - 5 1 - dtype: int64 - >>> df.groupby(["A", [1,1,2,3,2,1]]).ngroup() - 0 0 - 1 0 - 2 1 - 3 3 - 4 2 - 5 0 - dtype: int64 - >>> df.groupby("B").ngroup() + color + 0 red + 1 None + 2 red + 3 blue + 4 blue + 5 red + >>> df.groupby("color").ngroup() 0 0.0 1 NaN 2 0.0 @@ -3274,7 +3248,7 @@ def ngroup(self, ascending: bool = True): 4 1.0 5 0.0 dtype: float64 - >>> df.groupby("B", dropna=False).ngroup() + >>> df.groupby("color", dropna=False).ngroup() 0 0 1 2 2 0 From 66ae02eb6ebd754f87b779f635058e02aa155f88 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Sun, 1 Jan 2023 20:41:41 -0900 Subject: [PATCH 3/4] DOC: fixup: update order of labels Groups are numbered in lexicographically sorted key order. Now that the example uses "red" and "blue" instead of "a" and "b", the ngroup labels swap, because "blue" sorts before "red". The expected output is therefore deterministic and should not be flaky.
--- pandas/core/groupby/groupby.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 82b83da519e1e..b2659a1d019b5 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3241,20 +3241,20 @@ def ngroup(self, ascending: bool = True): 4 blue 5 red >>> df.groupby("color").ngroup() - 0 0.0 + 0 1.0 1 NaN - 2 0.0 - 3 1.0 - 4 1.0 - 5 0.0 + 2 1.0 + 3 0.0 + 4 0.0 + 5 1.0 dtype: float64 >>> df.groupby("color", dropna=False).ngroup() - 0 0 + 0 1 1 2 - 2 0 - 3 1 - 4 1 - 5 0 + 2 1 + 3 0 + 4 0 + 5 1 dtype: int64 """ with self._group_selection_context(): From e5d0075b216c15c1cc4c508e78f8239ab3070569 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Mon, 2 Jan 2023 22:09:46 -0900 Subject: [PATCH 4/4] DOC: fixup: restore ascending=false example in docstring I chose to use `dropna=False` because I wanted to show where NA keys are placed relative to the other keys: the NA group is ordered last, so with `ascending=False` it is numbered first (label 0). I figured the `dropna=True` case was obvious enough from this example, so I did not add that one as well; otherwise the docstring would get very verbose. --- pandas/core/groupby/groupby.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b2659a1d019b5..9a813e866e8d0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3256,6 +3256,14 @@ def ngroup(self, ascending: bool = True): 4 0 5 1 dtype: int64 + >>> df.groupby("color", dropna=False).ngroup(ascending=False) + 0 1 + 1 0 + 2 1 + 3 2 + 4 2 + 5 1 + dtype: int64 """ with self._group_selection_context(): index = self._selected_obj.index