From e3d1a79a76b4a29663b5cdb8b0701a878722e6e1 Mon Sep 17 00:00:00 2001 From: Tony Tao Date: Mon, 1 Oct 2018 00:30:23 -0500 Subject: [PATCH 1/9] DOC GH22893 Fix docstring of groupby in pandas/core/generic.py --- pandas/core/generic.py | 63 ++++++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 393e7caae5fab..c56c55c213acd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7063,8 +7063,10 @@ def clip_lower(self, threshold, axis=None, inplace=False): def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, observed=False, **kwargs): """ - Group series using mapper (dict or key function, apply given function - to group, return result as series) or by a series of columns. + Group series using a mapper or by a series of columns. + + The mapper is a dict or key function that applies the given function + to group and return result as series. Parameters ---------- @@ -7078,26 +7080,29 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, labels may be passed to group by the columns in ``self``. Notice that a tuple is interpreted a (single) key. axis : int, default 0 + If 0, split by rows. If 1, split by columns. level : int, level name, or sequence of such, default None If the axis is a MultiIndex (hierarchical), group by a particular - level or levels + level or levels. as_index : boolean, default True For aggregated output, return object with group labels as the index. Only relevant for DataFrame input. as_index=False is - effectively "SQL-style" grouped output + effectively "SQL-style" grouped output. sort : boolean, default True Sort group keys. Get better performance by turning this off. Note this does not influence the order of observations within each group. groupby preserves the order of rows within each group. 
group_keys : boolean, default True - When calling apply, add group keys to index to identify pieces + When calling apply, add group keys to index to identify pieces. squeeze : boolean, default False - reduce the dimensionality of the return type if possible, - otherwise return a consistent type + Reduce the dimensionality of the return type if possible, + otherwise return a consistent type. observed : boolean, default False This only applies if any of the groupers are Categoricals If True: only show observed values for categorical groupers. If False: show all values for categorical groupers. + **kwargs + Only accepts argument 'mutated'. .. versionadded:: 0.23.0 @@ -7107,14 +7112,42 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, Examples -------- - DataFrame results - - >>> data.groupby(func, axis=0).mean() - >>> data.groupby(['col1', 'col2'])['col3'].mean() - - DataFrame with hierarchical index - - >>> data.groupby(['col1', 'col2']).mean() + >>> df = pd.DataFrame({'col1' : ['A', 'A', 'B', 'B'], + ... 'col2' : [1, 2, 3, 4]}) + >>> df + col1 col2 + 0 A 1 + 1 A 2 + 2 B 3 + 3 B 4 + >>> df.groupby(['col1']).mean() + col2 + col1 + A 1.5 + B 3.5 + + **Hierarchical indexes** + + We can groupby different levels of a hierarchical index + using the `level` parameter: + + >>> arrays = [np.array(['A', 'A', 'B', 'B']), + ... 
np.array(['foo', 'bar', 'foo', 'bar'])] + >>> df = pd.DataFrame(np.array([1, 2, 3, 4]), index=arrays) + >>> df + 0 + A foo 1 + bar 2 + B foo 3 + bar 4 + >>> df.groupby(level=0).mean() + 0 + A 1.5 + B 3.5 + >>> df.groupby(level=1).mean() + 0 + bar 3 + foo 2 Notes ----- From 62c9c3ad6024c80f297d12639fc00b3db5e97faa Mon Sep 17 00:00:00 2001 From: Tony Tao Date: Mon, 1 Oct 2018 00:48:14 -0500 Subject: [PATCH 2/9] DOC GH22893 Fix docstring of groupby in pandas/core/generic.py --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c56c55c213acd..8308c5e94c85b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7066,7 +7066,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, Group series using a mapper or by a series of columns. The mapper is a dict or key function that applies the given function - to group and return result as series. + on the selected axis and returns the result as a series. Parameters ---------- @@ -7080,7 +7080,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, labels may be passed to group by the columns in ``self``. Notice that a tuple is interpreted a (single) key. axis : int, default 0 - If 0, split by rows. If 1, split by columns. + If 0, group by rows. If 1, group by columns. level : int, level name, or sequence of such, default None If the axis is a MultiIndex (hierarchical), group by a particular level or levels. 
From a02652fc4c3fd54991ce161c34be8e9e6abfbcf5 Mon Sep 17 00:00:00 2001 From: Tony Tao Date: Mon, 1 Oct 2018 02:15:38 -0500 Subject: [PATCH 3/9] Minor fixes --- pandas/core/generic.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8308c5e94c85b..d4e044fcee753 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7091,18 +7091,19 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, sort : boolean, default True Sort group keys. Get better performance by turning this off. Note this does not influence the order of observations within each - group. groupby preserves the order of rows within each group. + group. Groupby preserves the order of rows within each group. group_keys : boolean, default True When calling apply, add group keys to index to identify pieces. squeeze : boolean, default False Reduce the dimensionality of the return type if possible, otherwise return a consistent type. observed : boolean, default False - This only applies if any of the groupers are Categoricals + This only applies if any of the groupers are Categoricals. If True: only show observed values for categorical groupers. If False: show all values for categorical groupers. **kwargs - Only accepts argument 'mutated'. + Optional, only accepts keyword argument 'mutated' + and is passed to groupby. .. 
versionadded:: 0.23.0 From 3b09a713ada839e23362e234694b6cac974235ba Mon Sep 17 00:00:00 2001 From: Tony Tao Date: Mon, 1 Oct 2018 04:19:00 -0500 Subject: [PATCH 4/9] Minor fixes and updated description --- pandas/core/generic.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d4e044fcee753..53a96ee56095a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7065,8 +7065,10 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, """ Group series using a mapper or by a series of columns. - The mapper is a dict or key function that applies the given function - on the selected axis and returns the result as a series. + Any groupby operation involves some combination of splitting the + object, applying a function, and combining the results. This can be + used to group large amounts of data and compute operations on these + groups. Parameters ---------- @@ -7079,8 +7081,8 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, values are used as-is determine the groups. A label or list of labels may be passed to group by the columns in ``self``. Notice that a tuple is interpreted a (single) key. - axis : int, default 0 - If 0, group by rows. If 1, group by columns. + axis : {0 or 'index', 1 or 'columns', None} + Split along rows (0) or columns (1). level : int, level name, or sequence of such, default None If the axis is a MultiIndex (hierarchical), group by a particular level or levels. @@ -7101,12 +7103,13 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, This only applies if any of the groupers are Categoricals. If True: only show observed values for categorical groupers. If False: show all values for categorical groupers. + + .. versionadded:: 0.23.0 + **kwargs Optional, only accepts keyword argument 'mutated' and is passed to groupby. - .. 
versionadded:: 0.23.0 - Returns ------- GroupBy object From 1579ba18ad81411acc554e52d03effabc2d937d1 Mon Sep 17 00:00:00 2001 From: Tony Tao Date: Mon, 1 Oct 2018 04:47:44 -0500 Subject: [PATCH 5/9] Minor fixes, updated return description --- pandas/core/generic.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 53a96ee56095a..6e4ea9e4377e6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7065,7 +7065,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, """ Group series using a mapper or by a series of columns. - Any groupby operation involves some combination of splitting the + A groupby operation involves some combination of splitting the object, applying a function, and combining the results. This can be used to group large amounts of data and compute operations on these groups. @@ -7081,25 +7081,25 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, values are used as-is determine the groups. A label or list of labels may be passed to group by the columns in ``self``. Notice that a tuple is interpreted a (single) key. - axis : {0 or 'index', 1 or 'columns', None} + axis : {0 or 'index', 1 or 'columns'} Split along rows (0) or columns (1). level : int, level name, or sequence of such, default None If the axis is a MultiIndex (hierarchical), group by a particular level or levels. - as_index : boolean, default True + as_index : bool, default True For aggregated output, return object with group labels as the index. Only relevant for DataFrame input. as_index=False is effectively "SQL-style" grouped output. - sort : boolean, default True + sort : bool, default True Sort group keys. Get better performance by turning this off. Note this does not influence the order of observations within each group. Groupby preserves the order of rows within each group. 
- group_keys : boolean, default True + group_keys : bool, default True When calling apply, add group keys to index to identify pieces. - squeeze : boolean, default False + squeeze : bool, default False Reduce the dimensionality of the return type if possible, otherwise return a consistent type. - observed : boolean, default False + observed : bool, default False This only applies if any of the groupers are Categoricals. If True: only show observed values for categorical groupers. If False: show all values for categorical groupers. @@ -7107,12 +7107,18 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, .. versionadded:: 0.23.0 **kwargs - Optional, only accepts keyword argument 'mutated' - and is passed to groupby. + Optional, only accepts keyword argument 'mutated' and is passed + to groupby. Returns ------- - GroupBy object + DataFrameGroupBy object + An object that contains information about the groups. + + See Also + -------- + resample : Convenience method for frequency conversion and resampling + of time series. Examples -------- @@ -7130,7 +7136,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, A 1.5 B 3.5 - **Hierarchical indexes** + **Hierarchical Indexes** We can groupby different levels of a hierarchical index using the `level` parameter: @@ -7157,11 +7163,6 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, ----- See the `user guide `_ for more. - - See also - -------- - resample : Convenience method for frequency conversion and resampling - of time series. 
""" from pandas.core.groupby.groupby import groupby From bc75b8abbcdcbe645e9e41f904768c79be489c8f Mon Sep 17 00:00:00 2001 From: Tony Tao Date: Mon, 1 Oct 2018 16:33:29 -0500 Subject: [PATCH 6/9] Various fixes, meaningful examples added --- pandas/core/generic.py | 78 +++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6e4ea9e4377e6..32fdb6a04aeff 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7063,7 +7063,7 @@ def clip_lower(self, threshold, axis=None, inplace=False): def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, observed=False, **kwargs): """ - Group series using a mapper or by a series of columns. + Group dataframe or series using a mapper or by a series of columns. A groupby operation involves some combination of splitting the object, applying a function, and combining the results. This can be @@ -7081,7 +7081,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, values are used as-is determine the groups. A label or list of labels may be passed to group by the columns in ``self``. Notice that a tuple is interpreted a (single) key. - axis : {0 or 'index', 1 or 'columns'} + axis : {0 or 'index', 1 or 'columns'}, default 0 Split along rows (0) or columns (1). level : int, level name, or sequence of such, default None If the axis is a MultiIndex (hierarchical), group by a particular @@ -7112,57 +7112,63 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, Returns ------- - DataFrameGroupBy object - An object that contains information about the groups. + DataFrameGroupBy or SeriesGroupBy + Depends on the calling object and returns groupby object that + contains information about the groups. See Also -------- resample : Convenience method for frequency conversion and resampling of time series. 
+ Notes + ----- + See the `user guide + `_ for more. + Examples -------- - >>> df = pd.DataFrame({'col1' : ['A', 'A', 'B', 'B'], - ... 'col2' : [1, 2, 3, 4]}) + >>> df = pd.DataFrame({'Student' : ['Bob', 'Bob', 'Mary', 'Mary'], + ... 'Grade' : [100, 92, 82, 85]}) >>> df - col1 col2 - 0 A 1 - 1 A 2 - 2 B 3 - 3 B 4 - >>> df.groupby(['col1']).mean() - col2 - col1 - A 1.5 - B 3.5 + Student Grade + 0 Bob 100 + 1 Bob 92 + 2 Mary 82 + 3 Mary 85 + >>> df.groupby(['Student']).mean() + Grade + Student + Bob 96.0 + Mary 83.5 **Hierarchical Indexes** We can groupby different levels of a hierarchical index using the `level` parameter: - >>> arrays = [np.array(['A', 'A', 'B', 'B']), - ... np.array(['foo', 'bar', 'foo', 'bar'])] - >>> df = pd.DataFrame(np.array([1, 2, 3, 4]), index=arrays) + >>> arrays = [['TX', 'TX', 'NY', 'NY'], + ... ['Urban', 'Rural', 'Urban', 'Rural']] + >>> index = pd.MultiIndex.from_arrays(arrays, names=('State', 'Type')) + >>> df = pd.DataFrame({'Pop %' : [84.7, 15.3, 87.9, 12.1]}, + ... index=index) >>> df - 0 - A foo 1 - bar 2 - B foo 3 - bar 4 - >>> df.groupby(level=0).mean() - 0 - A 1.5 - B 3.5 + Pop % + State Type + TX Urban 84.7 + Rural 15.3 + NY Urban 87.9 + Rural 12.1 + >>> df.groupby(level=0).sum() + Pop % + State + NY 100.0 + TX 100.0 >>> df.groupby(level=1).mean() - 0 - bar 3 - foo 2 - - Notes - ----- - See the `user guide - `_ for more. 
+ Pop % + Type + Rural 13.7 + Urban 86.3 """ from pandas.core.groupby.groupby import groupby From d44a867d02d9b74c164e3556974ec93af02474c8 Mon Sep 17 00:00:00 2001 From: Tony Tao Date: Mon, 1 Oct 2018 16:36:00 -0500 Subject: [PATCH 7/9] Removed trailing whitespaces --- pandas/core/generic.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 32fdb6a04aeff..15eb2d6dc839e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7113,7 +7113,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, Returns ------- DataFrameGroupBy or SeriesGroupBy - Depends on the calling object and returns groupby object that + Depends on the calling object and returns groupby object that contains information about the groups. See Also @@ -7138,7 +7138,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, 3 Mary 85 >>> df.groupby(['Student']).mean() Grade - Student + Student Bob 96.0 Mary 83.5 @@ -7154,19 +7154,19 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, ... 
index=index) >>> df Pop % - State Type + State Type TX Urban 84.7 Rural 15.3 NY Urban 87.9 Rural 12.1 >>> df.groupby(level=0).sum() Pop % - State + State NY 100.0 TX 100.0 >>> df.groupby(level=1).mean() Pop % - Type + Type Rural 13.7 Urban 86.3 """ From 3f748f07005a6e0cdd03b99eb4a8e7f11f5f7a6f Mon Sep 17 00:00:00 2001 From: Tony Tao Date: Mon, 1 Oct 2018 17:14:22 -0500 Subject: [PATCH 8/9] Redid examples to fit convention --- pandas/core/generic.py | 61 +++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 15eb2d6dc839e..5edc3142f1726 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7128,47 +7128,48 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, Examples -------- - >>> df = pd.DataFrame({'Student' : ['Bob', 'Bob', 'Mary', 'Mary'], - ... 'Grade' : [100, 92, 82, 85]}) + >>> df = pd.DataFrame({'Animal' : ['Falcon', 'Falcon', + ... 'Parrot', 'Parrot'], + ... 'Max Speed' : [380., 370., 24., 26.]}) >>> df - Student Grade - 0 Bob 100 - 1 Bob 92 - 2 Mary 82 - 3 Mary 85 - >>> df.groupby(['Student']).mean() - Grade - Student - Bob 96.0 - Mary 83.5 + Animal Max Speed + 0 Falcon 380.0 + 1 Falcon 370.0 + 2 Parrot 24.0 + 3 Parrot 26.0 + >>> df.groupby(['Animal']).mean() + Max Speed + Animal + Falcon 375.0 + Parrot 25.0 **Hierarchical Indexes** We can groupby different levels of a hierarchical index using the `level` parameter: - >>> arrays = [['TX', 'TX', 'NY', 'NY'], - ... ['Urban', 'Rural', 'Urban', 'Rural']] - >>> index = pd.MultiIndex.from_arrays(arrays, names=('State', 'Type')) - >>> df = pd.DataFrame({'Pop %' : [84.7, 15.3, 87.9, 12.1]}, + >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], + ... ['Captive', 'Wild', 'Captive', 'Wild']] + >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) + >>> df = pd.DataFrame({'Max Speed' : [390., 350., 30., 20.]}, ... 
index=index) >>> df - Pop % - State Type - TX Urban 84.7 - Rural 15.3 - NY Urban 87.9 - Rural 12.1 - >>> df.groupby(level=0).sum() - Pop % - State - NY 100.0 - TX 100.0 + Max Speed + Animal Type + Falcon Captive 390.0 + Wild 350.0 + Parrot Captive 30.0 + Wild 20.0 + >>> df.groupby(level=0).mean() + Max Speed + Animal + Falcon 370.0 + Parrot 25.0 >>> df.groupby(level=1).mean() - Pop % + Max Speed Type - Rural 13.7 - Urban 86.3 + Captive 210.0 + Wild 185.0 """ from pandas.core.groupby.groupby import groupby From 1a7237d25bf7d14fbbe879fc27608a6eeaad3cd1 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Tue, 2 Oct 2018 17:00:41 -0700 Subject: [PATCH 9/9] Update generic.py --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5edc3142f1726..5b4ce5a382324 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7063,7 +7063,7 @@ def clip_lower(self, threshold, axis=None, inplace=False): def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, observed=False, **kwargs): """ - Group dataframe or series using a mapper or by a series of columns. + Group DataFrame or Series using a mapper or by a Series of columns. A groupby operation involves some combination of splitting the object, applying a function, and combining the results. This can be