From ccd751805d90ad8182b723e4ea3427721ef4b77b Mon Sep 17 00:00:00 2001 From: ed_abati Date: Sun, 21 Oct 2018 15:50:21 +0100 Subject: [PATCH 1/4] DOC: Updated the docstring of Series.rank / DataFrame.rank --- pandas/core/generic.py | 107 ++++++++++++++++++++++++++++++++++------- 1 file changed, 90 insertions(+), 17 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 31b700abcfdb3..d58b3bb378362 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7715,34 +7715,107 @@ def last(self, offset): def rank(self, axis=0, method='average', numeric_only=None, na_option='keep', ascending=True, pct=False): """ - Compute numerical data ranks (1 through n) along axis. Equal values are - assigned a rank that is the average of the ranks of those values + Compute numerical data ranks (1 through n) along axis. + + By default, equal values are assigned a rank that is the average of the + ranks of those values. Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 - index to direct ranking - method : {'average', 'min', 'max', 'first', 'dense'} - * average: average rank of group - * min: lowest rank in group - * max: highest rank in group - * first: ranks assigned in order they appear in the array - * dense: like 'min', but rank always increases by 1 between groups + Index to direct ranking. + method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' + Which method to use to rank equal values: + * average: average rank of group. + * min: lowest rank in group. + * max: highest rank in group. + * first: ranks assigned in order they appear in the array. + * dense: like 'min', but rank always increases by 1 between groups. numeric_only : boolean, default None Include only float, int, boolean data. Valid only for DataFrame or - Panel objects - na_option : {'keep', 'top', 'bottom'} - * keep: leave NA values where they are - * top: smallest rank if ascending - * bottom: smallest rank if descending + Panel objects. + na_option : {'keep', 'top', 'bottom'}, default 'keep' + How to rank NaN values: + * keep: assign NaN rank to NaN values. + * top: assign smallest rank to NaN values if ascending. + * bottom: assign highest rank to NaN values if ascending. ascending : boolean, default True - False for ranks by high (1) to low (N) + False for ranks by high (1) to low (N). pct : boolean, default False - Computes percentage rank of data + Computes percentage rank of data. Returns ------- - ranks : same type as caller + ranks : Series or DataFrame + + Examples + -------- + + The default behaviour returns average ranks of every columns + + >>> df = pd.DataFrame(data={'Customer':['A','B','C','D','E'], + ... 'Tot_Spend':[12,20,20,18,16]}) + >>> df.rank() + Customer Tot_Spend + 0 1.0 1.0 + 1 2.0 4.5 + 2 3.0 4.5 + 3 4.0 3.0 + 4 5.0 2.0 + + The argument numeric_only will only return rank for float, int and + boolean data + + >>> df.rank(numeric_only=True) + Tot_Spend + 0 1.0 + 1 4.5 + 2 4.5 + 3 3.0 + 4 2.0 + + The following examples show how rank behaves with every different + method and setting pct = True + + >>> df['default_rank'] = df['Tot_Spend'].rank() + >>> df['min_rank'] = df['Tot_Spend'].rank(method='min') + >>> df['max_rank'] = df['Tot_Spend'].rank(method='max') + >>> df['dense_rank'] = df['Tot_Spend'].rank(method='dense') + >>> df[['Tot_Spend','default_rank','min_rank','max_rank','dense_rank']] + Tot_Spend default_rank min_rank max_rank dense_rank + 0 12 1.0 1.0 1.0 1.0 + 1 20 4.5 4.0 5.0 4.0 + 2 20 4.5 4.0 5.0 4.0 + 3 18 3.0 3.0 3.0 3.0 + 4 16 2.0 2.0 2.0 2.0 + >>> df['default_rank'] = df['Tot_Spend'].rank() + >>> df['pct_rank'] = df['Tot_Spend'].rank(pct=True) + >>> df[['Tot_Spend','default_rank','pct_rank']] + Tot_Spend default_rank pct_rank + 0 12 1.0 0.2 + 1 20 4.5 0.9 + 2 20 4.5 0.9 + 3 18 3.0 0.6 + 4 16 2.0 0.4 + + The following example shows how rank behaves with NAs + + >>> df = pd.DataFrame(data={'Student':['A','B','C','D','E'], + ... 'Score':[78, np.nan, 68, 90, 68]}) + >>> df['NA_keep'] = df['Score'].rank() + >>> df['NA_min'] = df['Score'].rank(na_option='bottom') + >>> df['NA_max'] = df['Score'].rank(na_option='top') + >>> df + Student Score NA_keep NA_min NA_max + 0 A 78.0 3.0 3.0 4.0 + 1 B NaN NaN 5.0 1.0 + 2 C 68.0 1.5 1.5 2.5 + 3 D 90.0 4.0 4.0 5.0 + 4 E 68.0 1.5 1.5 2.5 + + See also + -------- + GroupBy.rank : Rank of values within each group. """ axis = self._get_axis_number(axis) From 363ccc0ba9edb667f403f940412cc26d29e4d881 Mon Sep 17 00:00:00 2001 From: ed_abati Date: Sun, 21 Oct 2018 17:59:11 +0100 Subject: [PATCH 2/4] DOC: updated the rank docstring with proposed changes --- pandas/core/generic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d58b3bb378362..c06ce35cbdc0a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7731,7 +7731,7 @@ def rank(self, axis=0, method='average', numeric_only=None, * max: highest rank in group. * first: ranks assigned in order they appear in the array. * dense: like 'min', but rank always increases by 1 between groups. - numeric_only : boolean, default None + numeric_only : bool, default None Include only float, int, boolean data. Valid only for DataFrame or Panel objects. na_option : {'keep', 'top', 'bottom'}, default 'keep' @@ -7739,9 +7739,9 @@ def rank(self, axis=0, method='average', numeric_only=None, * keep: assign NaN rank to NaN values. * top: assign smallest rank to NaN values if ascending. * bottom: assign highest rank to NaN values if ascending. - ascending : boolean, default True + ascending : bool, default True False for ranks by high (1) to low (N). - pct : boolean, default False + pct : bool, default False Computes percentage rank of data. Returns @@ -7751,7 +7751,7 @@ def rank(self, axis=0, method='average', numeric_only=None, Examples -------- - The default behaviour returns average ranks of every columns + The default behavior returns average ranks of every columns >>> df = pd.DataFrame(data={'Customer':['A','B','C','D','E'], ... 'Tot_Spend':[12,20,20,18,16]}) From 878c82999d58610ce0ff9d17a5d194de0917b76f Mon Sep 17 00:00:00 2001 From: ed_abati Date: Sat, 27 Oct 2018 19:29:23 +0100 Subject: [PATCH 3/4] fixed punctuation, pep8 in examples, see also, removed numeric_only example --- pandas/core/generic.py | 65 ++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 37 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c06ce35cbdc0a..68fbdbeb17af7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7725,20 +7725,20 @@ def rank(self, axis=0, method='average', numeric_only=None, axis : {0 or 'index', 1 or 'columns'}, default 0 Index to direct ranking. method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' - Which method to use to rank equal values: - * average: average rank of group. - * min: lowest rank in group. - * max: highest rank in group. - * first: ranks assigned in order they appear in the array. - * dense: like 'min', but rank always increases by 1 between groups. - numeric_only : bool, default None - Include only float, int, boolean data. Valid only for DataFrame or - Panel objects. + * average: average rank of group + * min: lowest rank in group + * max: highest rank in group + * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups + numeric_only : bool, optional + If True, rank only float, int and boolean data. + Valid only for DataFrame or Panel objects. na_option : {'keep', 'top', 'bottom'}, default 'keep' How to rank NaN values: - * keep: assign NaN rank to NaN values. - * top: assign smallest rank to NaN values if ascending. - * bottom: assign highest rank to NaN values if ascending. + + * keep: assign NaN rank to NaN values + * top: assign smallest rank to NaN values if ascending + * bottom: assign highest rank to NaN values if ascending ascending : bool, default True False for ranks by high (1) to low (N). pct : bool, default False @@ -7746,15 +7746,19 @@ def rank(self, axis=0, method='average', numeric_only=None, Returns ------- - ranks : Series or DataFrame + ranks : same type as caller + Return a Series or DataFrame with data ranks as values - Examples + See Also -------- + pandas.core.groupby.GroupBy.rank : Rank of values within each group. - The default behavior returns average ranks of every columns + Examples + -------- + The default behavior returns average ranks of every column - >>> df = pd.DataFrame(data={'Customer':['A','B','C','D','E'], - ... 'Tot_Spend':[12,20,20,18,16]}) + >>> df = pd.DataFrame(data={'Customer': [1, 2, 3, 4, 5], + ... 'Tot_Spend': [12, 20, 20, 18, 16]}) >>> df.rank() Customer Tot_Spend 0 1.0 1.0 @@ -7763,34 +7767,25 @@ def rank(self, axis=0, method='average', numeric_only=None, 3 4.0 3.0 4 5.0 2.0 - The argument numeric_only will only return rank for float, int and - boolean data - - >>> df.rank(numeric_only=True) - Tot_Spend - 0 1.0 - 1 4.5 - 2 4.5 - 3 3.0 - 4 2.0 - The following examples show how rank behaves with every different - method and setting pct = True + method and when setting pct = True >>> df['default_rank'] = df['Tot_Spend'].rank() >>> df['min_rank'] = df['Tot_Spend'].rank(method='min') >>> df['max_rank'] = df['Tot_Spend'].rank(method='max') >>> df['dense_rank'] = df['Tot_Spend'].rank(method='dense') - >>> df[['Tot_Spend','default_rank','min_rank','max_rank','dense_rank']] + >>> df[['Tot_Spend', 'default_rank', 'min_rank', 'max_rank', + ... 'dense_rank']] Tot_Spend default_rank min_rank max_rank dense_rank 0 12 1.0 1.0 1.0 1.0 1 20 4.5 4.0 5.0 4.0 2 20 4.5 4.0 5.0 4.0 3 18 3.0 3.0 3.0 3.0 4 16 2.0 2.0 2.0 2.0 + >>> df['default_rank'] = df['Tot_Spend'].rank() >>> df['pct_rank'] = df['Tot_Spend'].rank(pct=True) - >>> df[['Tot_Spend','default_rank','pct_rank']] + >>> df[['Tot_Spend', 'default_rank', 'pct_rank']] Tot_Spend default_rank pct_rank 0 12 1.0 0.2 1 20 4.5 0.9 @@ -7800,8 +7795,8 @@ def rank(self, axis=0, method='average', numeric_only=None, The following example shows how rank behaves with NAs - >>> df = pd.DataFrame(data={'Student':['A','B','C','D','E'], - ... 'Score':[78, np.nan, 68, 90, 68]}) + >>> df = pd.DataFrame(data={'Student': ['A', 'B', 'C', 'D', 'E'], + ... 'Score': [78, np.nan, 68, 90, 68]}) >>> df['NA_keep'] = df['Score'].rank() >>> df['NA_min'] = df['Score'].rank(na_option='bottom') >>> df['NA_max'] = df['Score'].rank(na_option='top') @@ -7812,10 +7807,6 @@ def rank(self, axis=0, method='average', numeric_only=None, 2 C 68.0 1.5 1.5 2.5 3 D 90.0 4.0 4.0 5.0 4 E 68.0 1.5 1.5 2.5 - - See also - -------- - GroupBy.rank : Rank of values within each group. """ axis = self._get_axis_number(axis) From 57f566ba29d545a009f0408040ef5befc02affad Mon Sep 17 00:00:00 2001 From: ed_abati Date: Wed, 14 Nov 2018 23:55:02 +0000 Subject: [PATCH 4/4] Adjusted numeric_only description --- pandas/core/generic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0187026702268..7efd5d4c1e03a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7938,8 +7938,7 @@ def rank(self, axis=0, method='average', numeric_only=None, * first: ranks assigned in order they appear in the array * dense: like 'min', but rank always increases by 1 between groups numeric_only : bool, optional - If True, rank only float, int and boolean data. - Valid only for DataFrame or Panel objects. + For DataFrame objects, rank only numeric columns if set to True. na_option : {'keep', 'top', 'bottom'}, default 'keep' How to rank NaN values: