From 7f15d7a2f29006a855ed6ee43f7b4911f5e5fe99 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Moreira dos Santos Date: Sat, 10 Mar 2018 19:11:19 -0300 Subject: [PATCH 1/4] DOC: Improve the docstring of DataFrame.nlargest Co-authored-by: Igor C. A. de Lima Signed-off-by: Carlos Eduardo Moreira dos Santos Signed-off-by: Igor C. A. de Lima --- pandas/core/frame.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a66d00fff9714..b5abced106831 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3835,23 +3835,33 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False, inplace=inplace, sort_remaining=sort_remaining) def nlargest(self, n, columns, keep='first'): - """Get the rows of a DataFrame sorted by the `n` largest - values of `columns`. + """ + Return the `n` largest rows sorted by `columns`. + + Sort the DataFrame by `columns` in descending order and return the top + `n` rows. Parameters ---------- n : int - Number of items to retrieve + Number of items to retrieve. columns : list or str - Column name or names to order by + Column name or names to retrieve values from. keep : {'first', 'last'}, default 'first' Where there are duplicate values: - - ``first`` : take the first occurrence. - - ``last`` : take the last occurrence. + - `first` : take the first occurrence; + - `last` : take the last occurrence. Returns ------- DataFrame + The `n` largest rows in the DataFrame, sorted by the given columns + in descending order. + + See Also + -------- + DataFrame.nsmallest : Return the `n` smallest rows sorted by given + columns. Examples -------- @@ -3859,10 +3869,10 @@ def nlargest(self, n, columns, keep='first'): ... 'b': list('abdce'), ... 
'c': [1.0, 2.0, np.nan, 3.0, 4.0]}) >>> df.nlargest(3, 'a') - a b c - 3 11 c 3 - 1 10 b 2 - 2 8 d NaN + a b c + 3 11 c 3.0 + 1 10 b 2.0 + 2 8 d NaN """ return algorithms.SelectNFrame(self, n=n, From 873efa237ec57e6ece3a4bcf587b3815a30db706 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Moreira dos Santos Date: Sun, 11 Mar 2018 20:49:48 -0300 Subject: [PATCH 2/4] DOC: DataFrame.nlargest - apply review suggestions --- pandas/core/frame.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b5abced106831..86f60b39b6ecd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3836,10 +3836,11 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False, def nlargest(self, n, columns, keep='first'): """ - Return the `n` largest rows sorted by `columns`. + Return the `n` largest rows ordered by `columns`. - Sort the DataFrame by `columns` in descending order and return the top - `n` rows. + Return the `n` largest rows of `columns` in descending order. The + remaining columns, although not used for ordering, are returned as + well. Parameters ---------- @@ -3855,22 +3856,36 @@ def nlargest(self, n, columns, keep='first'): Returns ------- DataFrame - The `n` largest rows in the DataFrame, sorted by the given columns + The `n` largest rows in the DataFrame, ordered by the given columns in descending order. See Also -------- - DataFrame.nsmallest : Return the `n` smallest rows sorted by given + DataFrame.nsmallest : Return the `n` smallest rows ordered by the given columns. Examples -------- - >>> df = pd.DataFrame({'a': [1, 10, 8, 11, -1], + >>> df = pd.DataFrame({'a': [1, 10, 8, 10, -1], ... 'b': list('abdce'), ... 
'c': [1.0, 2.0, np.nan, 3.0, 4.0]}) + >>> df + a b c + 0 1 a 1.0 + 1 10 b 2.0 + 2 8 d NaN + 3 10 c 3.0 + 4 -1 e 4.0 + >>> df.nlargest(3, 'a') a b c - 3 11 c 3.0 + 1 10 b 2.0 + 3 10 c 3.0 + 2 8 d NaN + + >>> df.nlargest(3, 'a', keep='last') + a b c + 3 10 c 3.0 1 10 b 2.0 2 8 d NaN """ From de08075a5ce8a546a8ac69c2c1dbe1a46321af45 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Moreira dos Santos Date: Mon, 12 Mar 2018 20:58:35 -0300 Subject: [PATCH 3/4] DOC: DataFrame.nlargest - apply review suggestions --- pandas/core/frame.py | 54 +++++++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 86f60b39b6ecd..ed352a14ad05b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3836,33 +3836,40 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False, def nlargest(self, n, columns, keep='first'): """ - Return the `n` largest rows ordered by `columns`. + Return the `n` first rows ordered by `columns` in descending order. - Return the `n` largest rows of `columns` in descending order. The - remaining columns, although not used for ordering, are returned as - well. + Return the `n` first rows with the largest values in `columns`, in + descending order. The columns that are not specified are returned as + well, but not used for ordering. Parameters ---------- n : int - Number of items to retrieve. - columns : list or str - Column name or names to retrieve values from. + Number of rows to return. + columns : iterable or single value + Column label(s) to order by. keep : {'first', 'last'}, default 'first' Where there are duplicate values: - - `first` : take the first occurrence; - - `last` : take the last occurrence. + + - `first` : prioritize the first occurrence(s) + - `last` : prioritize the last occurrence(s) Returns ------- DataFrame - The `n` largest rows in the DataFrame, ordered by the given columns - in descending order. 
+ The `n` first rows ordered by the given columns in descending + order. See Also -------- - DataFrame.nsmallest : Return the `n` smallest rows ordered by the given - columns. + DataFrame.nsmallest : Return the `n` first rows ordered by `columns` in + ascending order. + + Notes + ----- + This function cannot be used with all column types. For example, when + specifying columns with `object` or `category` dtypes, ``TypeError`` is + raised. Examples -------- @@ -3877,17 +3884,38 @@ def nlargest(self, n, columns, keep='first'): 3 10 c 3.0 4 -1 e 4.0 + In the following example, we will use ``nlargest`` to select the three + rows having the largest values in column "a". + >>> df.nlargest(3, 'a') a b c 1 10 b 2.0 3 10 c 3.0 2 8 d NaN + When using ``keep='last'``, ties are resolved in reverse order: + >>> df.nlargest(3, 'a', keep='last') a b c 3 10 c 3.0 1 10 b 2.0 2 8 d NaN + + To order by the largest values in column "a" and then "c", we can + specify multiple columns like in the next example. 
+ + >>> df.nlargest(3, ['a', 'c']) + a b c + 3 10 c 3.0 + 1 10 b 2.0 + 2 8 d NaN + + The dtype of column "b" is `object` and attempting to get its largest + values raises a ``TypeError`` exception: + + >>> df.nlargest(3, 'b') + Traceback (most recent call last): + TypeError: Column 'b' has dtype object, cannot use method 'nlargest' with this dtype """ return algorithms.SelectNFrame(self, n=n, From 7dd4f02466cf9b35e76af2916dd599048e6d5281 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 17 Mar 2018 11:28:15 +0100 Subject: [PATCH 4/4] updates --- pandas/core/frame.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ed352a14ad05b..b2ef75ab7fdce 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3836,17 +3836,21 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False, def nlargest(self, n, columns, keep='first'): """ - Return the `n` first rows ordered by `columns` in descending order. + Return the first `n` rows ordered by `columns` in descending order. - Return the `n` first rows with the largest values in `columns`, in + Return the first `n` rows with the largest values in `columns`, in descending order. The columns that are not specified are returned as well, but not used for ordering. + This method is equivalent to + ``df.sort_values(columns, ascending=False).head(n)``, but more + performant. + Parameters ---------- n : int Number of rows to return. - columns : iterable or single value + columns : label or list of labels Column label(s) to order by. keep : {'first', 'last'}, default 'first' Where there are duplicate values: @@ -3857,13 +3861,15 @@ def nlargest(self, n, columns, keep='first'): Returns ------- DataFrame - The `n` first rows ordered by the given columns in descending + The first `n` rows ordered by the given columns in descending order. 
See Also -------- - DataFrame.nsmallest : Return the `n` first rows ordered by `columns` in + DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in ascending order. + DataFrame.sort_values : Sort DataFrame by the values. + DataFrame.head : Return the first `n` rows without re-ordering. Notes ----- @@ -3900,7 +3906,7 @@ def nlargest(self, n, columns, keep='first'): 3 10 c 3.0 1 10 b 2.0 2 8 d NaN - + To order by the largest values in column "a" and then "c", we can specify multiple columns like in the next example. @@ -3910,12 +3916,12 @@ def nlargest(self, n, columns, keep='first'): 1 10 b 2.0 2 8 d NaN - The dtype of column "b" is `object` and attempting to get its largest - values raises a ``TypeError`` exception: + Attempting to use ``nlargest`` on non-numeric dtypes will raise a + ``TypeError``: >>> df.nlargest(3, 'b') Traceback (most recent call last): - TypeError: Column 'b' has dtype object, cannot use method 'nlargest' with this dtype + TypeError: Column 'b' has dtype object, cannot use method 'nlargest' """ return algorithms.SelectNFrame(self, n=n,