diff --git a/ci/code_checks.sh b/ci/code_checks.sh index b829cbefe8f7a..04813cc31f603 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -123,7 +123,7 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then MSG='Doctests frame.py' ; echo $MSG pytest -q --doctest-modules pandas/core/frame.py \ - -k"-axes -combine -itertuples -join -nlargest -nsmallest -nunique -pivot_table -quantile -query -reindex -reindex_axis -replace -round -set_index -stack -to_stata" + -k"-axes -combine -itertuples -join -nunique -pivot_table -quantile -query -reindex -reindex_axis -replace -round -set_index -stack -to_stata" RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests series.py' ; echo $MSG diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a58d34574d28d..9c85139fffcc4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4694,60 +4694,63 @@ def nlargest(self, n, columns, keep='first'): Examples -------- - >>> df = pd.DataFrame({'a': [1, 10, 8, 11, 8, 2], - ... 'b': list('abdcef'), - ... 'c': [1.0, 2.0, np.nan, 3.0, 4.0, 9.0]}) + >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, + ... 434000, 434000, 337000, 11300, + ... 11300, 11300], + ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, + ... 17036, 182, 38, 311], + ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", + ... "IS", "NR", "TV", "AI"]}, + ... index=["Italy", "France", "Malta", + ... "Maldives", "Brunei", "Iceland", + ... "Nauru", "Tuvalu", "Anguilla"]) >>> df - a b c - 0 1 a 1.0 - 1 10 b 2.0 - 2 8 d NaN - 3 11 c 3.0 - 4 8 e 4.0 - 5 2 f 9.0 + population GDP alpha-2 + Italy 59000000 1937894 IT + France 65000000 2583560 FR + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + Iceland 337000 17036 IS + Nauru 11300 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI In the following example, we will use ``nlargest`` to select the three - rows having the largest values in column "a". + rows having the largest values in column "population". 
- >>> df.nlargest(3, 'a') - a b c - 3 11 c 3.0 - 1 10 b 2.0 - 2 8 d NaN + >>> df.nlargest(3, 'population') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT When using ``keep='last'``, ties are resolved in reverse order: - >>> df.nlargest(3, 'a', keep='last') - a b c - 3 11 c 3.0 - 1 10 b 2.0 - 4 8 e 4.0 + >>> df.nlargest(3, 'population', keep='last') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Brunei 434000 12128 BN When using ``keep='all'``, all duplicate items are maintained: - >>> df.nlargest(3, 'a', keep='all') - a b c - 3 11 c 3.0 - 1 10 b 2.0 - 2 8 d NaN - 4 8 e 4.0 + >>> df.nlargest(3, 'population', keep='all') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN - To order by the largest values in column "a" and then "c", we can - specify multiple columns like in the next example. - - >>> df.nlargest(3, ['a', 'c']) - a b c - 4 8 e 4.0 - 3 11 c 3.0 - 1 10 b 2.0 - - Attempting to use ``nlargest`` on non-numeric dtypes will raise a - ``TypeError``: - - >>> df.nlargest(3, 'b') + To order by the largest values in column "population" and then "GDP", + we can specify multiple columns like in the next example. - Traceback (most recent call last): - TypeError: Column 'b' has dtype object, cannot use method 'nlargest' + >>> df.nlargest(3, ['population', 'GDP']) + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Brunei 434000 12128 BN """ return algorithms.SelectNFrame(self, n=n, @@ -4755,15 +4758,23 @@ def nlargest(self, n, columns, keep='first'): columns=columns).nlargest() def nsmallest(self, n, columns, keep='first'): - """Get the rows of a DataFrame sorted by the `n` smallest - values of `columns`. + """ + Return the first `n` rows ordered by `columns` in ascending order. 
+ + Return the first `n` rows with the smallest values in `columns`, in + ascending order. The columns that are not specified are returned as + well, but not used for ordering. + + This method is equivalent to + ``df.sort_values(columns, ascending=True).head(n)``, but more + performant. Parameters ---------- n : int - Number of items to retrieve + Number of items to retrieve. columns : list or str - Column name or names to order by + Column name or names to order by. keep : {'first', 'last', 'all'}, default 'first' Where there are duplicate values: @@ -4778,62 +4789,70 @@ def nsmallest(self, n, columns, keep='first'): ------- DataFrame + See Also + -------- + DataFrame.nlargest : Return the first `n` rows ordered by `columns` in + descending order. + DataFrame.sort_values : Sort DataFrame by the values. + DataFrame.head : Return the first `n` rows without re-ordering. + Examples -------- - >>> df = pd.DataFrame({'a': [1, 10, 8, 11, 8, 2], - ... 'b': list('abdcef'), - ... 'c': [1.0, 2.0, np.nan, 3.0, 4.0, 9.0]}) + >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, + ... 434000, 434000, 337000, 11300, + ... 11300, 11300], + ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, + ... 17036, 182, 38, 311], + ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", + ... "IS", "NR", "TV", "AI"]}, + ... index=["Italy", "France", "Malta", + ... "Maldives", "Brunei", "Iceland", + ... "Nauru", "Tuvalu", "Anguilla"]) >>> df - a b c - 0 1 a 1.0 - 1 10 b 2.0 - 2 8 d NaN - 3 11 c 3.0 - 4 8 e 4.0 - 5 2 f 9.0 + population GDP alpha-2 + Italy 59000000 1937894 IT + France 65000000 2583560 FR + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + Iceland 337000 17036 IS + Nauru 11300 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI In the following example, we will use ``nsmallest`` to select the three - rows having the smallest values in column "a". + rows having the smallest values in column "population". 
- >>> df.nsmallest(3, 'a') - a b c - 0 1 a 1.0 - 5 2 f 9.0 - 2 8 d NaN + >>> df.nsmallest(3, 'population') + population GDP alpha-2 + Nauru 11300 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI When using ``keep='last'``, ties are resolved in reverse order: - >>> df.nsmallest(3, 'a', keep='last') - a b c - 0 1 a 1.0 - 5 2 f 9.0 - 4 8 e 4.0 + >>> df.nsmallest(3, 'population', keep='last') + population GDP alpha-2 + Anguilla 11300 311 AI + Tuvalu 11300 38 TV + Nauru 11300 182 NR When using ``keep='all'``, all duplicate items are maintained: - >>> df.nsmallest(3, 'a', keep='all') - a b c - 0 1 a 1.0 - 5 2 f 9.0 - 2 8 d NaN - 4 8 e 4.0 + >>> df.nsmallest(3, 'population', keep='all') + population GDP alpha-2 + Nauru 11300 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI - To order by the largest values in column "a" and then "c", we can - specify multiple columns like in the next example. + To order by the smallest values in column "population" and then "GDP", + we can specify multiple columns like in the next example. - >>> df.nsmallest(3, ['a', 'c']) - a b c - 0 1 a 1.0 - 5 2 f 9.0 - 4 8 e 4.0 - - Attempting to use ``nsmallest`` on non-numeric dtypes will raise a - ``TypeError``: - - >>> df.nsmallest(3, 'b') - - Traceback (most recent call last): - TypeError: Column 'b' has dtype object, cannot use method 'nsmallest' + >>> df.nsmallest(3, ['population', 'GDP']) + population GDP alpha-2 + Tuvalu 11300 38 TV + Nauru 11300 182 NR + Anguilla 11300 311 AI """ return algorithms.SelectNFrame(self, n=n,