From e8395b946338c5a31a60418d5b8edadcbf25daf8 Mon Sep 17 00:00:00 2001 From: tdpetrou Date: Tue, 5 Dec 2017 22:16:03 -0500 Subject: [PATCH 1/4] added option keep=False to nlargests/nsmallest --- pandas/core/algorithms.py | 10 +++++++--- pandas/tests/frame/test_analytics.py | 15 +++++++++++++++ pandas/tests/series/test_analytics.py | 11 +++++++++++ 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0ceb8966fd3c8..80e47b8b05ac4 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -910,8 +910,8 @@ def __init__(self, obj, n, keep): self.n = n self.keep = keep - if self.keep not in ('first', 'last'): - raise ValueError('keep must be either "first", "last"') + if self.keep not in ('first', 'last', False): + raise ValueError('keep must be either "first", "last", or False') def nlargest(self): return self.compute('nlargest') @@ -979,7 +979,11 @@ def compute(self, method): kth_val = algos.kth_smallest(arr.copy(), n - 1) ns, = np.nonzero(arr <= kth_val) - inds = ns[arr[ns].argsort(kind='mergesort')][:n] + inds = ns[arr[ns].argsort(kind='mergesort')] + + if self.keep is not False: + inds = inds[:n] + if self.keep == 'last': # reverse indices inds = narr - 1 - inds diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 4bba6d7601ae8..6d1409d4fc1d9 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -2202,6 +2202,21 @@ def test_n_duplicate_index(self, df_duplicates, n, order): expected = df.sort_values(order, ascending=False).head(n) tm.assert_frame_equal(result, expected) + def test_keep_false(self): + df = pd.DataFrame({'a': [5, 4, 4, 2, 3, 3, 3, 3], + 'b': [10, 9, 8, 7, 5, 50, 10, 20]}) + result = df.nlargest(4, 'a', keep=False) + expected = pd.DataFrame({'a': {0: 5, 1: 4, 2: 4, 4: 3, + 5: 3, 6: 3, 7: 3}, + 'b': {0: 10, 1: 9, 2: 8, 4: 5, + 5: 50, 6: 10, 7: 20}}) + tm.assert_frame_equal(result, 
expected) + + result = df.nsmallest(2, 'a', keep=False) + expected = pd.DataFrame({'a': {3: 2, 4: 3, 5: 3, 6: 3, 7: 3}, + 'b': {3: 7, 4: 5, 5: 50, 6: 10, 7: 20}}) + tm.assert_frame_equal(result, expected) + def test_series_broadcasting(self): # smoke test for numpy warnings # GH 16378, GH 16306 diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 289b5c01c1263..d9d3f777e84ac 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1867,6 +1867,17 @@ def test_n(self, n): expected = s.sort_values().head(n) assert_series_equal(result, expected) + def test_keep_false(self): + s = Series([10, 9, 8, 7, 7, 7, 7, 6]) + result = s.nlargest(4, keep=False) + expected = Series([10, 9, 8, 7, 7, 7, 7]) + print(result, expected) + assert_series_equal(result, expected) + + result = s.nsmallest(2, keep=False) + expected = Series([6, 7, 7, 7, 7], index=[7, 3, 4, 5, 6]) + assert_series_equal(result, expected) + class TestCategoricalSeriesAnalytics(object): From 5b1e7b046f733374a6a25a0c70e7cb55a50c71e4 Mon Sep 17 00:00:00 2001 From: tdpetrou Date: Wed, 13 Dec 2017 12:28:00 -0500 Subject: [PATCH 2/4] add "all" argument for nlargest/nsmallest --- pandas/core/algorithms.py | 6 +-- pandas/core/frame.py | 58 +++++++++++++++++++++------ pandas/tests/frame/test_analytics.py | 7 ++-- pandas/tests/series/test_analytics.py | 7 ++-- 4 files changed, 57 insertions(+), 21 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 80e47b8b05ac4..099fec74d266c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -910,8 +910,8 @@ def __init__(self, obj, n, keep): self.n = n self.keep = keep - if self.keep not in ('first', 'last', False): - raise ValueError('keep must be either "first", "last", or False') + if self.keep not in ('first', 'last', 'all'): + raise ValueError('keep must be either "first", "last", or "all"') def nlargest(self): return self.compute('nlargest') 
@@ -981,7 +981,7 @@ def compute(self, method): ns, = np.nonzero(arr <= kth_val) inds = ns[arr[ns].argsort(kind='mergesort')] - if self.keep is not False: + if self.keep != 'all': inds = inds[:n] if self.keep == 'last': diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5f323d0f040bc..98b6801c5f394 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3773,6 +3773,9 @@ def nlargest(self, n, columns, keep='first'): Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. + - ``all`` : keep all ties of nth largest value. + + .. versionadded:: 0.22.0 Returns ------- @@ -3780,14 +3783,28 @@ def nlargest(self, n, columns, keep='first'): Examples -------- - >>> df = DataFrame({'a': [1, 10, 8, 11, -1], - ... 'b': list('abdce'), - ... 'c': [1.0, 2.0, np.nan, 3.0, 4.0]}) - >>> df.nlargest(3, 'a') + >>> df = pd.DataFrame({'a': [1, 10, 8, 11, 8, 2], + ... 'b': list('abdcef'), + ... 'c': [1.0, 2.0, np.nan, 3.0, 4.0, 9.0]}) + + >>> df.nlargest(3, 'a', keep='first') + a b c + 3 11 c 3 + 1 10 b 2 + 2 8 d NaN + + >>> df.nlargest(3, 'a', keep='last') + a b c + 3 11 c 3 + 1 10 b 2 + 4 8 e 4 + + >>> df.nlargest(3, 'a', keep='all') a b c 3 11 c 3 1 10 b 2 2 8 d NaN + 4 8 e 4 """ return algorithms.SelectNFrame(self, n=n, @@ -3808,6 +3825,9 @@ def nsmallest(self, n, columns, keep='first'): Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. + - ``all`` : keep all ties of nth largest value. + + .. versionadded:: 0.22.0 Returns ------- @@ -3815,14 +3835,28 @@ def nsmallest(self, n, columns, keep='first'): Examples -------- - >>> df = DataFrame({'a': [1, 10, 8, 11, -1], - ... 'b': list('abdce'), - ... 'c': [1.0, 2.0, np.nan, 3.0, 4.0]}) - >>> df.nsmallest(3, 'a') - a b c - 4 -1 e 4 - 0 1 a 1 - 2 8 d NaN + >>> df = pd.DataFrame({'a': [1, 10, 8, 11, 8, 2], + ... 'b': list('abdcef'), + ... 
'c': [1.0, 2.0, np.nan, 3.0, 4.0, 9.0]}) + + >>> df.nsmallest(3, 'a', keep='first') + a b c + 0 1 a 1.0 + 5 2 f 9.0 + 2 8 d NaN + + >>> df.nsmallest(3, 'a', keep='last') + a b c + 0 1 a 1.0 + 5 2 f 9.0 + 4 8 e 4.0 + + >>> df.nsmallest(3, 'a', keep='all') + a b c + 0 1 a 1.0 + 5 2 f 9.0 + 2 8 d NaN + 4 8 e 4.0 """ return algorithms.SelectNFrame(self, n=n, diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 6d1409d4fc1d9..c038d76879ce1 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -2202,17 +2202,18 @@ def test_n_duplicate_index(self, df_duplicates, n, order): expected = df.sort_values(order, ascending=False).head(n) tm.assert_frame_equal(result, expected) - def test_keep_false(self): + def test_keep_all_ties(self): + # GH 16818 df = pd.DataFrame({'a': [5, 4, 4, 2, 3, 3, 3, 3], 'b': [10, 9, 8, 7, 5, 50, 10, 20]}) - result = df.nlargest(4, 'a', keep=False) + result = df.nlargest(4, 'a', keep='all') expected = pd.DataFrame({'a': {0: 5, 1: 4, 2: 4, 4: 3, 5: 3, 6: 3, 7: 3}, 'b': {0: 10, 1: 9, 2: 8, 4: 5, 5: 50, 6: 10, 7: 20}}) tm.assert_frame_equal(result, expected) - result = df.nsmallest(2, 'a', keep=False) + result = df.nsmallest(2, 'a', keep='all') expected = pd.DataFrame({'a': {3: 2, 4: 3, 5: 3, 6: 3, 7: 3}, 'b': {3: 7, 4: 5, 5: 50, 6: 10, 7: 20}}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index d9d3f777e84ac..2e4a84f8bcd6b 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1867,14 +1867,15 @@ def test_n(self, n): expected = s.sort_values().head(n) assert_series_equal(result, expected) - def test_keep_false(self): + def test_keep_all_ties(self): + # GH 16818 s = Series([10, 9, 8, 7, 7, 7, 7, 6]) - result = s.nlargest(4, keep=False) + result = s.nlargest(4, keep='all') expected = Series([10, 9, 8, 7, 7, 7, 7]) print(result, expected) 
assert_series_equal(result, expected) - result = s.nsmallest(2, keep=False) + result = s.nsmallest(2, keep='all') expected = Series([6, 7, 7, 7, 7], index=[7, 3, 4, 5, 6]) assert_series_equal(result, expected) From 5cd3a8dc97cb5e66a90e0e560ddf881b598af5d0 Mon Sep 17 00:00:00 2001 From: tdpetrou Date: Wed, 13 Dec 2017 12:56:10 -0500 Subject: [PATCH 3/4] added whatsnew and cleaned up docstrings --- doc/source/whatsnew/v0.22.0.txt | 2 ++ pandas/core/frame.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index ab7f18bce47d3..53d8aa5946845 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -138,6 +138,8 @@ Other Enhancements - :func:`Series` / :func:`DataFrame` tab completion also returns identifiers in the first level of a :func:`MultiIndex`. (:issue:`16326`) - :func:`read_excel()` has gained the ``nrows`` parameter (:issue:`16645`) - :func:``DataFrame.to_json`` and ``Series.to_json`` now accept an ``index`` argument which allows the user to exclude the index from the JSON output (:issue:`17394`) +- :func:`Series` / :func:`DataFrame` methods :func:`nlargest` / :func:`nsmallest` now accept the value 'all' for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`). + .. _whatsnew_0220.api_breaking: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 98b6801c5f394..14c81975ab159 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3769,11 +3769,11 @@ def nlargest(self, n, columns, keep='first'): Number of items to retrieve columns : list or str Column name or names to order by - keep : {'first', 'last'}, default 'first' + keep : {'first', 'last', 'all'}, default 'first' Where there are duplicate values: - - ``first`` : take the first occurrence. - - ``last`` : take the last occurrence. - - ``all`` : keep all ties of nth largest value.
+ - 'first' : take the first occurrence. + - 'last' : take the last occurrence. + - 'all' : keep all ties of nth largest value. .. versionadded:: 0.22.0 @@ -3821,11 +3821,11 @@ def nsmallest(self, n, columns, keep='first'): Number of items to retrieve columns : list or str Column name or names to order by - keep : {'first', 'last'}, default 'first' + keep : {'first', 'last', 'all'}, default 'first' Where there are duplicate values: - - ``first`` : take the first occurrence. - - ``last`` : take the last occurrence. - - ``all`` : keep all ties of nth largest value. + - 'first' : take the first occurrence. + - 'last' : take the last occurrence. + - 'all' : keep all ties of nth largest value. .. versionadded:: 0.22.0 From 56954b4ec614d650ed6d239ba219746bd56c4098 Mon Sep 17 00:00:00 2001 From: tdpetrou Date: Wed, 13 Dec 2017 13:00:50 -0500 Subject: [PATCH 4/4] cleaned up docstrings --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 14c81975ab159..d1441da0d810f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3825,7 +3825,7 @@ def nsmallest(self, n, columns, keep='first'): Where there are duplicate values: - 'first' : take the first occurrence. - 'last' : take the last occurrence. - - 'all' : keep all ties of nth largest value. + - 'all' : keep all ties of nth smallest value. .. versionadded:: 0.22.0