Skip to content

BUG: nlargest/nsmallest can now consider nan values like sort_values(ascending=True).head(n) #43060

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from
Sep 7, 2021
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
3451902
Update test_to_datetime.py
usersblock Jul 20, 2021
6c3a339
Merge branch 'pandas-dev:master' into master
usersblock Jul 20, 2021
1b52822
Update test_to_datetime.py
usersblock Jul 20, 2021
493b70d
Merge branch 'master' of https://github.com/usersblock/pandas
usersblock Jul 20, 2021
59d4145
Update test_to_datetime.py
usersblock Jul 21, 2021
763d84c
Update test_to_datetime.py
usersblock Jul 22, 2021
1404f7a
Update test_to_datetime.py
usersblock Jul 23, 2021
c649998
Update test_to_datetime.py
usersblock Jul 23, 2021
44645d7
Update test_to_datetime.py
usersblock Jul 24, 2021
e3cb80c
Update test_to_datetime.py
usersblock Jul 24, 2021
970ed06
Update test_to_datetime.py
usersblock Jul 25, 2021
4571bd6
Merge branch 'pandas-dev:master' into master
usersblock Aug 9, 2021
d57f608
Merge branch 'pandas-dev:master' into master
usersblock Aug 13, 2021
e6edb59
Merge branch 'pandas-dev:master' into master
usersblock Aug 14, 2021
71be209
Updated Test and fixed nlargest
usersblock Aug 15, 2021
c5c751c
Update algorithms.py
usersblock Aug 16, 2021
a00fb9e
Update algorithms.py
usersblock Aug 16, 2021
6e5c7b0
Update algorithms.py
usersblock Aug 16, 2021
a1a9d06
Update test_nlargest.py
usersblock Aug 17, 2021
491d077
Update test_nlargest.py
usersblock Aug 17, 2021
8708cc0
Update test_nlargest.py
usersblock Aug 17, 2021
a5cb642
Update test_nlargest.py
usersblock Aug 25, 2021
7d2aac4
Updated test_nlargest and docs
usersblock Aug 27, 2021
2d0931f
Update v1.4.0.rst
usersblock Aug 27, 2021
ca3af24
Update test_apply.py
usersblock Sep 3, 2021
80d4fcb
Update test_apply.py
usersblock Sep 3, 2021
5717466
Update algorithms.py
usersblock Sep 7, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ Indexing
- Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.nan`` (:issue:`35392`)
- Bug in :meth:`DataFrame.query` did not handle the degree sign in a backticked column name, such as \`Temp(°C)\`, used in an expression to query a dataframe (:issue:`42826`)
- Bug in :meth:`DataFrame.drop` where the error message did not show missing labels with commas when raising ``KeyError`` (:issue:`42881`)
-
- Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`)

Missing
^^^^^^^
Expand Down
10 changes: 8 additions & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1247,6 +1247,8 @@ class SelectNSeries(SelectN):

def compute(self, method: str) -> Series:

from pandas.core.reshape.concat import concat

n = self.n
dtype = self.obj.dtype
if not self.is_valid_dtype_n_method(dtype):
Expand All @@ -1256,6 +1258,7 @@ def compute(self, method: str) -> Series:
return self.obj[[]]

dropped = self.obj.dropna()
nan_index = self.obj.drop(dropped.index)

if is_extension_array_dtype(dropped.dtype):
# GH#41816 bc we have dropped NAs above, MaskedArrays can use the
Expand All @@ -1272,7 +1275,7 @@ def compute(self, method: str) -> Series:
# slow method
if n >= len(self.obj):
ascending = method == "nsmallest"
return dropped.sort_values(ascending=ascending).head(n)
return self.obj.sort_values(ascending=ascending).head(n)

# fast method
new_dtype = dropped.dtype
Expand All @@ -1290,6 +1293,8 @@ def compute(self, method: str) -> Series:
if self.keep == "last":
arr = arr[::-1]

nbase = n
findex = len(self.obj)
narr = len(arr)
n = min(n, narr)

Expand All @@ -1301,12 +1306,13 @@ def compute(self, method: str) -> Series:

if self.keep != "all":
inds = inds[:n]
findex = nbase

if self.keep == "last":
# reverse indices
inds = narr - 1 - inds

return dropped.iloc[inds]
return concat([dropped.iloc[inds], nan_index])[:findex]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dont' you need to use .iloc here? as this will not be a positional lookup (it might work with a range index though)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be done now



class SelectNFrame(SelectN):
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/frame/methods/test_nlargest.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,3 +209,10 @@ def test_nlargest_multiindex_column_lookup(self):
result = df.nlargest(3, ("x", "b"))
expected = df.iloc[[3, 2, 1]]
tm.assert_frame_equal(result, expected)

def test_nlargest_nan(self):
# GH#43060
df = pd.DataFrame([np.nan, np.nan, 0, 1, 2, 3])
result = df.nlargest(5, 0)
expected = df.sort_values(0, ascending=False).head(5)
tm.assert_frame_equal(result, expected)
8 changes: 6 additions & 2 deletions pandas/tests/series/methods/test_nlargest.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,12 @@ def test_nsmallest_nlargest(self, s_main_dtypes_split):
def test_nlargest_misc(self):

ser = Series([3.0, np.nan, 1, 2, 5])
tm.assert_series_equal(ser.nlargest(), ser.iloc[[4, 0, 3, 2]])
tm.assert_series_equal(ser.nsmallest(), ser.iloc[[2, 3, 0, 4]])
result = ser.nlargest()
expected = ser.iloc[[4, 0, 3, 2, 1]]
tm.assert_series_equal(result, expected)
result = ser.nsmallest()
expected = ser.iloc[[2, 3, 0, 4, 1]]
tm.assert_series_equal(result, expected)

msg = 'keep must be either "first", "last"'
with pytest.raises(ValueError, match=msg):
Expand Down