Skip to content

Commit 7f8cd04

Browse files
author
Roger Thomas
committed
Fix nsmallest/nlargest With Identical Values
1 parent 4cb730e commit 7f8cd04

File tree

4 files changed

+263
-147
lines changed

4 files changed

+263
-147
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1055,6 +1055,7 @@ Reshaping
10551055
- Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`)
10561056
- Bug in ``pd.concat()`` in which concatting with an empty dataframe with ``join='inner'`` was being improperly handled (:issue:`15328`)
10571057
- Bug with ``sort=True`` in ``DataFrame.join`` and ``pd.merge`` when joining on indexes (:issue:`15582`)
1058+
- Bug in ``DataFrame.nsmallest`` and ``DataFrame.nlargest`` where identical values resulted in duplicated rows (:issue:`15297`)
10581059

10591060
Numeric
10601061
^^^^^^^

pandas/core/algorithms.py

+55-5
Original file line numberDiff line numberDiff line change
@@ -944,14 +944,64 @@ def select_n_frame(frame, columns, n, method, keep):
944944
-------
945945
nordered : DataFrame
946946
"""
947-
from pandas.core.series import Series
947+
from pandas import Int64Index
948948
if not is_list_like(columns):
949949
columns = [columns]
950950
columns = list(columns)
951-
ser = getattr(frame[columns[0]], method)(n, keep=keep)
952-
if isinstance(ser, Series):
953-
ser = ser.to_frame()
954-
return ser.merge(frame, on=columns[0], left_index=True)[frame.columns]
951+
for column in columns:
952+
dtype = frame[column].dtype
953+
if not issubclass(dtype.type, (np.integer, np.floating, np.datetime64,
954+
np.timedelta64)):
955+
msg = (
956+
"{column!r} has dtype: {dtype}, cannot use method {method!r} "
957+
"with this dtype"
958+
).format(column=column, dtype=dtype, method=method)
959+
raise TypeError(msg)
960+
961+
# Below we save and reset the index in case index contains duplicates
962+
original_index = frame.index
963+
cur_frame = frame = frame.reset_index(drop=True)
964+
cur_n = n
965+
indexer = Int64Index([])
966+
for i, column in enumerate(columns):
967+
968+
# For each column we apply method to cur_frame[column]. If it is the
969+
# last column in columns, or if the values returned are unique in
970+
# frame[column] we save this index and break
971+
# Otherwise we must save the index of the non duplicated values
972+
# and set the next cur_frame to cur_frame filtered on all duplcicated
973+
# values (#GH15297)
974+
series = cur_frame[column]
975+
values = getattr(series, method)(cur_n, keep=keep)
976+
is_last_column = len(columns) - 1 == i
977+
if is_last_column or len(values.unique()) == sum(series.isin(values)):
978+
979+
# Last column in columns or values are unique in series => values
980+
# is all that matters
981+
if method == 'nsmallest':
982+
indexer = indexer.append(values.index)
983+
else:
984+
indexer = values.index.append(indexer)
985+
break
986+
duplicated_filter = series.duplicated(keep=False)
987+
non_duplicated = values[~duplicated_filter]
988+
duplicated = values[duplicated_filter]
989+
if method == 'nsmallest':
990+
indexer = indexer.append(non_duplicated.index)
991+
else:
992+
indexer = non_duplicated.index.append(indexer)
993+
994+
# Must set cur frame to include all duplicated values to consider for
995+
# the next column, we also can reduce cur_n by the current length of
996+
# the indexer
997+
cur_frame = cur_frame[series.isin(duplicated)]
998+
cur_n = n - len(indexer)
999+
1000+
frame = frame.take(indexer)
1001+
1002+
# Restore the index on frame
1003+
frame.index = original_index.take(indexer)
1004+
return frame
9551005

9561006

9571007
def _finalize_nsmallest(arr, kth_val, n, keep, narr):

pandas/tests/frame/test_analytics.py

+110-68
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,12 @@
77
import sys
88
import pytest
99

10+
from string import ascii_lowercase
1011
from numpy import nan
1112
from numpy.random import randn
1213
import numpy as np
1314

14-
from pandas.compat import lrange
15+
from pandas.compat import lrange, product
1516
from pandas import (compat, isnull, notnull, DataFrame, Series,
1617
MultiIndex, date_range, Timestamp)
1718
import pandas as pd
@@ -1120,73 +1121,6 @@ def __nonzero__(self):
11201121
self.assertTrue(r1.all())
11211122

11221123
# ----------------------------------------------------------------------
1123-
# Top / bottom
1124-
1125-
def test_nlargest(self):
1126-
# GH10393
1127-
from string import ascii_lowercase
1128-
df = pd.DataFrame({'a': np.random.permutation(10),
1129-
'b': list(ascii_lowercase[:10])})
1130-
result = df.nlargest(5, 'a')
1131-
expected = df.sort_values('a', ascending=False).head(5)
1132-
tm.assert_frame_equal(result, expected)
1133-
1134-
def test_nlargest_multiple_columns(self):
1135-
from string import ascii_lowercase
1136-
df = pd.DataFrame({'a': np.random.permutation(10),
1137-
'b': list(ascii_lowercase[:10]),
1138-
'c': np.random.permutation(10).astype('float64')})
1139-
result = df.nlargest(5, ['a', 'b'])
1140-
expected = df.sort_values(['a', 'b'], ascending=False).head(5)
1141-
tm.assert_frame_equal(result, expected)
1142-
1143-
def test_nsmallest(self):
1144-
from string import ascii_lowercase
1145-
df = pd.DataFrame({'a': np.random.permutation(10),
1146-
'b': list(ascii_lowercase[:10])})
1147-
result = df.nsmallest(5, 'a')
1148-
expected = df.sort_values('a').head(5)
1149-
tm.assert_frame_equal(result, expected)
1150-
1151-
def test_nsmallest_multiple_columns(self):
1152-
from string import ascii_lowercase
1153-
df = pd.DataFrame({'a': np.random.permutation(10),
1154-
'b': list(ascii_lowercase[:10]),
1155-
'c': np.random.permutation(10).astype('float64')})
1156-
result = df.nsmallest(5, ['a', 'c'])
1157-
expected = df.sort_values(['a', 'c']).head(5)
1158-
tm.assert_frame_equal(result, expected)
1159-
1160-
def test_nsmallest_nlargest_duplicate_index(self):
1161-
# GH 13412
1162-
df = pd.DataFrame({'a': [1, 2, 3, 4],
1163-
'b': [4, 3, 2, 1],
1164-
'c': [0, 1, 2, 3]},
1165-
index=[0, 0, 1, 1])
1166-
result = df.nsmallest(4, 'a')
1167-
expected = df.sort_values('a').head(4)
1168-
tm.assert_frame_equal(result, expected)
1169-
1170-
result = df.nlargest(4, 'a')
1171-
expected = df.sort_values('a', ascending=False).head(4)
1172-
tm.assert_frame_equal(result, expected)
1173-
1174-
result = df.nsmallest(4, ['a', 'c'])
1175-
expected = df.sort_values(['a', 'c']).head(4)
1176-
tm.assert_frame_equal(result, expected)
1177-
1178-
result = df.nsmallest(4, ['c', 'a'])
1179-
expected = df.sort_values(['c', 'a']).head(4)
1180-
tm.assert_frame_equal(result, expected)
1181-
1182-
result = df.nlargest(4, ['a', 'c'])
1183-
expected = df.sort_values(['a', 'c'], ascending=False).head(4)
1184-
tm.assert_frame_equal(result, expected)
1185-
1186-
result = df.nlargest(4, ['c', 'a'])
1187-
expected = df.sort_values(['c', 'a'], ascending=False).head(4)
1188-
tm.assert_frame_equal(result, expected)
1189-
# ----------------------------------------------------------------------
11901124
# Isin
11911125

11921126
def test_isin(self):
@@ -1965,3 +1899,111 @@ def test_dot(self):
19651899

19661900
with tm.assertRaisesRegexp(ValueError, 'aligned'):
19671901
df.dot(df2)
1902+
1903+
1904+
@pytest.fixture
1905+
def df_duplicates():
1906+
return pd.DataFrame({'a': [1, 2, 3, 4, 4],
1907+
'b': [1, 1, 1, 1, 1],
1908+
'c': [0, 1, 2, 5, 4]},
1909+
index=[0, 0, 1, 1, 1])
1910+
1911+
1912+
@pytest.fixture
1913+
def df_strings():
1914+
return pd.DataFrame({'a': np.random.permutation(10),
1915+
'b': list(ascii_lowercase[:10]),
1916+
'c': np.random.permutation(10).astype('float64')})
1917+
1918+
1919+
class TestNLargestNSmallest(object):
1920+
1921+
# ----------------------------------------------------------------------
1922+
# Top / bottom
1923+
@pytest.mark.parametrize(
1924+
'n, order',
1925+
product(range(1, 11),
1926+
[['a'],
1927+
['c'],
1928+
['a', 'b'],
1929+
['a', 'c'],
1930+
['b', 'a'],
1931+
['b', 'c'],
1932+
['a', 'b', 'c'],
1933+
['c', 'a', 'b'],
1934+
['c', 'b', 'a'],
1935+
['b', 'c', 'a'],
1936+
['b', 'a', 'c'],
1937+
1938+
# dups!
1939+
['b', 'c', 'c'],
1940+
1941+
]))
1942+
def test_n(self, df_strings, n, order):
1943+
# GH10393
1944+
df = df_strings
1945+
1946+
error_msg = (
1947+
"'b' has dtype: object, cannot use method 'nsmallest' "
1948+
"with this dtype"
1949+
)
1950+
if 'b' in order:
1951+
with pytest.raises(TypeError) as exception:
1952+
df.nsmallest(n, order)
1953+
assert exception.value, error_msg
1954+
else:
1955+
result = df.nsmallest(n, order)
1956+
expected = df.sort_values(order).head(n)
1957+
tm.assert_frame_equal(result, expected)
1958+
1959+
if 'b' in order:
1960+
with pytest.raises(TypeError) as exception:
1961+
df.nsmallest(n, order)
1962+
assert exception.value, error_msg
1963+
else:
1964+
result = df.nlargest(n, order)
1965+
expected = df.sort_values(order, ascending=False).head(n)
1966+
tm.assert_frame_equal(result, expected)
1967+
1968+
def test_n_error(self, df_strings):
1969+
# b alone raises a TypeError
1970+
df = df_strings
1971+
with pytest.raises(TypeError):
1972+
df.nsmallest(1, 'b')
1973+
with pytest.raises(TypeError):
1974+
df.nlargest(1, 'b')
1975+
1976+
def test_n_identical_values(self):
1977+
# GH15297
1978+
df = pd.DataFrame({'a': [1] * 5, 'b': [1, 2, 3, 4, 5]})
1979+
1980+
result = df.nlargest(3, 'a')
1981+
expected = pd.DataFrame(
1982+
{'a': [1] * 3, 'b': [1, 2, 3]}, index=[0, 1, 2]
1983+
)
1984+
tm.assert_frame_equal(result, expected)
1985+
1986+
result = df.nsmallest(3, 'a')
1987+
expected = pd.DataFrame({'a': [1] * 3, 'b': [1, 2, 3]})
1988+
tm.assert_frame_equal(result, expected)
1989+
1990+
@pytest.mark.parametrize(
1991+
'n, order',
1992+
product([1, 2, 3, 4, 5],
1993+
[['a', 'b', 'c'],
1994+
['c', 'b', 'a'],
1995+
['a'],
1996+
['b'],
1997+
['a', 'b'],
1998+
['c', 'b']]))
1999+
def test_n_duplicate_index(self, df_duplicates, n, order):
2000+
# GH 13412
2001+
2002+
df = df_duplicates
2003+
result = df.nsmallest(n, order)
2004+
expected = df.sort_values(order).head(n)
2005+
tm.assert_frame_equal(result, expected)
2006+
2007+
result = df.nlargest(n, order)
2008+
expected = df.sort_values(order, ascending=False).head(n)
2009+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)