Skip to content

Commit ec5956e

Browse files
jschendel authored and jreback committed
BUG: Fix Series.nlargest for integer boundary values (#21432)
1 parent c50a9dc commit ec5956e

File tree

5 files changed

+147
-43
lines changed

5 files changed

+147
-43
lines changed

doc/source/whatsnew/v0.23.2.txt

+1
Original file line number | Diff line number | Diff line change
@@ -82,4 +82,5 @@ Bug Fixes
8282

8383
**Other**
8484

85+
- Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`)
8586
-

pandas/conftest.py

+71
Original file line number | Diff line number | Diff line change
@@ -129,6 +129,14 @@ def join_type(request):
129129
return request.param
130130

131131

132+
@pytest.fixture(params=['nlargest', 'nsmallest'])
133+
def nselect_method(request):
134+
"""
135+
Fixture for trying all nselect methods
136+
"""
137+
return request.param
138+
139+
132140
@pytest.fixture(params=[None, np.nan, pd.NaT, float('nan'), np.float('NaN')])
133141
def nulls_fixture(request):
134142
"""
@@ -170,3 +178,66 @@ def string_dtype(request):
170178
* 'U'
171179
"""
172180
return request.param
181+
182+
183+
@pytest.fixture(params=["float32", "float64"])
184+
def float_dtype(request):
185+
"""
186+
Parameterized fixture for float dtypes.
187+
188+
* float32
189+
* float64
190+
"""
191+
192+
return request.param
193+
194+
195+
UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"]
196+
SIGNED_INT_DTYPES = ["int8", "int16", "int32", "int64"]
197+
ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES
198+
199+
200+
@pytest.fixture(params=SIGNED_INT_DTYPES)
201+
def sint_dtype(request):
202+
"""
203+
Parameterized fixture for signed integer dtypes.
204+
205+
* int8
206+
* int16
207+
* int32
208+
* int64
209+
"""
210+
211+
return request.param
212+
213+
214+
@pytest.fixture(params=UNSIGNED_INT_DTYPES)
215+
def uint_dtype(request):
216+
"""
217+
Parameterized fixture for unsigned integer dtypes.
218+
219+
* uint8
220+
* uint16
221+
* uint32
222+
* uint64
223+
"""
224+
225+
return request.param
226+
227+
228+
@pytest.fixture(params=ALL_INT_DTYPES)
229+
def any_int_dtype(request):
230+
"""
231+
Parameterized fixture for any integer dtypes.
232+
233+
* int8
234+
* uint8
235+
* int16
236+
* uint16
237+
* int32
238+
* uint32
239+
* int64
240+
* uint64
241+
"""
242+
243+
return request.param

pandas/core/algorithms.py

+4 -1
Original file line number | Diff line number | Diff line change
@@ -1133,9 +1133,12 @@ def compute(self, method):
11331133
return dropped[slc].sort_values(ascending=ascending).head(n)
11341134

11351135
# fast method
1136-
arr, _, _ = _ensure_data(dropped.values)
1136+
arr, pandas_dtype, _ = _ensure_data(dropped.values)
11371137
if method == 'nlargest':
11381138
arr = -arr
1139+
if is_integer_dtype(pandas_dtype):
1140+
# GH 21426: ensure reverse ordering at boundaries
1141+
arr -= 1
11391142

11401143
if self.keep == 'last':
11411144
arr = arr[::-1]

pandas/tests/frame/test_analytics.py

+36 -42
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
from numpy.random import randn
1313
import numpy as np
1414

15-
from pandas.compat import lrange, product, PY35
15+
from pandas.compat import lrange, PY35
1616
from pandas import (compat, isna, notna, DataFrame, Series,
1717
MultiIndex, date_range, Timestamp, Categorical,
1818
_np_version_under1p12, _np_version_under1p15,
@@ -2260,54 +2260,49 @@ class TestNLargestNSmallest(object):
22602260

22612261
# ----------------------------------------------------------------------
22622262
# Top / bottom
2263-
@pytest.mark.parametrize(
2264-
'method, n, order',
2265-
product(['nsmallest', 'nlargest'], range(1, 11),
2266-
[['a'],
2267-
['c'],
2268-
['a', 'b'],
2269-
['a', 'c'],
2270-
['b', 'a'],
2271-
['b', 'c'],
2272-
['a', 'b', 'c'],
2273-
['c', 'a', 'b'],
2274-
['c', 'b', 'a'],
2275-
['b', 'c', 'a'],
2276-
['b', 'a', 'c'],
2277-
2278-
# dups!
2279-
['b', 'c', 'c'],
2280-
2281-
]))
2282-
def test_n(self, df_strings, method, n, order):
2263+
@pytest.mark.parametrize('order', [
2264+
['a'],
2265+
['c'],
2266+
['a', 'b'],
2267+
['a', 'c'],
2268+
['b', 'a'],
2269+
['b', 'c'],
2270+
['a', 'b', 'c'],
2271+
['c', 'a', 'b'],
2272+
['c', 'b', 'a'],
2273+
['b', 'c', 'a'],
2274+
['b', 'a', 'c'],
2275+
2276+
# dups!
2277+
['b', 'c', 'c']])
2278+
@pytest.mark.parametrize('n', range(1, 11))
2279+
def test_n(self, df_strings, nselect_method, n, order):
22832280
# GH10393
22842281
df = df_strings
22852282
if 'b' in order:
22862283

22872284
error_msg = self.dtype_error_msg_template.format(
2288-
column='b', method=method, dtype='object')
2285+
column='b', method=nselect_method, dtype='object')
22892286
with tm.assert_raises_regex(TypeError, error_msg):
2290-
getattr(df, method)(n, order)
2287+
getattr(df, nselect_method)(n, order)
22912288
else:
2292-
ascending = method == 'nsmallest'
2293-
result = getattr(df, method)(n, order)
2289+
ascending = nselect_method == 'nsmallest'
2290+
result = getattr(df, nselect_method)(n, order)
22942291
expected = df.sort_values(order, ascending=ascending).head(n)
22952292
tm.assert_frame_equal(result, expected)
22962293

2297-
@pytest.mark.parametrize(
2298-
'method, columns',
2299-
product(['nsmallest', 'nlargest'],
2300-
product(['group'], ['category_string', 'string'])
2301-
))
2302-
def test_n_error(self, df_main_dtypes, method, columns):
2294+
@pytest.mark.parametrize('columns', [
2295+
('group', 'category_string'), ('group', 'string')])
2296+
def test_n_error(self, df_main_dtypes, nselect_method, columns):
23032297
df = df_main_dtypes
2298+
col = columns[1]
23042299
error_msg = self.dtype_error_msg_template.format(
2305-
column=columns[1], method=method, dtype=df[columns[1]].dtype)
2300+
column=col, method=nselect_method, dtype=df[col].dtype)
23062301
# escape some characters that may be in the repr
23072302
error_msg = (error_msg.replace('(', '\\(').replace(")", "\\)")
23082303
.replace("[", "\\[").replace("]", "\\]"))
23092304
with tm.assert_raises_regex(TypeError, error_msg):
2310-
getattr(df, method)(2, columns)
2305+
getattr(df, nselect_method)(2, columns)
23112306

23122307
def test_n_all_dtypes(self, df_main_dtypes):
23132308
df = df_main_dtypes
@@ -2328,15 +2323,14 @@ def test_n_identical_values(self):
23282323
expected = pd.DataFrame({'a': [1] * 3, 'b': [1, 2, 3]})
23292324
tm.assert_frame_equal(result, expected)
23302325

2331-
@pytest.mark.parametrize(
2332-
'n, order',
2333-
product([1, 2, 3, 4, 5],
2334-
[['a', 'b', 'c'],
2335-
['c', 'b', 'a'],
2336-
['a'],
2337-
['b'],
2338-
['a', 'b'],
2339-
['c', 'b']]))
2326+
@pytest.mark.parametrize('order', [
2327+
['a', 'b', 'c'],
2328+
['c', 'b', 'a'],
2329+
['a'],
2330+
['b'],
2331+
['a', 'b'],
2332+
['c', 'b']])
2333+
@pytest.mark.parametrize('n', range(1, 6))
23402334
def test_n_duplicate_index(self, df_duplicates, n, order):
23412335
# GH 13412
23422336

pandas/tests/series/test_analytics.py

+35
Original file line number | Diff line number | Diff line change
@@ -1944,6 +1944,15 @@ def test_mode_sortwarning(self):
19441944
tm.assert_series_equal(result, expected)
19451945

19461946

1947+
def assert_check_nselect_boundary(vals, dtype, method):
1948+
# helper function for 'test_boundary_{dtype}' tests
1949+
s = Series(vals, dtype=dtype)
1950+
result = getattr(s, method)(3)
1951+
expected_idxr = [0, 1, 2] if method == 'nsmallest' else [3, 2, 1]
1952+
expected = s.loc[expected_idxr]
1953+
tm.assert_series_equal(result, expected)
1954+
1955+
19471956
class TestNLargestNSmallest(object):
19481957

19491958
@pytest.mark.parametrize(
@@ -2028,6 +2037,32 @@ def test_n(self, n):
20282037
expected = s.sort_values().head(n)
20292038
assert_series_equal(result, expected)
20302039

2040+
def test_boundary_integer(self, nselect_method, any_int_dtype):
2041+
# GH 21426
2042+
dtype_info = np.iinfo(any_int_dtype)
2043+
min_val, max_val = dtype_info.min, dtype_info.max
2044+
vals = [min_val, min_val + 1, max_val - 1, max_val]
2045+
assert_check_nselect_boundary(vals, any_int_dtype, nselect_method)
2046+
2047+
def test_boundary_float(self, nselect_method, float_dtype):
2048+
# GH 21426
2049+
dtype_info = np.finfo(float_dtype)
2050+
min_val, max_val = dtype_info.min, dtype_info.max
2051+
min_2nd, max_2nd = np.nextafter(
2052+
[min_val, max_val], 0, dtype=float_dtype)
2053+
vals = [min_val, min_2nd, max_2nd, max_val]
2054+
assert_check_nselect_boundary(vals, float_dtype, nselect_method)
2055+
2056+
@pytest.mark.parametrize('dtype', ['datetime64[ns]', 'timedelta64[ns]'])
2057+
def test_boundary_datetimelike(self, nselect_method, dtype):
2058+
# GH 21426
2059+
# use int64 bounds and +1 to min_val since true minimum is NaT
2060+
# (include min_val/NaT at end to maintain same expected_idxr)
2061+
dtype_info = np.iinfo('int64')
2062+
min_val, max_val = dtype_info.min, dtype_info.max
2063+
vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val]
2064+
assert_check_nselect_boundary(vals, dtype, nselect_method)
2065+
20312066

20322067
class TestCategoricalSeriesAnalytics(object):
20332068

0 commit comments

Comments
 (0)