Commit 0ea0fdd

jbrockmendel authored and jreback committed
REF: targeted test files for nlargest, searchsorted, value_counts (#30385)
1 parent 0eb010e commit 0ea0fdd
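
The commit title indicates that tests for nlargest, searchsorted, and value_counts are moved into method-targeted files; the diff shown below is the new nlargest/nsmallest test module. As a minimal, hedged sketch (not part of the commit) of the behaviour those tests cover, reusing the same toy frame as the GH#22752 regression test in the diff:

import pandas as pd

df = pd.DataFrame({"a": [2, 2, 2, 1, 1, 1], "b": [1, 2, 3, 3, 2, 1]})

# Top 4 rows ranked by "a", with ties broken by "b".
print(df.nlargest(4, columns=["a", "b"]))

# Bottom 4 rows using the same column order.
print(df.nsmallest(4, columns=["a", "b"]))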

File tree

7 files changed: +700 −678 lines changed
+211 lines
@@ -0,0 +1,211 @@
"""
Note: for naming purposes, most tests are titled e.g. "test_nlargest_foo"
but are implicitly also testing nsmallest_foo.
"""
from string import ascii_lowercase

import numpy as np
import pytest

import pandas as pd
import pandas.util.testing as tm


@pytest.fixture
def df_duplicates():
    return pd.DataFrame(
        {"a": [1, 2, 3, 4, 4], "b": [1, 1, 1, 1, 1], "c": [0, 1, 2, 5, 4]},
        index=[0, 0, 1, 1, 1],
    )


@pytest.fixture
def df_strings():
    return pd.DataFrame(
        {
            "a": np.random.permutation(10),
            "b": list(ascii_lowercase[:10]),
            "c": np.random.permutation(10).astype("float64"),
        }
    )


@pytest.fixture
def df_main_dtypes():
    return pd.DataFrame(
        {
            "group": [1, 1, 2],
            "int": [1, 2, 3],
            "float": [4.0, 5.0, 6.0],
            "string": list("abc"),
            "category_string": pd.Series(list("abc")).astype("category"),
            "category_int": [7, 8, 9],
            "datetime": pd.date_range("20130101", periods=3),
            "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"),
            "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
        },
        columns=[
            "group",
            "int",
            "float",
            "string",
            "category_string",
            "category_int",
            "datetime",
            "datetimetz",
            "timedelta",
        ],
    )


class TestNLargestNSmallest:

    # ----------------------------------------------------------------------
    # Top / bottom
    @pytest.mark.parametrize(
        "order",
        [
            ["a"],
            ["c"],
            ["a", "b"],
            ["a", "c"],
            ["b", "a"],
            ["b", "c"],
            ["a", "b", "c"],
            ["c", "a", "b"],
            ["c", "b", "a"],
            ["b", "c", "a"],
            ["b", "a", "c"],
            # dups!
            ["b", "c", "c"],
        ],
    )
    @pytest.mark.parametrize("n", range(1, 11))
    def test_nlargest_n(self, df_strings, nselect_method, n, order):
        # GH#10393
        df = df_strings
        if "b" in order:

            error_msg = (
                f"Column 'b' has dtype object, "
                f"cannot use method '{nselect_method}' with this dtype"
            )
            with pytest.raises(TypeError, match=error_msg):
                getattr(df, nselect_method)(n, order)
        else:
            ascending = nselect_method == "nsmallest"
            result = getattr(df, nselect_method)(n, order)
            expected = df.sort_values(order, ascending=ascending).head(n)
            tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "columns", [["group", "category_string"], ["group", "string"]]
    )
    def test_nlargest_error(self, df_main_dtypes, nselect_method, columns):
        df = df_main_dtypes
        col = columns[1]
        error_msg = (
            f"Column '{col}' has dtype {df[col].dtype}, "
            f"cannot use method '{nselect_method}' with this dtype"
        )
        # escape some characters that may be in the repr
        error_msg = (
            error_msg.replace("(", "\\(")
            .replace(")", "\\)")
            .replace("[", "\\[")
            .replace("]", "\\]")
        )
        with pytest.raises(TypeError, match=error_msg):
            getattr(df, nselect_method)(2, columns)

    def test_nlargest_all_dtypes(self, df_main_dtypes):
        df = df_main_dtypes
        df.nsmallest(2, list(set(df) - {"category_string", "string"}))
        df.nlargest(2, list(set(df) - {"category_string", "string"}))

    def test_nlargest_duplicates_on_starter_columns(self):
        # regression test for GH#22752

        df = pd.DataFrame({"a": [2, 2, 2, 1, 1, 1], "b": [1, 2, 3, 3, 2, 1]})

        result = df.nlargest(4, columns=["a", "b"])
        expected = pd.DataFrame(
            {"a": [2, 2, 2, 1], "b": [3, 2, 1, 3]}, index=[2, 1, 0, 3]
        )
        tm.assert_frame_equal(result, expected)

        result = df.nsmallest(4, columns=["a", "b"])
        expected = pd.DataFrame(
            {"a": [1, 1, 1, 2], "b": [1, 2, 3, 1]}, index=[5, 4, 3, 0]
        )
        tm.assert_frame_equal(result, expected)

    def test_nlargest_n_identical_values(self):
        # GH#15297
        df = pd.DataFrame({"a": [1] * 5, "b": [1, 2, 3, 4, 5]})

        result = df.nlargest(3, "a")
        expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}, index=[0, 1, 2])
        tm.assert_frame_equal(result, expected)

        result = df.nsmallest(3, "a")
        expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "order",
        [["a", "b", "c"], ["c", "b", "a"], ["a"], ["b"], ["a", "b"], ["c", "b"]],
    )
    @pytest.mark.parametrize("n", range(1, 6))
    def test_nlargest_n_duplicate_index(self, df_duplicates, n, order):
        # GH#13412

        df = df_duplicates
        result = df.nsmallest(n, order)
        expected = df.sort_values(order).head(n)
        tm.assert_frame_equal(result, expected)

        result = df.nlargest(n, order)
        expected = df.sort_values(order, ascending=False).head(n)
        tm.assert_frame_equal(result, expected)

    def test_nlargest_duplicate_keep_all_ties(self):
        # GH#16818
        df = pd.DataFrame(
            {"a": [5, 4, 4, 2, 3, 3, 3, 3], "b": [10, 9, 8, 7, 5, 50, 10, 20]}
        )
        result = df.nlargest(4, "a", keep="all")
        expected = pd.DataFrame(
            {
                "a": {0: 5, 1: 4, 2: 4, 4: 3, 5: 3, 6: 3, 7: 3},
                "b": {0: 10, 1: 9, 2: 8, 4: 5, 5: 50, 6: 10, 7: 20},
            }
        )
        tm.assert_frame_equal(result, expected)

        result = df.nsmallest(2, "a", keep="all")
        expected = pd.DataFrame(
            {
                "a": {3: 2, 4: 3, 5: 3, 6: 3, 7: 3},
                "b": {3: 7, 4: 5, 5: 50, 6: 10, 7: 20},
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_nlargest_multiindex_column_lookup(self):
        # Check whether tuples are correctly treated as multi-level lookups.
        # GH#23033
        df = pd.DataFrame(
            columns=pd.MultiIndex.from_product([["x"], ["a", "b"]]),
            data=[[0.33, 0.13], [0.86, 0.25], [0.25, 0.70], [0.85, 0.91]],
        )

        # nsmallest
        result = df.nsmallest(3, ("x", "a"))
        expected = df.iloc[[2, 0, 3]]
        tm.assert_frame_equal(result, expected)

        # nlargest
        result = df.nlargest(3, ("x", "b"))
        expected = df.iloc[[3, 2, 1]]
        tm.assert_frame_equal(result, expected)
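
For context on the keep="all" assertions in test_nlargest_duplicate_keep_all_ties (GH#16818): when rows tie with the value at the n-th position, keep="all" returns every tied row, so the result can contain more than n rows. A minimal sketch, not part of the commit, reusing the same frame as that test:

import pandas as pd

df = pd.DataFrame(
    {"a": [5, 4, 4, 2, 3, 3, 3, 3], "b": [10, 9, 8, 7, 5, 50, 10, 20]}
)

# n=4, but four rows tie on a == 3 at the cutoff, so seven rows come back.
print(df.nlargest(4, "a", keep="all"))

# nsmallest(2, ...) likewise keeps all four rows tied on a == 3, plus the a == 2 row.
print(df.nsmallest(2, "a", keep="all"))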
