Skip to content

Commit 3af9e78

Browse files
jbrockmendel authored and Kevin D Smith committed
TST/REF: finish collecting sample tests (pandas-dev#37470)
1 parent 935e086 commit 3af9e78

File tree

3 files changed

+309
-292
lines changed

3 files changed

+309
-292
lines changed

pandas/tests/frame/methods/test_sample.py

-53
This file was deleted.
+309
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,309 @@
1+
import numpy as np
2+
import pytest
3+
4+
from pandas.compat.numpy import np_version_under1p17
5+
6+
from pandas import DataFrame, Series
7+
import pandas._testing as tm
8+
import pandas.core.common as com
9+
10+
11+
class TestSample:
    """Tests of ``sample`` that are run against both Series and DataFrame.

    Most tests receive their subject through the ``obj`` fixture, which is
    parametrized over ``Series`` and ``DataFrame``.
    """

    @pytest.fixture(params=[Series, DataFrame])
    def obj(self, request):
        """Return a Series of 10 random values or a 10x10 random DataFrame."""
        klass = request.param
        if klass is Series:
            arr = np.random.randn(10)
        else:
            arr = np.random.randn(10, 10)
        return klass(arr, dtype=None)

    @pytest.mark.parametrize("test", list(range(10)))
    def test_sample(self, test, obj):
        # Fixes issue: 2419
        # Check behavior of random_state argument
        # Check for stability when receives seed or random state -- run 10
        # times.

        seed = np.random.randint(0, 100)
        tm.assert_equal(
            obj.sample(n=4, random_state=seed), obj.sample(n=4, random_state=seed)
        )

        tm.assert_equal(
            obj.sample(frac=0.7, random_state=seed),
            obj.sample(frac=0.7, random_state=seed),
        )

        tm.assert_equal(
            obj.sample(n=4, random_state=np.random.RandomState(test)),
            obj.sample(n=4, random_state=np.random.RandomState(test)),
        )

        tm.assert_equal(
            obj.sample(frac=0.7, random_state=np.random.RandomState(test)),
            obj.sample(frac=0.7, random_state=np.random.RandomState(test)),
        )

        tm.assert_equal(
            obj.sample(frac=2, replace=True, random_state=np.random.RandomState(test)),
            obj.sample(frac=2, replace=True, random_state=np.random.RandomState(test)),
        )

        # Without an explicit random_state, re-seeding the global NumPy
        # generator before each round must reproduce the same samples.
        os1, os2 = [], []
        for _ in range(2):
            np.random.seed(test)
            os1.append(obj.sample(n=4))
            os2.append(obj.sample(frac=0.7))
        tm.assert_equal(*os1)
        tm.assert_equal(*os2)

    def test_sample_lengths(self, obj):
        # Check lengths are right.
        # ``len`` has to wrap the sample only: ``len(sample == k)`` measures
        # the element-wise comparison result, which is truthy for every
        # non-empty sample and therefore asserts nothing.
        assert len(obj.sample(n=4)) == 4
        assert len(obj.sample(frac=0.34)) == 3
        assert len(obj.sample(frac=0.36)) == 4

    def test_sample_invalid_random_state(self, obj):
        # Check for error when random_state argument invalid.
        with pytest.raises(ValueError):
            obj.sample(random_state="astring!")

    def test_sample_wont_accept_n_and_frac(self, obj):
        # Giving both frac and N throws error
        with pytest.raises(ValueError):
            obj.sample(n=3, frac=0.3)

    def test_sample_requires_positive_n_frac(self, obj):
        # Negative `n` and negative `frac` must each give an error
        with pytest.raises(ValueError):
            obj.sample(n=-3)
        with pytest.raises(ValueError):
            obj.sample(frac=-0.3)

    def test_sample_requires_integer_n(self, obj):
        # Make sure float values of `n` give error
        with pytest.raises(ValueError):
            obj.sample(n=3.2)

    def test_sample_invalid_weight_lengths(self, obj):
        # Weight length must be right.
        # Inputs are built outside the ``raises`` blocks so that only the
        # ``sample`` call itself can satisfy the expected exception.
        with pytest.raises(ValueError):
            obj.sample(n=3, weights=[0, 1])

        bad_weights = [0.5] * 11
        with pytest.raises(ValueError):
            obj.sample(n=3, weights=bad_weights)

        bad_weight_series = Series([0, 0, 0.2])
        with pytest.raises(ValueError):
            obj.sample(n=4, weights=bad_weight_series)

    def test_sample_negative_weights(self, obj):
        # Check won't accept negative weights
        bad_weights = [-0.1] * 10
        with pytest.raises(ValueError):
            obj.sample(n=3, weights=bad_weights)

    def test_sample_inf_weights(self, obj):
        # Check inf and -inf throw errors:

        weights_with_inf = [0.1] * 10
        weights_with_inf[0] = np.inf
        with pytest.raises(ValueError):
            obj.sample(n=3, weights=weights_with_inf)

        weights_with_ninf = [0.1] * 10
        weights_with_ninf[0] = -np.inf
        with pytest.raises(ValueError):
            obj.sample(n=3, weights=weights_with_ninf)

    def test_sample_zero_weights(self, obj):
        # All zeros raises errors

        zero_weights = [0] * 10
        with pytest.raises(ValueError):
            obj.sample(n=3, weights=zero_weights)

    def test_sample_missing_weights(self, obj):
        # All missing weights

        nan_weights = [np.nan] * 10
        with pytest.raises(ValueError):
            obj.sample(n=3, weights=nan_weights)

    def test_sample_none_weights(self, obj):
        # Check None are also replaced by zeros.
        # Only position 5 carries weight, so it must be the row drawn.
        weights_with_None = [None] * 10
        weights_with_None[5] = 0.5
        tm.assert_equal(
            obj.sample(n=1, axis=0, weights=weights_with_None), obj.iloc[5:6]
        )

    @pytest.mark.parametrize(
        "func_str,arg",
        [
            ("np.array", [2, 3, 1, 0]),
            pytest.param(
                "np.random.MT19937",
                3,
                marks=pytest.mark.skipif(np_version_under1p17, reason="NumPy<1.17"),
            ),
            pytest.param(
                "np.random.PCG64",
                11,
                marks=pytest.mark.skipif(np_version_under1p17, reason="NumPy<1.17"),
            ),
        ],
    )
    @pytest.mark.parametrize("klass", [Series, DataFrame])
    def test_sample_random_state(self, func_str, arg, klass):
        # GH#32503
        # The constructors are passed as strings and resolved with ``eval``
        # inside the test body -- presumably so the skipped NumPy<1.17 cases
        # never have to look the names up. Only the hard-coded literals
        # from the parametrization above are ever evaluated.
        obj = DataFrame({"col1": range(10, 20), "col2": range(20, 30)})
        if klass is Series:
            obj = obj["col1"]
        result = obj.sample(n=3, random_state=eval(func_str)(arg))
        expected = obj.sample(n=3, random_state=com.random_state(eval(func_str)(arg)))
        tm.assert_equal(result, expected)

    @pytest.mark.parametrize("klass", [Series, DataFrame])
    def test_sample_upsampling_without_replacement(self, klass):
        # GH#27451

        obj = DataFrame({"A": list("abc")})
        if klass is Series:
            obj = obj["A"]

        msg = (
            "Replace has to be set to `True` when "
            "upsampling the population `frac` > 1."
        )
        with pytest.raises(ValueError, match=msg):
            obj.sample(frac=2, replace=False)
182+
183+
184+
class TestSampleDataFrame:
    # DataFrame-only ``sample`` tests; nothing here is parametrized over
    # the object type.

    def test_sample(self):
        # GH#2419
        # Object-specific checks: degenerate weights, weights given as a
        # column name, and the ``axis`` argument.

        # All of the weight sits on position 5.
        only_sixth = [0] * 5 + [1] + [0] * 4

        frame = DataFrame(
            {
                "col1": range(10, 20),
                "col2": range(20, 30),
                "colString": ["a"] * 10,
                "easyweights": only_sixth,
            }
        )
        picked = frame.sample(n=1, weights="easyweights")
        tm.assert_frame_equal(picked, frame.iloc[5:6])

        # A string weight is an error for a Series and for a DataFrame
        # sampled along axis=1.
        series = Series(range(10))
        with pytest.raises(ValueError):
            series.sample(n=3, weights="weight_column")

        with pytest.raises(ValueError):
            frame.sample(n=1, weights="weight_column", axis=1)

        # A string that names no column is a KeyError.
        with pytest.raises(
            KeyError, match="'String passed to weights not a valid column'"
        ):
            frame.sample(n=3, weights="not_a_real_column_name")

        # Weights that do not sum to one are re-normalized.
        only_first = [0.5] + [0] * 9
        tm.assert_frame_equal(frame.sample(n=1, weights=only_first), frame.iloc[:1])

        ###
        # axis argument
        ###

        frame = DataFrame({"col1": range(10), "col2": ["a"] * 10})
        col_weights = [0, 1]
        # Integer and string spellings of the column axis.
        for axis in (1, "columns"):
            tm.assert_frame_equal(
                frame.sample(n=1, axis=axis, weights=col_weights), frame[["col2"]]
            )

        # String spellings of the row axis.
        row_weights = [0] * 5 + [0.5] + [0] * 4
        for axis in ("rows", "index"):
            tm.assert_frame_equal(
                frame.sample(n=1, axis=axis, weights=row_weights), frame.iloc[5:6]
            )

        # Out-of-range / unknown axis values
        with pytest.raises(ValueError):
            frame.sample(n=1, axis=2)

        with pytest.raises(ValueError):
            frame.sample(n=1, axis="not_a_name")

        with pytest.raises(ValueError):
            series = Series(range(10))
            series.sample(n=1, axis=1)

        # Weight length is compared against the axis being sampled
        with pytest.raises(ValueError):
            frame.sample(n=1, axis=1, weights=[0.5] * 10)

    def test_sample_axis1(self):
        # Weights applied along axis = 1: only the third column has weight.
        last_col_only = [0, 0, 1]

        frame = DataFrame(
            {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10}
        )
        picked = frame.sample(n=1, axis=1, weights=last_col_only)
        tm.assert_frame_equal(picked, frame[["colString"]])

        # Omitting ``axis`` matches axis=0 for the same random_state
        default_axis = frame.sample(n=3, random_state=42)
        explicit_axis = frame.sample(n=3, axis=0, random_state=42)
        tm.assert_frame_equal(default_axis, explicit_axis)

    def test_sample_aligns_weights_with_frame(self):

        # Weights given as a Series are aligned with the frame's index
        frame = DataFrame({"col1": [5, 6, 7], "col2": ["a", "b", "c"]}, index=[9, 5, 3])
        aligned = Series([1, 0, 0], index=[3, 5, 9])
        tm.assert_frame_equal(frame.loc[[3]], frame.sample(1, weights=aligned))

        # Weight labels missing from the sampled DataFrame are dropped
        extra_label = Series([0.001, 0, 10000], index=[3, 5, 10])
        tm.assert_frame_equal(frame.loc[[3]], frame.sample(1, weights=extra_label))

        # Frame labels with no weight are filled with zeros
        missing_label = Series([0.01, 0], index=[3, 5])
        tm.assert_frame_equal(frame.loc[[3]], frame.sample(1, weights=missing_label))

        # No overlap between weight labels and the frame's index
        disjoint = Series([1, 0], index=[1, 2])
        with pytest.raises(ValueError):
            frame.sample(1, weights=disjoint)

    def test_sample_is_copy(self):
        # GH#27357, GH#30784: the result of sample must be an actual copy that
        # does not track the parent dataframe, so assigning a new column to it
        # gives no SettingWithCopy warning
        parent = DataFrame(np.random.randn(10, 3), columns=list("abc"))
        sampled = parent.sample(3)

        with tm.assert_produces_warning(None):
            sampled["d"] = 1

0 commit comments

Comments
 (0)