|
| 1 | +import numpy as np |
| 2 | +import pytest |
| 3 | + |
| 4 | +from pandas.compat.numpy import np_version_under1p17 |
| 5 | + |
| 6 | +from pandas import DataFrame, Series |
| 7 | +import pandas._testing as tm |
| 8 | +import pandas.core.common as com |
| 9 | + |
| 10 | + |
class TestSample:
    # Tests of ``sample`` that apply to both Series and DataFrame; the
    # ``obj`` fixture runs each of them once per container type.

    @pytest.fixture(params=[Series, DataFrame])
    def obj(self, request):
        """Return a random 10-element Series or a random 10x10 DataFrame."""
        klass = request.param
        if klass is Series:
            arr = np.random.randn(10)
        else:
            arr = np.random.randn(10, 10)
        return klass(arr, dtype=None)

    @pytest.mark.parametrize("test", list(range(10)))
    def test_sample(self, test, obj):
        # GH#2419
        # Check behavior of the random_state argument: sampling twice with the
        # same seed / equivalent RandomState must give identical results.
        # Parametrized over ``test`` so that this runs with 10 different seeds.

        # Integer seed
        seed = np.random.randint(0, 100)
        tm.assert_equal(
            obj.sample(n=4, random_state=seed), obj.sample(n=4, random_state=seed)
        )

        tm.assert_equal(
            obj.sample(frac=0.7, random_state=seed),
            obj.sample(frac=0.7, random_state=seed),
        )

        # Freshly constructed, identically seeded RandomState objects
        tm.assert_equal(
            obj.sample(n=4, random_state=np.random.RandomState(test)),
            obj.sample(n=4, random_state=np.random.RandomState(test)),
        )

        tm.assert_equal(
            obj.sample(frac=0.7, random_state=np.random.RandomState(test)),
            obj.sample(frac=0.7, random_state=np.random.RandomState(test)),
        )

        # Upsampling (frac > 1) with replacement
        tm.assert_equal(
            obj.sample(frac=2, replace=True, random_state=np.random.RandomState(test)),
            obj.sample(frac=2, replace=True, random_state=np.random.RandomState(test)),
        )

        # No random_state given: re-seeding NumPy's global generator before
        # each round must reproduce the same samples.
        os1, os2 = [], []
        for _ in range(2):
            np.random.seed(test)
            os1.append(obj.sample(n=4))
            os2.append(obj.sample(frac=0.7))
        tm.assert_equal(*os1)
        tm.assert_equal(*os2)

    def test_sample_lengths(self, obj):
        # Check lengths are right. ``obj`` has length 10, so frac=0.34 and
        # frac=0.36 are expected to yield 3 and 4 elements respectively.
        # NOTE: the comparison has to sit outside ``len(...)``; taking the
        # length of ``sample(...) == 4`` is truthy for any non-empty sample
        # and would therefore never fail.
        assert len(obj.sample(n=4)) == 4
        assert len(obj.sample(frac=0.34)) == 3
        assert len(obj.sample(frac=0.36)) == 4

    def test_sample_invalid_random_state(self, obj):
        # Check for error when random_state argument invalid.
        with pytest.raises(ValueError):
            obj.sample(random_state="astring!")

    def test_sample_wont_accept_n_and_frac(self, obj):
        # Giving both frac and N throws error
        with pytest.raises(ValueError):
            obj.sample(n=3, frac=0.3)

    def test_sample_requires_positive_n_frac(self, obj):
        # Negative n or frac must be rejected
        with pytest.raises(ValueError):
            obj.sample(n=-3)
        with pytest.raises(ValueError):
            obj.sample(frac=-0.3)

    def test_sample_requires_integer_n(self, obj):
        # Make sure float values of `n` give error
        with pytest.raises(ValueError):
            obj.sample(n=3.2)

    def test_sample_invalid_weight_lengths(self, obj):
        # Weight length must match the length of the sampled axis (10 here).
        # Inputs are built outside the ``raises`` blocks so that only the
        # ``sample`` call itself can satisfy the expected ValueError.

        # Too short
        with pytest.raises(ValueError):
            obj.sample(n=3, weights=[0, 1])

        # Too long
        bad_weights = [0.5] * 11
        with pytest.raises(ValueError):
            obj.sample(n=3, weights=bad_weights)

        # Series of weights with only three entries, asked for n=4
        bad_weight_series = Series([0, 0, 0.2])
        with pytest.raises(ValueError):
            obj.sample(n=4, weights=bad_weight_series)

    def test_sample_negative_weights(self, obj):
        # Check won't accept negative weights
        bad_weights = [-0.1] * 10
        with pytest.raises(ValueError):
            obj.sample(n=3, weights=bad_weights)

    def test_sample_inf_weights(self, obj):
        # Check inf and -inf throw errors:

        weights_with_inf = [0.1] * 10
        weights_with_inf[0] = np.inf
        with pytest.raises(ValueError):
            obj.sample(n=3, weights=weights_with_inf)

        weights_with_ninf = [0.1] * 10
        weights_with_ninf[0] = -np.inf
        with pytest.raises(ValueError):
            obj.sample(n=3, weights=weights_with_ninf)

    def test_sample_zero_weights(self, obj):
        # All zeros raises errors

        zero_weights = [0] * 10
        with pytest.raises(ValueError):
            obj.sample(n=3, weights=zero_weights)

    def test_sample_missing_weights(self, obj):
        # All missing weights

        nan_weights = [np.nan] * 10
        with pytest.raises(ValueError):
            obj.sample(n=3, weights=nan_weights)

    def test_sample_none_weights(self, obj):
        # Check None are also replaced by zeros: position 5 is the only entry
        # with a positive weight, so it must be the one that gets sampled.
        weights_with_None = [None] * 10
        weights_with_None[5] = 0.5
        tm.assert_equal(
            obj.sample(n=1, axis=0, weights=weights_with_None), obj.iloc[5:6]
        )

    @pytest.mark.parametrize(
        "func_str,arg",
        [
            ("np.array", [2, 3, 1, 0]),
            pytest.param(
                "np.random.MT19937",
                3,
                marks=pytest.mark.skipif(np_version_under1p17, reason="NumPy<1.17"),
            ),
            pytest.param(
                "np.random.PCG64",
                11,
                marks=pytest.mark.skipif(np_version_under1p17, reason="NumPy<1.17"),
            ),
        ],
    )
    @pytest.mark.parametrize("klass", [Series, DataFrame])
    def test_sample_random_state(self, func_str, arg, klass):
        # GH#32503
        # Passing an array / bit generator directly as random_state must give
        # the same sample as passing it through ``com.random_state`` first.
        obj = DataFrame({"col1": range(10, 20), "col2": range(20, 30)})
        if klass is Series:
            obj = obj["col1"]
        # ``func_str`` is one of the hard-coded literals above (never external
        # input), so ``eval`` is safe here. Presumably the constructors are
        # spelled as strings so that the decorator does not have to touch
        # attributes that are skipped for NumPy<1.17 -- TODO confirm.
        result = obj.sample(n=3, random_state=eval(func_str)(arg))
        expected = obj.sample(n=3, random_state=com.random_state(eval(func_str)(arg)))
        tm.assert_equal(result, expected)

    @pytest.mark.parametrize("klass", [Series, DataFrame])
    def test_sample_upsampling_without_replacement(self, klass):
        # GH#27451
        # frac > 1 cannot be satisfied without replacement

        obj = DataFrame({"A": list("abc")})
        if klass is Series:
            obj = obj["A"]

        msg = (
            "Replace has to be set to `True` when "
            "upsampling the population `frac` > 1."
        )
        with pytest.raises(ValueError, match=msg):
            obj.sample(frac=2, replace=False)
| 182 | + |
| 183 | + |
class TestSampleDataFrame:
    # ``sample`` tests that only make sense for DataFrame, so they cannot be
    # parametrized over Series/DataFrame any further.

    def test_sample(self):
        # GH#2419
        # DataFrame-specific checks in addition to the shared tests

        # Degenerate weights: all of the weight sits on row 5.
        row5_only = [1 if pos == 5 else 0 for pos in range(10)]

        frame = DataFrame(
            {
                "col1": range(10, 20),
                "col2": range(20, 30),
                "colString": ["a"] * 10,
                "easyweights": row5_only,
            }
        )
        result = frame.sample(n=1, weights="easyweights")
        tm.assert_frame_equal(result, frame.iloc[5:6])

        # A string for ``weights`` is an error for a Series and for a
        # DataFrame sampled along axis=1.
        series = Series(range(10))
        with pytest.raises(ValueError):
            series.sample(n=3, weights="weight_column")

        with pytest.raises(ValueError):
            frame.sample(n=1, weights="weight_column", axis=1)

        # A string that names no column raises KeyError
        with pytest.raises(
            KeyError, match="'String passed to weights not a valid column'"
        ):
            frame.sample(n=3, weights="not_a_real_column_name")

        # Weights that do not sum to one are re-normalized
        half_on_first = [0.5] + [0] * 9
        result = frame.sample(n=1, weights=half_on_first)
        tm.assert_frame_equal(result, frame.iloc[:1])

        ###
        # axis argument
        ###

        frame = DataFrame({"col1": range(10), "col2": ["a"] * 10})

        # Column sampling, spelled both as an integer and as a name
        pick_second_column = [0, 1]
        for column_axis in (1, "columns"):
            tm.assert_frame_equal(
                frame.sample(n=1, axis=column_axis, weights=pick_second_column),
                frame[["col2"]],
            )

        # Row sampling, spelled with both accepted names
        half_on_row5 = [0] * 5 + [0.5] + [0] * 4
        for row_axis in ("rows", "index"):
            tm.assert_frame_equal(
                frame.sample(n=1, axis=row_axis, weights=half_on_row5),
                frame.iloc[5:6],
            )

        # Axis values that do not exist
        for bad_axis in (2, "not_a_name"):
            with pytest.raises(ValueError):
                frame.sample(n=1, axis=bad_axis)

        with pytest.raises(ValueError):
            Series(range(10)).sample(n=1, axis=1)

        # Weight length is validated against the axis being sampled
        with pytest.raises(ValueError):
            frame.sample(n=1, axis=1, weights=[0.5] * 10)

    def test_sample_axis1(self):
        # Weights applied along axis=1: only the third column can be drawn
        last_column_only = [0, 0, 1]

        frame = DataFrame(
            {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10}
        )
        result = frame.sample(n=1, axis=1, weights=last_column_only)
        tm.assert_frame_equal(result, frame[["colString"]])

        # Omitting ``axis`` is the same as axis=0
        default_axis = frame.sample(n=3, random_state=42)
        explicit_axis = frame.sample(n=3, axis=0, random_state=42)
        tm.assert_frame_equal(default_axis, explicit_axis)

    def test_sample_aligns_weights_with_frame(self):
        # Weights given as a Series are aligned with the frame by index label
        frame = DataFrame(
            {"col1": [5, 6, 7], "col2": ["a", "b", "c"]}, index=[9, 5, 3]
        )
        expected = frame.loc[[3]]

        weights = Series([1, 0, 0], index=[3, 5, 9])
        tm.assert_frame_equal(expected, frame.sample(1, weights=weights))

        # Weight labels missing from the sampled frame (10 here) are dropped
        weights_extra_label = Series([0.001, 0, 10000], index=[3, 5, 10])
        tm.assert_frame_equal(expected, frame.sample(1, weights=weights_extra_label))

        # Frame labels without a weight (9 here) are filled with zeros
        weights_missing_label = Series([0.01, 0], index=[3, 5])
        tm.assert_frame_equal(
            expected, frame.sample(1, weights=weights_missing_label)
        )

        # No overlap at all between weight labels and frame labels
        weights_disjoint = Series([1, 0], index=[1, 2])
        with pytest.raises(ValueError):
            frame.sample(1, weights=weights_disjoint)

    def test_sample_is_copy(self):
        # GH#27357, GH#30784: the result of sample must be a real copy that
        # does not track the parent frame, so assigning a new column to it
        # must not emit any warning (e.g. SettingWithCopy).
        parent = DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"])
        sampled = parent.sample(3)

        with tm.assert_produces_warning(None):
            sampled["d"] = 1
0 commit comments