Skip to content

Commit 3e126fc

Browse files
CooperDataMateusz Górski
authored and
Mateusz Górski
committed
Adding more documentation for upsampling with replacement and error m… (pandas-dev#29444)
1 parent 1eaa27b commit 3e126fc

File tree

3 files changed

+45
-0
lines changed

3 files changed

+45
-0
lines changed

doc/source/whatsnew/v1.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,7 @@ Numeric
333333
- :class:`DataFrame` flex inequality comparison methods (:meth:`DataFrame.lt`, :meth:`DataFrame.le`, :meth:`DataFrame.gt`, :meth:`DataFrame.ge`) with object-dtype and ``complex`` entries failing to raise ``TypeError`` like their :class:`Series` counterparts (:issue:`28079`)
334334
- Bug in :class:`DataFrame` logical operations (`&`, `|`, `^`) not matching :class:`Series` behavior by filling NA values (:issue:`28741`)
335335
- Bug in :meth:`DataFrame.interpolate` where specifying axis by name references variable before it is assigned (:issue:`29142`)
336+
- Improved error message in :meth:`DataFrame.sample` when using `frac` > 1 and `replace` = False (:issue:`27451`)
336337
-
337338

338339
Conversion

pandas/core/generic.py

+23
Original file line numberDiff line numberDiff line change
@@ -4924,6 +4924,10 @@ def sample(
49244924
numpy.random.choice: Generates a random sample from a given 1-D numpy
49254925
array.
49264926
4927+
Notes
4928+
-----
4929+
If `frac` > 1, `replace` should be set to `True`.
4930+
49274931
Examples
49284932
--------
49294933
>>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0],
@@ -4954,6 +4958,20 @@ def sample(
49544958
dog 4 0 2
49554959
fish 0 0 8
49564960
4961+
An upsampled sample of the ``DataFrame`` with replacement:
4962+
Note that the `replace` parameter has to be `True` when the `frac` parameter is > 1.
4963+
4964+
>>> df.sample(frac=2, replace=True, random_state=1)
4965+
num_legs num_wings num_specimen_seen
4966+
dog 4 0 2
4967+
fish 0 0 8
4968+
falcon 2 2 10
4969+
falcon 2 2 10
4970+
fish 0 0 8
4971+
dog 4 0 2
4972+
fish 0 0 8
4973+
dog 4 0 2
4974+
49574975
Using a DataFrame column as weights. Rows with larger value in the
49584976
`num_specimen_seen` column are more likely to be sampled.
49594977
@@ -5029,6 +5047,11 @@ def sample(
50295047
# If no frac or n, default to n=1.
50305048
if n is None and frac is None:
50315049
n = 1
5050+
elif frac is not None and frac > 1 and not replace:
5051+
raise ValueError(
5052+
"Replace has to be set to `True` when "
5053+
"upsampling the population `frac` > 1."
5054+
)
50325055
elif n is not None and frac is None and n % 1 != 0:
50335056
raise ValueError("Only integers accepted as `n` values")
50345057
elif n is None and frac is not None:

pandas/tests/generic/test_generic.py

+21
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,7 @@ def test_sample(self):
322322
self._compare(
323323
o.sample(n=4, random_state=seed), o.sample(n=4, random_state=seed)
324324
)
325+
325326
self._compare(
326327
o.sample(frac=0.7, random_state=seed),
327328
o.sample(frac=0.7, random_state=seed),
@@ -337,6 +338,15 @@ def test_sample(self):
337338
o.sample(frac=0.7, random_state=np.random.RandomState(test)),
338339
)
339340

341+
self._compare(
342+
o.sample(
343+
frac=2, replace=True, random_state=np.random.RandomState(test)
344+
),
345+
o.sample(
346+
frac=2, replace=True, random_state=np.random.RandomState(test)
347+
),
348+
)
349+
340350
os1, os2 = [], []
341351
for _ in range(2):
342352
np.random.seed(test)
@@ -424,6 +434,17 @@ def test_sample(self):
424434
weights_with_None[5] = 0.5
425435
self._compare(o.sample(n=1, axis=0, weights=weights_with_None), o.iloc[5:6])
426436

437+
def test_sample_upsampling_without_replacement(self):
438+
# GH27451
439+
440+
df = pd.DataFrame({"A": list("abc")})
441+
msg = (
442+
"Replace has to be set to `True` when "
443+
"upsampling the population `frac` > 1."
444+
)
445+
with pytest.raises(ValueError, match=msg):
446+
df.sample(frac=2, replace=False)
447+
427448
def test_size_compat(self):
428449
# GH8846
429450
# size property should be defined

0 commit comments

Comments
 (0)