Adding more documentation for upsampling with replacement and error m… (pandas-dev#29444)

CooperData · Mateusz Górski · commit 3e126fc29a86 · 2019-11-18T15:09:57.000+01:00
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -333,6 +333,7 @@ Numeric
 - :class:`DataFrame` flex inequality comparisons methods (:meth:`DataFrame.lt`, :meth:`DataFrame.le`, :meth:`DataFrame.gt`, :meth: `DataFrame.ge`) with object-dtype and ``complex`` entries failing to raise ``TypeError`` like their :class:`Series` counterparts (:issue:`28079`)
 - Bug in :class:`DataFrame` logical operations (`&`, `|`, `^`) not matching :class:`Series` behavior by filling NA values (:issue:`28741`)
 - Bug in :meth:`DataFrame.interpolate` where specifying axis by name references variable before it is assigned (:issue:`29142`)
+- Improved error message in :meth:`DataFrame.sample` when using `frac` > 1 with `replace` = False (:issue:`27451`)
 -
 
 Conversion
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -4924,6 +4924,10 @@ def sample(
         numpy.random.choice: Generates a random sample from a given 1-D numpy
             array.
 
+        Notes
+        -----
+        If `frac` > 1, `replace` should be set to `True`.
+
         Examples
         --------
         >>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0],
@@ -4954,6 +4958,20 @@ def sample(
         dog          4          0                  2
         fish         0          0                  8
 
+        An upsampled sample of the ``DataFrame``, drawn with replacement.
+        Note that the `replace` parameter has to be `True` when `frac` > 1:
+
+        >>> df.sample(frac=2, replace=True, random_state=1)
+                num_legs  num_wings  num_specimen_seen
+        dog            4          0                  2
+        fish           0          0                  8
+        falcon         2          2                 10
+        falcon         2          2                 10
+        fish           0          0                  8
+        dog            4          0                  2
+        fish           0          0                  8
+        dog            4          0                  2
+
         Using a DataFrame column as weights. Rows with larger value in the
         `num_specimen_seen` column are more likely to be sampled.
 
@@ -5029,6 +5047,11 @@ def sample(
         # If no frac or n, default to n=1.
         if n is None and frac is None:
             n = 1
+        elif frac is not None and frac > 1 and not replace:
+            raise ValueError(
+                "Replace has to be set to `True` when "
+                "upsampling the population `frac` > 1."
+            )
         elif n is not None and frac is None and n % 1 != 0:
             raise ValueError("Only integers accepted as `n` values")
         elif n is None and frac is not None:
diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py
@@ -322,6 +322,7 @@ def test_sample(self):
             self._compare(
                 o.sample(n=4, random_state=seed), o.sample(n=4, random_state=seed)
             )
+
             self._compare(
                 o.sample(frac=0.7, random_state=seed),
                 o.sample(frac=0.7, random_state=seed),
@@ -337,6 +338,15 @@ def test_sample(self):
                 o.sample(frac=0.7, random_state=np.random.RandomState(test)),
             )
 
+            self._compare(
+                o.sample(
+                    frac=2, replace=True, random_state=np.random.RandomState(test)
+                ),
+                o.sample(
+                    frac=2, replace=True, random_state=np.random.RandomState(test)
+                ),
+            )
+
             os1, os2 = [], []
             for _ in range(2):
                 np.random.seed(test)
@@ -424,6 +434,17 @@ def test_sample(self):
         weights_with_None[5] = 0.5
         self._compare(o.sample(n=1, axis=0, weights=weights_with_None), o.iloc[5:6])
 
+    def test_sample_upsampling_without_replacement(self):
+        # GH27451
+
+        df = pd.DataFrame({"A": list("abc")})
+        msg = (
+            "Replace has to be set to `True` when "
+            "upsampling the population `frac` > 1."
+        )
+        with pytest.raises(ValueError, match=msg):
+            df.sample(frac=2, replace=False)
+
     def test_size_compat(self):
         # GH8846
         # size property should be defined

Original file line number	Diff line number	Diff line change
`@@ -333,6 +333,7 @@ Numeric`
`333`	`333`	- :class:`DataFrame` flex inequality comparisons methods (:meth:`DataFrame.lt`, :meth:`DataFrame.le`, :meth:`DataFrame.gt`, :meth: `DataFrame.ge`) with object-dtype and ``complex`` entries failing to raise ``TypeError`` like their :class:`Series` counterparts (:issue:`28079`)
`334`	`334`	- Bug in :class:`DataFrame` logical operations (`&`, `\|`, `^`) not matching :class:`Series` behavior by filling NA values (:issue:`28741`)
`335`	`335`	- Bug in :meth:`DataFrame.interpolate` where specifying axis by name references variable before it is assigned (:issue:`29142`)
	`336`	+- Improved error message in :meth:`DataFrame.sample` when using `frac` > 1 with `replace` = False (:issue:`27451`)
`336`	`337`	`-`
`337`	`338`
`338`	`339`	`Conversion`