pandas-dev · jreback · Jun 14, 2020 · May 8, 2020 · May 8, 2020 · May 8, 2020
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -150,7 +150,7 @@ Other enhancements
   such as ``dict`` and ``list``, mirroring the behavior of :meth:`DataFrame.update` (:issue:`33215`)
 - :meth:`~pandas.core.groupby.GroupBy.transform` and :meth:`~pandas.core.groupby.GroupBy.aggregate` has gained ``engine`` and ``engine_kwargs`` arguments that supports executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`)
 - :meth:`~pandas.core.resample.Resampler.interpolate` now supports SciPy interpolation method :class:`scipy.interpolate.CubicSpline` as method ``cubicspline`` (:issue:`33670`)
--
+- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` now implement the ``sample`` method for doing random sampling within groups (:issue:`31775`)
 
 .. ---------------------------------------------------------------------------
 

diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py
@@ -180,6 +180,7 @@ def _gotitem(self, key, ndim, subset=None):
         "tail",
         "take",
         "transform",
+        "sample",
     ]
 )
 # Valid values  of `name` for `groupby.transform(name)`

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -2630,6 +2630,93 @@ def _reindex_output(
 
         return output.reset_index(drop=True)
 
+    def sample(
+        self,
+        n: Optional[int] = None,
+        frac: Optional[float] = None,
+        replace: bool = False,
+        weights=None,
+        random_state=None,
+    ):
+        """
+        Return a random sample of items from each group.
+
+        You can use `random_state` for reproducibility.
+
+        .. versionadded:: 1.1.0
+
+        Parameters
+        ----------
+        n : int, optional
+            Number of items to return. Cannot be used with `frac`.
+            Default = 1 if `frac` is None.
+        frac : float, optional
+            Fraction of items to return. Cannot be used with `n`.
+        replace : bool, default False
+            Allow or disallow sampling of the same row more than once.
+        weights : list-like, optional
+            Default None results in equal probability weighting.
+            If passed a list-like then values must have the same length as
+            the underlying object and will be used as sampling probabilities
+            after normalization within each group.
+        random_state : int, array-like, BitGenerator, np.random.RandomState, optional
+            If int, array-like, or BitGenerator (NumPy>=1.17), seed for
 def random_state(state=None): 
 def random_state(state=None): 
+            random number generator
+            If np.random.RandomState, use as numpy RandomState object.
+
+        Returns
+        -------
+        Series or DataFrame
+            A new object of same type as caller containing items randomly
+            sampled within each group from the caller object.
+
+        See Also
+        --------
+        DataFrame.sample: Generate random samples from a DataFrame object.
+        numpy.random.choice: Generate a random sample from a given 1-D numpy
+            array.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)}
+        ... )
+        >>> df
+               a  b
+        0    red  0
+        1    red  1
+        2   blue  2
+        3   blue  3
+        4  black  4
+        5  black  5
+        >>> df.groupby("a").sample(n=1, random_state=1)
+               a  b
+        4  black  4
+        2   blue  2
+        0    red  0
+        >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2)
+        5    5
+        3    3
+        1    1
+        Name: b, dtype: int64
+        """
+        from pandas.core.reshape.concat import concat
+
+        if weights is not None:
+            weights = Series(weights, index=self._selected_obj.index)
+            ws = [weights[idx] for idx in self.indices.values()]
+        else:
+            ws = [None] * self.ngroups
+
+        samples = [
+            self._selected_obj.loc[idx].sample(
+                n=n, frac=frac, replace=replace, weights=w, random_state=random_state
+            )
+            for idx, w in zip(self.indices.values(), ws)
+        ]
+
+        return concat(samples, axis=self.axis)
+
 
 GroupBy._add_numeric_operations()
 

diff --git a/pandas/tests/groupby/test_sample.py b/pandas/tests/groupby/test_sample.py
@@ -0,0 +1,111 @@
+import pytest
+
+from pandas import DataFrame, Index, Series
+import pandas._testing as tm
+
+
+@pytest.mark.parametrize("n, frac", [(2, None), (None, 0.2)])
+def test_groupby_sample_balanced_groups_shape(n, frac):
+    values = [1] * 10 + [2] * 10
+    df = DataFrame({"a": values, "b": values})
+
+    result = df.groupby("a").sample(n=n, frac=frac)
+    values = [1] * 2 + [2] * 2
+    expected = DataFrame({"a": values, "b": values}, index=result.index)
+    tm.assert_frame_equal(result, expected)
+
+    result = df.groupby("a")["b"].sample(n=n, frac=frac)
+    expected = Series(values, name="b", index=result.index)
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_sample_unbalanced_groups_shape():
+    values = [1] * 10 + [2] * 20
+    df = DataFrame({"a": values, "b": values})
+
+    result = df.groupby("a").sample(n=5)
+    values = [1] * 5 + [2] * 5
+    expected = DataFrame({"a": values, "b": values}, index=result.index)
+    tm.assert_frame_equal(result, expected)
+
+    result = df.groupby("a")["b"].sample(n=5)
+    expected = Series(values, name="b", index=result.index)
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_sample_n_and_frac_raises():
+    df = DataFrame({"a": [1, 2], "b": [1, 2]})
+    msg = "Please enter a value for `frac` OR `n`, not both"
+
+    with pytest.raises(ValueError, match=msg):
+        df.groupby("a").sample(n=1, frac=1.0)
+
+    with pytest.raises(ValueError, match=msg):
+        df.groupby("a")["b"].sample(n=1, frac=1.0)
+
+
+def test_groupby_sample_frac_gt_one_without_replacement_raises():
+    df = DataFrame({"a": [1, 2], "b": [1, 2]})
+    msg = "Replace has to be set to `True` when upsampling the population `frac` > 1."
+
+    with pytest.raises(ValueError, match=msg):
+        df.groupby("a").sample(frac=1.5, replace=False)
+
+    with pytest.raises(ValueError, match=msg):
+        df.groupby("a")["b"].sample(frac=1.5, replace=False)
+
+
+@pytest.mark.parametrize("n", [-1, 1.5])
+def test_groupby_sample_invalid_n_raises(n):
+    df = DataFrame({"a": [1, 2], "b": [1, 2]})
+
+    if n < 0:
+        msg = "Please provide positive value"
+    else:
+        msg = "Only integers accepted as `n` values"
+
+    with pytest.raises(ValueError, match=msg):
+        df.groupby("a").sample(n=n)
+
+    with pytest.raises(ValueError, match=msg):
+        df.groupby("a")["b"].sample(n=n)
+
+
+def test_groupby_sample_oversample():
+    values = [1] * 10 + [2] * 10
+    df = DataFrame({"a": values, "b": values})
+
+    result = df.groupby("a").sample(frac=2.0, replace=True)
+    values = [1] * 20 + [2] * 20
+    expected = DataFrame({"a": values, "b": values}, index=result.index)
+    tm.assert_frame_equal(result, expected)
+
+    result = df.groupby("a")["b"].sample(frac=2.0, replace=True)
+    expected = Series(values, name="b", index=result.index)
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_sample_without_n_or_frac():
+    values = [1] * 10 + [2] * 10
+    df = DataFrame({"a": values, "b": values})
+
+    result = df.groupby("a").sample(n=None, frac=None)
+    expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=result.index)
+    tm.assert_frame_equal(result, expected)
+
+    result = df.groupby("a")["b"].sample(n=None, frac=None)
+    expected = Series([1, 2], name="b", index=result.index)
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_sample_with_weights():
+    values = [1] * 2 + [2] * 2
+    df = DataFrame({"a": values, "b": values}, index=Index([0, 1, 2, 3]))
+
+    result = df.groupby("a").sample(n=2, replace=True, weights=[1, 0, 1, 0])
+    expected = DataFrame({"a": values, "b": values}, index=Index([0, 0, 2, 2]))
+    tm.assert_frame_equal(result, expected)
+
+    result = df.groupby("a")["b"].sample(n=2, replace=True, weights=[1, 0, 1, 0])
+    expected = Series(values, name="b", index=Index([0, 0, 2, 2]))
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py
@@ -328,6 +328,7 @@ def test_tab_completion(mframe):
         "rolling",
         "expanding",
         "pipe",
+        "sample",
     }
     assert results == expected
-Original file line number
+Diff line change
@@ Expand Up / @@ -180,6 +180,7 @@ def _gotitem(self, key, ndim, subset=None): @@
             "tail",
             "take",
             "transform",
+            "sample",
         ]
     )
     # Valid values  of `name` for `groupby.transform(name)`
@@ Expand Down @@