From 5f6c210c168cc55e90c79916133a97a9992e599f Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Mon, 21 Jun 2021 21:21:09 -0400
Subject: [PATCH 01/11] WIP: move sample() weight and size handling into core.algorithms

---
 pandas/core/algorithms.py      | 96 ++++++++++++++++++++++++++++++++++
 pandas/core/generic.py         | 90 +++----------------------------
 pandas/core/groupby/groupby.py | 13 +++--
 3 files changed, 113 insertions(+), 86 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index f4a6b0b1c1694..e3a0e1a9ce7e8 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -28,6 +28,8 @@
 from pandas._typing import (
     AnyArrayLike,
     ArrayLike,
+    FrameOrSeries,
+    RandomState,
     DtypeObj,
     FrameOrSeriesUnion,
     Scalar,
@@ -63,6 +65,7 @@
 from pandas.core.dtypes.dtypes import PandasDtype
 from pandas.core.dtypes.generic import (
     ABCDatetimeArray,
+    ABCDataFrame,
     ABCExtensionArray,
     ABCIndex,
     ABCMultiIndex,
@@ -1895,3 +1898,96 @@ def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike:
     for i, value in enumerate(unique_array):
         indexer += [i] * int(max(l_count[value], r_count[value]))
     return unique_array.take(indexer)
+
+
+def preprocess_weights(obj: DataFrame | Series, weights, axis: int):
+    if weights is not None:
+
+        # If a series, align with frame
+        if isinstance(weights, ABCSeries):
+            weights = weights.reindex(obj.axes[axis])
+
+        # Strings acceptable if a dataframe and axis = 0
+        if isinstance(weights, str):
+            if isinstance(obj, ABCDataFrame):
+                if axis == 0:
+                    try:
+                        weights = obj[weights]
+                    except KeyError as err:
+                        raise KeyError(
+                            "String passed to weights not a valid column"
+                        ) from err
+                else:
+                    raise ValueError(
+                        "Strings can only be passed to "
+                        "weights when sampling from rows on "
+                        "a DataFrame"
+                    )
+            else:
+                raise ValueError(
+                    "Strings cannot be passed as weights "
+                    "when sampling from a Series."
+                )
+
+        if isinstance(obj, ABCSeries):
+            func = obj._constructor
+        else:
+            func = obj._constructor_sliced
+
+        weights = func(weights, dtype="float64")._values
+
+        if len(weights) != obj.shape[axis]:
+            raise ValueError("Weights and axis to be sampled must be of same length")
+
+        if lib.has_infs(weights):
+            raise ValueError("weight vector may not include `inf` values")
+
+        if (weights < 0).any():
+            raise ValueError("weight vector many not include negative values")
+
+        weights[np.isnan(weights)] = 0
+        return weights
+
+
+def process_sampling_size(n, frac: float | None, replace: bool):
+    # If no frac or n, default to n=1.
+    if n is None and frac is None:
+        n = 1
+    elif frac is not None and frac > 1 and not replace:
+        raise ValueError(
+            "Replace has to be set to `True` when "
+            "upsampling the population `frac` > 1."
+        )
+    elif frac is None and n % 1 != 0:
+        raise ValueError("Only integers accepted as `n` values")
+    elif frac is not None:
+        raise ValueError("Please enter a value for `frac` OR `n`, not both")
+
+    # Check for negative sizes
+    if n < 0:
+        raise ValueError(
+            "A negative number of rows requested. Please provide positive value."
+        )
+
+    return n
+
+
+def sample(
+    obj: FrameOrSeries,
+    size: int,
+    replace: bool,
+    weights: np.ndarray,
+    random_state: RandomState,
+    axis: int,
+) -> FrameOrSeries:
+    axis_length = obj.shape[axis]
+
+    if weights is not None:
+        weight_sum = weights.sum()
+        if weight_sum != 0:
+            weights = weights / weight_sum
+        else:
+            raise ValueError("Invalid weights: weights sum to zero")
+
+    locs = random_state.choice(axis_length, size=size, replace=replace, p=weights)
+    return obj.take(locs, axis=axis)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 5bd845534fc96..e926a10175a5f 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -5265,92 +5265,18 @@ def sample(
             axis = self._stat_axis_number
 
         axis = self._get_axis_number(axis)
-        axis_length = self.shape[axis]
 
         # Process random_state argument
         rs = com.random_state(random_state)
 
-        # Check weights for compliance
-        if weights is not None:
-
-            # If a series, align with frame
-            if isinstance(weights, ABCSeries):
-                weights = weights.reindex(self.axes[axis])
-
-            # Strings acceptable if a dataframe and axis = 0
-            if isinstance(weights, str):
-                if isinstance(self, ABCDataFrame):
-                    if axis == 0:
-                        try:
-                            weights = self[weights]
-                        except KeyError as err:
-                            raise KeyError(
-                                "String passed to weights not a valid column"
-                            ) from err
-                    else:
-                        raise ValueError(
-                            "Strings can only be passed to "
-                            "weights when sampling from rows on "
-                            "a DataFrame"
-                        )
-                else:
-                    raise ValueError(
-                        "Strings cannot be passed as weights "
-                        "when sampling from a Series."
-                    )
-
-            if isinstance(self, ABCSeries):
-                func = self._constructor
-            else:
-                func = self._constructor_sliced
-            weights = func(weights, dtype="float64")
-
-            if len(weights) != axis_length:
-                raise ValueError(
-                    "Weights and axis to be sampled must be of same length"
-                )
-
-            if (weights == np.inf).any() or (weights == -np.inf).any():
-                raise ValueError("weight vector may not include `inf` values")
-
-            if (weights < 0).any():
-                raise ValueError("weight vector many not include negative values")
-
-            # If has nan, set to zero.
-            weights = weights.fillna(0)
-
-            # Renormalize if don't sum to 1
-            if weights.sum() != 1:
-                if weights.sum() != 0:
-                    weights = weights / weights.sum()
-                else:
-                    raise ValueError("Invalid weights: weights sum to zero")
-
-            weights = weights._values
-
-        # If no frac or n, default to n=1.
-        if n is None and frac is None:
-            n = 1
-        elif frac is not None and frac > 1 and not replace:
-            raise ValueError(
-                "Replace has to be set to `True` when "
-                "upsampling the population `frac` > 1."
-            )
-        elif frac is None and n % 1 != 0:
-            raise ValueError("Only integers accepted as `n` values")
-        elif n is None and frac is not None:
-            n = round(frac * axis_length)
-        elif frac is not None:
-            raise ValueError("Please enter a value for `frac` OR `n`, not both")
-
-        # Check for negative sizes
-        if n < 0:
-            raise ValueError(
-                "A negative number of rows requested. Please provide positive value."
-            )
-
-        locs = rs.choice(axis_length, size=n, replace=replace, p=weights)
-        return self.take(locs, axis=axis)
+        n = algos.process_sampling_size(n, frac, replace)
+        if n is None:
+            assert frac is not None
+            size = round(frac * self.shape[axis])
+        else:
+            size = n
+        weights = algos.preprocess_weights(self, weights, axis)
+        return algos.sample(self, size, replace, weights, rs, axis)
 
     @final
     @doc(klass=_shared_doc_kwargs["klass"])
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 0080791a51a4b..7d766c8569239 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -3272,19 +3272,24 @@ def sample(
         """
         from pandas.core.reshape.concat import concat
 
+        n = algorithms.process_sampling_size(n, frac, replace)
+        if n is None:
+            assert frac is not None
+            sizes = []
+
         if weights is not None:
+            weights = algorithms.preprocess_weights(self, weights, axis=0)
             weights = Series(weights, index=self._selected_obj.index)
             ws = [weights.iloc[idx] for idx in self.indices.values()]
         else:
             ws = [None] * self.ngroups
 
-        if random_state is not None:
-            random_state = com.random_state(random_state)
+        random_state = com.random_state(random_state)
 
         group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis)
         samples = [
-            obj.sample(
-                n=n, frac=frac, replace=replace, weights=w, random_state=random_state
+            algorithms.sample(
+                self, size=size, replace=replace, weights=w, random_state=random_state, axis=0
             )
             for (_, obj), w in zip(group_iterator, ws)
         ]

From 6dc14851ebd21e8658110eb084b84a8bfdb1f050 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Thu, 24 Jun 2021 22:36:55 -0400
Subject: [PATCH 02/11] WIP: rework sampling-size validation and per-group sizes in GroupBy.sample

---
 pandas/core/algorithms.py                 | 44 +++++++++++++----------
 pandas/core/generic.py                    |  7 ++--
 pandas/core/groupby/groupby.py            | 22 ++++++++----
 pandas/tests/frame/methods/test_sample.py | 11 ++++--
 pandas/tests/groupby/test_sample.py       |  2 +-
 5 files changed, 53 insertions(+), 33 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index e3a0e1a9ce7e8..5c30c673b15b9 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -28,10 +28,10 @@
 from pandas._typing import (
     AnyArrayLike,
     ArrayLike,
-    FrameOrSeries,
-    RandomState,
     DtypeObj,
+    FrameOrSeries,
     FrameOrSeriesUnion,
+    RandomState,
     Scalar,
 )
 from pandas.util._decorators import doc
@@ -64,8 +64,8 @@
 )
 from pandas.core.dtypes.dtypes import PandasDtype
 from pandas.core.dtypes.generic import (
-    ABCDatetimeArray,
     ABCDataFrame,
+    ABCDatetimeArray,
     ABCExtensionArray,
     ABCIndex,
     ABCMultiIndex,
@@ -1939,7 +1939,7 @@ def preprocess_weights(obj: DataFrame | Series, weights, axis: int):
         if len(weights) != obj.shape[axis]:
             raise ValueError("Weights and axis to be sampled must be of same length")
 
-        if lib.has_infs(weights):
+        if lib.has_infs_f8(weights):
             raise ValueError("weight vector may not include `inf` values")
 
         if (weights < 0).any():
@@ -1949,25 +1949,31 @@ def preprocess_weights(obj: DataFrame | Series, weights, axis: int):
         return weights
 
 
-def process_sampling_size(n, frac: float | None, replace: bool):
+def process_sampling_size(
+    n: int | None, frac: float | None, replace: bool
+) -> int | None:
     # If no frac or n, default to n=1.
     if n is None and frac is None:
         n = 1
-    elif frac is not None and frac > 1 and not replace:
-        raise ValueError(
-            "Replace has to be set to `True` when "
-            "upsampling the population `frac` > 1."
-        )
-    elif frac is None and n % 1 != 0:
-        raise ValueError("Only integers accepted as `n` values")
-    elif frac is not None:
+    elif n is not None and frac is not None:
         raise ValueError("Please enter a value for `frac` OR `n`, not both")
-
-    # Check for negative sizes
-    if n < 0:
-        raise ValueError(
-            "A negative number of rows requested. Please provide positive value."
-        )
+    elif n is not None:
+        if n < 0:
+            raise ValueError(
+                "A negative number of rows requested. Please provide `n` >= 0."
+            )
+        if n % 1 != 0:
+            raise ValueError("Only integers accepted as `n` values")
+    else:
+        if frac > 1 and not replace:
+            raise ValueError(
+                "Replace has to be set to `True` when "
+                "upsampling the population `frac` > 1."
+            )
+        if frac < 0:
+            raise ValueError(
+                "A negative number of rows requested. Please provide `frac` >= 0."
+            )
 
     return n
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 144e60c017212..a6fb327e917fa 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -5269,12 +5269,11 @@ def sample(
         # Process random_state argument
         rs = com.random_state(random_state)
 
-        n = algos.process_sampling_size(n, frac, replace)
-        if n is None:
+        size = algos.process_sampling_size(n, frac, replace)
+        if size is None:
             assert frac is not None
             size = round(frac * self.shape[axis])
-        else:
-            size = n
+
         weights = algos.preprocess_weights(self, weights, axis)
         return algos.sample(self, size, replace, weights, rs, axis)
 
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 7f36d40d16fbd..30d73274598eb 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -3275,12 +3275,17 @@ def sample(
         n = algorithms.process_sampling_size(n, frac, replace)
         if n is None:
             assert frac is not None
-            sizes = []
+            sizes = np.zeros(self.ngroups, dtype="i8")
+            for i, idx in enumerate(self.indices.values()):
+                sizes[i] = round(frac * len(idx))
+        else:
+            sizes = np.full(self.ngroups, n, dtype="i8")
 
         if weights is not None:
-            weights = algorithms.preprocess_weights(self, weights, axis=0)
-            weights = Series(weights, index=self._selected_obj.index)
-            ws = [weights.iloc[idx] for idx in self.indices.values()]
+            weights = algorithms.preprocess_weights(
+                self._selected_obj, weights, axis=self.axis
+            )
+            ws = [weights[idx] for idx in self.indices.values()]
         else:
             ws = [None] * self.ngroups
 
@@ -3289,9 +3294,14 @@ def sample(
         group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis)
         samples = [
             algorithms.sample(
-                self, size=size, replace=replace, weights=w, random_state=random_state, axis=0
+                obj,
+                size=sizes[i],
+                replace=replace,
+                weights=w,
+                random_state=random_state,
+                axis=self.axis,
             )
-            for (_, obj), w in zip(group_iterator, ws)
+            for i, ((_, obj), w) in enumerate(zip(group_iterator, ws))
         ]
 
         return concat(samples, axis=self.axis)
diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py
index 55ef665c55241..de365c70b62c1 100644
--- a/pandas/tests/frame/methods/test_sample.py
+++ b/pandas/tests/frame/methods/test_sample.py
@@ -83,10 +83,15 @@ def test_sample_wont_accept_n_and_frac(self, obj):
             obj.sample(n=3, frac=0.3)
 
     def test_sample_requires_positive_n_frac(self, obj):
-        msg = "A negative number of rows requested. Please provide positive value."
-        with pytest.raises(ValueError, match=msg):
+        with pytest.raises(
+            ValueError,
+            match="A negative number of rows requested. Please provide `n` >= 0",
+        ):
             obj.sample(n=-3)
-        with pytest.raises(ValueError, match=msg):
+        with pytest.raises(
+            ValueError,
+            match="A negative number of rows requested. Please provide `frac` >= 0",
+        ):
             obj.sample(frac=-0.3)
 
     def test_sample_requires_integer_n(self, obj):
diff --git a/pandas/tests/groupby/test_sample.py b/pandas/tests/groupby/test_sample.py
index 652a5fc1a3c34..9153fac0927c5 100644
--- a/pandas/tests/groupby/test_sample.py
+++ b/pandas/tests/groupby/test_sample.py
@@ -78,7 +78,7 @@ def test_groupby_sample_invalid_n_raises(n):
     df = DataFrame({"a": [1, 2], "b": [1, 2]})
 
     if n < 0:
-        msg = "Please provide positive value"
+        msg = "A negative number of rows requested. Please provide `n` >= 0."
     else:
         msg = "Only integers accepted as `n` values"
 

From 45e0fe1d1bc9614d6d6897d51fe803c96bf46bd8 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Thu, 24 Jun 2021 22:46:51 -0400
Subject: [PATCH 03/11] Add asv

---
 asv_bench/benchmarks/groupby.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 1648985a56b91..a13343ce05d03 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -832,4 +832,15 @@ def function(values):
         self.grouper.agg(function, engine="cython")
 
 
+class Sample:
+    def setup(self):
+        N = 10 ** 5
+        self.df = DataFrame({"a": np.zeros(N)})
+        self.groups = np.arange(0, N)
+        self.weights = np.ones(N)
+
+    def time_sample(self):
+        self.df.groupby(self.groups).sample(n=1, weights=self.weights)
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip

From e7c7d75a4f887e8e8b1facbfe6bf3a89c7bbafc1 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 25 Jun 2021 00:20:34 -0400
Subject: [PATCH 04/11] Avoid concat

---
 asv_bench/benchmarks/groupby.py |  2 +-
 pandas/core/algorithms.py       |  3 +--
 pandas/core/generic.py          |  3 ++-
 pandas/core/groupby/groupby.py  | 38 ++++++++++++++++++++++++---------
 4 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index a13343ce05d03..b30dbad4d25d1 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -834,7 +834,7 @@ def function(values):
 
 class Sample:
     def setup(self):
-        N = 10 ** 5
+        N = 10 ** 3
         self.df = DataFrame({"a": np.zeros(N)})
         self.groups = np.arange(0, N)
         self.weights = np.ones(N)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 5c30c673b15b9..7d977a2795f62 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1995,5 +1995,4 @@ def sample(
         else:
             raise ValueError("Invalid weights: weights sum to zero")
 
-    locs = random_state.choice(axis_length, size=size, replace=replace, p=weights)
-    return obj.take(locs, axis=axis)
+    return random_state.choice(axis_length, size=size, replace=replace, p=weights)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index a6fb327e917fa..c88fcda754ee8 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -5275,7 +5275,8 @@ def sample(
             size = round(frac * self.shape[axis])
 
         weights = algos.preprocess_weights(self, weights, axis)
-        return algos.sample(self, size, replace, weights, rs, axis)
+        sampled_indices = algos.sample(self, size, replace, weights, rs, axis)
+        return self.take(sampled_indices, axis=axis)
 
     @final
     @doc(klass=_shared_doc_kwargs["klass"])
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 30d73274598eb..1ddcc9e73d6dc 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -3270,8 +3270,6 @@ def sample(
         2   blue  2
         0    red  0
         """
-        from pandas.core.reshape.concat import concat
-
         n = algorithms.process_sampling_size(n, frac, replace)
         if n is None:
             assert frac is not None
@@ -3292,19 +3290,39 @@ def sample(
         random_state = com.random_state(random_state)
 
         group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis)
-        samples = [
-            algorithms.sample(
+        sampled_indices = []
+        for i, ((ind, obj), w) in enumerate(zip(group_iterator, ws)):
+            grp_idx = self.indices[ind]
+            size = n
+            if n is None:
+                size = round(frac * len(grp_idx))
+            grp_sample = algorithms.sample(
                 obj,
-                size=sizes[i],
+                size=size,
                 replace=replace,
-                weights=w,
+                weights=None if weights is None else weights[grp_idx],
                 random_state=random_state,
                 axis=self.axis,
             )
-            for i, ((_, obj), w) in enumerate(zip(group_iterator, ws))
-        ]
-
-        return concat(samples, axis=self.axis)
+            sampled_indices.append(grp_idx[grp_sample])
+
+        sampled_indices = np.concatenate(sampled_indices)
+
+        # sampled_indices = np.concatenate([
+        #     algorithms.sample(
+        #         obj,
+        #         size=sizes[i],
+        #         replace=replace,
+        #         weights=w,
+        #         random_state=random_state,
+        #         axis=self.axis,
+        #     )
+        #     for i, ((ind, obj), w) in enumerate(zip(group_iterator, ws))
+        # ])
+        # print(sampled_indices.shape)
+        # print(self._selected_obj)
+
+        return self._selected_obj.take(sampled_indices, axis=self.axis)
 
 
 @doc(GroupBy)

From ca26efb5225fb8037c7fc14f190fa0c5b7606364 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 25 Jun 2021 00:22:53 -0400
Subject: [PATCH 05/11] Clean dead code

---
 pandas/core/groupby/groupby.py | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 1ddcc9e73d6dc..b6fc066e3ff1e 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -3271,27 +3271,16 @@ def sample(
         0    red  0
         """
         n = algorithms.process_sampling_size(n, frac, replace)
-        if n is None:
-            assert frac is not None
-            sizes = np.zeros(self.ngroups, dtype="i8")
-            for i, idx in enumerate(self.indices.values()):
-                sizes[i] = round(frac * len(idx))
-        else:
-            sizes = np.full(self.ngroups, n, dtype="i8")
-
         if weights is not None:
             weights = algorithms.preprocess_weights(
                 self._selected_obj, weights, axis=self.axis
             )
-            ws = [weights[idx] for idx in self.indices.values()]
-        else:
-            ws = [None] * self.ngroups
 
         random_state = com.random_state(random_state)
 
         group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis)
         sampled_indices = []
-        for i, ((ind, obj), w) in enumerate(zip(group_iterator, ws)):
+        for i, (ind, obj) in enumerate(group_iterator):
             grp_idx = self.indices[ind]
             size = n
             if n is None:

From f14705237bb372daa3680ba952bd2b42cdea42d4 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 25 Jun 2021 12:44:37 -0400
Subject: [PATCH 06/11] WIP: add unweighted groupby sample benchmark, drop commented-out code

---
 asv_bench/benchmarks/groupby.py |  3 +++
 pandas/core/groupby/groupby.py  | 22 ++--------------------
 2 files changed, 5 insertions(+), 20 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index b30dbad4d25d1..6ca951e946bad 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -840,6 +840,9 @@ def setup(self):
         self.weights = np.ones(N)
 
     def time_sample(self):
+        self.df.groupby(self.groups).sample(n=1)
+
+    def time_sample_weights(self):
         self.df.groupby(self.groups).sample(n=1, weights=self.weights)
 
 
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index b6fc066e3ff1e..380d2f23a6292 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -3280,14 +3280,11 @@ def sample(
 
         group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis)
         sampled_indices = []
-        for i, (ind, obj) in enumerate(group_iterator):
+        for ind, obj in group_iterator:
             grp_idx = self.indices[ind]
-            size = n
-            if n is None:
-                size = round(frac * len(grp_idx))
             grp_sample = algorithms.sample(
                 obj,
-                size=size,
+                size=n if n is None else round(frac * len(grp_idx)),
                 replace=replace,
                 weights=None if weights is None else weights[grp_idx],
                 random_state=random_state,
@@ -3296,21 +3293,6 @@ def sample(
             sampled_indices.append(grp_idx[grp_sample])
 
         sampled_indices = np.concatenate(sampled_indices)
-
-        # sampled_indices = np.concatenate([
-        #     algorithms.sample(
-        #         obj,
-        #         size=sizes[i],
-        #         replace=replace,
-        #         weights=w,
-        #         random_state=random_state,
-        #         axis=self.axis,
-        #     )
-        #     for i, ((ind, obj), w) in enumerate(zip(group_iterator, ws))
-        # ])
-        # print(sampled_indices.shape)
-        # print(self._selected_obj)
-
         return self._selected_obj.take(sampled_indices, axis=self.axis)
 
 

From 3834f0cdef9ddf4d3903c884850befa75d89a958 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 25 Jun 2021 13:23:24 -0400
Subject: [PATCH 07/11] Add whatsnew, fix some typing

---
 doc/source/whatsnew/v1.4.0.rst |  2 +-
 pandas/core/algorithms.py      | 91 +++++++++++++++++-----------------
 pandas/core/generic.py         |  8 +--
 pandas/core/groupby/groupby.py | 18 ++++---
 4 files changed, 63 insertions(+), 56 deletions(-)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index f992d6aa09ead..e9a4ba10ff6e8 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -105,7 +105,7 @@ Deprecations
 
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
--
+- Performance improvement in :meth:`.GroupBy.sample`, especially when ``weights`` argument provided (:issue:`34483`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index eadd92e3e58ba..9fd6e2e14b291 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -27,7 +27,6 @@
     DtypeObj,
     FrameOrSeries,
     FrameOrSeriesUnion,
-    RandomState,
     Scalar,
 )
 from pandas.util._decorators import doc
@@ -1896,53 +1895,55 @@ def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike:
     return unique_array.take(indexer)
 
 
-def preprocess_weights(obj: DataFrame | Series, weights, axis: int):
-    if weights is not None:
+# ------ #
+# sample #
+# ------ #
+
+
+def preprocess_weights(obj: FrameOrSeries, weights, axis: int) -> np.ndarray:
+    # If a series, align with frame
+    if isinstance(weights, ABCSeries):
+        weights = weights.reindex(obj.axes[axis])
 
-        # If a series, align with frame
-        if isinstance(weights, ABCSeries):
-            weights = weights.reindex(obj.axes[axis])
-
-        # Strings acceptable if a dataframe and axis = 0
-        if isinstance(weights, str):
-            if isinstance(obj, ABCDataFrame):
-                if axis == 0:
-                    try:
-                        weights = obj[weights]
-                    except KeyError as err:
-                        raise KeyError(
-                            "String passed to weights not a valid column"
-                        ) from err
-                else:
-                    raise ValueError(
-                        "Strings can only be passed to "
-                        "weights when sampling from rows on "
-                        "a DataFrame"
-                    )
+    # Strings acceptable if a dataframe and axis = 0
+    if isinstance(weights, str):
+        if isinstance(obj, ABCDataFrame):
+            if axis == 0:
+                try:
+                    weights = obj[weights]
+                except KeyError as err:
+                    raise KeyError(
+                        "String passed to weights not a valid column"
+                    ) from err
             else:
                 raise ValueError(
-                    "Strings cannot be passed as weights "
-                    "when sampling from a Series."
+                    "Strings can only be passed to "
+                    "weights when sampling from rows on "
+                    "a DataFrame"
                 )
-
-        if isinstance(obj, ABCSeries):
-            func = obj._constructor
         else:
-            func = obj._constructor_sliced
+            raise ValueError(
+                "Strings cannot be passed as weights when sampling from a Series."
+            )
+
+    if isinstance(obj, ABCSeries):
+        func = obj._constructor
+    else:
+        func = obj._constructor_sliced
 
-        weights = func(weights, dtype="float64")._values
+    weights = func(weights, dtype="float64")._values
 
-        if len(weights) != obj.shape[axis]:
-            raise ValueError("Weights and axis to be sampled must be of same length")
+    if len(weights) != obj.shape[axis]:
+        raise ValueError("Weights and axis to be sampled must be of same length")
 
-        if lib.has_infs_f8(weights):
-            raise ValueError("weight vector may not include `inf` values")
+    if lib.has_infs(weights):
+        raise ValueError("weight vector may not include `inf` values")
 
-        if (weights < 0).any():
-            raise ValueError("weight vector many not include negative values")
+    if (weights < 0).any():
+        raise ValueError("weight vector many not include negative values")
 
-        weights[np.isnan(weights)] = 0
-        return weights
+    weights[np.isnan(weights)] = 0
+    return weights
 
 
 def process_sampling_size(
@@ -1961,6 +1962,7 @@ def process_sampling_size(
         if n % 1 != 0:
             raise ValueError("Only integers accepted as `n` values")
     else:
+        assert frac is not None
         if frac > 1 and not replace:
             raise ValueError(
                 "Replace has to be set to `True` when "
@@ -1975,15 +1977,12 @@ def process_sampling_size(
 
 
 def sample(
-    obj: FrameOrSeries,
+    obj_len: int,
     size: int,
     replace: bool,
-    weights: np.ndarray,
-    random_state: RandomState,
-    axis: int,
-) -> FrameOrSeries:
-    axis_length = obj.shape[axis]
-
+    weights: np.ndarray | None,
+    random_state: np.random.RandomState,
+) -> np.ndarray:
     if weights is not None:
         weight_sum = weights.sum()
         if weight_sum != 0:
@@ -1991,4 +1990,4 @@ def sample(
         else:
             raise ValueError("Invalid weights: weights sum to zero")
 
-    return random_state.choice(axis_length, size=size, replace=replace, p=weights)
+    return random_state.choice(obj_len, size=size, replace=replace, p=weights)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index c88fcda754ee8..ce6867499cc00 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -5265,6 +5265,7 @@ def sample(
             axis = self._stat_axis_number
 
         axis = self._get_axis_number(axis)
+        obj_len = self.shape[axis]
 
         # Process random_state argument
         rs = com.random_state(random_state)
@@ -5272,10 +5273,11 @@ def sample(
         size = algos.process_sampling_size(n, frac, replace)
         if size is None:
             assert frac is not None
-            size = round(frac * self.shape[axis])
+            size = round(frac * obj_len)
 
-        weights = algos.preprocess_weights(self, weights, axis)
-        sampled_indices = algos.sample(self, size, replace, weights, rs, axis)
+        if weights is not None:
+            weights = algos.preprocess_weights(self, weights, axis)
+        sampled_indices = algos.sample(obj_len, size, replace, weights, rs)
         return self.take(sampled_indices, axis=axis)
 
     @final
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 380d2f23a6292..603b9587fb195 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -3270,9 +3270,9 @@ def sample(
         2   blue  2
         0    red  0
         """
-        n = algorithms.process_sampling_size(n, frac, replace)
+        size = algorithms.process_sampling_size(n, frac, replace)
         if weights is not None:
-            weights = algorithms.preprocess_weights(
+            weights_arr = algorithms.preprocess_weights(
                 self._selected_obj, weights, axis=self.axis
             )
 
@@ -3282,13 +3282,19 @@ def sample(
         sampled_indices = []
         for ind, obj in group_iterator:
             grp_idx = self.indices[ind]
+            group_size = len(grp_idx)
+            if size is not None:
+                sample_size = size
+            else:
+                assert frac is not None
+                sample_size = round(frac * group_size)
+
             grp_sample = algorithms.sample(
-                obj,
-                size=n if n is None else round(frac * len(grp_idx)),
+                group_size,
+                size=sample_size,
                 replace=replace,
-                weights=None if weights is None else weights[grp_idx],
+                weights=None if weights is None else weights_arr[grp_idx],
                 random_state=random_state,
-                axis=self.axis,
             )
             sampled_indices.append(grp_idx[grp_sample])
 

From a7028700404b4e1291ec91d69837bda13022ebde Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 25 Jun 2021 13:59:40 -0400
Subject: [PATCH 08/11] Add docstrings

---
 pandas/core/algorithms.py | 42 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 9fd6e2e14b291..1fbb037ed60f4 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1901,6 +1901,13 @@ def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike:
 
 
 def preprocess_weights(obj: FrameOrSeries, weights, axis: int) -> np.ndarray:
+    """
+    Process and validate the `weights` argument to `NDFrame.sample` and
+    `.GroupBy.sample`.
+
+    Returns `weights` as an ndarray[np.float64], validated except for normalizing
+    weights (because that must be done groupwise in groupby sampling).
+    """
     # If a series, align with frame
     if isinstance(weights, ABCSeries):
         weights = weights.reindex(obj.axes[axis])
@@ -1949,6 +1956,13 @@ def preprocess_weights(obj: FrameOrSeries, weights, axis: int) -> np.ndarray:
 def process_sampling_size(
     n: int | None, frac: float | None, replace: bool
 ) -> int | None:
+    """
+    Process and validate the `n` and `frac` arguments to `NDFrame.sample` and
+    `.GroupBy.sample`.
+
+    Returns None if `frac` should be used (variable sampling sizes), otherwise returns
+    the constant sampling size.
+    """
     # If no frac or n, default to n=1.
     if n is None and frac is None:
         n = 1
@@ -1962,7 +1976,7 @@ def process_sampling_size(
         if n % 1 != 0:
             raise ValueError("Only integers accepted as `n` values")
     else:
-        assert frac is not None
+        assert frac is not None  # for mypy
         if frac > 1 and not replace:
             raise ValueError(
                 "Replace has to be set to `True` when "
@@ -1983,6 +1997,28 @@ def sample(
     weights: np.ndarray | None,
     random_state: np.random.RandomState,
 ) -> np.ndarray:
+    """
+    Extracts the union from lvals and rvals with respect to duplicates and nans in
+    both arrays.
+
+    Parameters
+    ----------
+    obj_len : int
+        The length of the items to consider
+    size : int
+        The number of items to return
+    replace : bool
+        Allow or disallow sampling of the same row more than once.
+    weights : np.ndarray[np.float64] or None
+        If None, equal probability weighting, otherwise weights according
+        to the vector normalized
+    random_state: np.random.RandomState
+        State used for the random sampling
+
+    Returns
+    -------
+    np.ndarray[np.intp]
+    """
     if weights is not None:
         weight_sum = weights.sum()
         if weight_sum != 0:
@@ -1990,4 +2026,6 @@ def sample(
         else:
             raise ValueError("Invalid weights: weights sum to zero")
 
-    return random_state.choice(obj_len, size=size, replace=replace, p=weights)
+    return random_state.choice(obj_len, size=size, replace=replace, p=weights).astype(
+        np.intp, copy=False
+    )

From 7fb839ce0684668693b27fb215a1a5a8ea4c0e65 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 25 Jun 2021 14:15:34 -0400
Subject: [PATCH 09/11] Improve some variable names

---
 pandas/core/algorithms.py      |  7 +++----
 pandas/core/groupby/groupby.py | 11 ++++++-----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 1fbb037ed60f4..716bf54e11b55 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1998,15 +1998,14 @@ def sample(
     random_state: np.random.RandomState,
 ) -> np.ndarray:
     """
-    Extracts the union from lvals and rvals with respect to duplicates and nans in
-    both arrays.
+    Randomly sample `size` indices in `np.arange(obj_len)`
 
     Parameters
     ----------
     obj_len : int
-        The length of the items to consider
+        The length of the indices being considered
     size : int
-        The number of items to return
+        The number of values to choose
     replace : bool
         Allow or disallow sampling of the same row more than once.
     weights : np.ndarray[np.float64] or None
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 603b9587fb195..6de6d041413cf 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -3279,10 +3279,11 @@ def sample(
         random_state = com.random_state(random_state)
 
         group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis)
+
         sampled_indices = []
-        for ind, obj in group_iterator:
-            grp_idx = self.indices[ind]
-            group_size = len(grp_idx)
+        for labels, obj in group_iterator:
+            grp_indices = self.indices[labels]
+            group_size = len(grp_indices)
             if size is not None:
                 sample_size = size
             else:
@@ -3293,10 +3294,10 @@ def sample(
                 group_size,
                 size=sample_size,
                 replace=replace,
-                weights=None if weights is None else weights_arr[grp_idx],
+                weights=None if weights is None else weights_arr[grp_indices],
                 random_state=random_state,
             )
-            sampled_indices.append(grp_idx[grp_sample])
+            sampled_indices.append(grp_indices[grp_sample])
 
         sampled_indices = np.concatenate(sampled_indices)
         return self._selected_obj.take(sampled_indices, axis=self.axis)

From 994384dfd7acb56aa501b6a0533ef61c2902a31c Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Sun, 27 Jun 2021 20:52:49 -0400
Subject: [PATCH 10/11] Move to sample.py

---
 pandas/core/algorithms.py      | 137 --------------------------------
 pandas/core/generic.py         |   7 +-
 pandas/core/groupby/groupby.py |   7 +-
 pandas/core/sample.py          | 141 +++++++++++++++++++++++++++++++++
 4 files changed, 149 insertions(+), 143 deletions(-)
 create mode 100644 pandas/core/sample.py

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 716bf54e11b55..a9ca39b89360c 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -25,7 +25,6 @@
     AnyArrayLike,
     ArrayLike,
     DtypeObj,
-    FrameOrSeries,
     FrameOrSeriesUnion,
     Scalar,
 )
@@ -59,7 +58,6 @@
 )
 from pandas.core.dtypes.dtypes import PandasDtype
 from pandas.core.dtypes.generic import (
-    ABCDataFrame,
     ABCDatetimeArray,
     ABCExtensionArray,
     ABCIndex,
@@ -1893,138 +1891,3 @@ def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike:
     for i, value in enumerate(unique_array):
         indexer += [i] * int(max(l_count[value], r_count[value]))
     return unique_array.take(indexer)
-
-
-# ------ #
-# sample #
-# ------ #
-
-
-def preprocess_weights(obj: FrameOrSeries, weights, axis: int) -> np.ndarray:
-    """
-    Process and validate the `weights` argument to `NDFrame.sample` and
-    `.GroupBy.sample`.
-
-    Returns `weights` as an ndarray[np.float64], validated except for normalizing
-    weights (because that must be done groupwise in groupby sampling).
-    """
-    # If a series, align with frame
-    if isinstance(weights, ABCSeries):
-        weights = weights.reindex(obj.axes[axis])
-
-    # Strings acceptable if a dataframe and axis = 0
-    if isinstance(weights, str):
-        if isinstance(obj, ABCDataFrame):
-            if axis == 0:
-                try:
-                    weights = obj[weights]
-                except KeyError as err:
-                    raise KeyError(
-                        "String passed to weights not a valid column"
-                    ) from err
-            else:
-                raise ValueError(
-                    "Strings can only be passed to "
-                    "weights when sampling from rows on "
-                    "a DataFrame"
-                )
-        else:
-            raise ValueError(
-                "Strings cannot be passed as weights when sampling from a Series."
-            )
-
-    if isinstance(obj, ABCSeries):
-        func = obj._constructor
-    else:
-        func = obj._constructor_sliced
-
-    weights = func(weights, dtype="float64")._values
-
-    if len(weights) != obj.shape[axis]:
-        raise ValueError("Weights and axis to be sampled must be of same length")
-
-    if lib.has_infs(weights):
-        raise ValueError("weight vector may not include `inf` values")
-
-    if (weights < 0).any():
-        raise ValueError("weight vector many not include negative values")
-
-    weights[np.isnan(weights)] = 0
-    return weights
-
-
-def process_sampling_size(
-    n: int | None, frac: float | None, replace: bool
-) -> int | None:
-    """
-    Process and validate the `n` and `frac` arguments to `NDFrame.sample` and
-    `.GroupBy.sample`.
-
-    Returns None if `frac` should be used (variable sampling sizes), otherwise returns
-    the constant sampling size.
-    """
-    # If no frac or n, default to n=1.
-    if n is None and frac is None:
-        n = 1
-    elif n is not None and frac is not None:
-        raise ValueError("Please enter a value for `frac` OR `n`, not both")
-    elif n is not None:
-        if n < 0:
-            raise ValueError(
-                "A negative number of rows requested. Please provide `n` >= 0."
-            )
-        if n % 1 != 0:
-            raise ValueError("Only integers accepted as `n` values")
-    else:
-        assert frac is not None  # for mypy
-        if frac > 1 and not replace:
-            raise ValueError(
-                "Replace has to be set to `True` when "
-                "upsampling the population `frac` > 1."
-            )
-        if frac < 0:
-            raise ValueError(
-                "A negative number of rows requested. Please provide `frac` >= 0."
-            )
-
-    return n
-
-
-def sample(
-    obj_len: int,
-    size: int,
-    replace: bool,
-    weights: np.ndarray | None,
-    random_state: np.random.RandomState,
-) -> np.ndarray:
-    """
-    Randomly sample `size` indices in `np.arange(obj_len)`
-
-    Parameters
-    ----------
-    obj_len : int
-        The length of the indices being considered
-    size : int
-        The number of values to choose
-    replace : bool
-        Allow or disallow sampling of the same row more than once.
-    weights : np.ndarray[np.float64] or None
-        If None, equal probability weighting, otherwise weights according
-        to the vector normalized
-    random_state: np.random.RandomState
-        State used for the random sampling
-
-    Returns
-    -------
-    np.ndarray[np.intp]
-    """
-    if weights is not None:
-        weight_sum = weights.sum()
-        if weight_sum != 0:
-            weights = weights / weight_sum
-        else:
-            raise ValueError("Invalid weights: weights sum to zero")
-
-    return random_state.choice(obj_len, size=size, replace=replace, p=weights).astype(
-        np.intp, copy=False
-    )
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 8075faf458967..f2497c6e65967 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -137,6 +137,7 @@
 from pandas.core.missing import find_valid_index
 from pandas.core.ops import align_method_FRAME
 from pandas.core.reshape.concat import concat
+import pandas.core.sample as sample
 from pandas.core.shared_docs import _shared_docs
 from pandas.core.sorting import get_indexer_indexer
 from pandas.core.window import (
@@ -5275,15 +5276,15 @@ def sample(
         # Process random_state argument
         rs = com.random_state(random_state)
 
-        size = algos.process_sampling_size(n, frac, replace)
+        size = sample.process_sampling_size(n, frac, replace)
         if size is None:
             assert frac is not None
             size = round(frac * obj_len)
 
         if weights is not None:
-            weights = algos.preprocess_weights(self, weights, axis)
+            weights = sample.preprocess_weights(self, weights, axis)
 
-        sampled_indices = algos.sample(obj_len, size, replace, weights, rs)
+        sampled_indices = sample.sample(obj_len, size, replace, weights, rs)
         result = self.take(sampled_indices, axis=axis)
 
         if ignore_index:
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 6de6d041413cf..8fb50db2e33f2 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -102,6 +102,7 @@ class providing the base-class of operations.
     MultiIndex,
 )
 from pandas.core.internals.blocks import ensure_block_shape
+import pandas.core.sample as sample
 from pandas.core.series import Series
 from pandas.core.sorting import get_group_index_sorter
 from pandas.core.util.numba_ import (
@@ -3270,9 +3271,9 @@ def sample(
         2   blue  2
         0    red  0
         """
-        size = algorithms.process_sampling_size(n, frac, replace)
+        size = sample.process_sampling_size(n, frac, replace)
         if weights is not None:
-            weights_arr = algorithms.preprocess_weights(
+            weights_arr = sample.preprocess_weights(
                 self._selected_obj, weights, axis=self.axis
             )
 
@@ -3290,7 +3291,7 @@ def sample(
                 assert frac is not None
                 sample_size = round(frac * group_size)
 
-            grp_sample = algorithms.sample(
+            grp_sample = sample.sample(
                 group_size,
                 size=sample_size,
                 replace=replace,
diff --git a/pandas/core/sample.py b/pandas/core/sample.py
new file mode 100644
index 0000000000000..cfd512581fcc3
--- /dev/null
+++ b/pandas/core/sample.py
@@ -0,0 +1,141 @@
+from __future__ import annotations
+
+import numpy as np
+
+from pandas._libs import lib
+from pandas._typing import FrameOrSeries
+
+from pandas.core.dtypes.generic import (
+    ABCDataFrame,
+    ABCSeries,
+)
+
+
+def preprocess_weights(obj: FrameOrSeries, weights, axis: int) -> np.ndarray:
+    """
+    Process and validate the `weights` argument to `NDFrame.sample` and
+    `.GroupBy.sample`.
+
+    Returns `weights` as an ndarray[np.float64], validated except for normalizing
+    weights (because that must be done groupwise in groupby sampling).
+    """
+    # If a series, align with frame
+    if isinstance(weights, ABCSeries):
+        weights = weights.reindex(obj.axes[axis])
+
+    # Strings acceptable if a dataframe and axis = 0
+    if isinstance(weights, str):
+        if isinstance(obj, ABCDataFrame):
+            if axis == 0:
+                try:
+                    weights = obj[weights]
+                except KeyError as err:
+                    raise KeyError(
+                        "String passed to weights not a valid column"
+                    ) from err
+            else:
+                raise ValueError(
+                    "Strings can only be passed to "
+                    "weights when sampling from rows on "
+                    "a DataFrame"
+                )
+        else:
+            raise ValueError(
+                "Strings cannot be passed as weights when sampling from a Series."
+            )
+
+    if isinstance(obj, ABCSeries):
+        func = obj._constructor
+    else:
+        func = obj._constructor_sliced
+
+    weights = func(weights, dtype="float64")._values
+
+    if len(weights) != obj.shape[axis]:
+        raise ValueError("Weights and axis to be sampled must be of same length")
+
+    if lib.has_infs(weights):
+        raise ValueError("weight vector may not include `inf` values")
+
+    if (weights < 0).any():
+        raise ValueError("weight vector many not include negative values")
+
+    weights[np.isnan(weights)] = 0
+    return weights
+
+
+def process_sampling_size(
+    n: int | None, frac: float | None, replace: bool
+) -> int | None:
+    """
+    Process and validate the `n` and `frac` arguments to `NDFrame.sample` and
+    `.GroupBy.sample`.
+
+    Returns None if `frac` should be used (variable sampling sizes), otherwise returns
+    the constant sampling size.
+    """
+    # If no frac or n, default to n=1.
+    if n is None and frac is None:
+        n = 1
+    elif n is not None and frac is not None:
+        raise ValueError("Please enter a value for `frac` OR `n`, not both")
+    elif n is not None:
+        if n < 0:
+            raise ValueError(
+                "A negative number of rows requested. Please provide `n` >= 0."
+            )
+        if n % 1 != 0:
+            raise ValueError("Only integers accepted as `n` values")
+    else:
+        assert frac is not None  # for mypy
+        if frac > 1 and not replace:
+            raise ValueError(
+                "Replace has to be set to `True` when "
+                "upsampling the population `frac` > 1."
+            )
+        if frac < 0:
+            raise ValueError(
+                "A negative number of rows requested. Please provide `frac` >= 0."
+            )
+
+    return n
+
+
+def sample(
+    obj_len: int,
+    size: int,
+    replace: bool,
+    weights: np.ndarray | None,
+    random_state: np.random.RandomState,
+) -> np.ndarray:
+    """
+    Randomly sample `size` indices from `np.arange(obj_len)`.
+
+    Parameters
+    ----------
+    obj_len : int
+        The length of the indices being considered
+    size : int
+        The number of values to choose
+    replace : bool
+        Allow or disallow sampling of the same row more than once.
+    weights : np.ndarray[np.float64] or None
+        If None, sample with equal probability; otherwise sample according
+        to this vector after normalizing it to sum to 1
+    random_state : np.random.RandomState
+        State used for the random sampling
+
+    Returns
+    -------
+    np.ndarray[np.intp]
+    """
+    if weights is not None:
+        weight_sum = weights.sum()
+        if weight_sum != 0:
+            weights = weights / weight_sum
+        else:
+            raise ValueError("Invalid weights: weights sum to zero")
+
+    return random_state.choice(obj_len, size=size, replace=replace, p=weights).astype(
+        np.intp, copy=False
+    )

From fe9b028e0dacab5d680f2938dea0b3eba5b28c17 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Sun, 27 Jun 2021 20:57:48 -0400
Subject: [PATCH 11/11] Add module comment

---
 pandas/core/sample.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pandas/core/sample.py b/pandas/core/sample.py
index cfd512581fcc3..4798f385d523c 100644
--- a/pandas/core/sample.py
+++ b/pandas/core/sample.py
@@ -1,3 +1,6 @@
+"""
+Module containing utilities for NDFrame.sample() and .GroupBy.sample()
+"""
 from __future__ import annotations
 
 import numpy as np