From 5f6c210c168cc55e90c79916133a97a9992e599f Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 21 Jun 2021 21:21:09 -0400 Subject: [PATCH 01/11] wip --- pandas/core/algorithms.py | 96 ++++++++++++++++++++++++++++++++++ pandas/core/generic.py | 90 +++---------------------------- pandas/core/groupby/groupby.py | 13 +++-- 3 files changed, 113 insertions(+), 86 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f4a6b0b1c1694..e3a0e1a9ce7e8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -28,6 +28,8 @@ from pandas._typing import ( AnyArrayLike, ArrayLike, + FrameOrSeries, + RandomState, DtypeObj, FrameOrSeriesUnion, Scalar, @@ -63,6 +65,7 @@ from pandas.core.dtypes.dtypes import PandasDtype from pandas.core.dtypes.generic import ( ABCDatetimeArray, + ABCDataFrame, ABCExtensionArray, ABCIndex, ABCMultiIndex, @@ -1895,3 +1898,96 @@ def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike: for i, value in enumerate(unique_array): indexer += [i] * int(max(l_count[value], r_count[value])) return unique_array.take(indexer) + + +def preprocess_weights(obj: DataFrame | Series, weights, axis: int): + if weights is not None: + + # If a series, align with frame + if isinstance(weights, ABCSeries): + weights = weights.reindex(obj.axes[axis]) + + # Strings acceptable if a dataframe and axis = 0 + if isinstance(weights, str): + if isinstance(obj, ABCDataFrame): + if axis == 0: + try: + weights = obj[weights] + except KeyError as err: + raise KeyError( + "String passed to weights not a valid column" + ) from err + else: + raise ValueError( + "Strings can only be passed to " + "weights when sampling from rows on " + "a DataFrame" + ) + else: + raise ValueError( + "Strings cannot be passed as weights " + "when sampling from a Series." + ) + + if isinstance(obj, ABCSeries): + func = obj._constructor + else: + func = obj._constructor_sliced + + weights = func(weights, dtype="float64")._values + + if len(weights) != obj.shape[axis]: + raise ValueError("Weights and axis to be sampled must be of same length") + + if lib.has_infs(weights): + raise ValueError("weight vector may not include `inf` values") + + if (weights < 0).any(): + raise ValueError("weight vector many not include negative values") + + weights[np.isnan(weights)] = 0 + return weights + + +def process_sampling_size(n, frac: float | None, replace: bool): + # If no frac or n, default to n=1. + if n is None and frac is None: + n = 1 + elif frac is not None and frac > 1 and not replace: + raise ValueError( + "Replace has to be set to `True` when " + "upsampling the population `frac` > 1." + ) + elif frac is None and n % 1 != 0: + raise ValueError("Only integers accepted as `n` values") + elif frac is not None: + raise ValueError("Please enter a value for `frac` OR `n`, not both") + + # Check for negative sizes + if n < 0: + raise ValueError( + "A negative number of rows requested. Please provide positive value." + ) + + return n + + +def sample( + obj: FrameOrSeries, + size: int, + replace: bool, + weights: np.ndarray, + random_state: RandomState, + axis: int, +) -> FrameOrSeries: + axis_length = obj.shape[axis] + + if weights is not None: + weight_sum = weights.sum() + if weight_sum != 0: + weights = weights / weight_sum + else: + raise ValueError("Invalid weights: weights sum to zero") + + locs = random_state.choice(axis_length, size=size, replace=replace, p=weights) + return obj.take(locs, axis=axis) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5bd845534fc96..e926a10175a5f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5265,92 +5265,18 @@ def sample( axis = self._stat_axis_number axis = self._get_axis_number(axis) - axis_length = self.shape[axis] # Process random_state argument rs = com.random_state(random_state) - # Check weights for compliance - if weights is not None: - - # If a series, align with frame - if isinstance(weights, ABCSeries): - weights = weights.reindex(self.axes[axis]) - - # Strings acceptable if a dataframe and axis = 0 - if isinstance(weights, str): - if isinstance(self, ABCDataFrame): - if axis == 0: - try: - weights = self[weights] - except KeyError as err: - raise KeyError( - "String passed to weights not a valid column" - ) from err - else: - raise ValueError( - "Strings can only be passed to " - "weights when sampling from rows on " - "a DataFrame" - ) - else: - raise ValueError( - "Strings cannot be passed as weights " - "when sampling from a Series." - ) - - if isinstance(self, ABCSeries): - func = self._constructor - else: - func = self._constructor_sliced - weights = func(weights, dtype="float64") - - if len(weights) != axis_length: - raise ValueError( - "Weights and axis to be sampled must be of same length" - ) - - if (weights == np.inf).any() or (weights == -np.inf).any(): - raise ValueError("weight vector may not include `inf` values") - - if (weights < 0).any(): - raise ValueError("weight vector many not include negative values") - - # If has nan, set to zero. - weights = weights.fillna(0) - - # Renormalize if don't sum to 1 - if weights.sum() != 1: - if weights.sum() != 0: - weights = weights / weights.sum() - else: - raise ValueError("Invalid weights: weights sum to zero") - - weights = weights._values - - # If no frac or n, default to n=1. - if n is None and frac is None: - n = 1 - elif frac is not None and frac > 1 and not replace: - raise ValueError( - "Replace has to be set to `True` when " - "upsampling the population `frac` > 1." - ) - elif frac is None and n % 1 != 0: - raise ValueError("Only integers accepted as `n` values") - elif n is None and frac is not None: - n = round(frac * axis_length) - elif frac is not None: - raise ValueError("Please enter a value for `frac` OR `n`, not both") - - # Check for negative sizes - if n < 0: - raise ValueError( - "A negative number of rows requested. Please provide positive value." - ) - - locs = rs.choice(axis_length, size=n, replace=replace, p=weights) - return self.take(locs, axis=axis) + n = algos.process_sampling_size(n, frac, replace) + if n is None: + assert frac is not None + size = round(frac * self.shape[axis]) + else: + size = n + weights = algos.preprocess_weights(self, weights, axis) + return algos.sample(self, size, replace, weights, rs, axis) @final @doc(klass=_shared_doc_kwargs["klass"]) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0080791a51a4b..7d766c8569239 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3272,19 +3272,24 @@ def sample( """ from pandas.core.reshape.concat import concat + n = algorithms.process_sampling_size(n, frac, replace) + if n is None: + assert frac is not None + sizes = [] + if weights is not None: + weights = algorithms.preprocess_weights(self, weights, axis=0) weights = Series(weights, index=self._selected_obj.index) ws = [weights.iloc[idx] for idx in self.indices.values()] else: ws = [None] * self.ngroups - if random_state is not None: - random_state = com.random_state(random_state) + random_state = com.random_state(random_state) group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis) samples = [ - obj.sample( - n=n, frac=frac, replace=replace, weights=w, random_state=random_state + algorithms.sample( + self, size=size, replace=replace, weights=w, random_state=random_state, axis=0 ) for (_, obj), w in zip(group_iterator, ws) ] From 6dc14851ebd21e8658110eb084b84a8bfdb1f050 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 24 Jun 2021 22:36:55 -0400 Subject: [PATCH 02/11] WIP --- pandas/core/algorithms.py | 44 +++++++++++++---------- pandas/core/generic.py | 7 ++-- pandas/core/groupby/groupby.py | 22 ++++++++---- pandas/tests/frame/methods/test_sample.py | 11 ++++-- pandas/tests/groupby/test_sample.py | 2 +- 5 files changed, 53 insertions(+), 33 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e3a0e1a9ce7e8..5c30c673b15b9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -28,10 +28,10 @@ from pandas._typing import ( AnyArrayLike, ArrayLike, - FrameOrSeries, - RandomState, DtypeObj, + FrameOrSeries, FrameOrSeriesUnion, + RandomState, Scalar, ) from pandas.util._decorators import doc @@ -64,8 +64,8 @@ ) from pandas.core.dtypes.dtypes import PandasDtype from pandas.core.dtypes.generic import ( - ABCDatetimeArray, ABCDataFrame, + ABCDatetimeArray, ABCExtensionArray, ABCIndex, ABCMultiIndex, @@ -1939,7 +1939,7 @@ def preprocess_weights(obj: DataFrame | Series, weights, axis: int): if len(weights) != obj.shape[axis]: raise ValueError("Weights and axis to be sampled must be of same length") - if lib.has_infs(weights): + if lib.has_infs_f8(weights): raise ValueError("weight vector may not include `inf` values") if (weights < 0).any(): @@ -1949,25 +1949,31 @@ def preprocess_weights(obj: DataFrame | Series, weights, axis: int): return weights -def process_sampling_size(n, frac: float | None, replace: bool): +def process_sampling_size( + n: int | None, frac: float | None, replace: bool +) -> int | None: # If no frac or n, default to n=1. if n is None and frac is None: n = 1 - elif frac is not None and frac > 1 and not replace: - raise ValueError( - "Replace has to be set to `True` when " - "upsampling the population `frac` > 1." - ) - elif frac is None and n % 1 != 0: - raise ValueError("Only integers accepted as `n` values") - elif frac is not None: + elif n is not None and frac is not None: raise ValueError("Please enter a value for `frac` OR `n`, not both") - - # Check for negative sizes - if n < 0: - raise ValueError( - "A negative number of rows requested. Please provide positive value." - ) + elif n is not None: + if n < 0: + raise ValueError( + "A negative number of rows requested. Please provide `n` >= 0." + ) + if n % 1 != 0: + raise ValueError("Only integers accepted as `n` values") + else: + if frac > 1 and not replace: + raise ValueError( + "Replace has to be set to `True` when " + "upsampling the population `frac` > 1." + ) + if frac < 0: + raise ValueError( + "A negative number of rows requested. Please provide `frac` >= 0." + ) return n diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 144e60c017212..a6fb327e917fa 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5269,12 +5269,11 @@ def sample( # Process random_state argument rs = com.random_state(random_state) - n = algos.process_sampling_size(n, frac, replace) - if n is None: + size = algos.process_sampling_size(n, frac, replace) + if size is None: assert frac is not None size = round(frac * self.shape[axis]) - else: - size = n + weights = algos.preprocess_weights(self, weights, axis) return algos.sample(self, size, replace, weights, rs, axis) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7f36d40d16fbd..30d73274598eb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3275,12 +3275,17 @@ def sample( n = algorithms.process_sampling_size(n, frac, replace) if n is None: assert frac is not None - sizes = [] + sizes = np.zeros(self.ngroups, dtype="i8") + for i, idx in enumerate(self.indices.values()): + sizes[i] = round(frac * len(idx)) + else: + sizes = np.full(self.ngroups, n, dtype="i8") if weights is not None: - weights = algorithms.preprocess_weights(self, weights, axis=0) - weights = Series(weights, index=self._selected_obj.index) - ws = [weights.iloc[idx] for idx in self.indices.values()] + weights = algorithms.preprocess_weights( + self._selected_obj, weights, axis=self.axis + ) + ws = [weights[idx] for idx in self.indices.values()] else: ws = [None] * self.ngroups @@ -3289,9 +3294,14 @@ def sample( group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis) samples = [ algorithms.sample( - self, size=size, replace=replace, weights=w, random_state=random_state, axis=0 + obj, + size=sizes[i], + replace=replace, + weights=w, + random_state=random_state, + axis=self.axis, ) - for (_, obj), w in zip(group_iterator, ws) + for i, ((_, obj), w) in enumerate(zip(group_iterator, ws)) ] return concat(samples, axis=self.axis) diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py index 55ef665c55241..de365c70b62c1 100644 --- a/pandas/tests/frame/methods/test_sample.py +++ b/pandas/tests/frame/methods/test_sample.py @@ -83,10 +83,15 @@ def test_sample_wont_accept_n_and_frac(self, obj): obj.sample(n=3, frac=0.3) def test_sample_requires_positive_n_frac(self, obj): - msg = "A negative number of rows requested. Please provide positive value." - with pytest.raises(ValueError, match=msg): + with pytest.raises( + ValueError, + match="A negative number of rows requested. Please provide `n` >= 0", + ): obj.sample(n=-3) - with pytest.raises(ValueError, match=msg): + with pytest.raises( + ValueError, + match="A negative number of rows requested. Please provide `frac` >= 0", + ): obj.sample(frac=-0.3) def test_sample_requires_integer_n(self, obj): diff --git a/pandas/tests/groupby/test_sample.py b/pandas/tests/groupby/test_sample.py index 652a5fc1a3c34..9153fac0927c5 100644 --- a/pandas/tests/groupby/test_sample.py +++ b/pandas/tests/groupby/test_sample.py @@ -78,7 +78,7 @@ def test_groupby_sample_invalid_n_raises(n): df = DataFrame({"a": [1, 2], "b": [1, 2]}) if n < 0: - msg = "Please provide positive value" + msg = "A negative number of rows requested. Please provide `n` >= 0." else: msg = "Only integers accepted as `n` values" From 45e0fe1d1bc9614d6d6897d51fe803c96bf46bd8 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 24 Jun 2021 22:46:51 -0400 Subject: [PATCH 03/11] Add asv --- asv_bench/benchmarks/groupby.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 1648985a56b91..a13343ce05d03 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -832,4 +832,15 @@ def function(values): self.grouper.agg(function, engine="cython") +class Sample: + def setup(self): + N = 10 ** 5 + self.df = DataFrame({"a": np.zeros(N)}) + self.groups = np.arange(0, N) + self.weights = np.ones(N) + + def time_sample(self): + self.df.groupby(self.groups).sample(n=1, weights=self.weights) + + from .pandas_vb_common import setup # noqa: F401 isort:skip From e7c7d75a4f887e8e8b1facbfe6bf3a89c7bbafc1 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 25 Jun 2021 00:20:34 -0400 Subject: [PATCH 04/11] Avoid concat --- asv_bench/benchmarks/groupby.py | 2 +- pandas/core/algorithms.py | 3 +-- pandas/core/generic.py | 3 ++- pandas/core/groupby/groupby.py | 38 ++++++++++++++++++++++++--------- 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index a13343ce05d03..b30dbad4d25d1 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -834,7 +834,7 @@ def function(values): class Sample: def setup(self): - N = 10 ** 5 + N = 10 ** 3 self.df = DataFrame({"a": np.zeros(N)}) self.groups = np.arange(0, N) self.weights = np.ones(N) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 5c30c673b15b9..7d977a2795f62 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1995,5 +1995,4 @@ def sample( else: raise ValueError("Invalid weights: weights sum to zero") - locs = random_state.choice(axis_length, size=size, replace=replace, p=weights) - return obj.take(locs, axis=axis) + return random_state.choice(axis_length, size=size, replace=replace, p=weights) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a6fb327e917fa..c88fcda754ee8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5275,7 +5275,8 @@ def sample( size = round(frac * self.shape[axis]) weights = algos.preprocess_weights(self, weights, axis) - return algos.sample(self, size, replace, weights, rs, axis) + sampled_indices = algos.sample(self, size, replace, weights, rs, axis) + return self.take(sampled_indices, axis=axis) @final @doc(klass=_shared_doc_kwargs["klass"]) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 30d73274598eb..1ddcc9e73d6dc 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3270,8 +3270,6 @@ def sample( 2 blue 2 0 red 0 """ - from pandas.core.reshape.concat import concat - n = algorithms.process_sampling_size(n, frac, replace) if n is None: assert frac is not None @@ -3292,19 +3290,39 @@ def sample( random_state = com.random_state(random_state) group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis) - samples = [ - algorithms.sample( + sampled_indices = [] + for i, ((ind, obj), w) in enumerate(zip(group_iterator, ws)): + grp_idx = self.indices[ind] + size = n + if n is None: + size = round(frac * len(grp_idx)) + grp_sample = algorithms.sample( obj, - size=sizes[i], + size=size, replace=replace, - weights=w, + weights=None if weights is None else weights[grp_idx], random_state=random_state, axis=self.axis, ) - for i, ((_, obj), w) in enumerate(zip(group_iterator, ws)) - ] - - return concat(samples, axis=self.axis) + sampled_indices.append(grp_idx[grp_sample]) + + sampled_indices = np.concatenate(sampled_indices) + + # sampled_indices = np.concatenate([ + # algorithms.sample( + # obj, + # size=sizes[i], + # replace=replace, + # weights=w, + # random_state=random_state, + # axis=self.axis, + # ) + # for i, ((ind, obj), w) in enumerate(zip(group_iterator, ws)) + # ]) + # print(sampled_indices.shape) + # print(self._selected_obj) + + return self._selected_obj.take(sampled_indices, axis=self.axis) @doc(GroupBy) From ca26efb5225fb8037c7fc14f190fa0c5b7606364 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 25 Jun 2021 00:22:53 -0400 Subject: [PATCH 05/11] Clean dead code --- pandas/core/groupby/groupby.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1ddcc9e73d6dc..b6fc066e3ff1e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3271,27 +3271,16 @@ def sample( 0 red 0 """ n = algorithms.process_sampling_size(n, frac, replace) - if n is None: - assert frac is not None - sizes = np.zeros(self.ngroups, dtype="i8") - for i, idx in enumerate(self.indices.values()): - sizes[i] = round(frac * len(idx)) - else: - sizes = np.full(self.ngroups, n, dtype="i8") - if weights is not None: weights = algorithms.preprocess_weights( self._selected_obj, weights, axis=self.axis ) - ws = [weights[idx] for idx in self.indices.values()] - else: - ws = [None] * self.ngroups random_state = com.random_state(random_state) group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis) sampled_indices = [] - for i, ((ind, obj), w) in enumerate(zip(group_iterator, ws)): + for i, (ind, obj) in enumerate(group_iterator): grp_idx = self.indices[ind] size = n if n is None: From f14705237bb372daa3680ba952bd2b42cdea42d4 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 25 Jun 2021 12:44:37 -0400 Subject: [PATCH 06/11] WIP --- asv_bench/benchmarks/groupby.py | 3 +++ pandas/core/groupby/groupby.py | 22 ++-------------------- 2 files changed, 5 insertions(+), 20 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index b30dbad4d25d1..6ca951e946bad 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -840,6 +840,9 @@ def setup(self): self.weights = np.ones(N) def time_sample(self): + self.df.groupby(self.groups).sample(n=1) + + def time_sample_weights(self): self.df.groupby(self.groups).sample(n=1, weights=self.weights) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b6fc066e3ff1e..380d2f23a6292 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3280,14 +3280,11 @@ def sample( group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis) sampled_indices = [] - for i, (ind, obj) in enumerate(group_iterator): + for ind, obj in group_iterator: grp_idx = self.indices[ind] - size = n - if n is None: - size = round(frac * len(grp_idx)) grp_sample = algorithms.sample( obj, - size=size, + size=n if n is None else round(frac * len(grp_idx)), replace=replace, weights=None if weights is None else weights[grp_idx], random_state=random_state, @@ -3296,21 +3293,6 @@ def sample( sampled_indices.append(grp_idx[grp_sample]) sampled_indices = np.concatenate(sampled_indices) - - # sampled_indices = np.concatenate([ - # algorithms.sample( - # obj, - # size=sizes[i], - # replace=replace, - # weights=w, - # random_state=random_state, - # axis=self.axis, - # ) - # for i, ((ind, obj), w) in enumerate(zip(group_iterator, ws)) - # ]) - # print(sampled_indices.shape) - # print(self._selected_obj) - return self._selected_obj.take(sampled_indices, axis=self.axis) From 3834f0cdef9ddf4d3903c884850befa75d89a958 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 25 Jun 2021 13:23:24 -0400 Subject: [PATCH 07/11] Add whatsnew, fix some typing --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/core/algorithms.py | 91 +++++++++++++++++----------------- pandas/core/generic.py | 8 +-- pandas/core/groupby/groupby.py | 18 ++++--- 4 files changed, 63 insertions(+), 56 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index f992d6aa09ead..e9a4ba10ff6e8 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -105,7 +105,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- +- Performance improvement in :meth:`.GroupBy.sample`, especially when ``weights`` argument provided (:issue:`34483`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index eadd92e3e58ba..9fd6e2e14b291 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -27,7 +27,6 @@ DtypeObj, FrameOrSeries, FrameOrSeriesUnion, - RandomState, Scalar, ) from pandas.util._decorators import doc @@ -1896,53 +1895,55 @@ def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike: return unique_array.take(indexer) -def preprocess_weights(obj: DataFrame | Series, weights, axis: int): - if weights is not None: +# ------ # +# sample # +# ------ # + + +def preprocess_weights(obj: FrameOrSeries, weights, axis: int) -> np.ndarray: + # If a series, align with frame + if isinstance(weights, ABCSeries): + weights = weights.reindex(obj.axes[axis]) - # If a series, align with frame - if isinstance(weights, ABCSeries): - weights = weights.reindex(obj.axes[axis]) - - # Strings acceptable if a dataframe and axis = 0 - if isinstance(weights, str): - if isinstance(obj, ABCDataFrame): - if axis == 0: - try: - weights = obj[weights] - except KeyError as err: - raise KeyError( - "String passed to weights not a valid column" - ) from err - else: - raise ValueError( - "Strings can only be passed to " - "weights when sampling from rows on " - "a DataFrame" - ) + # Strings acceptable if a dataframe and axis = 0 + if isinstance(weights, str): + if isinstance(obj, ABCDataFrame): + if axis == 0: + try: + weights = obj[weights] + except KeyError as err: + raise KeyError( + "String passed to weights not a valid column" + ) from err else: raise ValueError( - "Strings cannot be passed as weights " - "when sampling from a Series." + "Strings can only be passed to " + "weights when sampling from rows on " + "a DataFrame" ) - - if isinstance(obj, ABCSeries): - func = obj._constructor else: - func = obj._constructor_sliced + raise ValueError( + "Strings cannot be passed as weights when sampling from a Series." + ) + + if isinstance(obj, ABCSeries): + func = obj._constructor + else: + func = obj._constructor_sliced - weights = func(weights, dtype="float64")._values + weights = func(weights, dtype="float64")._values - if len(weights) != obj.shape[axis]: - raise ValueError("Weights and axis to be sampled must be of same length") + if len(weights) != obj.shape[axis]: + raise ValueError("Weights and axis to be sampled must be of same length") - if lib.has_infs_f8(weights): - raise ValueError("weight vector may not include `inf` values") + if lib.has_infs(weights): + raise ValueError("weight vector may not include `inf` values") - if (weights < 0).any(): - raise ValueError("weight vector many not include negative values") + if (weights < 0).any(): + raise ValueError("weight vector many not include negative values") - weights[np.isnan(weights)] = 0 - return weights + weights[np.isnan(weights)] = 0 + return weights def process_sampling_size( @@ -1961,6 +1962,7 @@ def process_sampling_size( if n % 1 != 0: raise ValueError("Only integers accepted as `n` values") else: + assert frac is not None if frac > 1 and not replace: raise ValueError( "Replace has to be set to `True` when " @@ -1975,15 +1977,12 @@ def process_sampling_size( def sample( - obj: FrameOrSeries, + obj_len: int, size: int, replace: bool, - weights: np.ndarray, - random_state: RandomState, - axis: int, -) -> FrameOrSeries: - axis_length = obj.shape[axis] - + weights: np.ndarray | None, + random_state: np.random.RandomState, +) -> np.ndarray: if weights is not None: weight_sum = weights.sum() if weight_sum != 0: @@ -1991,4 +1990,4 @@ def sample( else: raise ValueError("Invalid weights: weights sum to zero") - return random_state.choice(axis_length, size=size, replace=replace, p=weights) + return random_state.choice(obj_len, size=size, replace=replace, p=weights) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c88fcda754ee8..ce6867499cc00 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5265,6 +5265,7 @@ def sample( axis = self._stat_axis_number axis = self._get_axis_number(axis) + obj_len = self.shape[axis] # Process random_state argument rs = com.random_state(random_state) @@ -5272,10 +5273,11 @@ def sample( size = algos.process_sampling_size(n, frac, replace) if size is None: assert frac is not None - size = round(frac * self.shape[axis]) + size = round(frac * obj_len) - weights = algos.preprocess_weights(self, weights, axis) - sampled_indices = algos.sample(self, size, replace, weights, rs, axis) + if weights is not None: + weights = algos.preprocess_weights(self, weights, axis) + sampled_indices = algos.sample(obj_len, size, replace, weights, rs) return self.take(sampled_indices, axis=axis) @final diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 380d2f23a6292..603b9587fb195 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3270,9 +3270,9 @@ def sample( 2 blue 2 0 red 0 """ - n = algorithms.process_sampling_size(n, frac, replace) + size = algorithms.process_sampling_size(n, frac, replace) if weights is not None: - weights = algorithms.preprocess_weights( + weights_arr = algorithms.preprocess_weights( self._selected_obj, weights, axis=self.axis ) @@ -3282,13 +3282,19 @@ def sample( sampled_indices = [] for ind, obj in group_iterator: grp_idx = self.indices[ind] + group_size = len(grp_idx) + if size is not None: + sample_size = size + else: + assert frac is not None + sample_size = round(frac * group_size) + grp_sample = algorithms.sample( - obj, - size=n if n is None else round(frac * len(grp_idx)), + group_size, + size=sample_size, replace=replace, - weights=None if weights is None else weights[grp_idx], + weights=None if weights is None else weights_arr[grp_idx], random_state=random_state, - axis=self.axis, ) sampled_indices.append(grp_idx[grp_sample]) From a7028700404b4e1291ec91d69837bda13022ebde Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 25 Jun 2021 13:59:40 -0400 Subject: [PATCH 08/11] Add docstrings --- pandas/core/algorithms.py | 42 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9fd6e2e14b291..1fbb037ed60f4 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1901,6 +1901,13 @@ def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike: def preprocess_weights(obj: FrameOrSeries, weights, axis: int) -> np.ndarray: + """ + Process and validate the `weights` argument to `NDFrame.sample` and + `.GroupBy.sample`. + + Returns `weights` as an ndarray[np.float64], validated except for normalizing + weights (because that must be done groupwise in groupby sampling). + """ # If a series, align with frame if isinstance(weights, ABCSeries): weights = weights.reindex(obj.axes[axis]) @@ -1949,6 +1956,13 @@ def preprocess_weights(obj: FrameOrSeries, weights, axis: int) -> np.ndarray: def process_sampling_size( n: int | None, frac: float | None, replace: bool ) -> int | None: + """ + Process and validate the `n` and `frac` arguments to `NDFrame.sample` and + `.GroupBy.sample`. + + Returns None if `frac` should be used (variable sampling sizes), otherwise returns + the constant sampling size. + """ # If no frac or n, default to n=1. if n is None and frac is None: n = 1 @@ -1962,7 +1976,7 @@ def process_sampling_size( if n % 1 != 0: raise ValueError("Only integers accepted as `n` values") else: - assert frac is not None + assert frac is not None # for mypy if frac > 1 and not replace: raise ValueError( "Replace has to be set to `True` when " @@ -1983,6 +1997,28 @@ def sample( weights: np.ndarray | None, random_state: np.random.RandomState, ) -> np.ndarray: + """ + Extracts the union from lvals and rvals with respect to duplicates and nans in + both arrays. + + Parameters + ---------- + obj_len : int + The length of the items to consider + size : int + The number of items to return + replace : bool + Allow or disallow sampling of the same row more than once. + weights : np.ndarray[np.float64] or None + If None, equal probability weighting, otherwise weights according + to the vector normalized + random_state: np.random.RandomState + State used for the random sampling + + Returns + ------- + np.ndarray[np.intp] + """ if weights is not None: weight_sum = weights.sum() if weight_sum != 0: @@ -1990,4 +2026,6 @@ def sample( else: raise ValueError("Invalid weights: weights sum to zero") - return random_state.choice(obj_len, size=size, replace=replace, p=weights) + return random_state.choice(obj_len, size=size, replace=replace, p=weights).astype( + np.intp, copy=False + ) From 7fb839ce0684668693b27fb215a1a5a8ea4c0e65 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 25 Jun 2021 14:15:34 -0400 Subject: [PATCH 09/11] Improve some variable names --- pandas/core/algorithms.py | 7 +++---- pandas/core/groupby/groupby.py | 11 ++++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1fbb037ed60f4..716bf54e11b55 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1998,15 +1998,14 @@ def sample( random_state: np.random.RandomState, ) -> np.ndarray: """ - Extracts the union from lvals and rvals with respect to duplicates and nans in - both arrays. + Randomly sample `size` indices in `np.arange(obj_len)` Parameters ---------- obj_len : int - The length of the items to consider + The length of the indices being considered size : int - The number of items to return + The number of values to choose replace : bool Allow or disallow sampling of the same row more than once. weights : np.ndarray[np.float64] or None diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 603b9587fb195..6de6d041413cf 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3279,10 +3279,11 @@ def sample( random_state = com.random_state(random_state) group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis) + sampled_indices = [] - for ind, obj in group_iterator: - grp_idx = self.indices[ind] - group_size = len(grp_idx) + for labels, obj in group_iterator: + grp_indices = self.indices[labels] + group_size = len(grp_indices) if size is not None: sample_size = size else: @@ -3293,10 +3294,10 @@ def sample( group_size, size=sample_size, replace=replace, - weights=None if weights is None else weights_arr[grp_idx], + weights=None if weights is None else weights_arr[grp_indices], random_state=random_state, ) - sampled_indices.append(grp_idx[grp_sample]) + sampled_indices.append(grp_indices[grp_sample]) sampled_indices = np.concatenate(sampled_indices) return self._selected_obj.take(sampled_indices, axis=self.axis) From 994384dfd7acb56aa501b6a0533ef61c2902a31c Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 27 Jun 2021 20:52:49 -0400 Subject: [PATCH 10/11] Move to sample.py --- pandas/core/algorithms.py | 137 -------------------------------- pandas/core/generic.py | 7 +- pandas/core/groupby/groupby.py | 7 +- pandas/core/sample.py | 141 +++++++++++++++++++++++++++++++++ 4 files changed, 149 insertions(+), 143 deletions(-) create mode 100644 pandas/core/sample.py diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 716bf54e11b55..a9ca39b89360c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -25,7 +25,6 @@ AnyArrayLike, ArrayLike, DtypeObj, - FrameOrSeries, FrameOrSeriesUnion, Scalar, ) @@ -59,7 +58,6 @@ ) from pandas.core.dtypes.dtypes import PandasDtype from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCDatetimeArray, ABCExtensionArray, ABCIndex, @@ -1893,138 +1891,3 @@ def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike: for i, value in enumerate(unique_array): indexer += [i] * int(max(l_count[value], r_count[value])) return unique_array.take(indexer) - - -# ------ # -# sample # -# ------ # - - -def preprocess_weights(obj: FrameOrSeries, weights, axis: int) -> np.ndarray: - """ - Process and validate the `weights` argument to `NDFrame.sample` and - `.GroupBy.sample`. - - Returns `weights` as an ndarray[np.float64], validated except for normalizing - weights (because that must be done groupwise in groupby sampling). - """ - # If a series, align with frame - if isinstance(weights, ABCSeries): - weights = weights.reindex(obj.axes[axis]) - - # Strings acceptable if a dataframe and axis = 0 - if isinstance(weights, str): - if isinstance(obj, ABCDataFrame): - if axis == 0: - try: - weights = obj[weights] - except KeyError as err: - raise KeyError( - "String passed to weights not a valid column" - ) from err - else: - raise ValueError( - "Strings can only be passed to " - "weights when sampling from rows on " - "a DataFrame" - ) - else: - raise ValueError( - "Strings cannot be passed as weights when sampling from a Series." - ) - - if isinstance(obj, ABCSeries): - func = obj._constructor - else: - func = obj._constructor_sliced - - weights = func(weights, dtype="float64")._values - - if len(weights) != obj.shape[axis]: - raise ValueError("Weights and axis to be sampled must be of same length") - - if lib.has_infs(weights): - raise ValueError("weight vector may not include `inf` values") - - if (weights < 0).any(): - raise ValueError("weight vector many not include negative values") - - weights[np.isnan(weights)] = 0 - return weights - - -def process_sampling_size( - n: int | None, frac: float | None, replace: bool -) -> int | None: - """ - Process and validate the `n` and `frac` arguments to `NDFrame.sample` and - `.GroupBy.sample`. - - Returns None if `frac` should be used (variable sampling sizes), otherwise returns - the constant sampling size. - """ - # If no frac or n, default to n=1. - if n is None and frac is None: - n = 1 - elif n is not None and frac is not None: - raise ValueError("Please enter a value for `frac` OR `n`, not both") - elif n is not None: - if n < 0: - raise ValueError( - "A negative number of rows requested. Please provide `n` >= 0." - ) - if n % 1 != 0: - raise ValueError("Only integers accepted as `n` values") - else: - assert frac is not None # for mypy - if frac > 1 and not replace: - raise ValueError( - "Replace has to be set to `True` when " - "upsampling the population `frac` > 1." - ) - if frac < 0: - raise ValueError( - "A negative number of rows requested. Please provide `frac` >= 0." - ) - - return n - - -def sample( - obj_len: int, - size: int, - replace: bool, - weights: np.ndarray | None, - random_state: np.random.RandomState, -) -> np.ndarray: - """ - Randomly sample `size` indices in `np.arange(obj_len)` - - Parameters - ---------- - obj_len : int - The length of the indices being considered - size : int - The number of values to choose - replace : bool - Allow or disallow sampling of the same row more than once. - weights : np.ndarray[np.float64] or None - If None, equal probability weighting, otherwise weights according - to the vector normalized - random_state: np.random.RandomState - State used for the random sampling - - Returns - ------- - np.ndarray[np.intp] - """ - if weights is not None: - weight_sum = weights.sum() - if weight_sum != 0: - weights = weights / weight_sum - else: - raise ValueError("Invalid weights: weights sum to zero") - - return random_state.choice(obj_len, size=size, replace=replace, p=weights).astype( - np.intp, copy=False - ) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8075faf458967..f2497c6e65967 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -137,6 +137,7 @@ from pandas.core.missing import find_valid_index from pandas.core.ops import align_method_FRAME from pandas.core.reshape.concat import concat +import pandas.core.sample as sample from pandas.core.shared_docs import _shared_docs from pandas.core.sorting import get_indexer_indexer from pandas.core.window import ( @@ -5275,15 +5276,15 @@ def sample( # Process random_state argument rs = com.random_state(random_state) - size = algos.process_sampling_size(n, frac, replace) + size = sample.process_sampling_size(n, frac, replace) if size is None: assert frac is not None size = round(frac * obj_len) if weights is not None: - weights = algos.preprocess_weights(self, weights, axis) + weights = sample.preprocess_weights(self, weights, axis) - sampled_indices = algos.sample(obj_len, size, replace, weights, rs) + sampled_indices = sample.sample(obj_len, size, replace, weights, rs) result = self.take(sampled_indices, axis=axis) if ignore_index: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6de6d041413cf..8fb50db2e33f2 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -102,6 +102,7 @@ class providing the base-class of operations. MultiIndex, ) from pandas.core.internals.blocks import ensure_block_shape +import pandas.core.sample as sample from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter from pandas.core.util.numba_ import ( @@ -3270,9 +3271,9 @@ def sample( 2 blue 2 0 red 0 """ - size = algorithms.process_sampling_size(n, frac, replace) + size = sample.process_sampling_size(n, frac, replace) if weights is not None: - weights_arr = algorithms.preprocess_weights( + weights_arr = sample.preprocess_weights( self._selected_obj, weights, axis=self.axis ) @@ -3290,7 +3291,7 @@ def sample( assert frac is not None sample_size = round(frac * group_size) - grp_sample = algorithms.sample( + grp_sample = sample.sample( group_size, size=sample_size, replace=replace, diff --git a/pandas/core/sample.py b/pandas/core/sample.py new file mode 100644 index 0000000000000..cfd512581fcc3 --- /dev/null +++ b/pandas/core/sample.py @@ -0,0 +1,141 @@ +from __future__ import annotations + +import numpy as np + +from pandas._libs import lib +from pandas._typing import FrameOrSeries + +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) + + +def preprocess_weights(obj: FrameOrSeries, weights, axis: int) -> np.ndarray: + """ + Process and validate the `weights` argument to `NDFrame.sample` and + `.GroupBy.sample`. + + Returns `weights` as an ndarray[np.float64], validated except for normalizing + weights (because that must be done groupwise in groupby sampling). + """ + # If a series, align with frame + if isinstance(weights, ABCSeries): + weights = weights.reindex(obj.axes[axis]) + + # Strings acceptable if a dataframe and axis = 0 + if isinstance(weights, str): + if isinstance(obj, ABCDataFrame): + if axis == 0: + try: + weights = obj[weights] + except KeyError as err: + raise KeyError( + "String passed to weights not a valid column" + ) from err + else: + raise ValueError( + "Strings can only be passed to " + "weights when sampling from rows on " + "a DataFrame" + ) + else: + raise ValueError( + "Strings cannot be passed as weights when sampling from a Series." + ) + + if isinstance(obj, ABCSeries): + func = obj._constructor + else: + func = obj._constructor_sliced + + weights = func(weights, dtype="float64")._values + + if len(weights) != obj.shape[axis]: + raise ValueError("Weights and axis to be sampled must be of same length") + + if lib.has_infs(weights): + raise ValueError("weight vector may not include `inf` values") + + if (weights < 0).any(): + raise ValueError("weight vector many not include negative values") + + weights[np.isnan(weights)] = 0 + return weights + + +def process_sampling_size( + n: int | None, frac: float | None, replace: bool +) -> int | None: + """ + Process and validate the `n` and `frac` arguments to `NDFrame.sample` and + `.GroupBy.sample`. + + Returns None if `frac` should be used (variable sampling sizes), otherwise returns + the constant sampling size. + """ + # If no frac or n, default to n=1. + if n is None and frac is None: + n = 1 + elif n is not None and frac is not None: + raise ValueError("Please enter a value for `frac` OR `n`, not both") + elif n is not None: + if n < 0: + raise ValueError( + "A negative number of rows requested. Please provide `n` >= 0." + ) + if n % 1 != 0: + raise ValueError("Only integers accepted as `n` values") + else: + assert frac is not None # for mypy + if frac > 1 and not replace: + raise ValueError( + "Replace has to be set to `True` when " + "upsampling the population `frac` > 1." + ) + if frac < 0: + raise ValueError( + "A negative number of rows requested. Please provide `frac` >= 0." + ) + + return n + + +def sample( + obj_len: int, + size: int, + replace: bool, + weights: np.ndarray | None, + random_state: np.random.RandomState, +) -> np.ndarray: + """ + Randomly sample `size` indices in `np.arange(obj_len)` + + Parameters + ---------- + obj_len : int + The length of the indices being considered + size : int + The number of values to choose + replace : bool + Allow or disallow sampling of the same row more than once. + weights : np.ndarray[np.float64] or None + If None, equal probability weighting, otherwise weights according + to the vector normalized + random_state: np.random.RandomState + State used for the random sampling + + Returns + ------- + np.ndarray[np.intp] + """ + if weights is not None: + weight_sum = weights.sum() + if weight_sum != 0: + weights = weights / weight_sum + else: + raise ValueError("Invalid weights: weights sum to zero") + + return random_state.choice(obj_len, size=size, replace=replace, p=weights).astype( + np.intp, copy=False + ) From fe9b028e0dacab5d680f2938dea0b3eba5b28c17 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 27 Jun 2021 20:57:48 -0400 Subject: [PATCH 11/11] Add module comment --- pandas/core/sample.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/sample.py b/pandas/core/sample.py index cfd512581fcc3..4798f385d523c 100644 --- a/pandas/core/sample.py +++ b/pandas/core/sample.py @@ -1,3 +1,6 @@ +""" +Module containing utilities for NDFrame.sample() and .GroupBy.sample() +""" from __future__ import annotations import numpy as np