diff --git a/ci/doctests.sh b/ci/doctests.sh index b3d7f6785815a..16b3430f1e431 100755 --- a/ci/doctests.sh +++ b/ci/doctests.sh @@ -35,7 +35,7 @@ if [ "$DOCTEST" ]; then fi pytest --doctest-modules -v pandas/core/generic.py \ - -k"-_set_axis_name -_xs -describe -droplevel -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -resample -sample -to_json -transpose -values -xs" + -k"-_set_axis_name -_xs -describe -droplevel -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -resample -to_json -transpose -values -xs" if [ $? -ne "0" ]; then RET=1 diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 393e7caae5fab..38555262885ec 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4326,8 +4326,8 @@ def sample(self, n=None, frac=None, replace=False, weights=None, Default = 1 if `frac` = None. frac : float, optional Fraction of axis items to return. Cannot be used with `n`. - replace : boolean, optional - Sample with or without replacement. Default = False. + replace : bool, default False + Sample with or without replacement. weights : str or ndarray-like, optional Default 'None' results in equal probability weighting. If passed a Series, will align with target object on index. Index @@ -4340,7 +4340,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None, being sampled. If weights do not sum to 1, they will be normalized to sum to 1. Missing values in the weights column will be treated as zero. - inf and -inf values not allowed. + Infinite values not allowed. random_state : int or numpy.random.RandomState, optional Seed for the random number generator (if int), or numpy RandomState object. @@ -4350,58 +4350,52 @@ def sample(self, n=None, frac=None, replace=False, weights=None, Returns ------- - A new object of same type as caller. + Series or DataFrame + A new object of same type as caller containing `n` items randomly + sampled from the caller object. 
- Examples + See Also -------- - Generate an example ``Series`` and ``DataFrame``: - - >>> s = pd.Series(np.random.randn(50)) - >>> s.head() - 0 -0.038497 - 1 1.820773 - 2 -0.972766 - 3 -1.598270 - 4 -1.095526 - dtype: float64 - >>> df = pd.DataFrame(np.random.randn(50, 4), columns=list('ABCD')) - >>> df.head() - A B C D - 0 0.016443 -2.318952 -0.566372 -1.028078 - 1 -1.051921 0.438836 0.658280 -0.175797 - 2 -1.243569 -0.364626 -0.215065 0.057736 - 3 1.768216 0.404512 -0.385604 -1.457834 - 4 1.072446 -1.137172 0.314194 -0.046661 - - Next extract a random sample from both of these objects... - - 3 random elements from the ``Series``: - - >>> s.sample(n=3) - 27 -0.994689 - 55 -1.049016 - 67 -0.224565 - dtype: float64 - - And a random 10% of the ``DataFrame`` with replacement: - - >>> df.sample(frac=0.1, replace=True) - A B C D - 35 1.981780 0.142106 1.817165 -0.290805 - 49 -1.336199 -0.448634 -0.789640 0.217116 - 40 0.823173 -0.078816 1.009536 1.015108 - 15 1.421154 -0.055301 -1.922594 -0.019696 - 6 -0.148339 0.832938 1.787600 -1.383767 - - You can use `random state` for reproducibility: - - >>> df.sample(random_state=1) - A B C D - 37 -2.027662 0.103611 0.237496 -0.165867 - 43 -0.259323 -0.583426 1.516140 -0.479118 - 12 -1.686325 -0.579510 0.985195 -0.460286 - 8 1.167946 0.429082 1.215742 -1.636041 - 9 1.197475 -0.864188 1.554031 -1.505264 + Examples + -------- + >>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0], + ... 'num_wings': [2, 0, 0, 0], + ... 'num_specimen_seen': [10, 2, 1, 8]}, + ... index=['falcon', 'dog', 'spider', 'fish']) + >>> df + num_legs num_wings num_specimen_seen + falcon 2 2 10 + dog 4 0 2 + spider 8 0 1 + fish 0 0 8 + + Extract 3 random elements from the ``Series`` ``df['num_legs']``. + Note that we use `random_state` to ensure the reproducibility of + the examples:
+ + >>> df['num_legs'].sample(n=3, random_state=1) + fish 0 + spider 8 + falcon 2 + Name: num_legs, dtype: int64 + + A random 50% sample of the ``DataFrame`` with replacement: + + >>> df.sample(frac=0.5, replace=True, random_state=1) + num_legs num_wings num_specimen_seen + dog 4 0 2 + fish 0 0 8 + + Using a DataFrame column as weights. Rows with larger values in the + `num_specimen_seen` column are more likely to be sampled. + + >>> df.sample(n=2, weights='num_specimen_seen', random_state=1) + num_legs num_wings num_specimen_seen + falcon 2 2 10 + fish 0 0 8 """ if axis is None: