ENH: Add sample function with tests and docs (GH2419)

nickeubank · jreback · commit 8f0f417cbe67 · 2015-05-01T08:03:49.000-04:00
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -390,6 +390,7 @@ Reindexing / Selection / Label manipulation
    Series.reindex_like
    Series.rename
    Series.reset_index
+   Series.sample
    Series.select
    Series.take
    Series.tail
@@ -824,6 +825,7 @@ Reindexing / Selection / Label manipulation
    DataFrame.reindex_like
    DataFrame.rename
    DataFrame.reset_index
+   DataFrame.sample
    DataFrame.select
    DataFrame.set_index
    DataFrame.tail
@@ -1072,6 +1074,7 @@ Reindexing / Selection / Label manipulation
    Panel.reindex_axis
    Panel.reindex_like
    Panel.rename
+   Panel.sample
    Panel.select
    Panel.take
    Panel.truncate
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
@@ -508,6 +508,81 @@ A list of indexers where any element is out of bounds will raise an
 
 .. _indexing.basics.partial_setting:
 
+Selecting Random Samples
+------------------------
+.. versionadded:: 0.16.1
+
+A random selection of rows or columns from a Series, DataFrame, or Panel can be obtained with the :meth:`~DataFrame.sample` method. The method will sample rows by default, and accepts a specific number of rows/columns to return, or a fraction of rows.
+
+.. ipython :: python
+
+    s = Series([0,1,2,3,4,5])
+
+    # When no arguments are passed, returns 1 row.
+    s.sample()
+
+    # One may specify either a number of rows:
+    s.sample(n=3)
+
+    # Or a fraction of the rows:
+    s.sample(frac=0.5)
+
+By default, ``sample`` will return each row at most once, but one can also sample with replacement
+using the ``replace`` option:
+
+.. ipython :: python
+
+    s = Series([0,1,2,3,4,5])
+
+    # Without replacement (default):
+    s.sample(n=6, replace=False)
+
+    # With replacement:
+    s.sample(n=6, replace=True)
+
+
+By default, each row has an equal probability of being selected, but if you want rows
+to have different probabilities, you can pass the ``sample`` function sampling weights as
+``weights``. These weights can be a list, a numpy array, or a Series, but they must be of the same length as the object you are sampling. Missing values will be treated as a weight of zero, and inf values are not allowed. If weights do not sum to 1, they will be re-normalized by dividing all weights by the sum of the weights. For example:
+
+.. ipython :: python
+
+    s = Series([0,1,2,3,4,5])
+    example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
+    s.sample(n=3, weights=example_weights)
+
+    # Weights will be re-normalized automatically
+    example_weights2 = [0.5, 0, 0, 0, 0, 0]
+    s.sample(n=1, weights=example_weights2)
+
+When applied to a DataFrame, you can use a column of the DataFrame as sampling weights
+(provided you are sampling rows and not columns) by simply passing the name of the column
+as a string.
+
+.. ipython :: python
+
+    df2 = DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]})
+    df2.sample(n = 3, weights = 'weight_column')
+
+``sample`` also allows users to sample columns instead of rows using the ``axis`` argument.
+
+.. ipython :: python
+
+    df3 = DataFrame({'col1':[1,2,3], 'col2':[2,3,4]})
+    df3.sample(n=1, axis=1)
+
+Finally, one can also set a seed for ``sample``'s random number generator using the ``random_state`` argument, which will accept either an integer (as a seed) or a numpy RandomState object.
+
+.. ipython :: python
+
+    df4 = DataFrame({'col1':[1,2,3], 'col2':[2,3,4]})
+
+    # With a given seed, the sample will always draw the same rows.
+    df4.sample(n=2, random_state=2)
+    df4.sample(n=2, random_state=2)
+
+
+
 Setting With Enlargement
 ------------------------
 
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
@@ -12,11 +12,12 @@ Highlights include:
 - Support for a ``CategoricalIndex``, a category based index, see :ref:`here <whatsnew_0161.enhancements.categoricalindex>`
 - New section on how-to-contribute to *pandas*, see :ref`here <contributing>`
 
+- New method ``sample`` for drawing random samples from Series, DataFrames and Panels. See :ref:`here <whatsnew_0161.enhancements.sample>`
+
 .. contents:: What's new in v0.16.1
     :local:
     :backlinks: none
 
-
 .. _whatsnew_0161.enhancements:
 
 Enhancements
@@ -138,6 +139,48 @@ values NOT in the categories, similarly to how you can reindex ANY pandas index.
 
 See the :ref:`documentation <advanced.categoricalindex>` for more. (:issue:`7629`)
 
+.. _whatsnew_0161.enhancements.sample:
+
+Sample
+^^^^^^
+
+Series, DataFrames, and Panels now have a new method: :meth:`~pandas.DataFrame.sample`.
+The method accepts a specific number of rows or columns to return, or a fraction of the
+total number of rows or columns. It also has options for sampling with or without replacement,
+for passing in a column for weights for non-uniform sampling, and for setting seed values to
+facilitate replication. (:issue:`2419`)
+
+.. ipython :: python
+
+   example_series = Series([0,1,2,3,4,5])
+
+   # When no arguments are passed, returns 1 row.
+   example_series.sample()
+
+   # One may specify either a number of rows:
+   example_series.sample(n=3)
+
+   # Or a fraction of the rows:
+   example_series.sample(frac=0.5)
+
+   # weights are accepted.
+   example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
+   example_series.sample(n=3, weights=example_weights)
+
+   # weights will also be normalized if they do not sum to one,
+   # and missing values will be treated as zeros.
+   example_weights2 = [0.5, 0, 0, 0, None, np.nan]
+   example_series.sample(n=1, weights=example_weights2)
+
+
+When applied to a DataFrame, one may pass the name of a column to specify sampling weights
+when sampling from rows.
+
+.. ipython :: python
+
+   df = DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]})
+   df.sample(n=3, weights='weight_column')
+
 .. _whatsnew_0161.api:
 
 API changes
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -3319,3 +3319,30 @@ def _maybe_match_name(a, b):
     if a_name == b_name:
         return a_name
     return None
+
+def _random_state(state=None):
+    """
+    Normalize a ``random_state`` argument into a numpy RandomState.
+
+    Parameters
+    ----------
+    state : int, np.random.RandomState or None, default None
+        An int seeds a new np.random.RandomState; an existing
+        np.random.RandomState is returned as is; None yields a new one.
+
+    Returns
+    -------
+    np.random.RandomState
+
+    Raises
+    ------
+    ValueError
+        If `state` is not an int, a RandomState or None.
+    """
+    if state is None:
+        return np.random.RandomState()
+    if isinstance(state, np.random.RandomState):
+        return state
+    if is_integer(state):
+        return np.random.RandomState(state)
+    raise ValueError("random_state must be an integer, a numpy RandomState, or None")
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -1948,6 +1948,103 @@ def tail(self, n=5):
             return self
         return self.iloc[-n:]
 
+
+    def sample(self, n=None, frac=None, replace=False, weights=None, random_state=None, axis=None):
+        """
+        Returns a random sample of items from an axis of object.
+
+        Parameters
+        ----------
+        n : int, optional
+            Number of items from axis to return. Cannot be used with `frac`.
+            Default = 1 if `frac` = None.
+        frac : float, optional
+            Fraction of axis items to return. Cannot be used with `n`.
+        replace : boolean, optional
+            Sample with or without replacement. Default = False.
+        weights : str or ndarray-like, optional
+            Default 'None' results in equal probability weighting.
+            If called on a DataFrame, will accept the name of a column
+            when axis = 0.
+            Weights must be same length as axis being sampled.
+            If weights do not sum to 1, they will be normalized to sum to 1.
+            Missing values in the weights column will be treated as zero.
+            inf, -inf and negative values not allowed; sum must be non-zero.
+        random_state : int or numpy.random.RandomState, optional
+            Seed for the random number generator (if int), or numpy RandomState
+            object.
+        axis : int or string, optional
+            Axis to sample. Accepts axis number or name. Default is stat axis
+            for given data type (0 for Series and DataFrames, 1 for Panels).
+
+        Returns
+        -------
+        Same type as caller.
+        """
+        if axis is None:
+            axis = self._stat_axis_number
+
+        axis = self._get_axis_number(axis)
+        axis_length = self.shape[axis]
+
+        # Process random_state argument
+        rs = com._random_state(random_state)
+
+        # Check weights for compliance
+        if weights is not None:
+            # Strings acceptable if a dataframe and axis = 0
+            if isinstance(weights, string_types):
+                if isinstance(self, pd.DataFrame):
+                    if axis == 0:
+                        try:
+                            weights = self[weights]
+                        except KeyError:
+                            raise KeyError("String passed to weights not a valid column")
+                    else:
+                        raise ValueError("Strings can only be passed to weights when sampling from rows on a DataFrame")
+                else:
+                    raise ValueError("Strings cannot be passed as weights when sampling from a Series or Panel.")
+
+            weights = pd.Series(weights, dtype='float64')
+
+            if len(weights) != axis_length:
+                raise ValueError("Weights and axis to be sampled must be of same length")
+
+            if (weights == np.inf).any() or (weights == -np.inf).any():
+                raise ValueError("weight vector may not include `inf` values")
+
+            if (weights < 0).any():
+                raise ValueError("weight vector may not include negative values")
+
+            # If has nan, set to zero.
+            weights = weights.fillna(0)
+
+            # Renormalize if don't sum to 1; all-zero weights would give 0 / 0 = NaN
+            weight_sum = weights.sum()
+            if weight_sum == 0:
+                raise ValueError("Invalid weights: weights sum to zero")
+            if weight_sum != 1:
+                weights = weights / weight_sum
+            weights = weights.values
+
+        # If no frac or n, default to n=1.
+        if n is None and frac is None:
+            n = 1
+        elif n is not None and frac is None and n % 1 != 0:
+            raise ValueError("Only integers accepted as `n` values")
+        elif n is None and frac is not None:
+            n = int(round(frac * axis_length))
+        elif n is not None and frac is not None:
+            raise ValueError('Please enter a value for `frac` OR `n`, not both')
+
+        # Check for negative sizes
+        if n < 0:
+            raise ValueError("A negative number of rows requested. Please provide positive value.")
+
+        locs = rs.choice(axis_length, size=n, replace=replace, p=weights)
+        return self.take(locs, axis=axis)
+
+
     #----------------------------------------------------------------------
     # Attribute access
 
@@ -3395,7 +3492,7 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
 
                 matches = (new_other == np.array(other))
                 if matches is False or not matches.all():
-                    
+
                     # coerce other to a common dtype if we can
                     if com.needs_i8_conversion(self.dtype):
                         try:
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
@@ -524,6 +524,26 @@ def test_is_recompilable():
     for f in fails:
         assert not com.is_re_compilable(f)
 
+def test_random_state():
+    """Exercise each input kind accepted or rejected by com._random_state."""
+    from numpy.random import RandomState
+
+    # An integer is used as the seed of a fresh RandomState
+    seeded = com._random_state(5)
+    assert_equal(seeded.uniform(), RandomState(5).uniform())
+
+    # A RandomState instance yields the same draws as an equally seeded one
+    existing = RandomState(10)
+    assert_equal(com._random_state(existing).uniform(), RandomState(10).uniform())
+
+    # No argument still produces a RandomState
+    assert isinstance(com._random_state(), RandomState)
+
+    # Strings and floats are rejected with ValueError
+    for invalid in ('test', 5.5):
+        with tm.assertRaises(ValueError):
+            com._random_state(invalid)
+
 
 class TestTake(tm.TestCase):
     # standard incompatible fill error
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py