Add sample function with tests and docs

nickeubank · Nick Eubank · commit 8b506a38823a · 2015-04-24T21:12:10.000-07:00
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -390,6 +390,7 @@ Reindexing / Selection / Label manipulation
    Series.reindex_like
    Series.rename
    Series.reset_index
+   Series.sample
    Series.select
    Series.take
    Series.tail
@@ -713,6 +714,7 @@ Indexing, iteration
    DataFrame.where
    DataFrame.mask
    DataFrame.query
+   DataFrame.sample
 
 For more information on ``.at``, ``.iat``, ``.ix``, ``.loc``, and
 ``.iloc``,  see the :ref:`indexing documentation <indexing>`.
@@ -823,6 +825,7 @@ Reindexing / Selection / Label manipulation
    DataFrame.reindex_like
    DataFrame.rename
    DataFrame.reset_index
+   DataFrame.sample
    DataFrame.select
    DataFrame.set_index
    DataFrame.tail
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
@@ -508,6 +508,81 @@ A list of indexers where any element is out of bounds will raise an
 
 .. _indexing.basics.partial_setting:
 
+Selecting Random Samples
+------------------------
+.. versionadded:: 0.16.1
+
+A random selection of rows or columns from a Series, DataFrame, or Panel can be obtained with the ``.sample()`` method. The method samples rows by default, and accepts either a specific number of rows/columns to return or a fraction of rows.
+
+.. ipython :: python
+
+   s = Series([0,1,2,3,4,5])
+
+   # When no arguments are passed, returns 1 row.
+   s.sample()
+   
+   # One may specify either a number of rows:
+   s.sample(n = 3)
+   
+   # Or a fraction of the rows:
+   s.sample(frac = 0.5)
+
+By default, ``sample`` will return each row at most once, but one can also sample with replacement
+using the ``replace`` option:
+
+.. ipython :: python
+
+   s = Series([0,1,2,3,4,5])
+		
+   # Without replacement (default):
+   s.sample(n = 6, replace = False)
+	
+   # With replacement:
+   s.sample(n = 6, replace = True)
+
+
+By default, each row has an equal probability of being selected, but if you want rows
+to have different probabilities, you can pass the ``sample`` function sampling weights as 
+``weights``. These weights can be a list, a numpy array, or a Series, but they must be of the same length as the object you are sampling. Missing values will be treated as a weight of zero, and inf values are not allowed. If weights do not sum to 1, they will be re-normalized by dividing all weights by the sum of the weights. For example:
+
+.. ipython :: python
+
+   s = Series([0,1,2,3,4,5])
+   example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
+   s.sample(n=3, weights = example_weights)
+
+   # Weights will be re-normalized automatically
+   example_weights2 = [0.5, 0, 0, 0, 0, 0]
+   s.sample(n=1, weights= example_weights2)
+
+When applied to a DataFrame, you can use a column of the DataFrame as sampling weights
+(provided you are sampling rows and not columns) by simply passing the name of the column 
+as a string.
+	
+.. ipython :: python
+
+   df2 = DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]})
+   df2.sample(n = 3, weights = 'weight_column')
+
+``sample`` also allows users to sample columns instead of rows using the ``axis`` argument. 
+
+.. ipython :: python
+
+   df3 = DataFrame({'col1':[1,2,3], 'col2':[2,3,4]})
+   df3.sample(n=1, axis = 1)
+
+Finally, one can also set a seed for ``sample``'s random number generator using the ``random_state`` argument, which will accept either an integer (as a seed) or a numpy RandomState object. 
+
+.. ipython :: python
+
+   df4 = DataFrame({'col1':[1,2,3], 'col2':[2,3,4]})
+
+   # With a given seed, the sample will always draw the same rows.
+   df4.sample(n=2, random_state = 2)
+   df4.sample(n=2, random_state = 2)
+
+
+
 Setting With Enlargement
 ------------------------
 
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
@@ -20,6 +20,7 @@ Highlights include:
 
 Enhancements
 ~~~~~~~~~~~~
+.. _whatsnew_0161.enhancements.sample:
 
 - Added ``StringMethods.capitalize()`` and ``swapcase`` which behave as the same as standard ``str`` (:issue:`9766`)
 - Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)
@@ -135,6 +136,45 @@ values NOT in the categories, similarly to how you can reindex ANY pandas index.
 
 See the :ref:`documentation <advanced.categoricalindex>` for more. (:issue:`7629`)
 
+Sample
+^^^^^^^^^^^^^^^^
+
+Series, DataFrames, and Panels now have a new method: :meth:`~pandas.DataFrame.sample`.
+The method accepts a specific number of rows or columns to return, or a fraction of the
+total number of rows or columns. It also has options for sampling with or without replacement,
+for passing in a column for weights for non-uniform sampling, and for setting seed values to facilitate replication.
+
+.. ipython :: python
+
+   example_series = Series([0,1,2,3,4,5])
+
+   # When no arguments are passed, returns 1 row.
+   example_series.sample()
+   
+   # One may specify either a number of rows:
+   example_series.sample(n = 3)
+   
+   # Or a fraction of the rows:
+   example_series.sample(frac = 0.5)
+
+   # weights are accepted. 
+   example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
+   example_series.sample(n=3, weights = example_weights)
+
+   # weights will also be normalized if they do not sum to one, 
+   # and missing values will be treated as zeros. 
+   example_weights2 = [0.5, 0, 0, 0, None, np.nan]
+   example_series.sample(n=1, weights = example_weights2)
+
+
+When applied to a DataFrame, one may pass the name of a column to specify sampling weights.
+As with explicitly passed weights, the column's values are re-normalized if they do not sum to one.
+	
+.. ipython :: python
+
+   df = DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]})
+   df.sample(n = 3, weights = 'weight_column')
+
 .. _whatsnew_0161.api:
 
 API changes
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -3319,3 +3319,14 @@ def _maybe_match_name(a, b):
     if a_name == b_name:
         return a_name
     return None
+
+def _random_state(state):
+    """
+    Helper to turn a `random_state` argument into a numpy RandomState.
+
+    Parameters
+    ----------
+    state : int, numpy.random.RandomState or None
+        If an integer (Python int or numpy integer scalar), it is used as
+        the seed of a new RandomState.
+        If a RandomState, it is returned unchanged.
+        If None, a new RandomState without an explicit seed is returned.
+
+    Returns
+    -------
+    numpy.random.RandomState
+
+    Raises
+    ------
+    ValueError
+        If `state` is none of the accepted types.
+    """
+    # np.integer covers numpy scalar seeds (e.g. np.int64), which are not
+    # instances of the builtin int on every platform / Python version.
+    if isinstance(state, (int, np.integer)):
+        return np.random.RandomState(state)
+    elif isinstance(state, np.random.RandomState):
+        return state
+    elif state is None:
+        return np.random.RandomState()
+    else:
+        raise ValueError("random_state must be either an integer or numpy RandomState")
+
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -1948,6 +1948,118 @@ def tail(self, n=5):
             return self
         return self.iloc[-n:]
 
+  
+    def sample(self, n=None, frac=None, replace=False, weights=None,
+               random_state=None, axis=0):
+        """
+        Returns a random sample of items from an axis of object.
+
+        Parameters
+        ----------
+        n : int, optional
+            Number of items from axis to return. Cannot be used with `frac`.
+            Default = 1 if `frac` = None.
+        frac : float, optional
+            Fraction of axis items to return. Cannot be used with `n`.
+        replace : boolean, optional
+            Sample with or without replacement. Default = False.
+        weights : str or ndarray-like, optional
+            Default 'None' results in equal probability weighting.
+            If called on an object with more than one dimension (e.g. a
+            DataFrame) and sampling along axis 0, will also accept the name
+            of a column as a string.
+            Must be same length as the axis being sampled.
+            If weights do not sum to 1, they will be normalized to sum to 1.
+            Missing values in the weights will be treated as zero.
+            inf, -inf and negative values are not allowed, and the weights
+            may not all be zero.
+        random_state : int or numpy.random.RandomState, optional
+            Seed for the random number generator (if int), or numpy RandomState
+            object.
+        axis : int or string, optional
+            Axis to sample. Accepts axis number or name. Default = 0.
+
+        Returns
+        -------
+        Same type as caller.
+
+        Raises
+        ------
+        ValueError
+            If both `n` and `frac` are given, if `n` is negative or not a
+            whole number, or if `weights` is invalid (wrong length, contains
+            inf or negative values, sums to zero, or is a string where a
+            string is not allowed).
+        KeyError
+            If a string passed as `weights` is not a valid column name.
+        """
+
+        # Resolve an axis name or number to an axis number.
+        axis = self._get_axis_number(axis)
+
+        # Number of items available along the axis being sampled.
+        axis_length = self.shape[axis]
+
+        # Turn the random_state argument into a RandomState object.
+        rs = com._random_state(random_state)
+
+        ###
+        # Process weights argument
+        ###
+
+        if weights is not None:
+
+            # A string names a column holding the weights; this only makes
+            # sense for multi-dimensional objects sampled along axis 0.
+            if isinstance(weights, string_types):
+
+                if self.ndim > 1:
+                    if axis != 0:
+                        raise ValueError("Strings can only be passed to weights when sampling rows (axis = 0).")
+                    try:
+                        weights = self[weights]
+                    except KeyError:
+                        raise KeyError("String passed to weights not a valid column name")
+
+                else:
+                    raise ValueError("Strings cannot be passed as weights when sampling from a Series.")
+
+            # Normalize format of weights to a float Series.
+            weights = pd.Series(weights, dtype='float64')
+
+            # Check length (numpy does this, but has confusing errors with
+            # different argument labels.)
+            if len(weights) != axis_length:
+                raise ValueError("Weights and axis to be sampled must be of same length")
+
+            # No infs allowed: they cannot be normalized into probabilities.
+            if (weights == np.inf).any() or (weights == -np.inf).any():
+                raise ValueError("weight vector may not include `inf` values")
+
+            if (weights < 0).any():
+                raise ValueError("weight vector may not include negative values")
+
+            # Missing weights count as zero. Already know there are no infs.
+            weights = weights.fillna(0)
+
+            # Renormalize so the weights form a probability vector. An
+            # all-zero vector cannot be normalized (it would become all-NaN),
+            # so reject it with an explicit message.
+            weight_total = weights.sum()
+            if weight_total == 0:
+                raise ValueError("Invalid weights: weights sum to zero")
+            if weight_total != 1:
+                weights = weights / weight_total
+
+            # Hand numpy a plain ndarray of probabilities.
+            weights = weights.values
+
+        ###
+        # Process n and frac arguments
+        ###
+
+        if n is not None and frac is not None:
+            raise ValueError('Please enter a value for `frac` OR `n`, not both')
+
+        if n is None and frac is None:
+            # Neither given: default to a single item.
+            n = 1
+        elif n is None:
+            n = int(round(frac * axis_length))
+        elif n % 1 != 0:
+            raise ValueError("Only integers accepted as `n` values")
+
+        # Check for negative sizes
+        if n < 0:
+            raise ValueError("A negative number of rows requested. Please provide positive value.")
+
+        # Draw positional locations along the axis and select them.
+        locs = rs.choice(axis_length, size=n, replace=replace, p=weights)
+        return self.take(locs, axis=axis)
+
+    
     #----------------------------------------------------------------------
     # Attribute access
 
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py