fixed small issues jorisvandenbossche noted

Nick Eubank · Nick Eubank · commit ff4f44211b37 · 2015-04-25T15:56:34.000-07:00
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
@@ -516,29 +516,29 @@ A random selection of rows or columns from a Series, DataFrame, or Panel with th
 
 .. ipython :: python
 
-   s = Series([0,1,2,3,4,5])
+    s = Series([0,1,2,3,4,5])
 
-   # When no arguments are passed, returns 1 row.
-   s.sample()
-   
-   # One may specify either a number of rows:
-   s.sample(n = 3)
+    # When no arguments are passed, returns 1 row.
+    s.sample()
+    
+    # One may specify either a number of rows:
+    s.sample(n=3)
    
-   # Or a fraction of the rows:
-   s.sample(frac = 0.5)
+    # Or a fraction of the rows:
+    s.sample(frac=0.5)
 
 By default, ``sample`` will return each row at most once, but one can also sample with replacement
 using the ``replace`` option:
 
 .. ipython :: python
 
    s = Series([0,1,2,3,4,5])
-		
-   # Without replacement (default):
-   s.sample(n = 6, replace = False)
-	
-   # With replacement:
-   s.sample(n = 6, replace = True)
+
+    # Without replacement (default):
+    s.sample(n=6, replace=False)
+
+    # With replacement:
+    s.sample(n=6, replace=True)
 
 
 By default, each row has an equal probability of being selected, but if you want rows
@@ -549,37 +549,37 @@ to have different probabilities, you can pass the ``sample`` function sampling w
 
     s = Series([0,1,2,3,4,5])
     example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
-    s.sample(n=3, weights = example_weights)
-	
-	# Weights will be re-normalized automatically
-	example_weights2 = [0.5, 0, 0, 0, 0, 0]
-	s.sample(n=1, weights= example_weights2)
+    s.sample(n=3, weights=example_weights)
+    
+    # Weights will be re-normalized automatically
+    example_weights2 = [0.5, 0, 0, 0, 0, 0]
+    s.sample(n=1, weights=example_weights2)
 
 When applied to a DataFrame, you can use a column of the DataFrame as sampling weights
 (provided you are sampling rows and not columns) by simply passing the name of the column 
 as a string.
-	
+    
 .. ipython :: python
 
-   df2 = DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]})
-   df2.sample(n = 3, weights = 'weight_column')
+    df2 = DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]})
+    df2.sample(n = 3, weights = 'weight_column')
 
 ``sample`` also allows users to sample columns instead of rows using the ``axis`` argument. 
 
 .. 	ipython :: python
 
-	df3 = DataFrame({'col1':[1,2,3], 'col2':[2,3,4]})
-	df3.sample(n=1, axis = 1)
+    df3 = DataFrame({'col1':[1,2,3], 'col2':[2,3,4]})
+    df3.sample(n=1, axis=1)
 
 Finally, one can also set a seed for ``sample``'s random number generator using the ``random_state`` argument, which will accept either an integer (as a seed) or a numpy RandomState object. 
 
 .. 	ipython :: python
 
-	df4 = DataFrame({'col1':[1,2,3], 'col2':[2,3,4]})
+    df4 = DataFrame({'col1':[1,2,3], 'col2':[2,3,4]})
 
-	# With a given seed, the sample will always draw the same rows. 
-	df4.sample(n=2, random_state = 2)
-	df4.sample(n=2, random_state = 2)
+    # With a given seed, the sample will always draw the same rows. 
+    df4.sample(n=2, random_state=2)
+    df4.sample(n=2, random_state=2)
 
 
 
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
@@ -20,7 +20,6 @@ Highlights include:
 
 Enhancements
 ~~~~~~~~~~~~
-.. _whatsnew_0161.enhancements.sample:
 
 - Added ``StringMethods.capitalize()`` and ``swapcase`` which behave as the same as standard ``str`` (:issue:`9766`)
 - Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)
@@ -136,10 +135,12 @@ values NOT in the categories, similarly to how you can reindex ANY pandas index.
 
 See the :ref:`documentation <advanced.categoricalindex>` for more. (:issue:`7629`)
 
+.. _whatsnew_0161.enhancements.sample:
+
 Sample
 ^^^^^^^^^^^^^^^^
 
-Series, DataFrames, and Panels now have a new method: :meth:`~pandas.core.sample`.
+Series, DataFrames, and Panels now have a new method: :meth:`~pandas.DataFrame.sample`.
 The method accepts a specific number of rows or columns to return, or a fraction of the 
 total number of rows or columns. It also has options for sampling with or without replacement, 
 for passing in a column for weights for non-uniform sampling, and for setting seed values to facilitate replication. 
@@ -148,32 +149,32 @@ for passing in a column for weights for non-uniform sampling, and for setting se
 
    example_series = Series([0,1,2,3,4,5])
 
-   # When no arguments are passed, returns 5 rows like .head() or .tail()
+   # When no arguments are passed, returns 1 row.
    example_series.sample()
    
    # One may specify either a number of rows:
-   example_series.sample(n = 3)
+   example_series.sample(n=3)
    
    # Or a fraction of the rows:
-   example_series.sample(frac = 0.5)
+   example_series.sample(frac=0.5)
 
    # weights are accepted. 
    example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
-   example_series.sample(n=3, weights = example_weights)
+   example_series.sample(n=3, weights=example_weights)
 
    # weights will also be normalized if they do not sum to one, 
    # and missing values will be treated as zeros. 
    example_weights2 = [0.5, 0, 0, 0, None, np.nan]
-   example_series.sample(n=1, weights = example_weights2)
+   example_series.sample(n=1, weights=example_weights2)
 
 
-When applied to a DataFrame, one may pass the name of a column to specify sampling weights,
-although note that the value of the weights column must sum to one. 
+When applied to a DataFrame, one may pass the name of a column to specify sampling weights
+when sampling from rows. 
 	
 .. ipython :: python
 
    df = DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]})
-   df.sample(n = 3, weights = 'weight_column')
+   df.sample(n=3, weights='weight_column')
 
 .. _whatsnew_0161.api:
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -1959,13 +1959,14 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No
             Number of rows to return. Cannot be used with `frac`.
             Default = 1 if `frac` = None. 
         frac : float, optional
-            Share of rows to return. Cannot be used with `n`. 
+            Fraction of rows to return. Cannot be used with `n`. 
         replace : boolean, optional
             Sample with or without replacement. Default = False. 
         weights : str or ndarray-like, optional
             Default 'None' results in equal probability weighting. 
-            If called on a DataFrame or Panel, will also accept the name of a 
-            column as a string. Must be same length as index. 
+            If called on a DataFrame, will accept the name of a column
+            when axis = 0. 
+            Weights must be same length as axis being sampled. 
             If weights do not sum to 1, they will be normalized to sum to 1. 
             Missing values in the weights column will be treated as zero. 
             inf and -inf values not allowed. 
@@ -2003,17 +2004,18 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No
         # Check weights for compliance
         if weights is not None:
                                       
-            # Strings acceptable if not a series
+            # Strings acceptable if a dataframe and axis = 0
             if isinstance(weights, string_types): 
-
-                if self.ndim > 1 :
-                    try:
-                        weights = self[weights]
-                    except KeyError:
-                        raise KeyError("String passed to weights not a valid column name")
-
+                if isinstance(self, pd.DataFrame):
+                    if axis == 0:
+                        try:
+                            weights = self[weights]
+                        except KeyError:
+                            raise KeyError("String passed to weights not a valid column")
+                    else: 
+                        raise ValueError("Strings can only be passed to weights when sampling from rows on a DataFrame")
                 else:
-                    raise ValueError("Strings cannot be passed as weights when sampling from a Series.")
+                    raise ValueError("Strings cannot be passed as weights when sampling from a Series or Panel.")
 
             #normalize format of weights to ndarray. 
             weights = pd.Series(weights, dtype = 'float64')
@@ -2022,8 +2024,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No
             if len(weights) != axis_length:
                 raise ValueError("Weights and axis to be sampled must be of same length")
 
-            # No infs allowed. The np.nan_to_num() command below would make these large values
-            # which is pretty unintuitive. 
+            # No infs allowed.
             if (weights == np.inf).any() or (weights == -np.inf).any():
                 raise ValueError("weight vector may not include `inf` values")
                 
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
@@ -431,10 +431,6 @@ def test_sample(self):
             weights_with_ninf[0] =  -np.inf
             o.sample(n=3, weights=weights_with_ninf)
         
-        # Ensure proper error if string given as weight for Series
-        s = Series(range(10))
-        with tm.assertRaises(ValueError):
-            s.sample(n=3, weights='weight_column')
 
         # A few dataframe test with degenerate weights. 
         easy_weight_list = [0]*10
@@ -447,29 +443,32 @@ def test_sample(self):
         sample1 = df.sample(n=1, weights='easyweights') 
         assert_frame_equal(sample1, df.iloc[5:6])
 
+        # Ensure proper error if string given as weight for Series, panel, or 
+        # DataFrame with axis = 1.
+        s = Series(range(10))
+        with tm.assertRaises(ValueError):
+            s.sample(n=3, weights='weight_column')
+
+        panel = pd.Panel(items = [0,1,2], major_axis = [2,3,4], minor_axis = [3,4,5])
+        with tm.assertRaises(ValueError):
+            panel.sample(n=1, weights='weight_column')
+
+        with tm.assertRaises(ValueError):
+            df.sample(n=1, weights='weight_column', axis = 1)
+
         # Check weighting key error        
         with tm.assertRaises(KeyError):
             df.sample(n=3, weights='not_a_real_column_name')
  
          # Check np.nan are replaced by zeros. 
         weights_with_nan = [np.nan]*10
         weights_with_nan[5] = 0.5
-
-        sampled_df = df.sample(n=1, weights = weights_with_nan)
-        tm.assert_frame_equal(sampled_df, df.iloc[5:6])
-        
-        sampled_s = s.sample(n=1, weights = weights_with_nan)
-        tm.assert_series_equal(sampled_s, s.iloc[5:6])
+        self._compare(o.sample(n=1, weights=weights_with_nan), o.iloc[5:6])
     
         # Check None are also replaced by zeros. 
         weights_with_None = [None]*10        
         weights_with_None[5] = 0.5
-
-        sampled_df2 = df.sample(n=1, weights = weights_with_None)
-        tm.assert_frame_equal(sampled_df2, df.iloc[5:6])
-
-        sampled_s2 = s.sample(n=1, weights = weights_with_None)
-        tm.assert_series_equal(sampled_s2, s.iloc[5:6])
+        self._compare(o.sample(n=1, weights=weights_with_None), o.iloc[5:6])
 
         # Check that re-normalizes weights that don't sum to one.
         weights_less_than_1 = [0]*10