weight tweaks

Nick Eubank · Nick Eubank · commit 74debcc3e2e6 · 2015-04-25T15:49:09.000-07:00
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
@@ -168,14 +168,18 @@ for passing in a column for weights for non-uniform sampling, and for setting se
    example_series.sample(n=1, weights=example_weights2)
 
 
-When applied to a DataFrame, one may pass the name of a column to specify sampling weights,
-although note that the value of the weights column must sum to one. 
+When applied to a DataFrame, one may pass the name of a column to specify sampling weights
+when sampling from rows (though row names may not be passed as weights when sampling from columns).
 	
 .. ipython :: python
 
-   df = DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]})
+   df = DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]}, index=['a', 'b', 'c', 'd'])
    df.sample(n=3, weights='weight_column')
 
+   df.sample(n=3, weights='weight_column', axis=0)
+
+
+
 .. _whatsnew_0161.api:
 
 API changes
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -1964,8 +1964,9 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No
             Sample with or without replacement. Default = False. 
         weights : str or ndarray-like, optional
             Default 'None' results in equal probability weighting. 
-            If called on a DataFrame or Panel, will also accept the name of a 
-            column as a string. Must be same length as index. 
+            If called on a DataFrame, will accept the name of a column
+            when axis = 0. 
+            Weights must be same length as axis being sampled. 
             If weights do not sum to 1, they will be normalized to sum to 1. 
             Missing values in the weights column will be treated as zero. 
             inf and -inf values not allowed. 
@@ -2003,17 +2004,18 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No
         # Check weights for compliance
         if weights is not None:
                                       
-            # Strings acceptable if not a series
+            # Strings acceptable if a dataframe and axis = 0
             if isinstance(weights, string_types): 
-
-                if self.ndim > 1 :
-                    try:
-                        weights = self[weights]
-                    except KeyError:
-                        raise KeyError("String passed to weights not a valid name for an item in specified axis")
-
+                if isinstance(self, pd.DataFrame):
+                    if axis == 0:
+                        try:
+                            weights = self[weights]
+                        except KeyError:
+                            raise KeyError("String passed to weights not a valid column")
+                    else: 
+                        raise ValueError("Strings can only be passed to weights when sampling from rows on a DataFrame")
                 else:
-                    raise ValueError("Strings cannot be passed as weights when sampling from a Series.")
+                    raise ValueError("Strings cannot be passed as weights when sampling from a Series or Panel.")
 
             #normalize format of weights to ndarray. 
             weights = pd.Series(weights, dtype = 'float64')
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
@@ -431,10 +431,6 @@ def test_sample(self):
             weights_with_ninf[0] =  -np.inf
             o.sample(n=3, weights=weights_with_ninf)
         
-        # Ensure proper error if string given as weight for Series
-        s = Series(range(10))
-        with tm.assertRaises(ValueError):
-            s.sample(n=3, weights='weight_column')
 
         # A few dataframe test with degenerate weights. 
         easy_weight_list = [0]*10
@@ -443,33 +439,36 @@ def test_sample(self):
         df = pd.DataFrame({'col1':range(10,20), 
                            'col2':range(20,30), 
                            'colString': ['a']*10,
-                           'easyweights':easy_weight_list})    
+                           'easyweights':easy_weight_list}, index = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'])    
         sample1 = df.sample(n=1, weights='easyweights') 
         assert_frame_equal(sample1, df.iloc[5:6])
 
+        # Ensure proper error if string given as weight for a Series, a Panel,
+        # or a DataFrame with axis = 1.
+        s = Series(range(10))
+        with tm.assertRaises(ValueError):
+            s.sample(n=3, weights='weight_column')
+
+        panel = pd.Panel(items = [0,1,2], major_axis = [2,3,4], minor_axis = [3,4,5])
+        with tm.assertRaises(ValueError):
+            panel.sample(n=1, weights='weight_column')
+
+        with tm.assertRaises(ValueError):
+            df.sample(n=1, weights='weight_column', axis = 1)
+
         # Check weighting key error        
         with tm.assertRaises(KeyError):
             df.sample(n=3, weights='not_a_real_column_name')
  
          # Check np.nan are replaced by zeros. 
         weights_with_nan = [np.nan]*10
         weights_with_nan[5] = 0.5
-
-        sampled_df = df.sample(n=1, weights = weights_with_nan)
-        tm.assert_frame_equal(sampled_df, df.iloc[5:6])
-        
-        sampled_s = s.sample(n=1, weights = weights_with_nan)
-        tm.assert_series_equal(sampled_s, s.iloc[5:6])
+        self._compare(o.sample(n=1, weights=weights_with_nan), o.iloc[5:6])
     
         # Check None are also replaced by zeros. 
         weights_with_None = [None]*10        
         weights_with_None[5] = 0.5
-
-        sampled_df2 = df.sample(n=1, weights = weights_with_None)
-        tm.assert_frame_equal(sampled_df2, df.iloc[5:6])
-
-        sampled_s2 = s.sample(n=1, weights = weights_with_None)
-        tm.assert_series_equal(sampled_s2, s.iloc[5:6])
+        self._compare(o.sample(n=1, weights=weights_with_None), o.iloc[5:6])
 
         # Check that re-normalizes weights that don't sum to one.
         weights_less_than_1 = [0]*10