From 17ec1d930d0ca05bc25bbf8c196afacb1b66cf30 Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Wed, 19 Aug 2015 10:57:04 -0700 Subject: [PATCH] amend sample to return copy and align weight axis --- doc/source/whatsnew/v0.17.0.txt | 2 ++ pandas/core/generic.py | 21 +++++++++++++++---- pandas/tests/test_generic.py | 37 +++++++++++++++++++++++++++++++-- 3 files changed, 54 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 71feddc1a8b7c..f45ed9eeeaeee 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -628,6 +628,8 @@ Bug Fixes - Bug in ``stack`` when index or columns are not unique. (:issue:`10417`) - Bug in setting a Panel when an axis has a multi-index (:issue:`10360`) - Bug in ``USFederalHolidayCalendar`` where ``USMemorialDay`` and ``USMartinLutherKingJr`` were incorrect (:issue:`10278` and :issue:`9760` ) +- Bug in ``.sample()`` where returned object, if set, gives unnecessary ``SettingWithCopyWarning`` (:issue:`10738`) +- Bug in ``.sample()`` where weights passed as Series were not aligned along axis before being treated positionally, potentially causing problems if weight indices were not aligned with sampled object. (:issue:`10738`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 22f3e3016c019..0e8afbdafb866 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2004,9 +2004,14 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No Sample with or without replacement. Default = False. weights : str or ndarray-like, optional Default 'None' results in equal probability weighting. + If passed a Series, will align with target object on index. Index + values in weights not found in sampled object will be ignored and + index values in sampled object not in weights will be assigned + weights of zero. If called on a DataFrame, will accept the name of a column when axis = 0. - Weights must be same length as axis being sampled. + Unless weights are a Series, weights must be same length as axis + being sampled. If weights do not sum to 1, they will be normalized to sum to 1. Missing values in the weights column will be treated as zero. inf and -inf values not allowed. @@ -2019,7 +2024,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No Returns ------- - Same type as caller. + A new object of same type as caller. """ if axis is None: @@ -2034,6 +2039,10 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No # Check weights for compliance if weights is not None: + # If a series, align with frame + if isinstance(weights, pd.Series): + weights = weights.reindex(self.axes[axis]) + # Strings acceptable if a dataframe and axis = 0 if isinstance(weights, string_types): if isinstance(self, pd.DataFrame): @@ -2063,7 +2072,10 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No # Renormalize if don't sum to 1 if weights.sum() != 1: - weights = weights / weights.sum() + if weights.sum() != 0: + weights = weights / weights.sum() + else: + raise ValueError("Invalid weights: weights sum to zero") weights = weights.values @@ -2082,7 +2094,8 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No raise ValueError("A negative number of rows requested. Please provide positive value.") locs = rs.choice(axis_length, size=n, replace=replace, p=weights) - return self.take(locs, axis=axis) + return self.take(locs, axis=axis, is_copy=False) + _shared_docs['pipe'] = (""" Apply func(self, \*args, \*\*kwargs) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index f434992e9fcd8..c1f6045c61d54 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -374,7 +374,7 @@ def test_sample(self): self._compare(o.sample(frac=0.7,random_state=np.random.RandomState(test)), o.sample(frac=0.7, random_state=np.random.RandomState(test))) - + # Check for error when random_state argument invalid. with tm.assertRaises(ValueError): @@ -415,6 +415,10 @@ def test_sample(self): bad_weights = [0.5]*11 o.sample(n=3, weights=bad_weights) + with tm.assertRaises(ValueError): + bad_weight_series = Series([0,0,0.2]) + o.sample(n=4, weights=bad_weight_series) + # Check won't accept negative weights with tm.assertRaises(ValueError): bad_weights = [-0.1]*10 @@ -431,6 +435,16 @@ def test_sample(self): weights_with_ninf[0] = -np.inf o.sample(n=3, weights=weights_with_ninf) + # All zeros raises errors + zero_weights = [0]*10 + with tm.assertRaises(ValueError): + o.sample(n=3, weights=zero_weights) + + # All missing weights + nan_weights = [np.nan]*10 + with tm.assertRaises(ValueError): + o.sample(n=3, weights=nan_weights) + # A few dataframe test with degenerate weights. easy_weight_list = [0]*10 @@ -496,7 +510,6 @@ def test_sample(self): assert_frame_equal(df.sample(n=1, axis='index', weights=weight), df.iloc[5:6]) - # Check out of range axis values with tm.assertRaises(ValueError): df.sample(n=1, axis=2) @@ -527,6 +540,26 @@ def test_sample(self): assert_panel_equal(p.sample(n=3, random_state=42), p.sample(n=3, axis=1, random_state=42)) assert_frame_equal(df.sample(n=3, random_state=42), df.sample(n=3, axis=0, random_state=42)) + # Test that function aligns weights with frame + df = DataFrame({'col1':[5,6,7], 'col2':['a','b','c'], }, index = [9,5,3]) + s = Series([1,0,0], index=[3,5,9]) + assert_frame_equal(df.loc[[3]], df.sample(1, weights=s)) + + # Weights have index values to be dropped because not in + # sampled DataFrame + s2 = Series([0.001,0,10000], index=[3,5,10]) + assert_frame_equal(df.loc[[3]], df.sample(1, weights=s2)) + + # Weights have empty values to be filed with zeros + s3 = Series([0.01,0], index=[3,5]) + assert_frame_equal(df.loc[[3]], df.sample(1, weights=s3)) + + # No overlap in weight and sampled DataFrame indices + s4 = Series([1,0], index=[1,2]) + with tm.assertRaises(ValueError): + df.sample(1, weights=s4) + + def test_size_compat(self): # GH8846 # size property should be defined