Skip to content

amend sample to return copy and align weight axis #10738

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 19, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -628,6 +628,8 @@ Bug Fixes
- Bug in ``stack`` when index or columns are not unique. (:issue:`10417`)
- Bug in setting a Panel when an axis has a multi-index (:issue:`10360`)
- Bug in ``USFederalHolidayCalendar`` where ``USMemorialDay`` and ``USMartinLutherKingJr`` were incorrect (:issue:`10278` and :issue:`9760` )
- Bug in ``.sample()`` where returned object, if set, gives unnecessary ``SettingWithCopyWarning`` (:issue:`10738`)
- Bug in ``.sample()`` where weights passed as Series were not aligned along axis before being treated positionally, potentially causing problems if weight indices were not aligned with sampled object. (:issue:`10738`)



Expand Down
21 changes: 17 additions & 4 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2004,9 +2004,14 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No
Sample with or without replacement. Default = False.
weights : str or ndarray-like, optional
Default 'None' results in equal probability weighting.
If passed a Series, will align with target object on index. Index
values in weights not found in sampled object will be ignored and
index values in sampled object not in weights will be assigned
weights of zero.
If called on a DataFrame, will accept the name of a column
when axis = 0.
Weights must be same length as axis being sampled.
Unless weights are a Series, weights must be same length as axis
being sampled.
If weights do not sum to 1, they will be normalized to sum to 1.
Missing values in the weights column will be treated as zero.
inf and -inf values not allowed.
Expand All @@ -2019,7 +2024,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No

Returns
-------
Same type as caller.
A new object of same type as caller.
"""

if axis is None:
Expand All @@ -2034,6 +2039,10 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No
# Check weights for compliance
if weights is not None:

# If a series, align with frame
if isinstance(weights, pd.Series):
weights = weights.reindex(self.axes[axis])

# Strings acceptable if a dataframe and axis = 0
if isinstance(weights, string_types):
if isinstance(self, pd.DataFrame):
Expand Down Expand Up @@ -2063,7 +2072,10 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No

# Renormalize if don't sum to 1
if weights.sum() != 1:
weights = weights / weights.sum()
if weights.sum() != 0:
weights = weights / weights.sum()
else:
raise ValueError("Invalid weights: weights sum to zero")

weights = weights.values

Expand All @@ -2082,7 +2094,8 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No
raise ValueError("A negative number of rows requested. Please provide positive value.")

locs = rs.choice(axis_length, size=n, replace=replace, p=weights)
return self.take(locs, axis=axis)
return self.take(locs, axis=axis, is_copy=False)


_shared_docs['pipe'] = ("""
Apply func(self, \*args, \*\*kwargs)
Expand Down
37 changes: 35 additions & 2 deletions pandas/tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,7 @@ def test_sample(self):

self._compare(o.sample(frac=0.7,random_state=np.random.RandomState(test)),
o.sample(frac=0.7, random_state=np.random.RandomState(test)))


# Check for error when random_state argument invalid.
with tm.assertRaises(ValueError):
Expand Down Expand Up @@ -415,6 +415,10 @@ def test_sample(self):
bad_weights = [0.5]*11
o.sample(n=3, weights=bad_weights)

with tm.assertRaises(ValueError):
bad_weight_series = Series([0,0,0.2])
o.sample(n=4, weights=bad_weight_series)

# Check won't accept negative weights
with tm.assertRaises(ValueError):
bad_weights = [-0.1]*10
Expand All @@ -431,6 +435,16 @@ def test_sample(self):
weights_with_ninf[0] = -np.inf
o.sample(n=3, weights=weights_with_ninf)

# All zeros raises errors
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New simple tests for all zero or all np.nan weights.

zero_weights = [0]*10
with tm.assertRaises(ValueError):
o.sample(n=3, weights=zero_weights)

# All missing weights
nan_weights = [np.nan]*10
with tm.assertRaises(ValueError):
o.sample(n=3, weights=nan_weights)


# A few dataframe test with degenerate weights.
easy_weight_list = [0]*10
Expand Down Expand Up @@ -496,7 +510,6 @@ def test_sample(self):
assert_frame_equal(df.sample(n=1, axis='index', weights=weight),
df.iloc[5:6])


# Check out of range axis values
with tm.assertRaises(ValueError):
df.sample(n=1, axis=2)
Expand Down Expand Up @@ -527,6 +540,26 @@ def test_sample(self):
assert_panel_equal(p.sample(n=3, random_state=42), p.sample(n=3, axis=1, random_state=42))
assert_frame_equal(df.sample(n=3, random_state=42), df.sample(n=3, axis=0, random_state=42))

# Test that function aligns weights with frame
df = DataFrame({'col1':[5,6,7], 'col2':['a','b','c'], }, index = [9,5,3])
s = Series([1,0,0], index=[3,5,9])
assert_frame_equal(df.loc[[3]], df.sample(1, weights=s))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

test also where you have a missing indexer (so should be nan filled), e.g. Series([1,0,0],index=[3,5,10])

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added


# Weights have index values to be dropped because not in
# sampled DataFrame
s2 = Series([0.001,0,10000], index=[3,5,10])
assert_frame_equal(df.loc[[3]], df.sample(1, weights=s2))

# Weights have empty values to be filed with zeros
s3 = Series([0.01,0], index=[3,5])
assert_frame_equal(df.loc[[3]], df.sample(1, weights=s3))

# No overlap in weight and sampled DataFrame indices
s4 = Series([1,0], index=[1,2])
with tm.assertRaises(ValueError):
df.sample(1, weights=s4)


Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback You had a comment here that I think I addressed but am not sure I quite understood. You said:

"can you add a test where none of the weights are there (e.g. weights.sum == 0 so you raise), and also one that combines the passed weights and does the computation."

I added simpler tests for weights summing to zero above (around line 438). Does that address your concern? If not, could you please expand on what you had in mind?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think its ok

def test_size_compat(self):
# GH8846
# size property should be defined
Expand Down