Skip to content

add a new feature sample() into groupby #33516

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,16 @@ def is_any_frame() -> bool:
# we have a dict of DataFrames
# return a MI DataFrame

return concat([result[k] for k in keys], keys=keys, axis=1), True
#return concat([result[k] for k in keys], keys=keys, axis=1), True
# GH 32580: the grouped-by column lost its name when one of the ``agg`` keys
# referenced an empty list of aggregations, so only keys whose result is
# non-empty are concatenated below.
keys_to_use=[k for k in keys if not result[k].empty]
# check: if at least one DataFrame is not empty
if keys_to_use !=[]:
keys_to_use=keys_to_use
else:
keys_to_use=keys_to_use
return(concat([result[k] for k in keys_to_use], keys=keys_to_use, axis=1), True)

elif isinstance(self, ABCSeries) and is_any_series():

Expand Down
142 changes: 142 additions & 0 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1436,6 +1436,148 @@ def describe(self, **kwargs):
return result.T
return result.unstack()

def sample(groupby_result, size=None, frac=None, replace=False, weights=None):
    """
    Return a random sample of groups as a dictionary.

    Whole groups are sampled (not rows): keys are drawn at random from
    ``groupby_result.groups`` and the entries for the drawn keys are returned.

    Parameters
    ----------
    size : int, optional
        Number of groups to return. Cannot be used with `frac`.
        Default = 1 if `frac` = None.
    frac : float, optional
        Fraction of groups to return, between 0 and 1. Cannot be used
        with `size`.
    replace : bool, default False
        Sample with or without replacement. When sampling with replacement
        a group may be drawn more than once, but it appears only once in
        the returned dictionary.
    weights : list of float, optional
        One weight per group, in the iteration order of
        ``groupby_result.groups``. Default 'None' results in equal
        probability weighting.
        Weights are always rescaled so that they sum to 1.
        Missing values (NaN) in the weights are treated as zero.
        inf and -inf values are not allowed.

    Returns
    -------
    dict
        Maps every sampled group key to the value stored under that key in
        ``groupby_result.groups``, in the original group order.

    Raises
    ------
    ValueError
        If both `size` and `frac` are given, if `size` is not a whole
        number, is negative or exceeds the number of groups, if `frac` is
        outside ``[0, 1]``, or if `weights` has the wrong length, contains
        negative or infinite values, or sums to zero.

    Warns
    -----
    UserWarning
        If the request yields an empty sample (`size` or `frac` equal to 0)
        or selects as many groups as there are (`size` equal to the number
        of groups, or `frac` equal to 1).

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     [['Male', 1], ['Female', 3], ['Female', 2], ['Other', 1]],
    ...     columns=['gender', 'feature'],
    ... )
    >>> grouped_df = df.groupby('gender')

    Two random groups:

    >>> grouped_df.sample(size=2)  # doctest: +SKIP
    {'Female': Int64Index([1, 2], dtype='int64'), 'Male': Int64Index([0], dtype='int64')}

    Two random groups with the given weights:

    >>> grouped_df.sample(size=2, weights=[0.1, 0.1, 0.2])  # doctest: +SKIP
    {'Male': Int64Index([0], dtype='int64'), 'Other': Int64Index([3], dtype='int64')}

    A random 40% of the groups, with replacement:

    >>> grouped_df.sample(frac=0.4, replace=True)  # doctest: +SKIP
    {'Male': Int64Index([0], dtype='int64')}
    """
    # Imported locally so the function does not depend on a module-level
    # ``import warnings`` being present.
    import warnings

    groups = groupby_result.groups
    n_groups = len(groups)

    # Resolve how many groups to draw; `size` and `frac` are mutually
    # exclusive.
    if size is not None and frac is not None:
        raise ValueError('Please enter a value for `frac` OR `size`, not both')
    if size is not None:
        if size % 1 != 0:
            raise ValueError("Only integers accepted as size value")
        if size < 0:
            raise ValueError("A negative number of sample size requested. Please provide a positive value.")
        if size > n_groups:
            raise ValueError("The size of requested sample is overflow. Please provide the value of size in range.")
        # An integral float such as 2.0 passes the check above but is
        # rejected by np.random.choice, so convert it explicitly.
        final_size = int(size)
    elif frac is not None:
        if frac < 0 or frac > 1:
            raise ValueError("Only float between 0 an 1 accepted as frac value")
        final_size = int(round(frac * n_groups))
    else:
        final_size = 1

    # Edge cases are legitimate requests: emit a warning and carry on rather
    # than raising ``Warning`` as an exception, which made them unusable.
    if size == 0 or frac == 0:
        warnings.warn(
            "Random sample is empty: the input sample size is 0",
            UserWarning,
            stacklevel=2,
        )
    if size == n_groups or frac == 1:
        warnings.warn(
            "Random sample equals to the given groupbt: the inplut size is the same as the size of the input group",
            UserWarning,
            stacklevel=2,
        )

    if weights is not None:
        if len(weights) != n_groups:
            raise ValueError("Weights and axis to be sampled must be the same length")

        cleaned = []
        for w in weights:
            # NaN never compares equal to anything (itself included), so it
            # has to be detected with isnan; it counts as a zero weight.
            if np.isnan(w):
                w = 0
            elif np.isinf(w):
                raise ValueError("Weight vector may not include `inf` values")
            elif w < 0:
                raise ValueError("Weight vector may no include nagative value")
            cleaned.append(w)

        total = sum(cleaned)
        if total == 0:
            raise ValueError("Invalid weights: weights sum to zero")
        # np.random.choice requires probabilities that sum to 1.
        weights = [w / total for w in cleaned]

    # Draw positions rather than keys so that arbitrary (e.g. tuple) group
    # keys are never coerced into a NumPy array.
    keys = list(groups)
    positions = np.random.choice(
        n_groups, size=final_size, replace=replace, p=weights
    )
    chosen = {keys[i] for i in positions}

    # Iterate the original mapping so the result keeps the group order.
    return {key: value for key, value in groups.items() if key in chosen}




def resample(self, rule, *args, **kwargs):
"""
Provide resampling when using a TimeGrouper.
Expand Down