Skip to content

Commit c0be803

Browse files
committed
add a new feature sample() into groupby
1 parent 69f0641 commit c0be803

File tree

1 file changed

+142
-0
lines changed

1 file changed

+142
-0
lines changed

pandas/core/groupby/groupby.py

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1436,6 +1436,148 @@ def describe(self, **kwargs):
14361436
return result.T
14371437
return result.unstack()
14381438

1439+
1440+
def sample(groupby_result, size=None, frac=None, replace=False, weights=None):
    """
    Return a random sample of the groups of a groupby result.

    Sampling is done over whole groups, not over rows: the groups to keep
    are drawn with ``np.random.choice`` and returned as a dictionary.

    Parameters
    ----------
    groupby_result : object
        Object exposing a ``groups`` mapping of group key -> group labels.
    size : int, optional
        Number of groups to draw. Cannot be used with `frac`. Must be
        between 0 and the number of groups. Integral floats such as
        ``2.0`` are accepted. Default = 1 if `frac` = None.
    frac : float, optional
        Fraction of the groups to draw, between 0 and 1. The resulting
        count is rounded to the nearest integer. Cannot be used with
        `size`.
    replace : boolean, optional
        Sample with or without replacement. Default = False. Because the
        result is keyed by group, a group drawn more than once appears
        only once, so the result may hold fewer than the requested number
        of groups.
    weights : list of float, optional
        One weight per group, in the iteration order of
        ``groupby_result.groups``. Default 'None' results in equal
        probability weighting.
        Weights are always normalized to sum to 1.
        Missing values (NaN) in the weights are treated as zero.
        inf and -inf values not allowed.

    Returns
    -------
    dict
        The sampled subset of ``groupby_result.groups``, in the original
        iteration order of the groups.

    Raises
    ------
    ValueError
        If both `size` and `frac` are given, `size` is not an integer,
        is negative or exceeds the number of groups, `frac` is outside
        [0, 1], or `weights` has the wrong length, contains negative or
        infinite values, or sums to zero.

    Warns
    -----
    UserWarning
        If the requested sample is empty or contains every group.

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     [['Male', 1], ['Female', 3], ['Female', 2], ['Other', 1]],
    ...     columns=['gender', 'feature'])
    >>> grouped_df = df.groupby('gender')

    Draw one group, forcing the choice through the weights:

    >>> picked = sample(grouped_df, size=1, weights=[0, 0, 1])
    >>> list(picked)
    ['Other']
    """
    # Function-scope import: the file's import block is not visible here.
    import warnings

    groups = groupby_result.groups
    num_groups = len(groups)

    # Resolve the number of groups to draw from `size` / `frac`.
    if size is not None and frac is not None:
        raise ValueError('Please enter a value for `frac` OR `size`, not both')
    if size is None and frac is None:
        final_size = 1
    elif size is None:
        if frac < 0 or frac > 1:
            raise ValueError("Only float between 0 an 1 accepted as frac value")
        final_size = int(round(frac * num_groups))
    else:
        if size % 1 != 0:
            raise ValueError("Only integers accepted as size value")
        if size < 0:
            raise ValueError("A negative number of sample size requested. Please provide a positive value.")
        if size > num_groups:
            raise ValueError("The size of requested sample is overflow. Please provide the value of size in range.")
        # np.random.choice rejects float sizes, even integral ones.
        final_size = int(size)

    # Degenerate requests are legal but worth flagging; warn rather than
    # raise so that the call still returns a result.
    if size == 0 or frac == 0:
        warnings.warn(
            "Random sample is empty: the input sample size is 0",
            UserWarning,
            stacklevel=2,
        )
    if size == num_groups or frac == 1:
        warnings.warn(
            "Random sample equals to the given groupbt: the inplut size is the same as the size of the input group",
            UserWarning,
            stacklevel=2,
        )

    probabilities = None
    if weights is not None:
        if len(weights) != num_groups:
            raise ValueError("Weights and axis to be sampled must be the same length")
        probabilities = np.array(weights, dtype=float)
        if np.isinf(probabilities).any():
            raise ValueError("Weight vector may not include `inf` values")
        # Missing weights count as zero (NaN never compares equal, so it
        # has to be detected with isnan).
        probabilities[np.isnan(probabilities)] = 0.0
        if (probabilities < 0).any():
            raise ValueError("Weight vector may no include nagative value")
        total = probabilities.sum()
        if total == 0:
            raise ValueError("Invalid weights: weights sum to zero")
        # Always normalize; comparing a float sum to exactly 1 is fragile.
        probabilities = probabilities / total

    chosen_positions = set(
        np.random.choice(
            num_groups, size=final_size, replace=replace, p=probabilities
        ).tolist()
    )

    # Filter by position so the original group order is preserved.
    return {
        key: value
        for position, (key, value) in enumerate(groups.items())
        if position in chosen_positions
    }
1578+
1579+
1580+
14391581
def resample(self, rule, *args, **kwargs):
14401582
"""
14411583
Provide resampling when using a TimeGrouper.

0 commit comments

Comments
 (0)