@@ -1436,6 +1436,148 @@ def describe(self, **kwargs):
1436
1436
return result .T
1437
1437
return result .unstack ()
1438
1438
1439
+
1440
def sample(groupby_result, size=None, frac=None, replace=False, weights=None):
    """
    Return a random sample of groups as a dictionary.

    Group keys are drawn at random from ``groupby_result.groups``. The
    result maps every drawn key to the entry stored under it in
    ``groupby_result.groups`` and keeps the original group order.

    Parameters
    ----------
    groupby_result : object
        Object exposing a ``groups`` mapping (group key -> group members).
    size : int, optional
        Number of groups to return. Cannot be used with `frac`.
        Default = 1 if `frac` = None.
    frac : float, optional
        Fraction of groups to return, between 0 and 1. Cannot be used
        with `size`. The resulting count is rounded to the nearest integer.
    replace : boolean, optional
        Sample with or without replacement. Default = False.
        Because the result is a dictionary, a group that is drawn more
        than once still appears only once.
    weights : list of float, optional
        One weight per group, in the order of the group keys.
        Default 'None' results in equal probability weighting.
        If weights do not sum to 1, they will be normalized to sum to 1.
        Missing (NaN) values in the weights will be treated as zero.
        inf and -inf values not allowed.

    Returns
    -------
    dict
        The sampled subset of ``groupby_result.groups``.

    Raises
    ------
    ValueError
        If both `size` and `frac` are given, if `size` is not an integer,
        is negative or exceeds the number of groups, if `frac` is outside
        [0, 1], or if `weights` has the wrong length, contains negative or
        infinite values, or sums to zero.

    Warns
    -----
    UserWarning
        If the requested sample is empty or covers every group.

    Examples
    --------
    Generate an example ``DataFrame``:

    >>> df = pd.DataFrame([['Male', 1], ['Female', 3],
    ...                    ['Female', 2], ['Other', 1]],
    ...                   columns=['gender', 'feature'])
    >>> df
       gender  feature
    0    Male        1
    1  Female        3
    2  Female        2
    3   Other        1

    >>> grouped_df = df.groupby('gender')

    Next extract a random sample (the output is random, so the results
    below are only illustrative).

    2 random elements sample:

    >>> grouped_df.sample(size=2)  # doctest: +SKIP
    {'Female': Int64Index([1, 2], dtype='int64'),
     'Male': Int64Index([0], dtype='int64')}

    2 random elements sample with given weights:

    >>> grouped_df.sample(size=2, weights=[0.1, 0.1, 0.2])  # doctest: +SKIP
    {'Male': Int64Index([0], dtype='int64'),
     'Other': Int64Index([3], dtype='int64')}

    A random 40% with replacement:

    >>> grouped_df.sample(frac=0.4, replace=True)  # doctest: +SKIP
    {'Male': Int64Index([0], dtype='int64')}
    """
    # Imported locally so this block does not depend on the file's
    # top-level imports.
    import warnings

    groups_dictionary = groupby_result.groups
    dictionary_keys = list(groups_dictionary.keys())
    num_of_keys = len(dictionary_keys)

    # Resolve the number of groups to draw from `size` / `frac`.
    if size is not None and frac is not None:
        raise ValueError('Please enter a value for `frac` OR `size`, not both')

    if size is not None:
        if size % 1 != 0:
            raise ValueError("Only integers accepted as size value")
        if size < 0:
            raise ValueError("A negative number of sample size requested. Please provide a positive value.")
        if size > num_of_keys:
            raise ValueError("The size of requested sample is overflow. Please provide the value of size in range.")
        # Integral floats such as 2.0 pass the check above; numpy needs
        # a real integer for the sample size.
        final_size = int(size)
    elif frac is not None:
        if frac < 0 or frac > 1:
            raise ValueError("Only float between 0 an 1 accepted as frac value")
        final_size = int(round(frac * num_of_keys))
    else:
        final_size = 1

    # Edge cases are legitimate requests: warn instead of aborting.
    if size == 0 or frac == 0:
        warnings.warn(
            "Random sample is empty: the input sample size is 0",
            UserWarning,
            stacklevel=2,
        )
    elif size == num_of_keys or frac == 1:
        warnings.warn(
            "Random sample equals to the given groupbt: the inplut size is the same as the size of the input group",
            UserWarning,
            stacklevel=2,
        )

    if weights is not None:
        if len(weights) != num_of_keys:
            raise ValueError("Weights and axis to be sampled must be the same length")

        cleaned_weights = []
        for w in weights:
            if w < 0:
                raise ValueError("Weight vector may no include nagative value")
            if np.isinf(w):
                raise ValueError("Weight vector may not include `inf` values")
            # NaN never compares equal to anything, so it has to be
            # detected with isnan; missing weights count as zero.
            cleaned_weights.append(0.0 if np.isnan(w) else w)

        total = sum(cleaned_weights)
        if total == 0:
            raise ValueError("Invalid weights: weights sum to zero")
        # Dividing by the total is a no-op when the weights already sum to 1.
        weights = [w / total for w in cleaned_weights]

    # Draw positions into the key list, then map them back to group keys.
    positions = np.random.choice(
        num_of_keys, size=final_size, replace=replace, p=weights
    )
    sample_keys = {dictionary_keys[i] for i in positions}

    return {
        key: value
        for key, value in groups_dictionary.items()
        if key in sample_keys
    }
1578
+
1579
+
1580
+
1439
1581
def resample (self , rule , * args , ** kwargs ):
1440
1582
"""
1441
1583
Provide resampling when using a TimeGrouper.
0 commit comments