# other strategies.
# TODO: UCB strategy, epsilon-greedy

import numpy as np

# Beta sampler: numpy's beta generator is used here in place of pymc.rbeta
# (a PyMC 2 function), so the module runs without the pymc dependency.
rbeta = np.random.beta
rand = np.random.rand


class GeneralBanditStrat(object):
    """
    Implements an online learning strategy to solve
    the Multi-Armed Bandit problem.

    parameters:
        bandits: a Bandit class with a .pull method
        choice_function: accepts a self argument (which gives access to all the
            variables) and returns an int between 0 and n-1

    methods:
        sample_bandits(n): sample and train on n pulls.

    attributes:
        N: the cumulative number of samples
        choices: the historical choices as a (N,) array
        score: the historical score as a (N,) array
    """

    def __init__(self, bandits, choice_function):
        self.bandits = bandits
        n_bandits = len(self.bandits)
        self.wins = np.zeros(n_bandits)
        self.trials = np.zeros(n_bandits)
        self.N = 0
        self.choices = []
        self.score = []
        self.choice_function = choice_function

    def sample_bandits(self, n=1):
        """sample and train on n pulls, updating the win/trial counts."""
        score = np.zeros(n)
        choices = np.zeros(n)

        for k in range(n):
            # sample from the bandits' priors, and select the largest sample
            choice = self.choice_function(self)

            # sample the chosen bandit
            result = self.bandits.pull(choice)

            # update priors and score
            self.wins[choice] += result
            self.trials[choice] += 1
            score[k] = result
            self.N += 1
            choices[k] = choice

        self.score = np.r_[self.score, score]
        self.choices = np.r_[self.choices, choices]
        return


def bayesian_bandit_choice(self):
    """Thompson sampling: draw one sample from each bandit's Beta posterior
    (uniform Beta(1, 1) prior) and pick the bandit with the largest draw."""
    return np.argmax(rbeta(1 + self.wins, 1 + self.trials - self.wins))


def max_mean(self):
    """pick the bandit with the current best observed proportion of winning"""
    # the +1 in the denominator avoids division by zero for unplayed bandits
    return np.argmax(self.wins / (self.trials + 1))


def lower_credible_choice(self):
    """pick the bandit with the best LOWER BOUND. See chapter 5"""
    def lb(a, b):
        # posterior mean minus 1.65 posterior standard deviations of Beta(a, b)
        return a / (a + b) - 1.65 * np.sqrt((a * b) / ((a + b) ** 2 * (a + b + 1)))
    a = self.wins + 1
    b = self.trials - self.wins + 1
    return np.argmax(lb(a, b))


def upper_credible_choice(self):
    """pick the bandit with the best UPPER BOUND. See chapter 5"""
    def ub(a, b):
        # posterior mean plus 1.65 posterior standard deviations of Beta(a, b)
        return a / (a + b) + 1.65 * np.sqrt((a * b) / ((a + b) ** 2 * (a + b + 1)))
    a = self.wins + 1
    b = self.trials - self.wins + 1
    return np.argmax(ub(a, b))


def random_choice(self):
    return np.random.randint(0, len(self.wins))


class Bandits(object):
    """
    This class represents N bandit machines.

    parameters:
        p_array: a (n,) Numpy array of probabilities >0, <1.

    methods:
        pull( i ): return the result, 0 or 1, of pulling
                   the ith bandit.
    """
    def __init__(self, p_array):
        self.p = p_array
        self.optimal = np.argmax(p_array)

    def pull(self, i):
        # i is which arm to pull; returns True (a win) with probability p[i]
        return rand() < self.p[i]

    def __len__(self):
        return len(self.p)
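

# A minimal usage sketch (not part of the original module): the win
# probabilities below are made-up values for illustration. It runs the
# Bayesian-bandit (Thompson sampling) strategy for 500 pulls and prints
# the accumulated win/trial counts per arm.
if __name__ == "__main__":
    hidden_p = np.array([0.15, 0.60, 0.75])   # hypothetical true win rates
    bandits = Bandits(hidden_p)
    strat = GeneralBanditStrat(bandits, bayesian_bandit_choice)
    strat.sample_bandits(500)
    print("wins:   ", strat.wins)
    print("trials: ", strat.trials)
    print("optimal arm (highest true p):", bandits.optimal)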