@@ -41,7 +41,7 @@ def __init__(self, probabilities: list[float]) -> None:
             probabilities: List of probabilities for each arm.
         """
         self.probabilities = probabilities
-        self.k = len(probabilities)
+        self.num_arms = len(probabilities)
 
     def pull(self, arm_index: int) -> int:
         """
@@ -72,18 +72,18 @@ class EpsilonGreedy:
     https://medium.com/analytics-vidhya/the-epsilon-greedy-algorithm-for-reinforcement-learning-5fe6f96dc870
     """
 
-    def __init__(self, epsilon: float, k: int) -> None:
+    def __init__(self, epsilon: float, num_arms: int) -> None:
         """
         Initialize the Epsilon-Greedy strategy.
 
         Args:
             epsilon: The probability of exploring new arms.
-            k: The number of arms.
+            num_arms: The number of arms.
         """
         self.epsilon = epsilon
-        self.k = k
-        self.counts = np.zeros(k)
-        self.values = np.zeros(k)
+        self.num_arms = num_arms
+        self.counts = np.zeros(num_arms)
+        self.values = np.zeros(num_arms)
 
     def select_arm(self) -> int:
         """
@@ -93,14 +93,14 @@ def select_arm(self) -> int:
             The index of the arm to pull.
 
         Example:
-            >>> strategy = EpsilonGreedy(epsilon=0.1, k=3)
+            >>> strategy = EpsilonGreedy(epsilon=0.1, num_arms=3)
             >>> 0 <= strategy.select_arm() < 3
             np.True_
         """
         rng = np.random.default_rng()
 
         if rng.random() < self.epsilon:
-            return rng.integers(self.k)
+            return rng.integers(self.num_arms)
         else:
             return np.argmax(self.values)
 
@@ -113,7 +113,7 @@ def update(self, arm_index: int, reward: int) -> None:
             reward: The reward for the arm.
 
         Example:
-            >>> strategy = EpsilonGreedy(epsilon=0.1, k=3)
+            >>> strategy = EpsilonGreedy(epsilon=0.1, num_arms=3)
             >>> strategy.update(0, 1)
             >>> strategy.counts[0] == 1
             np.True_
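Note: `EpsilonGreedy.update` itself is untouched by this diff apart from its doctest, so its body is not shown above. For context, a minimal standalone sketch of the incremental-mean bookkeeping that the `counts`/`values` arrays and the doctest imply — the free function here is an illustration, not code from this change:

```python
import numpy as np

num_arms = 3
counts = np.zeros(num_arms)  # pulls per arm
values = np.zeros(num_arms)  # running mean reward per arm

def update(arm_index: int, reward: int) -> None:
    # Incremental mean: new_mean = old_mean + (reward - old_mean) / n
    counts[arm_index] += 1
    values[arm_index] += (reward - values[arm_index]) / counts[arm_index]

update(0, 1)
assert counts[0] == 1 and values[0] == 1.0
```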
@@ -133,16 +133,16 @@ class UCB:
     https://people.maths.bris.ac.uk/~maajg/teaching/stochopt/ucb.pdf
     """
 
-    def __init__(self, k: int) -> None:
+    def __init__(self, num_arms: int) -> None:
         """
         Initialize the UCB strategy.
 
         Args:
-            k: The number of arms.
+            num_arms: The number of arms.
         """
-        self.k = k
-        self.counts = np.zeros(k)
-        self.values = np.zeros(k)
+        self.num_arms = num_arms
+        self.counts = np.zeros(num_arms)
+        self.values = np.zeros(num_arms)
         self.total_counts = 0
 
     def select_arm(self) -> int:
@@ -153,13 +153,14 @@ def select_arm(self) -> int:
             The index of the arm to pull.
 
         Example:
-            >>> strategy = UCB(k=3)
+            >>> strategy = UCB(num_arms=3)
             >>> 0 <= strategy.select_arm() < 3
             True
         """
-        if self.total_counts < self.k:
+        if self.total_counts < self.num_arms:
             return self.total_counts
-        ucb_values = self.values + np.sqrt(2 * np.log(self.total_counts) / self.counts)
+        ucb_values = self.values + \
+            np.sqrt(2 * np.log(self.total_counts) / self.counts)
         return np.argmax(ucb_values)
 
     def update(self, arm_index: int, reward: int) -> None:
@@ -171,7 +172,7 @@ def update(self, arm_index: int, reward: int) -> None:
             reward: The reward for the arm.
 
         Example:
-            >>> strategy = UCB(k=3)
+            >>> strategy = UCB(num_arms=3)
             >>> strategy.update(0, 1)
             >>> strategy.counts[0] == 1
             np.True_
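The reformatted `ucb_values` line above computes, per arm, the estimated mean plus an exploration bonus `sqrt(2 * ln(total pulls) / pulls of that arm)`, which shrinks as an arm accumulates pulls. A standalone numeric check of just that expression (the arrays are invented illustration values):

```python
import numpy as np

values = np.array([0.5, 0.8])   # estimated mean reward per arm
counts = np.array([10.0, 2.0])  # pulls per arm
total_counts = 12               # total pulls so far

# Same expression as in select_arm above
ucb_values = values + np.sqrt(2 * np.log(total_counts) / counts)
# The rarely pulled arm gets the larger bonus: ucb_values ~ [1.21, 2.38]
print(int(np.argmax(ucb_values)))  # 1
```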
@@ -192,16 +193,16 @@ class ThompsonSampling:
     https://en.wikipedia.org/wiki/Thompson_sampling
     """
 
-    def __init__(self, k: int) -> None:
+    def __init__(self, num_arms: int) -> None:
         """
         Initialize the Thompson Sampling strategy.
 
         Args:
-            k: The number of arms.
+            num_arms: The number of arms.
         """
-        self.k = k
-        self.successes = np.zeros(k)
-        self.failures = np.zeros(k)
+        self.num_arms = num_arms
+        self.successes = np.zeros(num_arms)
+        self.failures = np.zeros(num_arms)
 
     def select_arm(self) -> int:
         """
@@ -212,14 +213,15 @@ def select_arm(self) -> int:
         which relies on the Beta distribution.
 
         Example:
-            >>> strategy = ThompsonSampling(k=3)
+            >>> strategy = ThompsonSampling(num_arms=3)
             >>> 0 <= strategy.select_arm() < 3
             np.True_
         """
         rng = np.random.default_rng()
 
         samples = [
-            rng.beta(self.successes[i] + 1, self.failures[i] + 1) for i in range(self.k)
+            rng.beta(self.successes[i] + 1, self.failures[i] + 1)
+            for i in range(self.num_arms)
         ]
         return np.argmax(samples)
 
@@ -232,7 +234,7 @@ def update(self, arm_index: int, reward: int) -> None:
             reward: The reward for the arm.
 
         Example:
-            >>> strategy = ThompsonSampling(k=3)
+            >>> strategy = ThompsonSampling(num_arms=3)
             >>> strategy.update(0, 1)
             >>> strategy.successes[0] == 1
             np.True_
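The comprehension reformatted in `select_arm` above scores each arm with one draw from its Beta posterior, `Beta(successes + 1, failures + 1)`. A self-contained illustration of that sampling step, with invented counts:

```python
import numpy as np

rng = np.random.default_rng(seed=42)
successes = np.array([8.0, 1.0])  # rewards observed per arm
failures = np.array([2.0, 1.0])   # non-rewards observed per arm

samples = [
    rng.beta(successes[i] + 1, failures[i] + 1)
    for i in range(len(successes))
]
# Arm 0's posterior mean is 9/12 = 0.75 vs 2/4 = 0.5 for arm 1, so its
# draw usually wins; the occasional loss is Thompson sampling's exploration.
print(int(np.argmax(samples)))
```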
@@ -250,14 +252,14 @@ class RandomStrategy:
     a better comparison with the other optimised strategies.
     """
 
-    def __init__(self, k: int):
+    def __init__(self, num_arms: int) -> None:
         """
         Initialize the Random strategy.
 
         Args:
-            k: The number of arms.
+            num_arms: The number of arms.
         """
-        self.k = k
+        self.num_arms = num_arms
 
     def select_arm(self) -> int:
         """
@@ -267,12 +269,12 @@ def select_arm(self) -> int:
             The index of the arm to pull.
 
         Example:
-            >>> strategy = RandomStrategy(k=3)
+            >>> strategy = RandomStrategy(num_arms=3)
             >>> 0 <= strategy.select_arm() < 3
             np.True_
         """
         rng = np.random.default_rng()
-        return rng.integers(self.k)
+        return rng.integers(self.num_arms)
 
     def update(self, arm_index: int, reward: int) -> None:
         """
@@ -283,7 +285,7 @@ def update(self, arm_index: int, reward: int) -> None:
             reward: The reward for the arm.
 
         Example:
-            >>> strategy = RandomStrategy(k=3)
+            >>> strategy = RandomStrategy(num_arms=3)
             >>> strategy.update(0, 1)
         """
 
@@ -297,16 +299,16 @@ class GreedyStrategy:
     detrimental to the performance of the strategy.
     """
 
-    def __init__(self, k: int):
+    def __init__(self, num_arms: int) -> None:
         """
         Initialize the Greedy strategy.
 
         Args:
-            k: The number of arms.
+            num_arms: The number of arms.
         """
-        self.k = k
-        self.counts = np.zeros(k)
-        self.values = np.zeros(k)
+        self.num_arms = num_arms
+        self.counts = np.zeros(num_arms)
+        self.values = np.zeros(num_arms)
 
     def select_arm(self) -> int:
         """
@@ -316,7 +318,7 @@ def select_arm(self) -> int:
             The index of the arm to pull.
 
         Example:
-            >>> strategy = GreedyStrategy(k=3)
+            >>> strategy = GreedyStrategy(num_arms=3)
             >>> 0 <= strategy.select_arm() < 3
             np.True_
         """
@@ -331,7 +333,7 @@ def update(self, arm_index: int, reward: int) -> None:
             reward: The reward for the arm.
 
         Example:
-            >>> strategy = GreedyStrategy(k=3)
+            >>> strategy = GreedyStrategy(num_arms=3)
             >>> strategy.update(0, 1)
             >>> strategy.counts[0] == 1
             np.True_
@@ -346,16 +348,16 @@ def test_mab_strategies() -> None:
     Test the MAB strategies.
     """
     # Simulation
-    k = 4
+    num_arms = 4
     arms_probabilities = [0.1, 0.3, 0.5, 0.8]  # True probabilities
 
     bandit = Bandit(arms_probabilities)
     strategies = {
-        "Epsilon-Greedy": EpsilonGreedy(epsilon=0.1, k=k),
-        "UCB": UCB(k=k),
-        "Thompson Sampling": ThompsonSampling(k=k),
-        "Full Exploration(Random)": RandomStrategy(k=k),
-        "Full Exploitation(Greedy)": GreedyStrategy(k=k),
+        "Epsilon-Greedy": EpsilonGreedy(epsilon=0.1, num_arms=num_arms),
+        "UCB": UCB(num_arms=num_arms),
+        "Thompson Sampling": ThompsonSampling(num_arms=num_arms),
+        "Full Exploration(Random)": RandomStrategy(num_arms=num_arms),
+        "Full Exploitation(Greedy)": GreedyStrategy(num_arms=num_arms),
     }
 
     num_rounds = 1000
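For reference, this is how a single strategy reads against `Bandit` after the rename; the simulation loop below is a sketch consistent with the test above, not code from the file:

```python
num_arms = 4
bandit = Bandit([0.1, 0.3, 0.5, 0.8])
strategy = EpsilonGreedy(epsilon=0.1, num_arms=num_arms)

for _ in range(1000):
    arm = strategy.select_arm()   # explore or exploit
    reward = bandit.pull(arm)     # Bernoulli reward from the chosen arm
    strategy.update(arm, reward)  # refine that arm's value estimate

# After enough rounds the best estimate should settle on arm 3 (p = 0.8).
```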