@@ -95,14 +95,14 @@ def select_arm(self) -> int:
95
95
Example:
96
96
>>> strategy = EpsilonGreedy(epsilon=0.1, num_arms=3)
97
97
>>> 0 <= strategy.select_arm() < 3
98
- np.True_
98
+ True
99
99
"""
100
100
rng = np .random .default_rng ()
101
101
102
102
if rng .random () < self .epsilon :
103
103
return rng .integers (self .num_arms )
104
104
else :
105
- return np .argmax (self .values )
105
+ return int ( np .argmax (self .values ) )
106
106
107
107
def update (self , arm_index : int , reward : int ) -> None :
108
108
"""
@@ -160,7 +160,7 @@ def select_arm(self) -> int:
160
160
if self .total_counts < self .num_arms :
161
161
return self .total_counts
162
162
ucb_values = self .values + np .sqrt (2 * np .log (self .total_counts ) / self .counts )
163
- return np .argmax (ucb_values )
163
+ return int ( np .argmax (ucb_values ) )
164
164
165
165
def update (self , arm_index : int , reward : int ) -> None :
166
166
"""
@@ -214,15 +214,15 @@ def select_arm(self) -> int:
214
214
Example:
215
215
>>> strategy = ThompsonSampling(num_arms=3)
216
216
>>> 0 <= strategy.select_arm() < 3
217
- np.True_
217
+ True
218
218
"""
219
219
rng = np .random .default_rng ()
220
220
221
221
samples = [
222
222
rng .beta (self .successes [i ] + 1 , self .failures [i ] + 1 )
223
223
for i in range (self .num_arms )
224
224
]
225
- return np .argmax (samples )
225
+ return int ( np .argmax (samples ) )
226
226
227
227
def update (self , arm_index : int , reward : int ) -> None :
228
228
"""
@@ -319,9 +319,9 @@ def select_arm(self) -> int:
319
319
Example:
320
320
>>> strategy = GreedyStrategy(num_arms=3)
321
321
>>> 0 <= strategy.select_arm() < 3
322
- np.True_
322
+ True
323
323
"""
324
- return np .argmax (self .values )
324
+ return int ( np .argmax (self .values ) )
325
325
326
326
def update (self , arm_index : int , reward : int ) -> None :
327
327
"""
0 commit comments