2
2
from collections import Counter
3
3
import csv
4
4
from datetime import datetime
5
+ import os
6
+ import string
5
7
import numpy as np
6
8
import simpy
7
9
from llmactor import LLMActor
8
10
from loadbalancer import LoadBalancer
11
+ import sys
9
12
10
13
def main ():
14
+
15
+
11
16
parser = argparse .ArgumentParser (description = "Simulate LLM load balancing with configurable parameters." )
12
- parser .add_argument ("--rates_lo" , nargs = '+' , type = int , default = [35 , 30 , 25 , 20 , 15 , 10 , 5 , 1 ], help = "List of low rates." )
13
- parser .add_argument ("--rates_hi" , nargs = '+' , type = int , default = [35 , 30 , 25 , 20 , 15 , 10 , 5 , 1 ], help = "List of high rates." )
17
+ parser .add_argument ("--rates_lo" , nargs = '+' , type = int , default = [40 , 35 , 30 , 25 , 20 , 15 , 10 , 5 , 1 ], help = "List of low rates." )
18
+ parser .add_argument ("--rates_hi" , nargs = '+' , type = int , default = [40 , 35 , 30 , 25 , 20 , 15 , 10 , 5 , 1 ], help = "List of high rates." )
14
19
parser .add_argument ("--no_of_messages" , type = int , default = 2500 , help = "Number of messages to simulate." )
15
20
parser .add_argument ("--mean_request_size_1" , type = int , default = 202 , help = "Mean request size for set 1." )
16
21
parser .add_argument ("--std_request_size_1" , type = int , default = 20 , help = "Standard deviation of request size for set 1." )
@@ -20,7 +25,8 @@ def main():
20
25
parser .add_argument ("--std_request_size_2" , type = int , default = 20 , help = "Standard deviation of request size for set 2." )
21
26
parser .add_argument ("--mean_output_size_2" , type = int , default = 179 , help = "Mean output size for set 2." )
22
27
parser .add_argument ("--std_output_size_2" , type = int , default = 17 , help = "Standard deviation of output size for set 2." )
23
- parser .add_argument ("--queueing_perc" , type = float , default = 0.19 , help = "Queueing percentage." )
28
# Only "mean" and "p95" are handled where the estimate is computed, so reject
# any other value at parse time instead of failing mid-simulation.
parser.add_argument("--estimated_output_size", type=str, default="mean", choices=["mean", "p95"], help="How to estimate the output size: 'mean' uses the mean output size, 'p95' uses mean + 2 * std.")
parser.add_argument("--queueing_perc", type=float, default=np.inf, help="Queueing percentage.")
24
30
parser .add_argument ('--target-latency-lo' , nargs = '+' , type = float , help = 'List of target latencies for low priority requests.' )
25
31
parser .add_argument ('--target-latency-hi' , nargs = '+' , type = float , help = 'List of target latencies for high priority requests.' )
26
32
@@ -29,6 +35,8 @@ def main():
29
35
30
36
31
37
# One LLMActor is created per server, so this is a plain count (the previous
# help text was copied from the target-latency options).
parser.add_argument('--number-of-servers', type=int, default=6, help='Number of LLM servers (actors) to simulate.')
parser.add_argument('--output-file', type=str, default="result.csv", help='output file name.')
parser.add_argument('--routing-type', type=str, default="random", help='routing type')
32
40
33
41
args = parser .parse_args ()
34
42
@@ -58,6 +66,9 @@ def main():
58
66
prefix_latency_list_hi = args .prefix_latency_hi if args .prefix_latency_hi else ['hi' ]
59
67
60
68
number_of_servers = args .number_of_servers
69
+ output_file = args .output_file
70
+ routing_type = args .routing_type
71
+ estimated_output_size = args .estimated_output_size
61
72
62
73
# Define a structure to store results for all routing types
63
74
results = {
@@ -71,9 +82,9 @@ def main():
71
82
'recompute_cnt' : [], 'recompute_cnt_hi' : [], 'recompute_cnt_lo' : [],
72
83
'pct_below_latency_target_lo' : [], 'pct_below_latency_target_hi' : [], 'queue_time_lo' : [], 'queue_time_hi' : [],
73
84
'tol_lat_time_lo' : [], 'tol_lat_time_hi' : [],
74
- 'avg_prefill_queue_size' = [],
75
- 'avg_pending_tokens_perc' = [],
76
- 'avg_actual_tokens_perc' = []},
85
+ 'avg_prefill_queue_size' : [],
86
+ 'avg_pending_tokens_perc' : [],
87
+ 'avg_actual_tokens_perc' : [], 'request_count' : []},
77
88
78
89
'smart' : {'latency' : [], 'latency_lo' : [], 'latency_hi' : [],
79
90
'estimated_latency' : [], 'estimated_latency_lo' : [], 'estimated_latency_hi' : [],
@@ -87,9 +98,9 @@ def main():
87
98
'pct_below_latency_target_lo' : [], 'pct_below_latency_target_hi' : [],
88
99
'pct_below_latency_target_lo' : [], 'pct_below_latency_target_hi' : [], 'queue_time_lo' : [], 'queue_time_hi' : [],
89
100
'tol_lat_time_lo' : [], 'tol_lat_time_hi' : [],
90
- 'avg_prefill_queue_size' = [],
91
- 'avg_pending_tokens_perc' = [],
92
- 'avg_actual_tokens_perc' = []},
101
+ 'avg_prefill_queue_size' : [],
102
+ 'avg_pending_tokens_perc' : [],
103
+ 'avg_actual_tokens_perc' : [], 'request_count' : []},
93
104
94
105
'leastlatency' : {'latency' : [], 'latency_lo' : [], 'latency_hi' : [],
95
106
'throughput_prefill' : [], 'throughput_decode' : [],
@@ -101,9 +112,9 @@ def main():
101
112
'recompute_cnt' : [], 'recompute_cnt_hi' : [], 'recompute_cnt_lo' : [],
102
113
'pct_below_latency_target_lo' : [], 'pct_below_latency_target_hi' : [], 'queue_time_lo' : [], 'queue_time_hi' : [],
103
114
'tol_lat_time_lo' : [], 'tol_lat_time_hi' : [],
104
- 'avg_prefill_queue_size' = [],
105
- 'avg_pending_tokens_perc' = [],
106
- 'avg_actual_tokens_perc' = []},
115
+ 'avg_prefill_queue_size' : [],
116
+ 'avg_pending_tokens_perc' : [],
117
+ 'avg_actual_tokens_perc' : [], 'request_count' : []},
107
118
'least' : {'latency' : [], 'latency_lo' : [], 'latency_hi' : [],
108
119
'throughput_prefill' : [], 'throughput_decode' : [],
109
120
'throughput_prefill_lo' : [], 'throughput_decode_lo' : [],
@@ -114,9 +125,9 @@ def main():
114
125
'recompute_cnt' : [], 'recompute_cnt_hi' : [], 'recompute_cnt_lo' : [],
115
126
'pct_below_latency_target_lo' : [], 'pct_below_latency_target_hi' : [], 'queue_time_lo' : [], 'queue_time_hi' : [],
116
127
'tol_lat_time_lo' : [], 'tol_lat_time_hi' : [],
117
- 'avg_prefill_queue_size' = [],
118
- 'avg_pending_tokens_perc' = [],
119
- 'avg_actual_tokens_perc' = []},
128
+ 'avg_prefill_queue_size' : [],
129
+ 'avg_pending_tokens_perc' : [],
130
+ 'avg_actual_tokens_perc' : [], 'request_count' : []},
120
131
'random' : {'latency' : [], 'latency_lo' : [], 'latency_hi' : [],
121
132
'throughput_prefill' : [], 'throughput_decode' : [],
122
133
'throughput_prefill_lo' : [], 'throughput_decode_lo' : [],
@@ -127,12 +138,12 @@ def main():
127
138
'recompute_cnt' : [], 'recompute_cnt_hi' : [], 'recompute_cnt_lo' : [],
128
139
'pct_below_latency_target_lo' : [], 'pct_below_latency_target_hi' : [], 'queue_time_lo' : [], 'queue_time_hi' : [],
129
140
'tol_lat_time_lo' : [], 'tol_lat_time_hi' : [],
130
- 'avg_prefill_queue_size' = [],
131
- 'avg_pending_tokens_perc' = [],
132
- 'avg_actual_tokens_perc' = []},
141
+ 'avg_prefill_queue_size' : [],
142
+ 'avg_pending_tokens_perc' : [],
143
+ 'avg_actual_tokens_perc' : [], 'request_count' : []},
133
144
}
134
145
135
- all_routing_types = [ "random" , ]
146
+ all_routing_types = [ routing_type ]
136
147
prompt_output_tuple = None
137
148
138
149
# Iterate over routing types
@@ -144,15 +155,23 @@ def main():
144
155
req_dict_prefill = {}
145
156
SIM_DURATION = SIM_DURATIONS [i ]
146
157
print (f'Simulate with rate: for lo { rates_lo [i ]} and for hi { rates_hi [i ]} and routing type: { routing_type } ' )
147
-
158
+ sys . stdout . flush ()
148
159
# Simpy environment and LLM actors setup
149
160
env = simpy .Environment ()
150
161
list_of_llmactors = [LLMActor (env , 1 , id ) for id in range (number_of_servers )]
151
162
lb = LoadBalancer (env , number_of_servers = number_of_servers , list_of_llmactors = list_of_llmactors , req_dict_prefill = req_dict_prefill , req_dict = req_dict , messages_remaining_cnt = no_of_messages * 2 )
152
163
lb .queueing_perc = queueing_perc
153
164
154
- estimated_output_size = mean_output_size_1
155
- lb .process (rates_lo [i ], lora_requested_lo , target_latency_list_lo , prefix_latency_list_lo , routing_type , prompt_output_tuple , mean_request_size_1 , std_request_size_1 , mean_output_size_1 , std_output_size_1 , estimated_output_size )
165
# Choose the output-size estimate handed to the load balancer for each
# request stream (set 1 -> lo stream, set 2 -> hi stream).
if estimated_output_size == "mean":
    estimated_output_size_1 = mean_output_size_1
    estimated_output_size_2 = mean_output_size_2
elif estimated_output_size == "p95":
    # The "p95" mode is computed as the mean plus two standard deviations.
    estimated_output_size_1 = mean_output_size_1 + 2 * std_output_size_1
    estimated_output_size_2 = mean_output_size_2 + 2 * std_output_size_2
else:
    # Without this branch an unknown mode surfaces later as a NameError on
    # estimated_output_size_1, which hides the real cause.
    raise ValueError(
        f"Unsupported estimated_output_size {estimated_output_size!r}; expected 'mean' or 'p95'."
    )

# Low-priority request stream.
lb.process(
    rates_lo[i], lora_requested_lo, target_latency_list_lo, prefix_latency_list_lo,
    routing_type, prompt_output_tuple,
    mean_request_size_1, std_request_size_1, mean_output_size_1, std_output_size_1,
    estimated_output_size_1,
)
# High-priority request stream.
# NOTE(review): this call passes the set-1 request/output size parameters while
# using the set-2 estimate — confirm whether the *_2 sizes were intended here.
lb.process(
    rates_hi[i], lora_requested_hi, target_latency_list_hi, prefix_latency_list_hi,
    routing_type, prompt_output_tuple,
    mean_request_size_1, std_request_size_1, mean_output_size_1, std_output_size_1,
    estimated_output_size_2,
)
156
175
env .run (until = SIM_DURATION )
157
176
158
177
# Track which pod processed each request (lo and hi)
@@ -268,11 +287,14 @@ def main():
268
287
results [routing_type ]['avg_prefill_queue_size' ].append (np .mean (prefill_queue_size ))
269
288
results [routing_type ]['avg_pending_tokens_perc' ].append (np .mean (pending_tokens_at_arrival_perc ))
270
289
results [routing_type ]['avg_actual_tokens_perc' ].append (np .mean (actual_tokens_at_arrival_perc ))
290
+
271
291
272
- l1 = [np .sum (list (dict (x ).values ())) for x in results [routing_type ]['target_pods_lo' ]]
273
- l2 = [np .sum (list (dict (x ).values ())) for x in results [routing_type ]['target_pods_hi' ]]
292
+ l1 = [np .sum (list (dict (x ).values ())) for x in results [routing_type ]['target_pods_lo' ]][ - 1 ]
293
+ l2 = [np .sum (list (dict (x ).values ())) for x in results [routing_type ]['target_pods_hi' ]][ - 1 ]
274
294
275
- print (f'req count { [(l1 [i ], l2 [i ]) for i in range (len (l1 ))]} ' )
295
+ print (f'req count { (l1 , l2 )} ' )
296
+ sys .stdout .flush ()
297
+ results [routing_type ]['request_count' ].append (len (completed_req ))
276
298
277
299
if routing_type == 'smart' :
278
300
results [routing_type ]['estimated_latency' ].append (estimated_latency_cur )
@@ -288,18 +310,29 @@ def main():
288
310
print (f'QPS: { rates_lo [i ]} (lo), { rates_hi [i ]} (hi)' )
289
311
print (f'% of lo requests below target: { pct_below_target_lo } %' )
290
312
print (f'% of hi requests below target: { pct_below_target_hi } %' )
313
+ print (f"prefill_queue_size { np .mean (prefill_queue_size )} " )
314
+ print (f"pending_tokens_perc { np .mean (pending_tokens_at_arrival_perc )} " )
315
+ print (f"actual_tokens_perc { np .mean (actual_tokens_at_arrival_perc )} " )
316
+ sys .stdout .flush ()
317
+
318
+
291
319
292
320
# Create a timestamp
293
321
timestamp = datetime .now ().strftime ("%Y-%m-%d_%H-%M-%S" )
294
322
295
323
# Create the output file name with the timestamp
296
- output_file = f"results_ { timestamp } .json"
324
+
297
325
298
326
299
327
300
328
# Write results to CSV
329
# Ensure the output directory exists.
# A bare file name (e.g. the default "result.csv") has an empty dirname, and
# os.makedirs("") raises FileNotFoundError, so only create a directory when the
# path actually names one. exist_ok=True makes an existing directory a no-op.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
333
+
301
334
with open (output_file , 'w' , newline = '' ) as csvfile :
302
- fieldnames = ['RoutingType' , 'RateIndex' , 'Latency' , 'Latency_Lo' , 'Latency_Hi' ,'Estimated_Latency' , 'Estimated_Latency_lo ' , 'Estimated_Latency_hi ' , 'avg_prefill_queue_size' , 'avg_pending_tokens_perc ' , 'avg_actual_tokens_perc' ]
335
+ fieldnames = ['RoutingType' , 'RateIndex' , 'Latency' , 'Latency_Lo' , 'Latency_Hi' , 'avg_prefill_queue_size ' , 'avg_pending_tokens_perc ' , 'avg_actual_tokens_perc' , 'pct_below_latency_target_lo ' , 'pct_below_latency_target_hi' ]
303
336
writer = csv .DictWriter (csvfile , fieldnames = fieldnames )
304
337
305
338
writer .writeheader ()
@@ -313,12 +346,11 @@ def main():
313
346
'Latency' : results [routing_type ]['latency' ][i ],
314
347
'Latency_Lo' : results [routing_type ]['latency_lo' ][i ],
315
348
'Latency_Hi' : results [routing_type ]['latency_hi' ][i ],
316
- 'Estimated_Latency' : results [routing_type ]['estimated_latency' ][i ],
317
- 'Estimated_Latency_Lo' : results [routing_type ]['estimated_latency_lo' ][i ],
318
- 'Estimated_Latency_Hi' : results [routing_type ]['estimated_latency_hi' ][i ],
319
349
'avg_prefill_queue_size' : results [routing_type ]['avg_prefill_queue_size' ][i ],
320
350
'avg_pending_tokens_perc' : results [routing_type ]['avg_pending_tokens_perc' ][i ],
321
351
'avg_actual_tokens_perc' : results [routing_type ]['avg_actual_tokens_perc' ][i ],
352
+ 'pct_below_latency_target_lo' : results [routing_type ]['pct_below_latency_target_lo' ][i ],
353
+ 'pct_below_latency_target_hi' : results [routing_type ]['pct_below_latency_target_hi' ][i ],
322
354
})
323
355
324
356
print (f"Results have been saved to { output_file } " )
0 commit comments