Commit b9d12e3

remove extra lines
1 parent 8e94a9c commit b9d12e3

6 files changed: +54 -141 lines changed

simulations/llm_ig_simulation/src/benchmark_one_server.py (+4 -49)
@@ -1,7 +1,3 @@
-
-
-
-
 import argparse
 from collections import Counter
 import csv
@@ -27,16 +23,13 @@ def main():
 parser.add_argument("--queueing_perc", type=float, default=np.inf, help="Queueing percentage.")
 parser.add_argument('--target-latency-lo', nargs='+', type=float, help='List of target latencies for low priority requests.')
 parser.add_argument('--target-latency-hi', nargs='+', type=float, help='List of target latencies for high priority requests.')
-
 parser.add_argument('--prefix-latency-lo', nargs='+', type=float, help='List of prefix of target latencies for low priority requests.')
 parser.add_argument('--prefix-latency-hi', nargs='+', type=float, help='List of prefix of target latencies for high priority requests.')
-
-
-parser.add_argument('--number-of-servers', type=int, default=1, help='List of target latencies for high priority requests.')
+parser.add_argument('--number-of-servers', type=int, default=1, help='List of target latencies for high priority requests.')

 args = parser.parse_args()

-# Use provided arguments or defaults
+# Use provided arguments or defaults
 rates_lo = args.rates_lo
 rates_hi = args.rates_hi
 no_of_messages = args.no_of_messages
@@ -45,22 +38,17 @@ def main():
 std_request_size_1 = args.std_request_size_1
 mean_output_size_1 = args.mean_output_size_1
 std_output_size_1 = args.std_output_size_1
-
 mean_request_size_2 = args.mean_request_size_2
 std_request_size_2 = args.std_request_size_2
 mean_output_size_2 = args.mean_output_size_2
 std_output_size_2 = args.std_output_size_2
-
 queueing_perc = args.queueing_perc
 lora_requested_lo = ""
 lora_requested_hi = ""
-
 target_latency_list_lo = args.target_latency_lo if args.target_latency_lo else [0.025]
 target_latency_list_hi = args.target_latency_hi if args.target_latency_hi else [0.5]
-
 prefix_latency_list_lo = args.prefix_latency_lo if args.prefix_latency_lo else ['lo']
 prefix_latency_list_hi = args.prefix_latency_hi if args.prefix_latency_hi else ['hi']
-
 number_of_servers = args.number_of_servers

 # Define a structure to store results for all routing types
@@ -163,43 +151,24 @@ def main():
 lb.process(rates_lo[i], lora_requested_lo, target_latency_list_lo, prefix_latency_list_lo, routing_type, prompt_output_tuple, mean_request_size_1, std_request_size_1, mean_output_size_1, std_output_size_1, estimated_output_size)
 env.run(until=SIM_DURATION)

-
-
 # Completed requests
 completed_req = list(filter(lambda x: x.output_size_remaining == 0, req_dict.values()))
-
-
 completed_req_sorted = sorted(completed_req, key=lambda x: x.arrival_time)
-
-
 # Exclude the first 10% of requests based on end_decode_time
 exclude_count = int(0 * len(completed_req_sorted))
-
-
 # Filter out the first 10%
 filtered_req = completed_req_sorted[exclude_count:]

-
-
 # Calculate ttft, tpot, latency, and throughput
 ttft_cur = np.mean([x.end_prefill_time - x.arrival_time for x in req_dict.values()])
-
-
 tpot_cur = np.mean([(x.end_decode_time - x.start_prefill_time) / (x.output_size - x.output_size_remaining) for x in req_dict.values()])
-
 latency_cur = np.mean([(x.end_decode_time - x.arrival_time) / (x.output_size - x.output_size_remaining) for x in filtered_req])
-
 estimated_latency_cur = np.mean([x.estimated_latency for x in filtered_req])
-
 recompute_cur = np.sum([x.recompute_count for x in filtered_req]) / len(filtered_req)
-
 tt = SIM_DURATION
 throughput_prefill_cur = np.sum([x.input_size for x in filtered_req]) / tt
 throughput_decode_cur = np.sum([max(0, x.output_size - x.output_size_remaining - 1) for x in filtered_req]) / tt

-
-
-
 pending_tokens_at_arrival_perc = [x.pending_tokens_at_arrival_perc for x in completed_req]
 actual_tokens_at_arrival_perc = [x.actual_tokens_at_arrival_perc for x in completed_req]
 prefill_queue_size = [x.queue_size_before_prefill for x in completed_req]
@@ -210,32 +179,20 @@ def main():
 results[routing_type]['throughput_decode'].append(throughput_decode_cur)
 results[routing_type]['ttft'].append(ttft_cur)
 results[routing_type]['tpot'].append(tpot_cur)
-
-
 results[routing_type]['recompute_cnt'].append(recompute_cur)
-
-
-
 results[routing_type]['avg_prefill_queue_size'].append(np.mean(prefill_queue_size))
 results[routing_type]['avg_pending_tokens_perc'].append(np.mean(pending_tokens_at_arrival_perc))
 results[routing_type]['avg_actual_tokens_perc'].append(np.mean(actual_tokens_at_arrival_perc))

-
-
-
 # Create a timestamp
 timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-
 # Create the output file name with the timestamp
 output_file = f"results_{timestamp}.csv"

-
-
 # Write results to CSV
 with open(output_file, 'w', newline='') as csvfile:
-fieldnames = ['RoutingType', 'RateIndex', 'Latency', 'avg_prefill_queue_size', 'avg_pending_tokens_perc', 'avg_actual_tokens_perc' ]
+fieldnames = ['RoutingType', 'RateIndex', 'Latency', 'avg_prefill_queue_size', 'avg_pending_tokens_perc', 'avg_actual_tokens_perc']
 writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
-
 writer.writeheader()

 # Iterate over routing types and write each entry
@@ -251,8 +208,6 @@ def main():
 })

 print(f"Results have been saved to {output_file}")
-
-

 if __name__ == "__main__":
-main()
+main()
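
The CSV written by the last two hunks can be post-processed with a few lines of Python. A minimal sketch, assuming only the column names in the `fieldnames` list above; the `summarize` helper and the file name below are illustrative, not part of the repository, so point it at whatever `results_<timestamp>.csv` a run actually produced:

# Aggregate a results_<timestamp>.csv written by benchmark_one_server.py.
# Assumes the columns listed in `fieldnames` above; numeric columns are parsed as floats.
import csv
from collections import defaultdict

def summarize(path):
    latencies = defaultdict(list)
    with open(path, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            latencies[row['RoutingType']].append(float(row['Latency']))
    for routing_type, values in sorted(latencies.items()):
        print(f"{routing_type}: mean latency {sum(values) / len(values):.4f} over {len(values)} rate points")

if __name__ == "__main__":
    summarize("results_2024-01-01_00-00-00.csv")  # placeholder file name for illustration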
Second changed file (simulation constants; path not shown in this view)
@@ -1,8 +1,3 @@
-
-
-
-
-
 PREFILL_LATENCY_CONST_2 = 0
 PREFILL_LATENCY_CONST_1 = 0.00006769375513
 PREFILL_LATENCY_CONST_0 = 0.01969
@@ -11,17 +6,16 @@
 DECODE_LATENCY_CONST_BATCH = 0.0001026494433
 DECODE_LATENCY_CONST_1 = 0.0000005353485087
 DECODE_LATENCY_CONST_0 = 0.014
-TOKENIZE_LATENCY_CONST = 0
+TOKENIZE_LATENCY_CONST = 0

 MAX_NUM_BATCH_TOKENS = 512 # in prefill

 TOTAL_NUM_GPU_BLOCKS = 2810
 NUMBER_OF_TOKENS_PER_BLOCK = 16
-MAX_NUM_TOKENS_ALLOWED = TOTAL_NUM_GPU_BLOCKS*NUMBER_OF_TOKENS_PER_BLOCK - MAX_NUM_BATCH_TOKENS # in kv cache
+MAX_NUM_TOKENS_ALLOWED = TOTAL_NUM_GPU_BLOCKS * NUMBER_OF_TOKENS_PER_BLOCK - MAX_NUM_BATCH_TOKENS # in kv cache
 MAX_GPU_MEMORY_PERC_BEFORE_RECOMPUTE = 0.9
 MAX_GPU_MEMORY_PERC_BEFORE_RECOMPUTE_NON_CRITICAL = 0.8
 MAX_NUM_SEQ = 256

-
 # size of each lora in units of KV Cache
-LORA_DICT = {"tweet" : 1600, "sql" : 1600, "dummy-1" : 0, "dummy-2" : 0}
+LORA_DICT = {"tweet": 1600, "sql": 1600, "dummy-1": 0, "dummy-2": 0}

simulations/llm_ig_simulation/src/continous_batching.py (+39 -48)
@@ -7,7 +7,7 @@ def should_process_prefill_or_recompute(llmactor, env):
 """Check if the system should process prefill or recompute based on queue sizes and memory constraints."""
 return can_prefill_items(llmactor, env)

-def can_prefill_items(llmactor, env, ):
+def can_prefill_items(llmactor, env):
 """Are there items I can prefill?"""
 prefill_batch_size = 0
 num_new_seq = 0
@@ -25,8 +25,8 @@ def can_prefill_items(llmactor, env, ):
 break

 return True
-while llmactor.get_prefill_queue_size() > 0:

+while llmactor.get_prefill_queue_size() > 0:
 oldest_item = llmactor.prefill_store.items[0]
 oldest_item_len = oldest_item.input_size + oldest_item.output_size - oldest_item.output_size_remaining
 oldest_item_input_len = oldest_item.input_size
@@ -42,8 +42,7 @@ def can_prefill_items(llmactor, env, ):

 return False

-
-def fetch_prefill_items(llmactor, env, ):
+def fetch_prefill_items(llmactor, env):
 """Fetch items to prefill if there is memory either from recompute (p0) or from prefill (p1)"""
 items_to_prefill = []
 prefill_batch_size = 0
@@ -83,21 +82,19 @@ def fetch_prefill_items(llmactor, env, ):
 msg = yield llmactor.prefill_store.get()
 items_to_prefill.append(msg)

-
 return items_to_prefill

-def process_prefill_items( llmactor, env, items_to_prefill, req_dict_prefill, req_dict, logging = False):
+def process_prefill_items(llmactor, env, items_to_prefill, req_dict_prefill, req_dict, logging=False):
 """Process prefill items, updating times and managing item states."""
 prefill_len = np.sum([x.input_size + x.output_size - x.output_size_remaining for x in items_to_prefill])
-prefill_delay = calculate_prefill_delay(prefill_len, len(items_to_prefill), TOKENIZE_LATENCY_CONST, PREFILL_LATENCY_CONST_2, PREFILL_LATENCY_CONST_1 , PREFILL_LATENCY_CONST_0, PREFILL_LATENCY_CONST_MIN)
-
+prefill_delay = calculate_prefill_delay(prefill_len, len(items_to_prefill), TOKENIZE_LATENCY_CONST, PREFILL_LATENCY_CONST_2, PREFILL_LATENCY_CONST_1, PREFILL_LATENCY_CONST_0, PREFILL_LATENCY_CONST_MIN)

 for item in items_to_prefill:
-#lora stuff
+# lora stuff
 if item.lora is not None:
-if item.lora not in llmactor.lora_loaded:
+if item.lora not in llmactor.lora_loaded:
 llmactor.lora_loaded.add(item.lora)
-llmactor.max_num_tokens_allowed -= LORA_DICT[item.lora]
+llmactor.max_num_tokens_allowed -= LORA_DICT[item.lora]

 if item.start_prefill_time is None:
 item.start_prefill_time = env.now
@@ -110,9 +107,9 @@ def process_prefill_items( llmactor, env, items_to_prefill, req_dict_prefill, re
 else:
 llmactor.decode_store.put(item)
 if item.output_size_remaining <= 0:
-if logging:
-print(f'llmactor {llmactor.id} {item.id} item.output_size_remaining {item.output_size_remaining}')
-assert item.output_size_remaining > 0
+if logging:
+print(f'llmactor {llmactor.id} {item.id} item.output_size_remaining {item.output_size_remaining}')
+assert item.output_size_remaining > 0
 req_dict_prefill[item.id] = item
 req_dict[item.id] = item
 return prefill_delay
@@ -121,13 +118,13 @@ def should_recompute(llmactor, env):
 """Determine if items should be moved to recompute based on memory usage."""
 return llmactor.get_expected_num_tokens_in_kvcache_after_decode() / (llmactor.max_num_tokens_allowed + 0.0) > MAX_GPU_MEMORY_PERC_BEFORE_RECOMPUTE

-def remove_from_decode_store(llmactor, env, req_dict_prefill, req_dict, logging = False):
+def remove_from_decode_store(llmactor, env, req_dict_prefill, req_dict, logging=False):
 """Manage the recomputation of items based on priority and conditions."""
 while should_recompute(llmactor, env):
 if llmactor.get_decode_queue_size() > 0:
 newest_decode_item_id = llmactor.decode_store.items[-1].id # newest item goes to recompute
 if logging:
-print(f'llmactor {llmactor.id} removing from decode store sequence {newest_decode_item_id}')
+print(f'llmactor {llmactor.id} removing from decode store sequence {newest_decode_item_id}')
 req_dict[newest_decode_item_id].recompute_count += 1

 newest_decode_item = yield llmactor.decode_store.get(lambda req: req.id == newest_decode_item_id)
@@ -140,13 +137,13 @@ def decode_items(llmactor, env, req_dict_prefill, req_dict, logging=False):
 temp_items = []
 decode_delay = calculate_decode_delay(before_decoding_token_count, num_items_to_decode, TOKENIZE_LATENCY_CONST, DECODE_LATENCY_CONST_1, DECODE_LATENCY_CONST_0, DECODE_LATENCY_CONST_BATCH)
 if logging:
-print(f'llmactor {llmactor.id} Decoding sequences {[x.id for x in llmactor.decode_store.items]} items with delay {decode_delay}')
+print(f'llmactor {llmactor.id} Decoding sequences {[x.id for x in llmactor.decode_store.items]} items with delay {decode_delay}')

 for _ in range(num_items_to_decode):
 msg = yield llmactor.decode_store.get()
-if msg.output_size_remaining == msg.output_size-1:
-msg.start_decode_time = env.now
-msg.tokens_in_kv_cache_at_start_of_decode = before_decoding_token_count
+if msg.output_size_remaining == msg.output_size - 1:
+msg.start_decode_time = env.now
+msg.tokens_in_kv_cache_at_start_of_decode = before_decoding_token_count
 msg.output_size_remaining -= 1
 if msg.output_size_remaining < 0:
 raise ValueError(f'Output size remaining negative for {msg.id}')
@@ -155,57 +152,51 @@ def decode_items(llmactor, env, req_dict_prefill, req_dict, logging=False):
 req_dict_prefill[msg.id] = msg
 req_dict[msg.id] = msg

-
-
 for item in temp_items:
 if item.output_size_remaining == 0:
 item.end_decode_time = env.now + decode_delay
-
 llmactor.decoded_store.put(item)
 else:
 item.end_decode_time = env.now + decode_delay
 llmactor.decode_store.put(item)

 return decode_delay

-def calculate_decode_delay(token_count, num_items_to_decode, tokenize_latency_const, decode_latency_const_1, decode_latency_const_0, decode_latency_const_batch):
+def calculate_decode_delay(token_count, num_items_to_decode, tokenize_latency_const, decode_latency_const_1, decode_latency_const_0, decode_latency_const_batch):
 """Calculate delay based on the token count and latency constants."""
-return token_count * decode_latency_const_1 + decode_latency_const_0 + (tokenize_latency_const + decode_latency_const_batch)* num_items_to_decode
+return token_count * decode_latency_const_1 + decode_latency_const_0 + (tokenize_latency_const + decode_latency_const_batch) * num_items_to_decode

 def calculate_prefill_delay(token_count, num_items_to_prefill, tokenize_latency_const, prefill_latency_const_2, prefill_latency_const_1, prefill_latency_const_0, prefill_latency_const_min):
 """Calculate delay based on the token count and latency constants."""
-return max(prefill_latency_const_min, (token_count * token_count * prefill_latency_const_2 + token_count*prefill_latency_const_1 + prefill_latency_const_0 + num_items_to_prefill * tokenize_latency_const))
+return max(prefill_latency_const_min, (token_count * token_count * prefill_latency_const_2 + token_count * prefill_latency_const_1 + prefill_latency_const_0 + num_items_to_prefill * tokenize_latency_const))

-def prefill_or_decode(env, llmactor, req_dict_prefill, req_dict, logging = False):
+def prefill_or_decode(env, llmactor, req_dict_prefill, req_dict, logging=False):
 """Main process for managing prefill, decode, or recompute operations."""
 while True:
-
 with llmactor.actor.request() as req:
-
 yield req
 if (llmactor.get_decode_queue_size() == 0) and (llmactor.get_prefill_queue_size() == 0) and (llmactor.get_recompute_queue_size() == 0):
-yield env.timeout(1/1000.0)
+yield env.timeout(1 / 1000.0)
 elif should_process_prefill_or_recompute(llmactor, env):
 items_to_prefill = yield from fetch_prefill_items(llmactor, env)
-prefill_delay = process_prefill_items( llmactor, env,items_to_prefill, req_dict_prefill, req_dict)
+prefill_delay = process_prefill_items(llmactor, env, items_to_prefill, req_dict_prefill, req_dict)
 if logging:
-print(f'llmactor {llmactor.id} Processed prefill for sequences {[x.id for x in items_to_prefill]} with delay {prefill_delay}')
+print(f'llmactor {llmactor.id} Processed prefill for sequences {[x.id for x in items_to_prefill]} with delay {prefill_delay}')
 yield env.timeout(prefill_delay) # Assume prefill_delay is calculated somewhere
 else:
-if should_recompute(llmactor, env):
-yield from remove_from_decode_store(llmactor, env, req_dict_prefill, req_dict)
-if llmactor.get_decode_queue_size() > 0:
-decode_delay = yield from decode_items(llmactor, env, req_dict_prefill, req_dict)
-yield env.timeout(decode_delay)
-
+if should_recompute(llmactor, env):
+yield from remove_from_decode_store(llmactor, env, req_dict_prefill, req_dict)
+if llmactor.get_decode_queue_size() > 0:
+decode_delay = yield from decode_items(llmactor, env, req_dict_prefill, req_dict)
+yield env.timeout(decode_delay)
+
 def metrics(env, llmactor):
-while True:
-yield env.timeout(10)
-cur_time = env.now
-num_of_prompt_tokens = llmactor.get_num_prompt_tokens_in_decode() + llmactor.get_num_prompt_tokens_in_decoded()
-num_of_gen_tokens = llmactor.get_num_gen_tokens_in_decode() + llmactor.get_num_gen_tokens_in_decoded()
-running_req = llmactor.get_decode_queue_size()
-pending_req = llmactor.get_prefill_queue_size()
-gpu_kv_cache_usage = llmactor.get_num_tokens_in_decode()/llmactor.max_num_tokens_allowed * 100
-print(f'llmactor {llmactor.id} Avg prompt throughput: {num_of_prompt_tokens/cur_time} tokens/s, Avg generation throughput: {num_of_gen_tokens/cur_time}, Running: {running_req} reqs, Pending: {pending_req} reqs, GPU KV cache usage: {gpu_kv_cache_usage}%')
-
+while True:
+yield env.timeout(10)
+cur_time = env.now
+num_of_prompt_tokens = llmactor.get_num_prompt_tokens_in_decode() + llmactor.get_num_prompt_tokens_in_decoded()
+num_of_gen_tokens = llmactor.get_num_gen_tokens_in_decode() + llmactor.get_num_gen_tokens_in_decoded()
+running_req = llmactor.get_decode_queue_size()
+pending_req = llmactor.get_prefill_queue_size()
+gpu_kv_cache_usage = llmactor.get_num_tokens_in_decode() / llmactor.max_num_tokens_allowed * 100
+print(f'llmactor {llmactor.id} Avg prompt throughput: {num_of_prompt_tokens / cur_time} tokens/s, Avg generation throughput: {num_of_gen_tokens / cur_time}, Running: {running_req} reqs, Pending: {pending_req} reqs, GPU KV cache usage: {gpu_kv_cache_usage}%')
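
The two calculate_*_delay functions touched above are pure formulas, so they can be checked in isolation against the latency constants from this commit. A minimal sketch: the token counts in the example calls are made up for illustration, and PREFILL_LATENCY_CONST_MIN does not appear in the captured diff, so it is assumed to be 0 here.

# Plug the constants from this commit into the delay formulas above.
PREFILL_LATENCY_CONST_2 = 0
PREFILL_LATENCY_CONST_1 = 0.00006769375513
PREFILL_LATENCY_CONST_0 = 0.01969
PREFILL_LATENCY_CONST_MIN = 0  # assumption: value not shown in this diff
DECODE_LATENCY_CONST_BATCH = 0.0001026494433
DECODE_LATENCY_CONST_1 = 0.0000005353485087
DECODE_LATENCY_CONST_0 = 0.014
TOKENIZE_LATENCY_CONST = 0

def calculate_decode_delay(token_count, num_items_to_decode, tokenize_latency_const,
                           decode_latency_const_1, decode_latency_const_0, decode_latency_const_batch):
    # Linear in tokens already in the KV cache, plus a per-sequence batch term.
    return (token_count * decode_latency_const_1 + decode_latency_const_0
            + (tokenize_latency_const + decode_latency_const_batch) * num_items_to_decode)

def calculate_prefill_delay(token_count, num_items_to_prefill, tokenize_latency_const,
                            prefill_latency_const_2, prefill_latency_const_1,
                            prefill_latency_const_0, prefill_latency_const_min):
    # Quadratic/linear in prompt tokens, floored at a minimum prefill latency.
    return max(prefill_latency_const_min,
               token_count * token_count * prefill_latency_const_2
               + token_count * prefill_latency_const_1
               + prefill_latency_const_0
               + num_items_to_prefill * tokenize_latency_const)

# A prefill batch of 8 sequences totalling 512 prompt tokens:
print(calculate_prefill_delay(512, 8, TOKENIZE_LATENCY_CONST, PREFILL_LATENCY_CONST_2,
                              PREFILL_LATENCY_CONST_1, PREFILL_LATENCY_CONST_0,
                              PREFILL_LATENCY_CONST_MIN))   # about 0.054 s
# One decode step with 4096 tokens in the KV cache and 8 sequences in the batch:
print(calculate_decode_delay(4096, 8, TOKENIZE_LATENCY_CONST, DECODE_LATENCY_CONST_1,
                             DECODE_LATENCY_CONST_0, DECODE_LATENCY_CONST_BATCH))  # about 0.017 s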
