@@ -7,7 +7,7 @@ def should_process_prefill_or_recompute(llmactor, env):
     """Check if the system should process prefill or recompute based on queue sizes and memory constraints."""
     return can_prefill_items(llmactor, env)
 
-def can_prefill_items(llmactor, env, ):
+def can_prefill_items(llmactor, env):
     """Are there items I can prefill?"""
     prefill_batch_size = 0
     num_new_seq = 0
@@ -25,8 +25,8 @@ def can_prefill_items(llmactor, env, ):
             break
 
     return True
-    while llmactor.get_prefill_queue_size() > 0:
 
+    while llmactor.get_prefill_queue_size() > 0:
         oldest_item = llmactor.prefill_store.items[0]
         oldest_item_len = oldest_item.input_size + oldest_item.output_size - oldest_item.output_size_remaining
         oldest_item_input_len = oldest_item.input_size
@@ -42,8 +42,7 @@ def can_prefill_items(llmactor, env, ):
 
     return False
 
-
-def fetch_prefill_items(llmactor, env, ):
+def fetch_prefill_items(llmactor, env):
     """Fetch items to prefill if there is memory either from recompute (p0) or from prefill (p1)"""
     items_to_prefill = []
     prefill_batch_size = 0
@@ -83,21 +82,19 @@ def fetch_prefill_items(llmactor, env, ):
             msg = yield llmactor.prefill_store.get()
             items_to_prefill.append(msg)
 
-
     return items_to_prefill
 
-def process_prefill_items( llmactor, env, items_to_prefill, req_dict_prefill, req_dict, logging = False):
+def process_prefill_items(llmactor, env, items_to_prefill, req_dict_prefill, req_dict, logging=False):
     """Process prefill items, updating times and managing item states."""
     prefill_len = np.sum([x.input_size + x.output_size - x.output_size_remaining for x in items_to_prefill])
-    prefill_delay = calculate_prefill_delay (prefill_len, len(items_to_prefill), TOKENIZE_LATENCY_CONST, PREFILL_LATENCY_CONST_2, PREFILL_LATENCY_CONST_1, PREFILL_LATENCY_CONST_0, PREFILL_LATENCY_CONST_MIN)
-
+    prefill_delay = calculate_prefill_delay(prefill_len, len(items_to_prefill), TOKENIZE_LATENCY_CONST, PREFILL_LATENCY_CONST_2, PREFILL_LATENCY_CONST_1, PREFILL_LATENCY_CONST_0, PREFILL_LATENCY_CONST_MIN)
 
     for item in items_to_prefill:
-        #lora stuff
+        # lora stuff
         if item.lora is not None:
-          if item.lora not in llmactor.lora_loaded :
+            if item.lora not in llmactor.lora_loaded:
                 llmactor.lora_loaded.add(item.lora)
-                llmactor.max_num_tokens_allowed -= LORA_DICT [item.lora]
+                llmactor.max_num_tokens_allowed -= LORA_DICT[item.lora]
 
         if item.start_prefill_time is None:
             item.start_prefill_time = env.now
@@ -110,9 +107,9 @@ def process_prefill_items( llmactor, env, items_to_prefill, req_dict_prefill, re
         else:
             llmactor.decode_store.put(item)
         if item.output_size_remaining <= 0:
-          if logging:
-            print(f'llmactor {llmactor.id} {item.id} item.output_size_remaining {item.output_size_remaining}')
-          assert item.output_size_remaining > 0
+            if logging:
+                print(f'llmactor {llmactor.id} {item.id} item.output_size_remaining {item.output_size_remaining}')
+            assert item.output_size_remaining > 0
         req_dict_prefill[item.id] = item
         req_dict[item.id] = item
     return prefill_delay
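For readers skimming the two hunks above: a sequence's KV-cache footprint is counted as input_size + output_size - output_size_remaining (the prompt plus the tokens generated so far), and loading a LoRA adapter permanently deducts from the actor's token budget. A small worked example with made-up sizes (the LORA_DICT entry is purely illustrative, not from the repo):

    # Prompt of 512 tokens; 128 to generate, 100 still remaining -> 28 done.
    input_size, output_size, output_size_remaining = 512, 128, 100
    tokens_in_kv = input_size + output_size - output_size_remaining  # 512 + 28 = 540

    # Loading an adapter shrinks the budget, mirroring
    # llmactor.max_num_tokens_allowed -= LORA_DICT[item.lora] above.
    LORA_DICT = {"my-adapter": 4096}       # illustrative entry
    max_num_tokens_allowed = 100_000       # hypothetical budget
    max_num_tokens_allowed -= LORA_DICT["my-adapter"]
    print(tokens_in_kv, max_num_tokens_allowed)  # 540 95904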
@@ -121,13 +118,13 @@ def should_recompute(llmactor, env):
     """Determine if items should be moved to recompute based on memory usage."""
    return llmactor.get_expected_num_tokens_in_kvcache_after_decode() / (llmactor.max_num_tokens_allowed + 0.0) > MAX_GPU_MEMORY_PERC_BEFORE_RECOMPUTE
 
-def remove_from_decode_store(llmactor, env, req_dict_prefill, req_dict, logging = False):
+def remove_from_decode_store(llmactor, env, req_dict_prefill, req_dict, logging=False):
     """Manage the recomputation of items based on priority and conditions."""
     while should_recompute(llmactor, env):
         if llmactor.get_decode_queue_size() > 0:
             newest_decode_item_id = llmactor.decode_store.items[-1].id  # newest item goes to recompute
             if logging:
-              print(f'llmactor {llmactor.id} removing from decode store sequence {newest_decode_item_id}')
+                print(f'llmactor {llmactor.id} removing from decode store sequence {newest_decode_item_id}')
             req_dict[newest_decode_item_id].recompute_count += 1
 
             newest_decode_item = yield llmactor.decode_store.get(lambda req: req.id == newest_decode_item_id)
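The trigger in should_recompute is a plain occupancy ratio: eviction starts once the KV tokens expected after the next decode step exceed MAX_GPU_MEMORY_PERC_BEFORE_RECOMPUTE of the actor's token budget. A minimal sketch with assumed numbers (the real constant is defined elsewhere in this file):

    MAX_GPU_MEMORY_PERC_BEFORE_RECOMPUTE = 0.9  # assumed value, for illustration
    expected_tokens_after_decode = 92_000       # hypothetical
    max_num_tokens_allowed = 100_000            # hypothetical
    # Mirrors should_recompute: 0.92 > 0.9, so the newest decode item is
    # evicted to the recompute queue until the ratio drops below threshold.
    print(expected_tokens_after_decode / max_num_tokens_allowed
          > MAX_GPU_MEMORY_PERC_BEFORE_RECOMPUTE)  # True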
@@ -140,13 +137,13 @@ def decode_items(llmactor, env, req_dict_prefill, req_dict, logging=False):
     temp_items = []
     decode_delay = calculate_decode_delay(before_decoding_token_count, num_items_to_decode, TOKENIZE_LATENCY_CONST, DECODE_LATENCY_CONST_1, DECODE_LATENCY_CONST_0, DECODE_LATENCY_CONST_BATCH)
     if logging:
-      print(f'llmactor {llmactor.id} Decoding sequences {[x.id for x in llmactor.decode_store.items]} items with delay {decode_delay}')
+        print(f'llmactor {llmactor.id} Decoding sequences {[x.id for x in llmactor.decode_store.items]} items with delay {decode_delay}')
 
     for _ in range(num_items_to_decode):
         msg = yield llmactor.decode_store.get()
-        if msg.output_size_remaining == msg.output_size - 1 :
-          msg.start_decode_time = env.now
-          msg.tokens_in_kv_cache_at_start_of_decode = before_decoding_token_count
+        if msg.output_size_remaining == msg.output_size - 1:
+            msg.start_decode_time = env.now
+            msg.tokens_in_kv_cache_at_start_of_decode = before_decoding_token_count
         msg.output_size_remaining -= 1
         if msg.output_size_remaining < 0:
             raise ValueError(f'Output size remaining negative for {msg.id}')
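decode_items charges one batched delay per decode step via calculate_decode_delay (defined in the next hunk): linear in the resident token count, plus a per-sequence tokenize/batch term. A toy evaluation with invented constants:

    # delay = tokens * c1 + c0 + (tokenize + batch) * batch_size
    TOKENIZE_LATENCY_CONST = 0.0001     # all constants here are made up
    DECODE_LATENCY_CONST_1 = 0.00002
    DECODE_LATENCY_CONST_0 = 0.01
    DECODE_LATENCY_CONST_BATCH = 0.0005
    tokens, batch = 10_000, 8
    delay = (tokens * DECODE_LATENCY_CONST_1 + DECODE_LATENCY_CONST_0
             + (TOKENIZE_LATENCY_CONST + DECODE_LATENCY_CONST_BATCH) * batch)
    print(delay)  # 0.2 + 0.01 + 0.0048 = 0.2148

calculate_prefill_delay follows the same pattern but is quadratic in the token count, with a floor of prefill_latency_const_min.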
@@ -155,57 +152,51 @@ def decode_items(llmactor, env, req_dict_prefill, req_dict, logging=False):
         req_dict_prefill[msg.id] = msg
         req_dict[msg.id] = msg
 
-
-
     for item in temp_items:
         if item.output_size_remaining == 0:
             item.end_decode_time = env.now + decode_delay
-
             llmactor.decoded_store.put(item)
         else:
             item.end_decode_time = env.now + decode_delay
             llmactor.decode_store.put(item)
 
     return decode_delay
 
-def calculate_decode_delay (token_count, num_items_to_decode, tokenize_latency_const, decode_latency_const_1, decode_latency_const_0, decode_latency_const_batch):
+def calculate_decode_delay(token_count, num_items_to_decode, tokenize_latency_const, decode_latency_const_1, decode_latency_const_0, decode_latency_const_batch):
     """Calculate delay based on the token count and latency constants."""
-    return token_count * decode_latency_const_1 + decode_latency_const_0 + (tokenize_latency_const + decode_latency_const_batch)* num_items_to_decode
+    return token_count * decode_latency_const_1 + decode_latency_const_0 + (tokenize_latency_const + decode_latency_const_batch) * num_items_to_decode
 
 def calculate_prefill_delay(token_count, num_items_to_prefill, tokenize_latency_const, prefill_latency_const_2, prefill_latency_const_1, prefill_latency_const_0, prefill_latency_const_min):
     """Calculate delay based on the token count and latency constants."""
-    return max (prefill_latency_const_min, (token_count * token_count * prefill_latency_const_2 + token_count * prefill_latency_const_1 + prefill_latency_const_0 + num_items_to_prefill * tokenize_latency_const))
+    return max(prefill_latency_const_min, (token_count * token_count * prefill_latency_const_2 + token_count * prefill_latency_const_1 + prefill_latency_const_0 + num_items_to_prefill * tokenize_latency_const))
 
-def prefill_or_decode(env, llmactor, req_dict_prefill, req_dict, logging = False):
+def prefill_or_decode(env, llmactor, req_dict_prefill, req_dict, logging=False):
     """Main process for managing prefill, decode, or recompute operations."""
     while True:
-
         with llmactor.actor.request() as req:
-
             yield req
             if (llmactor.get_decode_queue_size() == 0) and (llmactor.get_prefill_queue_size() == 0) and (llmactor.get_recompute_queue_size() == 0):
-              yield env.timeout(1/1000.0)
+                yield env.timeout(1/1000.0)
             elif should_process_prefill_or_recompute(llmactor, env):
                 items_to_prefill = yield from fetch_prefill_items(llmactor, env)
-                prefill_delay = process_prefill_items( llmactor, env ,items_to_prefill, req_dict_prefill, req_dict)
+                prefill_delay = process_prefill_items(llmactor, env, items_to_prefill, req_dict_prefill, req_dict)
                 if logging:
-                  print(f'llmactor {llmactor.id} Processed prefill for sequences {[x.id for x in items_to_prefill]} with delay {prefill_delay}')
+                    print(f'llmactor {llmactor.id} Processed prefill for sequences {[x.id for x in items_to_prefill]} with delay {prefill_delay}')
                 yield env.timeout(prefill_delay)  # Assume prefill_delay is calculated somewhere
             else:
-              if should_recompute(llmactor, env):
-                yield from remove_from_decode_store(llmactor, env, req_dict_prefill, req_dict)
-              if llmactor.get_decode_queue_size() > 0:
-                decode_delay = yield from decode_items(llmactor, env, req_dict_prefill, req_dict)
-                yield env.timeout(decode_delay)
-
+                if should_recompute(llmactor, env):
+                    yield from remove_from_decode_store(llmactor, env, req_dict_prefill, req_dict)
+                if llmactor.get_decode_queue_size() > 0:
+                    decode_delay = yield from decode_items(llmactor, env, req_dict_prefill, req_dict)
+                    yield env.timeout(decode_delay)
+
 def metrics(env, llmactor):
-  while True:
-    yield env.timeout(10)
-    cur_time = env.now
-    num_of_prompt_tokens = llmactor.get_num_prompt_tokens_in_decode() + llmactor.get_num_prompt_tokens_in_decoded()
-    num_of_gen_tokens = llmactor.get_num_gen_tokens_in_decode() + llmactor.get_num_gen_tokens_in_decoded()
-    running_req = llmactor.get_decode_queue_size()
-    pending_req = llmactor.get_prefill_queue_size()
-    gpu_kv_cache_usage = llmactor.get_num_tokens_in_decode()/llmactor.max_num_tokens_allowed * 100
-    print(f'llmactor {llmactor.id} Avg prompt throughput: {num_of_prompt_tokens / cur_time} tokens/s, Avg generation throughput: {num_of_gen_tokens / cur_time}, Running: {running_req} reqs, Pending: {pending_req} reqs, GPU KV cache usage: {gpu_kv_cache_usage}%')
-
+    while True:
+        yield env.timeout(10)
+        cur_time = env.now
+        num_of_prompt_tokens = llmactor.get_num_prompt_tokens_in_decode() + llmactor.get_num_prompt_tokens_in_decoded()
+        num_of_gen_tokens = llmactor.get_num_gen_tokens_in_decode() + llmactor.get_num_gen_tokens_in_decoded()
+        running_req = llmactor.get_decode_queue_size()
+        pending_req = llmactor.get_prefill_queue_size()
+        gpu_kv_cache_usage = llmactor.get_num_tokens_in_decode() / llmactor.max_num_tokens_allowed * 100
+        print(f'llmactor {llmactor.id} Avg prompt throughput: {num_of_prompt_tokens / cur_time} tokens/s, Avg generation throughput: {num_of_gen_tokens / cur_time}, Running: {running_req} reqs, Pending: {pending_req} reqs, GPU KV cache usage: {gpu_kv_cache_usage}%')
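Both prefill_or_decode and metrics are SimPy generator processes: the former holds the actor's resource while it batches prefill or decode work, and the latter wakes every 10 simulated time units to report throughput and KV-cache occupancy. A minimal wiring sketch, assuming an LLMActor class defined elsewhere in the repo that provides the stores and helpers used above (its constructor signature here is a guess):

    import simpy

    env = simpy.Environment()
    llmactor = LLMActor(env)              # hypothetical constructor
    req_dict, req_dict_prefill = {}, {}   # shared request registries

    # Register both generators as concurrent SimPy processes.
    env.process(prefill_or_decode(env, llmactor, req_dict_prefill, req_dict, logging=True))
    env.process(metrics(env, llmactor))
    env.run(until=1000)                   # simulate 1000 time units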