Improve the filter to return multiple preferred pods instead of one; also fix metrics update bug #17
Changes from all commits
```diff
@@ -43,24 +43,24 @@ func (f *filter) Name() string {

 func (f *filter) Filter(b *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
 	if f == nil {
-		klog.V(2).Infof("Running nil filter, returning all input pods by default")
+		klog.V(3).Infof("Running nil filter, returning all input pods by default")
 		return pods, nil
 	}
-	klog.V(2).Infof("Running filter %q on request %v with %v pods", f.name, b, len(pods))
+	klog.V(3).Infof("Running filter %q on request %v with %v pods", f.name, b, len(pods))

 	filtered, err := f.filter(b, pods)

 	next := f.nextOnSuccessOrFailure
 	if err == nil {
-		klog.V(2).Infof("onSuccess %v -> %v, filtered: %v", f.name, next.Name(), len(filtered))
+		klog.V(3).Infof("onSuccess %v -> %v, filtered: %v", f.name, next.Name(), len(filtered))
 		if f.nextOnSuccess != nil {
			next = f.nextOnSuccess
 		}
 		// On success, pass the filtered result to the next filter.
 		return next.Filter(b, filtered)
 	}

-	klog.V(2).Infof("onFailure %v -> %v", f.name, next.Name())
+	klog.V(3).Infof("onFailure %v -> %v", f.name, next.Name())
 	if f.nextOnFailure != nil {
 		next = f.nextOnFailure
 	}
```
```diff
@@ -88,32 +88,57 @@ func toFilterFunc(pp podPredicate) filterFunc {
 	}
 }

+// leastQueuingFilterFunc finds the max and min queue size of all pods, divides the whole range
+// (max-min) by the number of pods, and finds the pods that fall into the first range.
+// The intuition is that if there are multiple pods that share similar queue size in the low range,
+// we should consider them all instead of the absolute minimum one. This worked better than picking
+// the least one as it gives more choices for the next filter, which on aggregate gave better
+// results.
+// TODO: Compare this strategy with other strategies such as top K.
 func leastQueuingFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
 	min := math.MaxInt
+	max := 0
 	filtered := []*backend.PodMetrics{}

 	for _, pod := range pods {
-		if pod.WaitingQueueSize < min {
+		if pod.WaitingQueueSize <= min {
 			min = pod.WaitingQueueSize
-			filtered = []*backend.PodMetrics{}
 		}
-		if pod.WaitingQueueSize == min {
+		if pod.WaitingQueueSize >= max {
+			max = pod.WaitingQueueSize
+		}
+	}
+
+	for _, pod := range pods {
+		if pod.WaitingQueueSize >= min && pod.WaitingQueueSize <= min+(max-min)/len(pods) {
 			filtered = append(filtered, pod)
 		}
 	}
 	return filtered, nil
 }
```
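To make the bucketing concrete, here is a minimal, self-contained sketch of the same selection rule. It is written outside the project: `selectLowQueue` is a hypothetical helper, the pod structs and metric fields from the repository are deliberately not used, and the queue sizes are made up.

```go
package main

import "fmt"

// selectLowQueue illustrates the range-based selection used by
// leastQueuingFilterFunc: keep every pod whose queue size falls into the
// first (max-min)/N slice above the minimum, rather than only the single
// smallest one. The queue sizes here are hypothetical.
func selectLowQueue(queueSizes []int) []int {
	min, max := queueSizes[0], queueSizes[0]
	for _, q := range queueSizes {
		if q < min {
			min = q
		}
		if q > max {
			max = q
		}
	}
	cutoff := min + (max-min)/len(queueSizes)
	selected := []int{}
	for _, q := range queueSizes {
		if q <= cutoff {
			selected = append(selected, q)
		}
	}
	return selected
}

func main() {
	// min=2, max=20, cutoff=2+(20-2)/4=6, so both 2 and 5 survive,
	// giving the next filter more than one candidate.
	fmt.Println(selectLowQueue([]int{5, 2, 12, 20})) // [5 2]
}
```

Note that when the pods are tightly clustered the cutoff collapses to roughly the minimum, and it widens as the spread grows, which is what gives the next filter more than one candidate to work with.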
The same range-based selection replaces the old bucketed-minimum logic for KV cache utilization:

```diff
+// leastKVCacheFilterFunc finds the max and min KV cache of all pods, divides the whole range
+// (max-min) by the number of pods, and finds the pods that fall into the first range.
+// The intuition is that if there are multiple pods that share similar KV cache in the low range, we
+// should consider them all instead of the absolute minimum one. This worked better than picking the
+// least one as it gives more choices for the next filter, which on aggregate gave better results.
+// TODO: Compare this strategy with other strategies such as top K.
 func leastKVCacheFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
-	min := math.MaxInt
+	min := math.MaxFloat64
+	max := math.SmallestNonzeroFloat64
 	filtered := []*backend.PodMetrics{}
-	margin := 5

 	for _, pod := range pods {
-		cur := int(pod.KVCacheUsagePercent) / margin
-		if cur < min {
-			min = cur
-			filtered = []*backend.PodMetrics{}
+		if pod.KVCacheUsagePercent <= min {
+			min = pod.KVCacheUsagePercent
 		}
-		if cur == min {
-			filtered = append(filtered, pod)
+		if pod.KVCacheUsagePercent >= max {
+			max = pod.KVCacheUsagePercent
 		}
 	}
+
+	for _, pod := range pods {
+		if pod.KVCacheUsagePercent >= min && pod.KVCacheUsagePercent <= min+(max-min)/float64(len(pods)) {
+			filtered = append(filtered, pod)
+		}
+	}
```

Review comment: What is the intuition behind setting the range this way, compared to finding the least-k elements, for example?

Reply: Nothing in particular. I will try to compare a few different algorithms and tune them with different parameters to see the difference, but for now I don't have data showing which one is better.

Reply: Updated the comment with the reason behind this, and added a TODO to compare with other strategies.
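As a concrete (and purely hypothetical) illustration of the range rule being discussed: with four pods whose KV cache usage is 0.10, 0.12, 0.50, and 0.90, the first loop yields min = 0.10 and max = 0.90, the cutoff is 0.10 + (0.90 - 0.10)/4 = 0.30, and the second loop keeps the pods at 0.10 and 0.12 while dropping the other two, so the next filter still has two candidates to choose from.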
Review comment (on the metrics refresh change, not shown above): Why not print this as part of the refresh loop above, instead of spawning a separate goroutine?

Reply: The refresh loop runs much more frequently (every 20ms in my benchmark).
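For readers unfamiliar with the pattern, here is a hedged sketch of the arrangement being described, with invented names and intervals (the project's actual metrics provider code is not part of this diff): a fast loop refreshes per-pod metrics every 20ms, while a separate, much slower goroutine prints a debug snapshot so the log is not flooded on every refresh.

```go
package main

import (
	"log"
	"sync"
	"time"
)

func main() {
	var mu sync.Mutex
	metrics := map[string]int{} // pod name -> waiting queue size (placeholder)

	// Fast refresh loop: stands in for scraping fresh metrics from each pod.
	go func() {
		for range time.Tick(20 * time.Millisecond) {
			mu.Lock()
			metrics["pod-a"]++ // placeholder for a real metrics update
			mu.Unlock()
		}
	}()

	// Slow debug printer: logs a snapshot without printing on every refresh.
	go func() {
		for range time.Tick(5 * time.Second) {
			mu.Lock()
			log.Printf("pod metrics snapshot: %v", metrics)
			mu.Unlock()
		}
	}()

	time.Sleep(11 * time.Second)
}
```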