From 6c6028b5a34574b06c8f3657e77e318546e181c1 Mon Sep 17 00:00:00 2001 From: Cong Liu Date: Wed, 12 Mar 2025 00:15:48 +0000 Subject: [PATCH] Fix metrics debug log; change metrics client log level to reduce spam --- pkg/epp/backend/metrics/fake.go | 4 ++++ pkg/epp/backend/metrics/logger.go | 4 +++- pkg/epp/backend/metrics/pod_metrics.go | 5 +++++ pkg/epp/backend/metrics/types.go | 8 ++++++++ pkg/epp/backend/vllm/metrics.go | 13 ++++++------- 5 files changed, 26 insertions(+), 8 deletions(-) diff --git a/pkg/epp/backend/metrics/fake.go b/pkg/epp/backend/metrics/fake.go index fae7149d8..7fd4970db 100644 --- a/pkg/epp/backend/metrics/fake.go +++ b/pkg/epp/backend/metrics/fake.go @@ -34,6 +34,10 @@ type FakePodMetrics struct { Metrics *Metrics } +func (fpm *FakePodMetrics) String() string { + return fmt.Sprintf("Pod: %v; Metrics: %v", fpm.GetPod(), fpm.GetMetrics()) +} + func (fpm *FakePodMetrics) GetPod() *Pod { return fpm.Pod } diff --git a/pkg/epp/backend/metrics/logger.go b/pkg/epp/backend/metrics/logger.go index 664115eb3..74735755a 100644 --- a/pkg/epp/backend/metrics/logger.go +++ b/pkg/epp/backend/metrics/logger.go @@ -18,6 +18,7 @@ package metrics import ( "context" + "fmt" "time" "github.com/go-logr/logr" @@ -76,7 +77,8 @@ func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometh podsWithStaleMetrics := datastore.PodList(func(pm PodMetrics) bool { return time.Since(pm.GetMetrics().UpdateTime) > metricsValidityPeriod }) - logger.Info("Current Pods and metrics gathered", "fresh metrics", podsWithFreshMetrics, "stale metrics", podsWithStaleMetrics) + s := fmt.Sprintf("Current Pods and metrics gathered. Fresh metrics: %+v, Stale metrics: %+v", podsWithFreshMetrics, podsWithStaleMetrics) + logger.Info(s) } } }() diff --git a/pkg/epp/backend/metrics/pod_metrics.go b/pkg/epp/backend/metrics/pod_metrics.go index f76c2e8cf..b954a98ce 100644 --- a/pkg/epp/backend/metrics/pod_metrics.go +++ b/pkg/epp/backend/metrics/pod_metrics.go @@ -18,6 +18,7 @@ package metrics import ( "context" + "fmt" "sync" "sync/atomic" "time" @@ -52,6 +53,10 @@ type PodMetricsClient interface { FetchMetrics(ctx context.Context, pod *Pod, existing *Metrics, port int32) (*Metrics, error) } +func (pm *podMetrics) String() string { + return fmt.Sprintf("Pod: %v; Metrics: %v", pm.GetPod(), pm.GetMetrics()) +} + func (pm *podMetrics) GetPod() *Pod { return (*Pod)(atomic.LoadPointer(&pm.pod)) } diff --git a/pkg/epp/backend/metrics/types.go b/pkg/epp/backend/metrics/types.go index cdbdb2cea..fd6001633 100644 --- a/pkg/epp/backend/metrics/types.go +++ b/pkg/epp/backend/metrics/types.go @@ -62,6 +62,7 @@ type PodMetrics interface { GetMetrics() *Metrics UpdatePod(*corev1.Pod) StopRefreshLoop() + String() string } type Pod struct { @@ -69,6 +70,13 @@ type Pod struct { Address string } +func (p *Pod) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("%+v", *p) +} + type Metrics struct { // ActiveModels is a set of models(including LoRA adapters) that are currently cached to GPU. ActiveModels map[string]int diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go index f83326eb6..8d2dd7154 100644 --- a/pkg/epp/backend/vllm/metrics.go +++ b/pkg/epp/backend/vllm/metrics.go @@ -61,8 +61,7 @@ func (p *PodMetricsClientImpl) FetchMetrics( existing *metrics.Metrics, port int32, ) (*metrics.Metrics, error) { - logger := log.FromContext(ctx) - loggerDefault := logger.V(logutil.DEFAULT) + logger := log.FromContext(ctx).V(logutil.TRACE) // Currently the metrics endpoint is hard-coded, which works with vLLM. // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config. @@ -70,12 +69,12 @@ func (p *PodMetricsClientImpl) FetchMetrics( req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { - loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url) + logger.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url) return nil, fmt.Errorf("failed to create request: %v", err) } resp, err := http.DefaultClient.Do(req) if err != nil { - loggerDefault.Error(err, "Failed to fetch metrics", "pod", pod.NamespacedName) + logger.Error(err, "Failed to fetch metrics", "pod", pod.NamespacedName) return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod.NamespacedName, err) } defer func() { @@ -83,7 +82,7 @@ func (p *PodMetricsClientImpl) FetchMetrics( }() if resp.StatusCode != http.StatusOK { - loggerDefault.Error(nil, "Unexpected status code returned", "pod", pod.NamespacedName, "statusCode", resp.StatusCode) + logger.Error(nil, "Unexpected status code returned", "pod", pod.NamespacedName, "statusCode", resp.StatusCode) return nil, fmt.Errorf("unexpected status code from %s: %v", pod.NamespacedName, resp.StatusCode) } @@ -172,7 +171,7 @@ func promToPodMetrics( func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) { loraRequests, ok := metricFamilies[LoraRequestInfoMetricName] if !ok { - logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName) + logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName) return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName) } @@ -219,7 +218,7 @@ func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.Metr func getLatestMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, error) { mf, ok := metricFamilies[metricName] if !ok { - logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", metricName) + logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", metricName) return nil, fmt.Errorf("metric family %q not found", metricName) } if len(mf.GetMetric()) == 0 {