Commit f704350

fix: add the stability level to the help message of the metric
1 parent 23981e1 commit f704350

16 files changed: +47 -75 lines changed

pkg/bbr/metrics/metrics.go

Lines changed: 5 additions & 3 deletions
@@ -17,9 +17,11 @@ limitations under the License.
 package metrics
 
 import (
+	"fmt"
 	"sync"
 
 	"github.com/prometheus/client_golang/prometheus"
+	compbasemetrics "k8s.io/component-base/metrics"
 	"sigs.k8s.io/controller-runtime/pkg/metrics"
 )
 
@@ -30,23 +32,23 @@ var (
 		prometheus.CounterOpts{
 			Subsystem: component,
 			Name:      "success_total",
-			Help:      "Count of successes pulling model name from body and injecting it in the request headers.",
+			Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "Count of successes pulling model name from body and injecting it in the request headers."),
 		},
 		[]string{},
 	)
 	modelNotInBodyCounter = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
 			Subsystem: component,
 			Name:      "model_not_in_body_total",
-			Help:      "Count of times the model was not present in the request body.",
+			Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "Count of times the model was not present in the request body."),
 		},
 		[]string{},
 	)
 	modelNotParsedCounter = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
 			Subsystem: component,
 			Name:      "model_not_parsed_total",
-			Help:      "Count of times the model was in the request body but we could not parse it.",
+			Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "Count of times the model was in the request body but we could not parse it."),
 		},
 		[]string{},
 	)
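
The help text now carries the stability level inline: compbasemetrics.ALPHA from k8s.io/component-base/metrics is the StabilityLevel string constant "ALPHA", so the %v verb prints it verbatim and the rendered help starts with "[ALPHA]". A minimal standalone sketch (not part of the repo) showing what the formatted help evaluates to:

package main

import (
	"fmt"

	compbasemetrics "k8s.io/component-base/metrics"
)

func main() {
	// compbasemetrics.ALPHA is the StabilityLevel string "ALPHA", so %v prints it verbatim.
	help := fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA,
		"Count of successes pulling model name from body and injecting it in the request headers.")
	fmt.Println(help)
	// Output: [ALPHA] Count of successes pulling model name from body and injecting it in the request headers.
}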

pkg/epp/metrics/metrics.go

Lines changed: 20 additions & 50 deletions
@@ -18,12 +18,15 @@ package metrics
 
 import (
 	"context"
+	"fmt"
 	"sync"
 	"time"
 
 	"github.com/prometheus/client_golang/prometheus"
+	compbasemetrics "k8s.io/component-base/metrics"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/controller-runtime/pkg/metrics"
+
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
@@ -47,7 +50,7 @@ var (
 		prometheus.CounterOpts{
 			Subsystem: InferenceModelComponent,
 			Name:      "request_total",
-			Help:      "Counter of inference model requests broken out for each model and target model.",
+			Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "Counter of inference model requests broken out for each model and target model."),
 		},
 		[]string{"model_name", "target_model_name"},
 	)
@@ -56,7 +59,7 @@ var (
 		prometheus.CounterOpts{
 			Subsystem: InferenceModelComponent,
 			Name:      "request_error_total",
-			Help:      "Counter of inference model requests errors broken out for each model and target model.",
+			Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "Counter of inference model requests errors broken out for each model and target model."),
 		},
 		[]string{"model_name", "target_model_name", "error_code"},
 	)
@@ -65,7 +68,7 @@ var (
 		prometheus.HistogramOpts{
 			Subsystem: InferenceModelComponent,
 			Name:      "request_duration_seconds",
-			Help:      "Inference model response latency distribution in seconds for each model and target model.",
+			Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "Inference model response latency distribution in seconds for each model and target model."),
 			Buckets: []float64{
 				0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3,
 				4, 5, 6, 8, 10, 15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600,
@@ -78,7 +81,7 @@ var (
 		prometheus.HistogramOpts{
 			Subsystem: InferenceModelComponent,
 			Name:      "request_sizes",
-			Help:      "Inference model requests size distribution in bytes for each model and target model.",
+			Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "Inference model requests size distribution in bytes for each model and target model."),
 			// Use buckets ranging from 1000 bytes (1KB) to 10^9 bytes (1GB).
 			Buckets: []float64{
 				64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, // More fine-grained up to 64KB
@@ -93,7 +96,7 @@ var (
 		prometheus.HistogramOpts{
 			Subsystem: InferenceModelComponent,
 			Name:      "response_sizes",
-			Help:      "Inference model responses size distribution in bytes for each model and target model.",
+			Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "Inference model responses size distribution in bytes for each model and target model."),
 			// Most models have a response token < 8192 tokens. Each token, in average, has 4 characters.
 			// 8192 * 4 = 32768.
 			Buckets: []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32778, 65536},
@@ -105,7 +108,7 @@ var (
 		prometheus.HistogramOpts{
 			Subsystem: InferenceModelComponent,
 			Name:      "input_tokens",
-			Help:      "Inference model input token count distribution for requests in each model.",
+			Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "Inference model input token count distribution for requests in each model."),
 			// Most models have a input context window less than 1 million tokens.
 			Buckets: []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32778, 65536, 131072, 262144, 524288, 1048576},
 		},
@@ -116,7 +119,7 @@ var (
 		prometheus.HistogramOpts{
 			Subsystem: InferenceModelComponent,
 			Name:      "output_tokens",
-			Help:      "Inference model output token count distribution for requests in each model.",
+			Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "Inference model output token count distribution for requests in each model."),
 			// Most models generates output less than 8192 tokens.
 			Buckets: []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192},
 		},
@@ -127,7 +130,7 @@ var (
 		prometheus.GaugeOpts{
 			Subsystem: InferenceModelComponent,
 			Name:      "running_requests",
-			Help:      "Inference model number of running requests in each model.",
+			Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "Inference model number of running requests in each model."),
 		},
 		[]string{"model_name"},
 	)
@@ -137,7 +140,7 @@ var (
 		prometheus.HistogramOpts{
 			Subsystem: InferenceModelComponent,
 			Name:      "normalized_time_per_output_token_seconds",
-			Help:      "Inference model latency divided by number of output tokens in seconds for each model and target model.",
+			Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "Inference model latency divided by number of output tokens in seconds for each model and target model."),
 			// From few milliseconds per token to multiple seconds per token
 			Buckets: []float64{
 				0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0,
@@ -151,7 +154,7 @@ var (
 		prometheus.GaugeOpts{
 			Subsystem: InferencePoolComponent,
 			Name:      "average_kv_cache_utilization",
-			Help:      "The average kv cache utilization for an inference server pool.",
+			Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "The average kv cache utilization for an inference server pool."),
 		},
 		[]string{"name"},
 	)
@@ -160,7 +163,7 @@ var (
 		prometheus.GaugeOpts{
 			Subsystem: InferencePoolComponent,
 			Name:      "average_queue_size",
-			Help:      "The average number of requests pending in the model server queue.",
+			Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "The average number of requests pending in the model server queue."),
 		},
 		[]string{"name"},
 	)
@@ -169,7 +172,7 @@ var (
 		prometheus.GaugeOpts{
 			Subsystem: InferencePoolComponent,
 			Name:      "ready_pods",
-			Help:      "The number of ready pods in the inference server pool.",
+			Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "The number of ready pods in the inference server pool."),
 		},
 		[]string{"name"},
 	)
@@ -179,7 +182,7 @@ var (
 		prometheus.HistogramOpts{
 			Subsystem: InferenceExtension,
 			Name:      "scheduler_e2e_duration_seconds",
-			Help:      "End-to-end scheduling latency distribution in seconds.",
+			Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "End-to-end scheduling latency distribution in seconds."),
 			Buckets: []float64{
 				0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1,
 			},
@@ -191,48 +194,14 @@ var (
 		prometheus.HistogramOpts{
 			Subsystem: InferenceExtension,
 			Name:      "scheduler_plugin_duration_seconds",
-			Help:      "Scheduler plugin processing latency distribution in seconds for each plugin type and plugin name.",
+			Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "Scheduler plugin processing latency distribution in seconds for each plugin type and plugin name."),
 			Buckets: []float64{
 				0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1,
 			},
 		},
 		[]string{"plugin_type", "plugin_name"},
 	)
 
-	// Prefix indexer Metrics
-	PrefixCacheSize = prometheus.NewGaugeVec(
-		prometheus.GaugeOpts{
-			Subsystem: InferenceExtension,
-			Name:      "prefix_indexer_size",
-			Help:      "Size of the prefix indexer.",
-			StabilityLevel: prometheus.ALPHA,
-		},
-		[]string{},
-	)
-
-	PrefixCacheHitRatio = prometheus.NewHistogramVec(
-		prometheus.HistogramOpts{
-			Subsystem: InferenceExtension,
-			Name:      "prefix_indexer_hit_ratio",
-			Help:      "Ratio of prefix length matched to total prefix length in the cache lookup.",
-			// Buckets from 0.0 to 1.0 in increments
-			Buckets: []float64{0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0},
-			// StabilityLevel: prometheus.ALPHA,
-		},
-		[]string{},
-	)
-
-	PrefixCacheHitLength = prometheus.NewHistogramVec(
-		prometheus.HistogramOpts{
-			Subsystem: InferenceExtension,
-			Name:      "prefix_indexer_hit_bytes",
-			Help:      "Length of the prefix match in number of bytes in the cache lookup.",
-			Buckets: []float64{0, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536},
-			// StabilityLevel: prometheus.ALPHA,
-		},
-		[]string{},
-	)
-
 	// Prefix indexer Metrics
 	PrefixCacheSize = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
@@ -272,8 +241,7 @@ var (
 		prometheus.GaugeOpts{
 			Subsystem: InferenceExtension,
 			Name:      "info",
-			Help:      "General information of the current build of Inference Extension.",
-			// StabilityLevel: prometheus.ALPHA,
+			Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "General information of the current build of Inference Extension."),
 		},
 		[]string{"commit", "build_ref"},
 	)
@@ -323,6 +291,8 @@ func Reset() {
 	inferencePoolAvgQueueSize.Reset()
 	inferencePoolReadyPods.Reset()
 	SchedulerPluginProcessingLatencies.Reset()
+	SchedulerE2ELatency.Reset()
+	InferenceExtensionInfo.Reset()
 }
 
 // RecordRequstCounter records the number of requests.
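
Because the golden files below pin the text exposition output, every HELP line has to carry the new prefix as well. Below is a hedged sketch of the kind of check such files support, using client_golang's testutil; the demo counter and expected string are illustrative and not taken from the repo's test suite:

package main

import (
	"fmt"
	"strings"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
	compbasemetrics "k8s.io/component-base/metrics"
)

func main() {
	// A throwaway counter built with the same help-formatting pattern as the commit.
	demo := prometheus.NewCounter(prometheus.CounterOpts{
		Subsystem: "inference_model",
		Name:      "request_total_demo",
		Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "Counter of inference model requests."),
	})
	demo.Inc()

	// Expected exposition text, in the same format as the testdata files below.
	expected := strings.NewReader(`# HELP inference_model_request_total_demo [ALPHA] Counter of inference model requests.
# TYPE inference_model_request_total_demo counter
inference_model_request_total_demo 1
`)
	if err := testutil.CollectAndCompare(demo, expected); err != nil {
		fmt.Println("mismatch:", err)
		return
	}
	fmt.Println("HELP line carries the [ALPHA] prefix")
}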

pkg/epp/metrics/testdata/input_tokens_metric

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-# HELP inference_model_input_tokens Inference model input token count distribution for requests in each model.
+# HELP inference_model_input_tokens [ALPHA] Inference model input token count distribution for requests in each model.
 # TYPE inference_model_input_tokens histogram
 inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="1"} 0
 inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="8"} 0
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
-# HELP inference_pool_average_kv_cache_utilization The average kv cache utilization for an inference server pool.
+# HELP inference_pool_average_kv_cache_utilization [ALPHA] The average kv cache utilization for an inference server pool.
 # TYPE inference_pool_average_kv_cache_utilization gauge
 inference_pool_average_kv_cache_utilization{name="p1"} 0.3

pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-# HELP inference_model_normalized_time_per_output_token_seconds Inference model latency divided by number of output tokens in seconds for each model and target model.
+# HELP inference_model_normalized_time_per_output_token_seconds [ALPHA] Inference model latency divided by number of output tokens in seconds for each model and target model.
 # TYPE inference_model_normalized_time_per_output_token_seconds histogram
 inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.001"} 0
 inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.002"} 0

pkg/epp/metrics/testdata/output_tokens_metric

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-# HELP inference_model_output_tokens Inference model output token count distribution for requests in each model.
+# HELP inference_model_output_tokens [ALPHA] Inference model output token count distribution for requests in each model.
 # TYPE inference_model_output_tokens histogram
 inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="1"} 0
 inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="8"} 0
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
-# HELP inference_pool_average_queue_size The average number of requests pending in the model server queue.
+# HELP inference_pool_average_queue_size [ALPHA] The average number of requests pending in the model server queue.
 # TYPE inference_pool_average_queue_size gauge
 inference_pool_average_queue_size{name="p1"} 0.4

pkg/epp/metrics/testdata/request_duration_seconds_metric

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-# HELP inference_model_request_duration_seconds Inference model response latency distribution in seconds for each model and target model.
+# HELP inference_model_request_duration_seconds [ALPHA] Inference model response latency distribution in seconds for each model and target model.
 # TYPE inference_model_request_duration_seconds histogram
 inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0
 inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.025"} 1

pkg/epp/metrics/testdata/request_error_total_metric

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-# HELP inference_model_request_error_total Counter of inference model requests errors broken out for each model and target model.
+# HELP inference_model_request_error_total [ALPHA] Counter of inference model requests errors broken out for each model and target model.
 # TYPE inference_model_request_error_total counter
 inference_model_request_error_total{error_code="Internal", model_name="m10",target_model_name="t10"} 2
 inference_model_request_error_total{error_code="ModelServerError", model_name="m10",target_model_name="t11"} 1

pkg/epp/metrics/testdata/request_sizes_metric

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-# HELP inference_model_request_sizes Inference model requests size distribution in bytes for each model and target model.
+# HELP inference_model_request_sizes [ALPHA] Inference model requests size distribution in bytes for each model and target model.
 # TYPE inference_model_request_sizes histogram
 inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="64"} 0
 inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="128"} 0

pkg/epp/metrics/testdata/request_total_metric

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-# HELP inference_model_request_total Counter of inference model requests broken out for each model and target model.
+# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 # TYPE inference_model_request_total counter
 inference_model_request_total{model_name="m10", target_model_name="t10"} 2
 inference_model_request_total{model_name="m10", target_model_name="t11"} 1

pkg/epp/metrics/testdata/response_sizes_metric

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-# HELP inference_model_response_sizes Inference model responses size distribution in bytes for each model and target model.
+# HELP inference_model_response_sizes [ALPHA] Inference model responses size distribution in bytes for each model and target model.
 # TYPE inference_model_response_sizes histogram
 inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="1"} 0
 inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="8"} 0
Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-# HELP inference_model_running_requests Inference model number of running requests in each model.
+# HELP inference_model_running_requests [ALPHA] Inference model number of running requests in each model.
 # TYPE inference_model_running_requests gauge
 inference_model_running_requests{model_name="m1"} 1
 inference_model_running_requests{model_name="m2"} 1

pkg/epp/metrics/testdata/scheduler_e2e_duration_seconds_metric

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-# HELP inference_extension_scheduler_e2e_duration_seconds End-to-end scheduling latency distribution in seconds.
+# HELP inference_extension_scheduler_e2e_duration_seconds [ALPHA] End-to-end scheduling latency distribution in seconds.
 # TYPE inference_extension_scheduler_e2e_duration_seconds histogram
 inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.0001"} 0
 inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.0002"} 1

pkg/epp/metrics/testdata/scheduler_plugin_processing_latencies_metric

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-# HELP inference_extension_scheduler_plugin_duration_seconds Scheduler plugin processing latency distribution in seconds for each plugin type and plugin name.
+# HELP inference_extension_scheduler_plugin_duration_seconds [ALPHA] Scheduler plugin processing latency distribution in seconds for each plugin type and plugin name.
 # TYPE inference_extension_scheduler_plugin_duration_seconds histogram
 inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginA",plugin_type="PreSchedule",le="0.0001"} 0
 inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginA",plugin_type="PreSchedule",le="0.0002"} 0
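
Since the prefix lives in the help metadata, it also shows up on every scrape, matching the golden files above. A small standalone sketch (names are illustrative, not from the repo) that registers a gauge with the same pattern and prints the HELP text a scraper would see:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	compbasemetrics "k8s.io/component-base/metrics"
)

func main() {
	reg := prometheus.NewRegistry()

	readyPods := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Subsystem: "inference_pool",
		Name:      "ready_pods_demo",
		Help:      fmt.Sprintf("[%v] %v", compbasemetrics.ALPHA, "The number of ready pods in the inference server pool."),
	}, []string{"name"})
	reg.MustRegister(readyPods)
	readyPods.WithLabelValues("p1").Set(3)

	// Gather returns metric families as they would be scraped; the stability
	// prefix is part of the HELP metadata.
	families, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range families {
		fmt.Printf("%s -> %s\n", mf.GetName(), mf.GetHelp())
	}
	// Prints: inference_pool_ready_pods_demo -> [ALPHA] The number of ready pods in the inference server pool.
}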
