Skip to content

Commit 6d1bd1b

Browse files
authored
Add ResourceMonitor module in Cortex, and add ResourceBasedLimiter in Ingesters and StoreGateways (#6674)
* Add resource based throttling to ingesters and store gateways Signed-off-by: Justin Jung <[email protected]> * doc Signed-off-by: Justin Jung <[email protected]> * Add automaxprocs Signed-off-by: Justin Jung <[email protected]> * nit Signed-off-by: Justin Jung <[email protected]> * Add test for monitor Signed-off-by: Justin Jung <[email protected]> * fix tests Signed-off-by: Justin Jung <[email protected]> * changelog Signed-off-by: Justin Jung <[email protected]> * fix test Signed-off-by: Justin Jung <[email protected]> * remove interface Signed-off-by: Justin Jung <[email protected]> * address comments Signed-off-by: Justin Jung <[email protected]> * rename doc Signed-off-by: Justin Jung <[email protected]> * Make monitor more generic + separate scanners Signed-off-by: Justin Jung <[email protected]> * fix tests Signed-off-by: Justin Jung <[email protected]> * fix more tests Signed-off-by: Justin Jung <[email protected]> * remove monitor_test.go Signed-off-by: Justin Jung <[email protected]> * move noop scanner to darwin scanner Signed-off-by: Justin Jung <[email protected]> * doc update Signed-off-by: Justin Jung <[email protected]> * doc Signed-off-by: Justin Jung <[email protected]> * lint Signed-off-by: Justin Jung <[email protected]> * add debugging log on unsupported resource type Signed-off-by: Justin Jung <[email protected]> * test Signed-off-by: Justin Jung <[email protected]> * add more error handling + resource_based_limiter_limit metric Signed-off-by: Justin Jung <[email protected]> * fix test Signed-off-by: Justin Jung <[email protected]> * fix test Signed-off-by: Justin Jung <[email protected]> * update changelog Signed-off-by: Justin Jung <[email protected]> * Move noopScanner to scanner.go and fix RegisterFlagsWithPrefix Signed-off-by: Justin Jung <[email protected]> * Add limit breached metric + wrap error with 429 Signed-off-by: Justin Jung <[email protected]> * Add more validation and test on instance_limits Signed-off-by: Justin Jung <[email 
protected]> * Added _total to counter metric Signed-off-by: Justin Jung <[email protected]> --------- Signed-off-by: Justin Jung <[email protected]>
1 parent 22cd00c commit 6d1bd1b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+2418
-49
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* [FEATURE] Query Frontend: Add dynamic interval size for query splitting. This is enabled by configuring experimental flags `querier.max-shards-per-query` and/or `querier.max-fetched-data-duration-per-query`. The split interval size is dynamically increased to maintain a number of shards and total duration fetched below the configured values. #6458
66
* [FEATURE] Querier/Ruler: Add `query_partial_data` and `rules_partial_data` limits to allow queries/rules to be evaluated with data from a single zone, if other zones are not available. #6526
77
* [FEATURE] Update prometheus alertmanager version to v0.28.0 and add new integration msteamsv2, jira, and rocketchat. #6590
8+
* [FEATURE] Ingester/StoreGateway: Add `ResourceMonitor` module in Cortex, and add `ResourceBasedLimiter` in Ingesters and StoreGateways. #6674
89
* [FEATURE] Ingester: Support out-of-order native histogram ingestion. It is automatically enabled when `-ingester.out-of-order-time-window > 0` and `-blocks-storage.tsdb.enable-native-histograms=true`. #6626 #6663
910
* [FEATURE] Ruler: Add support for percentage based sharding for rulers. #6680
1011
* [ENHANCEMENT] Querier: Support query parameters to metadata api (/api/v1/metadata) to allow user to limit metadata to return. #6681

cmd/cortex/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/prometheus/client_golang/prometheus"
1919
collectorversion "github.com/prometheus/client_golang/prometheus/collectors/version"
2020
"github.com/prometheus/common/version"
21+
_ "go.uber.org/automaxprocs"
2122
"gopkg.in/yaml.v2"
2223

2324
"github.com/cortexproject/cortex/pkg/cortex"

docs/blocks-storage/store-gateway.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,21 @@ store_gateway:
349349
# CLI flag: -store-gateway.disabled-tenants
350350
[disabled_tenants: <string> | default = ""]
351351

352+
instance_limits:
353+
# EXPERIMENTAL: Max CPU utilization that this store gateway can reach before
354+
# rejecting new query request (across all tenants) in percentage, between 0
355+
# and 1. monitored_resources config must include the resource type. 0 to
356+
# disable.
357+
# CLI flag: -store-gateway.instance-limits.cpu-utilization
358+
[cpu_utilization: <float> | default = 0]
359+
360+
# EXPERIMENTAL: Max heap utilization that this store gateway can reach before
361+
# rejecting new query request (across all tenants) in percentage, between 0
362+
# and 1. monitored_resources config must include the resource type. 0 to
363+
# disable.
364+
# CLI flag: -store-gateway.instance-limits.heap-utilization
365+
[heap_utilization: <float> | default = 0]
366+
352367
hedged_request:
353368
# If true, hedged requests are applied to object store calls. It can help
354369
# with reducing tail latency.

docs/configuration/config-file-reference.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,12 @@ Where default_value is the value to use if the environment variable is undefined
6868
# CLI flag: -http.prefix
6969
[http_prefix: <string> | default = "/api/prom"]
7070

71+
# Comma-separated list of resources to monitor. Supported values are cpu and
72+
# heap, which tracks metrics from github.com/prometheus/procfs and
73+
# runtime/metrics that are close estimates. Empty string to disable.
74+
# CLI flag: -monitored.resources
75+
[monitored_resources: <string> | default = ""]
76+
7177
api:
7278
# Use GZIP compression for API responses. Some endpoints serve large YAML or
7379
# JSON blobs which can benefit from compression.
@@ -3197,6 +3203,20 @@ lifecycler:
31973203
[upload_compacted_blocks_enabled: <boolean> | default = true]
31983204
31993205
instance_limits:
3206+
# EXPERIMENTAL: Max CPU utilization that this ingester can reach before
3207+
# rejecting new query request (across all tenants) in percentage, between 0
3208+
# and 1. monitored_resources config must include the resource type. 0 to
3209+
# disable.
3210+
# CLI flag: -ingester.instance-limits.cpu-utilization
3211+
[cpu_utilization: <float> | default = 0]
3212+
3213+
# EXPERIMENTAL: Max heap utilization that this ingester can reach before
3214+
# rejecting new query request (across all tenants) in percentage, between 0
3215+
# and 1. monitored_resources config must include the resource type. 0 to
3216+
# disable.
3217+
# CLI flag: -ingester.instance-limits.heap-utilization
3218+
[heap_utilization: <float> | default = 0]
3219+
32003220
# Max ingestion rate (samples/sec) that ingester will accept. This limit is
32013221
# per-ingester, not per-tenant. Additional push requests will be rejected.
32023222
# Current ingestion rate is computed as exponentially weighted moving average,
@@ -5857,6 +5877,21 @@ sharding_ring:
58575877
# CLI flag: -store-gateway.disabled-tenants
58585878
[disabled_tenants: <string> | default = ""]
58595879
5880+
instance_limits:
5881+
# EXPERIMENTAL: Max CPU utilization that this store gateway can reach before
5882+
# rejecting new query request (across all tenants) in percentage, between 0
5883+
# and 1. monitored_resources config must include the resource type. 0 to
5884+
# disable.
5885+
# CLI flag: -store-gateway.instance-limits.cpu-utilization
5886+
[cpu_utilization: <float> | default = 0]
5887+
5888+
# EXPERIMENTAL: Max heap utilization that this store gateway can reach before
5889+
# rejecting new query request (across all tenants) in percentage, between 0
5890+
# and 1. monitored_resources config must include the resource type. 0 to
5891+
# disable.
5892+
# CLI flag: -store-gateway.instance-limits.heap-utilization
5893+
[heap_utilization: <float> | default = 0]
5894+
58605895
hedged_request:
58615896
# If true, hedged requests are applied to object store calls. It can help with
58625897
# reducing tail latency.

docs/configuration/v1-guarantees.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,3 +123,8 @@ Currently experimental features are:
123123
- Query-frontend: dynamic query splits
124124
- `querier.max-shards-per-query` (int) CLI flag
125125
- `querier.max-fetched-data-duration-per-query` (duration) CLI flag
126+
- Ingester/Store-Gateway: Resource-based throttling
127+
- `-ingester.instance-limits.cpu-utilization`
128+
- `-ingester.instance-limits.heap-utilization`
129+
- `-store-gateway.instance-limits.cpu-utilization`
130+
- `-store-gateway.instance-limits.heap-utilization`
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
---
2+
title: "Protecting Cortex from Heavy Queries"
3+
linkTitle: "Protecting Cortex from Heavy Queries"
4+
weight: 11
5+
slug: protecting-cortex-from-heavy-queries
6+
---
7+
8+
PromQL is powerful and can produce query requests that fetch a very wide range of data and process a very large number of samples. Heavy queries can cause:
9+
10+
1. CPU on any query component to be partially exhausted, increasing latency and causing incoming queries to queue up with high chance of time-out.
11+
2. CPU on any query component to be fully exhausted, causing GC to slow down, which leads to the pod running out of memory and being killed.
12+
3. Heap memory on any query component to be exhausted, leading to the pod running out of memory and being killed.
13+
14+
It's important to protect Cortex components by setting appropriate limits and throttling configurations based on your infrastructure and the data ingested by your customers.
15+
16+
## Static limits
17+
18+
There are a number of static limits that you can configure to block heavy queries from running.
19+
20+
### Max outstanding requests per tenant
21+
22+
See https://cortexmetrics.io/docs/configuration/configuration-file/#query_frontend_config:~:text=max_outstanding_requests_per_tenant for details.
23+
24+
### Max data bytes fetched per (sharded) query
25+
26+
See https://cortexmetrics.io/docs/configuration/configuration-file/#query_frontend_config:~:text=max_fetched_data_bytes_per_query for details.
27+
28+
### Max series fetched per (sharded) query
29+
30+
See https://cortexmetrics.io/docs/configuration/configuration-file/#query_frontend_config:~:text=max_fetched_series_per_query for details.
31+
32+
### Max chunks fetched per (sharded) query
33+
34+
See https://cortexmetrics.io/docs/configuration/configuration-file/#query_frontend_config:~:text=max_fetched_chunk_bytes_per_query for details.
35+
36+
### Max samples fetched per (sharded) query
37+
38+
See https://cortexmetrics.io/docs/configuration/configuration-file/#querier_config:~:text=max_samples for details.
39+
40+
## Resource-based throttling (Experimental)
41+
42+
Although the static limits are able to protect Cortex components from specific query patterns, they are not generic enough to cover different combinations of bad query patterns. For example, what if the query fetches relatively large postings, series and chunks that are slightly below the individual limits? For a more generic solution, you can enable resource-based throttling by setting CPU and heap utilization thresholds.
43+
44+
Currently, it only throttles incoming query requests with error code 429 (too many requests) when the resource usage breaches the configured thresholds.
45+
46+
For example, the following configuration will start throttling query requests if either CPU or heap utilization is above 80%, leaving 20% of room for inflight requests.
47+
48+
```
49+
target: ingester
50+
monitored_resources: cpu,heap
51+
instance_limits:
52+
cpu_utilization: 0.8
53+
heap_utilization: 0.8
54+
```
55+
56+
See https://cortexmetrics.io/docs/configuration/configuration-file/:~:text=instance_limits for details.

go.mod

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,11 @@ require (
8181
github.com/google/go-cmp v0.7.0
8282
github.com/hashicorp/golang-lru/v2 v2.0.7
8383
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822
84+
github.com/prometheus/procfs v0.15.1
8485
github.com/sercand/kuberesolver/v5 v5.1.1
8586
github.com/tjhop/slog-gokit v0.1.3
8687
go.opentelemetry.io/collector/pdata v1.24.0
88+
go.uber.org/automaxprocs v1.6.0
8789
google.golang.org/protobuf v1.36.4
8890
)
8991

@@ -199,7 +201,6 @@ require (
199201
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
200202
github.com/prometheus-community/prom-label-proxy v0.8.1-0.20240127162815-c1195f9aabc0 // indirect
201203
github.com/prometheus/exporter-toolkit v0.13.2 // indirect
202-
github.com/prometheus/procfs v0.15.1 // indirect
203204
github.com/prometheus/sigv4 v0.1.1 // indirect
204205
github.com/redis/rueidis v1.0.45-alpha.1 // indirect
205206
github.com/rs/cors v1.11.1 // indirect

go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1560,6 +1560,8 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI
15601560
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
15611561
github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndrE9hABlRI=
15621562
github.com/posener/complete v1.2.3/go.mod h1:WZIdtGGp+qx0sLrYKtIRAruyNpv6hFCicSgv7Sy7s/s=
1563+
github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g=
1564+
github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U=
15631565
github.com/prometheus-community/prom-label-proxy v0.8.1-0.20240127162815-c1195f9aabc0 h1:owfYHh79h8Y5HvNMGyww+DaVwo10CKiRW1RQrrZzIwg=
15641566
github.com/prometheus-community/prom-label-proxy v0.8.1-0.20240127162815-c1195f9aabc0/go.mod h1:rT989D4UtOcfd9tVqIZRVIM8rkg+9XbreBjFNEKXvVI=
15651567
github.com/prometheus/alertmanager v0.28.1 h1:BK5pCoAtaKg01BYRUJhEDV1tqJMEtYBGzPw8QdvnnvA=
@@ -1811,6 +1813,8 @@ go.opentelemetry.io/proto/otlp v1.5.0/go.mod h1:keN8WnHxOy8PG0rQZjJJ5A2ebUoafqWp
18111813
go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
18121814
go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE=
18131815
go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0=
1816+
go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs=
1817+
go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8=
18141818
go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A=
18151819
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
18161820
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=

pkg/configs/instance_limits.go

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
package configs
2+
3+
import (
4+
"errors"
5+
"flag"
6+
"strings"
7+
8+
"github.com/cortexproject/cortex/pkg/util/flagext"
9+
"github.com/cortexproject/cortex/pkg/util/resource"
10+
)
11+
12+
type InstanceLimits struct {
13+
CPUUtilization float64 `yaml:"cpu_utilization"`
14+
HeapUtilization float64 `yaml:"heap_utilization"`
15+
}
16+
17+
func (cfg *InstanceLimits) RegisterFlagsWithPrefix(f *flag.FlagSet, prefix string) {
18+
f.Float64Var(&cfg.CPUUtilization, prefix+"instance-limits.cpu-utilization", 0, "EXPERIMENTAL: Max CPU utilization that this ingester can reach before rejecting new query request (across all tenants) in percentage, between 0 and 1. monitored_resources config must include the resource type. 0 to disable.")
19+
f.Float64Var(&cfg.HeapUtilization, prefix+"instance-limits.heap-utilization", 0, "EXPERIMENTAL: Max heap utilization that this ingester can reach before rejecting new query request (across all tenants) in percentage, between 0 and 1. monitored_resources config must include the resource type. 0 to disable.")
20+
}
21+
22+
func (cfg *InstanceLimits) Validate(monitoredResources flagext.StringSliceCSV) error {
23+
if cfg.CPUUtilization > 1 || cfg.CPUUtilization < 0 {
24+
return errors.New("cpu_utilization must be between 0 and 1")
25+
}
26+
27+
if cfg.CPUUtilization > 0 && !strings.Contains(monitoredResources.String(), string(resource.CPU)) {
28+
return errors.New("monitored_resources config must include \"cpu\" as well")
29+
}
30+
31+
if cfg.HeapUtilization > 1 || cfg.HeapUtilization < 0 {
32+
return errors.New("heap_utilization must be between 0 and 1")
33+
}
34+
35+
if cfg.HeapUtilization > 0 && !strings.Contains(monitoredResources.String(), string(resource.Heap)) {
36+
return errors.New("monitored_resources config must include \"heap\" as well")
37+
}
38+
39+
return nil
40+
}

pkg/configs/instance_limits_test.go

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
package configs
2+
3+
import (
4+
"errors"
5+
"testing"
6+
7+
"github.com/stretchr/testify/require"
8+
)
9+
10+
func Test_Validate(t *testing.T) {
11+
for name, tc := range map[string]struct {
12+
instanceLimits InstanceLimits
13+
monitoredResources []string
14+
err error
15+
}{
16+
"correct config should pass validation": {
17+
instanceLimits: InstanceLimits{
18+
CPUUtilization: 0.5,
19+
HeapUtilization: 0.5,
20+
},
21+
monitoredResources: []string{"cpu", "heap"},
22+
err: nil,
23+
},
24+
"utilization config less than 0 should fail validation": {
25+
instanceLimits: InstanceLimits{
26+
CPUUtilization: -0.5,
27+
HeapUtilization: 0.5,
28+
},
29+
monitoredResources: []string{"cpu", "heap"},
30+
err: errors.New("cpu_utilization must be between 0 and 1"),
31+
},
32+
"utilization config greater than 1 should fail validation": {
33+
instanceLimits: InstanceLimits{
34+
CPUUtilization: 0.5,
35+
HeapUtilization: 1.5,
36+
},
37+
monitoredResources: []string{"cpu", "heap"},
38+
err: errors.New("heap_utilization must be between 0 and 1"),
39+
},
40+
"missing cpu in monitored_resources config should fail validation": {
41+
instanceLimits: InstanceLimits{
42+
CPUUtilization: 0.5,
43+
},
44+
monitoredResources: []string{"heap"},
45+
err: errors.New("monitored_resources config must include \"cpu\" as well"),
46+
},
47+
"missing heap in monitored_resources config should fail validation": {
48+
instanceLimits: InstanceLimits{
49+
HeapUtilization: 0.5,
50+
},
51+
monitoredResources: []string{"cpu"},
52+
err: errors.New("monitored_resources config must include \"heap\" as well"),
53+
},
54+
} {
55+
t.Run(name, func(t *testing.T) {
56+
err := tc.instanceLimits.Validate(tc.monitoredResources)
57+
if tc.err != nil {
58+
require.Errorf(t, err, tc.err.Error())
59+
} else {
60+
require.NoError(t, err)
61+
}
62+
})
63+
}
64+
}

pkg/cortex/cortex.go

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"gopkg.in/yaml.v2"
2323

2424
"github.com/cortexproject/cortex/pkg/util/grpcclient"
25+
"github.com/cortexproject/cortex/pkg/util/resource"
2526

2627
"github.com/cortexproject/cortex/pkg/alertmanager"
2728
"github.com/cortexproject/cortex/pkg/alertmanager/alertstore"
@@ -88,10 +89,11 @@ var (
8889

8990
// Config is the root config for Cortex.
9091
type Config struct {
91-
Target flagext.StringSliceCSV `yaml:"target"`
92-
AuthEnabled bool `yaml:"auth_enabled"`
93-
PrintConfig bool `yaml:"-"`
94-
HTTPPrefix string `yaml:"http_prefix"`
92+
Target flagext.StringSliceCSV `yaml:"target"`
93+
AuthEnabled bool `yaml:"auth_enabled"`
94+
PrintConfig bool `yaml:"-"`
95+
HTTPPrefix string `yaml:"http_prefix"`
96+
MonitoredResources flagext.StringSliceCSV `yaml:"monitored_resources"`
9597

9698
ExternalQueryable prom_storage.Queryable `yaml:"-"`
9799
ExternalPusher ruler.Pusher `yaml:"-"`
@@ -143,6 +145,11 @@ func (c *Config) RegisterFlags(f *flag.FlagSet) {
143145
f.BoolVar(&c.PrintConfig, "print.config", false, "Print the config and exit.")
144146
f.StringVar(&c.HTTPPrefix, "http.prefix", "/api/prom", "HTTP path prefix for Cortex API.")
145147

148+
c.MonitoredResources = []string{}
149+
f.Var(&c.MonitoredResources, "monitored.resources", "Comma-separated list of resources to monitor. "+
150+
"Supported values are cpu and heap, which tracks metrics from github.com/prometheus/procfs and runtime/metrics "+
151+
"that are close estimates. Empty string to disable.")
152+
146153
c.API.RegisterFlags(f)
147154
c.registerServerFlagsWithChangedDefaultValues(f)
148155
c.Distributor.RegisterFlags(f)
@@ -216,7 +223,7 @@ func (c *Config) Validate(log log.Logger) error {
216223
if err := c.QueryRange.Validate(c.Querier); err != nil {
217224
return errors.Wrap(err, "invalid query_range config")
218225
}
219-
if err := c.StoreGateway.Validate(c.LimitsConfig); err != nil {
226+
if err := c.StoreGateway.Validate(c.LimitsConfig, c.MonitoredResources); err != nil {
220227
return errors.Wrap(err, "invalid store-gateway config")
221228
}
222229
if err := c.Compactor.Validate(c.LimitsConfig); err != nil {
@@ -229,14 +236,24 @@ func (c *Config) Validate(log log.Logger) error {
229236
return errors.Wrap(err, "invalid alertmanager config")
230237
}
231238

232-
if err := c.Ingester.Validate(); err != nil {
239+
if err := c.Ingester.Validate(c.MonitoredResources); err != nil {
233240
return errors.Wrap(err, "invalid ingester config")
234241
}
235242

236243
if err := c.Tracing.Validate(); err != nil {
237244
return errors.Wrap(err, "invalid tracing config")
238245
}
239246

247+
for _, r := range c.MonitoredResources {
248+
switch resource.Type(r) {
249+
case resource.CPU, resource.Heap:
250+
default:
251+
if len(r) > 0 {
252+
return fmt.Errorf("unsupported resource type to monitor: %s", r)
253+
}
254+
}
255+
}
256+
240257
return nil
241258
}
242259

@@ -315,6 +332,7 @@ type Cortex struct {
315332
MetadataQuerier querier.MetadataQuerier
316333
QuerierEngine promql.QueryEngine
317334
QueryFrontendTripperware tripperware.Tripperware
335+
ResourceMonitor *resource.Monitor
318336

319337
Ruler *ruler.Ruler
320338
RulerStorage rulestore.RuleStore

0 commit comments

Comments
 (0)