# coder-observability/values.yaml

global:
  coder:
    # global.coder.scrapeMetrics -- use this to scrape metrics from a standalone (set of) coder deployment(s).
    # If using Kubernetes, rather add the annotation "prometheus.io/scrape=true" to the coder pods so that they
    # get scraped automatically; in that case leave this value as null and configure coderdSelector to target
    # your coder pods.
    scrapeMetrics: null
#      hostname: localhost
#      port: 2112
#      scrapeInterval: 15s
#      additionalLabels:
#        job: coder
    # global.coder.coderdSelector -- series selector for Prometheus/Loki to locate coderd pods
    # (pods named `coder*`, excluding any provisioner pods).
    # ensure this uses backticks for quotes!
    coderdSelector: 'pod=~`coder.*`, pod!~`.*provisioner.*`'
    # global.coder.provisionerdSelector -- series selector for Prometheus/Loki to locate provisioner pods.
    # https://coder.com/docs/v2/latest/admin/provisioners
    # TODO: rename container label in provisioner helm chart to be "provisioner" not "coder"
    # ensure this uses backticks for quotes!
    provisionerdSelector: 'pod=~`coder-provisioner.*`'
    # global.coder.workspacesSelector -- series selector for Prometheus/Loki to locate workspaces.
    # ensure this uses backticks for quotes!
    workspacesSelector: 'namespace=`coder-workspaces`'
    # global.coder.controlPlaneNamespace -- the namespace into which the control plane has been deployed.
    controlPlaneNamespace: coder
    # global.coder.externalProvisionersNamespace -- the namespace into which any external provisioners have been deployed.
    externalProvisionersNamespace: coder
    # See https://coder.com/docs/v2/latest/cli/server#--log-human
    # "Human" format is the default, which is a combination of plaintext and logfmt, but it's quite tricky to
    # parse reliably with regex matchers.
    # TODO: support "json" format
    logFormat: human
    # global.coder.alerts -- alerts for the various aspects of Coder
    # Each group can be switched on/off via `enabled`.
    # NOTE(review): `delay`, `period` and `thresholds` are presumably consumed by the alert rule templates;
    # check those templates for their exact meaning before changing them.
    alerts:
      enterprise:
        groups:
          Licences:
            enabled: true
            delay: 1m
            thresholds:
              warning: 0.9
              critical: 1
      coderd:
        groups:
          CPU:
            enabled: true
            delay: 10m
            period: 10m
            thresholds:
              warning: 0.8
              critical: 0.9
          Memory:
            enabled: true
            delay: 10m
            thresholds:
              warning: 0.8
              critical: 0.9
          Restarts:
            enabled: true
            delay: 1m
            period: 10m
            thresholds:
              notify: 1
              warning: 2
              critical: 3
          Replicas:
            enabled: true
            delay: 5m
            thresholds:
              notify: 3 # 2/3 replicas are alive
              warning: 2 # 1/3 replicas are alive
              critical: 1 # 0/3 replicas are alive
          WorkspaceBuildFailures:
            enabled: true
            delay: 10m
            period: 10m
            thresholds:
              notify: 2
              warning: 5
              critical: 10
          IneligiblePrebuilds:
            enabled: true
            delay: 10m
            thresholds:
              notify: 1
      provisionerd:
        groups:
          Replicas:
            enabled: true
            delay: 5m
            thresholds:
              notify: 3 # 2/3 replicas are alive
              warning: 2 # 1/3 replicas are alive
              critical: 1 # 0/3 replicas are alive

  # DNS suffix appended to "<service>.<namespace>" when this file builds in-cluster service URLs
  # (e.g. the Grafana datasource URLs and the Loki ruler endpoints).
  zone: svc

  # URL scheme used together with externalZone (see below).
  externalScheme: http
  # The external hostname from which k8s services can be accessed, in the form of:
  # <externalScheme>://<svc>.<namespace>.<externalZone>
  # e.g.
  # http://dashboards.coder-observability.svc.cluster.local
  externalZone: svc.cluster.local

  # global.telemetry -- control telemetry collection
  telemetry:
    # global.telemetry.metrics -- control metric collection
    metrics:
      # global.telemetry.metrics.scrape_timeout -- how long a request will be allowed to wait before being canceled
      scrape_timeout: "12s"
      # global.telemetry.metrics.scrape_interval -- how often the collector will scrape discovered pods
      scrape_interval: "15s"

  # global.postgres -- postgres connection information
  # NOTE: these settings are global so we can parameterise some values which get rendered by subcharts
  postgres:
    hostname: localhost
    port: 5432
    database: coder
    username: coder
    # left unset by default; the Grafana postgres datasource in this file then falls back to `$PGPASSWORD`
    password: null
    sslmode: disable
    # ensure that your secret has a field named `PGPASSWORD`
    mountSecret: 'secret-postgres'
    exporter:
      image: 'quay.io/prometheuscommunity/postgres-exporter'

    # global.postgres.alerts -- alerts for postgres
    alerts:
      groups:
        Basic:
          enabled: true
          delay: 1m
        Notifications:
          enabled: true
          delay: 15m
          thresholds:
            critical: 0.9
            warning: 0.8
            notify: 0.5
        Connections:
          enabled: true
          delay: 5m
          thresholds:
            critical: 0.9
            warning: 0.8
            notify: 0.5

  # global.dashboards -- settings for bundled dashboards
  dashboards:
    # global.dashboards.timerange -- how far back dashboards should look
    timerange: 12h
    # global.dashboards.refresh -- how often dashboards should refresh
    refresh: 30s
    # global.dashboards.queryTimeout -- how long (in seconds) a Grafana query may run before it times out.
    # Keep this an integer: the datasource timeouts are derived from it by adding 5 seconds.
    queryTimeout: 900

# runbookViewer -- container image used for the runbook viewer
runbookViewer:
  image: 'dannyben/madness'

# sqlExporter -- container image used for the SQL exporter
sqlExporter:
  image: 'burningalchemist/sql_exporter'

# grafana-agent -- values for the Grafana Agent collector, run in flow mode as a DaemonSet
grafana-agent:
  enabled: true
  fullnameOverride: grafana-agent
  agent:
    mode: flow
    # The agent reads its config from the `config.river` key of the `collector-config` ConfigMap.
    # `create: false` — presumably that ConfigMap is rendered by this chart's own templates rather than by
    # the agent chart; verify against the templates.
    configMap:
      name: collector-config
      key: config.river
      create: false
    clustering:
      enabled: false
    extraArgs:
      - --disable-reporting=true
    # NOTE(review): presumably mounts host log directories into the agent so that container logs can be
    # tailed; confirm against the grafana-agent chart documentation.
    mounts:
      varlog: true
      dockercontainers: true
  controller:
    type: daemonset
    podAnnotations:
      prometheus.io/scrape: "true"
  crds:
    create: false

  withOTLPReceiver: false

  # Configuration blocks
  #
  # The values below are River snippets (note the `//` comments inside them are part of the snippet text,
  # not YAML comments).
  # NOTE(review): presumably these are spliced into the collector config by this chart's templates — verify.
  #
  # Enable debug logging (warning: produces large amount of logs!)
  #logging: |-
  #  logging {
  #    level  = "debug"
  #    format = "logfmt"
  #  }
  # Kubernetes service discovery for nodes and pods.
  discovery: |-
    // Discover k8s nodes
    discovery.kubernetes "nodes" {
      role = "node"
    }

    // Discover k8s pods
    discovery.kubernetes "pods" {
      role = "pod"
      selectors {
       role  = "pod"
      }
    }
  # Relabel rules: derive the `namespace`, `pod`, `job`, `container` and `node` labels from Kubernetes
  # metadata, drop noisy labels, drop targets for pods that are not running, and map
  # `prometheus.io/param_*` annotations to `__param_*` labels.
  commonRelabellings: |-
    rule {
      source_labels = ["__meta_kubernetes_namespace"]
      target_label  = "namespace"
    }
    rule {
      source_labels = ["__meta_kubernetes_pod_name"]
      target_label  = "pod"
    }
    // coalesce the following labels and pick the first value; we'll use this to define the "job" label
    rule {
      source_labels  = ["__meta_kubernetes_pod_label_app_kubernetes_io_component", "app", "__meta_kubernetes_pod_container_name"]
      separator      = "/"
      target_label   = "__meta_app"
      action         = "replace"
      regex          = "^/*([^/]+?)(?:/.*)?$" // split by the delimiter if it exists, we only want the first one
      replacement    = "${1}"
    }
    rule {
      source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_label_app_kubernetes_io_name", "__meta_app"]
      separator     = "/"
      target_label  = "job"
    }
    rule {
      source_labels = ["__meta_kubernetes_pod_container_name"]
      target_label  = "container"
    }
    rule {
      regex   = "__meta_kubernetes_pod_label_(statefulset_kubernetes_io_pod_name|controller_revision_hash)"
      action  = "labeldrop"
    }
    rule {
      regex   = "pod_template_generation"
      action  = "labeldrop"
    }
    rule {
      source_labels = ["__meta_kubernetes_pod_phase"]
      regex = "Pending|Succeeded|Failed|Completed"
      action = "drop"
    }
    rule {
      source_labels = ["__meta_kubernetes_pod_node_name"]
      action = "replace"
      target_label = "node"
    }
    rule {
      action = "labelmap"
      regex = "__meta_kubernetes_pod_annotation_prometheus_io_param_(.+)"
      replacement = "__param_$1"
    }
  # Additional River blocks; empty by default.
  extraBlocks: ""
    # Examples:
    # loki.source.file "tmpfiles" {
    #   targets    = [
    #     {__path__ = "/tmp/foo.txt", "color" = "pink"},
    #     {__path__ = "/tmp/bar.txt", "color" = "blue"},
    #     {__path__ = "/tmp/baz.txt", "color" = "grey"},
    #   ]
    #   forward_to = [loki.write.loki.receiver]
    # }
  # Additional relabel rules for pod metrics and pod logs respectively; empty by default.
  podMetricsRelabelRules: ""
  podLogsRelabelRules: ""

# grafana -- values for the bundled Grafana: a single replica run as a StatefulSet with persistent storage
grafana:
  enabled: true
  fullnameOverride: grafana
  useStatefulSet: true
  replicas: 1
  deploymentStrategy:
    type: Recreate  # avoid MultiAttachError for standard-rwo sc
  service:
    enabled: true
  persistence:
    enabled: true
    size: 10Gi
  testFramework:
    enabled: false
  annotations:
    # opt in to metric scraping by the collector
    # TODO: this adds annotations to _all_ resources; can we be more specific?
    prometheus.io/scrape: "true"
  # File-based dashboard providers; each one loads dashboards from `options.path` into its own folder.
  dashboardProviders:
    # third-party infrastructure dashboards
    infra.yaml:
      apiVersion: 1
      providers:
        - name: infra
          orgId: 1
          folder: Infrastructure
          type: file
          disableDeletion: false
          editable: false
          options:
            path: /var/lib/grafana/dashboards/infra
    # Coder dashboards
    coder.yaml:
      apiVersion: 1
      providers:
        - name: coder
          orgId: 1
          folder: Coder
          type: file
          updateIntervalSeconds: 5
          disableDeletion: false
          editable: false
          options:
            path: /var/lib/grafana/dashboards/coder
    # dashboards found under /tmp/dashboards
    sidecar.yaml:
      apiVersion: 1
      providers:
        - name: sidecar
          orgId: 1
          folder: Other
          type: file
          updateIntervalSeconds: 30
          disableDeletion: false
          editable: false
          options:
            path: /tmp/dashboards
  # Dashboards listed under the `infra` provider, each pinned to a `revision` and bound to the `metrics`
  # datasource.
  # NOTE(review): `gnetId` is presumably the grafana.com dashboard ID — confirm against the Grafana chart docs.
  dashboards:
    # TODO: import dashboards from coder/coder
    infra:
      node-exporter-full:
        gnetId: 1860
        revision: 36
        datasource: metrics
      postgres-database:
        gnetId: 9628
        revision: 7
        datasource: metrics
  # Provisioned datasources: Prometheus (`metrics`, the default), Loki (`logs`) and Postgres.
  # The quoted values are Go templates; they reference `.Release` and `.Values`, so they must be evaluated
  # before use.
  datasources:
    datasources.yaml:
      apiVersion: 1
      datasources:
        - name: metrics
          type: prometheus
          url: http://prometheus.{{ .Release.Namespace }}.{{ $.Values.global.zone }}
          access: proxy
          isDefault: true
          editable: false
          # add 5s on global timeout to distinguish between Grafana timeout & datasource timeout
          timeout: '{{ add $.Values.global.dashboards.queryTimeout 5 }}'
          uid: prometheus
        - name: logs
          type: loki
          url: http://loki-gateway.{{ .Release.Namespace }}.{{ $.Values.global.zone }}
          access: proxy
          isDefault: false
          editable: false
          # add 5s on global timeout to distinguish between Grafana timeout & datasource timeout
          timeout: '{{ add $.Values.global.dashboards.queryTimeout 5 }}'
          uid: loki
        # NOTE(review): global.postgres.database is not referenced by this datasource — confirm whether a
        # database should be set explicitly.
        - name: postgres
          type: postgres
          url: '{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}'
          user: '{{ .Values.global.postgres.username }}'
          secureJsonData:
            # use the explicit password when one is configured, otherwise fall back to `$PGPASSWORD`
            password: '{{ if .Values.global.postgres.password }}{{ .Values.global.postgres.password }}{{ else }}$PGPASSWORD{{ end }}'
          jsonData:
            sslmode: '{{ .Values.global.postgres.sslmode }}'
          isDefault: false
          editable: false
          # add 5s on global timeout to distinguish between Grafana timeout & datasource timeout
          timeout: '{{ add $.Values.global.dashboards.queryTimeout 5 }}'
          uid: postgres
  # No admin credentials secret is referenced, and creation of the initial admin user is disabled below.
  admin:
    existingSecret: ""
  env:
    # Environment variable values are strings; quote this so the YAML parser does not type it as a boolean
    # and the value does not depend on how the consuming template stringifies it.
    GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION: "true"
  # Settings for Grafana's main configuration file.
  grafana.ini:
    # NOTE(review): anonymous access is enabled with the Admin role in "Main Org.", i.e. anyone who can reach
    # Grafana can administer it. Confirm this is acceptable for your environment.
    auth.anonymous:
      enabled: true
      org_name: Main Org.
      org_role: Admin
    analytics:
      reporting_enabled: false
    users:
      allow_sign_up: false
    feature_toggles:
      # migrate Angular panels to React
      # see https://grafana.com/docs/grafana/latest/developers/angular_deprecation/angular-plugins/#automatic-migration-of-plugins
      autoMigrateOldPanels: true
    dashboards:
      # mounted configmap will be synced with sidecar
      default_home_dashboard_path: /var/lib/grafana/dashboards/coder/0/status.json
    dataproxy:
      # same value as global.dashboards.queryTimeout; the datasources add 5s on top of it
      timeout: '{{ $.Values.global.dashboards.queryTimeout }}'
  # Dashboard sidecar settings; the sidecar is switched off by default.
  sidecar:
    dashboards:
      enabled: false
      labelValue: '1'
      provider:
        allowUiUpdates: true
        disableDelete: true
  # One ConfigMap per dashboard group, each mounted into its own numbered sub-directory of the `coder`
  # dashboard provider path. Directory `0` holds the status dashboards, which include the default home
  # dashboard (`status.json`).
  extraConfigmapMounts:
    # we can't combine configmaps because of the 1MiB size limit, but Grafana will scan
    # the /var/lib/grafana/dashboards/coder directory deeply to find dashboards
    - name: dashboards-status
      mountPath: /var/lib/grafana/dashboards/coder/0
      configMap: dashboards-status
      readOnly: false
    - name: dashboards-coderd
      mountPath: /var/lib/grafana/dashboards/coder/1
      configMap: dashboards-coderd
      readOnly: false
    - name: dashboards-provisionerd
      mountPath: /var/lib/grafana/dashboards/coder/2
      configMap: dashboards-provisionerd
      readOnly: false
    - name: dashboards-workspaces
      mountPath: /var/lib/grafana/dashboards/coder/3
      configMap: dashboards-workspaces
      readOnly: false
    - name: dashboards-workspace-detail
      mountPath: /var/lib/grafana/dashboards/coder/4
      configMap: dashboards-workspace-detail
      readOnly: false

# prometheus -- values for the bundled Prometheus: a single replica run as a StatefulSet
prometheus:
  enabled: true
  server:
    fullnameOverride: prometheus
    podAnnotations:
      prometheus.io/scrape: "true"

    global:
      # prometheus.server.global.evaluation_interval -- how often to evaluate recording & alerting rule groups
      evaluation_interval: 30s

    extraArgs:
      log.level: debug

    replicaCount: 1
    statefulSet:
      enabled: true

    retentionSize: 10GB
    persistentVolume:
      enabled: true
      # Note: allowing +2GB breathing room above storage.tsdb.retention.size
      size: 12Gi
    service:
      type: ClusterIP
    extraFlags:
      - web.enable-lifecycle
      # required so that the Loki ruler can remote-write to /api/v1/write
      - enable-feature=remote-write-receiver
    # mount the alert rules ConfigMap where `rule_files` expects to find it
    extraConfigmapMounts:
      - name: alerts
        mountPath: /etc/config/alerts
        configMap: metrics-alerts
        # the key is case-sensitive: the chart reads `readOnly`; a lowercase `readonly` is silently ignored
        readOnly: true

  # Contents of the Prometheus server's `prometheus.yml`.
  serverFiles:
    prometheus.yml:
      # disables scraping of metrics by the Prometheus helm chart since this is managed by the collector
      scrape_configs: []
      # use custom rule files to be able to render templates (can't do that in values.yaml, unless that value is evaluated by a tpl call)
      # the path matches the mountPath of the `alerts` ConfigMap mount on the server
      rule_files:
        - /etc/config/alerts/*.yaml

  testFramework:
    enabled: false

  # enable metric collection from configmap reloader
  configmapReload:
    prometheus:
      extraArgs:
        log-level: all
        watch-interval: 15s
      containerPort: 9091
      # the reloader watches the same alert rules ConfigMap that is mounted into the server
      extraConfigmapMounts:
        - name: alerts
          mountPath: /etc/config/alerts
          configMap: metrics-alerts
          # the key is case-sensitive: the chart reads `readOnly`; a lowercase `readonly` is silently ignored
          readOnly: true

  # Alertmanager, exposed on port 80 and scraped by the collector
  alertmanager:
    enabled: true
    fullnameOverride: alertmanager
    podAnnotations:
      prometheus.io/scrape: "true"
    service:
      port: 80
  # kube-state-metrics, scraped by the collector
  kube-state-metrics:
    enabled: true
    fullnameOverride: kube-state-metrics
    podAnnotations:
      prometheus.io/scrape: "true"
  # node-exporter, scraped by the collector
  prometheus-node-exporter:
    enabled: true
    fullnameOverride: node-exporter
    podAnnotations:
      prometheus.io/scrape: "true"

  # Disable push gateway
  prometheus-pushgateway:
    enabled: false

# loki -- values for the bundled Loki (enterprise features disabled), backed by a bundled MinIO instance
loki:
  enabled: true
  nameOverride: loki
  fullnameOverride: loki

  enterprise:
    enabled: false
    adminApi:
      enabled: false
    useExternalLicense: false

  test:
    canaryServiceAddress: "http://loki-canary:3500/metrics"
    enabled: true

  # MinIO deployed under the name `loki-storage`.
  minio:
    enabled: true
    fullnameOverride: loki-storage
    # templated: resolves to loki-storage.<release namespace>.<global.zone>:9000
    address: loki-storage.{{ .Release.Namespace }}.{{ .Values.global.zone}}:9000
    podAnnotations:
      prometheus.io/scrape: "true"
      # MinIO serves its metrics on a non-default path
      prometheus.io/path: "/minio/v2/metrics/cluster"
    podLabels:
      app.kubernetes.io/name: "loki-storage"

  # Loki server configuration: single-tenant, replication factor 1, TSDB index on S3-compatible storage.
  loki:
    # multi-tenancy is disabled; see the "fake" tenant note under rulerConfig below
    auth_enabled: false
    commonConfig:
      path_prefix: /var/loki
      replication_factor: 1
    schemaConfig:
      configs:
        # quoted so that no YAML parser can implicitly type the date as a timestamp
        - from: "2024-04-01"
          store: tsdb
          object_store: s3
          schema: v13
          index:
            prefix: index_
            period: 24h

    # Ruler: evaluates rules found under /rules, remote-writes results to Prometheus and sends alerts to
    # Alertmanager. The URLs are Go templates, quoted so they are always parsed as plain strings.
    rulerConfig:
      remote_write:
        enabled: true
        clients:
          # "fake" is the default username when auth is disabled (unfortunate, I know)
          fake:
            url: 'http://prometheus.{{ .Release.Namespace }}.{{ .Values.global.zone}}/api/v1/write'
            headers:
              Source: Loki
            remote_timeout: 30s
      wal:
        # must match the mountPath of the `ruler-wal` volume on the backend pods
        dir: /var/loki-ruler-wal
      alertmanager_url: 'http://alertmanager.{{ .Release.Namespace }}.{{ .Values.global.zone}}'
      enable_api: true
      ring:
        kvstore:
          store: inmemory
      enable_alertmanager_v2: true
      storage:
        type: local
        local:
          directory: /rules
      rule_path: /rules

  # Loki canary, scraped by the collector.
  lokiCanary:
    enabled: true
    annotations:
      prometheus.io/scrape: "true"

  # NOTE(review): allocatedMemory is presumably in MB — confirm against the Loki chart documentation.
  chunksCache:
    allocatedMemory: 1024
  resultsCache:
    allocatedMemory: 1024

  # scraping of logs by the Loki helm chart is disabled since this is managed by the collector
  monitoring:
    selfMonitoring:
      enabled: false
      grafanaAgent:
        installOperator: false
    # creates ConfigMaps of dashboards which are discovered via labels
    dashboards:
      enabled: true

  # Rules sidecar: writes rules into /rules/fake, i.e. the directory of the "fake" tenant under the ruler's
  # /rules storage directory.
  sidecar:
    rules:
      logLevel: DEBUG
      folder: /rules/fake

  # One replica of each Loki component; write/read/backend pods are scraped by the collector.
  gateway:
    replicas: 1
  write:
    replicas: 1
    podAnnotations:
      prometheus.io/scrape: "true"
    extraArgs:
      - -log.level=debug
  read:
    replicas: 1
    podAnnotations:
      prometheus.io/scrape: "true"
  backend:
    replicas: 1
    podAnnotations:
      prometheus.io/scrape: "true"
    extraArgs:
      - -log.level=debug
    # scratch volume for the ruler WAL, mounted at the directory configured in rulerConfig.wal.dir
    extraVolumes:
      - name: ruler-wal
        emptyDir: {}
    extraVolumeMounts:
      - name: ruler-wal
        mountPath: /var/loki-ruler-wal