feat: add observability stack and background task infrastructure

Add OpenTelemetry instrumentation with distributed tracing and metrics (bootstrap sketch below):
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint
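
For orientation, a minimal sketch of what the tracing bootstrap might look like. This is a hypothetical illustration using the standard OpenTelemetry Python SDK; the module layout and the `setup_tracing` name are assumptions, not the repo's actual code:

```python
# Hypothetical sketch of the instrumentation bootstrap; driven by the same
# OTEL_* env vars the Helm chart renders into the app ConfigMap.
import os

from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor


def setup_tracing(app) -> None:
    """Configure OTLP span export and auto-instrument the FastAPI app."""
    if os.getenv("OTEL_ENABLED", "false").lower() != "true":
        return
    resource = Resource.create({
        "service.name": os.getenv("OTEL_SERVICE_NAME", "incidentops-api"),
        "deployment.environment": os.getenv("OTEL_ENVIRONMENT", "production"),
    })
    provider = TracerProvider(resource=resource)
    provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter(
        endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317"),
        insecure=os.getenv("OTEL_EXPORTER_OTLP_INSECURE", "true") == "true",
    )))
    trace.set_tracer_provider(provider)
    FastAPIInstrumentor.instrument_app(app)
```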

Implement Celery worker and notification task system (TaskQueue sketch below):
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing
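
A hedged sketch of the pluggable TaskQueue abstraction described above; the interface and driver names are illustrative assumptions, not the commit's actual code:

```python
# Hypothetical TaskQueue protocol with a Celery-backed production driver
# and an in-memory driver for tests.
from typing import Any, Protocol


class TaskQueue(Protocol):
    """Minimal enqueue interface the API layer depends on."""

    def enqueue(self, task_name: str, *args: Any, **kwargs: Any) -> str:
        """Dispatch a task and return its task id."""
        ...


class CeleryTaskQueue:
    """Production driver: delegates to the configured Celery app."""

    def __init__(self, celery_app) -> None:
        self._app = celery_app

    def enqueue(self, task_name: str, *args: Any, **kwargs: Any) -> str:
        result = self._app.send_task(task_name, args=args, kwargs=kwargs)
        return result.id


class InMemoryTaskQueue:
    """Test driver: records dispatched tasks instead of hitting a broker."""

    def __init__(self) -> None:
        self.sent: list[tuple[str, tuple, dict]] = []

    def enqueue(self, task_name: str, *args: Any, **kwargs: Any) -> str:
        self.sent.append((task_name, args, kwargs))
        return f"in-memory-{len(self.sent)}"
```

The in-memory driver lets tests assert on dispatched notifications without a running Redis or SQS broker.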

Add Grafana observability stack with Loki, Tempo, Prometheus, and Grafana (smoke-test sketch below):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components
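
To sanity-check the collector-to-Tempo path end to end, one could emit a single test span against the in-cluster collector. A hypothetical smoke test; the service DNS name assumes the chart's fullname renders as `incidentops`:

```python
# Hypothetical smoke test: send one span to the collector, then look for it
# in Tempo via Grafana's Explore view.
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

provider = TracerProvider()
provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter(
    endpoint="http://incidentops-otel-collector:4317",  # assumed service name
    insecure=True,
)))
trace.set_tracer_provider(provider)

with trace.get_tracer("smoke-test").start_as_current_span("pipeline-check"):
    pass  # span is queued here and exported by the batch processor

provider.shutdown()  # flush pending spans before the script exits
```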

Enhance application infrastructure (handler and middleware sketch below):
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
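
A minimal sketch of the global exception handler and request-logging middleware pattern; the ErrorResponse fields and logger name are assumptions for illustration, not the commit's actual schema:

```python
# Hypothetical global exception handler + timing middleware for FastAPI.
import logging
import time

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from pydantic import BaseModel

logger = logging.getLogger("incidentops.requests")
app = FastAPI()


class ErrorResponse(BaseModel):
    error: str
    detail: str | None = None


@app.exception_handler(Exception)
async def unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse:
    logger.exception("unhandled error on %s %s", request.method, request.url.path)
    # In production the raw exception text would typically be omitted.
    body = ErrorResponse(error="internal_server_error", detail=str(exc))
    return JSONResponse(status_code=500, content=body.model_dump())


@app.middleware("http")
async def request_logging(request: Request, call_next):
    start = time.perf_counter()
    response = await call_next(request)
    elapsed_ms = (time.perf_counter() - start) * 1000
    logger.info(
        "%s %s -> %d (%.1f ms)",
        request.method, request.url.path, response.status_code, elapsed_ms,
    )
    return response
```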
2026-01-07 20:51:13 -05:00
parent f427d191e0
commit 46ede7757d
45 changed files with 3742 additions and 76 deletions

View File

@@ -29,6 +29,29 @@ spec:
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
initContainers:
- name: wait-for-postgres
image: busybox:1.36
command:
- sh
- -c
- |
until nc -z {{ include "incidentops.fullname" . }}-postgresql 5432; do
echo "Waiting for PostgreSQL..."
sleep 2
done
echo "PostgreSQL is ready"
- name: wait-for-redis
image: busybox:1.36
command:
- sh
- -c
- |
until nc -z {{ include "incidentops.fullname" . }}-redis 6379; do
echo "Waiting for Redis..."
sleep 2
done
echo "Redis is ready"
containers:
- name: api
securityContext:
@@ -39,6 +62,11 @@ spec:
- name: http
containerPort: 8000
protocol: TCP
{{- if .Values.metrics.enabled }}
- name: metrics
containerPort: {{ .Values.metrics.port }}
protocol: TCP
{{- end }}
envFrom:
- configMapRef:
name: {{ include "incidentops.fullname" . }}-config

View File

@@ -11,5 +11,11 @@ spec:
targetPort: http
protocol: TCP
name: http
{{- if .Values.metrics.enabled }}
- port: {{ .Values.metrics.port }}
targetPort: metrics
protocol: TCP
name: metrics
{{- end }}
selector:
{{- include "incidentops.api.selectorLabels" . | nindent 4 }}

View File

@@ -8,3 +8,16 @@ data:
JWT_ALGORITHM: {{ .Values.config.jwtAlgorithm | quote }}
ACCESS_TOKEN_EXPIRE_MINUTES: {{ .Values.config.accessTokenExpireMinutes | quote }}
REFRESH_TOKEN_EXPIRE_DAYS: {{ .Values.config.refreshTokenExpireDays | quote }}
# OpenTelemetry configuration
OTEL_ENABLED: {{ .Values.observability.enabled | quote }}
OTEL_SERVICE_NAME: "incidentops-api"
OTEL_ENVIRONMENT: {{ .Values.config.environment | default "production" | quote }}
{{- if .Values.observability.enabled }}
OTEL_EXPORTER_OTLP_ENDPOINT: "http://{{ include "incidentops.fullname" . }}-otel-collector:4317"
{{- end }}
OTEL_EXPORTER_OTLP_INSECURE: "true"
OTEL_LOG_LEVEL: {{ .Values.config.logLevel | default "INFO" | quote }}
# Metrics configuration
{{- if .Values.metrics.enabled }}
PROMETHEUS_PORT: {{ .Values.metrics.port | quote }}
{{- end }}
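
For reference, a hypothetical sketch of how the application side might consume the variables rendered above, assuming a pydantic-settings based config object (field names map to the env vars case-insensitively; the class name is illustrative):

```python
# Hypothetical settings object backed by the OTEL_* / PROMETHEUS_* env vars
# that the ConfigMap above injects into the pod.
from pydantic_settings import BaseSettings


class ObservabilitySettings(BaseSettings):
    otel_enabled: bool = False
    otel_service_name: str = "incidentops-api"
    otel_environment: str = "production"
    otel_exporter_otlp_endpoint: str = "http://localhost:4317"
    otel_exporter_otlp_insecure: bool = True
    otel_log_level: str = "INFO"
    prometheus_port: int = 9464


settings = ObservabilitySettings()  # reads OTEL_ENABLED, OTEL_SERVICE_NAME, ...
```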

View File

@@ -0,0 +1,387 @@
{{- if .Values.observability.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-grafana-datasources
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
data:
datasources.yaml: |
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
uid: prometheus
url: http://{{ include "incidentops.fullname" . }}-prometheus:9090
access: proxy
isDefault: false
jsonData:
httpMethod: POST
exemplarTraceIdDestinations:
- name: trace_id
datasourceUid: tempo
- name: Tempo
type: tempo
uid: tempo
url: http://{{ include "incidentops.fullname" . }}-tempo:3200
access: proxy
isDefault: false
jsonData:
tracesToLogsV2:
datasourceUid: loki
spanStartTimeShift: '-1h'
spanEndTimeShift: '1h'
filterByTraceID: true
filterBySpanID: true
tracesToMetrics:
datasourceUid: prometheus
spanStartTimeShift: '-1h'
spanEndTimeShift: '1h'
serviceMap:
datasourceUid: prometheus
nodeGraph:
enabled: true
lokiSearch:
datasourceUid: loki
- name: Loki
type: loki
uid: loki
url: http://{{ include "incidentops.fullname" . }}-loki:3100
access: proxy
isDefault: true
jsonData:
derivedFields:
- datasourceUid: tempo
matcherRegex: '"trace_id":"([a-f0-9]+)"'
name: TraceID
url: '${__value.raw}'
urlDisplayLabel: 'View Trace'
---
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-grafana-dashboards-provider
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
data:
dashboards.yaml: |
apiVersion: 1
providers:
- name: 'default'
orgId: 1
folder: 'IncidentOps'
folderUid: 'incidentops'
type: file
disableDeletion: false
editable: true
options:
path: /var/lib/grafana/dashboards
---
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-grafana-dashboards
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
data:
api-overview.json: |
{
"title": "IncidentOps API Overview",
"uid": "incidentops-api",
"tags": ["incidentops", "api"],
"timezone": "browser",
"editable": true,
"panels": [
{
"id": 1,
"title": "Request Rate",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 0},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
"legendFormat": "Requests/sec",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "reqps"
}
}
},
{
"id": 2,
"title": "Request Duration (p50, p95, p99)",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 0},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "histogram_quantile(0.50, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
"legendFormat": "p50",
"refId": "A"
},
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
"legendFormat": "p95",
"refId": "B"
},
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
"legendFormat": "p99",
"refId": "C"
}
],
"fieldConfig": {
"defaults": {
"unit": "s"
}
}
},
{
"id": 3,
"title": "Error Rate",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 0},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\", http_status_code=~\"5..\"}[1m])) / sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m])) * 100",
"legendFormat": "Error %",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100
}
}
},
{
"id": 4,
"title": "Requests by Status Code",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum by (http_status_code) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
"legendFormat": "{{ "{{" }}http_status_code{{ "}}" }}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "reqps"
}
}
},
{
"id": 5,
"title": "Requests by Endpoint",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum by (http_route) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
"legendFormat": "{{ "{{" }}http_route{{ "}}" }}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "reqps"
}
}
},
{
"id": 6,
"title": "Recent Logs",
"type": "logs",
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 16},
"targets": [
{
"datasource": {"type": "loki", "uid": "loki"},
"expr": "{service_name=\"incidentops-api\"} | json",
"refId": "A"
}
],
"options": {
"showTime": true,
"showLabels": true,
"wrapLogMessage": true,
"enableLogDetails": true,
"sortOrder": "Descending"
}
},
{
"id": 7,
"title": "Recent Traces",
"type": "traces",
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 26},
"targets": [
{
"datasource": {"type": "tempo", "uid": "tempo"},
"queryType": "traceqlSearch",
"filters": [
{
"id": "service-name",
"operator": "=",
"scope": "resource",
"tag": "service.name",
"value": ["incidentops-api"]
}
],
"refId": "A"
}
]
}
],
"schemaVersion": 38,
"version": 2
}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "incidentops.fullname" . }}-grafana
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
spec:
replicas: 1
selector:
matchLabels:
{{- include "incidentops.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: grafana
template:
metadata:
labels:
{{- include "incidentops.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: grafana
annotations:
checksum/datasources: {{ .Values.observability.grafana.image.tag | sha256sum }}
spec:
securityContext:
fsGroup: 472
runAsUser: 472
containers:
- name: grafana
image: "{{ .Values.observability.grafana.image.repository }}:{{ .Values.observability.grafana.image.tag }}"
imagePullPolicy: {{ .Values.observability.grafana.image.pullPolicy }}
ports:
- name: http
containerPort: 3000
protocol: TCP
env:
- name: GF_SECURITY_ADMIN_USER
value: {{ .Values.observability.grafana.adminUser | quote }}
- name: GF_SECURITY_ADMIN_PASSWORD
valueFrom:
secretKeyRef:
name: {{ include "incidentops.fullname" . }}-grafana
key: admin-password
- name: GF_USERS_ALLOW_SIGN_UP
value: "false"
- name: GF_EXPLORE_ENABLED
value: "true"
- name: GF_FEATURE_TOGGLES_ENABLE
value: "traceqlEditor tempoSearch tempoBackendSearch tempoApmTable"
volumeMounts:
- name: datasources
mountPath: /etc/grafana/provisioning/datasources
- name: dashboards-provider
mountPath: /etc/grafana/provisioning/dashboards
- name: dashboards
mountPath: /var/lib/grafana/dashboards
- name: data
mountPath: /var/lib/grafana
resources:
{{- toYaml .Values.observability.grafana.resources | nindent 12 }}
readinessProbe:
httpGet:
path: /api/health
port: http
initialDelaySeconds: 10
periodSeconds: 10
livenessProbe:
httpGet:
path: /api/health
port: http
initialDelaySeconds: 30
periodSeconds: 30
volumes:
- name: datasources
configMap:
name: {{ include "incidentops.fullname" . }}-grafana-datasources
- name: dashboards-provider
configMap:
name: {{ include "incidentops.fullname" . }}-grafana-dashboards-provider
- name: dashboards
configMap:
name: {{ include "incidentops.fullname" . }}-grafana-dashboards
- name: data
{{- if .Values.observability.grafana.persistence.enabled }}
persistentVolumeClaim:
claimName: {{ include "incidentops.fullname" . }}-grafana
{{- else }}
emptyDir: {}
{{- end }}
---
apiVersion: v1
kind: Secret
metadata:
name: {{ include "incidentops.fullname" . }}-grafana
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
type: Opaque
data:
admin-password: {{ .Values.observability.grafana.adminPassword | b64enc | quote }}
---
apiVersion: v1
kind: Service
metadata:
name: {{ include "incidentops.fullname" . }}-grafana
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
spec:
type: {{ .Values.observability.grafana.service.type }}
ports:
- name: http
port: 80
targetPort: http
protocol: TCP
selector:
{{- include "incidentops.selectorLabels" . | nindent 4 }}
app.kubernetes.io/component: grafana
{{- if .Values.observability.grafana.persistence.enabled }}
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: {{ include "incidentops.fullname" . }}-grafana
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: {{ .Values.observability.grafana.persistence.size }}
{{- end }}
{{- end }}

View File

@@ -0,0 +1,38 @@
{{- if and .Values.observability.enabled .Values.observability.grafana.ingress.enabled -}}
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: {{ include "incidentops.fullname" . }}-grafana
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
{{- with .Values.observability.grafana.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
{{- if .Values.ingress.className }}
ingressClassName: {{ .Values.ingress.className }}
{{- end }}
{{- if .Values.observability.grafana.ingress.tls }}
tls:
{{- range .Values.observability.grafana.ingress.tls }}
- hosts:
{{- range .hosts }}
- {{ . | quote }}
{{- end }}
secretName: {{ .secretName }}
{{- end }}
{{- end }}
rules:
- host: {{ .Values.observability.grafana.ingress.host | quote }}
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: {{ include "incidentops.fullname" . }}-grafana
port:
number: 80
{{- end }}

View File

@@ -0,0 +1,155 @@
{{- if .Values.observability.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-loki-config
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: loki
data:
loki.yaml: |
auth_enabled: false
server:
http_listen_port: 3100
grpc_listen_port: 9096
common:
path_prefix: /loki
storage:
filesystem:
chunks_directory: /loki/chunks
rules_directory: /loki/rules
replication_factor: 1
ring:
kvstore:
store: inmemory
query_range:
results_cache:
cache:
embedded_cache:
enabled: true
max_size_mb: 100
schema_config:
configs:
- from: "2020-10-24"
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: index_
period: 24h
ruler:
alertmanager_url: http://localhost:9093
limits_config:
retention_period: {{ .Values.observability.loki.retention }}
allow_structured_metadata: true
volume_enabled: true
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "incidentops.fullname" . }}-loki
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: loki
spec:
replicas: 1
selector:
matchLabels:
{{- include "incidentops.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: loki
template:
metadata:
labels:
{{- include "incidentops.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: loki
annotations:
checksum/config: {{ .Values.observability.loki.image.tag | sha256sum }}
spec:
containers:
- name: loki
image: "{{ .Values.observability.loki.image.repository }}:{{ .Values.observability.loki.image.tag }}"
imagePullPolicy: {{ .Values.observability.loki.image.pullPolicy }}
args:
- -config.file=/etc/loki/loki.yaml
ports:
- name: http
containerPort: 3100
protocol: TCP
- name: grpc
containerPort: 9096
protocol: TCP
volumeMounts:
- name: config
mountPath: /etc/loki
- name: data
mountPath: /loki
resources:
{{- toYaml .Values.observability.loki.resources | nindent 12 }}
readinessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 10
periodSeconds: 10
livenessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 30
periodSeconds: 30
volumes:
- name: config
configMap:
name: {{ include "incidentops.fullname" . }}-loki-config
- name: data
{{- if .Values.observability.loki.persistence.enabled }}
persistentVolumeClaim:
claimName: {{ include "incidentops.fullname" . }}-loki
{{- else }}
emptyDir: {}
{{- end }}
---
apiVersion: v1
kind: Service
metadata:
name: {{ include "incidentops.fullname" . }}-loki
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: loki
spec:
type: ClusterIP
ports:
- name: http
port: 3100
targetPort: http
protocol: TCP
- name: grpc
port: 9096
targetPort: grpc
protocol: TCP
selector:
{{- include "incidentops.selectorLabels" . | nindent 4 }}
app.kubernetes.io/component: loki
{{- if .Values.observability.loki.persistence.enabled }}
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: {{ include "incidentops.fullname" . }}-loki
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: loki
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: {{ .Values.observability.loki.persistence.size }}
{{- end }}
{{- end }}

View File

@@ -30,9 +30,11 @@ spec:
- name: migrate
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
-          image: {{ include "incidentops.api.image" . }}
+          image: "{{ .Values.migration.image.repository }}:{{ .Values.migration.image.tag }}"
imagePullPolicy: {{ .Values.migration.image.pullPolicy }}
command:
- uv
- run
- python
- migrations/migrate.py
- apply

View File

@@ -0,0 +1,132 @@
{{- if .Values.observability.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-otel-collector-config
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: otel-collector
data:
otel-collector-config.yaml: |
extensions:
health_check:
endpoint: 0.0.0.0:13133
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
processors:
batch:
timeout: 1s
send_batch_size: 1024
memory_limiter:
check_interval: 1s
limit_mib: 512
spike_limit_mib: 128
exporters:
otlp/tempo:
endpoint: {{ include "incidentops.fullname" . }}-tempo:4317
tls:
insecure: true
loki:
endpoint: http://{{ include "incidentops.fullname" . }}-loki:3100/loki/api/v1/push
default_labels_enabled:
exporter: true
job: true
service:
extensions: [health_check]
pipelines:
traces:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [otlp/tempo]
logs:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [loki]
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "incidentops.fullname" . }}-otel-collector
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: otel-collector
spec:
replicas: {{ .Values.observability.otelCollector.replicaCount }}
selector:
matchLabels:
{{- include "incidentops.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: otel-collector
template:
metadata:
labels:
{{- include "incidentops.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: otel-collector
annotations:
checksum/config: {{ .Values.observability.otelCollector.image.tag | sha256sum }}
spec:
containers:
- name: otel-collector
image: "{{ .Values.observability.otelCollector.image.repository }}:{{ .Values.observability.otelCollector.image.tag }}"
imagePullPolicy: {{ .Values.observability.otelCollector.image.pullPolicy }}
args:
- --config=/etc/otel-collector/otel-collector-config.yaml
ports:
- name: otlp-grpc
containerPort: 4317
protocol: TCP
- name: otlp-http
containerPort: 4318
protocol: TCP
volumeMounts:
- name: config
mountPath: /etc/otel-collector
resources:
{{- toYaml .Values.observability.otelCollector.resources | nindent 12 }}
livenessProbe:
httpGet:
path: /
port: 13133
initialDelaySeconds: 10
periodSeconds: 30
readinessProbe:
httpGet:
path: /
port: 13133
initialDelaySeconds: 5
periodSeconds: 10
volumes:
- name: config
configMap:
name: {{ include "incidentops.fullname" . }}-otel-collector-config
---
apiVersion: v1
kind: Service
metadata:
name: {{ include "incidentops.fullname" . }}-otel-collector
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: otel-collector
spec:
type: ClusterIP
ports:
- name: otlp-grpc
port: 4317
targetPort: otlp-grpc
protocol: TCP
- name: otlp-http
port: 4318
targetPort: otlp-http
protocol: TCP
selector:
{{- include "incidentops.selectorLabels" . | nindent 4 }}
app.kubernetes.io/component: otel-collector
{{- end }}

View File

@@ -0,0 +1,163 @@
{{- if and .Values.observability.enabled .Values.metrics.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-prometheus
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: prometheus
data:
prometheus.yml: |
global:
scrape_interval: {{ .Values.observability.prometheus.scrapeInterval | default "15s" }}
evaluation_interval: 15s
scrape_configs:
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
- job_name: "incidentops-api"
kubernetes_sd_configs:
- role: pod
namespaces:
names:
- {{ .Release.Namespace }}
relabel_configs:
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
action: keep
regex: api
- source_labels: [__meta_kubernetes_pod_container_port_name]
action: keep
regex: metrics
- source_labels: [__meta_kubernetes_namespace]
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
target_label: pod
metrics_path: /metrics
scrape_interval: 10s
- job_name: "incidentops-worker"
kubernetes_sd_configs:
- role: pod
namespaces:
names:
- {{ .Release.Namespace }}
relabel_configs:
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
action: keep
regex: worker
- source_labels: [__meta_kubernetes_pod_container_port_name]
action: keep
regex: metrics
- source_labels: [__meta_kubernetes_namespace]
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
target_label: pod
metrics_path: /metrics
scrape_interval: 10s
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "incidentops.fullname" . }}-prometheus
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: prometheus
spec:
replicas: 1
selector:
matchLabels:
{{- include "incidentops.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: prometheus
template:
metadata:
labels:
{{- include "incidentops.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: prometheus
annotations:
checksum/config: {{ .Values.observability.prometheus.image.tag | sha256sum }}
spec:
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
securityContext:
fsGroup: 65534
runAsUser: 65534
runAsNonRoot: true
containers:
- name: prometheus
image: "{{ .Values.observability.prometheus.image.repository }}:{{ .Values.observability.prometheus.image.tag }}"
imagePullPolicy: {{ .Values.observability.prometheus.image.pullPolicy }}
args:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--storage.tsdb.retention.time={{ .Values.observability.prometheus.retention }}"
- "--web.enable-lifecycle"
ports:
- name: http
containerPort: 9090
protocol: TCP
volumeMounts:
- name: config
mountPath: /etc/prometheus
- name: data
mountPath: /prometheus
resources:
{{- toYaml .Values.observability.prometheus.resources | nindent 12 }}
readinessProbe:
httpGet:
path: /-/ready
port: http
initialDelaySeconds: 10
periodSeconds: 10
livenessProbe:
httpGet:
path: /-/healthy
port: http
initialDelaySeconds: 30
periodSeconds: 30
volumes:
- name: config
configMap:
name: {{ include "incidentops.fullname" . }}-prometheus
- name: data
{{- if .Values.observability.prometheus.persistence.enabled }}
persistentVolumeClaim:
claimName: {{ include "incidentops.fullname" . }}-prometheus
{{- else }}
emptyDir: {}
{{- end }}
---
apiVersion: v1
kind: Service
metadata:
name: {{ include "incidentops.fullname" . }}-prometheus
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: prometheus
spec:
type: ClusterIP
ports:
- name: http
port: 9090
targetPort: http
protocol: TCP
selector:
{{- include "incidentops.selectorLabels" . | nindent 4 }}
app.kubernetes.io/component: prometheus
{{- if .Values.observability.prometheus.persistence.enabled }}
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: {{ include "incidentops.fullname" . }}-prometheus
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: prometheus
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: {{ .Values.observability.prometheus.persistence.size }}
{{- end }}
{{- end }}

View File

@@ -0,0 +1,29 @@
{{- if and .Values.observability.enabled .Values.metrics.enabled }}
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: {{ include "incidentops.fullname" . }}-prometheus
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: prometheus
rules:
- apiGroups: [""]
resources: ["pods", "endpoints", "services"]
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: {{ include "incidentops.fullname" . }}-prometheus
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: prometheus
subjects:
- kind: ServiceAccount
name: {{ include "incidentops.serviceAccountName" . }}
namespace: {{ .Release.Namespace }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: {{ include "incidentops.fullname" . }}-prometheus
{{- end }}

View File

@@ -0,0 +1,169 @@
{{- if and .Values.observability.enabled .Values.observability.promtail.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-promtail-config
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: promtail
data:
promtail.yaml: |
server:
http_listen_port: 3101
grpc_listen_port: 0
positions:
filename: /run/promtail/positions.yaml
clients:
- url: http://{{ include "incidentops.fullname" . }}-loki:3100/loki/api/v1/push
scrape_configs:
- job_name: kubernetes-pods
pipeline_stages:
- cri: {}
kubernetes_sd_configs:
- role: pod
namespaces:
names: [{{ .Release.Namespace }}]
relabel_configs:
- source_labels: [__meta_kubernetes_pod_container_init]
regex: "true"
action: drop
- source_labels: [__meta_kubernetes_pod_phase]
regex: Pending|Failed|Succeeded
action: drop
- source_labels: [__meta_kubernetes_pod_name, __meta_kubernetes_pod_namespace, __meta_kubernetes_pod_container_name]
regex: (.+);(.+);(.+)
target_label: __path__
replacement: /var/log/containers/$1_$2_$3-*.log
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
regex: (.*)
target_label: service_name
replacement: {{ include "incidentops.fullname" . }}-$1
- source_labels: [__meta_kubernetes_pod_namespace]
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
target_label: pod
- source_labels: [__meta_kubernetes_pod_container_name]
target_label: container
- source_labels: [__meta_kubernetes_pod_uid]
target_label: pod_uid
- target_label: cluster
replacement: {{ .Release.Namespace }}
- job_name: containers-fallback
pipeline_stages:
- cri: {}
static_configs:
- labels:
job: containers
namespace: {{ .Release.Namespace }}
service_name: incidentops-api
__path__: /var/log/containers/incidentops-api-*_incidentops_api-*.log
- labels:
job: containers
namespace: {{ .Release.Namespace }}
service_name: incidentops-worker
__path__: /var/log/containers/incidentops-worker-*_incidentops_worker-*.log
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "incidentops.fullname" . }}-promtail
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: promtail
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ include "incidentops.fullname" . }}-promtail
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: promtail
rules:
- apiGroups: [""]
resources: ["pods", "pods/log", "namespaces", "services", "endpoints", "nodes"]
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: {{ include "incidentops.fullname" . }}-promtail
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: promtail
subjects:
- kind: ServiceAccount
name: {{ include "incidentops.fullname" . }}-promtail
namespace: {{ .Release.Namespace }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: {{ include "incidentops.fullname" . }}-promtail
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: {{ include "incidentops.fullname" . }}-promtail
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: promtail
spec:
selector:
matchLabels:
{{- include "incidentops.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: promtail
template:
metadata:
labels:
{{- include "incidentops.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: promtail
annotations:
checksum/config: {{ .Values.observability.promtail.image.tag | sha256sum }}
spec:
serviceAccountName: {{ include "incidentops.fullname" . }}-promtail
securityContext:
runAsUser: 0
containers:
- name: promtail
image: "{{ .Values.observability.promtail.image.repository }}:{{ .Values.observability.promtail.image.tag }}"
imagePullPolicy: {{ .Values.observability.promtail.image.pullPolicy }}
args:
- -config.file=/etc/promtail/promtail.yaml
ports:
- name: http-metrics
containerPort: 3101
protocol: TCP
volumeMounts:
- name: config
mountPath: /etc/promtail
- name: positions
mountPath: /run/promtail
- name: varlog
mountPath: /var/log
readOnly: true
- name: varlogpods
mountPath: /var/log/pods
readOnly: true
- name: varlogcontainers
mountPath: /var/log/containers
readOnly: true
resources:
{{- toYaml .Values.observability.promtail.resources | nindent 12 }}
volumes:
- name: config
configMap:
name: {{ include "incidentops.fullname" . }}-promtail-config
- name: positions
emptyDir: {}
- name: varlog
hostPath:
path: /var/log
- name: varlogpods
hostPath:
path: /var/log/pods
- name: varlogcontainers
hostPath:
path: /var/log/containers
{{- end }}

View File

@@ -0,0 +1,153 @@
{{- if .Values.observability.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-tempo-config
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: tempo
data:
tempo.yaml: |
server:
http_listen_port: 3200
distributor:
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
ingester:
trace_idle_period: 10s
max_block_bytes: 1048576
max_block_duration: 5m
compactor:
compaction:
block_retention: {{ .Values.observability.tempo.retention }}
storage:
trace:
backend: local
local:
path: /var/tempo/traces
wal:
path: /var/tempo/wal
querier:
search:
query_timeout: 30s
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "incidentops.fullname" . }}-tempo
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: tempo
spec:
replicas: 1
selector:
matchLabels:
{{- include "incidentops.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: tempo
template:
metadata:
labels:
{{- include "incidentops.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: tempo
annotations:
checksum/config: {{ .Values.observability.tempo.image.tag | sha256sum }}
spec:
containers:
- name: tempo
image: "{{ .Values.observability.tempo.image.repository }}:{{ .Values.observability.tempo.image.tag }}"
imagePullPolicy: {{ .Values.observability.tempo.image.pullPolicy }}
args:
- -config.file=/etc/tempo/tempo.yaml
ports:
- name: http
containerPort: 3200
protocol: TCP
- name: otlp-grpc
containerPort: 4317
protocol: TCP
- name: otlp-http
containerPort: 4318
protocol: TCP
volumeMounts:
- name: config
mountPath: /etc/tempo
- name: data
mountPath: /var/tempo
resources:
{{- toYaml .Values.observability.tempo.resources | nindent 12 }}
readinessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 10
periodSeconds: 10
livenessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 30
periodSeconds: 30
volumes:
- name: config
configMap:
name: {{ include "incidentops.fullname" . }}-tempo-config
- name: data
{{- if .Values.observability.tempo.persistence.enabled }}
persistentVolumeClaim:
claimName: {{ include "incidentops.fullname" . }}-tempo
{{- else }}
emptyDir: {}
{{- end }}
---
apiVersion: v1
kind: Service
metadata:
name: {{ include "incidentops.fullname" . }}-tempo
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: tempo
spec:
type: ClusterIP
ports:
- name: http
port: 3200
targetPort: http
protocol: TCP
- name: otlp-grpc
port: 4317
targetPort: otlp-grpc
protocol: TCP
- name: otlp-http
port: 4318
targetPort: otlp-http
protocol: TCP
selector:
{{- include "incidentops.selectorLabels" . | nindent 4 }}
app.kubernetes.io/component: tempo
{{- if .Values.observability.tempo.persistence.enabled }}
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: {{ include "incidentops.fullname" . }}-tempo
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: tempo
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: {{ .Values.observability.tempo.persistence.size }}
{{- end }}
{{- end }}

View File

@@ -29,6 +29,29 @@ spec:
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
initContainers:
- name: wait-for-postgres
image: busybox:1.36
command:
- sh
- -c
- |
until nc -z {{ include "incidentops.fullname" . }}-postgresql 5432; do
echo "Waiting for PostgreSQL..."
sleep 2
done
echo "PostgreSQL is ready"
- name: wait-for-redis
image: busybox:1.36
command:
- sh
- -c
- |
until nc -z {{ include "incidentops.fullname" . }}-redis 6379; do
echo "Waiting for Redis..."
sleep 2
done
echo "Redis is ready"
containers:
- name: worker
securityContext:
@@ -36,6 +59,8 @@ spec:
image: {{ include "incidentops.worker.image" . }}
imagePullPolicy: {{ .Values.worker.image.pullPolicy }}
command:
- uv
- run
- celery
- -A
- worker.celery_app
@@ -52,6 +77,8 @@ spec:
livenessProbe:
exec:
command:
- uv
- run
- celery
- -A
- worker.celery_app

View File

@@ -80,3 +80,63 @@ redis:
limits:
cpu: 1000m
memory: 1Gi
# Application configuration
config:
environment: production
logLevel: INFO
# Observability Stack - Production settings
observability:
enabled: true
otelCollector:
replicaCount: 2
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
tempo:
retention: "720h" # 30 days
persistence:
enabled: true
size: 50Gi
resources:
requests:
cpu: 250m
memory: 512Mi
limits:
cpu: 1000m
memory: 2Gi
loki:
retention: "720h" # 30 days
persistence:
enabled: true
size: 100Gi
resources:
requests:
cpu: 250m
memory: 512Mi
limits:
cpu: 1000m
memory: 2Gi
grafana:
adminPassword: "" # Set via external secret in production
service:
type: ClusterIP
persistence:
enabled: true
size: 5Gi
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi

View File

@@ -106,6 +106,8 @@ config:
jwtAlgorithm: HS256
accessTokenExpireMinutes: 30
refreshTokenExpireDays: 30
environment: development
logLevel: INFO
# Secrets (use external secrets in production)
secrets:
@@ -161,3 +163,117 @@ podSecurityContext:
securityContext:
runAsNonRoot: true
runAsUser: 1000
# Observability Stack (Grafana + Loki + Tempo + OpenTelemetry Collector)
observability:
enabled: true
otelCollector:
replicaCount: 1
image:
repository: otel/opentelemetry-collector-contrib
tag: "0.96.0"
pullPolicy: IfNotPresent
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
tempo:
image:
repository: grafana/tempo
tag: "2.4.1"
pullPolicy: IfNotPresent
retention: "168h" # 7 days
persistence:
enabled: false
size: 10Gi
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
loki:
image:
repository: grafana/loki
tag: "2.9.6"
pullPolicy: IfNotPresent
retention: "168h" # 7 days
persistence:
enabled: false
size: 10Gi
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
prometheus:
image:
repository: prom/prometheus
tag: "v2.51.0"
pullPolicy: IfNotPresent
retention: "15d"
scrapeInterval: "15s"
persistence:
enabled: false
size: 10Gi
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
grafana:
image:
repository: grafana/grafana
tag: "10.4.1"
pullPolicy: IfNotPresent
adminUser: admin
adminPassword: "admin" # Change in production!
service:
type: ClusterIP
ingress:
enabled: false
host: grafana.incidentops.local
annotations: {}
tls: []
persistence:
enabled: false
size: 1Gi
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
promtail:
enabled: true
image:
repository: grafana/promtail
tag: "2.9.6"
pullPolicy: IfNotPresent
resources:
requests:
cpu: 25m
memory: 64Mi
limits:
cpu: 200m
memory: 256Mi
# Metrics configuration
metrics:
enabled: true
port: 9464