feat: add observability stack and background task infrastructure

Add OpenTelemetry instrumentation with distributed tracing and metrics (bootstrap sketch below):
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint
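
For orientation, a minimal sketch of what the tracing bootstrap might look like. This is a hypothetical illustration using the standard OpenTelemetry Python SDK; the module layout and the `setup_tracing` name are assumptions, not the repo's actual code:

```python
# Hypothetical sketch of the instrumentation bootstrap; driven by the same
# OTEL_* env vars the Helm chart renders into the app ConfigMap.
import os

from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor


def setup_tracing(app) -> None:
    """Configure OTLP span export and auto-instrument the FastAPI app."""
    if os.getenv("OTEL_ENABLED", "false").lower() != "true":
        return
    resource = Resource.create({
        "service.name": os.getenv("OTEL_SERVICE_NAME", "incidentops-api"),
        "deployment.environment": os.getenv("OTEL_ENVIRONMENT", "production"),
    })
    provider = TracerProvider(resource=resource)
    provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter(
        endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317"),
        insecure=os.getenv("OTEL_EXPORTER_OTLP_INSECURE", "true") == "true",
    )))
    trace.set_tracer_provider(provider)
    FastAPIInstrumentor.instrument_app(app)
```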

Implement Celery worker and notification task system (TaskQueue sketch below):
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing
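
A hedged sketch of the pluggable TaskQueue abstraction described above; the interface and driver names are illustrative assumptions, not the commit's actual code:

```python
# Hypothetical TaskQueue protocol with a Celery-backed production driver
# and an in-memory driver for tests.
from typing import Any, Protocol


class TaskQueue(Protocol):
    """Minimal enqueue interface the API layer depends on."""

    def enqueue(self, task_name: str, *args: Any, **kwargs: Any) -> str:
        """Dispatch a task and return its task id."""
        ...


class CeleryTaskQueue:
    """Production driver: delegates to the configured Celery app."""

    def __init__(self, celery_app) -> None:
        self._app = celery_app

    def enqueue(self, task_name: str, *args: Any, **kwargs: Any) -> str:
        result = self._app.send_task(task_name, args=args, kwargs=kwargs)
        return result.id


class InMemoryTaskQueue:
    """Test driver: records dispatched tasks instead of hitting a broker."""

    def __init__(self) -> None:
        self.sent: list[tuple[str, tuple, dict]] = []

    def enqueue(self, task_name: str, *args: Any, **kwargs: Any) -> str:
        self.sent.append((task_name, args, kwargs))
        return f"in-memory-{len(self.sent)}"
```

The in-memory driver lets tests assert on dispatched notifications without a running Redis or SQS broker.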

Add Grafana observability stack with Loki, Tempo, Prometheus, and Grafana (smoke-test sketch below):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components
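
To sanity-check the collector-to-Tempo path end to end, one could emit a single test span against the in-cluster collector. A hypothetical smoke test; the service DNS name assumes the chart's fullname renders as `incidentops`:

```python
# Hypothetical smoke test: send one span to the collector, then look for it
# in Tempo via Grafana's Explore view.
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

provider = TracerProvider()
provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter(
    endpoint="http://incidentops-otel-collector:4317",  # assumed service name
    insecure=True,
)))
trace.set_tracer_provider(provider)

with trace.get_tracer("smoke-test").start_as_current_span("pipeline-check"):
    pass  # span is queued here and exported by the batch processor

provider.shutdown()  # flush pending spans before the script exits
```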

Enhance application infrastructure (handler and middleware sketch below):
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
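
A minimal sketch of the global exception handler and request-logging middleware pattern; the ErrorResponse fields and logger name are assumptions for illustration, not the commit's actual schema:

```python
# Hypothetical global exception handler + timing middleware for FastAPI.
import logging
import time

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from pydantic import BaseModel

logger = logging.getLogger("incidentops.requests")
app = FastAPI()


class ErrorResponse(BaseModel):
    error: str
    detail: str | None = None


@app.exception_handler(Exception)
async def unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse:
    logger.exception("unhandled error on %s %s", request.method, request.url.path)
    # In production the raw exception text would typically be omitted.
    body = ErrorResponse(error="internal_server_error", detail=str(exc))
    return JSONResponse(status_code=500, content=body.model_dump())


@app.middleware("http")
async def request_logging(request: Request, call_next):
    start = time.perf_counter()
    response = await call_next(request)
    elapsed_ms = (time.perf_counter() - start) * 1000
    logger.info(
        "%s %s -> %d (%.1f ms)",
        request.method, request.url.path, response.status_code, elapsed_ms,
    )
    return response
```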
2026-01-07 20:51:13 -05:00
parent f427d191e0
commit 46ede7757d
45 changed files with 3742 additions and 76 deletions

View File

@@ -29,6 +29,29 @@ spec:
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
initContainers:
- name: wait-for-postgres
image: busybox:1.36
command:
- sh
- -c
- |
until nc -z {{ include "incidentops.fullname" . }}-postgresql 5432; do
echo "Waiting for PostgreSQL..."
sleep 2
done
echo "PostgreSQL is ready"
- name: wait-for-redis
image: busybox:1.36
command:
- sh
- -c
- |
until nc -z {{ include "incidentops.fullname" . }}-redis 6379; do
echo "Waiting for Redis..."
sleep 2
done
echo "Redis is ready"
containers:
- name: api
securityContext:
@@ -39,6 +62,11 @@ spec:
- name: http
containerPort: 8000
protocol: TCP
{{- if .Values.metrics.enabled }}
- name: metrics
containerPort: {{ .Values.metrics.port }}
protocol: TCP
{{- end }}
envFrom:
- configMapRef:
name: {{ include "incidentops.fullname" . }}-config

View File

@@ -11,5 +11,11 @@ spec:
targetPort: http
protocol: TCP
name: http
{{- if .Values.metrics.enabled }}
- port: {{ .Values.metrics.port }}
targetPort: metrics
protocol: TCP
name: metrics
{{- end }}
selector:
{{- include "incidentops.api.selectorLabels" . | nindent 4 }}

View File

@@ -8,3 +8,16 @@ data:
JWT_ALGORITHM: {{ .Values.config.jwtAlgorithm | quote }}
ACCESS_TOKEN_EXPIRE_MINUTES: {{ .Values.config.accessTokenExpireMinutes | quote }}
REFRESH_TOKEN_EXPIRE_DAYS: {{ .Values.config.refreshTokenExpireDays | quote }}
# OpenTelemetry configuration
OTEL_ENABLED: {{ .Values.observability.enabled | quote }}
OTEL_SERVICE_NAME: "incidentops-api"
OTEL_ENVIRONMENT: {{ .Values.config.environment | default "production" | quote }}
{{- if .Values.observability.enabled }}
OTEL_EXPORTER_OTLP_ENDPOINT: "http://{{ include "incidentops.fullname" . }}-otel-collector:4317"
{{- end }}
OTEL_EXPORTER_OTLP_INSECURE: "true"
OTEL_LOG_LEVEL: {{ .Values.config.logLevel | default "INFO" | quote }}
# Metrics configuration
{{- if .Values.metrics.enabled }}
PROMETHEUS_PORT: {{ .Values.metrics.port | quote }}
{{- end }}
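
For reference, a hypothetical sketch of how the application side might consume the variables rendered above, assuming a pydantic-settings based config object (field names map to the env vars case-insensitively; the class name is illustrative):

```python
# Hypothetical settings object backed by the OTEL_* / PROMETHEUS_* env vars
# that the ConfigMap above injects into the pod.
from pydantic_settings import BaseSettings


class ObservabilitySettings(BaseSettings):
    otel_enabled: bool = False
    otel_service_name: str = "incidentops-api"
    otel_environment: str = "production"
    otel_exporter_otlp_endpoint: str = "http://localhost:4317"
    otel_exporter_otlp_insecure: bool = True
    otel_log_level: str = "INFO"
    prometheus_port: int = 9464


settings = ObservabilitySettings()  # reads OTEL_ENABLED, OTEL_SERVICE_NAME, ...
```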

View File

@@ -0,0 +1,387 @@
{{- if .Values.observability.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-grafana-datasources
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
data:
datasources.yaml: |
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
uid: prometheus
url: http://{{ include "incidentops.fullname" . }}-prometheus:9090
access: proxy
isDefault: false
jsonData:
httpMethod: POST
exemplarTraceIdDestinations:
- name: trace_id
datasourceUid: tempo
- name: Tempo
type: tempo
uid: tempo
url: http://{{ include "incidentops.fullname" . }}-tempo:3200
access: proxy
isDefault: false
jsonData:
tracesToLogsV2:
datasourceUid: loki
spanStartTimeShift: '-1h'
spanEndTimeShift: '1h'
filterByTraceID: true
filterBySpanID: true
tracesToMetrics:
datasourceUid: prometheus
spanStartTimeShift: '-1h'
spanEndTimeShift: '1h'
serviceMap:
datasourceUid: prometheus
nodeGraph:
enabled: true
lokiSearch:
datasourceUid: loki
- name: Loki
type: loki
uid: loki
url: http://{{ include "incidentops.fullname" . }}-loki:3100
access: proxy
isDefault: true
jsonData:
derivedFields:
- datasourceUid: tempo
matcherRegex: '"trace_id":"([a-f0-9]+)"'
name: TraceID
url: '${__value.raw}'
urlDisplayLabel: 'View Trace'
---
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-grafana-dashboards-provider
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
data:
dashboards.yaml: |
apiVersion: 1
providers:
- name: 'default'
orgId: 1
folder: 'IncidentOps'
folderUid: 'incidentops'
type: file
disableDeletion: false
editable: true
options:
path: /var/lib/grafana/dashboards
---
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-grafana-dashboards
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
data:
api-overview.json: |
{
"title": "IncidentOps API Overview",
"uid": "incidentops-api",
"tags": ["incidentops", "api"],
"timezone": "browser",
"editable": true,
"panels": [
{
"id": 1,
"title": "Request Rate",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 0},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
"legendFormat": "Requests/sec",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "reqps"
}
}
},
{
"id": 2,
"title": "Request Duration (p50, p95, p99)",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 0},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "histogram_quantile(0.50, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
"legendFormat": "p50",
"refId": "A"
},
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
"legendFormat": "p95",
"refId": "B"
},
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
"legendFormat": "p99",
"refId": "C"
}
],
"fieldConfig": {
"defaults": {
"unit": "s"
}
}
},
{
"id": 3,
"title": "Error Rate",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 0},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\", http_status_code=~\"5..\"}[1m])) / sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m])) * 100",
"legendFormat": "Error %",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100
}
}
},
{
"id": 4,
"title": "Requests by Status Code",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum by (http_status_code) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
"legendFormat": "{{ "{{" }}http_status_code{{ "}}" }}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "reqps"
}
}
},
{
"id": 5,
"title": "Requests by Endpoint",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum by (http_route) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
"legendFormat": "{{ "{{" }}http_route{{ "}}" }}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "reqps"
}
}
},
{
"id": 6,
"title": "Recent Logs",
"type": "logs",
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 16},
"targets": [
{
"datasource": {"type": "loki", "uid": "loki"},
"expr": "{service_name=\"incidentops-api\"} | json",
"refId": "A"
}
],
"options": {
"showTime": true,
"showLabels": true,
"wrapLogMessage": true,
"enableLogDetails": true,
"sortOrder": "Descending"
}
},
{
"id": 7,
"title": "Recent Traces",
"type": "traces",
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 26},
"targets": [
{
"datasource": {"type": "tempo", "uid": "tempo"},
"queryType": "traceqlSearch",
"filters": [
{
"id": "service-name",
"operator": "=",
"scope": "resource",
"tag": "service.name",
"value": ["incidentops-api"]
}
],
"refId": "A"
}
]
}
],
"schemaVersion": 38,
"version": 2
}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "incidentops.fullname" . }}-grafana
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
spec:
replicas: 1
selector:
matchLabels:
{{- include "incidentops.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: grafana
template:
metadata:
labels:
{{- include "incidentops.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: grafana
annotations:
checksum/datasources: {{ .Values.observability.grafana.image.tag | sha256sum }}
spec:
securityContext:
fsGroup: 472
runAsUser: 472
containers:
- name: grafana
image: "{{ .Values.observability.grafana.image.repository }}:{{ .Values.observability.grafana.image.tag }}"
imagePullPolicy: {{ .Values.observability.grafana.image.pullPolicy }}
ports:
- name: http
containerPort: 3000
protocol: TCP
env:
- name: GF_SECURITY_ADMIN_USER
value: {{ .Values.observability.grafana.adminUser | quote }}
- name: GF_SECURITY_ADMIN_PASSWORD
valueFrom:
secretKeyRef:
name: {{ include "incidentops.fullname" . }}-grafana
key: admin-password
- name: GF_USERS_ALLOW_SIGN_UP
value: "false"
- name: GF_EXPLORE_ENABLED
value: "true"
- name: GF_FEATURE_TOGGLES_ENABLE
value: "traceqlEditor tempoSearch tempoBackendSearch tempoApmTable"
volumeMounts:
- name: datasources
mountPath: /etc/grafana/provisioning/datasources
- name: dashboards-provider
mountPath: /etc/grafana/provisioning/dashboards
- name: dashboards
mountPath: /var/lib/grafana/dashboards
- name: data
mountPath: /var/lib/grafana
resources:
{{- toYaml .Values.observability.grafana.resources | nindent 12 }}
readinessProbe:
httpGet:
path: /api/health
port: http
initialDelaySeconds: 10
periodSeconds: 10
livenessProbe:
httpGet:
path: /api/health
port: http
initialDelaySeconds: 30
periodSeconds: 30
volumes:
- name: datasources
configMap:
name: {{ include "incidentops.fullname" . }}-grafana-datasources
- name: dashboards-provider
configMap:
name: {{ include "incidentops.fullname" . }}-grafana-dashboards-provider
- name: dashboards
configMap:
name: {{ include "incidentops.fullname" . }}-grafana-dashboards
- name: data
{{- if .Values.observability.grafana.persistence.enabled }}
persistentVolumeClaim:
claimName: {{ include "incidentops.fullname" . }}-grafana
{{- else }}
emptyDir: {}
{{- end }}
---
apiVersion: v1
kind: Secret
metadata:
name: {{ include "incidentops.fullname" . }}-grafana
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
type: Opaque
data:
admin-password: {{ .Values.observability.grafana.adminPassword | b64enc | quote }}
---
apiVersion: v1
kind: Service
metadata:
name: {{ include "incidentops.fullname" . }}-grafana
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
spec:
type: {{ .Values.observability.grafana.service.type }}
ports:
- name: http
port: 80
targetPort: http
protocol: TCP
selector:
{{- include "incidentops.selectorLabels" . | nindent 4 }}
app.kubernetes.io/component: grafana
{{- if .Values.observability.grafana.persistence.enabled }}
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: {{ include "incidentops.fullname" . }}-grafana
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: {{ .Values.observability.grafana.persistence.size }}
{{- end }}
{{- end }}

View File

@@ -0,0 +1,38 @@
{{- if and .Values.observability.enabled .Values.observability.grafana.ingress.enabled -}}
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: {{ include "incidentops.fullname" . }}-grafana
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
{{- with .Values.observability.grafana.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
{{- if .Values.ingress.className }}
ingressClassName: {{ .Values.ingress.className }}
{{- end }}
{{- if .Values.observability.grafana.ingress.tls }}
tls:
{{- range .Values.observability.grafana.ingress.tls }}
- hosts:
{{- range .hosts }}
- {{ . | quote }}
{{- end }}
secretName: {{ .secretName }}
{{- end }}
{{- end }}
rules:
- host: {{ .Values.observability.grafana.ingress.host | quote }}
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: {{ include "incidentops.fullname" . }}-grafana
port:
number: 80
{{- end }}

View File

@@ -0,0 +1,155 @@
{{- if .Values.observability.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-loki-config
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: loki
data:
loki.yaml: |
auth_enabled: false
server:
http_listen_port: 3100
grpc_listen_port: 9096
common:
path_prefix: /loki
storage:
filesystem:
chunks_directory: /loki/chunks
rules_directory: /loki/rules
replication_factor: 1
ring:
kvstore:
store: inmemory
query_range:
results_cache:
cache:
embedded_cache:
enabled: true
max_size_mb: 100
schema_config:
configs:
- from: "2020-10-24"
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: index_
period: 24h
ruler:
alertmanager_url: http://localhost:9093
limits_config:
retention_period: {{ .Values.observability.loki.retention }}
allow_structured_metadata: true
volume_enabled: true
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "incidentops.fullname" . }}-loki
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: loki
spec:
replicas: 1
selector:
matchLabels:
{{- include "incidentops.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: loki
template:
metadata:
labels:
{{- include "incidentops.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: loki
annotations:
checksum/config: {{ .Values.observability.loki.image.tag | sha256sum }}
spec:
containers:
- name: loki
image: "{{ .Values.observability.loki.image.repository }}:{{ .Values.observability.loki.image.tag }}"
imagePullPolicy: {{ .Values.observability.loki.image.pullPolicy }}
args:
- -config.file=/etc/loki/loki.yaml
ports:
- name: http
containerPort: 3100
protocol: TCP
- name: grpc
containerPort: 9096
protocol: TCP
volumeMounts:
- name: config
mountPath: /etc/loki
- name: data
mountPath: /loki
resources:
{{- toYaml .Values.observability.loki.resources | nindent 12 }}
readinessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 10
periodSeconds: 10
livenessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 30
periodSeconds: 30
volumes:
- name: config
configMap:
name: {{ include "incidentops.fullname" . }}-loki-config
- name: data
{{- if .Values.observability.loki.persistence.enabled }}
persistentVolumeClaim:
claimName: {{ include "incidentops.fullname" . }}-loki
{{- else }}
emptyDir: {}
{{- end }}
---
apiVersion: v1
kind: Service
metadata:
name: {{ include "incidentops.fullname" . }}-loki
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: loki
spec:
type: ClusterIP
ports:
- name: http
port: 3100
targetPort: http
protocol: TCP
- name: grpc
port: 9096
targetPort: grpc
protocol: TCP
selector:
{{- include "incidentops.selectorLabels" . | nindent 4 }}
app.kubernetes.io/component: loki
{{- if .Values.observability.loki.persistence.enabled }}
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: {{ include "incidentops.fullname" . }}-loki
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: loki
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: {{ .Values.observability.loki.persistence.size }}
{{- end }}
{{- end }}

View File

@@ -30,9 +30,11 @@ spec:
- name: migrate
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
-          image: {{ include "incidentops.api.image" . }}
+          image: "{{ .Values.migration.image.repository }}:{{ .Values.migration.image.tag }}"
imagePullPolicy: {{ .Values.migration.image.pullPolicy }}
command:
- uv
- run
- python
- migrations/migrate.py
- apply

View File

@@ -0,0 +1,132 @@
{{- if .Values.observability.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-otel-collector-config
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: otel-collector
data:
otel-collector-config.yaml: |
extensions:
health_check:
endpoint: 0.0.0.0:13133
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
processors:
batch:
timeout: 1s
send_batch_size: 1024
memory_limiter:
check_interval: 1s
limit_mib: 512
spike_limit_mib: 128
exporters:
otlp/tempo:
endpoint: {{ include "incidentops.fullname" . }}-tempo:4317
tls:
insecure: true
loki:
endpoint: http://{{ include "incidentops.fullname" . }}-loki:3100/loki/api/v1/push
default_labels_enabled:
exporter: true
job: true
service:
extensions: [health_check]
pipelines:
traces:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [otlp/tempo]
logs:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [loki]
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "incidentops.fullname" . }}-otel-collector
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: otel-collector
spec:
replicas: {{ .Values.observability.otelCollector.replicaCount }}
selector:
matchLabels:
{{- include "incidentops.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: otel-collector
template:
metadata:
labels:
{{- include "incidentops.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: otel-collector
annotations:
checksum/config: {{ .Values.observability.otelCollector.image.tag | sha256sum }}
spec:
containers:
- name: otel-collector
image: "{{ .Values.observability.otelCollector.image.repository }}:{{ .Values.observability.otelCollector.image.tag }}"
imagePullPolicy: {{ .Values.observability.otelCollector.image.pullPolicy }}
args:
- --config=/etc/otel-collector/otel-collector-config.yaml
ports:
- name: otlp-grpc
containerPort: 4317
protocol: TCP
- name: otlp-http
containerPort: 4318
protocol: TCP
volumeMounts:
- name: config
mountPath: /etc/otel-collector
resources:
{{- toYaml .Values.observability.otelCollector.resources | nindent 12 }}
livenessProbe:
httpGet:
path: /
port: 13133
initialDelaySeconds: 10
periodSeconds: 30
readinessProbe:
httpGet:
path: /
port: 13133
initialDelaySeconds: 5
periodSeconds: 10
volumes:
- name: config
configMap:
name: {{ include "incidentops.fullname" . }}-otel-collector-config
---
apiVersion: v1
kind: Service
metadata:
name: {{ include "incidentops.fullname" . }}-otel-collector
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: otel-collector
spec:
type: ClusterIP
ports:
- name: otlp-grpc
port: 4317
targetPort: otlp-grpc
protocol: TCP
- name: otlp-http
port: 4318
targetPort: otlp-http
protocol: TCP
selector:
{{- include "incidentops.selectorLabels" . | nindent 4 }}
app.kubernetes.io/component: otel-collector
{{- end }}

View File

@@ -0,0 +1,163 @@
{{- if and .Values.observability.enabled .Values.metrics.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-prometheus
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: prometheus
data:
prometheus.yml: |
global:
scrape_interval: {{ .Values.observability.prometheus.scrapeInterval | default "15s" }}
evaluation_interval: 15s
scrape_configs:
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
- job_name: "incidentops-api"
kubernetes_sd_configs:
- role: pod
namespaces:
names:
- {{ .Release.Namespace }}
relabel_configs:
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
action: keep
regex: api
- source_labels: [__meta_kubernetes_pod_container_port_name]
action: keep
regex: metrics
- source_labels: [__meta_kubernetes_namespace]
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
target_label: pod
metrics_path: /metrics
scrape_interval: 10s
- job_name: "incidentops-worker"
kubernetes_sd_configs:
- role: pod
namespaces:
names:
- {{ .Release.Namespace }}
relabel_configs:
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
action: keep
regex: worker
- source_labels: [__meta_kubernetes_pod_container_port_name]
action: keep
regex: metrics
- source_labels: [__meta_kubernetes_namespace]
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
target_label: pod
metrics_path: /metrics
scrape_interval: 10s
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "incidentops.fullname" . }}-prometheus
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: prometheus
spec:
replicas: 1
selector:
matchLabels:
{{- include "incidentops.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: prometheus
template:
metadata:
labels:
{{- include "incidentops.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: prometheus
annotations:
checksum/config: {{ .Values.observability.prometheus.image.tag | sha256sum }}
spec:
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
securityContext:
fsGroup: 65534
runAsUser: 65534
runAsNonRoot: true
containers:
- name: prometheus
image: "{{ .Values.observability.prometheus.image.repository }}:{{ .Values.observability.prometheus.image.tag }}"
imagePullPolicy: {{ .Values.observability.prometheus.image.pullPolicy }}
args:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--storage.tsdb.retention.time={{ .Values.observability.prometheus.retention }}"
- "--web.enable-lifecycle"
ports:
- name: http
containerPort: 9090
protocol: TCP
volumeMounts:
- name: config
mountPath: /etc/prometheus
- name: data
mountPath: /prometheus
resources:
{{- toYaml .Values.observability.prometheus.resources | nindent 12 }}
readinessProbe:
httpGet:
path: /-/ready
port: http
initialDelaySeconds: 10
periodSeconds: 10
livenessProbe:
httpGet:
path: /-/healthy
port: http
initialDelaySeconds: 30
periodSeconds: 30
volumes:
- name: config
configMap:
name: {{ include "incidentops.fullname" . }}-prometheus
- name: data
{{- if .Values.observability.prometheus.persistence.enabled }}
persistentVolumeClaim:
claimName: {{ include "incidentops.fullname" . }}-prometheus
{{- else }}
emptyDir: {}
{{- end }}
---
apiVersion: v1
kind: Service
metadata:
name: {{ include "incidentops.fullname" . }}-prometheus
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: prometheus
spec:
type: ClusterIP
ports:
- name: http
port: 9090
targetPort: http
protocol: TCP
selector:
{{- include "incidentops.selectorLabels" . | nindent 4 }}
app.kubernetes.io/component: prometheus
{{- if .Values.observability.prometheus.persistence.enabled }}
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: {{ include "incidentops.fullname" . }}-prometheus
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: prometheus
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: {{ .Values.observability.prometheus.persistence.size }}
{{- end }}
{{- end }}

View File

@@ -0,0 +1,29 @@
{{- if and .Values.observability.enabled .Values.metrics.enabled }}
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: {{ include "incidentops.fullname" . }}-prometheus
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: prometheus
rules:
- apiGroups: [""]
resources: ["pods", "endpoints", "services"]
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: {{ include "incidentops.fullname" . }}-prometheus
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: prometheus
subjects:
- kind: ServiceAccount
name: {{ include "incidentops.serviceAccountName" . }}
namespace: {{ .Release.Namespace }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: {{ include "incidentops.fullname" . }}-prometheus
{{- end }}

View File

@@ -0,0 +1,169 @@
{{- if and .Values.observability.enabled .Values.observability.promtail.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-promtail-config
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: promtail
data:
promtail.yaml: |
server:
http_listen_port: 3101
grpc_listen_port: 0
positions:
filename: /run/promtail/positions.yaml
clients:
- url: http://{{ include "incidentops.fullname" . }}-loki:3100/loki/api/v1/push
scrape_configs:
- job_name: kubernetes-pods
pipeline_stages:
- cri: {}
kubernetes_sd_configs:
- role: pod
namespaces:
names: [{{ .Release.Namespace }}]
relabel_configs:
- source_labels: [__meta_kubernetes_pod_container_init]
regex: "true"
action: drop
- source_labels: [__meta_kubernetes_pod_phase]
regex: Pending|Failed|Succeeded
action: drop
- source_labels: [__meta_kubernetes_pod_name, __meta_kubernetes_pod_namespace, __meta_kubernetes_pod_container_name]
regex: (.+);(.+);(.+)
target_label: __path__
replacement: /var/log/containers/$1_$2_$3-*.log
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
regex: (.*)
target_label: service_name
replacement: {{ include "incidentops.fullname" . }}-$1
- source_labels: [__meta_kubernetes_pod_namespace]
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
target_label: pod
- source_labels: [__meta_kubernetes_pod_container_name]
target_label: container
- source_labels: [__meta_kubernetes_pod_uid]
target_label: pod_uid
- target_label: cluster
replacement: {{ .Release.Namespace }}
- job_name: containers-fallback
pipeline_stages:
- cri: {}
static_configs:
- labels:
job: containers
namespace: {{ .Release.Namespace }}
service_name: incidentops-api
__path__: /var/log/containers/incidentops-api-*_incidentops_api-*.log
- labels:
job: containers
namespace: {{ .Release.Namespace }}
service_name: incidentops-worker
__path__: /var/log/containers/incidentops-worker-*_incidentops_worker-*.log
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "incidentops.fullname" . }}-promtail
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: promtail
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ include "incidentops.fullname" . }}-promtail
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: promtail
rules:
- apiGroups: [""]
resources: ["pods", "pods/log", "namespaces", "services", "endpoints", "nodes"]
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: {{ include "incidentops.fullname" . }}-promtail
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: promtail
subjects:
- kind: ServiceAccount
name: {{ include "incidentops.fullname" . }}-promtail
namespace: {{ .Release.Namespace }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: {{ include "incidentops.fullname" . }}-promtail
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: {{ include "incidentops.fullname" . }}-promtail
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: promtail
spec:
selector:
matchLabels:
{{- include "incidentops.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: promtail
template:
metadata:
labels:
{{- include "incidentops.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: promtail
annotations:
checksum/config: {{ .Values.observability.promtail.image.tag | sha256sum }}
spec:
serviceAccountName: {{ include "incidentops.fullname" . }}-promtail
securityContext:
runAsUser: 0
containers:
- name: promtail
image: "{{ .Values.observability.promtail.image.repository }}:{{ .Values.observability.promtail.image.tag }}"
imagePullPolicy: {{ .Values.observability.promtail.image.pullPolicy }}
args:
- -config.file=/etc/promtail/promtail.yaml
ports:
- name: http-metrics
containerPort: 3101
protocol: TCP
volumeMounts:
- name: config
mountPath: /etc/promtail
- name: positions
mountPath: /run/promtail
- name: varlog
mountPath: /var/log
readOnly: true
- name: varlogpods
mountPath: /var/log/pods
readOnly: true
- name: varlogcontainers
mountPath: /var/log/containers
readOnly: true
resources:
{{- toYaml .Values.observability.promtail.resources | nindent 12 }}
volumes:
- name: config
configMap:
name: {{ include "incidentops.fullname" . }}-promtail-config
- name: positions
emptyDir: {}
- name: varlog
hostPath:
path: /var/log
- name: varlogpods
hostPath:
path: /var/log/pods
- name: varlogcontainers
hostPath:
path: /var/log/containers
{{- end }}

View File

@@ -0,0 +1,153 @@
{{- if .Values.observability.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-tempo-config
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: tempo
data:
tempo.yaml: |
server:
http_listen_port: 3200
distributor:
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
ingester:
trace_idle_period: 10s
max_block_bytes: 1048576
max_block_duration: 5m
compactor:
compaction:
block_retention: {{ .Values.observability.tempo.retention }}
storage:
trace:
backend: local
local:
path: /var/tempo/traces
wal:
path: /var/tempo/wal
querier:
search:
query_timeout: 30s
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "incidentops.fullname" . }}-tempo
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: tempo
spec:
replicas: 1
selector:
matchLabels:
{{- include "incidentops.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: tempo
template:
metadata:
labels:
{{- include "incidentops.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: tempo
annotations:
checksum/config: {{ .Values.observability.tempo.image.tag | sha256sum }}
spec:
containers:
- name: tempo
image: "{{ .Values.observability.tempo.image.repository }}:{{ .Values.observability.tempo.image.tag }}"
imagePullPolicy: {{ .Values.observability.tempo.image.pullPolicy }}
args:
- -config.file=/etc/tempo/tempo.yaml
ports:
- name: http
containerPort: 3200
protocol: TCP
- name: otlp-grpc
containerPort: 4317
protocol: TCP
- name: otlp-http
containerPort: 4318
protocol: TCP
volumeMounts:
- name: config
mountPath: /etc/tempo
- name: data
mountPath: /var/tempo
resources:
{{- toYaml .Values.observability.tempo.resources | nindent 12 }}
readinessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 10
periodSeconds: 10
livenessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 30
periodSeconds: 30
volumes:
- name: config
configMap:
name: {{ include "incidentops.fullname" . }}-tempo-config
- name: data
{{- if .Values.observability.tempo.persistence.enabled }}
persistentVolumeClaim:
claimName: {{ include "incidentops.fullname" . }}-tempo
{{- else }}
emptyDir: {}
{{- end }}
---
apiVersion: v1
kind: Service
metadata:
name: {{ include "incidentops.fullname" . }}-tempo
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: tempo
spec:
type: ClusterIP
ports:
- name: http
port: 3200
targetPort: http
protocol: TCP
- name: otlp-grpc
port: 4317
targetPort: otlp-grpc
protocol: TCP
- name: otlp-http
port: 4318
targetPort: otlp-http
protocol: TCP
selector:
{{- include "incidentops.selectorLabels" . | nindent 4 }}
app.kubernetes.io/component: tempo
{{- if .Values.observability.tempo.persistence.enabled }}
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: {{ include "incidentops.fullname" . }}-tempo
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: tempo
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: {{ .Values.observability.tempo.persistence.size }}
{{- end }}
{{- end }}

View File

@@ -29,6 +29,29 @@ spec:
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
initContainers:
- name: wait-for-postgres
image: busybox:1.36
command:
- sh
- -c
- |
until nc -z {{ include "incidentops.fullname" . }}-postgresql 5432; do
echo "Waiting for PostgreSQL..."
sleep 2
done
echo "PostgreSQL is ready"
- name: wait-for-redis
image: busybox:1.36
command:
- sh
- -c
- |
until nc -z {{ include "incidentops.fullname" . }}-redis 6379; do
echo "Waiting for Redis..."
sleep 2
done
echo "Redis is ready"
containers:
- name: worker
securityContext:
@@ -36,6 +59,8 @@ spec:
image: {{ include "incidentops.worker.image" . }}
imagePullPolicy: {{ .Values.worker.image.pullPolicy }}
command:
- uv
- run
- celery
- -A
- worker.celery_app
@@ -52,6 +77,8 @@ spec:
livenessProbe:
exec:
command:
- uv
- run
- celery
- -A
- worker.celery_app

View File

@@ -80,3 +80,63 @@ redis:
limits:
cpu: 1000m
memory: 1Gi
# Application configuration
config:
environment: production
logLevel: INFO
# Observability Stack - Production settings
observability:
enabled: true
otelCollector:
replicaCount: 2
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
tempo:
retention: "720h" # 30 days
persistence:
enabled: true
size: 50Gi
resources:
requests:
cpu: 250m
memory: 512Mi
limits:
cpu: 1000m
memory: 2Gi
loki:
retention: "720h" # 30 days
persistence:
enabled: true
size: 100Gi
resources:
requests:
cpu: 250m
memory: 512Mi
limits:
cpu: 1000m
memory: 2Gi
grafana:
adminPassword: "" # Set via external secret in production
service:
type: ClusterIP
persistence:
enabled: true
size: 5Gi
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi

View File

@@ -106,6 +106,8 @@ config:
jwtAlgorithm: HS256
accessTokenExpireMinutes: 30
refreshTokenExpireDays: 30
environment: development
logLevel: INFO
# Secrets (use external secrets in production)
secrets:
@@ -161,3 +163,117 @@ podSecurityContext:
securityContext:
runAsNonRoot: true
runAsUser: 1000
# Observability Stack (Grafana + Loki + Tempo + OpenTelemetry Collector)
observability:
enabled: true
otelCollector:
replicaCount: 1
image:
repository: otel/opentelemetry-collector-contrib
tag: "0.96.0"
pullPolicy: IfNotPresent
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
tempo:
image:
repository: grafana/tempo
tag: "2.4.1"
pullPolicy: IfNotPresent
retention: "168h" # 7 days
persistence:
enabled: false
size: 10Gi
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
loki:
image:
repository: grafana/loki
tag: "2.9.6"
pullPolicy: IfNotPresent
retention: "168h" # 7 days
persistence:
enabled: false
size: 10Gi
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
prometheus:
image:
repository: prom/prometheus
tag: "v2.51.0"
pullPolicy: IfNotPresent
retention: "15d"
scrapeInterval: "15s"
persistence:
enabled: false
size: 10Gi
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
grafana:
image:
repository: grafana/grafana
tag: "10.4.1"
pullPolicy: IfNotPresent
adminUser: admin
adminPassword: "admin" # Change in production!
service:
type: ClusterIP
ingress:
enabled: false
host: grafana.incidentops.local
annotations: {}
tls: []
persistence:
enabled: false
size: 1Gi
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
promtail:
enabled: true
image:
repository: grafana/promtail
tag: "2.9.6"
pullPolicy: IfNotPresent
resources:
requests:
cpu: 25m
memory: 64Mi
limits:
cpu: 200m
memory: 256Mi
# Metrics configuration
metrics:
enabled: true
port: 9464