feat: add observability stack and background task infrastructure

Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces, plus a Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
This commit is contained in:
2026-01-07 20:51:13 -05:00
parent f427d191e0
commit 46ede7757d
45 changed files with 3742 additions and 76 deletions

View File

@@ -27,14 +27,15 @@ build:
- src: "worker/**/*.py"
dest: /app
- image: incidentops/web
docker:
dockerfile: Dockerfile.web
context: .
sync:
manual:
- src: "web/src/**/*"
dest: /app
# Web frontend disabled until implemented
# - image: incidentops/web
# docker:
# dockerfile: Dockerfile.web
# context: .
# sync:
# manual:
# - src: "web/src/**/*"
# dest: /app
local:
push: false
@@ -48,12 +49,15 @@ deploy:
valuesFiles:
- helm/incidentops/values.yaml
setValues:
api.image.repository: incidentops/api
api.image.tag: ""
worker.image.repository: incidentops/worker
worker.image.tag: ""
web.image.repository: incidentops/web
web.image.tag: ""
web.replicaCount: 0 # Disabled until frontend is implemented
migration.enabled: true
setValueTemplates:
api.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
api.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
worker.image.repository: "{{.IMAGE_REPO_incidentops_worker}}"
worker.image.tag: "{{.IMAGE_TAG_incidentops_worker}}"
migration.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
migration.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
createNamespace: true
namespace: incidentops
@@ -74,13 +78,15 @@ profiles:
setValues:
api.replicaCount: 1
worker.replicaCount: 1
web.replicaCount: 1
api.image.repository: incidentops/api
api.image.tag: ""
worker.image.repository: incidentops/worker
worker.image.tag: ""
web.image.repository: incidentops/web
web.image.tag: ""
web.replicaCount: 0 # Disabled until frontend is implemented
migration.enabled: true
setValueTemplates:
api.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
api.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
worker.image.repository: "{{.IMAGE_REPO_incidentops_worker}}"
worker.image.tag: "{{.IMAGE_TAG_incidentops_worker}}"
migration.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
migration.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
createNamespace: true
namespace: incidentops
@@ -115,8 +121,30 @@ portForward:
namespace: incidentops
port: 8000
localPort: 8000
# Web frontend disabled until implemented
# - resourceType: service
# resourceName: incidentops-web
# namespace: incidentops
# port: 3000
# localPort: 3000
# Observability
- resourceType: service
resourceName: incidentops-web
resourceName: incidentops-grafana
namespace: incidentops
port: 3000
localPort: 3000
port: 80
localPort: 3001
- resourceType: service
resourceName: incidentops-prometheus
namespace: incidentops
port: 9090
localPort: 9090
- resourceType: service
resourceName: incidentops-tempo
namespace: incidentops
port: 3200
localPort: 3200
- resourceType: service
resourceName: incidentops-loki
namespace: incidentops
port: 3100
localPort: 3100