feat: add observability stack and background task infrastructure

Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
This commit is contained in:
2026-01-07 20:51:13 -05:00
parent f427d191e0
commit 46ede7757d
45 changed files with 3742 additions and 76 deletions

View File

@@ -41,6 +41,7 @@ services:
container_name: incidentops-api
ports:
- "8000:8000"
- "9464:9464" # Prometheus metrics
environment:
DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops
REDIS_URL: redis://redis:6379/0
@@ -48,11 +49,24 @@ services:
JWT_ALGORITHM: HS256
ACCESS_TOKEN_EXPIRE_MINUTES: 30
REFRESH_TOKEN_EXPIRE_DAYS: 30
# OpenTelemetry
OTEL_ENABLED: "true"
OTEL_SERVICE_NAME: incidentops-api
OTEL_ENVIRONMENT: development
OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
OTEL_EXPORTER_OTLP_INSECURE: "true"
OTEL_LOG_LEVEL: INFO
# Metrics
PROMETHEUS_PORT: "9464"
depends_on:
postgres:
condition: service_healthy
redis:
condition: service_healthy
otel-collector:
condition: service_started
prometheus:
condition: service_started
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/v1/healthz"]
interval: 30s
@@ -72,6 +86,12 @@ services:
REDIS_URL: redis://redis:6379/0
CELERY_BROKER_URL: redis://redis:6379/0
CELERY_RESULT_BACKEND: redis://redis:6379/1
# OpenTelemetry
OTEL_ENABLED: "true"
OTEL_SERVICE_NAME: incidentops-worker
OTEL_ENVIRONMENT: development
OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
OTEL_EXPORTER_OTLP_INSECURE: "true"
depends_on:
postgres:
condition: service_healthy
@@ -121,9 +141,89 @@ services:
profiles:
- monitoring
# ============================================
# Observability Stack
# ============================================
# OpenTelemetry Collector: OTLP ingest point for traces/logs sent by the apps.
otel-collector:
  image: otel/opentelemetry-collector-contrib:0.96.0
  container_name: incidentops-otel-collector
  command:
    - '--config=/etc/otel-collector/config.yaml'
  volumes:
    # Collector configuration, mounted read-only from the repository.
    - ./observability/otel-collector/config.yaml:/etc/otel-collector/config.yaml:ro
  ports:
    # OTLP gRPC
    - '4317:4317'
    # OTLP HTTP
    - '4318:4318'
  # Started after tempo and loki (start ordering only, no health condition).
  depends_on:
    - tempo
    - loki
# Tempo: backend for distributed traces.
tempo:
  image: grafana/tempo:2.4.1
  container_name: incidentops-tempo
  command:
    - '-config.file=/etc/tempo/config.yaml'
  volumes:
    # Read-only configuration plus a named volume mounted at /var/tempo.
    - ./observability/tempo/config.yaml:/etc/tempo/config.yaml:ro
    - tempo_data:/var/tempo
  ports:
    # Tempo HTTP
    - '3200:3200'
    # Tempo OTLP gRPC, published on host port 4320 so it does not clash
    # with the 4317 host port used by otel-collector.
    - '4320:4317'
# Loki: log aggregation backend.
loki:
  image: grafana/loki:2.9.6
  container_name: incidentops-loki
  command:
    - '-config.file=/etc/loki/config.yaml'
  volumes:
    # Read-only configuration plus a named volume mounted at /loki.
    - ./observability/loki/config.yaml:/etc/loki/config.yaml:ro
    - loki_data:/loki
  ports:
    # Loki HTTP
    - '3100:3100'
# Prometheus: metrics storage.
prometheus:
  image: prom/prometheus:v2.51.0
  container_name: incidentops-prometheus
  # Flags: config file location, TSDB directory (backed by the named volume
  # below), and the HTTP lifecycle endpoints.
  command:
    - '--config.file=/etc/prometheus/prometheus.yml'
    - '--storage.tsdb.path=/prometheus'
    - '--web.enable-lifecycle'
  volumes:
    - ./observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
    - prometheus_data:/prometheus
  ports:
    # Prometheus UI
    - '9090:9090'
# Grafana: visualization UI on top of the tracing, logging and metrics services.
grafana:
  image: grafana/grafana:10.4.1
  container_name: incidentops-grafana
  environment:
    # Admin credentials are taken from the host environment (or a .env file)
    # via Compose interpolation. When GRAFANA_ADMIN_USER /
    # GRAFANA_ADMIN_PASSWORD are unset or empty, the previous admin/admin
    # values are used, so existing local setups keep working unchanged.
    GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}"
    GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-admin}"
    # Quoted so the values stay strings rather than YAML booleans.
    GF_USERS_ALLOW_SIGN_UP: "false"
    GF_EXPLORE_ENABLED: "true"
    # Space-separated list of feature toggles.
    GF_FEATURE_TOGGLES_ENABLE: traceqlEditor tempoSearch tempoBackendSearch tempoApmTable
  volumes:
    # Provisioning files and dashboards are mounted read-only from the repo;
    # Grafana's own data directory lives in a named volume.
    - ./observability/grafana/provisioning:/etc/grafana/provisioning:ro
    - ./observability/grafana/dashboards:/var/lib/grafana/dashboards:ro
    - grafana_data:/var/lib/grafana
  ports:
    # Grafana UI (3001 to avoid conflict with web frontend)
    - "3001:3000"
  # Started after the three backend services (start ordering only).
  depends_on:
    - tempo
    - loki
    - prometheus
# Named volumes, each declared with an explicit empty mapping (no custom
# driver or options).
volumes:
  postgres_data: {}
  redis_data: {}
  # Mounted at /var/tempo by the tempo service.
  tempo_data: {}
  # Mounted at /loki by the loki service.
  loki_data: {}
  # Mounted at /prometheus by the prometheus service.
  prometheus_data: {}
  # Mounted at /var/lib/grafana by the grafana service.
  grafana_data: {}
networks:
default: