feat: add observability stack and background task infrastructure
Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
This commit is contained in:
@@ -41,6 +41,7 @@ services:
|
||||
container_name: incidentops-api
|
||||
ports:
|
||||
- "8000:8000"
|
||||
- "9464:9464" # Prometheus metrics
|
||||
environment:
|
||||
DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops
|
||||
REDIS_URL: redis://redis:6379/0
|
||||
@@ -48,11 +49,24 @@ services:
|
||||
JWT_ALGORITHM: HS256
|
||||
ACCESS_TOKEN_EXPIRE_MINUTES: 30
|
||||
REFRESH_TOKEN_EXPIRE_DAYS: 30
|
||||
# OpenTelemetry
|
||||
OTEL_ENABLED: "true"
|
||||
OTEL_SERVICE_NAME: incidentops-api
|
||||
OTEL_ENVIRONMENT: development
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
|
||||
OTEL_EXPORTER_OTLP_INSECURE: "true"
|
||||
OTEL_LOG_LEVEL: INFO
|
||||
# Metrics
|
||||
PROMETHEUS_PORT: "9464"
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
redis:
|
||||
condition: service_healthy
|
||||
otel-collector:
|
||||
condition: service_started
|
||||
prometheus:
|
||||
condition: service_started
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8000/v1/healthz"]
|
||||
interval: 30s
|
||||
@@ -72,6 +86,12 @@ services:
|
||||
REDIS_URL: redis://redis:6379/0
|
||||
CELERY_BROKER_URL: redis://redis:6379/0
|
||||
CELERY_RESULT_BACKEND: redis://redis:6379/1
|
||||
# OpenTelemetry
|
||||
OTEL_ENABLED: "true"
|
||||
OTEL_SERVICE_NAME: incidentops-worker
|
||||
OTEL_ENVIRONMENT: development
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
|
||||
OTEL_EXPORTER_OTLP_INSECURE: "true"
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
@@ -121,9 +141,89 @@ services:
|
||||
profiles:
|
||||
- monitoring
|
||||
|
||||
# ============================================
|
||||
# Observability Stack
|
||||
# ============================================
|
||||
|
||||
# OpenTelemetry Collector - receives traces/logs from apps.
# Publishes both OTLP listeners on the host; the app containers in this file
# point OTEL_EXPORTER_OTLP_ENDPOINT at the gRPC one (otel-collector:4317).
otel-collector:
  image: otel/opentelemetry-collector-contrib:0.96.0
  container_name: incidentops-otel-collector
  command: ["--config=/etc/otel-collector/config.yaml"]
  volumes:
    # Collector configuration from the repo, mounted read-only.
    - ./observability/otel-collector/config.yaml:/etc/otel-collector/config.yaml:ro
  ports:
    - "4317:4317"  # OTLP gRPC
    - "4318:4318"  # OTLP HTTP
  # Short-form depends_on: only start ordering, no health gating.
  # NOTE(review): presumably the collector exports to tempo and loki - confirm
  # against observability/otel-collector/config.yaml.
  depends_on:
    - tempo
    - loki
|
||||
|
||||
# Tempo - distributed tracing backend.
tempo:
  image: grafana/tempo:2.4.1
  container_name: incidentops-tempo
  command: ["-config.file=/etc/tempo/config.yaml"]
  volumes:
    # Tempo configuration from the repo, mounted read-only.
    - ./observability/tempo/config.yaml:/etc/tempo/config.yaml:ro
    # Named volume mounted at /var/tempo so its contents survive container
    # recreation.
    - tempo_data:/var/tempo
  ports:
    - "3200:3200"  # Tempo HTTP
    # Tempo OTLP gRPC. Host port 4317 is already published by otel-collector,
    # so the container's 4317 is exposed on host port 4320 instead.
    - "4320:4317"
|
||||
|
||||
# Loki - log aggregation.
loki:
  image: grafana/loki:2.9.6
  container_name: incidentops-loki
  command: ["-config.file=/etc/loki/config.yaml"]
  volumes:
    # Loki configuration from the repo, mounted read-only.
    - ./observability/loki/config.yaml:/etc/loki/config.yaml:ro
    # Named volume mounted at /loki so its contents survive container
    # recreation.
    - loki_data:/loki
  ports:
    - "3100:3100"  # Loki HTTP
|
||||
|
||||
# Prometheus - metrics storage.
prometheus:
  image: prom/prometheus:v2.51.0
  container_name: incidentops-prometheus
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    # TSDB directory; matches the mount point of the prometheus_data volume.
    - "--storage.tsdb.path=/prometheus"
    # Enables the HTTP reload/shutdown endpoints. Port 9090 is published on
    # the host below, so keep this to development setups.
    - "--web.enable-lifecycle"
  volumes:
    # Scrape configuration from the repo, mounted read-only.
    - ./observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
    - prometheus_data:/prometheus
  ports:
    - "9090:9090"  # Prometheus UI
|
||||
|
||||
# Grafana - visualization.
grafana:
  image: grafana/grafana:10.4.1
  container_name: incidentops-grafana
  environment:
    # Admin credentials can be overridden from the shell or an .env file via
    # GRAFANA_ADMIN_USER / GRAFANA_ADMIN_PASSWORD. When unset they fall back to
    # admin/admin, which is only acceptable for local development.
    GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}"
    GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-admin}"
    GF_USERS_ALLOW_SIGN_UP: "false"
    GF_EXPLORE_ENABLED: "true"
    # Space-separated list of Grafana feature toggles (all Tempo/TraceQL
    # related).
    GF_FEATURE_TOGGLES_ENABLE: traceqlEditor tempoSearch tempoBackendSearch tempoApmTable
  volumes:
    # Provisioning files and dashboards from the repo, mounted read-only.
    - ./observability/grafana/provisioning:/etc/grafana/provisioning:ro
    - ./observability/grafana/dashboards:/var/lib/grafana/dashboards:ro
    # Named volume for Grafana's own state under /var/lib/grafana.
    - grafana_data:/var/lib/grafana
  ports:
    - "3001:3000"  # Grafana UI (3001 to avoid conflict with web frontend)
  # Short-form depends_on: only start ordering, no health gating.
  depends_on:
    - tempo
    - loki
    - prometheus
|
||||
|
||||
# Named volumes. Each entry is intentionally left empty so Compose creates it
# with the default driver and options.
volumes:
  postgres_data:
  redis_data:
  tempo_data:  # mounted at /var/tempo by the tempo service
  loki_data:  # mounted at /loki by the loki service
  prometheus_data:  # mounted at /prometheus by the prometheus service
  grafana_data:  # mounted at /var/lib/grafana by the grafana service
|
||||
|
||||
networks:
|
||||
default:
|
||||
|
||||
Reference in New Issue
Block a user