feat: add observability stack and background task infrastructure
Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
This commit is contained in:
@@ -27,14 +27,15 @@ build:
|
||||
- src: "worker/**/*.py"
|
||||
dest: /app
|
||||
|
||||
- image: incidentops/web
|
||||
docker:
|
||||
dockerfile: Dockerfile.web
|
||||
context: .
|
||||
sync:
|
||||
manual:
|
||||
- src: "web/src/**/*"
|
||||
dest: /app
|
||||
# Web frontend disabled until implemented
|
||||
# - image: incidentops/web
|
||||
# docker:
|
||||
# dockerfile: Dockerfile.web
|
||||
# context: .
|
||||
# sync:
|
||||
# manual:
|
||||
# - src: "web/src/**/*"
|
||||
# dest: /app
|
||||
|
||||
local:
|
||||
push: false
|
||||
@@ -48,12 +49,15 @@ deploy:
|
||||
valuesFiles:
|
||||
- helm/incidentops/values.yaml
|
||||
setValues:
|
||||
api.image.repository: incidentops/api
|
||||
api.image.tag: ""
|
||||
worker.image.repository: incidentops/worker
|
||||
worker.image.tag: ""
|
||||
web.image.repository: incidentops/web
|
||||
web.image.tag: ""
|
||||
web.replicaCount: 0 # Disabled until frontend is implemented
|
||||
migration.enabled: true
|
||||
setValueTemplates:
|
||||
api.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
|
||||
api.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
|
||||
worker.image.repository: "{{.IMAGE_REPO_incidentops_worker}}"
|
||||
worker.image.tag: "{{.IMAGE_TAG_incidentops_worker}}"
|
||||
migration.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
|
||||
migration.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
|
||||
createNamespace: true
|
||||
namespace: incidentops
|
||||
|
||||
@@ -74,13 +78,15 @@ profiles:
|
||||
setValues:
|
||||
api.replicaCount: 1
|
||||
worker.replicaCount: 1
|
||||
web.replicaCount: 1
|
||||
api.image.repository: incidentops/api
|
||||
api.image.tag: ""
|
||||
worker.image.repository: incidentops/worker
|
||||
worker.image.tag: ""
|
||||
web.image.repository: incidentops/web
|
||||
web.image.tag: ""
|
||||
web.replicaCount: 0 # Disabled until frontend is implemented
|
||||
migration.enabled: true
|
||||
setValueTemplates:
|
||||
api.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
|
||||
api.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
|
||||
worker.image.repository: "{{.IMAGE_REPO_incidentops_worker}}"
|
||||
worker.image.tag: "{{.IMAGE_TAG_incidentops_worker}}"
|
||||
migration.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
|
||||
migration.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
|
||||
createNamespace: true
|
||||
namespace: incidentops
|
||||
|
||||
@@ -115,8 +121,30 @@ portForward:
|
||||
namespace: incidentops
|
||||
port: 8000
|
||||
localPort: 8000
|
||||
# Web frontend disabled until implemented
|
||||
# - resourceType: service
|
||||
# resourceName: incidentops-web
|
||||
# namespace: incidentops
|
||||
# port: 3000
|
||||
# localPort: 3000
|
||||
# Observability
|
||||
- resourceType: service
|
||||
resourceName: incidentops-web
|
||||
resourceName: incidentops-grafana
|
||||
namespace: incidentops
|
||||
port: 3000
|
||||
localPort: 3000
|
||||
port: 80
|
||||
localPort: 3001
|
||||
- resourceType: service
|
||||
resourceName: incidentops-prometheus
|
||||
namespace: incidentops
|
||||
port: 9090
|
||||
localPort: 9090
|
||||
- resourceType: service
|
||||
resourceName: incidentops-tempo
|
||||
namespace: incidentops
|
||||
port: 3200
|
||||
localPort: 3200
|
||||
- resourceType: service
|
||||
resourceName: incidentops-loki
|
||||
namespace: incidentops
|
||||
port: 3100
|
||||
localPort: 3100
|
||||
|
||||
Reference in New Issue
Block a user