feat: add observability stack and background task infrastructure

Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint
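
The trace-context correlation in the structured logs can be sketched with a stdlib-only JSON formatter. This is a minimal illustration, not the actual implementation: a `contextvars.ContextVar` stands in for the real OpenTelemetry span context, and the field names (`trace_id`, `span_id`) are assumptions:

```python
import contextvars
import json
import logging

# Stand-in for the active OpenTelemetry span context (illustrative only).
current_trace: contextvars.ContextVar[dict] = contextvars.ContextVar(
    "current_trace", default={}
)


class JsonTraceFormatter(logging.Formatter):
    """Render each log record as one JSON object with trace/span IDs attached."""

    def format(self, record: logging.LogRecord) -> str:
        ctx = current_trace.get()
        payload = {
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
            "trace_id": ctx.get("trace_id"),
            "span_id": ctx.get("span_id"),
        }
        return json.dumps(payload)


handler = logging.StreamHandler()
handler.setFormatter(JsonTraceFormatter())
logger = logging.getLogger("app")
logger.addHandler(handler)
logger.setLevel(logging.INFO)

# With a "span" active, every log line carries the correlation IDs.
current_trace.set({"trace_id": "4bf92f3577b34da6", "span_id": "00f067aa0ba902b7"})
logger.info("incident created")
```

In the real stack the IDs would come from the current OpenTelemetry span rather than a manually set contextvar, which is what lets Loki logs link to Tempo traces.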

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing
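
A minimal sketch of what the pluggable TaskQueue abstraction with an in-memory driver could look like. The method names mirror the ones used in the diff (`incident_triggered`, `schedule_escalation_check`), but the `Protocol` shape and the recorded-call format are assumptions for illustration:

```python
from typing import Protocol


class TaskQueue(Protocol):
    """Interface the incident service depends on; drivers are swappable."""

    def incident_triggered(
        self, *, incident_id: str, org_id: str, triggered_by: str
    ) -> None: ...

    def schedule_escalation_check(
        self, *, incident_id: str, org_id: str, delay_seconds: int
    ) -> None: ...


class InMemoryTaskQueue:
    """Test driver: records every enqueued task instead of dispatching it."""

    def __init__(self) -> None:
        self.enqueued: list[tuple[str, dict]] = []

    def incident_triggered(
        self, *, incident_id: str, org_id: str, triggered_by: str
    ) -> None:
        self.enqueued.append(("incident_triggered", {
            "incident_id": incident_id,
            "org_id": org_id,
            "triggered_by": triggered_by,
        }))

    def schedule_escalation_check(
        self, *, incident_id: str, org_id: str, delay_seconds: int
    ) -> None:
        self.enqueued.append(("escalation_check", {
            "incident_id": incident_id,
            "org_id": org_id,
            "delay_seconds": delay_seconds,
        }))


# In tests, the service is constructed with this driver and assertions
# run against `queue.enqueued` instead of a live Celery broker.
queue = InMemoryTaskQueue()
queue.incident_triggered(incident_id="inc-1", org_id="org-1", triggered_by="user-1")
queue.schedule_escalation_check(incident_id="inc-1", org_id="org-1", delay_seconds=300)
```

The production driver would enqueue Celery tasks against the Redis/SQS broker behind the same interface.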

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
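
The request-logging middleware with timing can be sketched as a framework-agnostic ASGI wrapper. This is a stdlib-only illustration of the timing pattern, not the actual implementation; the log format is an assumption:

```python
import asyncio
import time


class RequestTimingMiddleware:
    """ASGI middleware that times each request and logs method, path, status, duration."""

    def __init__(self, app):
        self.app = app

    async def __call__(self, scope, receive, send):
        if scope["type"] != "http":
            await self.app(scope, receive, send)
            return
        start = time.perf_counter()
        status_holder = {}

        async def send_wrapper(message):
            # Capture the status code as the response starts.
            if message["type"] == "http.response.start":
                status_holder["status"] = message["status"]
            await send(message)

        await self.app(scope, receive, send_wrapper)
        duration_ms = (time.perf_counter() - start) * 1000
        print(f'{scope["method"]} {scope["path"]} '
              f'{status_holder.get("status")} {duration_ms:.1f}ms')


# Demo: exercise the middleware against a trivial ASGI app.
async def health_app(scope, receive, send):
    await send({"type": "http.response.start", "status": 200, "headers": []})
    await send({"type": "http.response.body", "body": b"ok"})

sent: list[dict] = []

async def _send(message):
    sent.append(message)

async def _receive():
    return {"type": "http.request"}

asyncio.run(RequestTimingMiddleware(health_app)(
    {"type": "http", "method": "GET", "path": "/healthz"}, _receive, _send
))
```

In the real service the `print` would be a structured log call (so the timing lands in Loki alongside the trace context), and the middleware would wrap the FastAPI app at startup.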
2026-01-07 20:51:13 -05:00
parent f427d191e0
commit 46ede7757d
45 changed files with 3742 additions and 76 deletions


@@ -10,6 +10,7 @@ import asyncpg
 from asyncpg.pool import PoolConnectionProxy
 from app.api.deps import CurrentUser, ensure_org_access
 from app.config import settings
+from app.core import exceptions as exc
 from app.db import Database, db
 from app.repositories import IncidentRepository, ServiceRepository
@@ -21,7 +22,8 @@ from app.schemas.incident import (
     IncidentResponse,
     TransitionRequest,
 )
+from app.taskqueue import TaskQueue
+from app.taskqueue import task_queue as default_task_queue

 _ALLOWED_TRANSITIONS: dict[str, set[str]] = {
     "triggered": {"acknowledged"},
@@ -40,8 +42,19 @@ def _as_conn(conn: asyncpg.Connection | PoolConnectionProxy) -> asyncpg.Connecti
 class IncidentService:
     """Encapsulates incident lifecycle operations within an org context."""

-    def __init__(self, database: Database | None = None) -> None:
+    def __init__(
+        self,
+        database: Database | None = None,
+        task_queue: TaskQueue | None = None,
+        escalation_delay_seconds: int | None = None,
+    ) -> None:
         self.db = database or db
+        self.task_queue = task_queue or default_task_queue
+        self.escalation_delay_seconds = (
+            escalation_delay_seconds
+            if escalation_delay_seconds is not None
+            else settings.notification_escalation_delay_seconds
+        )

     async def create_incident(
         self,
@@ -83,7 +96,22 @@ class IncidentService:
            },
        )
-        return IncidentResponse(**incident)
+        incident_response = IncidentResponse(**incident)
+
+        self.task_queue.incident_triggered(
+            incident_id=incident_response.id,
+            org_id=current_user.org_id,
+            triggered_by=current_user.user_id,
+        )
+        if self.escalation_delay_seconds > 0:
+            self.task_queue.schedule_escalation_check(
+                incident_id=incident_response.id,
+                org_id=current_user.org_id,
+                delay_seconds=self.escalation_delay_seconds,
+            )
+        return incident_response

     async def get_incidents(
         self,