feat: add observability stack and background task infrastructure
Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis (setup sketched below)
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing (sketched after the diff below)

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema (sketched below)
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
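As a rough illustration of the auto-instrumentation wiring described in the first group of bullets, a setup helper might look like the sketch below. The function name, service name, collector endpoint, and module placement are assumptions; the instrumentor classes are the standard opentelemetry-instrumentation-* packages.

    # Hypothetical sketch, not the actual telemetry module. The instrumentor
    # classes are the standard opentelemetry-instrumentation-* packages; the
    # function name, service name, and collector endpoint are assumptions.
    from fastapi import FastAPI
    from opentelemetry import trace
    from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
    from opentelemetry.instrumentation.asyncpg import AsyncPGInstrumentor
    from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
    from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
    from opentelemetry.instrumentation.redis import RedisInstrumentor
    from opentelemetry.sdk.resources import Resource
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import BatchSpanProcessor

    def setup_tracing(app: FastAPI, otlp_endpoint: str = "http://otel-collector:4317") -> None:
        # Send spans over OTLP/gRPC to the collector deployed by the Helm charts.
        provider = TracerProvider(resource=Resource.create({"service.name": "api"}))
        provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter(endpoint=otlp_endpoint)))
        trace.set_tracer_provider(provider)

        # Auto-instrument the libraries named in the commit message.
        FastAPIInstrumentor.instrument_app(app)
        AsyncPGInstrumentor().instrument()
        HTTPXClientInstrumentor().instrument()
        RedisInstrumentor().instrument()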
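The "global exception handlers with structured ErrorResponse schema" item could be wired roughly as follows. The ErrorResponse fields and handler body are assumptions; only the FastAPI exception-handler registration pattern is standard.

    # Hypothetical sketch of a global exception handler; the ErrorResponse fields
    # and handler body are assumptions, only the registration pattern is FastAPI's.
    from fastapi import FastAPI, Request
    from fastapi.responses import JSONResponse
    from pydantic import BaseModel

    class ErrorResponse(BaseModel):
        code: str
        message: str

    app = FastAPI()

    @app.exception_handler(Exception)
    async def unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse:
        # Return a consistent JSON body instead of the framework's default plain 500.
        body = ErrorResponse(code="internal_error", message="An unexpected error occurred.")
        return JSONResponse(status_code=500, content=body.model_dump())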
@@ -10,6 +10,7 @@ import asyncpg
 from asyncpg.pool import PoolConnectionProxy
 
 from app.api.deps import CurrentUser, ensure_org_access
+from app.config import settings
 from app.core import exceptions as exc
 from app.db import Database, db
 from app.repositories import IncidentRepository, ServiceRepository
@@ -21,7 +22,8 @@ from app.schemas.incident import (
     IncidentResponse,
     TransitionRequest,
 )
-
+from app.taskqueue import TaskQueue
+from app.taskqueue import task_queue as default_task_queue
 
 _ALLOWED_TRANSITIONS: dict[str, set[str]] = {
     "triggered": {"acknowledged"},
@@ -40,8 +42,19 @@ def _as_conn(conn: asyncpg.Connection | PoolConnectionProxy) -> asyncpg.Connecti
 class IncidentService:
     """Encapsulates incident lifecycle operations within an org context."""
 
-    def __init__(self, database: Database | None = None) -> None:
+    def __init__(
+        self,
+        database: Database | None = None,
+        task_queue: TaskQueue | None = None,
+        escalation_delay_seconds: int | None = None,
+    ) -> None:
         self.db = database or db
+        self.task_queue = task_queue or default_task_queue
+        self.escalation_delay_seconds = (
+            escalation_delay_seconds
+            if escalation_delay_seconds is not None
+            else settings.notification_escalation_delay_seconds
+        )
 
     async def create_incident(
         self,
@@ -83,7 +96,22 @@ class IncidentService:
             },
         )
 
-        return IncidentResponse(**incident)
+        incident_response = IncidentResponse(**incident)
+
+        self.task_queue.incident_triggered(
+            incident_id=incident_response.id,
+            org_id=current_user.org_id,
+            triggered_by=current_user.user_id,
+        )
+
+        if self.escalation_delay_seconds > 0:
+            self.task_queue.schedule_escalation_check(
+                incident_id=incident_response.id,
+                org_id=current_user.org_id,
+                delay_seconds=self.escalation_delay_seconds,
+            )
+
+        return incident_response
 
     async def get_incidents(
         self,
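The diff calls self.task_queue.incident_triggered(...) and self.task_queue.schedule_escalation_check(...), so the pluggable TaskQueue abstraction from the commit message plausibly has a shape like the sketch below. The module path, the Protocol/in-memory split, and the ID types are assumptions; the two method names and their keyword arguments come from the diff.

    # Minimal sketch of a pluggable TaskQueue interface plus an in-memory test
    # driver. Module path, Protocol/in-memory split, and ID types are assumptions;
    # the method names and keyword arguments come from the diff above.
    from typing import Any, Protocol

    class TaskQueue(Protocol):
        """What IncidentService depends on, so the Celery-backed queue can be swapped out."""

        def incident_triggered(self, incident_id: Any, org_id: Any, triggered_by: Any) -> None: ...

        def schedule_escalation_check(self, incident_id: Any, org_id: Any, delay_seconds: int) -> None: ...

    class InMemoryTaskQueue:
        """Test driver: records calls instead of enqueuing Celery tasks."""

        def __init__(self) -> None:
            self.calls: list[tuple[str, dict[str, Any]]] = []

        def incident_triggered(self, incident_id: Any, org_id: Any, triggered_by: Any) -> None:
            self.calls.append(("incident_triggered", {"incident_id": incident_id, "org_id": org_id, "triggered_by": triggered_by}))

        def schedule_escalation_check(self, incident_id: Any, org_id: Any, delay_seconds: int) -> None:
            self.calls.append(("escalation_check", {"incident_id": incident_id, "org_id": org_id, "delay_seconds": delay_seconds}))

A test could then construct IncidentService(task_queue=InMemoryTaskQueue(), escalation_delay_seconds=60) and assert on the recorded calls without a running broker, which matches the constructor parameters added in the diff.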