Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
248 lines
8.5 KiB
Python
248 lines
8.5 KiB
Python
"""Incident service implementing incident lifecycle operations."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import datetime
|
|
from typing import cast
|
|
from uuid import UUID, uuid4
|
|
|
|
import asyncpg
|
|
from asyncpg.pool import PoolConnectionProxy
|
|
|
|
from app.api.deps import CurrentUser, ensure_org_access
|
|
from app.config import settings
|
|
from app.core import exceptions as exc
|
|
from app.db import Database, db
|
|
from app.repositories import IncidentRepository, ServiceRepository
|
|
from app.schemas.common import PaginatedResponse
|
|
from app.schemas.incident import (
|
|
CommentRequest,
|
|
IncidentCreate,
|
|
IncidentEventResponse,
|
|
IncidentResponse,
|
|
TransitionRequest,
|
|
)
|
|
from app.taskqueue import TaskQueue
|
|
from app.taskqueue import task_queue as default_task_queue
|
|
|
|
# Incident status state machine: each key is a current status and its value is
# the set of statuses the incident may move to next. Every status has at most
# one successor, and "resolved" has no outgoing transitions.
_ALLOWED_TRANSITIONS: dict[str, set[str]] = {
    "triggered": {"acknowledged"},
    "acknowledged": {"mitigated"},
    "mitigated": {"resolved"},
    "resolved": set(),  # terminal status
}
|
|
|
|
|
|
def _as_conn(conn: asyncpg.Connection | PoolConnectionProxy) -> asyncpg.Connection:
    """Present a pooled connection proxy as ``asyncpg.Connection`` to type checkers.

    This is a static-typing aid only: ``typing.cast`` hands back the very same
    object at runtime, so no conversion or validation takes place.
    """
    # The target type is quoted so the cast needs no runtime attribute lookup.
    return cast("asyncpg.Connection", conn)
|
|
|
|
|
|
class IncidentService:
    """Encapsulates incident lifecycle operations within an org context.

    Every public coroutine receives the authenticated ``CurrentUser`` and hands
    the owning ``org_id`` of the touched resource to ``ensure_org_access``
    before reading or mutating it.
    """

    def __init__(
        self,
        database: Database | None = None,
        task_queue: TaskQueue | None = None,
        escalation_delay_seconds: int | None = None,
    ) -> None:
        """Wire up collaborators, falling back to the module-level defaults.

        Args:
            database: Database handle to use; ``None`` selects the shared ``db``.
            task_queue: Queue used for notifications; ``None`` selects the
                default task queue.
            escalation_delay_seconds: Delay before an escalation check is
                scheduled; ``None`` selects
                ``settings.notification_escalation_delay_seconds``. A value of
                ``0`` (or less) disables scheduling.
        """
        # Compare against None instead of relying on truthiness: an injected
        # collaborator that happens to be falsy (e.g. an empty in-memory queue
        # defining ``__len__``) must not be silently swapped for the global one.
        self.db = database if database is not None else db
        self.task_queue = task_queue if task_queue is not None else default_task_queue
        self.escalation_delay_seconds = (
            escalation_delay_seconds
            if escalation_delay_seconds is not None
            else settings.notification_escalation_delay_seconds
        )

    async def create_incident(
        self,
        current_user: CurrentUser,
        service_id: UUID,
        data: IncidentCreate,
    ) -> IncidentResponse:
        """Create an incident for a service and record its ``created`` event.

        The incident row and its first timeline event are written in a single
        transaction. Afterwards the task queue is told about the new incident
        and, when the configured delay is positive, an escalation check is
        scheduled.

        Args:
            current_user: Authenticated user; supplies the org and actor ids.
            service_id: Service the incident is raised against.
            data: Title, description and severity of the new incident.

        Returns:
            The newly created incident.

        Raises:
            exc.NotFoundError: If no service with ``service_id`` exists.
        """
        async with self.db.transaction() as conn:
            db_conn = _as_conn(conn)
            service_repo = ServiceRepository(db_conn)
            incident_repo = IncidentRepository(db_conn)

            service = await service_repo.get_by_id(service_id)
            if service is None:
                raise exc.NotFoundError("Service not found")
            ensure_org_access(service["org_id"], current_user)

            incident_id = uuid4()
            incident = await incident_repo.create(
                incident_id=incident_id,
                org_id=current_user.org_id,
                service_id=service_id,
                title=data.title,
                description=data.description,
                severity=data.severity,
            )

            await incident_repo.add_event(
                uuid4(),
                incident_id,
                "created",
                actor_user_id=current_user.user_id,
                payload={
                    "title": data.title,
                    "severity": data.severity,
                    "description": data.description,
                },
            )

            incident_response = IncidentResponse(**incident)

        # Notifications are enqueued only after the transaction block has
        # exited (and, presumably, committed), so a rolled-back incident never
        # produces a notification.
        self.task_queue.incident_triggered(
            incident_id=incident_response.id,
            org_id=current_user.org_id,
            triggered_by=current_user.user_id,
        )

        if self.escalation_delay_seconds > 0:
            self.task_queue.schedule_escalation_check(
                incident_id=incident_response.id,
                org_id=current_user.org_id,
                delay_seconds=self.escalation_delay_seconds,
            )

        return incident_response

    async def get_incidents(
        self,
        current_user: CurrentUser,
        *,
        status: str | None = None,
        cursor: datetime | None = None,
        limit: int = 20,
    ) -> PaginatedResponse[IncidentResponse]:
        """Return one page of incidents for the active organization.

        Args:
            current_user: Authenticated user whose ``org_id`` scopes the query.
            status: Optional status filter forwarded to the repository.
            cursor: Optional pagination cursor forwarded to the repository.
            limit: Maximum number of incidents in the returned page.

        Returns:
            A page holding at most ``limit`` incidents. ``next_cursor`` is the
            ISO-8601 ``created_at`` of the last returned incident when more
            rows are available, otherwise ``None``.
        """
        async with self.db.connection() as conn:
            incident_repo = IncidentRepository(_as_conn(conn))
            rows = await incident_repo.get_by_org(
                org_id=current_user.org_id,
                status=status,
                cursor=cursor,
                limit=limit,
            )

        # NOTE(review): ``has_more`` can only become True if the repository
        # returns more than ``limit`` rows - presumably ``get_by_org``
        # over-fetches by one; confirm against the repository.
        has_more = len(rows) > limit
        items = rows[:limit]
        next_cursor = items[-1]["created_at"].isoformat() if has_more and items else None

        incidents = [IncidentResponse(**row) for row in items]
        return PaginatedResponse[IncidentResponse](
            items=incidents,
            next_cursor=next_cursor,
            has_more=has_more,
        )

    async def get_incident(self, current_user: CurrentUser, incident_id: UUID) -> IncidentResponse:
        """Return a single incident after checking it against the active org.

        Raises:
            exc.NotFoundError: If no incident with ``incident_id`` exists.
        """
        async with self.db.connection() as conn:
            incident_repo = IncidentRepository(_as_conn(conn))
            incident = await incident_repo.get_by_id(incident_id)
            if incident is None:
                raise exc.NotFoundError("Incident not found")
            ensure_org_access(incident["org_id"], current_user)
            return IncidentResponse(**incident)

    async def get_incident_events(
        self, current_user: CurrentUser, incident_id: UUID
    ) -> list[IncidentEventResponse]:
        """Return the timeline events of an incident in the active org.

        Raises:
            exc.NotFoundError: If no incident with ``incident_id`` exists.
        """
        async with self.db.connection() as conn:
            incident_repo = IncidentRepository(_as_conn(conn))
            incident = await incident_repo.get_by_id(incident_id)
            if incident is None:
                raise exc.NotFoundError("Incident not found")
            ensure_org_access(incident["org_id"], current_user)

            events = await incident_repo.get_events(incident_id)
            return [IncidentEventResponse(**event) for event in events]

    async def transition_incident(
        self,
        current_user: CurrentUser,
        incident_id: UUID,
        data: TransitionRequest,
    ) -> IncidentResponse:
        """Move an incident to a new status and record a ``status_changed`` event.

        The status update and the timeline event share one transaction. The
        caller-supplied ``data.version`` is forwarded to the repository; when
        the repository reports no updated row the request is rejected as a
        conflict.

        Args:
            current_user: Authenticated user performing the transition.
            incident_id: Incident to transition.
            data: Target status, expected version and optional note.

        Returns:
            The incident as returned by the status update.

        Raises:
            exc.NotFoundError: If no incident with ``incident_id`` exists.
            exc.BadRequestError: If the transition is not allowed from the
                incident's current status.
            exc.ConflictError: If the status update matched no row.
        """
        async with self.db.transaction() as conn:
            db_conn = _as_conn(conn)
            incident_repo = IncidentRepository(db_conn)

            incident = await incident_repo.get_by_id(incident_id)
            if incident is None:
                raise exc.NotFoundError("Incident not found")
            ensure_org_access(incident["org_id"], current_user)
            self._validate_transition(incident["status"], data.to_status)

            updated = await incident_repo.update_status(
                incident_id,
                data.to_status,
                data.version,
            )
            if updated is None:
                raise exc.ConflictError("Incident version mismatch")

            # The note is only stored when the caller supplied a non-empty one.
            payload = {"from": incident["status"], "to": data.to_status}
            if data.note:
                payload["note"] = data.note

            await incident_repo.add_event(
                uuid4(),
                incident_id,
                "status_changed",
                actor_user_id=current_user.user_id,
                payload=payload,
            )

            return IncidentResponse(**updated)

    async def add_comment(
        self,
        current_user: CurrentUser,
        incident_id: UUID,
        data: CommentRequest,
    ) -> IncidentEventResponse:
        """Append a ``comment_added`` event to the incident timeline.

        Args:
            current_user: Authenticated user recorded as the comment's actor.
            incident_id: Incident being commented on.
            data: Request carrying the comment ``content``.

        Returns:
            The stored timeline event.

        Raises:
            exc.NotFoundError: If no incident with ``incident_id`` exists.
        """
        async with self.db.connection() as conn:
            incident_repo = IncidentRepository(_as_conn(conn))
            incident = await incident_repo.get_by_id(incident_id)
            if incident is None:
                raise exc.NotFoundError("Incident not found")
            ensure_org_access(incident["org_id"], current_user)

            event = await incident_repo.add_event(
                uuid4(),
                incident_id,
                "comment_added",
                actor_user_id=current_user.user_id,
                payload={"content": data.content},
            )
            return IncidentEventResponse(**event)

    def _validate_transition(self, current_status: str, to_status: str) -> None:
        """Check a requested status change against ``_ALLOWED_TRANSITIONS``.

        Raises:
            exc.BadRequestError: If the incident already has ``to_status`` or
                if ``to_status`` is not an allowed successor of
                ``current_status``.
        """
        if current_status == to_status:
            raise exc.BadRequestError("Incident is already in the requested status")

        # A status missing from the table is treated as having no successors.
        allowed = _ALLOWED_TRANSITIONS.get(current_status, set())
        if to_status not in allowed:
            raise exc.BadRequestError("Invalid incident status transition")
|
|
|
|
|
|
# Public API of this module: only the service class is exported.
__all__ = ["IncidentService"]
|