feat: add observability stack and background task infrastructure
Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
This commit is contained in:
@@ -14,6 +14,7 @@ from app.core import exceptions as exc, security
|
||||
from app.db import Database
|
||||
from app.schemas.incident import CommentRequest, IncidentCreate, TransitionRequest
|
||||
from app.services.incident import IncidentService
|
||||
from app.taskqueue import InMemoryTaskQueue
|
||||
|
||||
|
||||
pytestmark = pytest.mark.asyncio
|
||||
@@ -43,10 +44,24 @@ class _SingleConnectionDatabase(Database):
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
async def incident_service(db_conn: asyncpg.Connection):
|
||||
def incident_task_queue() -> InMemoryTaskQueue:
|
||||
"""In-memory task queue used to assert dispatch behavior."""
|
||||
|
||||
return InMemoryTaskQueue()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
async def incident_service(
|
||||
db_conn: asyncpg.Connection,
|
||||
incident_task_queue: InMemoryTaskQueue,
|
||||
):
|
||||
"""IncidentService bound to the per-test database connection."""
|
||||
|
||||
return IncidentService(database=_SingleConnectionDatabase(db_conn))
|
||||
return IncidentService(
|
||||
database=_SingleConnectionDatabase(db_conn),
|
||||
task_queue=incident_task_queue,
|
||||
escalation_delay_seconds=60,
|
||||
)
|
||||
|
||||
|
||||
async def _seed_user_org_service(conn: asyncpg.Connection) -> tuple[CurrentUser, UUID]:
|
||||
@@ -94,7 +109,9 @@ async def _seed_user_org_service(conn: asyncpg.Connection) -> tuple[CurrentUser,
|
||||
|
||||
|
||||
async def test_create_incident_persists_and_records_event(
|
||||
incident_service: IncidentService, db_conn: asyncpg.Connection
|
||||
incident_service: IncidentService,
|
||||
db_conn: asyncpg.Connection,
|
||||
incident_task_queue: InMemoryTaskQueue,
|
||||
) -> None:
|
||||
current_user, service_id = await _seed_user_org_service(db_conn)
|
||||
|
||||
@@ -121,6 +138,12 @@ async def test_create_incident_persists_and_records_event(
|
||||
assert event["event_type"] == "created"
|
||||
assert event["actor_user_id"] == current_user.user_id
|
||||
|
||||
assert incident_task_queue.dispatched is not None
|
||||
assert len(incident_task_queue.dispatched) == 2
|
||||
first, second = incident_task_queue.dispatched
|
||||
assert first[0] == "incident_triggered"
|
||||
assert second[0] == "escalate_if_unacked"
|
||||
|
||||
|
||||
async def test_get_incidents_paginates_by_created_at(
|
||||
incident_service: IncidentService, db_conn: asyncpg.Connection
|
||||
|
||||
Reference in New Issue
Block a user