Files
incidentops/tests/services/test_incident_service.py
minhtrannhat 46ede7757d feat: add observability stack and background task infrastructure
Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
2026-01-07 20:51:13 -05:00

276 lines
8.4 KiB
Python

"""Unit tests for IncidentService."""
from __future__ import annotations
from contextlib import asynccontextmanager
from datetime import UTC, datetime, timedelta
from uuid import UUID, uuid4
import asyncpg
import pytest
from app.api.deps import CurrentUser
from app.core import exceptions as exc, security
from app.db import Database
from app.schemas.incident import CommentRequest, IncidentCreate, TransitionRequest
from app.services.incident import IncidentService
from app.taskqueue import InMemoryTaskQueue
# Module-level marker: pytest applies the ``asyncio`` mark to every test
# collected from this file, so the ``async def`` tests below need no
# per-test decorator.
pytestmark = pytest.mark.asyncio
class _SingleConnectionDatabase(Database):
    """Database stub that routes all work through one asyncpg connection.

    Both ``connection()`` and ``transaction()`` yield the connection handed
    to the constructor, so code under test runs on the same connection the
    test uses to seed rows and to assert on them.
    """

    def __init__(self, conn) -> None:  # type: ignore[override]
        # The parent initializer is not invoked; only the wrapped
        # connection is stored.
        self._conn = conn

    @asynccontextmanager
    async def connection(self):  # type: ignore[override]
        """Yield the wrapped connection unchanged."""
        yield self._conn

    @asynccontextmanager
    async def transaction(self):  # type: ignore[override]
        """Yield the wrapped connection inside an asyncpg transaction.

        The transaction is committed when the caller's ``async with`` body
        finishes normally and rolled back when it raises; the exception is
        then propagated to the caller.
        """
        # Delegating to asyncpg's own context manager (instead of manual
        # start/commit/rollback guarded by ``except Exception``) also rolls
        # back on BaseException subclasses such as asyncio.CancelledError,
        # so an interrupted body never leaves the transaction dangling.
        async with self._conn.transaction():
            yield self._conn
@pytest.fixture
def incident_task_queue() -> InMemoryTaskQueue:
    """Build a new in-memory task queue so tests can inspect dispatches."""
    queue = InMemoryTaskQueue()
    return queue
@pytest.fixture
async def incident_service(
    db_conn: asyncpg.Connection,
    incident_task_queue: InMemoryTaskQueue,
):
    """Build an IncidentService wired to the test connection and task queue.

    The service receives a single-connection database stub wrapping
    ``db_conn``, the ``incident_task_queue`` fixture, and a 60 second
    escalation delay.
    """
    database = _SingleConnectionDatabase(db_conn)
    service = IncidentService(
        database=database,
        task_queue=incident_task_queue,
        escalation_delay_seconds=60,
    )
    return service
async def _seed_user_org_service(conn: asyncpg.Connection) -> tuple[CurrentUser, UUID]:
    """Insert a user, an org, an org membership and a service.

    Rows are inserted in that order on ``conn``. Returns a ``CurrentUser``
    describing the seeded member together with the id of the seeded service.
    """
    user_id, org_id, service_id = uuid4(), uuid4(), uuid4()
    email = "owner@example.com"
    role = "member"

    # (statement, positional args) pairs, executed sequentially below.
    inserts = (
        (
            "INSERT INTO users (id, email, password_hash) VALUES ($1, $2, $3)",
            (user_id, email, security.hash_password("Passw0rd!")),
        ),
        (
            "INSERT INTO orgs (id, name, slug) VALUES ($1, $2, $3)",
            (org_id, "Test Org", "test-org"),
        ),
        (
            "INSERT INTO org_members (id, user_id, org_id, role) VALUES ($1, $2, $3, $4)",
            (uuid4(), user_id, org_id, role),
        ),
        (
            "INSERT INTO services (id, org_id, name, slug) VALUES ($1, $2, $3, $4)",
            (service_id, org_id, "API", "api"),
        ),
    )
    for statement, args in inserts:
        await conn.execute(statement, *args)

    seeded_user = CurrentUser(
        user_id=user_id,
        email=email,
        org_id=org_id,
        org_role=role,
        token="token",
    )
    return seeded_user, service_id
async def test_create_incident_persists_and_records_event(
    incident_service: IncidentService,
    db_conn: asyncpg.Connection,
    incident_task_queue: InMemoryTaskQueue,
) -> None:
    """Creating an incident stores the row, a ``created`` event and two tasks."""
    current_user, service_id = await _seed_user_org_service(db_conn)
    request = IncidentCreate(
        title="API outage",
        description="Gateway 502s",
        severity="critical",
    )

    incident = await incident_service.create_incident(current_user, service_id, request)

    # The incident row exists and carries the expected status and ownership.
    incident_row = await db_conn.fetchrow(
        "SELECT status, org_id, service_id FROM incidents WHERE id = $1",
        incident.id,
    )
    assert incident_row is not None
    assert incident_row["status"] == "triggered"
    assert incident_row["org_id"] == current_user.org_id
    assert incident_row["service_id"] == service_id

    # An event attributed to the creating user was recorded.
    event_row = await db_conn.fetchrow(
        "SELECT event_type, actor_user_id FROM incident_events WHERE incident_id = $1",
        incident.id,
    )
    assert event_row is not None
    assert event_row["event_type"] == "created"
    assert event_row["actor_user_id"] == current_user.user_id

    # Exactly two tasks were dispatched, in this order.
    assert incident_task_queue.dispatched is not None
    assert len(incident_task_queue.dispatched) == 2
    trigger_task, escalation_task = incident_task_queue.dispatched
    assert trigger_task[0] == "incident_triggered"
    assert escalation_task[0] == "escalate_if_unacked"
async def test_get_incidents_paginates_by_created_at(
    incident_service: IncidentService, db_conn: asyncpg.Connection
) -> None:
    """A page of two returns the two newest incidents and signals more."""
    current_user, service_id = await _seed_user_org_service(db_conn)

    # Create the incidents oldest-first: First, Second, Third.
    specs = (("First", "low"), ("Second", "medium"), ("Third", "high"))
    incidents = []
    for title, severity in specs:
        created = await incident_service.create_incident(
            current_user,
            service_id,
            IncidentCreate(title=title, description=None, severity=severity),
        )
        incidents.append(created)

    # Stagger created_at (3, 2 and 1 minutes ago) for deterministic ordering.
    now = datetime.now(UTC)
    for minutes_ago, incident in zip((3, 2, 1), incidents):
        await db_conn.execute(
            "UPDATE incidents SET created_at = $1 WHERE id = $2",
            now - timedelta(minutes=minutes_ago),
            incident.id,
        )

    page = await incident_service.get_incidents(current_user, limit=2)

    assert [item.title for item in page.items] == ["Third", "Second"]
    assert page.has_more is True
    assert page.next_cursor is not None
async def test_transition_incident_updates_status_and_records_event(
    incident_service: IncidentService, db_conn: asyncpg.Connection
) -> None:
    """Acknowledging an incident bumps its version and logs the change."""
    import json

    current_user, service_id = await _seed_user_org_service(db_conn)
    incident = await incident_service.create_incident(
        current_user,
        service_id,
        IncidentCreate(title="Escalation", severity="high", description=None),
    )
    request = TransitionRequest(
        to_status="acknowledged",
        version=incident.version,
        note="On it",
    )

    updated = await incident_service.transition_incident(current_user, incident.id, request)

    assert updated.status == "acknowledged"
    assert updated.version == incident.version + 1

    # Fetch the most recent status_changed event for this incident.
    event_row = await db_conn.fetchrow(
        """
        SELECT payload
        FROM incident_events
        WHERE incident_id = $1 AND event_type = 'status_changed'
        ORDER BY created_at DESC
        LIMIT 1
        """,
        incident.id,
    )
    assert event_row is not None

    # The payload column may come back as a JSON string; decode it if so.
    raw_payload = event_row["payload"]
    payload = json.loads(raw_payload) if isinstance(raw_payload, str) else raw_payload

    assert payload["from"] == "triggered"
    assert payload["to"] == "acknowledged"
    assert payload["note"] == "On it"
async def test_transition_incident_rejects_invalid_transition(
    incident_service: IncidentService, db_conn: asyncpg.Connection
) -> None:
    """Requesting ``resolved`` on a newly created incident raises BadRequestError."""
    current_user, service_id = await _seed_user_org_service(db_conn)
    new_incident = IncidentCreate(title="Invalid", severity="low", description=None)
    incident = await incident_service.create_incident(current_user, service_id, new_incident)

    with pytest.raises(exc.BadRequestError):
        await incident_service.transition_incident(
            current_user,
            incident.id,
            TransitionRequest(to_status="resolved", version=incident.version, note=None),
        )
async def test_transition_incident_conflict_on_version_mismatch(
    incident_service: IncidentService, db_conn: asyncpg.Connection
) -> None:
    """A transition carrying a non-matching version raises ConflictError."""
    current_user, service_id = await _seed_user_org_service(db_conn)
    new_incident = IncidentCreate(title="Version", severity="medium", description=None)
    incident = await incident_service.create_incident(current_user, service_id, new_incident)

    # 999 is passed as the version instead of incident.version.
    with pytest.raises(exc.ConflictError):
        await incident_service.transition_incident(
            current_user,
            incident.id,
            TransitionRequest(to_status="acknowledged", version=999, note=None),
        )
async def test_add_comment_creates_event(
    incident_service: IncidentService, db_conn: asyncpg.Connection
) -> None:
    """Adding a comment returns a ``comment_added`` event holding the content."""
    current_user, service_id = await _seed_user_org_service(db_conn)
    new_incident = IncidentCreate(title="Comment", severity="low", description=None)
    incident = await incident_service.create_incident(current_user, service_id, new_incident)

    comment = CommentRequest(content="Investigating")
    event = await incident_service.add_comment(current_user, incident.id, comment)

    assert event.event_type == "comment_added"
    assert event.payload == {"content": "Investigating"}