Add OpenTelemetry instrumentation with distributed tracing and metrics: - Structured JSON logging with trace context correlation - Auto-instrumentation for FastAPI, asyncpg, httpx, redis - OTLP exporter for traces and Prometheus metrics endpoint Implement Celery worker and notification task system: - Celery app with Redis/SQS broker support and configurable queues - Notification tasks for incident fan-out, webhooks, and escalations - Pluggable TaskQueue abstraction with in-memory driver for testing Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana): - OpenTelemetry Collector for receiving OTLP traces and logs - Tempo for distributed tracing backend - Loki for log aggregation with Promtail DaemonSet - Prometheus for metrics scraping with RBAC configuration - Grafana with pre-provisioned datasources and API overview dashboard - Helm templates for all observability components Enhance application infrastructure: - Global exception handlers with structured ErrorResponse schema - Request logging middleware with timing metrics - Health check updated to verify task queue connectivity - Non-root user in Dockerfile for security - Init containers in Helm deployments for dependency ordering - Production Helm values with autoscaling and retention policies
48 lines
1.1 KiB
Python
48 lines
1.1 KiB
Python
"""Health check endpoints."""
|
|
|
|
from fastapi import APIRouter, Response, status
|
|
|
|
from app.db import db
|
|
from app.taskqueue import task_queue
|
|
|
|
router = APIRouter()
|
|
|
|
|
|
@router.get("/healthz")
async def healthz() -> dict[str, str]:
    """Liveness probe.

    Performs no dependency checks; it simply answers with a fixed payload so
    that a successful response indicates the service process is running.

    Returns:
        The constant mapping ``{"status": "ok"}``.
    """
    payload = {"status": "ok"}
    return payload
|
|
|
|
|
|
@router.get("/readyz")
async def readyz(response: Response) -> dict[str, str | dict[str, bool]]:
    """Readiness probe - checks database and task queue connectivity.

    - Check Postgres status
    - Check configured task queue backend
    - Return overall healthiness

    Each dependency is probed independently. A probe that raises is logged
    and reported as unhealthy rather than failing the whole request, so the
    caller always receives the per-check breakdown.

    Args:
        response: Response object for this request; its status code is set
            to 503 when at least one check fails.

    Returns:
        A dict with ``"status"`` (``"ok"`` or ``"degraded"``) and
        ``"checks"``, the per-dependency results.
    """
    # Function-scope import keeps this handler self-contained; it does not
    # rely on a module-level ``logging`` import being present.
    import logging

    logger = logging.getLogger(__name__)

    checks = {
        "postgres": False,
        "task_queue": False,
    }

    try:
        # When no pool exists the check simply stays False.
        if db.pool:
            async with db.connection() as conn:
                await conn.fetchval("SELECT 1")
            checks["postgres"] = True
    except Exception:
        # Best-effort probe: record the failure instead of hiding it.
        logger.exception("Readiness check failed: postgres")

    try:
        checks["task_queue"] = await task_queue.ping()
    except Exception:
        # A raising ping must yield a 503 with details, not an unhandled 500.
        logger.exception("Readiness check failed: task_queue")

    all_healthy = all(checks.values())
    if not all_healthy:
        response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE

    return {
        "status": "ok" if all_healthy else "degraded",
        "checks": checks,
    }
|