feat: add observability stack and background task infrastructure

Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
This commit is contained in:
2026-01-07 20:51:13 -05:00
parent f427d191e0
commit 46ede7757d
45 changed files with 3742 additions and 76 deletions

View File

@@ -1,26 +1,50 @@
"""FastAPI application entry point."""
import logging
import time
from contextlib import asynccontextmanager
from typing import AsyncGenerator
from fastapi import FastAPI
from fastapi import FastAPI, Request, status
from fastapi.encoders import jsonable_encoder
from fastapi.exceptions import RequestValidationError
from fastapi.openapi.utils import get_openapi
from fastapi.responses import JSONResponse
from starlette.exceptions import HTTPException as StarletteHTTPException
from app.api.v1 import auth, health, incidents, org
from app.config import settings
from app.db import db, redis_client
from app.core.logging import setup_logging
from app.core.telemetry import (
get_current_trace_id,
record_exception,
setup_telemetry,
shutdown_telemetry,
)
from app.db import db
from app.schemas.common import ErrorDetail, ErrorResponse
from app.taskqueue import task_queue
# Initialize logging before anything else
setup_logging()
logger = logging.getLogger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
"""Manage application lifecycle - connect/disconnect resources."""
# Startup
logger.info("Starting IncidentOps API")
await db.connect(settings.database_url)
await redis_client.connect(settings.redis_url)
await task_queue.startup()
logger.info("Startup complete")
yield
# Shutdown
await redis_client.disconnect()
logger.info("Shutting down IncidentOps API")
await task_queue.shutdown()
await db.disconnect()
await shutdown_telemetry()
logger.info("Shutdown complete")
app = FastAPI(
@@ -33,6 +57,26 @@ app = FastAPI(
lifespan=lifespan,
)
# Set up OpenTelemetry instrumentation
setup_telemetry(app)
@app.middleware("http")
async def request_logging_middleware(request: Request, call_next):
start = time.time()
response = await call_next(request)
duration_ms = (time.time() - start) * 1000
logger.info(
"request",
extra={
"method": request.method,
"path": request.url.path,
"status_code": response.status_code,
"duration_ms": round(duration_ms, 2),
},
)
return response
app.openapi_tags = [
{"name": "auth", "description": "Registration, login, token lifecycle"},
{"name": "org", "description": "Organization membership, services, and notifications"},
@@ -41,9 +85,133 @@ app.openapi_tags = [
]
def custom_openapi() -> dict:
"""Add JWT bearer security scheme to the generated OpenAPI schema."""
# ---------------------------------------------------------------------------
# Global Exception Handlers
# ---------------------------------------------------------------------------
def _build_error_response(
error: str,
message: str,
status_code: int,
details: list[ErrorDetail] | None = None,
) -> JSONResponse:
"""Build a structured error response with trace context."""
response = ErrorResponse(
error=error,
message=message,
details=details,
request_id=get_current_trace_id(),
)
return JSONResponse(
status_code=status_code,
content=jsonable_encoder(response),
)
@app.exception_handler(StarletteHTTPException)
async def http_exception_handler(
request: Request, exc: StarletteHTTPException
) -> JSONResponse:
"""Handle HTTP exceptions with structured error responses."""
# Map status codes to error type strings
error_types = {
400: "bad_request",
401: "unauthorized",
403: "forbidden",
404: "not_found",
409: "conflict",
422: "validation_error",
429: "rate_limited",
500: "internal_error",
502: "bad_gateway",
503: "service_unavailable",
}
error_type = error_types.get(exc.status_code, "error")
logger.warning(
"HTTP exception",
extra={
"status_code": exc.status_code,
"error": error_type,
"detail": exc.detail,
"path": str(request.url.path),
"method": request.method,
},
)
return _build_error_response(
error=error_type,
message=str(exc.detail),
status_code=exc.status_code,
)
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(
request: Request, exc: RequestValidationError
) -> JSONResponse:
"""Handle Pydantic validation errors with detailed error responses."""
details = [
ErrorDetail(
loc=[str(loc) for loc in error["loc"]],
msg=error["msg"],
type=error["type"],
)
for error in exc.errors()
]
logger.warning(
"Validation error",
extra={
"path": str(request.url.path),
"method": request.method,
"error_count": len(details),
},
)
return _build_error_response(
error="validation_error",
message="Request validation failed",
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
details=details,
)
@app.exception_handler(Exception)
async def unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse:
"""Handle unexpected exceptions with logging and safe error response."""
# Record exception in the current span for tracing
record_exception(exc)
logger.exception(
"Unhandled exception",
extra={
"path": str(request.url.path),
"method": request.method,
"exception_type": type(exc).__name__,
},
)
# Don't leak internal error details in production
message = "An unexpected error occurred"
if settings.debug:
message = f"{type(exc).__name__}: {exc}"
return _build_error_response(
error="internal_error",
message=message,
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
)
# ---------------------------------------------------------------------------
# OpenAPI Customization
# ---------------------------------------------------------------------------
def custom_openapi() -> dict:
"""Add JWT bearer security scheme and error responses to OpenAPI schema."""
if app.openapi_schema:
return app.openapi_schema
@@ -52,8 +220,12 @@ def custom_openapi() -> dict:
version=app.version,
description=app.description,
routes=app.routes,
tags=app.openapi_tags,
)
security_schemes = openapi_schema.setdefault("components", {}).setdefault("securitySchemes", {})
# Add security schemes
components = openapi_schema.setdefault("components", {})
security_schemes = components.setdefault("securitySchemes", {})
security_schemes["BearerToken"] = {
"type": "http",
"scheme": "bearer",
@@ -61,6 +233,42 @@ def custom_openapi() -> dict:
"description": "Paste the JWT access token returned by /auth endpoints",
}
openapi_schema["security"] = [{"BearerToken": []}]
# Add common error response schemas
schemas = components.setdefault("schemas", {})
schemas["ErrorResponse"] = {
"type": "object",
"properties": {
"error": {"type": "string", "description": "Error type identifier"},
"message": {"type": "string", "description": "Human-readable error message"},
"details": {
"type": "array",
"items": {"$ref": "#/components/schemas/ErrorDetail"},
"nullable": True,
"description": "Validation error details",
},
"request_id": {
"type": "string",
"nullable": True,
"description": "Trace ID for debugging",
},
},
"required": ["error", "message"],
}
schemas["ErrorDetail"] = {
"type": "object",
"properties": {
"loc": {
"type": "array",
"items": {"oneOf": [{"type": "string"}, {"type": "integer"}]},
"description": "Error location path",
},
"msg": {"type": "string", "description": "Error message"},
"type": {"type": "string", "description": "Error type"},
},
"required": ["loc", "msg", "type"],
}
app.openapi_schema = openapi_schema
return app.openapi_schema