Files
incidentops/app/main.py
minhtrannhat 46ede7757d feat: add observability stack and background task infrastructure
Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
2026-01-07 20:51:13 -05:00

283 lines
8.4 KiB
Python

"""FastAPI application entry point."""
import logging
import time
from contextlib import asynccontextmanager
from typing import AsyncGenerator
from fastapi import FastAPI, Request, status
from fastapi.encoders import jsonable_encoder
from fastapi.exceptions import RequestValidationError
from fastapi.openapi.utils import get_openapi
from fastapi.responses import JSONResponse
from starlette.exceptions import HTTPException as StarletteHTTPException
from app.api.v1 import auth, health, incidents, org
from app.config import settings
from app.core.logging import setup_logging
from app.core.telemetry import (
get_current_trace_id,
record_exception,
setup_telemetry,
shutdown_telemetry,
)
from app.db import db
from app.schemas.common import ErrorDetail, ErrorResponse
from app.taskqueue import task_queue
# Initialize logging before anything else: this runs at import time, ahead of
# the app construction and telemetry setup further down in this module.
setup_logging()
# Module-level logger shared by the lifespan hook, the request middleware and
# the exception handlers defined below.
logger = logging.getLogger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """Manage application lifecycle - connect/disconnect resources.

    Startup connects the database and then starts the task queue. Shutdown
    releases them in reverse order and finally shuts telemetry down.

    Every teardown step is guarded by ``finally`` so that a failure while
    starting or stopping the task queue does not leave the database
    connection open or skip the telemetry shutdown. The original exception
    still propagates to the caller.
    """
    # Startup
    logger.info("Starting IncidentOps API")
    await db.connect(settings.database_url)
    try:
        await task_queue.startup()
        try:
            logger.info("Startup complete")
            yield
        finally:
            # Shutdown: only stop the task queue if it was started.
            logger.info("Shutting down IncidentOps API")
            await task_queue.shutdown()
    finally:
        # The database was connected above, so always release it, and always
        # give telemetry a chance to shut down even if the disconnect fails.
        try:
            await db.disconnect()
        finally:
            await shutdown_telemetry()
    # Reached only when every teardown step succeeded.
    logger.info("Shutdown complete")
# Application object for the API. Interactive docs are exposed at /docs and
# /redoc, the raw schema at /openapi.json, and `lifespan` is registered to
# handle resource setup and teardown.
app = FastAPI(
title="IncidentOps",
description="Incident management API with multi-tenant org support",
version="0.1.0",
docs_url="/docs",
redoc_url="/redoc",
openapi_url="/openapi.json",
lifespan=lifespan,
)
# Set up OpenTelemetry instrumentation
setup_telemetry(app)
@app.middleware("http")
async def request_logging_middleware(request: Request, call_next):
    """Log one structured ``request`` record per HTTP request.

    The record carries the HTTP method, URL path, response status code and
    the handling time in milliseconds (rounded to two decimals).

    Timing uses ``time.perf_counter()``, which is monotonic, so the measured
    duration cannot go negative or jump when the wall clock is adjusted.

    The record is emitted from a ``finally`` block so that requests whose
    downstream handling raises are logged as well. Those are reported with
    status 500 because no response object exists at that point; the
    exception itself is re-raised unchanged.

    Args:
        request: The incoming request.
        call_next: Callable that passes the request on and returns the
            response.

    Returns:
        The response produced by ``call_next``, unmodified.
    """
    start = time.perf_counter()
    # Assume failure until a response has actually been produced.
    status_code = status.HTTP_500_INTERNAL_SERVER_ERROR
    try:
        response = await call_next(request)
        status_code = response.status_code
        return response
    finally:
        duration_ms = (time.perf_counter() - start) * 1000
        logger.info(
            "request",
            extra={
                "method": request.method,
                "path": request.url.path,
                "status_code": status_code,
                "duration_ms": round(duration_ms, 2),
            },
        )
# Tag metadata (name + description) for the OpenAPI schema; custom_openapi()
# below passes this list to get_openapi() via `tags=app.openapi_tags`.
app.openapi_tags = [
{"name": "auth", "description": "Registration, login, token lifecycle"},
{"name": "org", "description": "Organization membership, services, and notifications"},
{"name": "incidents", "description": "Incident lifecycle and timelines"},
{"name": "health", "description": "Service health probes"},
]
# ---------------------------------------------------------------------------
# Global Exception Handlers
# ---------------------------------------------------------------------------
def _build_error_response(
    error: str,
    message: str,
    status_code: int,
    details: list[ErrorDetail] | None = None,
) -> JSONResponse:
    """Wrap the given error fields in an ``ErrorResponse`` and return it as JSON.

    Args:
        error: Error type identifier placed in the ``error`` field.
        message: Text placed in the ``message`` field.
        status_code: HTTP status code of the returned response.
        details: Optional list of per-field error details.

    Returns:
        A ``JSONResponse`` whose body is the encoded ``ErrorResponse``; its
        ``request_id`` is whatever ``get_current_trace_id()`` returns.
    """
    payload = ErrorResponse(
        error=error,
        message=message,
        details=details,
        request_id=get_current_trace_id(),
    )
    body = jsonable_encoder(payload)
    return JSONResponse(content=body, status_code=status_code)
# Maps HTTP status codes to the error type strings reported in error bodies.
# Built once at import time instead of on every handled exception.
_HTTP_ERROR_TYPES: dict[int, str] = {
    400: "bad_request",
    401: "unauthorized",
    403: "forbidden",
    404: "not_found",
    409: "conflict",
    422: "validation_error",
    429: "rate_limited",
    500: "internal_error",
    502: "bad_gateway",
    503: "service_unavailable",
}


@app.exception_handler(StarletteHTTPException)
async def http_exception_handler(
    request: Request, exc: StarletteHTTPException
) -> JSONResponse:
    """Handle HTTP exceptions with structured error responses.

    The status code is translated to an error type string (``"error"`` for
    codes without a mapping), the exception is logged at WARNING level, and
    a structured error body is returned with the exception's status code.

    Any headers attached to the exception (for example ``WWW-Authenticate``
    on a 401) are copied onto the response so they are not lost when the
    error body is restructured.

    Args:
        request: The request being handled; used for log context only.
        exc: The HTTP exception raised during request handling.

    Returns:
        The structured JSON error response.
    """
    error_type = _HTTP_ERROR_TYPES.get(exc.status_code, "error")
    logger.warning(
        "HTTP exception",
        extra={
            "status_code": exc.status_code,
            "error": error_type,
            "detail": exc.detail,
            "path": str(request.url.path),
            "method": request.method,
        },
    )
    response = _build_error_response(
        error=error_type,
        message=str(exc.detail),
        status_code=exc.status_code,
    )
    # `headers` may be absent or None on the exception; forward it only when set.
    extra_headers = getattr(exc, "headers", None)
    if extra_headers:
        for header_name, header_value in extra_headers.items():
            response.headers[header_name] = header_value
    return response
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(
    request: Request, exc: RequestValidationError
) -> JSONResponse:
    """Turn a request validation failure into a structured 422 response.

    Each entry of ``exc.errors()`` becomes an ``ErrorDetail`` whose location
    path components are converted to strings. Only the number of errors,
    the path and the method are logged (at WARNING level).
    """
    details = []
    for item in exc.errors():
        location = [str(part) for part in item["loc"]]
        details.append(
            ErrorDetail(loc=location, msg=item["msg"], type=item["type"])
        )

    log_context = {
        "path": str(request.url.path),
        "method": request.method,
        "error_count": len(details),
    }
    logger.warning("Validation error", extra=log_context)

    return _build_error_response(
        error="validation_error",
        message="Request validation failed",
        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
        details=details,
    )
@app.exception_handler(Exception)
async def unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse:
    """Catch-all handler: record, log and answer with a generic 500 body.

    The exception is passed to ``record_exception`` for tracing and logged
    with its traceback. The response message is generic unless
    ``settings.debug`` is set, in which case it contains the exception type
    and text.
    """
    exc_name = type(exc).__name__

    # Record exception in the current span for tracing
    record_exception(exc)

    log_context = {
        "path": str(request.url.path),
        "method": request.method,
        "exception_type": exc_name,
    }
    logger.exception("Unhandled exception", extra=log_context)

    # Don't leak internal error details in production
    message = f"{exc_name}: {exc}" if settings.debug else "An unexpected error occurred"

    return _build_error_response(
        error="internal_error",
        message=message,
        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
    )
# ---------------------------------------------------------------------------
# OpenAPI Customization
# ---------------------------------------------------------------------------
def custom_openapi() -> dict:
    """Build (once) the OpenAPI schema with bearer auth and error schemas.

    On the first call the schema is generated with ``get_openapi``, extended
    with a ``BearerToken`` security scheme applied globally and with
    ``ErrorResponse`` / ``ErrorDetail`` component schemas, then stored on
    ``app.openapi_schema``. Later calls return the stored schema.
    """
    cached = app.openapi_schema
    if cached:
        return cached

    schema = get_openapi(
        title=app.title,
        version=app.version,
        description=app.description,
        routes=app.routes,
        tags=app.openapi_tags,
    )
    components = schema.setdefault("components", {})

    # Security: a single JWT bearer scheme, required by default everywhere.
    bearer_scheme = {
        "type": "http",
        "scheme": "bearer",
        "bearerFormat": "JWT",
        "description": "Paste the JWT access token returned by /auth endpoints",
    }
    components.setdefault("securitySchemes", {})["BearerToken"] = bearer_scheme
    schema["security"] = [{"BearerToken": []}]

    # Shared error body schemas.
    error_response_schema = {
        "type": "object",
        "properties": {
            "error": {"type": "string", "description": "Error type identifier"},
            "message": {"type": "string", "description": "Human-readable error message"},
            "details": {
                "type": "array",
                "items": {"$ref": "#/components/schemas/ErrorDetail"},
                "nullable": True,
                "description": "Validation error details",
            },
            "request_id": {
                "type": "string",
                "nullable": True,
                "description": "Trace ID for debugging",
            },
        },
        "required": ["error", "message"],
    }
    error_detail_schema = {
        "type": "object",
        "properties": {
            "loc": {
                "type": "array",
                "items": {"oneOf": [{"type": "string"}, {"type": "integer"}]},
                "description": "Error location path",
            },
            "msg": {"type": "string", "description": "Error message"},
            "type": {"type": "string", "description": "Error type"},
        },
        "required": ["loc", "msg", "type"],
    }
    component_schemas = components.setdefault("schemas", {})
    component_schemas["ErrorResponse"] = error_response_schema
    component_schemas["ErrorDetail"] = error_detail_schema

    app.openapi_schema = schema
    return app.openapi_schema
# Use custom_openapi() as the app's schema generator in place of the default.
app.openapi = custom_openapi # type: ignore[assignment]
# Include routers
# Every router is mounted under settings.api_v1_prefix; the health router is
# additionally tagged "health" here.
app.include_router(auth.router, prefix=settings.api_v1_prefix)
app.include_router(incidents.router, prefix=settings.api_v1_prefix)
app.include_router(org.router, prefix=settings.api_v1_prefix)
app.include_router(health.router, prefix=settings.api_v1_prefix, tags=["health"])