feat: add observability stack and background task infrastructure
Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
This commit is contained in:
222
app/main.py
222
app/main.py
@@ -1,26 +1,50 @@
|
||||
"""FastAPI application entry point."""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi import FastAPI, Request, status
|
||||
from fastapi.encoders import jsonable_encoder
|
||||
from fastapi.exceptions import RequestValidationError
|
||||
from fastapi.openapi.utils import get_openapi
|
||||
from fastapi.responses import JSONResponse
|
||||
from starlette.exceptions import HTTPException as StarletteHTTPException
|
||||
|
||||
from app.api.v1 import auth, health, incidents, org
|
||||
from app.config import settings
|
||||
from app.db import db, redis_client
|
||||
from app.core.logging import setup_logging
|
||||
from app.core.telemetry import (
|
||||
get_current_trace_id,
|
||||
record_exception,
|
||||
setup_telemetry,
|
||||
shutdown_telemetry,
|
||||
)
|
||||
from app.db import db
|
||||
from app.schemas.common import ErrorDetail, ErrorResponse
|
||||
from app.taskqueue import task_queue
|
||||
|
||||
# Initialize logging before anything else
|
||||
setup_logging()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """Manage application lifecycle - connect/disconnect resources.

    On startup the database and Redis client are connected and the task
    queue is started. On shutdown the Redis client is disconnected, the task
    queue is shut down, the database is disconnected and telemetry is shut
    down.

    Args:
        app: The FastAPI application this lifespan is attached to (not used
            in the body).

    Yields:
        None, once startup has completed; control returns here when the
        application stops serving.
    """
    # Startup
    logger.info("Starting IncidentOps API")
    await db.connect(settings.database_url)
    await redis_client.connect(settings.redis_url)
    await task_queue.startup()
    logger.info("Startup complete")

    try:
        yield
    finally:
        # Shutdown lives in ``finally`` so resources are still released when
        # an exception is thrown into the generator at the ``yield`` point.
        logger.info("Shutting down IncidentOps API")
        await redis_client.disconnect()
        await task_queue.shutdown()
        await db.disconnect()
        await shutdown_telemetry()
        logger.info("Shutdown complete")
|
||||
|
||||
|
||||
app = FastAPI(
|
||||
@@ -33,6 +57,26 @@ app = FastAPI(
|
||||
lifespan=lifespan,
|
||||
)
|
||||
|
||||
# Set up OpenTelemetry instrumentation
|
||||
setup_telemetry(app)
|
||||
|
||||
|
||||
@app.middleware("http")
async def request_logging_middleware(request: Request, call_next):
    """Emit one structured ``request`` log record per HTTP request.

    The record carries the method, path, status code and elapsed time in
    milliseconds (rounded to two decimals).

    Args:
        request: The incoming request.
        call_next: Awaitable callable that passes the request further down
            the stack and returns the response.

    Returns:
        The response produced by ``call_next``, unmodified.

    Raises:
        Exception: Any exception raised by ``call_next`` is logged as a 500
            and re-raised unchanged.
    """
    # perf_counter() is monotonic; time.time() can jump when the system clock
    # is adjusted and yield negative or inflated durations.
    start = time.perf_counter()

    def _log_request(status_code: int) -> None:
        duration_ms = (time.perf_counter() - start) * 1000
        logger.info(
            "request",
            extra={
                "method": request.method,
                "path": request.url.path,
                "status_code": status_code,
                "duration_ms": round(duration_ms, 2),
            },
        )

    try:
        response = await call_next(request)
    except Exception:
        # Without this branch a failing request would leave no request log
        # entry at all; record it as a 500 and let the error propagate.
        _log_request(status.HTTP_500_INTERNAL_SERVER_ERROR)
        raise

    _log_request(response.status_code)
    return response
|
||||
|
||||
app.openapi_tags = [
|
||||
{"name": "auth", "description": "Registration, login, token lifecycle"},
|
||||
{"name": "org", "description": "Organization membership, services, and notifications"},
|
||||
@@ -41,9 +85,133 @@ app.openapi_tags = [
|
||||
]
|
||||
|
||||
|
||||
def custom_openapi() -> dict:
|
||||
"""Add JWT bearer security scheme to the generated OpenAPI schema."""
|
||||
# ---------------------------------------------------------------------------
|
||||
# Global Exception Handlers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _build_error_response(
    error: str,
    message: str,
    status_code: int,
    details: list[ErrorDetail] | None = None,
) -> JSONResponse:
    """Wrap an error in the ErrorResponse schema and return it as JSON.

    Args:
        error: Error type identifier placed in the ``error`` field.
        message: Text placed in the ``message`` field.
        status_code: HTTP status code of the returned response.
        details: Optional list of error details; ``None`` when absent.

    Returns:
        A JSONResponse whose body is the encoded ErrorResponse, with
        ``request_id`` taken from ``get_current_trace_id()``.
    """
    payload = jsonable_encoder(
        ErrorResponse(
            error=error,
            message=message,
            details=details,
            request_id=get_current_trace_id(),
        )
    )
    return JSONResponse(content=payload, status_code=status_code)
|
||||
|
||||
|
||||
# Maps HTTP status codes to the error type strings used in error responses.
# Built once at import time instead of on every handled exception.
_HTTP_ERROR_TYPES: dict[int, str] = {
    400: "bad_request",
    401: "unauthorized",
    403: "forbidden",
    404: "not_found",
    409: "conflict",
    422: "validation_error",
    429: "rate_limited",
    500: "internal_error",
    502: "bad_gateway",
    503: "service_unavailable",
}


@app.exception_handler(StarletteHTTPException)
async def http_exception_handler(
    request: Request, exc: StarletteHTTPException
) -> JSONResponse:
    """Handle HTTP exceptions with structured error responses.

    Args:
        request: The request during which the exception was raised.
        exc: The HTTP exception; its ``status_code`` and ``detail`` drive the
            response.

    Returns:
        A structured error response with the exception's status code. Status
        codes without a mapping use the generic ``"error"`` type. Any headers
        attached to the exception are copied onto the response.
    """
    error_type = _HTTP_ERROR_TYPES.get(exc.status_code, "error")

    logger.warning(
        "HTTP exception",
        extra={
            "status_code": exc.status_code,
            "error": error_type,
            "detail": exc.detail,
            "path": str(request.url.path),
            "method": request.method,
        },
    )

    response = _build_error_response(
        error=error_type,
        message=str(exc.detail),
        status_code=exc.status_code,
    )

    # Carry over headers set on the exception (e.g. WWW-Authenticate on 401,
    # Retry-After on 429); replacing the default handler would otherwise
    # silently drop them.
    exc_headers = getattr(exc, "headers", None)
    if exc_headers:
        for header_name, header_value in exc_headers.items():
            response.headers[header_name] = header_value

    return response
|
||||
|
||||
|
||||
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(
    request: Request, exc: RequestValidationError
) -> JSONResponse:
    """Turn request validation failures into a structured 422 response.

    Args:
        request: The request that failed validation.
        exc: The validation error; each entry of ``exc.errors()`` becomes one
            ErrorDetail with every ``loc`` element converted to ``str``.

    Returns:
        A ``validation_error`` response carrying the collected details.
    """
    details: list[ErrorDetail] = []
    for item in exc.errors():
        location = [str(part) for part in item["loc"]]
        details.append(
            ErrorDetail(loc=location, msg=item["msg"], type=item["type"])
        )

    log_fields = {
        "path": str(request.url.path),
        "method": request.method,
        "error_count": len(details),
    }
    logger.warning("Validation error", extra=log_fields)

    return _build_error_response(
        error="validation_error",
        message="Request validation failed",
        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
        details=details,
    )
|
||||
|
||||
|
||||
@app.exception_handler(Exception)
async def unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse:
    """Handle unexpected exceptions with logging and safe error response.

    Args:
        request: The request during which the exception was raised.
        exc: The unhandled exception.

    Returns:
        A 500 ``internal_error`` response. The message is generic unless
        ``settings.debug`` is truthy, in which case it contains the exception
        type and text.
    """
    # Record exception in the current span for tracing
    record_exception(exc)

    # Pass the exception explicitly: by default logger.exception() reads the
    # traceback from sys.exc_info(), which only works while the exception is
    # still being handled by the caller. exc_info=exc logs the right traceback
    # regardless of how this handler is invoked.
    logger.exception(
        "Unhandled exception",
        exc_info=exc,
        extra={
            "path": str(request.url.path),
            "method": request.method,
            "exception_type": type(exc).__name__,
        },
    )

    # Don't leak internal error details in production
    message = "An unexpected error occurred"
    if settings.debug:
        message = f"{type(exc).__name__}: {exc}"

    return _build_error_response(
        error="internal_error",
        message=message,
        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# OpenAPI Customization
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def custom_openapi() -> dict:
|
||||
"""Add JWT bearer security scheme and error responses to OpenAPI schema."""
|
||||
if app.openapi_schema:
|
||||
return app.openapi_schema
|
||||
|
||||
@@ -52,8 +220,12 @@ def custom_openapi() -> dict:
|
||||
version=app.version,
|
||||
description=app.description,
|
||||
routes=app.routes,
|
||||
tags=app.openapi_tags,
|
||||
)
|
||||
security_schemes = openapi_schema.setdefault("components", {}).setdefault("securitySchemes", {})
|
||||
|
||||
# Add security schemes
|
||||
components = openapi_schema.setdefault("components", {})
|
||||
security_schemes = components.setdefault("securitySchemes", {})
|
||||
security_schemes["BearerToken"] = {
|
||||
"type": "http",
|
||||
"scheme": "bearer",
|
||||
@@ -61,6 +233,42 @@ def custom_openapi() -> dict:
|
||||
"description": "Paste the JWT access token returned by /auth endpoints",
|
||||
}
|
||||
openapi_schema["security"] = [{"BearerToken": []}]
|
||||
|
||||
# Add common error response schemas
|
||||
schemas = components.setdefault("schemas", {})
|
||||
schemas["ErrorResponse"] = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {"type": "string", "description": "Error type identifier"},
|
||||
"message": {"type": "string", "description": "Human-readable error message"},
|
||||
"details": {
|
||||
"type": "array",
|
||||
"items": {"$ref": "#/components/schemas/ErrorDetail"},
|
||||
"nullable": True,
|
||||
"description": "Validation error details",
|
||||
},
|
||||
"request_id": {
|
||||
"type": "string",
|
||||
"nullable": True,
|
||||
"description": "Trace ID for debugging",
|
||||
},
|
||||
},
|
||||
"required": ["error", "message"],
|
||||
}
|
||||
schemas["ErrorDetail"] = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"loc": {
|
||||
"type": "array",
|
||||
"items": {"oneOf": [{"type": "string"}, {"type": "integer"}]},
|
||||
"description": "Error location path",
|
||||
},
|
||||
"msg": {"type": "string", "description": "Error message"},
|
||||
"type": {"type": "string", "description": "Error type"},
|
||||
},
|
||||
"required": ["loc", "msg", "type"],
|
||||
}
|
||||
|
||||
app.openapi_schema = openapi_schema
|
||||
return app.openapi_schema
|
||||
|
||||
|
||||
Reference in New Issue
Block a user