feat: add observability stack and background task infrastructure

Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
This commit is contained in:
2026-01-07 20:51:13 -05:00
parent f427d191e0
commit 46ede7757d
45 changed files with 3742 additions and 76 deletions

164
app/core/logging.py Normal file
View File

@@ -0,0 +1,164 @@
"""Structured JSON logging configuration with OpenTelemetry integration."""
import json
import logging
import sys
from datetime import datetime, timezone
from typing import Any
from app.config import settings
class JSONFormatter(logging.Formatter):
    """
    JSON log formatter that outputs structured logs with trace context.

    Each record is rendered as a single JSON object containing:
    - timestamp: ISO 8601, UTC, derived from the record's creation time
    - level: Log level name
    - message: Log message with %-style arguments applied
    - logger: Logger name
    - trace_id: OpenTelemetry trace ID (if present and not "0")
    - span_id: OpenTelemetry span ID (if present and not "0")
    - exception: Formatted traceback (if the record carries exc_info)
    - Extra fields attached to the log record (e.g. via ``extra=``)
    """

    # LogRecord attributes that are either emitted under a dedicated key above
    # or are logging internals. Anything else found on the record is treated
    # as a caller-supplied extra field. Built once instead of on every call.
    _RESERVED_ATTRS = frozenset(
        {
            "name",
            "msg",
            "args",
            "created",
            "filename",
            "funcName",
            "levelname",
            "levelno",
            "lineno",
            "module",
            "msecs",
            "pathname",
            "process",
            "processName",
            "relativeCreated",
            "stack_info",
            "exc_info",
            "exc_text",
            "thread",
            "threadName",
            "taskName",
            "message",
            # Set on the record by any time-using Formatter that handled it
            # earlier; it is not a user-supplied field.
            "asctime",
            "otelTraceID",
            "otelSpanID",
            "otelTraceSampled",
            "otelServiceName",
        }
    )

    def format(self, record: logging.LogRecord) -> str:
        """
        Serialize ``record`` to a JSON string.

        Args:
            record: The log record to render.

        Returns:
            A JSON document on a single line. Values that ``json`` cannot
            encode natively are converted with ``str``.
        """
        # Use the time the event was logged (record.created), not the time it
        # happens to be formatted; the two differ when records are queued,
        # buffered, or formatted by more than one handler.
        event_time = datetime.fromtimestamp(record.created, tz=timezone.utc)

        log_data: dict[str, Any] = {
            "timestamp": event_time.isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "logger": record.name,
        }

        # Add trace context if available (injected by OpenTelemetry
        # LoggingInstrumentor). The value "0" is treated as "no active trace".
        trace_id = getattr(record, "otelTraceID", "0")
        if trace_id != "0":
            log_data["trace_id"] = trace_id

        span_id = getattr(record, "otelSpanID", "0")
        if span_id != "0":
            log_data["span_id"] = span_id

        # Add exception info if present
        if record.exc_info:
            log_data["exception"] = self.formatException(record.exc_info)

        # Add extra fields (excluding standard LogRecord attributes and
        # private attributes). Extras are written last, so an extra whose name
        # matches a key above replaces that value.
        for key, value in record.__dict__.items():
            if key not in self._RESERVED_ATTRS and not key.startswith("_"):
                log_data[key] = value

        # default=str keeps logging from raising on non-JSON-native extras.
        return json.dumps(log_data, default=str)
class DevelopmentFormatter(logging.Formatter):
    """
    Human-readable formatter for development with color support.

    Format: [TIME] LEVEL logger - message [trace_id]

    TIME is the record's creation time in UTC as HH:MM:SS.mmm. The level name
    is wrapped in an ANSI color escape, and the trace ID (when present and not
    "0") is shortened to its first 8 characters.
    """

    # ANSI escape sequences keyed by level name.
    COLORS = {
        "DEBUG": "\033[36m",  # Cyan
        "INFO": "\033[32m",  # Green
        "WARNING": "\033[33m",  # Yellow
        "ERROR": "\033[31m",  # Red
        "CRITICAL": "\033[35m",  # Magenta
    }
    RESET = "\033[0m"

    def format(self, record: logging.LogRecord) -> str:
        """
        Render ``record`` as a single colored line, plus traceback if any.

        Args:
            record: The log record to render.

        Returns:
            The formatted line; when the record carries exc_info, the
            formatted traceback follows on subsequent lines.
        """
        # Level names without an entry in COLORS are printed uncolored.
        color = self.COLORS.get(record.levelname, "")
        reset = self.RESET

        # Use the time the event was logged (record.created) rather than the
        # time of formatting, so the displayed time matches the event.
        event_time = datetime.fromtimestamp(record.created, tz=timezone.utc)
        # %f yields microseconds; dropping the last 3 digits gives milliseconds.
        timestamp = event_time.strftime("%H:%M:%S.%f")[:-3]

        # Build message
        msg = f"[{timestamp}] {color}{record.levelname:8}{reset} {record.name} - {record.getMessage()}"

        # Add trace context if available
        trace_id = getattr(record, "otelTraceID", "0")
        if trace_id != "0":
            msg += f" [{trace_id[:8]}...]"

        # Add exception if present
        if record.exc_info:
            msg += f"\n{self.formatException(record.exc_info)}"

        return msg
def setup_logging() -> None:
    """
    Install the application's root logging configuration.

    - Root level comes from ``settings.otel_log_level`` (INFO if the name is
      not an attribute of the ``logging`` module)
    - JSON output when ``settings.otel_enabled`` is set and ``settings.debug``
      is not; human-readable output otherwise
    - Any handlers already on the root logger are replaced by a single
      stdout stream handler
    - Fixed levels are applied to selected third-party loggers
    """
    # Resolve the configured level name, e.g. "debug" -> logging.DEBUG.
    level = getattr(logging, settings.otel_log_level.upper(), logging.INFO)

    # Decide the output format once; it is reused in the startup message.
    use_json = settings.otel_enabled and not settings.debug
    formatter = JSONFormatter() if use_json else DevelopmentFormatter()

    root = logging.getLogger()
    root.setLevel(level)

    # Iterate over a copy because removeHandler mutates root.handlers.
    for stale in list(root.handlers):
        root.removeHandler(stale)

    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(formatter)
    root.addHandler(stdout_handler)

    # Reduce noise from third-party libraries (uvicorn access stays at INFO
    # so requests are still logged).
    library_levels = (
        ("uvicorn.access", logging.INFO),
        ("asyncpg", logging.WARNING),
        ("httpx", logging.WARNING),
        ("httpcore", logging.WARNING),
    )
    for logger_name, library_level in library_levels:
        logging.getLogger(logger_name).setLevel(library_level)

    summary = {
        "log_level": settings.otel_log_level,
        "format": "json" if use_json else "dev",
    }
    logging.info("Logging configured", extra=summary)
def get_logger(name: str) -> logging.Logger:
    """Return the logger registered under ``name`` via ``logging.getLogger``."""
    named_logger = logging.getLogger(name)
    return named_logger