feat: add observability stack and background task infrastructure
Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
This commit is contained in:
164
app/core/logging.py
Normal file
164
app/core/logging.py
Normal file
@@ -0,0 +1,164 @@
|
||||
"""Structured JSON logging configuration with OpenTelemetry integration."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from app.config import settings
|
||||
|
||||
|
||||
class JSONFormatter(logging.Formatter):
    """
    JSON log formatter that outputs structured logs with trace context.

    Each record is rendered as one JSON object containing:
    - timestamp: ISO 8601 (UTC), derived from the record's creation time
    - level: Log level name
    - message: Log message with %-arguments interpolated
    - logger: Logger name
    - trace_id: value of ``record.otelTraceID`` (if present and not "0")
    - span_id: value of ``record.otelSpanID`` (if present and not "0")
    - exception: formatted traceback (if ``exc_info`` is set)
    - stack_info: formatted stack (if ``stack_info`` is set)
    - Extra fields from the log record
    """

    # LogRecord attributes that are never copied into the output as "extra"
    # fields. Built once at class creation instead of on every format() call.
    _RESERVED_ATTRS = frozenset(
        {
            "name",
            "msg",
            "args",
            "created",
            "filename",
            "funcName",
            "levelname",
            "levelno",
            "lineno",
            "module",
            "msecs",
            "pathname",
            "process",
            "processName",
            "relativeCreated",
            "stack_info",
            "exc_info",
            "exc_text",
            "thread",
            "threadName",
            "taskName",
            "message",
            "otelTraceID",
            "otelSpanID",
            "otelTraceSampled",
            "otelServiceName",
        }
    )

    def format(self, record: logging.LogRecord) -> str:
        """
        Serialize ``record`` to a JSON string.

        Values that ``json`` cannot encode natively are converted with
        ``str`` (``default=str``).
        """
        # Use the time the record was created, not the time it is formatted,
        # so the timestamp stays correct when formatting is deferred.
        created_at = datetime.fromtimestamp(record.created, tz=timezone.utc)

        log_data: dict[str, Any] = {
            "timestamp": created_at.isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "logger": record.name,
        }

        # Add trace context if available (injected by OpenTelemetry
        # LoggingInstrumentor); "0" is treated as "no active trace/span".
        trace_id = getattr(record, "otelTraceID", "0")
        if trace_id != "0":
            log_data["trace_id"] = trace_id
        span_id = getattr(record, "otelSpanID", "0")
        if span_id != "0":
            log_data["span_id"] = span_id

        # Add exception info if present
        if record.exc_info:
            log_data["exception"] = self.formatException(record.exc_info)

        # Add stack info if the caller requested it (stack_info=True)
        if record.stack_info:
            log_data["stack_info"] = self.formatStack(record.stack_info)

        # Add extra fields (excluding standard LogRecord attributes and
        # private names). NOTE: an extra whose key matches a field set above
        # (e.g. "level") replaces that field's value.
        log_data.update(
            (key, value)
            for key, value in record.__dict__.items()
            if key not in self._RESERVED_ATTRS and not key.startswith("_")
        )

        return json.dumps(log_data, default=str)
|
||||
|
||||
|
||||
class DevelopmentFormatter(logging.Formatter):
    """
    Human-readable formatter for development with color support.

    Format: [TIME] LEVEL logger - message [trace_id]

    TIME is the record's creation time in UTC as HH:MM:SS.mmm. The level name
    is wrapped in an ANSI color escape; the trace ID is shortened to its first
    eight characters. Exception and stack information, when present, follow on
    separate lines.
    """

    # ANSI color escape per level name; unknown levels get no color.
    COLORS = {
        "DEBUG": "\033[36m",  # Cyan
        "INFO": "\033[32m",  # Green
        "WARNING": "\033[33m",  # Yellow
        "ERROR": "\033[31m",  # Red
        "CRITICAL": "\033[35m",  # Magenta
    }
    RESET = "\033[0m"

    def format(self, record: logging.LogRecord) -> str:
        """Render ``record`` as a single colored line plus optional traceback."""
        color = self.COLORS.get(record.levelname, "")
        reset = self.RESET

        # Format timestamp from the record's creation time (not "now") so the
        # displayed time is when the event was logged; trim microseconds to ms.
        created_at = datetime.fromtimestamp(record.created, tz=timezone.utc)
        timestamp = created_at.strftime("%H:%M:%S.%f")[:-3]

        # Build message
        parts = [
            f"[{timestamp}] {color}{record.levelname:8}{reset} "
            f"{record.name} - {record.getMessage()}"
        ]

        # Add trace context if available ("0" means no active trace)
        trace_id = getattr(record, "otelTraceID", "0")
        if trace_id != "0":
            parts.append(f" [{trace_id[:8]}...]")

        # Add exception if present
        if record.exc_info:
            parts.append(f"\n{self.formatException(record.exc_info)}")

        # Add stack info if the caller requested it (stack_info=True)
        if record.stack_info:
            parts.append(f"\n{self.formatStack(record.stack_info)}")

        return "".join(parts)
|
||||
|
||||
|
||||
def setup_logging() -> None:
    """
    Configure application logging on the root logger.

    - Log level comes from ``settings.otel_log_level``; INFO is used when the
      name does not resolve to a numeric logging level
    - JSON format when ``settings.otel_enabled`` is true and ``settings.debug``
      is false; human-readable format otherwise
    - Existing root handlers are removed and closed, then replaced by a single
      stdout handler
    """
    # Determine log level. getattr() can return a non-level attribute of the
    # logging module (e.g. the string BASIC_FORMAT), so verify it is numeric.
    log_level = getattr(logging, settings.otel_log_level.upper(), logging.INFO)
    if not isinstance(log_level, int):
        log_level = logging.INFO

    # Choose formatter based on environment. Decide once so the formatter and
    # the "format" field logged below can never disagree.
    use_json = bool(settings.otel_enabled and not settings.debug)
    formatter: logging.Formatter = (
        JSONFormatter() if use_json else DevelopmentFormatter()
    )

    # Configure root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)

    # Remove existing handlers (iterate over a copy since the list is mutated)
    # and close them so any resources they hold are released.
    for existing in root_logger.handlers[:]:
        root_logger.removeHandler(existing)
        existing.close()

    # Add stdout handler
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(formatter)
    root_logger.addHandler(stdout_handler)

    # Reduce noise from third-party libraries (keep uvicorn access at INFO so
    # requests are logged)
    logging.getLogger("uvicorn.access").setLevel(logging.INFO)
    for noisy in ("asyncpg", "httpx", "httpcore"):
        logging.getLogger(noisy).setLevel(logging.WARNING)

    logging.info(
        "Logging configured",
        extra={
            "log_level": settings.otel_log_level,
            "format": "json" if use_json else "dev",
        },
    )
|
||||
|
||||
|
||||
def get_logger(name: str) -> logging.Logger:
    """Return the ``logging`` logger registered under ``name``."""
    named_logger = logging.getLogger(name)
    return named_logger
|
||||
Reference in New Issue
Block a user