Files
incidentops/app/core/logging.py
minhtrannhat 46ede7757d feat: add observability stack and background task infrastructure
Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
2026-01-07 20:51:13 -05:00

165 lines
4.9 KiB
Python

"""Structured JSON logging configuration with OpenTelemetry integration."""
import json
import logging
import sys
from datetime import datetime, timezone
from typing import Any
from app.config import settings
class JSONFormatter(logging.Formatter):
    """
    JSON log formatter that outputs structured logs with trace context.

    Each record is rendered as a single JSON object containing:
    - timestamp: ISO 8601 UTC time at which the record was created
    - level: Log level name
    - message: Log message (with %-style args applied)
    - logger: Logger name
    - trace_id: OpenTelemetry trace ID (if available)
    - span_id: OpenTelemetry span ID (if available)
    - exception: Formatted traceback (if the record carries exc_info)
    - Extra fields from log record
    """

    # Attributes that belong to every LogRecord (or are injected by the
    # OpenTelemetry logging integration) and therefore must not be reported
    # as user-supplied "extra" fields. "asctime" is included because another
    # formatter attached to the same logger may already have set it on the
    # shared record object.
    _STANDARD_ATTRS = frozenset(
        {
            "name",
            "msg",
            "args",
            "created",
            "filename",
            "funcName",
            "levelname",
            "levelno",
            "lineno",
            "module",
            "msecs",
            "pathname",
            "process",
            "processName",
            "relativeCreated",
            "stack_info",
            "exc_info",
            "exc_text",
            "thread",
            "threadName",
            "taskName",
            "message",
            "asctime",
            "otelTraceID",
            "otelSpanID",
            "otelTraceSampled",
            "otelServiceName",
        }
    )

    def format(self, record: logging.LogRecord) -> str:
        """
        Serialize *record* to a JSON string.

        Values that ``json`` cannot encode natively are converted with
        ``str`` (via ``default=str``).
        """
        # Use the record's own creation time rather than "now": a record may
        # be formatted later than it was emitted (queue/memory handlers) or
        # by several handlers, and the timestamp must stay the same.
        created_at = datetime.fromtimestamp(record.created, tz=timezone.utc)

        log_data: dict[str, Any] = {
            "timestamp": created_at.isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "logger": record.name,
        }

        # Add trace context if available (injected by OpenTelemetry
        # LoggingInstrumentor). A value of "0" is treated as "no trace/span".
        trace_id = getattr(record, "otelTraceID", "0")
        if trace_id != "0":
            log_data["trace_id"] = trace_id
        span_id = getattr(record, "otelSpanID", "0")
        if span_id != "0":
            log_data["span_id"] = span_id

        # Add exception info if present
        if record.exc_info:
            log_data["exception"] = self.formatException(record.exc_info)

        # Add extra fields (excluding standard LogRecord attributes and
        # private attributes). Extras are written last, so an extra whose
        # name matches one of the keys above replaces that value.
        for key, value in record.__dict__.items():
            if key not in self._STANDARD_ATTRS and not key.startswith("_"):
                log_data[key] = value

        return json.dumps(log_data, default=str)
class DevelopmentFormatter(logging.Formatter):
    """
    Human-readable formatter for development with color support.

    Format: [TIME] LEVEL logger - message [trace_id]

    TIME is the record's creation time in UTC (HH:MM:SS.mmm). The level name
    is wrapped in ANSI color codes, and the first 8 characters of the
    OpenTelemetry trace ID are appended when one is present on the record.
    """

    # ANSI escape sequences keyed by level name.
    COLORS = {
        "DEBUG": "\033[36m",  # Cyan
        "INFO": "\033[32m",  # Green
        "WARNING": "\033[33m",  # Yellow
        "ERROR": "\033[31m",  # Red
        "CRITICAL": "\033[35m",  # Magenta
    }
    RESET = "\033[0m"

    def format(self, record: logging.LogRecord) -> str:
        """Render *record* as a single colored line, plus traceback if any."""
        # Unknown level names get no color; the reset code is still emitted.
        color = self.COLORS.get(record.levelname, "")
        reset = self.RESET

        # Format timestamp from the record's creation time (not "now") so the
        # printed time is when the event happened, even if formatting is
        # delayed. "%f" yields microseconds; drop the last 3 digits for ms.
        created_at = datetime.fromtimestamp(record.created, tz=timezone.utc)
        timestamp = created_at.strftime("%H:%M:%S.%f")[:-3]

        # Build message
        msg = (
            f"[{timestamp}] {color}{record.levelname:8}{reset} "
            f"{record.name} - {record.getMessage()}"
        )

        # Add trace context if available ("0" is treated as "no trace")
        trace_id = getattr(record, "otelTraceID", "0")
        if trace_id != "0":
            msg += f" [{trace_id[:8]}...]"

        # Add exception if present
        if record.exc_info:
            msg += f"\n{self.formatException(record.exc_info)}"

        return msg
def setup_logging() -> None:
    """
    Install the application's root logging configuration.

    - Root level comes from ``settings.otel_log_level`` (INFO if the name is
      not an attribute of the ``logging`` module)
    - ``JSONFormatter`` is used when OTEL is enabled and debug is off,
      ``DevelopmentFormatter`` otherwise
    - All handlers already on the root logger are replaced by a single
      stdout stream handler
    """
    level_name = settings.otel_log_level
    resolved_level = getattr(logging, level_name.upper(), logging.INFO)

    # Structured output only for OTEL-enabled, non-debug runs.
    use_json = settings.otel_enabled and not settings.debug
    chosen_formatter = JSONFormatter() if use_json else DevelopmentFormatter()

    root = logging.getLogger()
    root.setLevel(resolved_level)

    # Drop whatever handlers were installed before us (iterate over a copy,
    # since removeHandler mutates the list).
    for stale in list(root.handlers):
        root.removeHandler(stale)

    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(chosen_formatter)
    root.addHandler(stdout_handler)

    # Reduce noise from third-party libraries (uvicorn access stays at INFO
    # so requests are logged).
    library_levels = (
        ("uvicorn.access", logging.INFO),
        ("asyncpg", logging.WARNING),
        ("httpx", logging.WARNING),
        ("httpcore", logging.WARNING),
    )
    for logger_name, level in library_levels:
        logging.getLogger(logger_name).setLevel(level)

    summary = {
        "log_level": settings.otel_log_level,
        "format": "json" if use_json else "dev",
    }
    logging.info("Logging configured", extra=summary)
def get_logger(name: str) -> logging.Logger:
    """Return the ``logging`` logger registered under *name*."""
    named_logger = logging.getLogger(name)
    return named_logger