feat: add observability stack and background task infrastructure

Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
This commit is contained in:
2026-01-07 20:51:13 -05:00
parent f427d191e0
commit 46ede7757d
45 changed files with 3742 additions and 76 deletions

164
app/core/logging.py Normal file
View File

@@ -0,0 +1,164 @@
"""Structured JSON logging configuration with OpenTelemetry integration."""
import json
import logging
import sys
from datetime import datetime, timezone
from typing import Any
from app.config import settings
class JSONFormatter(logging.Formatter):
    """
    JSON log formatter that outputs structured logs with trace context.

    Each record is rendered as one JSON object containing:
    - timestamp: ISO 8601 UTC time at which the record was created
    - level: Log level name
    - message: Log message (with %-style args applied)
    - logger: Logger name
    - trace_id: OpenTelemetry trace ID (if available)
    - span_id: OpenTelemetry span ID (if available)
    - exception: Formatted traceback (if the record carries exc_info)
    - stack_info: Formatted stack (if the record carries stack_info)
    - Extra fields from log record
    """

    # Attributes that belong to every LogRecord (plus the ones the
    # OpenTelemetry LoggingInstrumentor injects). Anything else found in
    # record.__dict__ is treated as a caller-supplied "extra" field.
    # Built once at class level instead of on every format() call.
    _STANDARD_ATTRS = frozenset(
        {
            "name",
            "msg",
            "args",
            "created",
            "filename",
            "funcName",
            "levelname",
            "levelno",
            "lineno",
            "module",
            "msecs",
            "pathname",
            "process",
            "processName",
            "relativeCreated",
            "stack_info",
            "exc_info",
            "exc_text",
            "thread",
            "threadName",
            "taskName",
            "message",
            # Set on the record by any other Formatter that uses %(asctime)s;
            # it is not a user extra and must not leak into the output.
            "asctime",
            "otelTraceID",
            "otelSpanID",
            "otelTraceSampled",
            "otelServiceName",
        }
    )

    def format(self, record: logging.LogRecord) -> str:
        """Serialize *record* to a single JSON string.

        Values that ``json`` cannot encode natively are converted with ``str``.
        """
        # Use the record's own creation time rather than "now": the record may
        # be formatted later than it was emitted (queued/buffered handlers).
        event_time = datetime.fromtimestamp(record.created, tz=timezone.utc)
        log_data: dict[str, Any] = {
            "timestamp": event_time.isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "logger": record.name,
        }

        # Add trace context if available (injected by OpenTelemetry
        # LoggingInstrumentor); the value "0" is treated as "no active span".
        if hasattr(record, "otelTraceID") and record.otelTraceID != "0":
            log_data["trace_id"] = record.otelTraceID
        if hasattr(record, "otelSpanID") and record.otelSpanID != "0":
            log_data["span_id"] = record.otelSpanID

        # Add exception info if present
        if record.exc_info:
            log_data["exception"] = self.formatException(record.exc_info)

        # Add stack info if the caller requested it (stack_info=True)
        if record.stack_info:
            log_data["stack_info"] = self.formatStack(record.stack_info)

        # Add extra fields (excluding standard LogRecord attributes)
        for key, value in record.__dict__.items():
            if key not in self._STANDARD_ATTRS and not key.startswith("_"):
                log_data[key] = value

        return json.dumps(log_data, default=str)
class DevelopmentFormatter(logging.Formatter):
    """
    Human-readable formatter for development with color support.

    Format: [TIME] LEVEL logger - message [trace_id]

    TIME is the record's creation time in UTC with millisecond precision.
    The trace ID suffix shows the first 8 characters of the OpenTelemetry
    trace ID and is omitted when no trace ID is attached to the record.
    """

    # ANSI color escape per level name; unknown levels are printed uncolored.
    COLORS = {
        "DEBUG": "\033[36m",  # Cyan
        "INFO": "\033[32m",  # Green
        "WARNING": "\033[33m",  # Yellow
        "ERROR": "\033[31m",  # Red
        "CRITICAL": "\033[35m",  # Magenta
    }
    RESET = "\033[0m"

    def format(self, record: logging.LogRecord) -> str:
        """Render *record* as one colored line, plus a traceback if present."""
        color = self.COLORS.get(record.levelname, "")
        reset = self.RESET

        # Format timestamp from the record's creation time (not "now"), so a
        # record formatted late still shows when the event happened.
        # "%f" yields microseconds; dropping three digits leaves milliseconds.
        event_time = datetime.fromtimestamp(record.created, tz=timezone.utc)
        timestamp = event_time.strftime("%H:%M:%S.%f")[:-3]

        # Build message
        msg = f"[{timestamp}] {color}{record.levelname:8}{reset} {record.name} - {record.getMessage()}"

        # Add trace context if available ("0" is treated as no active span)
        if hasattr(record, "otelTraceID") and record.otelTraceID != "0":
            msg += f" [{record.otelTraceID[:8]}...]"

        # Add exception if present
        if record.exc_info:
            msg += f"\n{self.formatException(record.exc_info)}"

        return msg
def setup_logging() -> None:
    """
    Install the application's root logging configuration.

    - Root level comes from ``settings.otel_log_level`` (INFO if the name is
      not an attribute of the ``logging`` module)
    - JSON output when OTEL is enabled and debug is off, otherwise the
      human-readable development format
    - All existing root handlers are replaced by a single stdout handler
    """
    # Single source of truth for which formatter is in use
    use_json = settings.otel_enabled and not settings.debug
    level = getattr(logging, settings.otel_log_level.upper(), logging.INFO)

    root = logging.getLogger()
    root.setLevel(level)

    # Drop whatever handlers were installed before us (iterate over a copy,
    # since removeHandler mutates the list)
    for existing in list(root.handlers):
        root.removeHandler(existing)

    # Everything goes to stdout through one handler
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(JSONFormatter() if use_json else DevelopmentFormatter())
    root.addHandler(stdout_handler)

    # Per-library levels: uvicorn access stays at INFO so requests are
    # logged; the chattier client libraries only report warnings and above
    logging.getLogger("uvicorn.access").setLevel(logging.INFO)
    for library in ("asyncpg", "httpx", "httpcore"):
        logging.getLogger(library).setLevel(logging.WARNING)

    summary = {
        "log_level": settings.otel_log_level,
        "format": "json" if use_json else "dev",
    }
    logging.info("Logging configured", extra=summary)
def get_logger(name: str) -> logging.Logger:
    """Return the standard-library logger registered under *name*."""
    named_logger = logging.getLogger(name)
    return named_logger

271
app/core/telemetry.py Normal file
View File

@@ -0,0 +1,271 @@
"""OpenTelemetry instrumentation for tracing, metrics, and logging."""
import logging
from contextlib import contextmanager
from typing import Any
from opentelemetry import metrics, trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.exporter.prometheus import PrometheusMetricReader
from opentelemetry.instrumentation.asyncpg import AsyncPGInstrumentor
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.logging import LoggingInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.system_metrics import SystemMetricsInstrumentor
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
from opentelemetry.semconv.resource import ResourceAttributes
from prometheus_client import REGISTRY, start_http_server
from app.config import settings
logger = logging.getLogger(__name__)
_tracer_provider: TracerProvider | None = None
_meter_provider: MeterProvider | None = None
# Custom metrics
_request_counter = None
_request_duration = None
_active_requests = None
_error_counter = None
def setup_telemetry(app: Any) -> None:
    """
    Wire up OpenTelemetry tracing, metrics and log correlation for *app*.

    Does nothing (beyond one log line) when ``settings.otel_enabled`` is off.
    Otherwise it, in order:
    - builds a Resource describing the service (name, version, environment)
    - installs a TracerProvider exporting via OTLP, or to the console when no
      OTLP endpoint is configured
    - installs a MeterProvider backed by a Prometheus reader and starts the
      Prometheus HTTP server on ``settings.prometheus_port``
    - creates the module-level HTTP request/error instruments
    - instruments system metrics, FastAPI, asyncpg, httpx, redis and logging

    The providers and instruments are stored in module globals so the helper
    functions in this module can use them.
    """
    global _tracer_provider, _meter_provider
    global _request_counter, _request_duration, _active_requests, _error_counter

    if not settings.otel_enabled:
        logger.info("OpenTelemetry disabled")
        return

    # Service identity shared by both providers
    service_attributes = {
        ResourceAttributes.SERVICE_NAME: settings.otel_service_name,
        ResourceAttributes.SERVICE_VERSION: "0.1.0",
        ResourceAttributes.DEPLOYMENT_ENVIRONMENT: settings.otel_environment,
    }
    resource = Resource.create(service_attributes)

    # ----- Tracing -----
    _tracer_provider = TracerProvider(resource=resource)

    if not settings.otel_exporter_otlp_endpoint:
        # No collector configured: fall back to printing spans
        _tracer_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
        logger.info("Console span exporter configured (no OTLP endpoint)")
    else:
        span_exporter = OTLPSpanExporter(
            endpoint=settings.otel_exporter_otlp_endpoint,
            insecure=settings.otel_exporter_otlp_insecure,
        )
        _tracer_provider.add_span_processor(BatchSpanProcessor(span_exporter))
        logger.info(f"OTLP exporter configured: {settings.otel_exporter_otlp_endpoint}")

    trace.set_tracer_provider(_tracer_provider)

    # ----- Metrics -----
    # The Prometheus reader feeds the prometheus_client registry that the
    # HTTP server below exposes
    _meter_provider = MeterProvider(
        resource=resource,
        metric_readers=[PrometheusMetricReader()],
    )
    metrics.set_meter_provider(_meter_provider)

    # Serve metrics on the configured port; a failure to bind is logged and
    # setup continues without the endpoint
    prometheus_port = settings.prometheus_port
    try:
        start_http_server(port=prometheus_port, registry=REGISTRY)
        logger.info(f"Prometheus metrics server started on port {prometheus_port}")
    except OSError as e:
        logger.warning(f"Could not start Prometheus server on port {prometheus_port}: {e}")

    # Application-level HTTP instruments
    app_meter = metrics.get_meter(__name__)
    _request_counter = app_meter.create_counter(
        name="http_requests_total",
        description="Total number of HTTP requests",
        unit="1",
    )
    _request_duration = app_meter.create_histogram(
        name="http_request_duration_seconds",
        description="HTTP request duration in seconds",
        unit="s",
    )
    _active_requests = app_meter.create_up_down_counter(
        name="http_requests_active",
        description="Number of active HTTP requests",
        unit="1",
    )
    _error_counter = app_meter.create_counter(
        name="http_errors_total",
        description="Total number of HTTP errors",
        unit="1",
    )

    # Process/host metrics
    SystemMetricsInstrumentor().instrument()
    logger.info("System metrics instrumentation enabled")

    # ----- Library instrumentation -----
    FastAPIInstrumentor.instrument_app(
        app,
        excluded_urls="healthz,readyz,metrics",
        tracer_provider=_tracer_provider,
        meter_provider=_meter_provider,
    )
    AsyncPGInstrumentor().instrument(tracer_provider=_tracer_provider)
    HTTPXClientInstrumentor().instrument(tracer_provider=_tracer_provider)
    RedisInstrumentor().instrument(tracer_provider=_tracer_provider)

    # Make trace/span IDs available on log records
    LoggingInstrumentor().instrument(
        set_logging_format=True,
        log_level=logging.INFO,
    )

    logger.info(
        f"OpenTelemetry initialized: service={settings.otel_service_name}, "
        f"env={settings.otel_environment}, metrics_port={prometheus_port}"
    )
async def shutdown_telemetry() -> None:
    """Shut down whichever telemetry providers are set and clear the globals.

    The tracer provider is handled first, then the meter provider. Each
    global is reset to ``None`` only after its ``shutdown()`` call returns.
    """
    global _tracer_provider, _meter_provider

    active_tracer_provider = _tracer_provider
    if active_tracer_provider:
        active_tracer_provider.shutdown()
        _tracer_provider = None
        logger.info("Tracer provider shutdown complete")

    active_meter_provider = _meter_provider
    if active_meter_provider:
        active_meter_provider.shutdown()
        _meter_provider = None
        logger.info("Meter provider shutdown complete")
def get_tracer(name: str) -> trace.Tracer:
    """Return the tracer that ``opentelemetry.trace`` provides for *name*."""
    named_tracer = trace.get_tracer(name)
    return named_tracer
def get_meter(name: str) -> metrics.Meter:
    """Return the meter that ``opentelemetry.metrics`` provides for *name*."""
    named_meter = metrics.get_meter(name)
    return named_meter
def get_current_trace_id() -> str | None:
    """Return the current span's trace ID as 32 lowercase hex digits.

    Returns ``None`` when there is no current span or its context is invalid.
    """
    current = trace.get_current_span()
    if not current:
        return None

    context = current.get_span_context()
    if not context.is_valid:
        return None

    # Zero-padded to 32 hex characters
    return f"{context.trace_id:032x}"
def get_current_span_id() -> str | None:
    """Return the current span's ID as 16 lowercase hex digits.

    Returns ``None`` when there is no current span or its context is invalid.
    """
    current = trace.get_current_span()
    if not current:
        return None

    context = current.get_span_context()
    if not context.is_valid:
        return None

    # Zero-padded to 16 hex characters
    return f"{context.span_id:016x}"
@contextmanager
def create_span(name: str, attributes: dict[str, Any] | None = None):
    """Open a span called *name* as the current span for the ``with`` body.

    The span is started on this module's tracer with the given *attributes*
    and is yielded to the caller; it is closed when the block exits.
    """
    span_scope = get_tracer(__name__).start_as_current_span(name, attributes=attributes)
    with span_scope as active_span:
        yield active_span
def add_span_attributes(attributes: dict[str, Any]) -> None:
    """Set every key/value pair of *attributes* on the current span."""
    current = trace.get_current_span()
    if not current:
        return

    for attr_name, attr_value in attributes.items():
        current.set_attribute(attr_name, attr_value)
def record_exception(exception: Exception) -> None:
    """Attach *exception* to the current span and mark the span as errored.

    The span status description is ``str(exception)``.
    """
    current = trace.get_current_span()
    if not current:
        return

    current.record_exception(exception)
    error_status = trace.Status(trace.StatusCode.ERROR, str(exception))
    current.set_status(error_status)
# =========================================
# CUSTOM METRICS HELPERS
# =========================================
def record_request(method: str, endpoint: str, status_code: int) -> None:
    """Count one HTTP request, labelled by method, endpoint and status code.

    Does nothing while the request counter has not been created.
    """
    if not _request_counter:
        return

    labels = {
        "method": method,
        "endpoint": endpoint,
        # Recorded as a string label, not a number
        "status_code": str(status_code),
    }
    _request_counter.add(1, labels)
def record_request_duration(method: str, endpoint: str, duration: float) -> None:
    """Record *duration* (seconds) in the request-duration histogram.

    Does nothing while the histogram has not been created.
    """
    if not _request_duration:
        return

    labels = {"method": method, "endpoint": endpoint}
    _request_duration.record(duration, labels)
def increment_active_requests(method: str, endpoint: str) -> None:
    """Add one to the active-requests counter for this method/endpoint.

    Does nothing while the counter has not been created.
    """
    if not _active_requests:
        return

    labels = {"method": method, "endpoint": endpoint}
    _active_requests.add(1, labels)
def decrement_active_requests(method: str, endpoint: str) -> None:
    """Subtract one from the active-requests counter for this method/endpoint.

    Does nothing while the counter has not been created.
    """
    if not _active_requests:
        return

    labels = {"method": method, "endpoint": endpoint}
    _active_requests.add(-1, labels)
def record_error(method: str, endpoint: str, error_type: str) -> None:
    """Count one HTTP error, labelled by method, endpoint and error type.

    Does nothing while the error counter has not been created.
    """
    if not _error_counter:
        return

    labels = {
        "method": method,
        "endpoint": endpoint,
        "error_type": error_type,
    }
    _error_counter.add(1, labels)