Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
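Wiring sketch (illustrative, not part of the module below): only setup_telemetry and shutdown_telemetry come from the file that follows; the app.telemetry import path and the FastAPI lifespan shape are assumptions about how the application glues them together.

    from contextlib import asynccontextmanager

    from fastapi import FastAPI

    from app.telemetry import setup_telemetry, shutdown_telemetry  # assumed module path


    @asynccontextmanager
    async def lifespan(app: FastAPI):
        yield
        # Flush any buffered spans and metrics before the process exits.
        await shutdown_telemetry()


    app = FastAPI(lifespan=lifespan)
    # Instrument once, before the app starts serving traffic, so the FastAPI
    # auto-instrumentation middleware is in place before the middleware stack is built.
    setup_telemetry(app)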
272 lines · 9.0 KiB · Python
"""OpenTelemetry instrumentation for tracing, metrics, and logging."""
|
|
|
|
import logging
|
|
from contextlib import contextmanager
|
|
from typing import Any
|
|
|
|
from opentelemetry import metrics, trace
|
|
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
|
|
from opentelemetry.exporter.prometheus import PrometheusMetricReader
|
|
from opentelemetry.instrumentation.asyncpg import AsyncPGInstrumentor
|
|
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
|
|
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
|
|
from opentelemetry.instrumentation.logging import LoggingInstrumentor
|
|
from opentelemetry.instrumentation.redis import RedisInstrumentor
|
|
from opentelemetry.instrumentation.system_metrics import SystemMetricsInstrumentor
|
|
from opentelemetry.sdk.metrics import MeterProvider
|
|
from opentelemetry.sdk.resources import Resource
|
|
from opentelemetry.sdk.trace import TracerProvider
|
|
from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
|
|
from opentelemetry.semconv.resource import ResourceAttributes
|
|
from prometheus_client import REGISTRY, start_http_server
|
|
|
|
from app.config import settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_tracer_provider: TracerProvider | None = None
|
|
_meter_provider: MeterProvider | None = None
|
|
|
|
# Custom metrics
|
|
_request_counter = None
|
|
_request_duration = None
|
|
_active_requests = None
|
|
_error_counter = None
|
|
|
|
|
|
def setup_telemetry(app: Any) -> None:
|
|
"""
|
|
Initialize OpenTelemetry with tracing, metrics, and logging instrumentation.
|
|
|
|
Configures:
|
|
- OTLP exporter for traces (to Tempo/Jaeger)
|
|
- Prometheus exporter for metrics (scraped by Prometheus)
|
|
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
|
|
- System metrics (CPU, memory, etc.)
|
|
- Logging instrumentation for trace context injection
|
|
"""
|
|
global _tracer_provider, _meter_provider
|
|
global _request_counter, _request_duration, _active_requests, _error_counter
|
|
|
|
if not settings.otel_enabled:
|
|
logger.info("OpenTelemetry disabled")
|
|
return
|
|
|
|
# Create resource with service info
|
|
resource = Resource.create(
|
|
{
|
|
ResourceAttributes.SERVICE_NAME: settings.otel_service_name,
|
|
ResourceAttributes.SERVICE_VERSION: "0.1.0",
|
|
ResourceAttributes.DEPLOYMENT_ENVIRONMENT: settings.otel_environment,
|
|
}
|
|
)
|
|
|
|
# =========================================
|
|
# TRACING SETUP
|
|
# =========================================
|
|
_tracer_provider = TracerProvider(resource=resource)
|
|
|
|
if settings.otel_exporter_otlp_endpoint:
|
|
otlp_exporter = OTLPSpanExporter(
|
|
endpoint=settings.otel_exporter_otlp_endpoint,
|
|
insecure=settings.otel_exporter_otlp_insecure,
|
|
)
|
|
_tracer_provider.add_span_processor(BatchSpanProcessor(otlp_exporter))
|
|
logger.info(f"OTLP exporter configured: {settings.otel_exporter_otlp_endpoint}")
|
|
else:
|
|
_tracer_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
|
|
logger.info("Console span exporter configured (no OTLP endpoint)")
|
|
|
|
trace.set_tracer_provider(_tracer_provider)
|
|
|
|
# =========================================
|
|
# METRICS SETUP
|
|
# =========================================
|
|
# Prometheus metric reader exposes metrics at /metrics endpoint
|
|
prometheus_reader = PrometheusMetricReader()
|
|
_meter_provider = MeterProvider(resource=resource, metric_readers=[prometheus_reader])
|
|
metrics.set_meter_provider(_meter_provider)
|
|
|
|
# Start Prometheus HTTP server on port 9464
|
|
prometheus_port = settings.prometheus_port
|
|
try:
|
|
start_http_server(port=prometheus_port, registry=REGISTRY)
|
|
logger.info(f"Prometheus metrics server started on port {prometheus_port}")
|
|
except OSError as e:
|
|
logger.warning(f"Could not start Prometheus server on port {prometheus_port}: {e}")
|
|
|
|
# Create custom metrics
|
|
meter = metrics.get_meter(__name__)
|
|
|
|
_request_counter = meter.create_counter(
|
|
name="http_requests_total",
|
|
description="Total number of HTTP requests",
|
|
unit="1",
|
|
)
|
|
|
|
_request_duration = meter.create_histogram(
|
|
name="http_request_duration_seconds",
|
|
description="HTTP request duration in seconds",
|
|
unit="s",
|
|
)
|
|
|
|
_active_requests = meter.create_up_down_counter(
|
|
name="http_requests_active",
|
|
description="Number of active HTTP requests",
|
|
unit="1",
|
|
)
|
|
|
|
_error_counter = meter.create_counter(
|
|
name="http_errors_total",
|
|
description="Total number of HTTP errors",
|
|
unit="1",
|
|
)
|
|
|
|
# Instrument system metrics (CPU, memory, etc.)
|
|
SystemMetricsInstrumentor().instrument()
|
|
logger.info("System metrics instrumentation enabled")
|
|
|
|
# =========================================
|
|
# LIBRARY INSTRUMENTATION
|
|
# =========================================
|
|
FastAPIInstrumentor.instrument_app(
|
|
app,
|
|
excluded_urls="healthz,readyz,metrics",
|
|
tracer_provider=_tracer_provider,
|
|
meter_provider=_meter_provider,
|
|
)
|
|
AsyncPGInstrumentor().instrument(tracer_provider=_tracer_provider)
|
|
HTTPXClientInstrumentor().instrument(tracer_provider=_tracer_provider)
|
|
RedisInstrumentor().instrument(tracer_provider=_tracer_provider)
|
|
|
|
# Inject trace context into logs
|
|
LoggingInstrumentor().instrument(
|
|
set_logging_format=True,
|
|
log_level=logging.INFO,
|
|
)
|
|
|
|
logger.info(
|
|
f"OpenTelemetry initialized: service={settings.otel_service_name}, "
|
|
f"env={settings.otel_environment}, metrics_port={prometheus_port}"
|
|
)
|
|
|
|
|
|
async def shutdown_telemetry() -> None:
|
|
"""Gracefully shutdown the tracer and meter providers."""
|
|
global _tracer_provider, _meter_provider
|
|
|
|
if _tracer_provider:
|
|
_tracer_provider.shutdown()
|
|
_tracer_provider = None
|
|
logger.info("Tracer provider shutdown complete")
|
|
|
|
if _meter_provider:
|
|
_meter_provider.shutdown()
|
|
_meter_provider = None
|
|
logger.info("Meter provider shutdown complete")
|
|
|
|
|
|
def get_tracer(name: str) -> trace.Tracer:
|
|
"""Get a tracer instance for manual span creation."""
|
|
return trace.get_tracer(name)
|
|
|
|
|
|
def get_meter(name: str) -> metrics.Meter:
|
|
"""Get a meter instance for custom metrics."""
|
|
return metrics.get_meter(name)
|
|
|
|
|
|
def get_current_trace_id() -> str | None:
|
|
"""Get the current trace ID for request correlation."""
|
|
span = trace.get_current_span()
|
|
if span and span.get_span_context().is_valid:
|
|
return format(span.get_span_context().trace_id, "032x")
|
|
return None
|
|
|
|
|
|
def get_current_span_id() -> str | None:
|
|
"""Get the current span ID."""
|
|
span = trace.get_current_span()
|
|
if span and span.get_span_context().is_valid:
|
|
return format(span.get_span_context().span_id, "016x")
|
|
return None
|
|
|
|
|
|
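# Example (illustrative, not part of the original module): attach the current trace
# and span IDs to structured log records so log lines in Loki can be correlated with
# traces in Tempo. The message and extra fields here are placeholders.
#
#     logger.info(
#         "incident acknowledged",
#         extra={"trace_id": get_current_trace_id(), "span_id": get_current_span_id()},
#     )
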
@contextmanager
def create_span(name: str, attributes: dict[str, Any] | None = None):
    """Context manager for creating manual spans."""
    tracer = get_tracer(__name__)
    with tracer.start_as_current_span(name, attributes=attributes) as span:
        yield span


def add_span_attributes(attributes: dict[str, Any]) -> None:
    """Add attributes to the current span."""
    span = trace.get_current_span()
    if span:
        for key, value in attributes.items():
            span.set_attribute(key, value)


def record_exception(exception: Exception) -> None:
    """Record an exception on the current span."""
    span = trace.get_current_span()
    if span:
        span.record_exception(exception)
        span.set_status(trace.Status(trace.StatusCode.ERROR, str(exception)))

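# Example (illustrative, not part of the original module): wrap a unit of work in a
# manual span, tag it, and record failures. `incident_id`, `client`, `url`, and
# `payload` are placeholders, not names defined here.
#
#     with create_span("notifications.send_webhook", {"incident.id": incident_id}):
#         try:
#             response = await client.post(url, json=payload)
#             add_span_attributes({"http.status_code": response.status_code})
#         except Exception as exc:
#             record_exception(exc)
#             raise
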
# =========================================
# CUSTOM METRICS HELPERS
# =========================================


def record_request(method: str, endpoint: str, status_code: int) -> None:
    """Record a request metric."""
    if _request_counter:
        _request_counter.add(
            1,
            {
                "method": method,
                "endpoint": endpoint,
                "status_code": str(status_code),
            },
        )


def record_request_duration(method: str, endpoint: str, duration: float) -> None:
    """Record request duration in seconds."""
    if _request_duration:
        _request_duration.record(
            duration,
            {
                "method": method,
                "endpoint": endpoint,
            },
        )


def increment_active_requests(method: str, endpoint: str) -> None:
    """Increment active requests counter."""
    if _active_requests:
        _active_requests.add(1, {"method": method, "endpoint": endpoint})


def decrement_active_requests(method: str, endpoint: str) -> None:
    """Decrement active requests counter."""
    if _active_requests:
        _active_requests.add(-1, {"method": method, "endpoint": endpoint})


def record_error(method: str, endpoint: str, error_type: str) -> None:
    """Record an error metric."""
    if _error_counter:
        _error_counter.add(
            1,
            {
                "method": method,
                "endpoint": endpoint,
                "error_type": error_type,
            },
        )
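The metric helpers above are intended to be driven from request middleware (the commit message mentions a request logging middleware with timing metrics). A minimal sketch of such a middleware, assuming Starlette's BaseHTTPMiddleware; the class name, import path, and log fields are illustrative rather than the actual implementation:

    import logging
    import time

    from starlette.middleware.base import BaseHTTPMiddleware
    from starlette.requests import Request

    from app.telemetry import (  # assumed module path
        decrement_active_requests,
        get_current_trace_id,
        increment_active_requests,
        record_error,
        record_request,
        record_request_duration,
    )

    request_logger = logging.getLogger("app.request")


    class RequestMetricsMiddleware(BaseHTTPMiddleware):
        """Illustrative middleware: times each request and feeds the custom metrics."""

        async def dispatch(self, request: Request, call_next):
            method = request.method
            endpoint = request.url.path
            increment_active_requests(method, endpoint)
            start = time.perf_counter()
            try:
                response = await call_next(request)
            except Exception as exc:
                record_error(method, endpoint, type(exc).__name__)
                raise
            finally:
                decrement_active_requests(method, endpoint)
                record_request_duration(method, endpoint, time.perf_counter() - start)
            record_request(method, endpoint, response.status_code)
            # Structured logs can carry the trace ID for correlation across Loki and Tempo.
            request_logger.info(
                "request handled",
                extra={"trace_id": get_current_trace_id(), "path": endpoint},
            )
            return response

It would be registered on the application with app.add_middleware(RequestMetricsMiddleware).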