feat: add observability stack and background task infrastructure
Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
This commit is contained in:
271
app/core/telemetry.py
Normal file
271
app/core/telemetry.py
Normal file
@@ -0,0 +1,271 @@
|
||||
"""OpenTelemetry instrumentation for tracing, metrics, and logging."""
|
||||
|
||||
import logging
|
||||
from contextlib import contextmanager
|
||||
from typing import Any
|
||||
|
||||
from opentelemetry import metrics, trace
|
||||
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
|
||||
from opentelemetry.exporter.prometheus import PrometheusMetricReader
|
||||
from opentelemetry.instrumentation.asyncpg import AsyncPGInstrumentor
|
||||
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
|
||||
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
|
||||
from opentelemetry.instrumentation.logging import LoggingInstrumentor
|
||||
from opentelemetry.instrumentation.redis import RedisInstrumentor
|
||||
from opentelemetry.instrumentation.system_metrics import SystemMetricsInstrumentor
|
||||
from opentelemetry.sdk.metrics import MeterProvider
|
||||
from opentelemetry.sdk.resources import Resource
|
||||
from opentelemetry.sdk.trace import TracerProvider
|
||||
from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
|
||||
from opentelemetry.semconv.resource import ResourceAttributes
|
||||
from prometheus_client import REGISTRY, start_http_server
|
||||
|
||||
from app.config import settings
|
||||
|
||||
# Module-level logger used for telemetry lifecycle messages.
logger = logging.getLogger(__name__)

# Providers created by setup_telemetry(); None until then, and reset to None
# again by shutdown_telemetry().
_tracer_provider: TracerProvider | None = None
_meter_provider: MeterProvider | None = None

# Custom metric instruments, assigned by setup_telemetry(). The record_* and
# *_active_requests helpers skip recording while these are still None.
_request_counter = None
_request_duration = None
_active_requests = None
_error_counter = None
|
||||
|
||||
|
||||
def setup_telemetry(app: Any) -> None:
    """
    Wire OpenTelemetry tracing, metrics and log correlation into the service.

    When ``settings.otel_enabled`` is falsy this only logs a message and
    returns. Otherwise it performs, in order:

    - tracing: a ``TracerProvider`` exporting through OTLP when
      ``settings.otel_exporter_otlp_endpoint`` is set, or to the console when
      it is not
    - metrics: a ``MeterProvider`` backed by a Prometheus reader, plus the
      Prometheus HTTP server on ``settings.prometheus_port``
    - the module-level HTTP request / duration / active / error instruments
    - system-metrics, FastAPI, asyncpg, httpx, redis and logging
      instrumentation

    Args:
        app: Application object passed to ``FastAPIInstrumentor.instrument_app``.
    """
    global _tracer_provider, _meter_provider
    global _request_counter, _request_duration, _active_requests, _error_counter

    if not settings.otel_enabled:
        logger.info("OpenTelemetry disabled")
        return

    # Resource attributes attached to every span and metric from this process.
    service_attributes = {
        ResourceAttributes.SERVICE_NAME: settings.otel_service_name,
        ResourceAttributes.SERVICE_VERSION: "0.1.0",
        ResourceAttributes.DEPLOYMENT_ENVIRONMENT: settings.otel_environment,
    }
    service_resource = Resource.create(service_attributes)

    # ---------------------------------------------------------------
    # Tracing
    # ---------------------------------------------------------------
    _tracer_provider = TracerProvider(resource=service_resource)

    if settings.otel_exporter_otlp_endpoint:
        span_exporter = OTLPSpanExporter(
            endpoint=settings.otel_exporter_otlp_endpoint,
            insecure=settings.otel_exporter_otlp_insecure,
        )
        _tracer_provider.add_span_processor(BatchSpanProcessor(span_exporter))
        logger.info(f"OTLP exporter configured: {settings.otel_exporter_otlp_endpoint}")
    else:
        # No OTLP endpoint configured: fall back to printing spans to the console.
        console_processor = BatchSpanProcessor(ConsoleSpanExporter())
        _tracer_provider.add_span_processor(console_processor)
        logger.info("Console span exporter configured (no OTLP endpoint)")

    trace.set_tracer_provider(_tracer_provider)

    # ---------------------------------------------------------------
    # Metrics
    # ---------------------------------------------------------------
    _meter_provider = MeterProvider(
        resource=service_resource,
        metric_readers=[PrometheusMetricReader()],
    )
    metrics.set_meter_provider(_meter_provider)

    # Serve the Prometheus registry over HTTP on the configured port. A bind
    # failure is logged and setup continues.
    prometheus_port = settings.prometheus_port
    try:
        start_http_server(port=prometheus_port, registry=REGISTRY)
        logger.info(f"Prometheus metrics server started on port {prometheus_port}")
    except OSError as e:
        logger.warning(f"Could not start Prometheus server on port {prometheus_port}: {e}")

    # Instruments used by the record_* helpers in this module.
    http_meter = metrics.get_meter(__name__)
    _request_counter = http_meter.create_counter(
        name="http_requests_total",
        unit="1",
        description="Total number of HTTP requests",
    )
    _request_duration = http_meter.create_histogram(
        name="http_request_duration_seconds",
        unit="s",
        description="HTTP request duration in seconds",
    )
    _active_requests = http_meter.create_up_down_counter(
        name="http_requests_active",
        unit="1",
        description="Number of active HTTP requests",
    )
    _error_counter = http_meter.create_counter(
        name="http_errors_total",
        unit="1",
        description="Total number of HTTP errors",
    )

    SystemMetricsInstrumentor().instrument()
    logger.info("System metrics instrumentation enabled")

    # ---------------------------------------------------------------
    # Library instrumentation
    # ---------------------------------------------------------------
    FastAPIInstrumentor.instrument_app(
        app,
        excluded_urls="healthz,readyz,metrics",
        tracer_provider=_tracer_provider,
        meter_provider=_meter_provider,
    )

    # These client instrumentors only need the tracer provider; the order
    # matches the original call sequence.
    for instrumentor_cls in (AsyncPGInstrumentor, HTTPXClientInstrumentor, RedisInstrumentor):
        instrumentor_cls().instrument(tracer_provider=_tracer_provider)

    # Inject trace context into log records.
    LoggingInstrumentor().instrument(
        set_logging_format=True,
        log_level=logging.INFO,
    )

    logger.info(
        f"OpenTelemetry initialized: service={settings.otel_service_name}, "
        f"env={settings.otel_environment}, metrics_port={prometheus_port}"
    )
|
||||
|
||||
|
||||
async def shutdown_telemetry() -> None:
    """Gracefully shut down the tracer and meter providers.

    Both module-level provider references are cleared before either
    ``shutdown()`` is invoked, and the meter provider is shut down in a
    ``finally`` clause. A failure while shutting down the tracer provider
    therefore no longer skips the meter provider or leaves a stale provider
    behind in the module globals.

    Raises:
        Exception: whatever a provider's ``shutdown()`` raises is propagated
            to the caller after the remaining cleanup has been attempted.
    """
    global _tracer_provider, _meter_provider

    # Detach first so the globals are reset even if a shutdown() call fails.
    tracer_provider, _tracer_provider = _tracer_provider, None
    meter_provider, _meter_provider = _meter_provider, None

    try:
        if tracer_provider is not None:
            tracer_provider.shutdown()
            logger.info("Tracer provider shutdown complete")
    finally:
        # Runs on both the success and the failure path of the tracer shutdown.
        if meter_provider is not None:
            meter_provider.shutdown()
            logger.info("Meter provider shutdown complete")
|
||||
|
||||
|
||||
def get_tracer(name: str) -> trace.Tracer:
    """Return the tracer registered under ``name`` for creating spans by hand.

    Args:
        name: Instrumentation name forwarded to ``trace.get_tracer``.
    """
    tracer = trace.get_tracer(name)
    return tracer
|
||||
|
||||
|
||||
def get_meter(name: str) -> metrics.Meter:
    """Return the meter registered under ``name`` for defining custom metrics.

    Args:
        name: Instrumentation name forwarded to ``metrics.get_meter``.
    """
    meter = metrics.get_meter(name)
    return meter
|
||||
|
||||
|
||||
def get_current_trace_id() -> str | None:
    """Return the active trace ID as 32 lowercase hex digits.

    Returns:
        The zero-padded hex trace ID of the current span, or ``None`` when
        there is no current span or its span context is not valid.
    """
    current = trace.get_current_span()
    if not current:
        return None
    if not current.get_span_context().is_valid:
        return None
    trace_id = current.get_span_context().trace_id
    return f"{trace_id:032x}"
|
||||
|
||||
|
||||
def get_current_span_id() -> str | None:
    """Return the active span ID as 16 lowercase hex digits.

    Returns:
        The zero-padded hex span ID of the current span, or ``None`` when
        there is no current span or its span context is not valid.
    """
    current = trace.get_current_span()
    if not current:
        return None
    if not current.get_span_context().is_valid:
        return None
    span_id = current.get_span_context().span_id
    return f"{span_id:016x}"
|
||||
|
||||
|
||||
@contextmanager
def create_span(name: str, attributes: dict[str, Any] | None = None):
    """Open a span as the current span for the duration of a ``with`` block.

    Args:
        name: Name given to the new span.
        attributes: Optional attributes passed to the span at creation.

    Yields:
        The span object produced by ``start_as_current_span``.
    """
    span_cm = get_tracer(__name__).start_as_current_span(name, attributes=attributes)
    with span_cm as active_span:
        yield active_span
|
||||
|
||||
|
||||
def add_span_attributes(attributes: dict[str, Any]) -> None:
    """Set each key/value pair from ``attributes`` on the current span.

    Args:
        attributes: Mapping of attribute name to value; every entry is applied
            with ``span.set_attribute``.
    """
    current = trace.get_current_span()
    if not current:
        return
    for attr_name, attr_value in attributes.items():
        current.set_attribute(attr_name, attr_value)
|
||||
|
||||
|
||||
def record_exception(exception: Exception) -> None:
    """Attach ``exception`` to the current span and mark the span as failed.

    Args:
        exception: The exception to record; its ``str()`` is used as the
            status description.
    """
    current = trace.get_current_span()
    if not current:
        return
    current.record_exception(exception)
    error_status = trace.Status(trace.StatusCode.ERROR, str(exception))
    current.set_status(error_status)
|
||||
|
||||
|
||||
# =========================================
|
||||
# CUSTOM METRICS HELPERS
|
||||
# =========================================
|
||||
|
||||
|
||||
def record_request(method: str, endpoint: str, status_code: int) -> None:
    """Add one to the HTTP request counter.

    Does nothing while the counter has not been created by ``setup_telemetry``.

    Args:
        method: HTTP method, recorded as the ``method`` attribute.
        endpoint: Endpoint label, recorded as the ``endpoint`` attribute.
        status_code: Response status, recorded as a string in ``status_code``.
    """
    if not _request_counter:
        return
    labels = {
        "method": method,
        "endpoint": endpoint,
        "status_code": str(status_code),
    }
    _request_counter.add(1, labels)
|
||||
|
||||
|
||||
def record_request_duration(method: str, endpoint: str, duration: float) -> None:
    """Record one observation on the request-duration histogram.

    Does nothing while the histogram has not been created by
    ``setup_telemetry``.

    Args:
        method: HTTP method, recorded as the ``method`` attribute.
        endpoint: Endpoint label, recorded as the ``endpoint`` attribute.
        duration: Request duration in seconds.
    """
    if not _request_duration:
        return
    labels = {"method": method, "endpoint": endpoint}
    _request_duration.record(duration, labels)
|
||||
|
||||
|
||||
def increment_active_requests(method: str, endpoint: str) -> None:
    """Add one to the active-requests up/down counter.

    Does nothing while the counter has not been created by ``setup_telemetry``.

    Args:
        method: HTTP method, recorded as the ``method`` attribute.
        endpoint: Endpoint label, recorded as the ``endpoint`` attribute.
    """
    if not _active_requests:
        return
    labels = {"method": method, "endpoint": endpoint}
    _active_requests.add(1, labels)
|
||||
|
||||
|
||||
def decrement_active_requests(method: str, endpoint: str) -> None:
    """Subtract one from the active-requests up/down counter.

    Does nothing while the counter has not been created by ``setup_telemetry``.

    Args:
        method: HTTP method, recorded as the ``method`` attribute.
        endpoint: Endpoint label, recorded as the ``endpoint`` attribute.
    """
    if not _active_requests:
        return
    labels = {"method": method, "endpoint": endpoint}
    _active_requests.add(-1, labels)
|
||||
|
||||
|
||||
def record_error(method: str, endpoint: str, error_type: str) -> None:
    """Add one to the HTTP error counter.

    Does nothing while the counter has not been created by ``setup_telemetry``.

    Args:
        method: HTTP method, recorded as the ``method`` attribute.
        endpoint: Endpoint label, recorded as the ``endpoint`` attribute.
        error_type: Error classification, recorded as ``error_type``.
    """
    if not _error_counter:
        return
    labels = {
        "method": method,
        "endpoint": endpoint,
        "error_type": error_type,
    }
    _error_counter.add(1, labels)
|
||||
Reference in New Issue
Block a user