Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation (see the formatter sketch below)
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing (see the sketch below)

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
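The logging module itself is not shown here. As a rough illustration of the trace-correlation idea only, a JSON formatter can pull the active span context from the OpenTelemetry API and attach the ids to every record; this is a minimal sketch, not the repository's actual formatter:

"""Sketch of JSON log formatting with trace correlation (illustrative only)."""

import json
import logging

from opentelemetry import trace


class JSONFormatter(logging.Formatter):
    """Render log records as JSON, attaching the current trace context."""

    def format(self, record: logging.LogRecord) -> str:
        payload = {
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
        }
        ctx = trace.get_current_span().get_span_context()
        if ctx.is_valid:
            # Hex-encode the ids so they match what Tempo shows for the same request.
            payload["trace_id"] = format(ctx.trace_id, "032x")
            payload["span_id"] = format(ctx.span_id, "016x")
        return json.dumps(payload)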
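Likewise, a minimal sketch of what a pluggable TaskQueue abstraction with an in-memory test driver might look like, assuming a protocol named TaskQueue with a single enqueue method; all names are illustrative, not the actual implementation:

"""Hypothetical sketch of a pluggable task queue (names are illustrative)."""

from typing import Any, Protocol

from celery import Celery


class TaskQueue(Protocol):
    """Minimal interface the application code could depend on."""

    def enqueue(self, task_name: str, payload: dict[str, Any], queue: str = "default") -> None:
        ...


class InMemoryTaskQueue:
    """Test driver: records enqueued tasks instead of dispatching them."""

    def __init__(self) -> None:
        self.enqueued: list[tuple[str, dict[str, Any], str]] = []

    def enqueue(self, task_name: str, payload: dict[str, Any], queue: str = "default") -> None:
        self.enqueued.append((task_name, payload, queue))


class CeleryTaskQueue:
    """Production driver: dispatches tasks to Celery by registered name."""

    def __init__(self, app: Celery) -> None:
        self._app = app

    def enqueue(self, task_name: str, payload: dict[str, Any], queue: str = "default") -> None:
        self._app.send_task(task_name, kwargs=payload, queue=queue)

The in-memory driver lets notification tests assert on enqueued tasks without running a broker; which driver gets constructed would follow the task_queue_driver setting in the configuration below.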
"""Application configuration via pydantic-settings."""
|
|
|
|
from typing import Literal
|
|
|
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
|
|
|
|
class Settings(BaseSettings):
|
|
"""Application settings loaded from environment variables."""
|
|
|
|
model_config = SettingsConfigDict(
|
|
env_file=".env",
|
|
env_file_encoding="utf-8",
|
|
case_sensitive=False,
|
|
)
|
|
|
|
# Database
|
|
database_url: str
|
|
|
|
# Redis (legacy default for Celery broker)
|
|
redis_url: str = "redis://localhost:6379/0"
|
|
|
|
# Task queue
|
|
task_queue_driver: Literal["celery", "inmemory"] = "celery"
|
|
task_queue_broker_url: str | None = None
|
|
task_queue_backend: Literal["redis", "sqs"] = "redis"
|
|
task_queue_default_queue: str = "default"
|
|
task_queue_critical_queue: str = "critical"
|
|
task_queue_visibility_timeout: int = 600
|
|
task_queue_polling_interval: float = 1.0
|
|
notification_escalation_delay_seconds: int = 900
|
|
|
|
# AWS (used when task_queue_backend="sqs")
|
|
aws_region: str | None = None
|
|
|
|
# JWT
|
|
jwt_secret_key: str
|
|
jwt_algorithm: str = "HS256"
|
|
jwt_issuer: str = "incidentops"
|
|
jwt_audience: str = "incidentops-api"
|
|
access_token_expire_minutes: int = 15
|
|
refresh_token_expire_days: int = 30
|
|
|
|
# Application
|
|
debug: bool = False
|
|
api_v1_prefix: str = "/v1"
|
|
|
|
# OpenTelemetry
|
|
otel_enabled: bool = True
|
|
otel_service_name: str = "incidentops-api"
|
|
otel_environment: str = "development"
|
|
otel_exporter_otlp_endpoint: str | None = None # e.g., "http://tempo:4317"
|
|
otel_exporter_otlp_insecure: bool = True
|
|
otel_log_level: str = "INFO"
|
|
|
|
# Metrics
|
|
prometheus_port: int = 9464 # Port for Prometheus metrics endpoint
|
|
|
|
@property
|
|
def resolved_task_queue_broker_url(self) -> str:
|
|
"""Return the broker URL with redis fallback for backwards compatibility."""
|
|
|
|
return self.task_queue_broker_url or self.redis_url
|
|
|
|
|
|
settings = Settings() # type: ignore[call-arg]
|
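For context, a hedged sketch of how these settings might be consumed when constructing the Celery app; the app.config import path and module layout are assumptions, while task_default_queue and broker_transport_options are standard Celery/kombu options:

"""Illustrative wiring of Settings into a Celery app (module paths assumed)."""

from celery import Celery

from app.config import settings  # assumed location of the Settings instance above

celery_app = Celery(
    "incidentops",
    broker=settings.resolved_task_queue_broker_url,  # falls back to redis_url when unset
)
celery_app.conf.task_default_queue = settings.task_queue_default_queue
celery_app.conf.broker_transport_options = {
    # Honoured by the SQS transport; the redis transport also accepts visibility_timeout.
    "visibility_timeout": settings.task_queue_visibility_timeout,
    "polling_interval": settings.task_queue_polling_interval,
}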
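And a sketch of tracer setup driven by the otel_* settings, using the standard opentelemetry-python SDK and OTLP gRPC exporter; the configure_tracing function name and its call site are assumptions:

"""Illustrative tracer setup from the otel_* settings (call site assumed)."""

from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

from app.config import settings  # assumed import path


def configure_tracing() -> None:
    """Register a tracer provider that exports to the configured OTLP endpoint."""
    if not settings.otel_enabled or settings.otel_exporter_otlp_endpoint is None:
        return
    resource = Resource.create(
        {
            "service.name": settings.otel_service_name,
            "deployment.environment": settings.otel_environment,
        }
    )
    provider = TracerProvider(resource=resource)
    provider.add_span_processor(
        BatchSpanProcessor(
            OTLPSpanExporter(
                endpoint=settings.otel_exporter_otlp_endpoint,
                insecure=settings.otel_exporter_otlp_insecure,
            )
        )
    )
    trace.set_tracer_provider(provider)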