# Add OpenTelemetry instrumentation with distributed tracing and metrics:
#   - Structured JSON logging with trace context correlation
#   - Auto-instrumentation for FastAPI, asyncpg, httpx, redis
#   - OTLP exporter for traces and Prometheus metrics endpoint
#
# Implement Celery worker and notification task system:
#   - Celery app with Redis/SQS broker support and configurable queues
#   - Notification tasks for incident fan-out, webhooks, and escalations
#   - Pluggable TaskQueue abstraction with in-memory driver for testing
#
# Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
#   - OpenTelemetry Collector for receiving OTLP traces and logs
#   - Tempo for distributed tracing backend
#   - Loki for log aggregation with Promtail DaemonSet
#   - Prometheus for metrics scraping with RBAC configuration
#   - Grafana with pre-provisioned datasources and API overview dashboard
#   - Helm templates for all observability components
#
# Enhance application infrastructure:
#   - Global exception handlers with structured ErrorResponse schema
#   - Request logging middleware with timing metrics
#   - Health check updated to verify task queue connectivity
#   - Non-root user in Dockerfile for security
#   - Init containers in Helm deployments for dependency ordering
#   - Production Helm values with autoscaling and retention policies
# 143 lines, 2.4 KiB, YAML
# Production values for incidentops.
# Use external secrets management in production; do not commit secrets here.
# API service: replica count, autoscaling bounds, and container resources.
api:
  replicaCount: 3
  # Autoscaling range is 3-10 replicas with a 70% CPU utilization target.
  autoscaling:
    enabled: true
    minReplicas: 3
    maxReplicas: 10
    targetCPUUtilizationPercentage: 70
  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 1000m
      memory: 1Gi
# Worker service: replica count, autoscaling bounds, concurrency, and
# container resources.
worker:
  replicaCount: 3
  # Autoscaling range is 3-10 replicas with a 70% CPU utilization target.
  autoscaling:
    enabled: true
    minReplicas: 3
    maxReplicas: 10
    targetCPUUtilizationPercentage: 70
  # NOTE(review): nesting was lost in this copy; concurrency is placed at the
  # worker level (not under autoscaling) - confirm against the chart defaults.
  concurrency: 8
  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 1000m
      memory: 1Gi
# Web service: replica count, autoscaling bounds, and container resources.
web:
  replicaCount: 3
  # Autoscaling range is 3-10 replicas with a 70% CPU utilization target.
  autoscaling:
    enabled: true
    minReplicas: 3
    maxReplicas: 10
    targetCPUUtilizationPercentage: 70
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi
# Ingress: nginx class, cert-manager issuer annotation, and TLS for the
# public host. The host name appears twice (host and tls.hosts) and the two
# entries must be changed together.
ingress:
  enabled: true
  className: nginx
  annotations:
    nginx.ingress.kubernetes.io/proxy-body-size: "10m"
    cert-manager.io/cluster-issuer: letsencrypt-prod
  host: incidentops.example.com
  tls:
    - secretName: incidentops-tls
      hosts:
        - incidentops.example.com
# PostgreSQL: persistent volume size and container resources.
postgresql:
  persistence:
    size: 50Gi
  resources:
    requests:
      cpu: 500m
      memory: 1Gi
    limits:
      cpu: 2000m
      memory: 4Gi
# Redis: persistent volume size and container resources.
redis:
  persistence:
    size: 10Gi
  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 1000m
      memory: 1Gi
# Application configuration: environment name and log level.
config:
  environment: production
  logLevel: INFO
# Observability Stack - Production settings
# NOTE(review): nesting was lost in this copy; confirm whether the
# otelCollector, tempo, loki, and grafana settings belong under this key.
observability:
  enabled: true
# OpenTelemetry Collector: replica count and container resources.
# NOTE(review): nesting was lost in this copy; confirm whether this key
# belongs under observability.
otelCollector:
  replicaCount: 2
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi
# Tempo: retention, persistent volume, and container resources.
# NOTE(review): nesting was lost in this copy; confirm whether this key
# belongs under observability.
tempo:
  retention: "720h"  # 30 days
  persistence:
    enabled: true
    size: 50Gi
  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 1000m
      memory: 2Gi
# Loki: retention, persistent volume, and container resources.
# NOTE(review): nesting was lost in this copy; confirm whether this key
# belongs under observability.
loki:
  retention: "720h"  # 30 days
  persistence:
    enabled: true
    size: 100Gi
  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 1000m
      memory: 2Gi
# Grafana: admin credentials, service type, persistent volume, and container
# resources.
# NOTE(review): nesting was lost in this copy; confirm whether this key
# belongs under observability.
grafana:
  # Intentionally empty here; the real value must come from an external secret.
  adminPassword: ""  # Set via external secret in production
  service:
    type: ClusterIP
  persistence:
    enabled: true
    size: 5Gi
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi