Add OpenTelemetry instrumentation with distributed tracing and metrics: - Structured JSON logging with trace context correlation - Auto-instrumentation for FastAPI, asyncpg, httpx, redis - OTLP exporter for traces and Prometheus metrics endpoint Implement Celery worker and notification task system: - Celery app with Redis/SQS broker support and configurable queues - Notification tasks for incident fan-out, webhooks, and escalations - Pluggable TaskQueue abstraction with in-memory driver for testing Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana): - OpenTelemetry Collector for receiving OTLP traces and logs - Tempo for distributed tracing backend - Loki for log aggregation with Promtail DaemonSet - Prometheus for metrics scraping with RBAC configuration - Grafana with pre-provisioned datasources and API overview dashboard - Helm templates for all observability components Enhance application infrastructure: - Global exception handlers with structured ErrorResponse schema - Request logging middleware with timing metrics - Health check updated to verify task queue connectivity - Non-root user in Dockerfile for security - Init containers in Helm deployments for dependency ordering - Production Helm values with autoscaling and retention policies
105 lines
3.3 KiB
YAML
105 lines
3.3 KiB
YAML
apiVersion: apps/v1
|
|
kind: Deployment
|
|
metadata:
|
|
name: {{ include "incidentops.fullname" . }}-api
|
|
labels:
|
|
{{- include "incidentops.api.labels" . | nindent 4 }}
|
|
spec:
|
|
{{- if not .Values.api.autoscaling.enabled }}
|
|
replicas: {{ .Values.api.replicaCount }}
|
|
{{- end }}
|
|
selector:
|
|
matchLabels:
|
|
{{- include "incidentops.api.selectorLabels" . | nindent 6 }}
|
|
template:
|
|
metadata:
|
|
annotations:
|
|
checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
|
|
checksum/secret: {{ include (print $.Template.BasePath "/secret.yaml") . | sha256sum }}
|
|
{{- with .Values.api.podAnnotations }}
|
|
{{- toYaml . | nindent 8 }}
|
|
{{- end }}
|
|
labels:
|
|
{{- include "incidentops.api.selectorLabels" . | nindent 8 }}
|
|
spec:
|
|
{{- with .Values.global.imagePullSecrets }}
|
|
imagePullSecrets:
|
|
{{- toYaml . | nindent 8 }}
|
|
{{- end }}
|
|
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
|
|
securityContext:
|
|
{{- toYaml .Values.podSecurityContext | nindent 8 }}
|
|
initContainers:
|
|
- name: wait-for-postgres
|
|
image: busybox:1.36
|
|
command:
|
|
- sh
|
|
- -c
|
|
- |
|
|
until nc -z {{ include "incidentops.fullname" . }}-postgresql 5432; do
|
|
echo "Waiting for PostgreSQL..."
|
|
sleep 2
|
|
done
|
|
echo "PostgreSQL is ready"
|
|
- name: wait-for-redis
|
|
image: busybox:1.36
|
|
command:
|
|
- sh
|
|
- -c
|
|
- |
|
|
until nc -z {{ include "incidentops.fullname" . }}-redis 6379; do
|
|
echo "Waiting for Redis..."
|
|
sleep 2
|
|
done
|
|
echo "Redis is ready"
|
|
containers:
|
|
- name: api
|
|
securityContext:
|
|
{{- toYaml .Values.securityContext | nindent 12 }}
|
|
image: {{ include "incidentops.api.image" . }}
|
|
imagePullPolicy: {{ .Values.api.image.pullPolicy }}
|
|
ports:
|
|
- name: http
|
|
containerPort: 8000
|
|
protocol: TCP
|
|
{{- if .Values.metrics.enabled }}
|
|
- name: metrics
|
|
containerPort: {{ .Values.metrics.port }}
|
|
protocol: TCP
|
|
{{- end }}
|
|
envFrom:
|
|
- configMapRef:
|
|
name: {{ include "incidentops.fullname" . }}-config
|
|
- secretRef:
|
|
name: {{ include "incidentops.fullname" . }}-secret
|
|
livenessProbe:
|
|
httpGet:
|
|
path: /v1/healthz
|
|
port: http
|
|
initialDelaySeconds: 10
|
|
periodSeconds: 30
|
|
timeoutSeconds: 5
|
|
failureThreshold: 3
|
|
readinessProbe:
|
|
httpGet:
|
|
path: /v1/readyz
|
|
port: http
|
|
initialDelaySeconds: 5
|
|
periodSeconds: 10
|
|
timeoutSeconds: 5
|
|
failureThreshold: 3
|
|
resources:
|
|
{{- toYaml .Values.api.resources | nindent 12 }}
|
|
{{- with .Values.api.nodeSelector }}
|
|
nodeSelector:
|
|
{{- toYaml . | nindent 8 }}
|
|
{{- end }}
|
|
{{- with .Values.api.affinity }}
|
|
affinity:
|
|
{{- toYaml . | nindent 8 }}
|
|
{{- end }}
|
|
{{- with .Values.api.tolerations }}
|
|
tolerations:
|
|
{{- toYaml . | nindent 8 }}
|
|
{{- end }}
|