Add OpenTelemetry instrumentation with distributed tracing and metrics: - Structured JSON logging with trace context correlation - Auto-instrumentation for FastAPI, asyncpg, httpx, redis - OTLP exporter for traces and Prometheus metrics endpoint Implement Celery worker and notification task system: - Celery app with Redis/SQS broker support and configurable queues - Notification tasks for incident fan-out, webhooks, and escalations - Pluggable TaskQueue abstraction with in-memory driver for testing Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana): - OpenTelemetry Collector for receiving OTLP traces and logs - Tempo for distributed tracing backend - Loki for log aggregation with Promtail DaemonSet - Prometheus for metrics scraping with RBAC configuration - Grafana with pre-provisioned datasources and API overview dashboard - Helm templates for all observability components Enhance application infrastructure: - Global exception handlers with structured ErrorResponse schema - Request logging middleware with timing metrics - Health check updated to verify task queue connectivity - Non-root user in Dockerfile for security - Init containers in Helm deployments for dependency ordering - Production Helm values with autoscaling and retention policies
164 lines
5.1 KiB
YAML
164 lines
5.1 KiB
YAML
{{- /*
Prometheus server resources. Everything in this file is rendered only when
both .Values.observability.enabled and .Values.metrics.enabled are truthy.
*/ -}}
{{- if and .Values.observability.enabled .Values.metrics.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "incidentops.fullname" . }}-prometheus
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: prometheus
data:
  # Mounted by the Prometheus Deployment at /etc/prometheus/prometheus.yml.
  prometheus.yml: |
    global:
      scrape_interval: {{ .Values.observability.prometheus.scrapeInterval | default "15s" }}
      evaluation_interval: 15s

    scrape_configs:
      # Prometheus scraping itself.
      - job_name: "prometheus"
        static_configs:
          - targets: ["localhost:9090"]

      # Pods in the release namespace labelled
      # app.kubernetes.io/component=api, on the container port named "metrics".
      - job_name: "incidentops-api"
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                # Quoted so a namespace such as "null", "true" or "123" stays a string.
                - {{ .Release.Namespace | quote }}
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
            action: keep
            regex: api
          - source_labels: [__meta_kubernetes_pod_container_port_name]
            action: keep
            regex: metrics
          # Copy discovery metadata onto the scraped series.
          - source_labels: [__meta_kubernetes_namespace]
            target_label: namespace
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod
        metrics_path: /metrics
        # Job-level interval; takes precedence over global.scrape_interval.
        scrape_interval: 10s

      # Same discovery and relabelling as the API job, but for pods labelled
      # app.kubernetes.io/component=worker.
      - job_name: "incidentops-worker"
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - {{ .Release.Namespace | quote }}
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
            action: keep
            regex: worker
          - source_labels: [__meta_kubernetes_pod_container_port_name]
            action: keep
            regex: metrics
          - source_labels: [__meta_kubernetes_namespace]
            target_label: namespace
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod
        metrics_path: /metrics
        scrape_interval: 10s
---
# Single-replica Prometheus server. Configuration comes from the
# "<fullname>-prometheus" ConfigMap; TSDB data lives on a PVC when
# observability.prometheus.persistence.enabled is set, otherwise on an emptyDir.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "incidentops.fullname" . }}-prometheus
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: prometheus
spec:
  replicas: 1
  {{- if .Values.observability.prometheus.persistence.enabled }}
  # The data volume is ReadWriteOnce and Prometheus locks its TSDB directory,
  # so the old pod must be stopped before the replacement starts. The default
  # RollingUpdate strategy would start the new pod first and leave it stuck.
  strategy:
    type: Recreate
  {{- end }}
  selector:
    matchLabels:
      {{- include "incidentops.selectorLabels" . | nindent 6 }}
      app.kubernetes.io/component: prometheus
  template:
    metadata:
      labels:
        {{- include "incidentops.selectorLabels" . | nindent 8 }}
        app.kubernetes.io/component: prometheus
      annotations:
        # Hash of the Prometheus values subtree, so changing any value that
        # feeds the rendered config (e.g. scrapeInterval) restarts the pod.
        # NOTE(review): edits to the static ConfigMap text are not captured;
        # hashing the rendered ConfigMap would require it to live in its own
        # template file.
        checksum/config: {{ .Values.observability.prometheus | toJson | sha256sum | quote }}
    spec:
      serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
      # Run as the unprivileged uid/gid 65534 and make volumes group-writable
      # for it via fsGroup.
      securityContext:
        fsGroup: 65534
        runAsUser: 65534
        runAsNonRoot: true
      containers:
        - name: prometheus
          image: "{{ .Values.observability.prometheus.image.repository }}:{{ .Values.observability.prometheus.image.tag }}"
          imagePullPolicy: {{ .Values.observability.prometheus.image.pullPolicy }}
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
            - "--storage.tsdb.path=/prometheus"
            # Fall back to 15d when the value is unset, so the flag is never
            # rendered with an empty duration.
            - "--storage.tsdb.retention.time={{ .Values.observability.prometheus.retention | default "15d" }}"
            - "--web.enable-lifecycle"
          ports:
            - name: http
              containerPort: 9090
              protocol: TCP
          volumeMounts:
            - name: config
              mountPath: /etc/prometheus
            - name: data
              mountPath: /prometheus
          resources:
            {{- toYaml .Values.observability.prometheus.resources | nindent 12 }}
          readinessProbe:
            httpGet:
              path: /-/ready
              port: http
            initialDelaySeconds: 10
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: http
            initialDelaySeconds: 30
            periodSeconds: 30
      volumes:
        - name: config
          configMap:
            name: {{ include "incidentops.fullname" . }}-prometheus
        - name: data
          {{- if .Values.observability.prometheus.persistence.enabled }}
          persistentVolumeClaim:
            claimName: {{ include "incidentops.fullname" . }}-prometheus
          {{- else }}
          emptyDir: {}
          {{- end }}
---
# Cluster-internal Service in front of the Prometheus pod. Port 9090 forwards
# to the container port named "http".
apiVersion: v1
kind: Service
metadata:
  name: {{ include "incidentops.fullname" . }}-prometheus
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: prometheus
spec:
  type: ClusterIP
  # Selects pods carrying the chart's selector labels plus the prometheus
  # component label.
  selector:
    {{- include "incidentops.selectorLabels" . | nindent 4 }}
    app.kubernetes.io/component: prometheus
  ports:
    - name: http
      protocol: TCP
      port: 9090
      targetPort: http
{{- $persistence := .Values.observability.prometheus.persistence }}
{{- if $persistence.enabled }}
---
# Claim for the Prometheus data volume; rendered only when persistence is
# enabled. The name matches the claimName used by the Deployment's "data"
# volume.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ include "incidentops.fullname" . }}-prometheus
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: prometheus
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: {{ $persistence.size }}
{{- end }}
{{- /* Closes the file-level guard on observability.enabled and metrics.enabled. */}}
{{- end }}