feat: add observability stack and background task infrastructure
Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
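
A minimal sketch of the tracing bootstrap described above, assuming the standard OpenTelemetry Python SDK; the function name and wiring are illustrative, not the application's actual module layout:

# Hypothetical sketch: OTLP tracing + FastAPI auto-instrumentation.
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

def setup_tracing(app, otlp_endpoint: str = "http://localhost:4317") -> None:
    # Identify the service so traces, logs, and metrics correlate in Grafana.
    provider = TracerProvider(resource=Resource.create({"service.name": "incidentops-api"}))
    # Export spans over OTLP/gRPC to the collector; insecure matches OTEL_EXPORTER_OTLP_INSECURE below.
    provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True)))
    trace.set_tracer_provider(provider)
    FastAPIInstrumentor.instrument_app(app)  # auto-instrument incoming HTTP requests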
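The pluggable TaskQueue abstraction could look roughly like the sketch below; the protocol name, method signatures, and in-memory driver are assumptions for illustration, not the interface actually shipped in this commit:

# Hypothetical sketch of a pluggable task-queue interface with a test driver.
from typing import Any, Protocol

class TaskQueue(Protocol):
    def enqueue(self, task_name: str, payload: dict[str, Any]) -> None: ...
    def ping(self) -> bool: ...  # e.g. what a health check could call

class InMemoryTaskQueue:
    """Test driver: records tasks instead of dispatching them to Celery."""

    def __init__(self) -> None:
        self.tasks: list[tuple[str, dict[str, Any]]] = []

    def enqueue(self, task_name: str, payload: dict[str, Any]) -> None:
        self.tasks.append((task_name, payload))

    def ping(self) -> bool:
        return True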
@@ -29,6 +29,29 @@ spec:
      serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
      securityContext:
        {{- toYaml .Values.podSecurityContext | nindent 8 }}
      initContainers:
        - name: wait-for-postgres
          image: busybox:1.36
          command:
            - sh
            - -c
            - |
              until nc -z {{ include "incidentops.fullname" . }}-postgresql 5432; do
                echo "Waiting for PostgreSQL..."
                sleep 2
              done
              echo "PostgreSQL is ready"
        - name: wait-for-redis
          image: busybox:1.36
          command:
            - sh
            - -c
            - |
              until nc -z {{ include "incidentops.fullname" . }}-redis 6379; do
                echo "Waiting for Redis..."
                sleep 2
              done
              echo "Redis is ready"
      containers:
        - name: api
          securityContext:
@@ -39,6 +62,11 @@ spec:
            - name: http
              containerPort: 8000
              protocol: TCP
            {{- if .Values.metrics.enabled }}
            - name: metrics
              containerPort: {{ .Values.metrics.port }}
              protocol: TCP
            {{- end }}
          envFrom:
            - configMapRef:
                name: {{ include "incidentops.fullname" . }}-config

@@ -11,5 +11,11 @@ spec:
      targetPort: http
      protocol: TCP
      name: http
    {{- if .Values.metrics.enabled }}
    - port: {{ .Values.metrics.port }}
      targetPort: metrics
      protocol: TCP
      name: metrics
    {{- end }}
  selector:
    {{- include "incidentops.api.selectorLabels" . | nindent 4 }}

@@ -8,3 +8,16 @@ data:
  JWT_ALGORITHM: {{ .Values.config.jwtAlgorithm | quote }}
  ACCESS_TOKEN_EXPIRE_MINUTES: {{ .Values.config.accessTokenExpireMinutes | quote }}
  REFRESH_TOKEN_EXPIRE_DAYS: {{ .Values.config.refreshTokenExpireDays | quote }}
  # OpenTelemetry configuration
  OTEL_ENABLED: {{ .Values.observability.enabled | quote }}
  OTEL_SERVICE_NAME: "incidentops-api"
  OTEL_ENVIRONMENT: {{ .Values.config.environment | default "production" | quote }}
  {{- if .Values.observability.enabled }}
  OTEL_EXPORTER_OTLP_ENDPOINT: "http://{{ include "incidentops.fullname" . }}-otel-collector:4317"
  {{- end }}
  OTEL_EXPORTER_OTLP_INSECURE: "true"
  OTEL_LOG_LEVEL: {{ .Values.config.logLevel | default "INFO" | quote }}
  # Metrics configuration
  {{- if .Values.metrics.enabled }}
  PROMETHEUS_PORT: {{ .Values.metrics.port | quote }}
  {{- end }}
helm/incidentops/templates/grafana-deployment.yaml (new file, 387 lines)
@@ -0,0 +1,387 @@
|
||||
{{- if .Values.observability.enabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana-datasources
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
data:
|
||||
datasources.yaml: |
|
||||
apiVersion: 1
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
uid: prometheus
|
||||
url: http://{{ include "incidentops.fullname" . }}-prometheus:9090
|
||||
access: proxy
|
||||
isDefault: false
|
||||
jsonData:
|
||||
httpMethod: POST
|
||||
exemplarTraceIdDestinations:
|
||||
- name: trace_id
|
||||
datasourceUid: tempo
|
||||
|
||||
- name: Tempo
|
||||
type: tempo
|
||||
uid: tempo
|
||||
url: http://{{ include "incidentops.fullname" . }}-tempo:3200
|
||||
access: proxy
|
||||
isDefault: false
|
||||
jsonData:
|
||||
tracesToLogsV2:
|
||||
datasourceUid: loki
|
||||
spanStartTimeShift: '-1h'
|
||||
spanEndTimeShift: '1h'
|
||||
filterByTraceID: true
|
||||
filterBySpanID: true
|
||||
tracesToMetrics:
|
||||
datasourceUid: prometheus
|
||||
spanStartTimeShift: '-1h'
|
||||
spanEndTimeShift: '1h'
|
||||
serviceMap:
|
||||
datasourceUid: prometheus
|
||||
nodeGraph:
|
||||
enabled: true
|
||||
lokiSearch:
|
||||
datasourceUid: loki
|
||||
|
||||
- name: Loki
|
||||
type: loki
|
||||
uid: loki
|
||||
url: http://{{ include "incidentops.fullname" . }}-loki:3100
|
||||
access: proxy
|
||||
isDefault: true
|
||||
jsonData:
|
||||
derivedFields:
|
||||
- datasourceUid: tempo
|
||||
matcherRegex: '"trace_id":"([a-f0-9]+)"'
|
||||
name: TraceID
|
||||
url: '$${__value.raw}'
|
||||
urlDisplayLabel: 'View Trace'
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana-dashboards-provider
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
data:
|
||||
dashboards.yaml: |
|
||||
apiVersion: 1
|
||||
providers:
|
||||
- name: 'default'
|
||||
orgId: 1
|
||||
folder: 'IncidentOps'
|
||||
folderUid: 'incidentops'
|
||||
type: file
|
||||
disableDeletion: false
|
||||
editable: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana-dashboards
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
data:
|
||||
api-overview.json: |
|
||||
{
|
||||
"title": "IncidentOps API Overview",
|
||||
"uid": "incidentops-api",
|
||||
"tags": ["incidentops", "api"],
|
||||
"timezone": "browser",
|
||||
"editable": true,
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Request Rate",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
|
||||
"legendFormat": "Requests/sec",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Request Duration (p50, p95, p99)",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "histogram_quantile(0.50, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
|
||||
"legendFormat": "p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
|
||||
"legendFormat": "p95",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
|
||||
"legendFormat": "p99",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Error Rate",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\", http_status_code=~\"5..\"}[1m])) / sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m])) * 100",
|
||||
"legendFormat": "Error %",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Requests by Status Code",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "sum by (http_status_code) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
|
||||
"legendFormat": "{{ "{{" }}http_status_code{{ "}}" }}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Requests by Endpoint",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "sum by (http_route) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
|
||||
"legendFormat": "{{ "{{" }}http_route{{ "}}" }}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Recent Logs",
|
||||
"type": "logs",
|
||||
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 16},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "loki", "uid": "loki"},
|
||||
"expr": "{service_name=\"incidentops-api\"} | json",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"wrapLogMessage": true,
|
||||
"enableLogDetails": true,
|
||||
"sortOrder": "Descending"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "Recent Traces",
|
||||
"type": "traces",
|
||||
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 26},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "tempo", "uid": "tempo"},
|
||||
"queryType": "traceqlSearch",
|
||||
"filters": [
|
||||
{
|
||||
"id": "service-name",
|
||||
"operator": "=",
|
||||
"scope": "resource",
|
||||
"tag": "service.name",
|
||||
"value": ["incidentops-api"]
|
||||
}
|
||||
],
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"schemaVersion": 38,
|
||||
"version": 2
|
||||
}
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
annotations:
|
||||
checksum/datasources: {{ .Values.observability.grafana.image.tag | sha256sum }}
|
||||
spec:
|
||||
securityContext:
|
||||
fsGroup: 472
|
||||
runAsUser: 472
|
||||
containers:
|
||||
- name: grafana
|
||||
image: "{{ .Values.observability.grafana.image.repository }}:{{ .Values.observability.grafana.image.tag }}"
|
||||
imagePullPolicy: {{ .Values.observability.grafana.image.pullPolicy }}
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 3000
|
||||
protocol: TCP
|
||||
env:
|
||||
- name: GF_SECURITY_ADMIN_USER
|
||||
value: {{ .Values.observability.grafana.adminUser | quote }}
|
||||
- name: GF_SECURITY_ADMIN_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana
|
||||
key: admin-password
|
||||
- name: GF_USERS_ALLOW_SIGN_UP
|
||||
value: "false"
|
||||
- name: GF_EXPLORE_ENABLED
|
||||
value: "true"
|
||||
- name: GF_FEATURE_TOGGLES_ENABLE
|
||||
value: "traceqlEditor tempoSearch tempoBackendSearch tempoApmTable"
|
||||
volumeMounts:
|
||||
- name: datasources
|
||||
mountPath: /etc/grafana/provisioning/datasources
|
||||
- name: dashboards-provider
|
||||
mountPath: /etc/grafana/provisioning/dashboards
|
||||
- name: dashboards
|
||||
mountPath: /var/lib/grafana/dashboards
|
||||
- name: data
|
||||
mountPath: /var/lib/grafana
|
||||
resources:
|
||||
{{- toYaml .Values.observability.grafana.resources | nindent 12 }}
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /api/health
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /api/health
|
||||
port: http
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 30
|
||||
volumes:
|
||||
- name: datasources
|
||||
configMap:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana-datasources
|
||||
- name: dashboards-provider
|
||||
configMap:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana-dashboards-provider
|
||||
- name: dashboards
|
||||
configMap:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana-dashboards
|
||||
- name: data
|
||||
{{- if .Values.observability.grafana.persistence.enabled }}
|
||||
persistentVolumeClaim:
|
||||
claimName: {{ include "incidentops.fullname" . }}-grafana
|
||||
{{- else }}
|
||||
emptyDir: {}
|
||||
{{- end }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
type: Opaque
|
||||
data:
|
||||
admin-password: {{ .Values.observability.grafana.adminPassword | b64enc | quote }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
spec:
|
||||
type: {{ .Values.observability.grafana.service.type }}
|
||||
ports:
|
||||
- name: http
|
||||
port: 80
|
||||
targetPort: http
|
||||
protocol: TCP
|
||||
selector:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
{{- if .Values.observability.grafana.persistence.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: {{ .Values.observability.grafana.persistence.size }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
helm/incidentops/templates/grafana-ingress.yaml (new file, 38 lines)
@@ -0,0 +1,38 @@
|
||||
{{- if and .Values.observability.enabled .Values.observability.grafana.ingress.enabled -}}
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
{{- with .Values.observability.grafana.ingress.annotations }}
|
||||
annotations:
|
||||
{{- toYaml . | nindent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
{{- if .Values.ingress.className }}
|
||||
ingressClassName: {{ .Values.ingress.className }}
|
||||
{{- end }}
|
||||
{{- if .Values.observability.grafana.ingress.tls }}
|
||||
tls:
|
||||
{{- range .Values.observability.grafana.ingress.tls }}
|
||||
- hosts:
|
||||
{{- range .hosts }}
|
||||
- {{ . | quote }}
|
||||
{{- end }}
|
||||
secretName: {{ .secretName }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
rules:
|
||||
- host: {{ .Values.observability.grafana.ingress.host | quote }}
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana
|
||||
port:
|
||||
number: 80
|
||||
{{- end }}
|
||||
helm/incidentops/templates/loki-deployment.yaml (new file, 155 lines)
@@ -0,0 +1,155 @@
|
||||
{{- if .Values.observability.enabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-loki-config
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: loki
|
||||
data:
|
||||
loki.yaml: |
|
||||
auth_enabled: false
|
||||
|
||||
server:
|
||||
http_listen_port: 3100
|
||||
grpc_listen_port: 9096
|
||||
|
||||
common:
|
||||
path_prefix: /loki
|
||||
storage:
|
||||
filesystem:
|
||||
chunks_directory: /loki/chunks
|
||||
rules_directory: /loki/rules
|
||||
replication_factor: 1
|
||||
ring:
|
||||
kvstore:
|
||||
store: inmemory
|
||||
|
||||
query_range:
|
||||
results_cache:
|
||||
cache:
|
||||
embedded_cache:
|
||||
enabled: true
|
||||
max_size_mb: 100
|
||||
|
||||
schema_config:
|
||||
configs:
|
||||
- from: "2020-10-24"
|
||||
store: tsdb
|
||||
object_store: filesystem
|
||||
schema: v13
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
|
||||
ruler:
|
||||
alertmanager_url: http://localhost:9093
|
||||
|
||||
limits_config:
|
||||
retention_period: {{ .Values.observability.loki.retention }}
|
||||
allow_structured_metadata: true
|
||||
volume_enabled: true
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-loki
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: loki
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
||||
app.kubernetes.io/component: loki
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
||||
app.kubernetes.io/component: loki
|
||||
annotations:
|
||||
checksum/config: {{ .Values.observability.loki.image.tag | sha256sum }}
|
||||
spec:
|
||||
containers:
|
||||
- name: loki
|
||||
image: "{{ .Values.observability.loki.image.repository }}:{{ .Values.observability.loki.image.tag }}"
|
||||
imagePullPolicy: {{ .Values.observability.loki.image.pullPolicy }}
|
||||
args:
|
||||
- -config.file=/etc/loki/loki.yaml
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 3100
|
||||
protocol: TCP
|
||||
- name: grpc
|
||||
containerPort: 9096
|
||||
protocol: TCP
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/loki
|
||||
- name: data
|
||||
mountPath: /loki
|
||||
resources:
|
||||
{{- toYaml .Values.observability.loki.resources | nindent 12 }}
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /ready
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /ready
|
||||
port: http
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 30
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: {{ include "incidentops.fullname" . }}-loki-config
|
||||
- name: data
|
||||
{{- if .Values.observability.loki.persistence.enabled }}
|
||||
persistentVolumeClaim:
|
||||
claimName: {{ include "incidentops.fullname" . }}-loki
|
||||
{{- else }}
|
||||
emptyDir: {}
|
||||
{{- end }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-loki
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: loki
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- name: http
|
||||
port: 3100
|
||||
targetPort: http
|
||||
protocol: TCP
|
||||
- name: grpc
|
||||
port: 9096
|
||||
targetPort: grpc
|
||||
protocol: TCP
|
||||
selector:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: loki
|
||||
{{- if .Values.observability.loki.persistence.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-loki
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: loki
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: {{ .Values.observability.loki.persistence.size }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
@@ -30,9 +30,11 @@ spec:
        - name: migrate
          securityContext:
            {{- toYaml .Values.securityContext | nindent 12 }}
          image: {{ include "incidentops.api.image" . }}
          image: "{{ .Values.migration.image.repository }}:{{ .Values.migration.image.tag }}"
          imagePullPolicy: {{ .Values.migration.image.pullPolicy }}
          command:
            - uv
            - run
            - python
            - migrations/migrate.py
            - apply
helm/incidentops/templates/otel-collector-deployment.yaml (new file, 132 lines)
@@ -0,0 +1,132 @@
|
||||
{{- if .Values.observability.enabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-otel-collector-config
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: otel-collector
|
||||
data:
|
||||
otel-collector-config.yaml: |
|
||||
extensions:
|
||||
health_check:
|
||||
endpoint: 0.0.0.0:13133
|
||||
|
||||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
grpc:
|
||||
endpoint: 0.0.0.0:4317
|
||||
http:
|
||||
endpoint: 0.0.0.0:4318
|
||||
|
||||
processors:
|
||||
batch:
|
||||
timeout: 1s
|
||||
send_batch_size: 1024
|
||||
memory_limiter:
|
||||
check_interval: 1s
|
||||
limit_mib: 512
|
||||
spike_limit_mib: 128
|
||||
|
||||
exporters:
|
||||
otlp/tempo:
|
||||
endpoint: {{ include "incidentops.fullname" . }}-tempo:4317
|
||||
tls:
|
||||
insecure: true
|
||||
loki:
|
||||
endpoint: http://{{ include "incidentops.fullname" . }}-loki:3100/loki/api/v1/push
|
||||
default_labels_enabled:
|
||||
exporter: true
|
||||
job: true
|
||||
|
||||
service:
|
||||
extensions: [health_check]
|
||||
pipelines:
|
||||
traces:
|
||||
receivers: [otlp]
|
||||
processors: [memory_limiter, batch]
|
||||
exporters: [otlp/tempo]
|
||||
logs:
|
||||
receivers: [otlp]
|
||||
processors: [memory_limiter, batch]
|
||||
exporters: [loki]
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-otel-collector
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: otel-collector
|
||||
spec:
|
||||
replicas: {{ .Values.observability.otelCollector.replicaCount }}
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
||||
app.kubernetes.io/component: otel-collector
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
||||
app.kubernetes.io/component: otel-collector
|
||||
annotations:
|
||||
checksum/config: {{ .Values.observability.otelCollector.image.tag | sha256sum }}
|
||||
spec:
|
||||
containers:
|
||||
- name: otel-collector
|
||||
image: "{{ .Values.observability.otelCollector.image.repository }}:{{ .Values.observability.otelCollector.image.tag }}"
|
||||
imagePullPolicy: {{ .Values.observability.otelCollector.image.pullPolicy }}
|
||||
args:
|
||||
- --config=/etc/otel-collector/otel-collector-config.yaml
|
||||
ports:
|
||||
- name: otlp-grpc
|
||||
containerPort: 4317
|
||||
protocol: TCP
|
||||
- name: otlp-http
|
||||
containerPort: 4318
|
||||
protocol: TCP
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/otel-collector
|
||||
resources:
|
||||
{{- toYaml .Values.observability.otelCollector.resources | nindent 12 }}
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /
|
||||
port: 13133
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 30
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /
|
||||
port: 13133
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: {{ include "incidentops.fullname" . }}-otel-collector-config
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-otel-collector
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: otel-collector
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- name: otlp-grpc
|
||||
port: 4317
|
||||
targetPort: otlp-grpc
|
||||
protocol: TCP
|
||||
- name: otlp-http
|
||||
port: 4318
|
||||
targetPort: otlp-http
|
||||
protocol: TCP
|
||||
selector:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: otel-collector
|
||||
{{- end }}
|
||||
helm/incidentops/templates/prometheus-deployment.yaml (new file, 163 lines)
@@ -0,0 +1,163 @@
|
||||
{{- if and .Values.observability.enabled .Values.metrics.enabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: prometheus
|
||||
data:
|
||||
prometheus.yml: |
|
||||
global:
|
||||
scrape_interval: {{ .Values.observability.prometheus.scrapeInterval | default "15s" }}
|
||||
evaluation_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: "prometheus"
|
||||
static_configs:
|
||||
- targets: ["localhost:9090"]
|
||||
|
||||
- job_name: "incidentops-api"
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
namespaces:
|
||||
names:
|
||||
- {{ .Release.Namespace }}
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
|
||||
action: keep
|
||||
regex: api
|
||||
- source_labels: [__meta_kubernetes_pod_container_port_name]
|
||||
action: keep
|
||||
regex: metrics
|
||||
- source_labels: [__meta_kubernetes_namespace]
|
||||
target_label: namespace
|
||||
- source_labels: [__meta_kubernetes_pod_name]
|
||||
target_label: pod
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 10s
|
||||
|
||||
- job_name: "incidentops-worker"
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
namespaces:
|
||||
names:
|
||||
- {{ .Release.Namespace }}
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
|
||||
action: keep
|
||||
regex: worker
|
||||
- source_labels: [__meta_kubernetes_pod_container_port_name]
|
||||
action: keep
|
||||
regex: metrics
|
||||
- source_labels: [__meta_kubernetes_namespace]
|
||||
target_label: namespace
|
||||
- source_labels: [__meta_kubernetes_pod_name]
|
||||
target_label: pod
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 10s
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: prometheus
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
||||
app.kubernetes.io/component: prometheus
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
||||
app.kubernetes.io/component: prometheus
|
||||
annotations:
|
||||
checksum/config: {{ .Values.observability.prometheus.image.tag | sha256sum }}
|
||||
spec:
|
||||
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
|
||||
securityContext:
|
||||
fsGroup: 65534
|
||||
runAsUser: 65534
|
||||
runAsNonRoot: true
|
||||
containers:
|
||||
- name: prometheus
|
||||
image: "{{ .Values.observability.prometheus.image.repository }}:{{ .Values.observability.prometheus.image.tag }}"
|
||||
imagePullPolicy: {{ .Values.observability.prometheus.image.pullPolicy }}
|
||||
args:
|
||||
- "--config.file=/etc/prometheus/prometheus.yml"
|
||||
- "--storage.tsdb.path=/prometheus"
|
||||
- "--storage.tsdb.retention.time={{ .Values.observability.prometheus.retention }}"
|
||||
- "--web.enable-lifecycle"
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 9090
|
||||
protocol: TCP
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/prometheus
|
||||
- name: data
|
||||
mountPath: /prometheus
|
||||
resources:
|
||||
{{- toYaml .Values.observability.prometheus.resources | nindent 12 }}
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /-/ready
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /-/healthy
|
||||
port: http
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 30
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
||||
- name: data
|
||||
{{- if .Values.observability.prometheus.persistence.enabled }}
|
||||
persistentVolumeClaim:
|
||||
claimName: {{ include "incidentops.fullname" . }}-prometheus
|
||||
{{- else }}
|
||||
emptyDir: {}
|
||||
{{- end }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: prometheus
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- name: http
|
||||
port: 9090
|
||||
targetPort: http
|
||||
protocol: TCP
|
||||
selector:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: prometheus
|
||||
{{- if .Values.observability.prometheus.persistence.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: prometheus
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: {{ .Values.observability.prometheus.persistence.size }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
helm/incidentops/templates/prometheus-rbac.yaml (new file, 29 lines)
@@ -0,0 +1,29 @@
|
||||
{{- if and .Values.observability.enabled .Values.metrics.enabled }}
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: prometheus
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["pods", "endpoints", "services"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: prometheus
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: {{ include "incidentops.serviceAccountName" . }}
|
||||
namespace: {{ .Release.Namespace }}
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
||||
{{- end }}
|
||||
helm/incidentops/templates/promtail-daemonset.yaml (new file, 169 lines)
@@ -0,0 +1,169 @@
|
||||
{{- if and .Values.observability.enabled .Values.observability.promtail.enabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-promtail-config
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: promtail
|
||||
data:
|
||||
promtail.yaml: |
|
||||
server:
|
||||
http_listen_port: 3101
|
||||
grpc_listen_port: 0
|
||||
|
||||
positions:
|
||||
filename: /run/promtail/positions.yaml
|
||||
|
||||
clients:
|
||||
- url: http://{{ include "incidentops.fullname" . }}-loki:3100/loki/api/v1/push
|
||||
|
||||
scrape_configs:
|
||||
- job_name: kubernetes-pods
|
||||
pipeline_stages:
|
||||
- cri: {}
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
namespaces:
|
||||
names: [{{ .Release.Namespace }}]
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_pod_container_init]
|
||||
regex: "true"
|
||||
action: drop
|
||||
- source_labels: [__meta_kubernetes_pod_phase]
|
||||
regex: Pending|Failed|Succeeded
|
||||
action: drop
|
||||
- source_labels: [__meta_kubernetes_pod_name, __meta_kubernetes_pod_namespace, __meta_kubernetes_pod_container_name]
|
||||
target_label: __path__
|
||||
replacement: /var/log/containers/$1_$2_$3-*.log
|
||||
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
|
||||
regex: (.*)
|
||||
target_label: service_name
|
||||
replacement: {{ include "incidentops.fullname" . }}-$1
|
||||
- source_labels: [__meta_kubernetes_pod_namespace]
|
||||
target_label: namespace
|
||||
- source_labels: [__meta_kubernetes_pod_name]
|
||||
target_label: pod
|
||||
- source_labels: [__meta_kubernetes_pod_container_name]
|
||||
target_label: container
|
||||
- source_labels: [__meta_kubernetes_pod_uid]
|
||||
target_label: pod_uid
|
||||
- target_label: cluster
|
||||
replacement: {{ .Release.Namespace }}
|
||||
|
||||
- job_name: containers-fallback
|
||||
pipeline_stages:
|
||||
- cri: {}
|
||||
static_configs:
|
||||
- labels:
|
||||
job: containers
|
||||
namespace: {{ .Release.Namespace }}
|
||||
service_name: incidentops-api
|
||||
__path__: /var/log/containers/incidentops-api-*_incidentops_api-*.log
|
||||
- labels:
|
||||
job: containers
|
||||
namespace: {{ .Release.Namespace }}
|
||||
service_name: incidentops-worker
|
||||
__path__: /var/log/containers/incidentops-worker-*_incidentops_worker-*.log
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-promtail
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: promtail
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-promtail
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: promtail
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["pods", "pods/log", "namespaces", "services", "endpoints", "nodes"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-promtail
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: promtail
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: {{ include "incidentops.fullname" . }}-promtail
|
||||
namespace: {{ .Release.Namespace }}
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: {{ include "incidentops.fullname" . }}-promtail
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-promtail
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: promtail
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
||||
app.kubernetes.io/component: promtail
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
||||
app.kubernetes.io/component: promtail
|
||||
annotations:
|
||||
checksum/config: {{ .Values.observability.promtail.image.tag | sha256sum }}
|
||||
spec:
|
||||
serviceAccountName: {{ include "incidentops.fullname" . }}-promtail
|
||||
securityContext:
|
||||
runAsUser: 0
|
||||
containers:
|
||||
- name: promtail
|
||||
image: "{{ .Values.observability.promtail.image.repository }}:{{ .Values.observability.promtail.image.tag }}"
|
||||
imagePullPolicy: {{ .Values.observability.promtail.image.pullPolicy }}
|
||||
args:
|
||||
- -config.file=/etc/promtail/promtail.yaml
|
||||
ports:
|
||||
- name: http-metrics
|
||||
containerPort: 3101
|
||||
protocol: TCP
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/promtail
|
||||
- name: positions
|
||||
mountPath: /run/promtail
|
||||
- name: varlog
|
||||
mountPath: /var/log
|
||||
readOnly: true
|
||||
- name: varlogpods
|
||||
mountPath: /var/log/pods
|
||||
readOnly: true
|
||||
- name: varlogcontainers
|
||||
mountPath: /var/log/containers
|
||||
readOnly: true
|
||||
resources:
|
||||
{{- toYaml .Values.observability.promtail.resources | nindent 12 }}
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: {{ include "incidentops.fullname" . }}-promtail-config
|
||||
- name: positions
|
||||
emptyDir: {}
|
||||
- name: varlog
|
||||
hostPath:
|
||||
path: /var/log
|
||||
- name: varlogpods
|
||||
hostPath:
|
||||
path: /var/log/pods
|
||||
- name: varlogcontainers
|
||||
hostPath:
|
||||
path: /var/log/containers
|
||||
{{- end }}
|
||||
helm/incidentops/templates/tempo-deployment.yaml (new file, 153 lines)
@@ -0,0 +1,153 @@
|
||||
{{- if .Values.observability.enabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-tempo-config
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: tempo
|
||||
data:
|
||||
tempo.yaml: |
|
||||
server:
|
||||
http_listen_port: 3200
|
||||
|
||||
distributor:
|
||||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
grpc:
|
||||
endpoint: 0.0.0.0:4317
|
||||
http:
|
||||
endpoint: 0.0.0.0:4318
|
||||
|
||||
ingester:
|
||||
trace_idle_period: 10s
|
||||
max_block_bytes: 1048576
|
||||
max_block_duration: 5m
|
||||
|
||||
compactor:
|
||||
compaction:
|
||||
block_retention: {{ .Values.observability.tempo.retention }}
|
||||
|
||||
storage:
|
||||
trace:
|
||||
backend: local
|
||||
local:
|
||||
path: /var/tempo/traces
|
||||
wal:
|
||||
path: /var/tempo/wal
|
||||
|
||||
querier:
|
||||
search:
|
||||
query_timeout: 30s
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-tempo
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: tempo
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
||||
app.kubernetes.io/component: tempo
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
||||
app.kubernetes.io/component: tempo
|
||||
annotations:
|
||||
checksum/config: {{ .Values.observability.tempo.image.tag | sha256sum }}
|
||||
spec:
|
||||
containers:
|
||||
- name: tempo
|
||||
image: "{{ .Values.observability.tempo.image.repository }}:{{ .Values.observability.tempo.image.tag }}"
|
||||
imagePullPolicy: {{ .Values.observability.tempo.image.pullPolicy }}
|
||||
args:
|
||||
- -config.file=/etc/tempo/tempo.yaml
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 3200
|
||||
protocol: TCP
|
||||
- name: otlp-grpc
|
||||
containerPort: 4317
|
||||
protocol: TCP
|
||||
- name: otlp-http
|
||||
containerPort: 4318
|
||||
protocol: TCP
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/tempo
|
||||
- name: data
|
||||
mountPath: /var/tempo
|
||||
resources:
|
||||
{{- toYaml .Values.observability.tempo.resources | nindent 12 }}
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /ready
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /ready
|
||||
port: http
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 30
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: {{ include "incidentops.fullname" . }}-tempo-config
|
||||
- name: data
|
||||
{{- if .Values.observability.tempo.persistence.enabled }}
|
||||
persistentVolumeClaim:
|
||||
claimName: {{ include "incidentops.fullname" . }}-tempo
|
||||
{{- else }}
|
||||
emptyDir: {}
|
||||
{{- end }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-tempo
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: tempo
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- name: http
|
||||
port: 3200
|
||||
targetPort: http
|
||||
protocol: TCP
|
||||
- name: otlp-grpc
|
||||
port: 4317
|
||||
targetPort: otlp-grpc
|
||||
protocol: TCP
|
||||
- name: otlp-http
|
||||
port: 4318
|
||||
targetPort: otlp-http
|
||||
protocol: TCP
|
||||
selector:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: tempo
|
||||
{{- if .Values.observability.tempo.persistence.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-tempo
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: tempo
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: {{ .Values.observability.tempo.persistence.size }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
@@ -29,6 +29,29 @@ spec:
      serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
      securityContext:
        {{- toYaml .Values.podSecurityContext | nindent 8 }}
      initContainers:
        - name: wait-for-postgres
          image: busybox:1.36
          command:
            - sh
            - -c
            - |
              until nc -z {{ include "incidentops.fullname" . }}-postgresql 5432; do
                echo "Waiting for PostgreSQL..."
                sleep 2
              done
              echo "PostgreSQL is ready"
        - name: wait-for-redis
          image: busybox:1.36
          command:
            - sh
            - -c
            - |
              until nc -z {{ include "incidentops.fullname" . }}-redis 6379; do
                echo "Waiting for Redis..."
                sleep 2
              done
              echo "Redis is ready"
      containers:
        - name: worker
          securityContext:
@@ -36,6 +59,8 @@ spec:
          image: {{ include "incidentops.worker.image" . }}
          imagePullPolicy: {{ .Values.worker.image.pullPolicy }}
          command:
            - uv
            - run
            - celery
            - -A
            - worker.celery_app
@@ -52,6 +77,8 @@ spec:
          livenessProbe:
            exec:
              command:
                - uv
                - run
                - celery
                - -A
                - worker.celery_app
@@ -80,3 +80,63 @@ redis:
    limits:
      cpu: 1000m
      memory: 1Gi

# Application configuration
config:
  environment: production
  logLevel: INFO

# Observability Stack - Production settings
observability:
  enabled: true

  otelCollector:
    replicaCount: 2
    resources:
      requests:
        cpu: 100m
        memory: 256Mi
      limits:
        cpu: 500m
        memory: 512Mi

  tempo:
    retention: "720h" # 30 days
    persistence:
      enabled: true
      size: 50Gi
    resources:
      requests:
        cpu: 250m
        memory: 512Mi
      limits:
        cpu: 1000m
        memory: 2Gi

  loki:
    retention: "720h" # 30 days
    persistence:
      enabled: true
      size: 100Gi
    resources:
      requests:
        cpu: 250m
        memory: 512Mi
      limits:
        cpu: 1000m
        memory: 2Gi

  grafana:
    adminPassword: "" # Set via external secret in production
    service:
      type: ClusterIP
    persistence:
      enabled: true
      size: 5Gi
    resources:
      requests:
        cpu: 100m
        memory: 256Mi
      limits:
        cpu: 500m
        memory: 512Mi
@@ -106,6 +106,8 @@ config:
  jwtAlgorithm: HS256
  accessTokenExpireMinutes: 30
  refreshTokenExpireDays: 30
  environment: development
  logLevel: INFO

# Secrets (use external secrets in production)
secrets:
@@ -161,3 +163,117 @@ podSecurityContext:
securityContext:
  runAsNonRoot: true
  runAsUser: 1000

# Observability Stack (Grafana + Loki + Tempo + OpenTelemetry Collector)
observability:
  enabled: true

  otelCollector:
    replicaCount: 1
    image:
      repository: otel/opentelemetry-collector-contrib
      tag: "0.96.0"
      pullPolicy: IfNotPresent
    resources:
      requests:
        cpu: 50m
        memory: 128Mi
      limits:
        cpu: 200m
        memory: 256Mi

  tempo:
    image:
      repository: grafana/tempo
      tag: "2.4.1"
      pullPolicy: IfNotPresent
    retention: "168h" # 7 days
    persistence:
      enabled: false
      size: 10Gi
    resources:
      requests:
        cpu: 50m
        memory: 128Mi
      limits:
        cpu: 500m
        memory: 512Mi

  loki:
    image:
      repository: grafana/loki
      tag: "2.9.6"
      pullPolicy: IfNotPresent
    retention: "168h" # 7 days
    persistence:
      enabled: false
      size: 10Gi
    resources:
      requests:
        cpu: 50m
        memory: 128Mi
      limits:
        cpu: 500m
        memory: 512Mi

  prometheus:
    image:
      repository: prom/prometheus
      tag: "v2.51.0"
      pullPolicy: IfNotPresent
    retention: "15d"
    scrapeInterval: "15s"
    persistence:
      enabled: false
      size: 10Gi
    resources:
      requests:
        cpu: 50m
        memory: 128Mi
      limits:
        cpu: 500m
        memory: 512Mi

  grafana:
    image:
      repository: grafana/grafana
      tag: "10.4.1"
      pullPolicy: IfNotPresent
    adminUser: admin
    adminPassword: "admin" # Change in production!
    service:
      type: ClusterIP
    ingress:
      enabled: false
      host: grafana.incidentops.local
      annotations: {}
      tls: []
    persistence:
      enabled: false
      size: 1Gi
    resources:
      requests:
        cpu: 50m
        memory: 128Mi
      limits:
        cpu: 200m
        memory: 256Mi

  promtail:
    enabled: true
    image:
      repository: grafana/promtail
      tag: "2.9.6"
      pullPolicy: IfNotPresent
    resources:
      requests:
        cpu: 25m
        memory: 64Mi
      limits:
        cpu: 200m
        memory: 256Mi

# Metrics configuration
metrics:
  enabled: true
  port: 9464