feat: add observability stack and background task infrastructure
Add OpenTelemetry instrumentation with distributed tracing and metrics: - Structured JSON logging with trace context correlation - Auto-instrumentation for FastAPI, asyncpg, httpx, redis - OTLP exporter for traces and Prometheus metrics endpoint Implement Celery worker and notification task system: - Celery app with Redis/SQS broker support and configurable queues - Notification tasks for incident fan-out, webhooks, and escalations - Pluggable TaskQueue abstraction with in-memory driver for testing Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana): - OpenTelemetry Collector for receiving OTLP traces and logs - Tempo for distributed tracing backend - Loki for log aggregation with Promtail DaemonSet - Prometheus for metrics scraping with RBAC configuration - Grafana with pre-provisioned datasources and API overview dashboard - Helm templates for all observability components Enhance application infrastructure: - Global exception handlers with structured ErrorResponse schema - Request logging middleware with timing metrics - Health check updated to verify task queue connectivity - Non-root user in Dockerfile for security - Init containers in Helm deployments for dependency ordering - Production Helm values with autoscaling and retention policies
This commit is contained in:
294
observability/grafana/dashboards/api-overview.json
Normal file
294
observability/grafana/dashboards/api-overview.json
Normal file
@@ -0,0 +1,294 @@
|
||||
{
|
||||
"title": "IncidentOps API Overview",
|
||||
"uid": "incidentops-api",
|
||||
"tags": ["incidentops", "api"],
|
||||
"timezone": "browser",
|
||||
"editable": true,
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Request Rate",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
|
||||
"legendFormat": "Requests/sec",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"unit": "reqps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Request Duration (p50, p95, p99)",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "histogram_quantile(0.50, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
|
||||
"legendFormat": "p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
|
||||
"legendFormat": "p95",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
|
||||
"legendFormat": "p99",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"unit": "s"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Error Rate",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\", http_status_code=~\"5..\"}[1m])) / sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m])) * 100",
|
||||
"legendFormat": "Error %",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"fixedColor": "red", "mode": "fixed"},
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Requests by Status Code",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "sum by (http_status_code) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
|
||||
"legendFormat": "{{http_status_code}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"unit": "reqps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Requests by Endpoint",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "sum by (http_route) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
|
||||
"legendFormat": "{{http_route}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"unit": "reqps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "System CPU Usage",
|
||||
"type": "gauge",
|
||||
"gridPos": {"h": 6, "w": 6, "x": 0, "y": 16},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "avg(system_cpu_utilization{job=\"incidentops-api\"}) * 100",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 60},
|
||||
{"color": "red", "value": 80}
|
||||
]
|
||||
},
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "Memory Usage",
|
||||
"type": "gauge",
|
||||
"gridPos": {"h": 6, "w": 6, "x": 6, "y": 16},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "process_runtime_cpython_memory_bytes{job=\"incidentops-api\", type=\"rss\"} / 1024 / 1024",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 256},
|
||||
{"color": "red", "value": 512}
|
||||
]
|
||||
},
|
||||
"unit": "decmbytes"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"title": "Active Threads",
|
||||
"type": "stat",
|
||||
"gridPos": {"h": 6, "w": 6, "x": 12, "y": 16},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "process_runtime_cpython_thread_count{job=\"incidentops-api\"}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 50},
|
||||
{"color": "red", "value": 100}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"title": "GC Collections",
|
||||
"type": "stat",
|
||||
"gridPos": {"h": 6, "w": 6, "x": 18, "y": 16},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "sum(rate(process_runtime_cpython_gc_count{job=\"incidentops-api\"}[5m]))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null}
|
||||
]
|
||||
},
|
||||
"unit": "cps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"title": "Recent Logs",
|
||||
"type": "logs",
|
||||
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 22},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "loki", "uid": "loki"},
|
||||
"expr": "{service_name=\"incidentops-api\"} | json",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"wrapLogMessage": true,
|
||||
"enableLogDetails": true,
|
||||
"sortOrder": "Descending"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"title": "Error Logs",
|
||||
"type": "logs",
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 32},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "loki", "uid": "loki"},
|
||||
"expr": "{service_name=\"incidentops-api\"} |= \"ERROR\" | json",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"wrapLogMessage": true,
|
||||
"enableLogDetails": true,
|
||||
"sortOrder": "Descending"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"title": "Recent Traces",
|
||||
"type": "traces",
|
||||
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 40},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "tempo", "uid": "tempo"},
|
||||
"queryType": "traceqlSearch",
|
||||
"filters": [
|
||||
{
|
||||
"id": "service-name",
|
||||
"operator": "=",
|
||||
"scope": "resource",
|
||||
"tag": "service.name",
|
||||
"value": ["incidentops-api"]
|
||||
}
|
||||
],
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"schemaVersion": 38,
|
||||
"version": 2
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'default'
|
||||
orgId: 1
|
||||
folder: 'IncidentOps'
|
||||
folderUid: 'incidentops'
|
||||
type: file
|
||||
disableDeletion: false
|
||||
editable: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
@@ -0,0 +1,48 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
uid: prometheus
|
||||
url: http://prometheus:9090
|
||||
access: proxy
|
||||
isDefault: false
|
||||
jsonData:
|
||||
httpMethod: POST
|
||||
exemplarTraceIdDestinations:
|
||||
- name: trace_id
|
||||
datasourceUid: tempo
|
||||
|
||||
- name: Tempo
|
||||
type: tempo
|
||||
uid: tempo
|
||||
url: http://tempo:3200
|
||||
access: proxy
|
||||
isDefault: false
|
||||
jsonData:
|
||||
tracesToLogsV2:
|
||||
datasourceUid: loki
|
||||
spanStartTimeShift: '-1h'
|
||||
spanEndTimeShift: '1h'
|
||||
filterByTraceID: true
|
||||
filterBySpanID: true
|
||||
tracesToMetrics:
|
||||
datasourceUid: prometheus
|
||||
nodeGraph:
|
||||
enabled: true
|
||||
lokiSearch:
|
||||
datasourceUid: loki
|
||||
|
||||
- name: Loki
|
||||
type: loki
|
||||
uid: loki
|
||||
url: http://loki:3100
|
||||
access: proxy
|
||||
isDefault: true
|
||||
jsonData:
|
||||
derivedFields:
|
||||
- datasourceUid: tempo
|
||||
matcherRegex: '"trace_id":"([a-f0-9]+)"'
|
||||
name: TraceID
|
||||
url: '$${__value.raw}'
|
||||
urlDisplayLabel: 'View Trace'
|
||||
41
observability/loki/config.yaml
Normal file
41
observability/loki/config.yaml
Normal file
@@ -0,0 +1,41 @@
|
||||
auth_enabled: false
|
||||
|
||||
server:
|
||||
http_listen_port: 3100
|
||||
grpc_listen_port: 9096
|
||||
|
||||
common:
|
||||
path_prefix: /loki
|
||||
storage:
|
||||
filesystem:
|
||||
chunks_directory: /loki/chunks
|
||||
rules_directory: /loki/rules
|
||||
replication_factor: 1
|
||||
ring:
|
||||
kvstore:
|
||||
store: inmemory
|
||||
|
||||
query_range:
|
||||
results_cache:
|
||||
cache:
|
||||
embedded_cache:
|
||||
enabled: true
|
||||
max_size_mb: 100
|
||||
|
||||
schema_config:
|
||||
configs:
|
||||
- from: "2020-10-24"
|
||||
store: tsdb
|
||||
object_store: filesystem
|
||||
schema: v13
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
|
||||
ruler:
|
||||
alertmanager_url: http://localhost:9093
|
||||
|
||||
limits_config:
|
||||
retention_period: 168h # 7 days
|
||||
allow_structured_metadata: true
|
||||
volume_enabled: true
|
||||
38
observability/otel-collector/config.yaml
Normal file
38
observability/otel-collector/config.yaml
Normal file
@@ -0,0 +1,38 @@
|
||||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
grpc:
|
||||
endpoint: 0.0.0.0:4317
|
||||
http:
|
||||
endpoint: 0.0.0.0:4318
|
||||
|
||||
processors:
|
||||
batch:
|
||||
timeout: 1s
|
||||
send_batch_size: 1024
|
||||
memory_limiter:
|
||||
check_interval: 1s
|
||||
limit_mib: 256
|
||||
spike_limit_mib: 64
|
||||
|
||||
exporters:
|
||||
otlp/tempo:
|
||||
endpoint: tempo:4317
|
||||
tls:
|
||||
insecure: true
|
||||
loki:
|
||||
endpoint: http://loki:3100/loki/api/v1/push
|
||||
default_labels_enabled:
|
||||
exporter: true
|
||||
job: true
|
||||
|
||||
service:
|
||||
pipelines:
|
||||
traces:
|
||||
receivers: [otlp]
|
||||
processors: [memory_limiter, batch]
|
||||
exporters: [otlp/tempo]
|
||||
logs:
|
||||
receivers: [otlp]
|
||||
processors: [memory_limiter, batch]
|
||||
exporters: [loki]
|
||||
23
observability/prometheus/prometheus.yml
Normal file
23
observability/prometheus/prometheus.yml
Normal file
@@ -0,0 +1,23 @@
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
# Scrape Prometheus itself
|
||||
- job_name: "prometheus"
|
||||
static_configs:
|
||||
- targets: ["localhost:9090"]
|
||||
|
||||
# Scrape IncidentOps API metrics
|
||||
- job_name: "incidentops-api"
|
||||
static_configs:
|
||||
- targets: ["api:9464"]
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 10s
|
||||
|
||||
# Scrape IncidentOps Worker metrics (when metrics are enabled)
|
||||
- job_name: "incidentops-worker"
|
||||
static_configs:
|
||||
- targets: ["worker:9464"]
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 10s
|
||||
32
observability/tempo/config.yaml
Normal file
32
observability/tempo/config.yaml
Normal file
@@ -0,0 +1,32 @@
|
||||
server:
|
||||
http_listen_port: 3200
|
||||
|
||||
distributor:
|
||||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
grpc:
|
||||
endpoint: 0.0.0.0:4317
|
||||
http:
|
||||
endpoint: 0.0.0.0:4318
|
||||
|
||||
ingester:
|
||||
trace_idle_period: 10s
|
||||
max_block_bytes: 1048576
|
||||
max_block_duration: 5m
|
||||
|
||||
compactor:
|
||||
compaction:
|
||||
block_retention: 168h # 7 days
|
||||
|
||||
storage:
|
||||
trace:
|
||||
backend: local
|
||||
local:
|
||||
path: /var/tempo/traces
|
||||
wal:
|
||||
path: /var/tempo/wal
|
||||
|
||||
querier:
|
||||
search:
|
||||
query_timeout: 30s
|
||||
Reference in New Issue
Block a user