Files
awoooi/apps/api/alert_rules.yaml

825 lines
36 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# AWOOOI OpenClaw 告警規則匹配引擎
# ============================================================
# 格式說明:
# match.alertname : Prometheus alertname 完全匹配 (list = OR)
# match.alert_type : alert_type 關鍵字 (list = OR, 部分匹配)
# match.message : message 關鍵字 (list = OR, 部分匹配, 不分大小寫)
# response.* : 回應模板,支援變數 {target} {host} {container} {instance} {job} {namespace}
# responsibility : FE / BE / INFRA / DB / COLLAB
# risk : low / medium / critical
# confidence : 0.0 (規則匹配固定值,禁止偽造)
#
# 修改規則: 不需要重新部署,重啟 API Pod 即可熱載入
# 新增規則: 在 rules 清單末尾加入,priority 越小越優先
# 2026-04-09 ogt: 初版,從 openclaw.py _generate_mock_response 抽出
# ============================================================
# Version string of this rule file.
version: "1.0.0"
# Date stamp of the latest rule change. Change notes further down in this file
# are dated up to 2026-05-02, so the stamp is aligned with the newest of them.
updated_at: "2026-05-02"
# Rule list. Per the header notes, a smaller priority number wins.
rules:
# ── Docker / host layer ─────────────────────────────────────
# Docker container whose healthcheck fails: print the health status over SSH
# and restart the container.
- id: docker_container_unhealthy
  priority: 10
  description: 'Docker 容器 healthcheck 失敗'
  match:
    alertname: [DockerContainerUnhealthy]
    message:
    - 'unhealthy'
    - 'health check'
    - 'healthcheck'
  response:
    action_title: '檢查 Docker 容器 {container} 健康狀態'
    description: '⚙️ 規則匹配: Docker 容器 {container} ({host}) healthcheck 失敗。常見原因: 應用程式啟動慢、healthcheck 指令錯誤、依賴服務未就緒。'
    suggested_action: RESTART_DEPLOYMENT
    # Folded scalar so the embedded shell quotes need no YAML escaping.
    kubectl_command: >-
      ssh {host} 'docker inspect {container} --format="{{.State.Health.Status}}" && docker restart {container}'
    estimated_downtime: '~30s'
    risk: medium
    responsibility: INFRA
    responsibility_reasoning: 'Docker 容器健康檢查失敗屬基礎設施團隊責任,需確認 healthcheck 設定與容器狀態'
    secondary_teams:
    - BE
    optimization:
    - type: HEALTHCHECK
      description: '確認 healthcheck 指令在容器內可執行 (mc/curl 是否存在)'
      # NOTE(review): this probe targets the MinIO health endpoint on port 9000;
      # confirm it is meaningful for other containers matched by this rule.
      command: >-
        ssh {host} 'docker exec {container} sh -c "mc ready local 2>/dev/null || curl -sf http://localhost:9000/minio/health/live"'
    reasoning: '[規則匹配] Docker healthcheck 失敗先 restart 恢復服務,同時確認 healthcheck 指令正確。'
# Prometheus scrape target is down: restart the exporter on the host over SSH
# (docker container first, then the systemd unit names as fallbacks).
- id: target_down
  priority: 20
  description: Prometheus scrape target 下線 — 自動重啟 exporter
  match:
    alertname:
    - TargetDown
    - InstanceDown
    - NodeExporterDown
  response:
    action_title: "重啟 {job} exporter on {host}"
    description: "⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。自動重啟主機上的 exporter container。"
    suggested_action: RESTART_DEPLOYMENT
    kubectl_command: "ssh {host} 'docker restart $(docker ps -a --filter name=exporter --format \"{{.Names}}\" | head -1) 2>/dev/null || systemctl restart node_exporter 2>/dev/null || systemctl restart prometheus-node-exporter'"
    estimated_downtime: "~30s"
    risk: medium
    responsibility: INFRA
    responsibility_reasoning: "Prometheus scrape 目標下線屬基礎設施監控範疇,自動重啟 exporter"
    secondary_teams: []
    optimization:
    - type: MONITORING
      description: "確認 exporter 重啟後可被 Prometheus scrape"
      # The previous command used {port}, which is not among the template
      # variables listed in the file header ({target} {host} {container}
      # {instance} {job} {namespace}) and would be left unexpanded.
      # Assumes {instance} carries the scrape address (host:port) — confirm.
      command: "ssh {host} 'curl -s http://{instance}/metrics | head -3'"
    reasoning: "[規則匹配] Prometheus target 下線,SSH 到主機重啟 exporter container 或 systemd service。"
# ── K8s Pod layer ───────────────────────────────────────────
# Pod killed for exceeding its memory limit: delete the pod and recommend
# resource tuning.
- id: oom_killed
  priority: 30
  description: Pod OOMKilled 記憶體不足
  match:
    # 2026-04-10 Claude Sonnet 4.6: Phase 2 flywheel fix — added Prometheus alertname variants.
    # NOTE(review): KubernetesMemoryPressure / NodeMemoryUsageHigh read like
    # node-level alerts yet map to DELETE_POD here; confirm this is intended
    # given the "host-level alerts must never trigger kubectl restarts" principle
    # stated on the host_resource_alert rule.
    alertname:
    - PodOOMKilled
    - KubePodOOMKilled
    - KubernetesMemoryPressure
    - NodeMemoryUsageHigh
    - HighMemoryUsage
    alert_type:
    - memory
    message:
    - oomkilled
    - oom
    - out of memory
  response:
    action_title: "刪除異常 Pod {target} (OOMKilled)"
    description: "⚙️ 規則匹配: {target} 發生 OOMKilled,根因為 JVM Heap 配置與 K8s memory limit 不匹配或存在記憶體洩漏。"
    suggested_action: DELETE_POD
    kubectl_command: "kubectl delete pod {target} -n {namespace}"
    estimated_downtime: "~30s"
    risk: critical
    responsibility: BE
    responsibility_reasoning: "OOMKilled 通常源於應用程式記憶體配置不當,屬後端團隊責任範圍"
    secondary_teams: [INFRA]
    optimization:
    - type: RESOURCE_LIMIT
      description: "調整 memory limit 至 1Gi 並確保 JVM -Xmx 不超過 70%"
      command: "kubectl set resources deployment/{target} -c {target} --limits=memory=1Gi -n {namespace}"
    - type: HPA
      description: "啟用基於記憶體的 HPA 自動擴展"
      # `kubectl autoscale` has no --memory-percent flag, so the previous
      # command could not run. A memory-based HPA needs an autoscaling/v2
      # manifest; the hint follows the "# ..." convention other rules use
      # for non-runnable commands.
      command: "# kubectl autoscale 無 --memory-percent 參數;請套用 autoscaling/v2 HorizontalPodAutoscaler (memory averageUtilization=80, minReplicas=2, maxReplicas=5)"
    reasoning: "[規則匹配] Pod OOMKilled 後 ReplicaSet 將自動重建,但需同步修正資源配置防止復發。"
# 2026-04-12 ogt: standalone rule for host CPU alerts — node_exporter alerts carry no pod/deployment label.
# 2026-04-16 ogt + Claude Sonnet 4.6: added the common host-level Prometheus alertnames.
# Principle: host-level alerts may only notify and suggest SSH investigation; kubectl restart is strictly forbidden.
- id: host_resource_alert
  priority: 45
  description: Host 主機資源告警 (node_exporter — CPU/記憶體/負載/磁碟增長,非 K8s workload)
  match:
    alertname:
    # CPU
    - HostHighCpuLoad
    - NodeCPUUsageHigh
    - NodeHighCpuLoad
    # Load average
    - HostHighLoadAverage
    - NodeLoadAverageHigh
    - HostLoadAverageHigh
    # Memory
    - HostOutOfMemory
    - HostMemoryUnderMemoryPressure
    - HostMemoryUsageHigh
    - NodeMemoryPressure
    # Disk I/O and capacity
    - HostUnusualDiskReadLatency
    - HostUnusualDiskWriteLatency
    - HostUnusualDiskReadRate
    - HostUnusualDiskWriteRate
    - HostDiskWillFillIn24Hours
    - HostOutOfDiskSpace
    - HostDiskUsageHigh
    - HostDiskUsageCritical
    # Network
    - HostUnusualNetworkThroughputIn
    - HostUnusualNetworkThroughputOut
    # System services
    - HostSystemdServiceCrashed
    - HostKernelVersionDeviations
    - HostOomKillDetected
    - HostEdacCorrectableErrors
    - HostEdacUncorrectableErrors
    - HostClockSkewDetected
    - HostClockNotSynchronising
  response:
    action_title: "🔍 主機自動診斷 — SSH 收集根因"
    description: "主機層告警(node_exporter)。自動 SSH 登入主機執行診斷指令,收集 CPU/記憶體/磁碟資訊後回報。"
    # 2026-04-27 Claude Sonnet 4.6: changed from NO_ACTION to automatic SSH diagnosis.
    # Root cause: an empty SSH_MCP_ALLOWED_HOSTS downgraded everything to manual review (flywheel fully stalled).
    # Fix: fill in the SSH_MCP_ALLOWED_HOSTS allowlist and switch to an automatic diagnostic command (collects, never modifies).
    # Diagnostic principle: information gathering only → risk=low and not in the _DESTRUCTIVE_PATTERNS list.
    suggested_action: SSH_DIAGNOSE
    kubectl_command: "ssh {host} 'echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -15; echo \"=== MEMORY ===\"; free -h; echo \"=== DISK ===\"; df -h; echo \"=== LOAD ===\"; uptime'"
    estimated_downtime: "N/A"
    risk: low
    responsibility: INFRA
    # The three keys below were absent from this rule although other rules in
    # the file carry them; explicit values keep the response shape uniform.
    responsibility_reasoning: "主機層資源告警屬基礎設施團隊責任,僅執行只讀診斷"
    secondary_teams: []
    optimization: []
    reasoning: "[規則匹配] 主機層資源告警,自動 SSH 執行診斷指令(只讀,不修改),收集根因資訊後推送 Telegram 讓 SRE 決策。"
# K8s workload CPU pressure: scale the deployment out and recommend resource tuning.
- id: high_cpu
  priority: 40
  description: K8s Pod/Deployment CPU 使用率過高
  match:
    # 2026-04-10 Claude Sonnet 4.6: Phase 2 flywheel fix — added Prometheus alertname variants.
    # 2026-04-12 ogt: HostHighCpuLoad/NodeCPUUsageHigh removed from here; they
    # are handled by the host_resource_alert rule.
    alertname:
    - HighCPUUsage
    - ContainerCpuUsageSecondsTotal
    - CPUThrottlingHigh
    - KubeCPUOvercommit
    alert_type:
    - cpu
    - high_cpu
  response:
    action_title: "擴展 {target} 副本數 + 啟用 HPA"
    description: "⚙️ 規則匹配: {target} CPU 使用率過高,根因為流量突增或計算密集任務未配置自動擴展。"
    suggested_action: SCALE_DEPLOYMENT
    # NOTE(review): a fixed --replicas=3 would scale *down* a deployment that
    # already runs more than three replicas — confirm callers guard against that.
    kubectl_command: "kubectl scale deployment {target} --replicas=3 -n {namespace}"
    estimated_downtime: "0"
    risk: medium
    responsibility: INFRA
    responsibility_reasoning: "自動擴展策略未配置或閾值過高,屬基礎設施團隊責任"
    secondary_teams: [BE]
    optimization:
    - type: RESOURCE_LIMIT
      # The command sets requests (500m) below limits (2000m), which does not
      # give the Guaranteed QoS class the old text promised; the text now
      # describes what the command actually does.
      description: "增加 CPU request 與 limit,降低 CPU 節流風險"
      command: "kubectl set resources deployment/{target} --requests=cpu=500m --limits=cpu=2000m -n {namespace}"
    reasoning: "[規則匹配] 水平擴展可即時分散負載,同時建議配置 HPA 防止復發。"
# Elevated HTTP 5xx rate: rolling-restart the deployment and involve all teams.
- id: http_5xx
  priority: 50
  description: 'HTTP 5xx 錯誤率過高'
  match:
    alert_type: [http]
    # Quoted so the status codes stay strings rather than integers.
    message: ['5xx', '502', '503', '500']
  response:
    action_title: '重啟 {target} + 檢查上游服務'
    description: '⚙️ 規則匹配: {target} 產生 HTTP 5xx 錯誤,可能為應用程式例外或上游服務不可達。'
    suggested_action: RESTART_DEPLOYMENT
    kubectl_command: 'kubectl rollout restart deployment/{target} -n {namespace}'
    estimated_downtime: '~1 min'
    risk: critical
    responsibility: COLLAB
    responsibility_reasoning: 'HTTP 5xx 可能源於前端路由、後端邏輯或基礎設施,需多團隊協同排查'
    secondary_teams:
    - FE
    - BE
    - INFRA
    optimization:
    - type: CIRCUIT_BREAKER
      description: '配置熔斷器防止故障擴散'
      # Placeholder text, not a runnable command.
      command: '# Istio VirtualService outlierDetection 配置'
    reasoning: '[規則匹配] HTTP 錯誤需協同排查,先重啟恢復服務同時通知相關團隊。'
# Pod in CrashLoopBackOff: no automatic action, surface the previous container log.
- id: pod_crash
  priority: 60
  description: Pod CrashLoopBackOff
  match:
    # 2026-04-10 Claude Sonnet 4.6: Phase 2 flywheel fix — added Prometheus alertname variants.
    alertname:
    - KubePodCrashLooping
    - PodCrashLoopBackOff
    - KubernetesPodCrashLooping
    alert_type:
    - pod_crash
    - crash
    message:
    - crashloop
    - crash
    - backoff
  response:
    action_title: "診斷 {target} CrashLoop 根因"
    # Clause separator restored; the text previously ran
    # "CrashLoopBackOff需檢查" together.
    description: "⚙️ 規則匹配: {target} 進入 CrashLoopBackOff,需檢查啟動錯誤日誌。"
    suggested_action: NO_ACTION
    kubectl_command: "kubectl logs {target} -n {namespace} --previous --tail=50"
    estimated_downtime: "依根因而定"
    risk: critical
    responsibility: BE
    responsibility_reasoning: "Pod crash 通常源於應用程式啟動錯誤,屬後端團隊責任"
    secondary_teams: [INFRA]
    optimization:
    - type: LIVENESS_PROBE
      description: "調整 liveness probe 初始延遲防止誤殺"
      # Placeholder text, not a runnable command.
      command: "# 調整 initialDelaySeconds >= 應用啟動時間"
    reasoning: "[規則匹配] 先查 previous log 確認 crash 原因,再決定修復策略。"
# ── Database layer ──────────────────────────────────────────
# 2026-04-16 ogt + Claude Sonnet 4.6: PostgreSQL monitoring alerts (disk/resource class) — the database must never be restarted for these.
# Root cause: PostgreSQLDiskGrowthRate fell through to generic_fallback, which emitted "kubectl rollout restart postgresql" (wrong).
- id: postgresql_disk_monitoring
  priority: 68
  description: PostgreSQL 磁碟/增長率/exporter 監控告警(不重啟資料庫)
  match:
    alertname:
    - PostgreSQLDiskGrowthRate
    - PostgreSQLDiskUsageHigh
    - PostgreSQLDiskFull
    - PostgresExporterDown
    - PostgreSQLExporterDown
    - PostgreSQLTableBloat
    - PostgreSQLVacuumRequired
    - PostgreSQLReplicationLag
    - PostgreSQLTooManyConnections
  response:
    action_title: "⚠️ PostgreSQL 監控告警 — 需人工排查,禁止重啟"
    description: "⚠️ PostgreSQL 資源/監控告警。磁碟增長過快或 exporter 異常,重啟資料庫會造成資料風險。請登入排查磁碟用量或 WAL 狀態。"
    suggested_action: NO_ACTION
    # Read-only query: reports the size of the current database.
    kubectl_command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SELECT pg_database_size(current_database()), pg_size_pretty(pg_database_size(current_database()));'"
    estimated_downtime: "N/A"
    risk: medium
    responsibility: DB
    responsibility_reasoning: "PostgreSQL 磁碟告警需 DBA 評估,自動重啟資料庫有資料丟失風險,必須人工確認"
    secondary_teams: [INFRA]
    # Explicit empty list; other rules in the file carry this key.
    optimization: []
    reasoning: "[規則匹配] PostgreSQL 磁碟增長/監控告警,絕對禁止自動重啟資料庫。需 DBA 人工確認磁碟用量、WAL 清理、VACUUM 狀態。"
# PostgreSQL unreachable: rolling-restart the postgresql deployment.
- id: postgresql_down
  priority: 70
  description: 'PostgreSQL 服務下線'
  match:
    alertname: [PostgreSQLDown]
    # NOTE(review): per the header, message keywords are partial matches;
    # 'postgresql' / 'postgres' may also hit non-outage PostgreSQL alerts and
    # route them to a database restart — confirm how the engine combines
    # alertname and message.
    message:
    - 'postgresql'
    - 'postgres'
    - 'pg down'
  response:
    action_title: '重啟 PostgreSQL {target}'
    description: '⚙️ 規則匹配: PostgreSQL ({instance}) 無法連線。常見原因: 程序崩潰、磁碟空間不足、連線數超限。'
    suggested_action: RESTART_DEPLOYMENT
    kubectl_command: 'kubectl rollout restart deployment/postgresql -n {namespace}'
    estimated_downtime: '~2 min'
    risk: critical
    responsibility: DB
    responsibility_reasoning: 'PostgreSQL 下線屬資料庫團隊責任,需立即確認資料完整性'
    secondary_teams:
    - INFRA
    - BE
    optimization:
    - type: HEALTH_CHECK
      description: '確認 PostgreSQL 連線與資料完整性'
      command: >-
        kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SELECT 1'
    reasoning: '[規則匹配] PostgreSQL 下線影響所有依賴服務,優先重啟恢復,同時確認資料無損。'
# PostgreSQL connection pool near or at its limit: no automatic action; the
# suggested command terminates backends idle for more than five minutes.
- id: postgresql_connection_pool
  priority: 75
  description: PostgreSQL 連線池耗盡或接近上限
  match:
    alertname:
    - PostgreSQLConnectionPoolNearLimit
    - PostgreSQLConnectionPoolExhausted
    message:
    - connection pool
    - connections
    - pgbouncer
  response:
    action_title: "清理 PostgreSQL 閒置連線"
    description: "⚙️ 規則匹配: PostgreSQL 連線池使用率過高,可能導致新請求被拒絕。"
    suggested_action: NO_ACTION
    # The SQL is wrapped in shell double quotes so its string literals keep
    # their single quotes. The earlier form used ''idle'' inside a shell
    # single-quoted argument; a POSIX shell joins those segments and drops the
    # quotes, so psql received `state = idle` / `INTERVAL 5 minutes`, which is
    # invalid SQL.
    kubectl_command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c \"SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE state = 'idle' AND state_change < NOW() - INTERVAL '5 minutes';\""
    estimated_downtime: "0"
    risk: critical
    responsibility: DB
    responsibility_reasoning: "連線池管理屬資料庫設定範疇"
    secondary_teams: [BE]
    optimization:
    - type: CONNECTION_POOL
      description: "調整 max_connections 或啟用 PgBouncer 連線池"
      command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SHOW max_connections;'"
    reasoning: "[規則匹配] 清理閒置連線是最快恢復手段,同時需排查連線洩漏。"
# PostgreSQL slow queries / lock waits: no automatic action; list the
# non-idle sessions for diagnosis.
- id: postgresql_slow_queries
  priority: 80
  description: PostgreSQL 慢查詢告警
  match:
    alertname:
    - PostgreSQLSlowQueries
    - PostgreSQLLockWaiting
    message:
    - slow query
    - lock wait
    - deadlock
  response:
    action_title: "診斷 PostgreSQL 慢查詢 + 索引優化"
    description: "⚙️ 規則匹配: PostgreSQL 存在慢查詢或鎖等待,影響系統整體性能。"
    suggested_action: NO_ACTION
    # The SQL is wrapped in shell double quotes so the 'idle' literal keeps its
    # quotes. The earlier ''idle'' inside a shell single-quoted argument
    # collapsed to a bare `idle`, which is invalid SQL. `<>` is the standard
    # SQL spelling of `!=` and keeps `!` out of the double-quoted shell string.
    kubectl_command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c \"SELECT pid, query, state, wait_event_type, wait_event FROM pg_stat_activity WHERE state <> 'idle' ORDER BY query_start;\""
    estimated_downtime: "0"
    risk: medium
    responsibility: DB
    responsibility_reasoning: "慢查詢優化屬資料庫效能調優範疇"
    secondary_teams: [BE]
    optimization:
    - type: INDEX
      description: "使用 EXPLAIN ANALYZE 找出缺少索引的查詢"
      command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SELECT * FROM pg_stat_user_tables ORDER BY seq_scan DESC LIMIT 10;'"
    reasoning: "[規則匹配] 先找出阻塞查詢,必要時 pg_terminate_backend 解除鎖定。"
# ── Infrastructure services layer ───────────────────────────
# Redis unreachable: rolling-restart the redis deployment.
- id: redis_down
  priority: 85
  description: 'Redis 服務下線'
  match:
    alertname: [RedisDown]
    message:
    - 'redis'
    - 'cache down'
  response:
    action_title: '重啟 Redis {target}'
    description: '⚙️ 規則匹配: Redis ({instance}) 無法連線。影響 Session 管理、去重快取、AI Router 狀態。'
    suggested_action: RESTART_DEPLOYMENT
    kubectl_command: 'kubectl rollout restart deployment/redis -n {namespace}'
    estimated_downtime: '~30s'
    risk: critical
    responsibility: INFRA
    responsibility_reasoning: 'Redis 屬基礎設施快取層,下線影響多個上層服務'
    secondary_teams:
    - BE
    optimization:
    - type: HEALTH_CHECK
      description: '確認 Redis 連線'
      command: 'kubectl exec -n {namespace} deployment/redis -- redis-cli ping'
    reasoning: '[規則匹配] Redis 下線會導致去重失效和 AI Router 狀態丟失,需立即重啟。'
# Ollama unreachable: restart it on the host (systemd unit, falling back to the docker container).
- id: ollama_down
  priority: 90
  description: Ollama AI 服務下線
  match:
    alertname:
    - OllamaDown
    message:
    - ollama
    - llm down
    - ai service
  response:
    action_title: "重啟 Ollama 服務 on {host}"
    description: "⚙️ 規則匹配: Ollama ({instance}) 無法連線。影響 AI 規則自動生成和本地推理。"
    suggested_action: RESTART_DEPLOYMENT
    kubectl_command: "ssh {host} 'systemctl restart ollama || docker restart ollama'"
    estimated_downtime: "~2 min (model reload)"
    risk: medium
    responsibility: INFRA
    responsibility_reasoning: "Ollama 屬 AI 推理基礎設施,由基礎設施團隊管理"
    secondary_teams: []
    optimization:
    - type: HEALTH_CHECK
      description: "確認 Ollama 狀態和已載入模型"
      command: "curl -s http://{host}:11434/api/tags | jq '.models[].name'"
    # Clause separator restored; the text previously ran "Gemini重啟" together.
    reasoning: "[規則匹配] Ollama 下線觸發 AI Router fallback 至 Gemini,重啟恢復本地推理能力。"
# MinIO unreachable: restart the minio docker container on the host.
- id: minio_down
  priority: 95
  description: 'MinIO 物件儲存下線'
  match:
    alertname: [MinioDown]
    message:
    - 'minio'
    - 's3'
    - 'object storage'
  response:
    action_title: '重啟 MinIO {target}'
    description: '⚙️ 規則匹配: MinIO ({instance}) 無法連線。影響靜態資源和備份儲存。'
    suggested_action: RESTART_DEPLOYMENT
    kubectl_command: >-
      ssh {host} 'docker restart minio'
    estimated_downtime: '~1 min'
    risk: critical
    responsibility: INFRA
    responsibility_reasoning: 'MinIO 屬物件儲存基礎設施'
    secondary_teams: []
    optimization:
    - type: DISK_CHECK
      description: '確認磁碟空間充足'
      command: >-
        ssh {host} 'df -h /data/minio'
    reasoning: '[規則匹配] MinIO 下線需先確認磁碟空間,再重啟服務。'
# MinIO disk usage high: no automatic action; show usage and the largest directories.
- id: minio_disk_high
  priority: 96
  description: MinIO 磁碟使用率過高
  match:
    alertname:
    - MinioDiskUsageHigh
    - MinioDiskUsageCritical
    message:
    - disk usage
    - disk full
    - storage
  response:
    action_title: "清理 MinIO 過期資料 on {host}"
    description: "⚙️ 規則匹配: MinIO 磁碟使用率過高,需清理舊資料或擴展儲存空間。"
    suggested_action: NO_ACTION
    kubectl_command: "ssh {host} 'df -h /data/minio && du -sh /data/minio/* | sort -rh | head -10'"
    estimated_downtime: "0"
    risk: critical
    responsibility: INFRA
    responsibility_reasoning: "磁碟空間管理屬基礎設施團隊責任"
    secondary_teams: []
    optimization:
    - type: CLEANUP
      description: "清理 MinIO 舊備份和 lifecycle policy"
      # `mc admin lifecycle add` is not an mc subcommand; lifecycle rules are
      # managed per bucket with `mc ilm rule add`. <bucket> is a placeholder
      # for the operator, as no template variable carries a bucket name.
      command: "mc ilm rule add --expire-days 30 local/<bucket>"
    reasoning: "[規則匹配] 磁碟滿會導致寫入失敗,需立即清理最大的目錄。"
# Harbor registry unreachable: bring the compose stack up again on the host.
- id: harbor_down
  priority: 97
  description: 'Harbor Registry 下線'
  match:
    alertname: [HarborDown]
    message:
    - 'harbor'
    - 'registry'
    - 'docker registry'
  response:
    action_title: '重啟 Harbor Registry on {host}'
    description: '⚙️ 規則匹配: Harbor ({instance}) 無法連線。影響 CD 部署流程。'
    suggested_action: RESTART_DEPLOYMENT
    kubectl_command: >-
      ssh {host} 'cd /data/harbor && docker-compose up -d'
    estimated_downtime: '~2 min'
    risk: critical
    responsibility: INFRA
    responsibility_reasoning: 'Harbor 是 CD 部署的核心依賴,屬基礎設施團隊責任'
    secondary_teams: []
    optimization:
    - type: HEALTH_CHECK
      description: '確認 Harbor 各組件狀態'
      command: >-
        ssh {host} 'cd /data/harbor && docker-compose ps'
    reasoning: '[規則匹配] Harbor 下線會阻塞所有 CD 部署,需立即重啟。'
# ── K8s cluster layer ───────────────────────────────────────
# K3s node down: no automatic action; list nodes and describe the affected one.
- id: k3s_node_down
  priority: 100
  description: 'K3s 節點下線'
  match:
    alertname: [K3sNodeDown, K3sVIPDown]
    message:
    - 'node down'
    - 'node not ready'
    - 'k3s'
  response:
    action_title: '確認 K3s 節點 {target} 狀態'
    description: '⚙️ 規則匹配: K3s 節點下線,影響叢集可用性和 Pod 調度。'
    suggested_action: NO_ACTION
    kubectl_command: 'kubectl get nodes -o wide && kubectl describe node {target}'
    estimated_downtime: '依節點恢復時間'
    risk: critical
    responsibility: INFRA
    responsibility_reasoning: 'K3s 叢集節點管理屬基礎設施團隊責任'
    secondary_teams: []
    optimization:
    - type: NODE_DRAIN
      description: '先 drain 節點確保 Pod 安全遷移'
      command: 'kubectl drain {target} --ignore-daemonsets --delete-emptydir-data'
    reasoning: '[規則匹配] 節點下線需先確認主機可達性,必要時手動遷移 workload。'
# AWOOOI API unreachable: rolling-restart the awoooi-api deployment in the awoooi namespace.
- id: awoooi_api_down
  priority: 105
  description: 'AWOOOI API 服務下線'
  match:
    alertname: [AWOOOIApiDown, OpenClawDown]
    message:
    - 'awoooi api'
    - 'openclaw'
    - 'api down'
  response:
    action_title: '重啟 AWOOOI API deployment'
    description: '⚙️ 規則匹配: AWOOOI API 無法連線。影響所有告警處理和 AI 決策流程。'
    suggested_action: RESTART_DEPLOYMENT
    kubectl_command: 'kubectl rollout restart deployment/awoooi-api -n awoooi'
    estimated_downtime: '~1 min'
    risk: critical
    responsibility: BE
    responsibility_reasoning: 'AWOOOI API 是核心服務,屬後端團隊直接責任'
    secondary_teams:
    - INFRA
    optimization:
    - type: HEALTH_CHECK
      description: '確認 API Pod 狀態和最近 log'
      command: 'kubectl get pods -n awoooi && kubectl logs -n awoooi deployment/awoooi-api --tail=50'
    reasoning: '[規則匹配] AWOOOI API 下線需立即重啟,同時查 Pod log 確認根因。'
# ── Alert-chain monitoring ──────────────────────────────────
# Alert delivery chain broken: no automatic action; inspect the monitoring
# pods and query the Alertmanager status endpoint.
- id: alert_chain_broken
  priority: 110
  description: '告警鏈路中斷'
  match:
    alertname:
    - AlertChainBroken_Alertmanager
    - AlertChainBroken_Sentry
    - AlertChainBroken_SignOz
    - AlertChainUnhealthy
    - NoAlertsReceived2Hours
    message:
    - 'alert chain'
    - 'alertmanager'
    - 'no alerts'
  response:
    action_title: '診斷告警鏈路中斷'
    description: '⚙️ 規則匹配: 告警鏈路異常,可能導致真實告警無法送達 Telegram。'
    suggested_action: NO_ACTION
    # NOTE(review): uses the Alertmanager /api/v1 status path — confirm the
    # deployed Alertmanager version still serves it.
    kubectl_command: >-
      kubectl get pods -n monitoring && curl -s http://192.168.0.120:9093/api/v1/status | jq '.data.uptime'
    estimated_downtime: '監控盲區持續中'
    risk: critical
    responsibility: INFRA
    responsibility_reasoning: '告警鏈路屬基礎設施監控體系,需立即修復確保可觀測性'
    secondary_teams:
    - BE
    optimization:
    - type: E2E_TEST
      description: '發送測試告警驗證整條鏈路'
      # Folded scalar so the JSON payload needs no YAML escaping.
      command: >-
        curl -X POST http://192.168.0.125:32334/api/v1/test-alert -H 'Content-Type: application/json' -d '{"test": true}'
    reasoning: '[規則匹配] 告警鏈路中斷等同監控失明,最高優先修復。'
# ── GPU / AI infrastructure ─────────────────────────────────
# NVIDIA/Nemotron circuit breaker open or tool-calling degraded: no automatic
# action; query the AI router provider status.
- id: nvidia_circuit_breaker
  priority: 115
  description: NVIDIA/Nemotron 熔斷器開啟
  match:
    alertname:
    - NvidiaCircuitBreakerOpen
    - NvidiaToolCallingHighErrorRate
    - NvidiaToolCallingHighLatency
    message:
    - circuit breaker
    - nvidia
    - nemotron
    - tool calling
  response:
    action_title: "確認 NVIDIA API 熔斷狀態"
    # Clause separator restored; the text previously ran "過高AI Router" together.
    description: "⚙️ 規則匹配: NVIDIA/Nemotron 熔斷器開啟或錯誤率過高,AI Router 已自動降級。"
    suggested_action: NO_ACTION
    kubectl_command: "curl -s http://192.168.0.125:32334/api/v1/ai-router/status | jq '.providers'"
    estimated_downtime: "0 (已自動 fallback)"
    risk: medium
    responsibility: BE
    responsibility_reasoning: "AI Provider 熔斷管理屬後端 AI Router 責任範圍"
    secondary_teams: []
    optimization:
    - type: CIRCUIT_BREAKER_RESET
      description: "等待熔斷器自動恢復 (half-open 狀態)"
      command: "curl -s http://192.168.0.125:32334/api/v1/ai-router/reset -X POST"
    # Clause separator restored; the text previously ran "Provider監控" together.
    reasoning: "[規則匹配] AI Router 已自動降級至備援 Provider,監控熔斷器恢復狀態即可。"
# ── E2E / smoke-test alerts ─────────────────────────────────
# 2026-04-09 Claude Sonnet 4.6: recognise synthetic E2E test alerts; record only, never remediate.
# NOTE(review): with priority 120 this rule is evaluated after the real-alert
# rules; e.g. the keyword 'alert chain smoke' contains 'alert chain', which the
# alert_chain_broken rule (priority 110) also lists — confirm test alerts are
# not captured there first.
- id: e2e_smoke_test
  priority: 120
  description: 'E2E Smoke Test / 告警鏈路驗證假告警'
  match:
    alertname: [E2E_SMOKE_TEST, E2E_FINAL_SMOKE_TEST, SmokeTest]
    instance_prefix:
    - 'e2e-final-'
    - 'e2e-test-'
    - 'test-host'
    - 'smoke-test-'
    message:
    - 'e2e smoke test'
    - 'smoke test'
    - 'please ignore'
    - 'e2e test'
    - 'e2e-final'
    - 'e2e-test'
    - 'e2e_smoke'
    - 'alert chain smoke'
  response:
    action_title: '告警鏈路驗證成功 (E2E)'
    description: '✅ E2E Smoke Test 告警已收到,告警鏈路正常。此告警僅用於驗證,無需修復動作。'
    suggested_action: NO_ACTION
    kubectl_command: ''
    estimated_downtime: 'N/A'
    risk: low
    responsibility: INFRA
    responsibility_reasoning: 'E2E smoke test 假告警,告警鏈路驗證用途,系統自動識別跳過修復'
    secondary_teams: []
    optimization: []
    reasoning: '[規則匹配] E2E Smoke Test 假告警,僅確認告警鏈路暢通,無實際服務異常。'
# ── Backup failures ─────────────────────────────────────────
# 2026-04-11 Claude Sonnet 4.6: backup alerts are host-level; there is no K8s deployment to restart
# → TYPE-1 informational notification only; no [restart] button should appear.
- id: host_backup_failed
  priority: 50
  description: '備份任務失敗 (rsync/velero/HostBackupFailed)'
  match:
    alertname: [HostBackupFailed, VeleroBackupFailed, VeleroBackupNotRun, BackupJobFailed]
  response:
    action_title: '🔍 備份失敗自動診斷 — SSH 收集備份與磁碟狀態'
    description: '⚠️ 備份任務失敗。先自動 SSH 收集 backup log、last_success 與磁碟空間;若無法確認安全修復,立即升級緊急介入。'
    suggested_action: SSH_DIAGNOSE
    # 2026-05-02 ogt + Claude Sonnet 4.6: prepended `ps aux` so _ssh_execute takes the diagnostics path (unblocked).
    kubectl_command: >-
      ssh {host} 'ps aux --sort=-%cpu | head -15; echo "=== BACKUP STATUS ==="; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo "=== LAST SUCCESS ==="; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo "=== BACKUP LOG ==="; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo "=== DISK ==="; df -h /home/ollama /backup / 2>/dev/null || df -h'
    estimated_downtime: 'N/A'
    risk: low
    responsibility: INFRA
    responsibility_reasoning: '備份失敗屬基礎設施維運問題,先自動收集只讀證據,再交由緊急介入或後續 Playbook 修復'
    secondary_teams: []
    optimization: []
    reasoning: '[規則匹配] 備份失敗先自動 SSH 只讀診斷,避免 LLM 誤判為 K8s deployment 重啟。'
# ── DevOps tooling layer ────────────────────────────────────
# 2026-04-14 Claude Sonnet 4.6: Task 2.2 ADR-076 — added the devops_tool / ssl_cert / external_site rule families.
# Design principle: CI/CD tools and external services are all NO_ACTION; no auto-remediation, the risk of a wrong operation is too high.
- id: gitea_down
  priority: 125
  description: 'Gitea CI/CD 服務下線(不自動修復)'
  match:
    alertname: [GiteaDown, GiteaServiceDown, GiteaUnhealthy]
    message:
    - 'gitea'
    - 'git server'
    - 'ci/cd down'
  response:
    action_title: 'Gitea ({instance}) 下線 — 需人工確認'
    description: '⚠️ 規則匹配: Gitea CI/CD 服務 ({instance}) 無法連線,影響所有部署流程。不自動重啟(誤觸 CD 風險過高)。'
    suggested_action: NO_ACTION
    kubectl_command: ''
    estimated_downtime: 'N/A'
    risk: critical
    responsibility: INFRA
    responsibility_reasoning: 'Gitea 是 CI/CD 核心,自動重啟有誤觸部署風險,需人工確認狀態後手動操作'
    secondary_teams: []
    optimization:
    - type: HEALTH_CHECK
      description: '確認 Gitea 服務狀態'
      command: >-
        ssh {host} 'cd /data/gitea && docker compose ps && docker compose logs --tail=20 gitea'
    reasoning: '[規則匹配] Gitea 下線不自動修復,通知後由人工確認狀態再操作,避免 CD pipeline 誤觸發。'
# SSL/TLS certificate expiring or expired: notify only; renewal is manual.
- id: ssl_cert_expiring
  priority: 126
  description: 'SSL/TLS 憑證即將到期或已到期'
  match:
    alertname:
    - SSLCertExpiringSoon
    - SSLCertExpired
    - CertificateExpirationWarning
    - TLSCertExpiring
    message:
    - 'ssl cert'
    - 'certificate expir'
    - 'tls cert'
    - 'cert will expire'
  response:
    action_title: 'SSL 憑證 ({instance}) 即將到期 — 需人工更新'
    description: '⚠️ 規則匹配: SSL/TLS 憑證 ({instance}) 即將到期或已到期。無自動修復,需人工確認 cert-manager 或執行 certbot 更新。'
    suggested_action: NO_ACTION
    kubectl_command: ''
    estimated_downtime: 'N/A'
    risk: medium
    responsibility: INFRA
    responsibility_reasoning: 'SSL 憑證更新需域名驗證,屬基礎設施團隊責任'
    secondary_teams: []
    optimization:
    - type: CERT_RENEWAL
      description: '確認 cert-manager 自動更新狀態'
      command: 'kubectl get certificate,certificaterequest -A && kubectl get secret -n awoooi-prod | grep tls'
    reasoning: '[規則匹配] SSL 憑證到期無法自動修復,需人工操作 certbot 或確認 cert-manager 自動更新是否正常。'
# External site / HTTP probe failure: notify only; nothing here can be auto-remediated.
- id: external_site_down
  priority: 127
  # Opening parenthesis restored; the text previously had only the closing one.
  description: 外部網站或服務下線(MoWooo 系列 / HTTP probe 失敗)
  match:
    alertname:
    - MoWoooWorkDown
    - MoWoooDevDown
    - ExternalSiteDown
    - WebsiteDown
    - BlackboxProbeFailed
    message:
    - external site
    - website down
    - mowooo
    - http probe failed
    - probe failed
  response:
    action_title: "外部網站 {instance} 下線 — 僅通知"
    description: "⚠️ 規則匹配: 外部網站 ({instance}) HTTP probe 失敗。此為外部服務,無自動修復動作,等待服務恢復。"
    suggested_action: NO_ACTION
    kubectl_command: ""
    estimated_downtime: "N/A"
    risk: medium
    responsibility: INFRA
    responsibility_reasoning: "外部網站超出系統控制範圍,無法自動修復,通知後人工跟進"
    secondary_teams: []
    optimization:
    - type: STATUS_CHECK
      description: "手動確認外部網站狀態"
      command: "curl -sv {instance} --max-time 10 2>&1 | grep -E '(HTTP|Connected|Failed)'"
    reasoning: "[規則匹配] 外部網站下線屬外部依賴,通知統帥後等待服務恢復,必要時切換備援路徑。"
# 2026-04-24 ogt + Claude Sonnet 4.6: Sentry / ClickHouse monitoring alerts — external services, kubectl operations are forbidden.
- id: sentry_clickhouse_alert
  priority: 60
  # Closing parenthesis restored.
  description: Sentry 或 ClickHouse 監控告警(外部服務,不是 K8s workload)
  match:
    alertname:
    - SentryClickHouseMemoryPressure
    - SentryClickHouseCpuHigh
    - SentryClickHouseDiskUsageHigh
    - ClickHouseMemoryHigh
    - ClickHouseMemoryPressure
    - ClickHouseCpuHigh
    - ClickHouseReplicationLag
    - ClickHouseQuerySlow
    - SentryWorkerQueueHigh
    - SentryKafkaLag
    - SentryBacklogHigh
  response:
    action_title: "⚠️ Sentry/ClickHouse 告警 — 需 SSH 人工排查"
    # Parentheses around the tool list restored.
    description: "⚠️ Sentry/ClickHouse 屬外部監控服務,無法透過 kubectl 自動修復。請 SSH 登入服務主機排查根因(clickhouse-client / docker stats / journalctl -xe)。若記憶體壓力持續考慮調整 ClickHouse max_memory_usage 設定或清理舊資料。"
    suggested_action: NO_ACTION
    kubectl_command: ""
    estimated_downtime: "N/A"
    # The file header documents risk as low / medium / critical; the former
    # value `high` is outside that set, so it is mapped to the nearest
    # documented level.
    risk: critical
    responsibility: INFRA
    responsibility_reasoning: "Sentry/ClickHouse 基礎設施由 INFRA 團隊管理"
    secondary_teams: []
    optimization: []
    # Clause separator restored; the text previously ran "服務kubectl" together.
    reasoning: "[規則匹配] Sentry/ClickHouse 非 K8s 服務,kubectl 操作無效。需 SSH 進入服務主機,確認記憶體/CPU/磁碟狀況後手動介入。"
# ── Generic fallback ────────────────────────────────────────
# Catch-all for alerts no other rule matched: never proposes a remediation.
- id: generic_fallback
  priority: 999
  description: 通用兜底規則 (無法匹配的告警)
  match:
    alertname:
    # Quoted: a bare * would be read by YAML as an alias indicator.
    - "*"
  response:
    # The title previously read "重新啟動 {target} 服務" (restart), which
    # contradicted suggested_action NO_ACTION and the empty command below.
    action_title: "診斷 {target} 異常 — 需人工確認"
    description: "⚙️ 規則匹配: {target} 發生異常,需進一步診斷確認根因。"
    suggested_action: NO_ACTION
    kubectl_command: ""
    estimated_downtime: "N/A"
    risk: medium
    responsibility: COLLAB
    responsibility_reasoning: "告警資訊不足以判定單一責任團隊,建議多團隊協同排查"
    secondary_teams: [BE, INFRA]
    optimization: []
    reasoning: "[規則匹配] 未知告警類型,無法安全判斷修復動作,由人工或 LLM 診斷後決策。"