---
# AWOOOI OpenClaw alert rule matching engine
# ============================================================
# Format:
#   match.alertname  : Prometheus alertname, exact match (list = OR)
#   match.alert_type : alert_type keyword (list = OR, partial match)
#   match.message    : message keyword (list = OR, partial match,
#                      case-insensitive)
#   response.*       : response template; supports the variables
#                      {target} {host} {container} {instance} {job} {namespace}
#   responsibility   : FE / BE / INFRA / DB / COLLAB
#   risk             : low / medium / critical
#   confidence       : 0.0 (fixed value for rule matches; must not be faked)
#
# Editing rules: no redeployment is needed; restarting the API Pod hot-loads
#                the changes.
# Adding rules:  append to the end of the `rules` list; a smaller `priority`
#                takes precedence.
# 2026-04-09 ogt: initial version, extracted from
#                 openclaw.py _generate_mock_response
#
# NOTE(review): the nesting below (risk / responsibility / optimization /
# reasoning placed under `response`) is reconstructed from key order —
# confirm against the loader's schema.
# ============================================================

version: "1.0.0"
updated_at: "2026-04-09"

rules:
  # ── Docker / host layer ─────────────────────────────────────

  # Docker container whose healthcheck is failing: inspect its health status
  # and restart it, then verify the healthcheck command itself.
  - id: docker_container_unhealthy
    priority: 10
    description: Docker 容器 healthcheck 失敗
    match:
      alertname:
        - DockerContainerUnhealthy
      message:
        - unhealthy
        - health check
        - healthcheck
    response:
      action_title: "檢查 Docker 容器 {container} 健康狀態"
      description: "⚙️ 規則匹配: Docker 容器 {container} ({host}) healthcheck 失敗。常見原因: 應用程式啟動慢、healthcheck 指令錯誤、依賴服務未就緒。"
      suggested_action: RESTART_DEPLOYMENT
      # NOTE(review): `{{.State.Health.Status}}` is a Go template meant for
      # docker. If the engine renders `{var}` placeholders with Python
      # str.format, the doubled braces collapse to single ones and the
      # --format argument breaks — confirm the substitution mechanism.
      kubectl_command: "ssh {host} 'docker inspect {container} --format=\"{{.State.Health.Status}}\" && docker restart {container}'"
      estimated_downtime: "~30s"
      risk: medium
      responsibility: INFRA
      responsibility_reasoning: "Docker 容器健康檢查失敗屬基礎設施團隊責任,需確認 healthcheck 設定與容器狀態"
      secondary_teams: [BE]
      optimization:
        - type: HEALTHCHECK
          description: "確認 healthcheck 指令在容器內可執行 (mc/curl 是否存在)"
          # NOTE(review): this probe is MinIO-specific (mc / port 9000
          # /minio/health/live) although the rule matches any container.
          command: "ssh {host} 'docker exec {container} sh -c \"mc ready local 2>/dev/null || curl -sf http://localhost:9000/minio/health/live\"'"
      reasoning: "[規則匹配] Docker healthcheck 失敗先 restart 恢復服務,同時確認 healthcheck 指令正確。"

  # Prometheus scrape target is down: check the exporter on the host.
  - id: target_down
    priority: 20
    description: Prometheus scrape target 下線
    match:
      alertname:
        - TargetDown
        - InstanceDown
    response:
      action_title: "確認 {job} ({instance}) 服務存活"
      description: "⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。常見原因: 主機離線、exporter crash、防火牆封鎖。"
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: "ssh {host} 'systemctl status node_exporter 2>/dev/null || docker ps | grep exporter'"
      estimated_downtime: "監控盲區持續中"
      risk: medium
      responsibility: INFRA
      responsibility_reasoning: "Prometheus scrape 目標下線屬基礎設施監控範疇"
      secondary_teams: []
      optimization:
        - type: MONITORING
          description: "確認 exporter 進程是否存活"
          command: "ssh {host} 'ps aux | grep exporter | grep -v grep'"
      reasoning: "[規則匹配] Prometheus target 下線,先 SSH 確認主機存活再重啟 exporter。"

  # ── K8s Pod layer ───────────────────────────────────────────

  # Pod killed for exceeding its memory limit: delete the pod and fix the
  # resource configuration.
  # NOTE(review): {target} is used as a pod name in kubectl_command and as a
  # deployment/container name in the optimization commands — confirm what
  # the engine substitutes.
  - id: oom_killed
    priority: 30
    description: Pod OOMKilled 記憶體不足
    match:
      alert_type:
        - memory
      message:
        - oomkilled
        - oom
        - out of memory
    response:
      action_title: "刪除異常 Pod {target} (OOMKilled)"
      description: "⚙️ 規則匹配: {target} 發生 OOMKilled,根因為 JVM Heap 配置與 K8s memory limit 不匹配或存在記憶體洩漏。"
      suggested_action: DELETE_POD
      kubectl_command: "kubectl delete pod {target} -n {namespace}"
      estimated_downtime: "~30s"
      risk: critical
      responsibility: BE
      responsibility_reasoning: "OOMKilled 通常源於應用程式記憶體配置不當,屬後端團隊責任範圍"
      secondary_teams: [INFRA]
      optimization:
        - type: RESOURCE_LIMIT
          description: "調整 memory limit 至 1Gi 並確保 JVM -Xmx 不超過 70%"
          command: "kubectl set resources deployment/{target} -c {target} --limits=memory=1Gi -n {namespace}"
        - type: HPA
          description: "啟用基於記憶體的 HPA 自動擴展"
          # `kubectl autoscale` has no --memory-percent flag. Recent kubectl
          # releases accept --memory=<N>%; older clients only offer
          # --cpu-percent, in which case a memory-based HPA needs an
          # autoscaling/v2 manifest instead.
          command: "kubectl autoscale deployment {target} --memory=80% --min=2 --max=5 -n {namespace}"
      reasoning: "[規則匹配] Pod OOMKilled 後 ReplicaSet 將自動重建,但需同步修正資源配置防止復發。"

  # Sustained high CPU usage: scale the deployment out to 3 replicas.
  - id: high_cpu
    priority: 40
    description: Pod CPU 使用率過高
    match:
      alert_type:
        - cpu
        - high_cpu
    response:
      action_title: "擴展 {target} 副本數 + 啟用 HPA"
      description: "⚙️ 規則匹配: {target} CPU 使用率過高,根因為流量突增或計算密集任務未配置自動擴展。"
      suggested_action: SCALE_DEPLOYMENT
      kubectl_command: "kubectl scale deployment {target} --replicas=3 -n {namespace}"
      estimated_downtime: "0"
      risk: medium
      responsibility: INFRA
      responsibility_reasoning: "自動擴展策略未配置或閾值過高,屬基礎設施團隊責任"
      secondary_teams: [BE]
      optimization:
        - type: RESOURCE_LIMIT
          # The command sets requests (500m) below limits (2000m), which
          # yields the Burstable QoS class, not Guaranteed; the description
          # states what the command actually does.
          description: "增加 CPU request 並放寬 CPU limit (QoS 為 Burstable)"
          command: "kubectl set resources deployment/{target} --requests=cpu=500m --limits=cpu=2000m -n {namespace}"
      reasoning: "[規則匹配] 水平擴展可即時分散負載,同時建議配置 HPA 防止復發。"

  # Elevated HTTP 5xx rate: rolling restart and cross-team investigation.
  - id: http_5xx
    priority: 50
    description: HTTP 5xx 錯誤率過高
    match:
      alert_type:
        - http
      # Status codes are quoted so they stay strings for keyword matching.
      message:
        - "5xx"
        - "502"
        - "503"
        - "500"
    response:
      action_title: "重啟 {target} + 檢查上游服務"
      description: "⚙️ 規則匹配: {target} 產生 HTTP 5xx 錯誤,可能為應用程式例外或上游服務不可達。"
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: "kubectl rollout restart deployment/{target} -n {namespace}"
      estimated_downtime: "~1 min"
      risk: critical
      responsibility: COLLAB
      responsibility_reasoning: "HTTP 5xx 可能源於前端路由、後端邏輯或基礎設施,需多團隊協同排查"
      secondary_teams: [FE, BE, INFRA]
      optimization:
        - type: CIRCUIT_BREAKER
          description: "配置熔斷器防止故障擴散"
          command: "# Istio VirtualService outlierDetection 配置"
      reasoning: "[規則匹配] HTTP 錯誤需協同排查,先重啟恢復服務同時通知相關團隊。"

  # Pod in CrashLoopBackOff: read the previous container's logs first.
  - id: pod_crash
    priority: 60
    description: Pod CrashLoopBackOff
    match:
      alert_type:
        - pod_crash
        - crash
      message:
        - crashloop
        - crash
        - backoff
    response:
      action_title: "診斷 {target} CrashLoop 根因"
      description: "⚙️ 規則匹配: {target} 進入 CrashLoopBackOff,需檢查啟動錯誤日誌。"
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: "kubectl logs {target} -n {namespace} --previous --tail=50"
      estimated_downtime: "依根因而定"
      risk: critical
      responsibility: BE
      responsibility_reasoning: "Pod crash 通常源於應用程式啟動錯誤,屬後端團隊責任"
      secondary_teams: [INFRA]
      optimization:
        - type: LIVENESS_PROBE
          description: "調整 liveness probe 初始延遲防止誤殺"
          command: "# 調整 initialDelaySeconds >= 應用啟動時間"
      reasoning: "[規則匹配] 先查 previous log 確認 crash 原因,再決定修復策略。"

  # ── Generic fallback ────────────────────────────────────────

  # Catch-all for alerts that no earlier rule matched (highest priority value).
  - id: generic_fallback
    priority: 999
    description: 通用兜底規則 (無法匹配的告警)
    match:
      # Quoted because a bare `*` is YAML alias syntax.
      # NOTE(review): the header documents alertname as an exact match, so
      # "*" presumably relies on the engine special-casing it as a wildcard —
      # confirm.
      alertname:
        - "*"
    response:
      action_title: "重新啟動 {target} 服務"
      description: "⚙️ 規則匹配: {target} 發生異常,需進一步診斷確認根因。"
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: "kubectl rollout restart deployment/{target} -n {namespace}"
      estimated_downtime: "5-15 min"
      risk: medium
      responsibility: COLLAB
      responsibility_reasoning: "告警資訊不足以判定單一責任團隊,建議多團隊協同排查"
      secondary_teams: [BE, INFRA]
      optimization: []
      reasoning: "[規則匹配] 根據告警先重啟恢復服務,同時安排深入診斷。"