Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
- 新增 alert_rules.yaml: 6 條規則 (docker/target_down/oom/cpu/5xx/crash) + 通用兜底 - 新增 alert_rule_engine.py: YAML 載入、匹配邏輯、變數填充 - openclaw.py _generate_mock_response: 重構為呼叫規則引擎 (v8.0) - 新增規則只需修改 YAML,重啟 Pod 即可,不需改代碼 - 2026-04-09 ogt: 架構重構 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
200 lines
8.9 KiB
YAML
200 lines
8.9 KiB
YAML
# AWOOOI OpenClaw alert rule matching engine
# ============================================================
# Format:
#   match.alertname  : Prometheus alertname, exact match (list = OR)
#   match.alert_type : alert_type keywords (list = OR, substring match)
#   match.message    : message keywords (list = OR, substring match, case-insensitive)
#   response.*       : response templates; supported variables:
#                      {target} {host} {container} {instance} {job} {namespace}
#   responsibility   : FE / BE / INFRA / DB / COLLAB
#   risk             : low / medium / critical
#   confidence       : 0.0 (fixed value for rule matches; must not be fabricated)
#
# Editing rules: no code change or redeployment is needed; the file is
#                reloaded when the API Pod is restarted (not a live hot reload).
# Adding rules : append to the end of the `rules` list; a smaller `priority`
#                value takes precedence.
# 2026-04-09 ogt: initial version, extracted from openclaw.py _generate_mock_response
# ============================================================
|
||
|
||
# File version and the date of the last edit (both kept as strings).
version: '1.0.0'
updated_at: '2026-04-09'

# Rule entries; each item carries an id, a priority, match criteria and a
# response template.
rules:
|
||
# ── Docker / Host 層 ────────────────────────────────────────
|
||
|
||
# Rule: Docker container healthcheck failure (priority 10).
- id: docker_container_unhealthy
  priority: 10
  description: 'Docker 容器 healthcheck 失敗'
  match:
    alertname:
    - DockerContainerUnhealthy
    message:
    - unhealthy
    - health check
    - healthcheck
  response:
    action_title: '檢查 Docker 容器 {container} 健康狀態'
    description: '⚙️ 規則匹配: Docker 容器 {container} ({host}) healthcheck 失敗。常見原因: 應用程式啟動慢、healthcheck 指令錯誤、依賴服務未就緒。'
    suggested_action: RESTART_DEPLOYMENT
    # Folded scalar: the lines below join with single spaces into one command.
    # NOTE(review): `{{.State.Health.Status}}` is a Go template for
    # `docker inspect`. If the engine fills placeholders with Python
    # str.format, the doubled braces collapse to single braces and the
    # template breaks -- confirm the substitution method.
    kubectl_command: >-
      ssh {host} 'docker inspect {container}
      --format="{{.State.Health.Status}}"
      && docker restart {container}'
    estimated_downtime: '~30s'
    risk: medium
    responsibility: INFRA
    responsibility_reasoning: 'Docker 容器健康檢查失敗屬基礎設施團隊責任,需確認 healthcheck 設定與容器狀態'
    secondary_teams:
    - BE
    optimization:
    - type: HEALTHCHECK
      description: '確認 healthcheck 指令在容器內可執行 (mc/curl 是否存在)'
      # NOTE(review): this probe targets MinIO (`mc`, port 9000
      # /minio/health/live) although the rule's match criteria are not
      # MinIO-specific -- confirm this is intended.
      command: >-
        ssh {host} 'docker exec {container} sh -c
        "mc ready local 2>/dev/null
        || curl -sf http://localhost:9000/minio/health/live"'
    reasoning: '[規則匹配] Docker healthcheck 失敗先 restart 恢復服務,同時確認 healthcheck 指令正確。'
|
||
|
||
# Rule: Prometheus scrape target is down (priority 20).
- id: target_down
  priority: 20
  description: 'Prometheus scrape target 下線'
  match:
    alertname:
    - TargetDown
    - InstanceDown
  response:
    action_title: '確認 {job} ({instance}) 服務存活'
    description: '⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。常見原因: 主機離線、exporter crash、防火牆封鎖。'
    suggested_action: RESTART_DEPLOYMENT
    # Folded scalar: the lines below join with single spaces into one command.
    kubectl_command: >-
      ssh {host} 'systemctl status node_exporter 2>/dev/null
      || docker ps | grep exporter'
    estimated_downtime: '監控盲區持續中'
    risk: medium
    responsibility: INFRA
    responsibility_reasoning: 'Prometheus scrape 目標下線屬基礎設施監控範疇'
    secondary_teams: []
    optimization:
    - type: MONITORING
      description: '確認 exporter 進程是否存活'
      command: >-
        ssh {host} 'ps aux | grep exporter | grep -v grep'
    reasoning: '[規則匹配] Prometheus target 下線,先 SSH 確認主機存活再重啟 exporter。'
|
||
|
||
# ── K8s Pod 層 ──────────────────────────────────────────────
|
||
|
||
# Rule: Pod OOMKilled / out of memory (priority 30).
- id: oom_killed
  priority: 30
  description: 'Pod OOMKilled 記憶體不足'
  match:
    alert_type:
    - memory
    message:
    - oomkilled
    - oom
    - out of memory
  response:
    action_title: '刪除異常 Pod {target} (OOMKilled)'
    description: '⚙️ 規則匹配: {target} 發生 OOMKilled,根因為 JVM Heap 配置與 K8s memory limit 不匹配或存在記憶體洩漏。'
    suggested_action: DELETE_POD
    kubectl_command: 'kubectl delete pod {target} -n {namespace}'
    estimated_downtime: '~30s'
    risk: critical
    responsibility: BE
    responsibility_reasoning: 'OOMKilled 通常源於應用程式記憶體配置不當,屬後端團隊責任範圍'
    secondary_teams:
    - INFRA
    optimization:
    - type: RESOURCE_LIMIT
      description: '調整 memory limit 至 1Gi 並確保 JVM -Xmx 不超過 70%'
      command: 'kubectl set resources deployment/{target} -c {target} --limits=memory=1Gi -n {namespace}'
    - type: HPA
      description: '啟用基於記憶體的 HPA 自動擴展'
      # Fixed: `kubectl autoscale` has no `--memory-percent` flag, so the
      # previous command failed with "unknown flag". `--memory=80%` targets
      # average memory utilization; it requires a kubectl release that ships
      # the `--memory` flag. On older clients apply an autoscaling/v2
      # HorizontalPodAutoscaler manifest instead.
      command: 'kubectl autoscale deployment {target} --memory=80% --min=2 --max=5 -n {namespace}'
    reasoning: '[規則匹配] Pod OOMKilled 後 ReplicaSet 將自動重建,但需同步修正資源配置防止復發。'
|
||
|
||
# Rule: Pod CPU usage too high (priority 40).
- id: high_cpu
  priority: 40
  description: 'Pod CPU 使用率過高'
  match:
    alert_type:
    - cpu
    - high_cpu
  response:
    action_title: '擴展 {target} 副本數 + 啟用 HPA'
    description: '⚙️ 規則匹配: {target} CPU 使用率過高,根因為流量突增或計算密集任務未配置自動擴展。'
    suggested_action: SCALE_DEPLOYMENT
    kubectl_command: 'kubectl scale deployment {target} --replicas=3 -n {namespace}'
    estimated_downtime: '0'
    risk: medium
    responsibility: INFRA
    responsibility_reasoning: '自動擴展策略未配置或閾值過高,屬基礎設施團隊責任'
    secondary_teams:
    - BE
    optimization:
    - type: RESOURCE_LIMIT
      # Fixed: the text used to promise QoS "Guaranteed", but the command
      # sets requests (500m) below limits (2000m), which Kubernetes classes
      # as Burstable. Guaranteed needs requests == limits for both CPU and
      # memory on every container. The command is unchanged; only the claim
      # is corrected.
      description: '提高 CPU request/limit 以保障排程資源並降低 CPU 節流 (requests 與 limits 不同,QoS 為 Burstable)'
      command: 'kubectl set resources deployment/{target} --requests=cpu=500m --limits=cpu=2000m -n {namespace}'
    reasoning: '[規則匹配] 水平擴展可即時分散負載,同時建議配置 HPA 防止復發。'
|
||
|
||
# Rule: elevated HTTP 5xx error rate (priority 50).
- id: http_5xx
  priority: 50
  description: 'HTTP 5xx 錯誤率過高'
  match:
    alert_type:
    - http
    # Quoted so the status codes stay strings instead of parsing as integers.
    message:
    - '5xx'
    - '502'
    - '503'
    - '500'
  response:
    action_title: '重啟 {target} + 檢查上游服務'
    description: '⚙️ 規則匹配: {target} 產生 HTTP 5xx 錯誤,可能為應用程式例外或上游服務不可達。'
    suggested_action: RESTART_DEPLOYMENT
    kubectl_command: 'kubectl rollout restart deployment/{target} -n {namespace}'
    estimated_downtime: '~1 min'
    risk: critical
    responsibility: COLLAB
    responsibility_reasoning: 'HTTP 5xx 可能源於前端路由、後端邏輯或基礎設施,需多團隊協同排查'
    secondary_teams:
    - FE
    - BE
    - INFRA
    optimization:
    - type: CIRCUIT_BREAKER
      description: '配置熔斷器防止故障擴散'
      # The value is a shell-style comment placeholder, not a runnable command.
      command: '# Istio VirtualService outlierDetection 配置'
    reasoning: '[規則匹配] HTTP 錯誤需協同排查,先重啟恢復服務同時通知相關團隊。'
|
||
|
||
# Rule: Pod in CrashLoopBackOff (priority 60).
- id: pod_crash
  priority: 60
  description: 'Pod CrashLoopBackOff'
  match:
    alert_type:
    - pod_crash
    - crash
    message:
    - crashloop
    - crash
    - backoff
  response:
    action_title: '診斷 {target} CrashLoop 根因'
    description: '⚙️ 規則匹配: {target} 進入 CrashLoopBackOff,需檢查啟動錯誤日誌。'
    suggested_action: RESTART_DEPLOYMENT
    kubectl_command: 'kubectl logs {target} -n {namespace} --previous --tail=50'
    estimated_downtime: '依根因而定'
    risk: critical
    responsibility: BE
    responsibility_reasoning: 'Pod crash 通常源於應用程式啟動錯誤,屬後端團隊責任'
    secondary_teams:
    - INFRA
    optimization:
    - type: LIVENESS_PROBE
      description: '調整 liveness probe 初始延遲防止誤殺'
      # The value is a shell-style comment placeholder, not a runnable command.
      command: '# 調整 initialDelaySeconds >= 應用啟動時間'
    reasoning: '[規則匹配] 先查 previous log 確認 crash 原因,再決定修復策略。'
|
||
|
||
# ── 通用兜底 ────────────────────────────────────────────────
|
||
|
||
# Rule: generic fallback for alerts no other rule matched (priority 999).
- id: generic_fallback
  priority: 999
  description: '通用兜底規則 (無法匹配的告警)'
  match:
    # `*` must stay quoted; a bare `*` would be read as a YAML alias.
    # NOTE(review): the file header documents alertname as an exact match,
    # so this presumably relies on the engine treating "*" as a wildcard --
    # confirm against alert_rule_engine.py.
    alertname:
    - '*'
  response:
    action_title: '重新啟動 {target} 服務'
    description: '⚙️ 規則匹配: {target} 發生異常,需進一步診斷確認根因。'
    suggested_action: RESTART_DEPLOYMENT
    kubectl_command: 'kubectl rollout restart deployment/{target} -n {namespace}'
    estimated_downtime: '5-15 min'
    risk: medium
    responsibility: COLLAB
    responsibility_reasoning: '告警資訊不足以判定單一責任團隊,建議多團隊協同排查'
    secondary_teams:
    - BE
    - INFRA
    optimization: []
    reasoning: '[規則匹配] 根據告警先重啟恢復服務,同時安排深入診斷。'
|