# awoooi/apps/api/alert_rules.yaml

# AWOOOI OpenClaw 告警規則匹配引擎
# ============================================================
# 格式說明:
#   match.alertname   : Prometheus alertname 完全匹配 (list = OR)
#   match.alert_type  : alert_type 關鍵字 (list = OR, 部分匹配)
#   match.message     : message 關鍵字 (list = OR, 部分匹配, 不分大小寫)
#   response.*        : 回應模板，支援變數 {target} {host} {container} {instance} {job} {namespace}
#   responsibility    : FE / BE / INFRA / DB / COLLAB
#   risk              : low / medium / critical
#   confidence        : 0.0 (規則匹配固定值，禁止偽造)
#
# 修改規則: 不需要重新部署，重啟 API Pod 後即會重新載入規則 (需重啟，並非執行中熱載入)
# 新增規則: 在 rules 清單末尾加入，priority 越小越優先
# 2026-04-09 ogt: 初版，從 openclaw.py _generate_mock_response 抽出
# ============================================================

# Rule-file metadata. Both values are quoted so they load as strings
# (an unquoted date would otherwise be parsed as a date object by many loaders).
version: '1.0.0'
updated_at: '2026-04-09'

rules:
  # ── Docker / Host 層 ────────────────────────────────────────

  # Rule: a Docker container's healthcheck is failing.
  # Matches on alertname DockerContainerUnhealthy and on the message keywords
  # below. How the two match fields are combined is decided by the rule engine,
  # which is not visible in this file.
  - id: docker_container_unhealthy
    priority: 10
    description: Docker 容器 healthcheck 失敗
    match:
      alertname: [DockerContainerUnhealthy]
      message: [unhealthy, health check, healthcheck]
    response:
      action_title: '檢查 Docker 容器 {container} 健康狀態'
      description: '⚙️ 規則匹配: Docker 容器 {container} ({host}) healthcheck 失敗。常見原因: 應用程式啟動慢、healthcheck 指令錯誤、依賴服務未就緒。'
      suggested_action: RESTART_DEPLOYMENT
      # Over SSH: print the container's health status, then restart the container.
      # NOTE(review): the doubled braces are Docker's Go-template syntax. If the
      # engine renders {placeholders} with Python str.format they would collapse
      # to single braces and break --format; confirm against the renderer.
      kubectl_command: >-
        ssh {host} 'docker inspect {container} --format="{{.State.Health.Status}}" && docker restart {container}'
      estimated_downtime: '~30s'
      risk: medium
      responsibility: INFRA
      responsibility_reasoning: 'Docker 容器健康檢查失敗屬基礎設施團隊責任，需確認 healthcheck 設定與容器狀態'
      secondary_teams:
        - BE
      optimization:
        - type: HEALTHCHECK
          description: '確認 healthcheck 指令在容器內可執行 (mc/curl 是否存在)'
          # NOTE(review): this probe targets MinIO (`mc ready local`, port 9000
          # /minio/health/live) although the rule itself matches any container.
          command: >-
            ssh {host} 'docker exec {container} sh -c "mc ready local 2>/dev/null || curl -sf http://localhost:9000/minio/health/live"'
      reasoning: '[規則匹配] Docker healthcheck 失敗先 restart 恢復服務，同時確認 healthcheck 指令正確。'

  # Rule: a Prometheus scrape target is down.
  # Matches only on alertname (TargetDown or InstanceDown); no alert_type or
  # message keywords are configured for this rule.
  - id: target_down
    priority: 20
    description: Prometheus scrape target 下線
    match:
      alertname: [TargetDown, InstanceDown]
    response:
      action_title: '確認 {job} ({instance}) 服務存活'
      description: '⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。常見原因: 主機離線、exporter crash、防火牆封鎖。'
      suggested_action: RESTART_DEPLOYMENT
      # Diagnostic only: checks the node_exporter systemd unit and falls back to
      # listing exporter containers. Nothing is restarted by this command.
      kubectl_command: >-
        ssh {host} 'systemctl status node_exporter 2>/dev/null || docker ps | grep exporter'
      estimated_downtime: '監控盲區持續中'
      risk: medium
      responsibility: INFRA
      responsibility_reasoning: 'Prometheus scrape 目標下線屬基礎設施監控範疇'
      secondary_teams: []
      optimization:
        - type: MONITORING
          description: '確認 exporter 進程是否存活'
          command: >-
            ssh {host} 'ps aux | grep exporter | grep -v grep'
      reasoning: '[規則匹配] Prometheus target 下線，先 SSH 確認主機存活再重啟 exporter。'

  # ── K8s Pod 層 ──────────────────────────────────────────────

  # Rule: a Pod was killed for running out of memory (OOMKilled).
  # Matches on alert_type "memory" and on the message keywords below. How the
  # two match fields are combined is decided by the rule engine (not visible here).
  # NOTE(review): message keywords are partial matches per the file header, so
  # "oom" already covers "oomkilled" and would also hit words such as "room";
  # confirm this is acceptable.
  # NOTE(review): {target} is used both as a Pod name (delete pod) and as a
  # Deployment/container name (set resources, autoscale); confirm what the
  # engine substitutes.
  - id: oom_killed
    priority: 30
    description: Pod OOMKilled 記憶體不足
    match:
      alert_type:
        - memory
      message:
        - oomkilled
        - oom
        - out of memory
    response:
      action_title: "刪除異常 Pod {target} (OOMKilled)"
      description: "⚙️ 規則匹配: {target} 發生 OOMKilled，根因為 JVM Heap 配置與 K8s memory limit 不匹配或存在記憶體洩漏。"
      suggested_action: DELETE_POD
      kubectl_command: "kubectl delete pod {target} -n {namespace}"
      estimated_downtime: "~30s"
      risk: critical
      responsibility: BE
      responsibility_reasoning: "OOMKilled 通常源於應用程式記憶體配置不當，屬後端團隊責任範圍"
      secondary_teams: [INFRA]
      optimization:
        - type: RESOURCE_LIMIT
          description: "調整 memory limit 至 1Gi 並確保 JVM -Xmx 不超過 70%"
          command: "kubectl set resources deployment/{target} -c {target} --limits=memory=1Gi -n {namespace}"
        - type: HPA
          description: "啟用基於記憶體的 HPA 自動擴展"
          # Fixed: `kubectl autoscale` has no --memory-percent flag. A memory
          # target is given with --memory; the percentage form means average
          # utilization.
          # NOTE(review): --memory is only available in recent kubectl releases;
          # confirm the operators' client version. Older clients need an
          # autoscaling/v2 HPA manifest instead.
          command: "kubectl autoscale deployment {target} --memory=80% --min=2 --max=5 -n {namespace}"
      reasoning: "[規則匹配] Pod OOMKilled 後 ReplicaSet 將自動重建，但需同步修正資源配置防止復發。"

  # Rule: Pod CPU usage is too high.
  # Matches only on alert_type keywords (cpu / high_cpu); no alertname or
  # message keywords are configured for this rule.
  # NOTE(review): action_title and reasoning mention enabling an HPA, but no
  # HPA command is listed under optimization; confirm whether one should be added.
  - id: high_cpu
    priority: 40
    description: Pod CPU 使用率過高
    match:
      alert_type:
        - cpu
        - high_cpu
    response:
      action_title: "擴展 {target} 副本數 + 啟用 HPA"
      description: "⚙️ 規則匹配: {target} CPU 使用率過高，根因為流量突增或計算密集任務未配置自動擴展。"
      suggested_action: SCALE_DEPLOYMENT
      kubectl_command: "kubectl scale deployment {target} --replicas=3 -n {namespace}"
      estimated_downtime: "0"
      risk: medium
      responsibility: INFRA
      responsibility_reasoning: "自動擴展策略未配置或閾值過高，屬基礎設施團隊責任"
      secondary_teams: [BE]
      optimization:
        - type: RESOURCE_LIMIT
          # Fixed: the text used to promise QoS "Guaranteed", but the command
          # sets requests (500m) below limits (2000m), which yields Burstable.
          # Guaranteed requires requests == limits for CPU and memory.
          description: "增加 CPU request 並設定 limit (requests≠limits，QoS 為 Burstable；若需 Guaranteed 須令 requests=limits)"
          command: "kubectl set resources deployment/{target} --requests=cpu=500m --limits=cpu=2000m -n {namespace}"
      reasoning: "[規則匹配] 水平擴展可即時分散負載，同時建議配置 HPA 防止復發。"

  # Rule: elevated HTTP 5xx error rate.
  # Matches on alert_type "http" and on the status-code keywords below. How the
  # two match fields are combined is decided by the rule engine (not visible here).
  # The keywords are quoted so they load as strings rather than integers.
  # NOTE(review): message keywords are partial matches per the file header, so
  # "500" would also hit text such as "1500ms"; confirm this is acceptable.
  - id: http_5xx
    priority: 50
    description: HTTP 5xx 錯誤率過高
    match:
      alert_type: [http]
      message: ['5xx', '502', '503', '500']
    response:
      action_title: '重啟 {target} + 檢查上游服務'
      description: '⚙️ 規則匹配: {target} 產生 HTTP 5xx 錯誤，可能為應用程式例外或上游服務不可達。'
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: 'kubectl rollout restart deployment/{target} -n {namespace}'
      estimated_downtime: '~1 min'
      risk: critical
      responsibility: COLLAB
      responsibility_reasoning: 'HTTP 5xx 可能源於前端路由、後端邏輯或基礎設施，需多團隊協同排查'
      secondary_teams:
        - FE
        - BE
        - INFRA
      optimization:
        - type: CIRCUIT_BREAKER
          description: '配置熔斷器防止故障擴散'
          # Placeholder text, not an executable command (the value starts with "#").
          command: '# Istio VirtualService outlierDetection 配置'
      reasoning: '[規則匹配] HTTP 錯誤需協同排查，先重啟恢復服務同時通知相關團隊。'

  # Rule: Pod is in CrashLoopBackOff.
  # Matches on alert_type (pod_crash / crash) and on the message keywords below.
  # How the two match fields are combined is decided by the rule engine (not
  # visible here).
  - id: pod_crash
    priority: 60
    description: Pod CrashLoopBackOff
    match:
      alert_type: [pod_crash, crash]
      message: [crashloop, crash, backoff]
    response:
      action_title: '診斷 {target} CrashLoop 根因'
      description: '⚙️ 規則匹配: {target} 進入 CrashLoopBackOff，需檢查啟動錯誤日誌。'
      suggested_action: RESTART_DEPLOYMENT
      # Diagnostic only: fetches the last 50 log lines of the previous
      # (crashed) container instance; it does not restart anything.
      kubectl_command: 'kubectl logs {target} -n {namespace} --previous --tail=50'
      estimated_downtime: '依根因而定'
      risk: critical
      responsibility: BE
      responsibility_reasoning: 'Pod crash 通常源於應用程式啟動錯誤，屬後端團隊責任'
      secondary_teams:
        - INFRA
      optimization:
        - type: LIVENESS_PROBE
          description: '調整 liveness probe 初始延遲防止誤殺'
          # Placeholder text, not an executable command (the value starts with "#").
          command: '# 調整 initialDelaySeconds >= 應用啟動時間'
      reasoning: '[規則匹配] 先查 previous log 確認 crash 原因，再決定修復策略。'

  # ── 通用兜底 ────────────────────────────────────────────────

  # Rule: catch-all fallback with the largest priority number in this file (999).
  # The file header states that a smaller priority value takes precedence.
  # NOTE(review): the header describes alertname as an exact match, so "*" only
  # acts as a wildcard if the engine special-cases it; confirm against the
  # matcher. The value must stay quoted: a bare * starts a YAML alias.
  - id: generic_fallback
    priority: 999
    description: 通用兜底規則 (無法匹配的告警)
    match:
      alertname: ['*']
    response:
      action_title: '重新啟動 {target} 服務'
      description: '⚙️ 規則匹配: {target} 發生異常，需進一步診斷確認根因。'
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: 'kubectl rollout restart deployment/{target} -n {namespace}'
      estimated_downtime: '5-15 min'
      risk: medium
      responsibility: COLLAB
      responsibility_reasoning: '告警資訊不足以判定單一責任團隊，建議多團隊協同排查'
      secondary_teams:
        - BE
        - INFRA
      # No follow-up optimizations are suggested for unclassified alerts.
      optimization: []
      reasoning: '[規則匹配] 根據告警先重啟恢復服務，同時安排深入診斷。'