# awoooi/apps/api/alert_rules.yaml

# AWOOOI OpenClaw 告警規則匹配引擎
# ============================================================
# 格式說明:
#   match.alertname   : Prometheus alertname 完全匹配 (list = OR)
#   match.alert_type  : alert_type 關鍵字 (list = OR, 部分匹配)
#   match.message     : message 關鍵字 (list = OR, 部分匹配, 不分大小寫)
#   match.instance_prefix : instance 前綴 (list = OR) — 目前僅 e2e_smoke_test 使用；確切語意請對照匹配引擎實作
#   response.*        : 回應模板，支援變數 {target} {host} {container} {instance} {job} {namespace}
#   responsibility    : FE / BE / INFRA / DB / COLLAB
#   risk              : low / medium / critical
#   confidence        : 0.0 (規則匹配固定值，禁止偽造)
#
# 修改規則: 不需要重新部署，重啟 API Pod 即可熱載入
# 新增規則: 在 rules 清單末尾加入，priority 越小越優先
# 2026-04-09 ogt: 初版，從 openclaw.py _generate_mock_response 抽出
# ============================================================

# Version string of this rule file.
version: "1.0.0"
# Date of the most recent edit. Bumped to 2026-04-10 because the oom_killed,
# high_cpu and pod_crash rules carry change notes dated 2026-04-10.
updated_at: "2026-04-10"

rules:
  # ── Docker / Host 層 ────────────────────────────────────────

  # Rule: a Docker container reports a failing healthcheck.
  # Match keys: alertname DockerContainerUnhealthy; message keywords
  # unhealthy / health check / healthcheck.
  # Response: inspect the health status and restart the container over SSH.
  - id: docker_container_unhealthy
    priority: 10
    description: Docker 容器 healthcheck 失敗
    match:
      alertname:
        - DockerContainerUnhealthy
      message:
        - unhealthy
        - health check
        - healthcheck
    response:
      action_title: "檢查 Docker 容器 {container} 健康狀態"
      description: "⚙️ 規則匹配: Docker 容器 {container} ({host}) healthcheck 失敗。常見原因: 應用程式啟動慢、healthcheck 指令錯誤、依賴服務未就緒。"
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: "ssh {host} 'docker inspect {container} --format=\"{{.State.Health.Status}}\" && docker restart {container}'"
      estimated_downtime: "~30s"
      risk: medium
      responsibility: INFRA
      responsibility_reasoning: "Docker 容器健康檢查失敗屬基礎設施團隊責任，需確認 healthcheck 設定與容器狀態"
      secondary_teams: [BE]
      optimization:
        - type: HEALTHCHECK
          description: "檢視最近幾次 healthcheck 的輸出，確認 healthcheck 指令在容器內可執行 (mc/curl 是否存在)"
          # The previous command ran a MinIO-specific probe (mc ready / port 9000
          # health endpoint), which says nothing about other containers matched
          # by this generic rule. Dumping .State.Health shows the status, failing
          # streak and output of the recent probes for any container.
          command: "ssh {host} 'docker inspect {container} --format=\"{{json .State.Health}}\"'"
      reasoning: "[規則匹配] Docker healthcheck 失敗先 restart 恢復服務，同時確認 healthcheck 指令正確。"

  # Rule: a Prometheus scrape target is down.
  # Match keys: alertname TargetDown / InstanceDown.
  # Response: over SSH, restart the first container whose name contains
  # "exporter", falling back to the node_exporter systemd units.
  - id: target_down
    priority: 20
    description: Prometheus scrape target 下線 — 自動重啟 exporter
    match:
      alertname:
        - TargetDown
        - InstanceDown
    response:
      action_title: "重啟 {job} exporter on {host}"
      description: "⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。自動重啟主機上的 exporter container。"
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: "ssh {host} 'docker restart $(docker ps -a --filter name=exporter --format \"{{.Names}}\" | head -1) 2>/dev/null || systemctl restart node_exporter 2>/dev/null || systemctl restart prometheus-node-exporter'"
      estimated_downtime: "~30s"
      risk: medium
      responsibility: INFRA
      responsibility_reasoning: "Prometheus scrape 目標下線屬基礎設施監控範疇，自動重啟 exporter"
      secondary_teams: []
      optimization:
        - type: MONITORING
          description: "確認 exporter 重啟後可被 Prometheus scrape"
          # {port} is not one of the template variables this file documents
          # ({target} {host} {container} {instance} {job} {namespace}), so it
          # would never be substituted. Use {instance} instead.
          # NOTE(review): assumes {instance} is the scrape address in
          # host:port form — confirm against the alert payload.
          command: "ssh {host} 'curl -s http://{instance}/metrics | head -3'"
      reasoning: "[規則匹配] Prometheus target 下線，SSH 到主機重啟 exporter container 或 systemd service。"

  # ── K8s Pod 層 ──────────────────────────────────────────────

  # Rule: Pod killed for running out of memory (OOMKilled).
  # Match keys: several memory-related alertnames; alert_type memory;
  # message keywords oomkilled / oom / out of memory.
  # Response: delete the pod, then suggest resource-limit and HPA changes.
  # NOTE(review): {target} is used as a pod name in kubectl_command and as a
  # deployment/container name in the optimization commands — confirm what the
  # engine substitutes here.
  - id: oom_killed
    priority: 30
    description: Pod OOMKilled 記憶體不足
    match:
      # 2026-04-10 Claude Sonnet 4.6: Phase 2 flywheel fix — added Prometheus
      # alertname variants.
      alertname:
        - PodOOMKilled
        - KubePodOOMKilled
        - KubernetesMemoryPressure
        - NodeMemoryUsageHigh
        - HighMemoryUsage
      alert_type:
        - memory
      message:
        - oomkilled
        - oom
        - out of memory
    response:
      action_title: "刪除異常 Pod {target} (OOMKilled)"
      description: "⚙️ 規則匹配: {target} 發生 OOMKilled，根因為 JVM Heap 配置與 K8s memory limit 不匹配或存在記憶體洩漏。"
      suggested_action: DELETE_POD
      kubectl_command: "kubectl delete pod {target} -n {namespace}"
      estimated_downtime: "~30s"
      risk: critical
      responsibility: BE
      responsibility_reasoning: "OOMKilled 通常源於應用程式記憶體配置不當，屬後端團隊責任範圍"
      secondary_teams: [INFRA]
      optimization:
        - type: RESOURCE_LIMIT
          description: "調整 memory limit 至 1Gi 並確保 JVM -Xmx 不超過 70%"
          command: "kubectl set resources deployment/{target} -c {target} --limits=memory=1Gi -n {namespace}"
        - type: HPA
          description: "啟用基於記憶體的 HPA 自動擴展"
          # `kubectl autoscale` has no --memory-percent flag. Recent kubectl
          # releases accept --memory=<percentage|quantity>; on older clients
          # apply an autoscaling/v2 HorizontalPodAutoscaler manifest instead.
          command: "kubectl autoscale deployment {target} --memory=80% --min=2 --max=5 -n {namespace}"
      reasoning: "[規則匹配] Pod OOMKilled 後 ReplicaSet 將自動重建，但需同步修正資源配置防止復發。"

  # Rule: high CPU usage on a Pod or node.
  # Match keys: several CPU-related alertnames; alert_type cpu / high_cpu.
  # Response: scale the target deployment to 3 replicas and suggest new CPU
  # requests/limits.
  - id: high_cpu
    priority: 40
    description: Pod/Node CPU 使用率過高
    match:
      # 2026-04-10 Claude Sonnet 4.6: Phase 2 flywheel fix — added Prometheus
      # alertname variants.
      alertname:
        - HighCPUUsage
        - ContainerCpuUsageSecondsTotal
        - HostHighCpuLoad
        - NodeCPUUsageHigh
        - CPUThrottlingHigh
        - KubeCPUOvercommit
      alert_type:
        - cpu
        - high_cpu
    response:
      action_title: "擴展 {target} 副本數 + 啟用 HPA"
      description: "⚙️ 規則匹配: {target} CPU 使用率過高，根因為流量突增或計算密集任務未配置自動擴展。"
      suggested_action: SCALE_DEPLOYMENT
      kubectl_command: "kubectl scale deployment {target} --replicas=3 -n {namespace}"
      estimated_downtime: "0"
      risk: medium
      responsibility: INFRA
      responsibility_reasoning: "自動擴展策略未配置或閾值過高，屬基礎設施團隊責任"
      secondary_teams: [BE]
      optimization:
        - type: RESOURCE_LIMIT
          # The command below sets requests (500m) below limits (2000m), which
          # gives the Burstable QoS class; Guaranteed needs requests == limits.
          # The description previously claimed Guaranteed.
          description: "提高 CPU request/limit 以減少節流 (requests ≠ limits，QoS 為 Burstable；Guaranteed 需 requests 與 limits 相等)"
          command: "kubectl set resources deployment/{target} --requests=cpu=500m --limits=cpu=2000m -n {namespace}"
      reasoning: "[規則匹配] 水平擴展可即時分散負載，同時建議配置 HPA 防止復發。"

  # Rule: elevated HTTP 5xx error rate.
  # Match keys: alert_type http; message keywords 5xx / 502 / 503 / 500.
  # Response: rollout-restart the target deployment; flagged as a cross-team
  # (COLLAB) incident.
  - id: http_5xx
    priority: 50
    description: 'HTTP 5xx 錯誤率過高'
    match:
      alert_type:
        - http
      message:
        # Kept quoted so the numeric-looking keywords stay strings.
        - '5xx'
        - '502'
        - '503'
        - '500'
    response:
      action_title: '重啟 {target} + 檢查上游服務'
      description: '⚙️ 規則匹配: {target} 產生 HTTP 5xx 錯誤，可能為應用程式例外或上游服務不可達。'
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: 'kubectl rollout restart deployment/{target} -n {namespace}'
      estimated_downtime: '~1 min'
      risk: critical
      responsibility: COLLAB
      responsibility_reasoning: 'HTTP 5xx 可能源於前端路由、後端邏輯或基礎設施，需多團隊協同排查'
      secondary_teams:
        - FE
        - BE
        - INFRA
      optimization:
        - type: CIRCUIT_BREAKER
          description: '配置熔斷器防止故障擴散'
          # Placeholder text rather than a runnable command.
          command: '# Istio VirtualService outlierDetection 配置'
      reasoning: '[規則匹配] HTTP 錯誤需協同排查，先重啟恢復服務同時通知相關團隊。'

  # Rule: Pod stuck in CrashLoopBackOff.
  # Match keys: CrashLoop alertnames; alert_type pod_crash / crash;
  # message keywords crashloop / crash / backoff.
  # Response: fetch the previous container's last 50 log lines.
  - id: pod_crash
    priority: 60
    description: 'Pod CrashLoopBackOff'
    match:
      # 2026-04-10 Claude Sonnet 4.6: Phase 2 flywheel fix — added Prometheus
      # alertname variants.
      alertname:
        - KubePodCrashLooping
        - PodCrashLoopBackOff
        - KubernetesPodCrashLooping
      alert_type:
        - pod_crash
        - crash
      message:
        - crashloop
        - crash
        - backoff
    response:
      action_title: '診斷 {target} CrashLoop 根因'
      description: '⚙️ 規則匹配: {target} 進入 CrashLoopBackOff，需檢查啟動錯誤日誌。'
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: 'kubectl logs {target} -n {namespace} --previous --tail=50'
      estimated_downtime: '依根因而定'
      risk: critical
      responsibility: BE
      responsibility_reasoning: 'Pod crash 通常源於應用程式啟動錯誤，屬後端團隊責任'
      secondary_teams:
        - INFRA
      optimization:
        - type: LIVENESS_PROBE
          description: '調整 liveness probe 初始延遲防止誤殺'
          # Placeholder text rather than a runnable command.
          command: '# 調整 initialDelaySeconds >= 應用啟動時間'
      reasoning: '[規則匹配] 先查 previous log 確認 crash 原因，再決定修復策略。'

  # ── 資料庫層 ─────────────────────────────────────────────────

  # Rule: PostgreSQL is unreachable.
  # Match keys: alertname PostgreSQLDown; message keywords
  # postgresql / postgres / pg down.
  # Response: rollout-restart deployment/postgresql, then verify with SELECT 1.
  - id: postgresql_down
    priority: 70
    description: 'PostgreSQL 服務下線'
    match:
      alertname:
        - PostgreSQLDown
      message:
        - postgresql
        - postgres
        - pg down
    response:
      action_title: '重啟 PostgreSQL {target}'
      description: '⚙️ 規則匹配: PostgreSQL ({instance}) 無法連線。常見原因: 程序崩潰、磁碟空間不足、連線數超限。'
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: 'kubectl rollout restart deployment/postgresql -n {namespace}'
      estimated_downtime: '~2 min'
      risk: critical
      responsibility: DB
      responsibility_reasoning: 'PostgreSQL 下線屬資料庫團隊責任，需立即確認資料完整性'
      secondary_teams:
        - INFRA
        - BE
      optimization:
        - type: HEALTH_CHECK
          description: '確認 PostgreSQL 連線與資料完整性'
          # Folded scalar so the shell's single quotes need no YAML escaping.
          command: >-
            kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SELECT 1'
      reasoning: '[規則匹配] PostgreSQL 下線影響所有依賴服務，優先重啟恢復，同時確認資料無損。'

  # Rule: PostgreSQL connection pool exhausted or close to its limit.
  # Match keys: ConnectionPool alertnames; message keywords
  # connection pool / connections / pgbouncer.
  # Response: terminate backends that have been idle for over 5 minutes.
  - id: postgresql_connection_pool
    priority: 75
    description: PostgreSQL 連線池耗盡或接近上限
    match:
      alertname:
        - PostgreSQLConnectionPoolNearLimit
        - PostgreSQLConnectionPoolExhausted
      message:
        - connection pool
        - connections
        - pgbouncer
    response:
      action_title: "清理 PostgreSQL 閒置連線"
      description: "⚙️ 規則匹配: PostgreSQL 連線池使用率過高，可能導致新請求被拒絕。"
      # NOTE(review): the command only terminates idle backends, yet
      # suggested_action is RESTART_DEPLOYMENT — confirm the executor does not
      # restart PostgreSQL for this rule.
      suggested_action: RESTART_DEPLOYMENT
      # The SQL is passed in shell double quotes so its string literals keep
      # their single quotes. The previous form used the YAML escape '' inside a
      # double-quoted scalar; the shell concatenated the pieces and dropped the
      # quotes (state = idle, INTERVAL 5 minutes), producing invalid SQL.
      kubectl_command: >-
        kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE state = 'idle' AND state_change < NOW() - INTERVAL '5 minutes';"
      estimated_downtime: "0"
      risk: critical
      responsibility: DB
      responsibility_reasoning: "連線池管理屬資料庫設定範疇"
      secondary_teams: [BE]
      optimization:
        - type: CONNECTION_POOL
          description: "調整 max_connections 或啟用 PgBouncer 連線池"
          command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SHOW max_connections;'"
      reasoning: "[規則匹配] 清理閒置連線是最快恢復手段，同時需排查連線洩漏。"

  # Rule: PostgreSQL slow queries or lock waits.
  # Match keys: alertname PostgreSQLSlowQueries / PostgreSQLLockWaiting;
  # message keywords slow query / lock wait / deadlock.
  # Response: list the non-idle backends ordered by query start time.
  - id: postgresql_slow_queries
    priority: 80
    description: PostgreSQL 慢查詢告警
    match:
      alertname:
        - PostgreSQLSlowQueries
        - PostgreSQLLockWaiting
      message:
        - slow query
        - lock wait
        - deadlock
    response:
      action_title: "診斷 PostgreSQL 慢查詢 + 索引優化"
      description: "⚙️ 規則匹配: PostgreSQL 存在慢查詢或鎖等待，影響系統整體性能。"
      # NOTE(review): the command is read-only diagnostics, yet
      # suggested_action is RESTART_DEPLOYMENT — confirm the executor does not
      # restart PostgreSQL for this rule.
      suggested_action: RESTART_DEPLOYMENT
      # The SQL is passed in shell double quotes so 'idle' keeps its quotes.
      # The previous ''idle'' form collapsed to a bare identifier in the shell.
      # <> is used instead of != to keep "!" out of the shell string.
      kubectl_command: >-
        kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c "SELECT pid, query, state, wait_event_type, wait_event FROM pg_stat_activity WHERE state <> 'idle' ORDER BY query_start;"
      estimated_downtime: "0"
      risk: medium
      responsibility: DB
      responsibility_reasoning: "慢查詢優化屬資料庫效能調優範疇"
      secondary_teams: [BE]
      optimization:
        - type: INDEX
          description: "使用 EXPLAIN ANALYZE 找出缺少索引的查詢"
          command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SELECT * FROM pg_stat_user_tables ORDER BY seq_scan DESC LIMIT 10;'"
      reasoning: "[規則匹配] 先找出阻塞查詢，必要時 pg_terminate_backend 解除鎖定。"

  # ── 基礎設施服務層 ──────────────────────────────────────────

  # Rule: Redis is unreachable.
  # Match keys: alertname RedisDown; message keywords redis / cache down.
  # Response: rollout-restart deployment/redis, then verify with redis-cli ping.
  - id: redis_down
    priority: 85
    description: 'Redis 服務下線'
    match:
      alertname:
        - RedisDown
      message:
        - redis
        - cache down
    response:
      action_title: '重啟 Redis {target}'
      description: '⚙️ 規則匹配: Redis ({instance}) 無法連線。影響 Session 管理、去重快取、AI Router 狀態。'
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: 'kubectl rollout restart deployment/redis -n {namespace}'
      estimated_downtime: '~30s'
      risk: critical
      responsibility: INFRA
      responsibility_reasoning: 'Redis 屬基礎設施快取層，下線影響多個上層服務'
      secondary_teams:
        - BE
      optimization:
        - type: HEALTH_CHECK
          description: '確認 Redis 連線'
          command: 'kubectl exec -n {namespace} deployment/redis -- redis-cli ping'
      reasoning: '[規則匹配] Redis 下線會導致去重失效和 AI Router 狀態丟失，需立即重啟。'

  # Rule: the Ollama AI service is unreachable.
  # Match keys: alertname OllamaDown; message keywords
  # ollama / llm down / ai service.
  # Response: over SSH, restart the ollama systemd unit, falling back to the
  # ollama container.
  - id: ollama_down
    priority: 90
    description: 'Ollama AI 服務下線'
    match:
      alertname:
        - OllamaDown
      message:
        - ollama
        - llm down
        - ai service
    response:
      action_title: '重啟 Ollama 服務 on {host}'
      description: '⚙️ 規則匹配: Ollama ({instance}) 無法連線。影響 AI 規則自動生成和本地推理。'
      suggested_action: RESTART_DEPLOYMENT
      # Folded scalars keep the shell's single quotes free of YAML escaping.
      kubectl_command: >-
        ssh {host} 'systemctl restart ollama || docker restart ollama'
      estimated_downtime: '~2 min (model reload)'
      risk: medium
      responsibility: INFRA
      responsibility_reasoning: 'Ollama 屬 AI 推理基礎設施，由基礎設施團隊管理'
      secondary_teams: []
      optimization:
        - type: HEALTH_CHECK
          description: '確認 Ollama 狀態和已載入模型'
          command: >-
            curl -s http://{host}:11434/api/tags | jq '.models[].name'
      reasoning: '[規則匹配] Ollama 下線觸發 AI Router fallback 至 Gemini，重啟恢復本地推理能力。'

  # Rule: MinIO object storage is unreachable.
  # Match keys: alertname MinioDown; message keywords
  # minio / s3 / object storage.
  # Response: restart the minio container over SSH; the optimization step
  # checks free space under /data/minio.
  - id: minio_down
    priority: 95
    description: 'MinIO 物件儲存下線'
    match:
      alertname:
        - MinioDown
      message:
        - minio
        - s3
        - object storage
    response:
      action_title: '重啟 MinIO {target}'
      description: '⚙️ 規則匹配: MinIO ({instance}) 無法連線。影響靜態資源和備份儲存。'
      suggested_action: RESTART_DEPLOYMENT
      # Folded scalars keep the shell's single quotes free of YAML escaping.
      kubectl_command: >-
        ssh {host} 'docker restart minio'
      estimated_downtime: '~1 min'
      risk: critical
      responsibility: INFRA
      responsibility_reasoning: 'MinIO 屬物件儲存基礎設施'
      secondary_teams: []
      optimization:
        - type: DISK_CHECK
          description: '確認磁碟空間充足'
          command: >-
            ssh {host} 'df -h /data/minio'
      reasoning: '[規則匹配] MinIO 下線需先確認磁碟空間，再重啟服務。'

  # Rule: MinIO disk usage is high.
  # Match keys: alertname MinioDiskUsageHigh / MinioDiskUsageCritical;
  # message keywords disk usage / disk full / storage.
  # Response: over SSH, report disk usage and the ten largest entries under
  # /data/minio.
  - id: minio_disk_high
    priority: 96
    description: MinIO 磁碟使用率過高
    match:
      alertname:
        - MinioDiskUsageHigh
        - MinioDiskUsageCritical
      message:
        - disk usage
        - disk full
        - storage
    response:
      action_title: "清理 MinIO 過期資料 on {host}"
      description: "⚙️ 規則匹配: MinIO 磁碟使用率過高，需清理舊資料或擴展儲存空間。"
      # NOTE(review): the command is read-only diagnostics, yet
      # suggested_action is RESTART_DEPLOYMENT — confirm the executor does not
      # restart MinIO for this rule.
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: "ssh {host} 'df -h /data/minio && du -sh /data/minio/* | sort -rh | head -10'"
      estimated_downtime: "0"
      risk: critical
      responsibility: INFRA
      responsibility_reasoning: "磁碟空間管理屬基礎設施團隊責任"
      secondary_teams: []
      optimization:
        - type: CLEANUP
          description: "為指定 bucket 設定 lifecycle 過期規則清理舊備份 (請將 BUCKET 換成實際 bucket 名稱)"
          # `mc admin lifecycle` is not an mc subcommand. Lifecycle rules are
          # managed per bucket with `mc ilm rule add`; older mc releases spell
          # it `mc ilm add --expiry-days`.
          command: "mc ilm rule add --expire-days 30 local/BUCKET"
      reasoning: "[規則匹配] 磁碟滿會導致寫入失敗，需立即清理最大的目錄。"

  # Rule: the Harbor registry is unreachable.
  # Match keys: alertname HarborDown; message keywords
  # harbor / registry / docker registry.
  # Response: over SSH, bring the compose stack in /data/harbor back up.
  - id: harbor_down
    priority: 97
    description: 'Harbor Registry 下線'
    match:
      alertname:
        - HarborDown
      message:
        - harbor
        - registry
        - docker registry
    response:
      action_title: '重啟 Harbor Registry on {host}'
      description: '⚙️ 規則匹配: Harbor ({instance}) 無法連線。影響 CD 部署流程。'
      suggested_action: RESTART_DEPLOYMENT
      # Folded scalars keep the shell's single quotes free of YAML escaping.
      kubectl_command: >-
        ssh {host} 'cd /data/harbor && docker-compose up -d'
      estimated_downtime: '~2 min'
      risk: critical
      responsibility: INFRA
      responsibility_reasoning: 'Harbor 是 CD 部署的核心依賴，屬基礎設施團隊責任'
      secondary_teams: []
      optimization:
        - type: HEALTH_CHECK
          description: '確認 Harbor 各組件狀態'
          command: >-
            ssh {host} 'cd /data/harbor && docker-compose ps'
      reasoning: '[規則匹配] Harbor 下線會阻塞所有 CD 部署，需立即重啟。'

  # ── K8s 叢集層 ──────────────────────────────────────────────

  # Rule: a K3s node is down.
  # Match keys: alertname K3sNodeDown / K3sVIPDown; message keywords
  # node down / node not ready / k3s.
  # Response: list the nodes and describe the affected one; the optimization
  # step drains the node.
  - id: k3s_node_down
    priority: 100
    description: 'K3s 節點下線'
    match:
      alertname:
        - K3sNodeDown
        - K3sVIPDown
      message:
        - node down
        - node not ready
        - k3s
    response:
      action_title: '確認 K3s 節點 {target} 狀態'
      description: '⚙️ 規則匹配: K3s 節點下線，影響叢集可用性和 Pod 調度。'
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: 'kubectl get nodes -o wide && kubectl describe node {target}'
      estimated_downtime: '依節點恢復時間'
      risk: critical
      responsibility: INFRA
      responsibility_reasoning: 'K3s 叢集節點管理屬基礎設施團隊責任'
      secondary_teams: []
      optimization:
        - type: NODE_DRAIN
          description: '先 drain 節點確保 Pod 安全遷移'
          command: 'kubectl drain {target} --ignore-daemonsets --delete-emptydir-data'
      reasoning: '[規則匹配] 節點下線需先確認主機可達性，必要時手動遷移 workload。'

  # Rule: the AWOOOI API is unreachable.
  # Match keys: alertname AWOOOIApiDown / OpenClawDown; message keywords
  # awoooi api / openclaw / api down.
  # Response: rollout-restart deployment/awoooi-api in the awoooi namespace
  # (the namespace is hard-coded, not templated).
  - id: awoooi_api_down
    priority: 105
    description: 'AWOOOI API 服務下線'
    match:
      alertname:
        - AWOOOIApiDown
        - OpenClawDown
      message:
        - awoooi api
        - openclaw
        - api down
    response:
      action_title: '重啟 AWOOOI API deployment'
      description: '⚙️ 規則匹配: AWOOOI API 無法連線。影響所有告警處理和 AI 決策流程。'
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: 'kubectl rollout restart deployment/awoooi-api -n awoooi'
      estimated_downtime: '~1 min'
      risk: critical
      responsibility: BE
      responsibility_reasoning: 'AWOOOI API 是核心服務，屬後端團隊直接責任'
      secondary_teams:
        - INFRA
      optimization:
        - type: HEALTH_CHECK
          description: '確認 API Pod 狀態和最近 log'
          command: 'kubectl get pods -n awoooi && kubectl logs -n awoooi deployment/awoooi-api --tail=50'
      reasoning: '[規則匹配] AWOOOI API 下線需立即重啟，同時查 Pod log 確認根因。'

  # ── 告警鏈路監控 ────────────────────────────────────────────

  # Rule: the alert delivery chain is broken or unhealthy.
  # Match keys: AlertChain* / NoAlertsReceived2Hours alertnames; message
  # keywords alert chain / alertmanager / no alerts.
  # Response: list the monitoring pods and query Alertmanager's status
  # endpoint; the optimization step posts a test alert.
  - id: alert_chain_broken
    priority: 110
    description: 告警鏈路中斷
    match:
      alertname:
        - AlertChainBroken_Alertmanager
        - AlertChainBroken_Sentry
        - AlertChainBroken_SignOz
        - AlertChainUnhealthy
        - NoAlertsReceived2Hours
      message:
        - alert chain
        - alertmanager
        - no alerts
    response:
      action_title: "診斷告警鏈路中斷"
      description: "⚙️ 規則匹配: 告警鏈路異常，可能導致真實告警無法送達 Telegram。"
      # NOTE(review): the command is read-only diagnostics, yet
      # suggested_action is RESTART_DEPLOYMENT — confirm what the executor
      # restarts for this rule.
      suggested_action: RESTART_DEPLOYMENT
      # Uses Alertmanager API v2: the v1 API has been removed from newer
      # Alertmanager releases, and v2 reports uptime as a top-level field
      # rather than under .data.
      kubectl_command: "kubectl get pods -n monitoring && curl -s http://192.168.0.120:9093/api/v2/status | jq '.uptime'"
      estimated_downtime: "監控盲區持續中"
      risk: critical
      responsibility: INFRA
      responsibility_reasoning: "告警鏈路屬基礎設施監控體系，需立即修復確保可觀測性"
      secondary_teams: [BE]
      optimization:
        - type: E2E_TEST
          description: "發送測試告警驗證整條鏈路"
          command: "curl -X POST http://192.168.0.125:32334/api/v1/test-alert -H 'Content-Type: application/json' -d '{\"test\": true}'"
      reasoning: "[規則匹配] 告警鏈路中斷等同監控失明，最高優先修復。"

  # ── GPU / AI 基礎設施 ────────────────────────────────────────

  # Rule: the NVIDIA/Nemotron circuit breaker is open or erroring heavily.
  # Match keys: Nvidia* alertnames; message keywords
  # circuit breaker / nvidia / nemotron / tool calling.
  # Response: query the AI router status endpoint; the optimization step
  # posts to the router's reset endpoint.
  - id: nvidia_circuit_breaker
    priority: 115
    description: 'NVIDIA/Nemotron 熔斷器開啟'
    match:
      alertname:
        - NvidiaCircuitBreakerOpen
        - NvidiaToolCallingHighErrorRate
        - NvidiaToolCallingHighLatency
      message:
        - circuit breaker
        - nvidia
        - nemotron
        - tool calling
    response:
      action_title: '確認 NVIDIA API 熔斷狀態'
      description: '⚙️ 規則匹配: NVIDIA/Nemotron 熔斷器開啟或錯誤率過高，AI Router 已自動降級。'
      suggested_action: RESTART_DEPLOYMENT
      # Folded scalar keeps the jq filter's single quotes free of YAML escaping.
      kubectl_command: >-
        curl -s http://192.168.0.125:32334/api/v1/ai-router/status | jq '.providers'
      estimated_downtime: '0 (已自動 fallback)'
      risk: medium
      responsibility: BE
      responsibility_reasoning: 'AI Provider 熔斷管理屬後端 AI Router 責任範圍'
      secondary_teams: []
      optimization:
        - type: CIRCUIT_BREAKER_RESET
          description: '等待熔斷器自動恢復 (half-open 狀態)'
          command: 'curl -s http://192.168.0.125:32334/api/v1/ai-router/reset -X POST'
      reasoning: '[規則匹配] AI Router 已自動降級至備援 Provider，監控熔斷器恢復狀態即可。'

  # ── E2E / Smoke Test 告警 ────────────────────────────────────
  # 2026-04-09 Claude Sonnet 4.6: E2E test 假告警識別，僅記錄不修復

  # Rule: synthetic E2E / smoke-test alerts used to verify the alert chain.
  # Match keys: E2E/SmokeTest alertnames; instance_prefix values; message
  # keywords listed below.
  # Response: NO_ACTION — the alert is acknowledged and nothing is remediated.
  - id: e2e_smoke_test
    # Lower number = higher precedence (see the file header). This rule must
    # outrank every remediation rule: at the previous value of 120 a test alert
    # whose message contains "alert chain smoke" also satisfied the
    # alert_chain_broken keyword "alert chain" (priority 110).
    # NOTE(review): assumes the engine evaluates rules in ascending priority
    # order and that a message keyword alone can select a rule — confirm
    # against the matcher.
    priority: 5
    description: E2E Smoke Test / 告警鏈路驗證假告警
    match:
      alertname:
        - E2E_SMOKE_TEST
        - E2E_FINAL_SMOKE_TEST
        - SmokeTest
      instance_prefix:
        - e2e-final-
        - e2e-test-
        - test-host
        - smoke-test-
      message:
        - e2e smoke test
        - smoke test
        - please ignore
        - e2e test
        - e2e-final
        - e2e-test
        - e2e_smoke
        - alert chain smoke
    response:
      action_title: "告警鏈路驗證成功 (E2E)"
      description: "✅ E2E Smoke Test 告警已收到，告警鏈路正常。此告警僅用於驗證，無需修復動作。"
      suggested_action: NO_ACTION
      kubectl_command: ""
      estimated_downtime: "N/A"
      risk: low
      responsibility: INFRA
      responsibility_reasoning: "E2E smoke test 假告警，告警鏈路驗證用途，系統自動識別跳過修復"
      secondary_teams: []
      optimization: []
      reasoning: "[規則匹配] E2E Smoke Test 假告警，僅確認告警鏈路暢通，無實際服務異常。"

  # ── 通用兜底 ────────────────────────────────────────────────

  # Rule: catch-all for alerts no other rule claims.
  # Match keys: alertname "*" (kept quoted — a bare * is YAML alias syntax).
  # Response: rollout-restart the target deployment; flagged as a cross-team
  # (COLLAB) incident.
  - id: generic_fallback
    priority: 999
    description: '通用兜底規則 (無法匹配的告警)'
    match:
      alertname:
        - '*'
    response:
      action_title: '重新啟動 {target} 服務'
      description: '⚙️ 規則匹配: {target} 發生異常，需進一步診斷確認根因。'
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: 'kubectl rollout restart deployment/{target} -n {namespace}'
      estimated_downtime: '5-15 min'
      risk: medium
      responsibility: COLLAB
      responsibility_reasoning: '告警資訊不足以判定單一責任團隊，建議多團隊協同排查'
      secondary_teams:
        - BE
        - INFRA
      optimization: []
      reasoning: '[規則匹配] 根據告警先重啟恢復服務，同時安排深入診斷。'