feat(openclaw): 告警規則引擎 — alert_rules.yaml 取代硬編碼 if/elif

- 新增 alert_rules.yaml: 6 條規則 (docker/target_down/oom/cpu/5xx/crash) + 通用兜底
- 新增 alert_rule_engine.py: YAML 載入、匹配邏輯、變數填充
- openclaw.py _generate_mock_response: 重構為呼叫規則引擎 (v8.0)
- 新增規則只需修改 YAML，重啟 Pod 即可，不需改代碼
- 2026-04-09 ogt: 架構重構

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-09 09:05:23 +08:00
parent 7e327c806e
commit d1ede7f989
3 changed files with 423 additions and 238 deletions
--- a/apps/api/alert_rules.yaml
+++ b/apps/api/alert_rules.yaml
@@ -0,0 +1,199 @@
 # AWOOOI OpenClaw 告警規則匹配引擎
 # ============================================================
 # 格式說明:
 #   match.alertname   : Prometheus alertname 完全匹配 (list = OR)；僅含 "*" 表示通用兜底
 #   match.alert_type  : alert_type 關鍵字 (list = OR, 部分匹配, 不分大小寫)
 #   match.message     : message 關鍵字 (list = OR, 部分匹配, 不分大小寫)
 #   三個 match 欄位之間同樣是 OR: 任一欄位命中即視為匹配
 #   response.*        : 回應模板，支援變數 {target} {host} {container} {instance} {job} {namespace}
 #   responsibility    : FE / BE / INFRA / DB / COLLAB
 #   risk              : low / medium / critical
 #   confidence        : 0.0 (規則匹配固定值，禁止偽造)
 #
 # 修改規則: 不需要改 Python 代碼；規則於進程啟動後快取，需重啟 API Pod 才會重新載入 (非執行期熱載入)
 # 新增規則: 在 rules 清單末尾加入，priority 越小越優先
 # 2026-04-09 ogt: 初版，從 openclaw.py _generate_mock_response 抽出
 # ============================================================
 # File metadata. The loader visible in alert_rule_engine.py only reads the
 # `rules` key, so these two values appear to be informational.
 version: "1.0.0"
 updated_at: "2026-04-09"
 # Rule list. The engine sorts it by `priority` (ascending) when loading,
 # so evaluation order does not depend on the position within this list.
 rules:
  # ── Docker / Host 層 ────────────────────────────────────────
  # Rule: Docker container healthcheck failure.
  # Matches when alertname is exactly DockerContainerUnhealthy, OR when the
  # alert message contains any keyword below (case-insensitive substring).
  - id: docker_container_unhealthy
    priority: 10
    description: Docker 容器 healthcheck 失敗
    match:
      alertname:
        - DockerContainerUnhealthy
      message:
        - unhealthy
        - health check
        - healthcheck
    response:
      action_title: "檢查 Docker 容器 {container} 健康狀態"
      description: "⚙️ 規則匹配: Docker 容器 {container} ({host}) healthcheck 失敗。常見原因: 應用程式啟動慢、healthcheck 指令錯誤、依賴服務未就緒。"
      suggested_action: RESTART_DEPLOYMENT
      # Despite the key name this is an ssh + docker command run on {host}:
      # it prints the container health status and then restarts the container.
      kubectl_command: "ssh {host} 'docker inspect {container} --format=\"{{.State.Health.Status}}\" && docker restart {container}'"
      estimated_downtime: "~30s"
      risk: medium
      responsibility: INFRA
      responsibility_reasoning: "Docker 容器健康檢查失敗屬基礎設施團隊責任，需確認 healthcheck 設定與容器狀態"
      secondary_teams: [BE]
      optimization:
        - type: HEALTHCHECK
          description: "確認 healthcheck 指令在容器內可執行 (mc/curl 是否存在)"
          # NOTE(review): this probe is MinIO-specific (mc, port 9000,
          # /minio/health/live) although the rule matches any container —
          # confirm that is intended.
          command: "ssh {host} 'docker exec {container} sh -c \"mc ready local 2>/dev/null || curl -sf http://localhost:9000/minio/health/live\"'"
      reasoning: "[規則匹配] Docker healthcheck 失敗先 restart 恢復服務，同時確認 healthcheck 指令正確。"
  # Rule: Prometheus scrape target is down.
  # Matches on alertname only (TargetDown or InstanceDown); no alert_type or
  # message keywords are defined for this rule.
  - id: target_down
    priority: 20
    description: Prometheus scrape target 下線
    match:
      alertname:
        - TargetDown
        - InstanceDown
    response:
      action_title: "確認 {job} ({instance}) 服務存活"
      description: "⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。常見原因: 主機離線、exporter crash、防火牆封鎖。"
      # NOTE(review): the action is RESTART_DEPLOYMENT, but the command below
      # only inspects status (systemctl status / docker ps) — confirm intent.
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: "ssh {host} 'systemctl status node_exporter 2>/dev/null || docker ps | grep exporter'"
      estimated_downtime: "監控盲區持續中"
      risk: medium
      responsibility: INFRA
      responsibility_reasoning: "Prometheus scrape 目標下線屬基礎設施監控範疇"
      secondary_teams: []
      optimization:
        - type: MONITORING
          description: "確認 exporter 進程是否存活"
          command: "ssh {host} 'ps aux | grep exporter | grep -v grep'"
      reasoning: "[規則匹配] Prometheus target 下線，先 SSH 確認主機存活再重啟 exporter。"
  # ── K8s Pod 層 ──────────────────────────────────────────────
  # Rule: Pod killed for exceeding its memory limit.
  # Matches when alert_type contains "memory", OR when the message contains
  # any keyword below (case-insensitive substring).
  - id: oom_killed
    priority: 30
    description: Pod OOMKilled 記憶體不足
    match:
      alert_type:
        - memory
      message:
        # NOTE(review): matching is a plain substring test, so "oom" already
        # covers "oomkilled" and would also hit unrelated words that merely
        # contain "oom".
        - oomkilled
        - oom
        - out of memory
    response:
      action_title: "刪除異常 Pod {target} (OOMKilled)"
      description: "⚙️ 規則匹配: {target} 發生 OOMKilled，根因為 JVM Heap 配置與 K8s memory limit 不匹配或存在記憶體洩漏。"
      suggested_action: DELETE_POD
      kubectl_command: "kubectl delete pod {target} -n {namespace}"
      estimated_downtime: "~30s"
      risk: critical
      responsibility: BE
      responsibility_reasoning: "OOMKilled 通常源於應用程式記憶體配置不當，屬後端團隊責任範圍"
      secondary_teams: [INFRA]
      optimization:
        # NOTE(review): _extract_vars sets {target} to the `pod` label when one
        # is present, so `deployment/{target}` below may receive a pod name
        # rather than a deployment name — TODO confirm.
        - type: RESOURCE_LIMIT
          description: "調整 memory limit 至 1Gi 並確保 JVM -Xmx 不超過 70%"
          command: "kubectl set resources deployment/{target} -c {target} --limits=memory=1Gi -n {namespace}"
        # NOTE(review): verify that `kubectl autoscale` on the target cluster
        # accepts `--memory-percent`.
        - type: HPA
          description: "啟用基於記憶體的 HPA 自動擴展"
          command: "kubectl autoscale deployment {target} --memory-percent=80 --min=2 --max=5 -n {namespace}"
      reasoning: "[規則匹配] Pod OOMKilled 後 ReplicaSet 將自動重建，但需同步修正資源配置防止復發。"
  # Rule: Pod CPU usage too high.
  # Matches when alert_type contains any keyword below (case-insensitive
  # substring); "cpu" already covers "high_cpu".
  - id: high_cpu
    priority: 40
    description: Pod CPU 使用率過高
    match:
      alert_type:
        - cpu
        - high_cpu
    response:
      action_title: "擴展 {target} 副本數 + 啟用 HPA"
      description: "⚙️ 規則匹配: {target} CPU 使用率過高，根因為流量突增或計算密集任務未配置自動擴展。"
      suggested_action: SCALE_DEPLOYMENT
      kubectl_command: "kubectl scale deployment {target} --replicas=3 -n {namespace}"
      estimated_downtime: "0"
      risk: medium
      responsibility: INFRA
      responsibility_reasoning: "自動擴展策略未配置或閾值過高，屬基礎設施團隊責任"
      secondary_teams: [BE]
      optimization:
        # The command sets requests (500m) lower than limits (2000m), which
        # yields the Burstable QoS class; Guaranteed requires requests equal
        # to limits. The description states what the command actually does.
        - type: RESOURCE_LIMIT
          description: "設定 CPU request=500m / limit=2000m (QoS 為 Burstable；Guaranteed 需 request=limit)"
          command: "kubectl set resources deployment/{target} --requests=cpu=500m --limits=cpu=2000m -n {namespace}"
      reasoning: "[規則匹配] 水平擴展可即時分散負載，同時建議配置 HPA 防止復發。"
  # Rule: elevated HTTP 5xx error rate.
  # Matches when alert_type contains "http", OR when the message contains any
  # keyword below (case-insensitive substring).
  - id: http_5xx
    priority: 50
    description: HTTP 5xx 錯誤率過高
    match:
      alert_type:
        - http
      message:
        # Quoted so the status codes stay strings instead of YAML integers.
        # NOTE(review): substring matching means "500" also hits text such as
        # "1500ms" — confirm this is acceptable.
        - "5xx"
        - "502"
        - "503"
        - "500"
    response:
      action_title: "重啟 {target} + 檢查上游服務"
      description: "⚙️ 規則匹配: {target} 產生 HTTP 5xx 錯誤，可能為應用程式例外或上游服務不可達。"
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: "kubectl rollout restart deployment/{target} -n {namespace}"
      estimated_downtime: "~1 min"
      risk: critical
      responsibility: COLLAB
      responsibility_reasoning: "HTTP 5xx 可能源於前端路由、後端邏輯或基礎設施，需多團隊協同排查"
      secondary_teams: [FE, BE, INFRA]
      optimization:
        - type: CIRCUIT_BREAKER
          description: "配置熔斷器防止故障擴散"
          # Placeholder text only; there is no executable command here.
          command: "# Istio VirtualService outlierDetection 配置"
      reasoning: "[規則匹配] HTTP 錯誤需協同排查，先重啟恢復服務同時通知相關團隊。"
  # Rule: Pod stuck in CrashLoopBackOff.
  # Matches when alert_type contains a keyword below, OR when the message
  # contains one (case-insensitive substring). "crash" already covers
  # "pod_crash" and "crashloop".
  - id: pod_crash
    priority: 60
    description: Pod CrashLoopBackOff
    match:
      alert_type:
        - pod_crash
        - crash
      message:
        - crashloop
        - crash
        - backoff
    response:
      action_title: "診斷 {target} CrashLoop 根因"
      description: "⚙️ 規則匹配: {target} 進入 CrashLoopBackOff，需檢查啟動錯誤日誌。"
      # NOTE(review): the action is RESTART_DEPLOYMENT while the command only
      # fetches the previous container's logs — confirm intent.
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: "kubectl logs {target} -n {namespace} --previous --tail=50"
      estimated_downtime: "依根因而定"
      risk: critical
      responsibility: BE
      responsibility_reasoning: "Pod crash 通常源於應用程式啟動錯誤，屬後端團隊責任"
      secondary_teams: [INFRA]
      optimization:
        - type: LIVENESS_PROBE
          description: "調整 liveness probe 初始延遲防止誤殺"
          # Placeholder text only; there is no executable command here.
          command: "# 調整 initialDelaySeconds >= 應用啟動時間"
      reasoning: "[規則匹配] 先查 previous log 確認 crash 原因，再決定修復策略。"
  # ── 通用兜底 ────────────────────────────────────────────────
  # Rule: catch-all fallback.
  # An alertname list of exactly ["*"] is how the engine (_is_generic)
  # recognises the fallback: match_rule skips it while looking for a specific
  # rule and uses it only when nothing else matched.
  - id: generic_fallback
    priority: 999
    description: 通用兜底規則 (無法匹配的告警)
    match:
      alertname:
        # Quoted: a bare * would be parsed as a YAML alias.
        - "*"
    response:
      action_title: "重新啟動 {target} 服務"
      description: "⚙️ 規則匹配: {target} 發生異常，需進一步診斷確認根因。"
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: "kubectl rollout restart deployment/{target} -n {namespace}"
      estimated_downtime: "5-15 min"
      risk: medium
      responsibility: COLLAB
      responsibility_reasoning: "告警資訊不足以判定單一責任團隊，建議多團隊協同排查"
      secondary_teams: [BE, INFRA]
      optimization: []
      reasoning: "[規則匹配] 根據告警先重啟恢復服務，同時安排深入診斷。"
--- a/apps/api/src/services/alert_rule_engine.py
+++ b/apps/api/src/services/alert_rule_engine.py
@@ -0,0 +1,195 @@
 """
 OpenClaw 告警規則匹配引擎
 ============================================================
 從 alert_rules.yaml 載入規則，取代 openclaw.py 中硬編碼的 if/elif 規則匹配。
 設計原則:
 - 規則在 YAML 定義，不需要改 Python 代碼
 - 匹配邏輯: alertname 完全匹配 > alert_type 部分匹配 > message 關鍵字
 - priority 越小越優先，999 = 通用兜底
 - 變數替換: {target} {host} {container} {instance} {job} {namespace}
 2026-04-09 ogt: 初版，從 openclaw.py _generate_mock_response if/elif 抽出
 """
 from __future__ import annotations
 import re
 from functools import lru_cache
 from pathlib import Path
 from typing import Any
 import structlog
 import yaml
 logger = structlog.get_logger(__name__)
 RULES_FILE = Path(__file__).parent.parent.parent / "alert_rules.yaml"
 # ── 變數提取 ────────────────────────────────────────────────
 def _extract_vars(alert_context: dict) -> dict[str, str]:
    """從 alert_context 提取模板變數"""
    labels = alert_context.get("labels", {})
    raw_target = alert_context.get("target_resource", "unknown")
    instance = labels.get("instance", raw_target)
    host = instance.split(":")[0] if ":" in instance else instance
    container = labels.get("name", labels.get("container", raw_target))
    job = labels.get("job", "exporter")
    namespace = alert_context.get("namespace", "awoooi-prod")
    # target: 優先用 pod label，否則用 raw_target（排除純 IP:port 和 alertname）
    pod = labels.get("pod", "")
    if pod:
        target = pod
    elif ":" in raw_target or raw_target == alert_context.get("labels", {}).get("alertname", ""):
        # raw_target 是 IP:port 或 alertname — 用 job 或 container 代替
        target = container if container != raw_target else job
    else:
        target = raw_target
    return {
        "target": target,
        "host": host,
        "container": container,
        "instance": instance,
        "job": job,
        "namespace": namespace,
    }
 def _fill(template: str, vars: dict[str, str]) -> str:
    """填充模板變數，保留未知變數原樣"""
    try:
        return template.format_map(vars)
    except (KeyError, ValueError):
        return template
 # ── 規則載入 ────────────────────────────────────────────────
@lru_cache(maxsize=1)
def _load_rules() -> list[dict]:
    """Load the rules from ``RULES_FILE``, sorted by ascending ``priority``.

    The result is cached for the lifetime of the process (``lru_cache``), so
    edits to the YAML file — including the file appearing after a
    "not found" result — are only picked up after the process restarts.

    Returns:
        The list of rule dicts; rules without ``priority`` sort as 999.
        An empty list is returned when the file is missing, empty, or does
        not contain a ``rules`` list.

    YAML syntax errors are not swallowed; they propagate to the caller.
    """
    if not RULES_FILE.exists():
        logger.warning("alert_rules_file_not_found", path=str(RULES_FILE))
        return []
    with RULES_FILE.open("r", encoding="utf-8") as f:
        data = yaml.safe_load(f)
    # safe_load returns None for an empty document and may return a non-mapping
    # for malformed content; ``rules: null`` would also break sorted().
    raw_rules = data.get("rules", []) if isinstance(data, dict) else None
    if not isinstance(raw_rules, list):
        logger.warning("alert_rules_file_malformed", path=str(RULES_FILE))
        return []
    rules = sorted(raw_rules, key=lambda r: r.get("priority", 999))
    logger.info("alert_rules_loaded", count=len(rules), path=str(RULES_FILE))
    return rules
 # ── 匹配邏輯 ────────────────────────────────────────────────
 def _matches(rule: dict, alertname: str, alert_type: str, message: str) -> bool:
    """判斷規則是否匹配"""
    match = rule.get("match", {})
    # alertname 完全匹配
    alertnames = match.get("alertname", [])
    if alertnames and alertnames != ["*"]:
        if alertname in alertnames:
            return True
    # alert_type 部分匹配
    for kw in match.get("alert_type", []):
        if kw.lower() in alert_type.lower():
            return True
    # message 關鍵字匹配（不分大小寫）
    msg_lower = message.lower()
    for kw in match.get("message", []):
        if kw.lower() in msg_lower:
            return True
    return False
 def _is_generic(rule: dict) -> bool:
    alertnames = rule.get("match", {}).get("alertname", [])
    return alertnames == ["*"]
 # ── 公開 API ────────────────────────────────────────────────
 def match_rule(alert_context: dict) -> dict[str, Any] | None:
    """
    Match an alert against the loaded rules and build the filled-in response.

    Specific rules are tried in priority order first; the generic fallback
    rule (``alertname: ["*"]``) is used only when none of them matches.

    Args:
        alert_context: Alert payload. Keys read here: ``labels``,
            ``alert_type``, ``message`` and ``severity``; ``_extract_vars``
            additionally reads ``target_resource`` and ``namespace``.
            Explicit ``None`` for ``labels``, ``alert_type`` or ``message``
            is treated like a missing key.

    Returns:
        The response dict of the matched rule with template variables
        filled in, or ``None`` when no rule — not even a fallback — is
        available (for example when the rules file is missing). Callers
        must handle ``None`` themselves.

    Raises:
        KeyError: if the matched rule lacks ``response`` or one of the
            required response fields (``action_title``, ``description``,
            ``suggested_action``, ``kubectl_command``).
    """
    labels = alert_context.get("labels") or {}
    alert_type = alert_context.get("alert_type", "custom")
    if alert_type is None:
        alert_type = "custom"
    # Alerts without an alertname label fall back to their alert_type.
    alertname = labels.get("alertname", alert_type)
    message = alert_context.get("message", "")
    if message is None:
        message = ""
    severity = alert_context.get("severity", "warning")

    rules = _load_rules()
    tpl_vars = _extract_vars(alert_context)

    # First pass: specific rules only (the list is already priority-sorted).
    matched_rule = next(
        (
            rule
            for rule in rules
            if not _is_generic(rule) and _matches(rule, alertname, alert_type, message)
        ),
        None,
    )
    # Second pass: nothing specific matched, so use the generic fallback.
    if matched_rule is None:
        matched_rule = next((rule for rule in rules if _is_generic(rule)), None)
    if matched_rule is None:
        return None

    resp = matched_rule["response"]
    risk = resp.get("risk", "medium")
    # A critical alert escalates a medium-risk rule to critical; other risk
    # levels are left as configured.
    if severity == "critical" and risk == "medium":
        risk = "critical"

    optimization = [
        {
            "type": o["type"],
            "description": _fill(o.get("description", ""), tpl_vars),
            "kubectl_or_config": _fill(o.get("command", ""), tpl_vars),
        }
        # ``optimization: null`` in YAML is treated as an empty list.
        for o in (resp.get("optimization") or [])
    ]

    return {
        "rule_id": matched_rule["id"],
        "action_title": _fill(resp["action_title"], tpl_vars),
        "description": _fill(resp["description"], tpl_vars),
        "suggested_action": resp["suggested_action"],
        "kubectl_command": _fill(resp["kubectl_command"], tpl_vars),
        "target_resource": tpl_vars["target"],
        "namespace": tpl_vars["namespace"],
        "risk_level": risk,
        "blast_radius": {
            "affected_pods": 1,
            "estimated_downtime": resp.get("estimated_downtime", "unknown"),
            "related_services": [tpl_vars["target"]],
            "data_impact": "NONE",
        },
        "primary_responsibility": resp.get("responsibility", "COLLAB"),
        "responsibility_reasoning": resp.get("responsibility_reasoning", ""),
        "secondary_teams": resp.get("secondary_teams", []),
        "optimization_suggestions": optimization,
        "reasoning": _fill(resp.get("reasoning", ""), tpl_vars),
        "deviation_analysis": "規則引擎觸發，監控指標偏離正常基準",
        # Rule matches always report 0.0: this is not an AI verdict, so no
        # confidence value may be invented.
        "confidence": 0.0,
        "affected_services": [tpl_vars["target"]],
        "signoz_correlation": "",
    }
--- a/apps/api/src/services/openclaw.py
+++ b/apps/api/src/services/openclaw.py
@@ -572,274 +572,65 @@ class OpenClawService:
        signoz_metrics: GoldMetrics | None = None,
    ) -> str:
        """
-        Mock LLM 回應生成器 - 智能降級 (v7.0 含 SignOz)
+        Mock LLM 回應生成器 - 規則引擎降級 (v8.0)
-        根據告警類型和 SignOz 數據動態產生合理的 RCA 分析結果
+        從 alert_rules.yaml 載入規則，取代硬編碼 if/elif。
        新增規則只需修改 YAML，不需要改代碼重新部署。
        2026-04-09 ogt: 重構為規則引擎，移除 if/elif 硬編碼
        """
        from src.services.alert_rule_engine import match_rule
        time.sleep(random.uniform(0.3, 0.8))  # 模擬思考延遲
        alert_type = alert_context.get("alert_type", "custom")
        severity = alert_context.get("severity", "warning")
        raw_target = alert_context.get("target_resource", "unknown-service")
        raw_namespace = alert_context.get("namespace", "default")
        message = alert_context.get("message", "")
        metrics = alert_context.get("metrics", {})
        labels = alert_context.get("labels", {})
        alertname = labels.get("alertname", alert_type)
        # Phase 18.1: 正規化資源名稱 (ADR-016)
        # 確保 kubectl 指令使用有效的 K8s 名稱
        normalized = normalize_resource_name(raw_target, raw_namespace)
        if normalized.is_k8s_resource and normalized.normalized:
            target = normalized.normalized
            namespace = normalized.namespace or raw_namespace
            logger.info(
                "mock_response_resource_normalized",
                original=raw_target,
                normalized=target,
                namespace=namespace,
            )
        else:
            target = raw_target
            namespace = raw_namespace
        # SignOz 數據整合
        signoz_summary = ""
        signoz_correlation = "SignOz 數據擷取中..."
        if signoz_metrics:
            signoz_summary = signoz_metrics.to_summary()
            signoz_correlation = (
                f"RPS={signoz_metrics.rps:.1f} ({signoz_metrics.rps_trend}), "
                f"Error={signoz_metrics.error_rate:.2f}%, "
                f"P99={signoz_metrics.p99_latency_ms:.0f}ms"
            )
-        # 生成調優指令
+        mock_response = match_rule(alert_context)
-        tuning = self.generate_auto_tuning_command(
+        if mock_response is None:
-            alert_type=alert_type,
+            # match_rule 不應該回傳 None（有通用兜底），但防禦性處理
-            target_resource=target,
+            alert_type = alert_context.get("alert_type", "custom")
-            namespace=namespace,
+            target = alert_context.get("target_resource", "unknown")
-            metrics=signoz_metrics,
+            namespace = alert_context.get("namespace", "awoooi-prod")
        )
        # 根據告警類型生成專業 RCA + 仲裁
        # 🔴 2026-03-29 ogt: Mock 響應必須標示為規則匹配，不是 AI 仲裁
        # 2026-04-09 ogt: 新增 DockerContainerUnhealthy / TargetDown 專屬規則
        if alertname == "DockerContainerUnhealthy" or "unhealthy" in message.lower():
            container_name = labels.get("name", target)
            host = labels.get("instance", "").split(":")[0] or "192.168.0.188"
            mock_response = {
                "action_title": f"檢查 Docker 容器 {container_name} 健康狀態",
                "description": f"⚙️ 規則匹配: Docker 容器 {container_name} ({host}) healthcheck 失敗。常見原因: 應用程式啟動慢、healthcheck 指令錯誤、依賴服務未就緒。{signoz_summary}",
                "suggested_action": "RESTART_DEPLOYMENT",
                "kubectl_command": f"ssh {host} 'docker inspect {container_name} --format=\"{{{{.State.Health.Status}}}}\" && docker restart {container_name}'",
                "target_resource": container_name,
                "namespace": namespace,
                "risk_level": "medium",
                "blast_radius": {
                    "affected_pods": 1,
                    "estimated_downtime": "~30s",
                    "related_services": [container_name],
                    "data_impact": "NONE",
                },
                "primary_responsibility": "INFRA",
                "responsibility_reasoning": "Docker 容器健康檢查失敗屬基礎設施團隊責任，需確認 healthcheck 設定與容器狀態",
                "secondary_teams": ["BE"],
                "optimization_suggestions": [
                    {
                        "type": "HEALTHCHECK",
                        "description": "確認 healthcheck 指令在容器內可執行 (mc/curl 是否存在)",
                        "kubectl_or_config": f"ssh {host} 'docker exec {container_name} sh -c \"mc ready local || curl -f http://localhost:9000/minio/health/live\"'",
                    }
                ],
                "reasoning": f"[規則匹配] Docker healthcheck 失敗先 restart 恢復服務，同時確認 healthcheck 指令正確。{signoz_correlation}",
                "deviation_analysis": "容器健康檢查連續失敗，超出允許次數",
                "confidence": 0.0,
                "affected_services": [container_name],
                "signoz_correlation": signoz_correlation,
            }
        elif alertname == "TargetDown" or (labels.get("job") and ":" in raw_target):
            instance = labels.get("instance", raw_target)
            job = labels.get("job", "exporter")
            host = instance.split(":")[0]
            mock_response = {
                "action_title": f"檢查 {job} ({instance}) 服務存活",
                "description": f"⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。常見原因: 主機離線、exporter crash、防火牆封鎖。{signoz_summary}",
                "suggested_action": "RESTART_DEPLOYMENT",
                "kubectl_command": f"ssh {host} 'systemctl status node_exporter 2>/dev/null || docker ps | grep exporter'",
                "target_resource": instance,
                "namespace": namespace,
                "risk_level": "medium",
                "blast_radius": {
                    "affected_pods": 0,
                    "estimated_downtime": "監控盲區持續中",
                    "related_services": [job],
                    "data_impact": "NONE",
                },
                "primary_responsibility": "INFRA",
                "responsibility_reasoning": "Prometheus scrape 目標下線屬基礎設施監控範疇",
                "secondary_teams": [],
                "optimization_suggestions": [
                    {
                        "type": "MONITORING",
                        "description": f"確認 {host} 上的 {job} exporter 是否正常運行",
                        "kubectl_or_config": f"ssh {host} 'ps aux | grep exporter'",
                    }
                ],
                "reasoning": f"[規則匹配] Prometheus target 下線，先 SSH 確認主機存活再重啟 exporter。{signoz_correlation}",
                "deviation_analysis": "Prometheus scrape 失敗，監控數據中斷",
                "confidence": 0.0,
                "affected_services": [instance],
                "signoz_correlation": signoz_correlation,
            }
        elif "oom" in message.lower() or "memory" in alert_type.lower():
            mock_response = {
                "action_title": f"刪除異常 Pod {target} (OOMKilled)",
                "description": f"⚙️ 規則匹配: {target} 發生 OOMKilled，根因為 JVM Heap 配置與 K8s memory limit 不匹配或存在記憶體洩漏。{signoz_summary}",
                "suggested_action": "DELETE_POD",
                "kubectl_command": f"kubectl delete pod {target} -n {namespace}",
                "target_resource": target,
                "namespace": namespace,
                "risk_level": "critical" if severity == "critical" else "medium",
                "blast_radius": {
                    "affected_pods": 1,
                    "estimated_downtime": "~30s",
                    "related_services": ["api-gateway", "downstream-service"],
                    "data_impact": "NONE"
                },
                "primary_responsibility": "BE",
                "responsibility_reasoning": "OOMKilled 通常源於應用程式記憶體配置不當，屬後端團隊責任範圍",
                "secondary_teams": ["INFRA"],
                "optimization_suggestions": [
                    {
                        "type": "RESOURCE_LIMIT",
                        "description": "調整 memory limit 至 1Gi 並確保 JVM -Xmx 不超過 70%",
                        "kubectl_or_config": f"kubectl set resources deployment/{target.rsplit('-', 2)[0]} -c {target.rsplit('-', 2)[0]} --limits=memory=1Gi -n {namespace}"
                    },
                    {
                        "type": "HPA",
                        "description": "啟用基於記憶體的 HPA 自動擴展",
                        "kubectl_or_config": f"kubectl autoscale deployment {target.rsplit('-', 2)[0]} --memory-percent=80 --min=2 --max=5 -n {namespace}"
                    }
                ],
                "reasoning": f"⚙️ Pod OOMKilled 後 ReplicaSet 將自動重建，但需同步修正資源配置防止復發。{signoz_correlation}",
                "deviation_analysis": f"Memory 使用率 {metrics.get('memory_percent', 99)}%，超出基準線 60% 達 +6.5σ",
                "confidence": 0.0,  # 🔴 規則匹配，非 AI 仲裁
                "affected_services": [target, "api-gateway"],
                "signoz_correlation": signoz_correlation,
            }
        elif "cpu" in alert_type.lower() or "high_cpu" in alert_type:
            # 根據 SignOz RPS 調整策略
            rps_context = ""
            if signoz_metrics and signoz_metrics.rps > 50:
                rps_context = f"SignOz 顯示 RPS={signoz_metrics.rps:.0f}，流量較高，建議配置 HPA。"
            mock_response = {
                "action_title": f"擴展 {target} 副本數 + 啟用 HPA",
                "description": f"⚙️ 規則匹配: {target} CPU 使用率過高，根因為流量突增或計算密集任務未配置自動擴展。{rps_context}",
                "suggested_action": "SCALE_DEPLOYMENT",
                "kubectl_command": tuning["command"],
                "target_resource": target,
                "namespace": namespace,
                "risk_level": "medium",
                "blast_radius": {
                    "affected_pods": 0,
                    "estimated_downtime": "0",
                    "related_services": [],
                    "data_impact": "NONE"
                },
                "primary_responsibility": "INFRA",
                "responsibility_reasoning": "自動擴展策略未配置或閾值過高，屬基礎設施團隊責任",
                "secondary_teams": ["BE"],
                "optimization_suggestions": [
                    {
                        "type": tuning["type"],
                        "description": tuning["description"],
                        "kubectl_or_config": tuning["command"],
                    },
                    {
                        "type": "RESOURCE_LIMIT",
                        "description": "增加 CPU request 確保 QoS 為 Guaranteed",
                        "kubectl_or_config": f"kubectl set resources deployment/{target} --requests=cpu=500m --limits=cpu=2000m -n {namespace}"
                    }
                ],
                "reasoning": f"[規則匹配] 水平擴展可即時分散負載，同時建議配置 HPA 防止復發。{signoz_correlation}",
                "deviation_analysis": f"CPU 使用率 {metrics.get('cpu_percent', 95)}%，超出基準線 50% 達 +4.5σ",
                "confidence": 0.0,  # 🔴 規則匹配不是 AI 仲裁，信心度設 0
                "affected_services": [target],
                "signoz_correlation": signoz_correlation,
            }
        elif "http" in alert_type.lower() or "5xx" in message.lower() or "502" in message.lower():
            mock_response = {
                "action_title": f"重啟 {target} + 檢查上游服務",
                "description": f"⚙️ 規則匹配: {target} 產生 HTTP 5xx 錯誤，可能為應用程式例外或上游服務不可達。{signoz_summary}",
                "suggested_action": "RESTART_DEPLOYMENT",
                "kubectl_command": f"kubectl rollout restart deployment/{target} -n {namespace}",
                "target_resource": target,
                "namespace": namespace,
                "risk_level": "critical",
                "blast_radius": {
                    "affected_pods": 3,
                    "estimated_downtime": "~1 min",
                    "related_services": ["nginx-ingress", "upstream-api"],
                    "data_impact": "NONE"
                },
                "primary_responsibility": "COLLAB",
                "responsibility_reasoning": "HTTP 5xx 可能源於前端路由、後端邏輯或基礎設施，需多團隊協同排查",
                "secondary_teams": ["FE", "BE", "INFRA"],
                "optimization_suggestions": [
                    {
                        "type": "CIRCUIT_BREAKER",
                        "description": "配置熔斷器防止故障擴散",
                        "kubectl_or_config": "# Istio VirtualService outlierDetection 配置"
                    },
                    {
                        "type": "CACHE",
                        "description": "增加 Redis 緩存減少上游壓力",
                        "kubectl_or_config": "# 檢查 Redis 連線池配置，建議 maxTotal=50"
                    }
                ],
                "reasoning": f"[規則匹配] HTTP 錯誤需協同排查，先重啟恢復服務同時通知相關團隊。{signoz_correlation}",
                "deviation_analysis": "錯誤率 5%，超出基準線 0.1% 達 +50σ",
                "confidence": 0.0,  # 🔴 規則匹配不是 AI 仲裁，信心度設 0
                "affected_services": [target, "nginx-ingress", "upstream-api"],
                "signoz_correlation": signoz_correlation,
            }
        else:
            # 通用異常處理
            mock_response = {
                "action_title": f"重新啟動 {target} 服務",
-                "description": f"⚙️ 規則匹配: {target} 發生異常: {message[:80]}。需進一步診斷確認根因。{signoz_summary}",
+                "description": f"⚙️ 規則匹配: {target} 發生異常，需進一步診斷確認根因。",
                "suggested_action": "RESTART_DEPLOYMENT",
                "kubectl_command": f"kubectl rollout restart deployment/{target} -n {namespace}",
                "target_resource": target,
                "namespace": namespace,
-                "risk_level": "critical" if severity == "critical" else "medium",
+                "risk_level": "medium",
                "blast_radius": {
-                    "affected_pods": 3,
+                    "affected_pods": 1,
-                    "estimated_downtime": "~1 min",
+                    "estimated_downtime": "5-15 min",
-                    "related_services": ["dependent-services"],
+                    "related_services": [target],
-                    "data_impact": "NONE"
+                    "data_impact": "NONE",
                },
                "primary_responsibility": "COLLAB",
-                "responsibility_reasoning": "告警資訊不足以判定單一責任團隊，建議多團隊協同排查",
+                "responsibility_reasoning": "告警資訊不足，建議多團隊協同排查",
                "secondary_teams": ["BE", "INFRA"],
-                "optimization_suggestions": [
+                "optimization_suggestions": [],
-                    {
+                "reasoning": f"[規則匹配] 根據告警 {alert_type} 先重啟恢復服務。",
                        "type": tuning["type"],
                        "description": tuning["description"],
                        "kubectl_or_config": tuning["command"],
                    }
                ],
                "reasoning": f"[規則匹配] 根據告警 {alert_type} 先重啟恢復服務，同時安排深入診斷。{signoz_correlation}",
                "deviation_analysis": "監控指標顯示異常偏離基準線",
-                "confidence": 0.0,  # 🔴 規則匹配不是 AI 仲裁，信心度設 0
+                "confidence": 0.0,
                "affected_services": [target],
                "signoz_correlation": signoz_correlation,
            }
        # 補充 SignOz 關聯資訊（規則引擎不持有 signoz_metrics）
        mock_response["signoz_correlation"] = signoz_correlation
        if signoz_metrics:
            mock_response["description"] += f" {signoz_metrics.to_summary()}"
        logger.info(
            "mock_llm_response_generated",
            rule_id=mock_response.get("rule_id", "unknown"),
            action_title=mock_response["action_title"],
            risk_level=mock_response["risk_level"],
            primary_responsibility=mock_response["primary_responsibility"],