feat(openclaw): 告警規則引擎 — alert_rules.yaml 取代硬編碼 if/elif

- 新增 alert_rules.yaml: 6 條規則 (docker/target_down/oom/cpu/5xx/crash) + 通用兜底
- 新增 alert_rule_engine.py: YAML 載入、匹配邏輯、變數填充
- openclaw.py _generate_mock_response: 重構為呼叫規則引擎 (v8.0)
- 新增規則只需修改 YAML，重啟 Pod 即可，不需改代碼
- 2026-04-09 ogt: 架構重構

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-09 09:05:23 +08:00
parent 7e327c806e
commit d1ede7f989
3 changed files with 423 additions and 238 deletions
--- a/apps/api/alert_rules.yaml
+++ b/apps/api/alert_rules.yaml
@@ -0,0 +1,199 @@
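+# AWOOOI OpenClaw alert rule matching engine
+# ============================================================
+# Format:
+#   match.alertname   : exact match on the Prometheus alertname (list = OR)
+#   match.alert_type  : alert_type keywords (list = OR, substring match, case-insensitive)
+#   match.message     : message keywords (list = OR, substring match, case-insensitive)
+#                       A rule matches as soon as ANY one of the three criteria hits.
+#   response.*        : response templates; supported variables:
+#                       {target} {host} {container} {instance} {job} {namespace}
+#                       Templates are rendered with Python str.format_map, so a
+#                       literal brace must be doubled: "{{" -> "{" and "}}" -> "}".
+#   responsibility    : FE / BE / INFRA / DB / COLLAB
+#   risk              : low / medium / critical
+#   confidence        : 0.0 (fixed value for rule matches; never fabricate one)
+#
+# Editing rules: no redeploy is required, but rules are cached per process
+#                (there is no live hot-reload) - restart the API Pod to pick up changes.
+# Adding rules : append to the end of the `rules` list; a smaller priority wins.
+# 2026-04-09 ogt: initial version, extracted from openclaw.py _generate_mock_response
+# ============================================================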
+
+version: "1.0.0"
+updated_at: "2026-04-09"
+
+rules:
+  # ── Docker / Host 層 ────────────────────────────────────────
+
+  # Rule: a Docker container reports a failing healthcheck.
+  # Matches when the alertname is exactly DockerContainerUnhealthy OR the
+  # message contains any keyword below (case-insensitive substring).
+  - id: docker_container_unhealthy
+    priority: 10
+    description: Docker 容器 healthcheck 失敗
+    match:
+      alertname:
+        - DockerContainerUnhealthy
+      message:
+        - unhealthy
+        - health check
+        - healthcheck
+    response:
+      action_title: "檢查 Docker 容器 {container} 健康狀態"
+      description: "⚙️ 規則匹配: Docker 容器 {container} ({host}) healthcheck 失敗。常見原因: 應用程式啟動慢、healthcheck 指令錯誤、依賴服務未就緒。"
+      suggested_action: RESTART_DEPLOYMENT
+      # The braces are quadrupled on purpose: the rule engine renders this
+      # template with str.format_map, which collapses "{{{{" to "{{", so the
+      # final command carries the Go template {{.State.Health.Status}}.
+      kubectl_command: "ssh {host} 'docker inspect {container} --format=\"{{{{.State.Health.Status}}}}\" && docker restart {container}'"
+      estimated_downtime: "~30s"
+      risk: medium
+      responsibility: INFRA
+      responsibility_reasoning: "Docker 容器健康檢查失敗屬基礎設施團隊責任，需確認 healthcheck 設定與容器狀態"
+      secondary_teams: [BE]
+      optimization:
+        - type: HEALTHCHECK
+          description: "確認 healthcheck 指令在容器內可執行 (mc/curl 是否存在)"
+          command: "ssh {host} 'docker exec {container} sh -c \"mc ready local 2>/dev/null || curl -sf http://localhost:9000/minio/health/live\"'"
+      reasoning: "[規則匹配] Docker healthcheck 失敗先 restart 恢復服務，同時確認 healthcheck 指令正確。"
+
+  # Rule: Prometheus can no longer scrape a target.
+  # Matched by exact alertname only (no alert_type / message keywords).
+  - id: 'target_down'
+    priority: 20
+    description: 'Prometheus scrape target 下線'
+    match:
+      alertname: [TargetDown, InstanceDown]
+    response:
+      action_title: '確認 {job} ({instance}) 服務存活'
+      description: '⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。常見原因: 主機離線、exporter crash、防火牆封鎖。'
+      suggested_action: 'RESTART_DEPLOYMENT'
+      kubectl_command: >-
+        ssh {host} 'systemctl status node_exporter 2>/dev/null || docker ps | grep exporter'
+      estimated_downtime: '監控盲區持續中'
+      risk: 'medium'
+      responsibility: 'INFRA'
+      responsibility_reasoning: 'Prometheus scrape 目標下線屬基礎設施監控範疇'
+      secondary_teams: []
+      optimization:
+        - type: 'MONITORING'
+          description: '確認 exporter 進程是否存活'
+          command: >-
+            ssh {host} 'ps aux | grep exporter | grep -v grep'
+      reasoning: '[規則匹配] Prometheus target 下線，先 SSH 確認主機存活再重啟 exporter。'
+
+  # ── K8s Pod 層 ──────────────────────────────────────────────
+
+  # Rule: a pod was OOM-killed.
+  # Matches when alert_type contains "memory" OR the message contains any
+  # keyword below (case-insensitive substring).
+  # NOTE(review): {target} resolves to the pod label when one is present;
+  # confirm it is also a valid deployment/container name for the
+  # `kubectl set resources` and `kubectl autoscale` commands below.
+  # NOTE(review): confirm the kubectl version in use accepts --memory-percent.
+  - id: 'oom_killed'
+    priority: 30
+    description: 'Pod OOMKilled 記憶體不足'
+    match:
+      alert_type: [memory]
+      message: [oomkilled, oom, out of memory]
+    response:
+      action_title: '刪除異常 Pod {target} (OOMKilled)'
+      description: '⚙️ 規則匹配: {target} 發生 OOMKilled，根因為 JVM Heap 配置與 K8s memory limit 不匹配或存在記憶體洩漏。'
+      suggested_action: 'DELETE_POD'
+      kubectl_command: 'kubectl delete pod {target} -n {namespace}'
+      estimated_downtime: '~30s'
+      risk: 'critical'
+      responsibility: 'BE'
+      responsibility_reasoning: 'OOMKilled 通常源於應用程式記憶體配置不當，屬後端團隊責任範圍'
+      secondary_teams:
+        - INFRA
+      optimization:
+        - type: 'RESOURCE_LIMIT'
+          description: '調整 memory limit 至 1Gi 並確保 JVM -Xmx 不超過 70%'
+          command: 'kubectl set resources deployment/{target} -c {target} --limits=memory=1Gi -n {namespace}'
+        - type: 'HPA'
+          description: '啟用基於記憶體的 HPA 自動擴展'
+          command: 'kubectl autoscale deployment {target} --memory-percent=80 --min=2 --max=5 -n {namespace}'
+      reasoning: '[規則匹配] Pod OOMKilled 後 ReplicaSet 將自動重建，但需同步修正資源配置防止復發。'
+
+  # Rule: sustained high CPU usage on a pod.
+  # Matched only by alert_type (case-insensitive substring); "cpu" already
+  # covers "high_cpu", both are kept as written.
+  - id: 'high_cpu'
+    priority: 40
+    description: 'Pod CPU 使用率過高'
+    match:
+      alert_type: [cpu, high_cpu]
+    response:
+      action_title: '擴展 {target} 副本數 + 啟用 HPA'
+      description: '⚙️ 規則匹配: {target} CPU 使用率過高，根因為流量突增或計算密集任務未配置自動擴展。'
+      suggested_action: 'SCALE_DEPLOYMENT'
+      kubectl_command: 'kubectl scale deployment {target} --replicas=3 -n {namespace}'
+      # Quoted so the value stays the string "0" rather than an integer.
+      estimated_downtime: '0'
+      risk: 'medium'
+      responsibility: 'INFRA'
+      responsibility_reasoning: '自動擴展策略未配置或閾值過高，屬基礎設施團隊責任'
+      secondary_teams:
+        - BE
+      optimization:
+        - type: 'RESOURCE_LIMIT'
+          description: '增加 CPU request 確保 QoS 為 Guaranteed'
+          command: 'kubectl set resources deployment/{target} --requests=cpu=500m --limits=cpu=2000m -n {namespace}'
+      reasoning: '[規則匹配] 水平擴展可即時分散負載，同時建議配置 HPA 防止復發。'
+
+  # Rule: elevated HTTP 5xx error rate.
+  # Matches when alert_type contains "http" OR the message contains one of
+  # the status-code keywords below (substring match).
+  - id: 'http_5xx'
+    priority: 50
+    description: 'HTTP 5xx 錯誤率過高'
+    match:
+      alert_type: [http]
+      # Quoted so the codes are loaded as strings, not integers.
+      message: ['5xx', '502', '503', '500']
+    response:
+      action_title: '重啟 {target} + 檢查上游服務'
+      description: '⚙️ 規則匹配: {target} 產生 HTTP 5xx 錯誤，可能為應用程式例外或上游服務不可達。'
+      suggested_action: 'RESTART_DEPLOYMENT'
+      kubectl_command: 'kubectl rollout restart deployment/{target} -n {namespace}'
+      estimated_downtime: '~1 min'
+      risk: 'critical'
+      responsibility: 'COLLAB'
+      responsibility_reasoning: 'HTTP 5xx 可能源於前端路由、後端邏輯或基礎設施，需多團隊協同排查'
+      secondary_teams:
+        - FE
+        - BE
+        - INFRA
+      optimization:
+        - type: 'CIRCUIT_BREAKER'
+          description: '配置熔斷器防止故障擴散'
+          command: '# Istio VirtualService outlierDetection 配置'
+      reasoning: '[規則匹配] HTTP 錯誤需協同排查，先重啟恢復服務同時通知相關團隊。'
+
+  # Rule: a pod is crash-looping.
+  # Matches when alert_type contains one of the alert_type keywords OR the
+  # message contains one of the message keywords (case-insensitive substring).
+  - id: 'pod_crash'
+    priority: 60
+    description: 'Pod CrashLoopBackOff'
+    match:
+      alert_type: [pod_crash, crash]
+      message: [crashloop, crash, backoff]
+    response:
+      action_title: '診斷 {target} CrashLoop 根因'
+      description: '⚙️ 規則匹配: {target} 進入 CrashLoopBackOff，需檢查啟動錯誤日誌。'
+      suggested_action: 'RESTART_DEPLOYMENT'
+      kubectl_command: 'kubectl logs {target} -n {namespace} --previous --tail=50'
+      estimated_downtime: '依根因而定'
+      risk: 'critical'
+      responsibility: 'BE'
+      responsibility_reasoning: 'Pod crash 通常源於應用程式啟動錯誤，屬後端團隊責任'
+      secondary_teams:
+        - INFRA
+      optimization:
+        - type: 'LIVENESS_PROBE'
+          description: '調整 liveness probe 初始延遲防止誤殺'
+          command: '# 調整 initialDelaySeconds >= 應用啟動時間'
+      reasoning: '[規則匹配] 先查 previous log 確認 crash 原因，再決定修復策略。'
+
+  # ── 通用兜底 ────────────────────────────────────────────────
+
+  # Catch-all rule: the engine treats a rule whose alertname list is exactly
+  # ["*"] as the generic fallback and only uses it when no other rule matched.
+  - id: 'generic_fallback'
+    priority: 999
+    description: '通用兜底規則 (無法匹配的告警)'
+    match:
+      # Must stay quoted: a bare * would be parsed as a YAML alias.
+      alertname: ['*']
+    response:
+      action_title: '重新啟動 {target} 服務'
+      description: '⚙️ 規則匹配: {target} 發生異常，需進一步診斷確認根因。'
+      suggested_action: 'RESTART_DEPLOYMENT'
+      kubectl_command: 'kubectl rollout restart deployment/{target} -n {namespace}'
+      estimated_downtime: '5-15 min'
+      risk: 'medium'
+      responsibility: 'COLLAB'
+      responsibility_reasoning: '告警資訊不足以判定單一責任團隊，建議多團隊協同排查'
+      secondary_teams:
+        - BE
+        - INFRA
+      optimization: []
+      reasoning: '[規則匹配] 根據告警先重啟恢復服務，同時安排深入診斷。'
--- a/apps/api/src/services/alert_rule_engine.py
+++ b/apps/api/src/services/alert_rule_engine.py
@@ -0,0 +1,195 @@
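+"""
+OpenClaw alert rule matching engine
+============================================================
+Loads rules from alert_rules.yaml, replacing the hard-coded if/elif rule
+matching in openclaw.py.
+
+Design principles:
+- Rules are defined in YAML; no Python code change is needed
+- Matching: rules are tried in ascending priority and the first hit wins;
+  within one rule the checks run in the order alertname (exact) ->
+  alert_type (substring) -> message keyword, and any single hit matches
+- A smaller priority wins; 999 is used by the generic fallback
+- Variable substitution: {target} {host} {container} {instance} {job} {namespace}
+
+2026-04-09 ogt: initial version, extracted from the if/elif chain in
+openclaw.py _generate_mock_response
+"""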
+
+from __future__ import annotations
+
+import re
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+
+import structlog
+import yaml
+
+logger = structlog.get_logger(__name__)
+
+RULES_FILE = Path(__file__).parent.parent.parent / "alert_rules.yaml"
+
+# ── 變數提取 ────────────────────────────────────────────────
+
+
+def _extract_vars(alert_context: dict) -> dict[str, str]:
+    """Derive the template variables used to fill rule responses.
+
+    Args:
+        alert_context: Alert payload. The keys read here are ``labels``
+            (a mapping of alert labels), ``target_resource`` and
+            ``namespace``. Each may be missing, ``None`` or empty.
+
+    Returns:
+        A dict with exactly the keys ``target``, ``host``, ``container``,
+        ``instance``, ``job`` and ``namespace``.
+    """
+    # `or` (instead of a .get default) also covers keys that are present but
+    # null/empty, which would otherwise break the string handling below or
+    # render the literal text "None" into a command.
+    labels = alert_context.get("labels") or {}
+    raw_target = str(alert_context.get("target_resource") or "unknown")
+    namespace = str(alert_context.get("namespace") or "awoooi-prod")
+
+    instance = labels.get("instance", raw_target)
+    # "host:port" -> "host"; a value without ":" comes back unchanged.
+    host = instance.split(":")[0]
+    # Prefer the `name` label, then `container`, then the raw target.
+    container = labels.get("name", labels.get("container", raw_target))
+    job = labels.get("job", "exporter")
+
+    # target: prefer the pod label; otherwise use raw_target unless it is an
+    # "ip:port" style value or merely repeats the alertname.
+    pod = labels.get("pod", "")
+    if pod:
+        target = pod
+    elif ":" in raw_target or raw_target == labels.get("alertname", ""):
+        # raw_target is not a usable resource name - fall back to the
+        # container label, or to the job when no container label exists.
+        target = container if container != raw_target else job
+    else:
+        target = raw_target
+
+    return {
+        "target": target,
+        "host": host,
+        "container": container,
+        "instance": instance,
+        "job": job,
+        "namespace": namespace,
+    }
+
+
+class _KeepUnknown(dict):
+    """Mapping for ``str.format_map`` that leaves unknown ``{name}`` fields as written."""
+
+    def __missing__(self, key: str) -> str:
+        return "{" + key + "}"
+
+
+def _fill(template: str, vars: dict[str, str]) -> str:
+    """Fill ``{name}`` placeholders in a template, keeping unknown ones as-is.
+
+    Known variables are substituted even when the template also contains
+    placeholders that are not in ``vars``; those stay in the output verbatim.
+    Doubled braces follow ``str.format`` rules ("{{" renders as "{").
+
+    Args:
+        template: Template text. ``None`` yields ``""``; any other non-string
+            value is converted with ``str()`` first.
+        vars: Placeholder values keyed by name.
+
+    Returns:
+        The rendered text, or the template unchanged when it is not a valid
+        format string (e.g. unbalanced braces or positional ``{}`` fields).
+    """
+    if template is None:
+        return ""
+    if not isinstance(template, str):
+        template = str(template)
+    try:
+        return template.format_map(_KeepUnknown(vars))
+    except (KeyError, ValueError, IndexError, AttributeError, TypeError):
+        # IndexError: positional fields such as "{}" / "{0}" / "{.attr}";
+        # AttributeError/TypeError: attribute or index access on a placeholder.
+        return template
+
+
+# ── 規則載入 ────────────────────────────────────────────────
+
+
+@lru_cache(maxsize=1)
+def _load_rules() -> list[dict]:
+    """Load the rule list from ``RULES_FILE``, sorted by ascending priority.
+
+    The result is cached for the lifetime of the process (``lru_cache``), so
+    edits to the YAML file are only picked up after a restart.
+
+    Returns:
+        The rule mappings sorted by ``priority`` (missing or non-numeric
+        priorities sort as 999). An empty list is returned, and a log entry
+        written, when the file is missing, is not valid YAML, or does not
+        contain a ``rules`` list.
+    """
+    if not RULES_FILE.exists():
+        logger.warning("alert_rules_file_not_found", path=str(RULES_FILE))
+        return []
+    try:
+        with RULES_FILE.open("r", encoding="utf-8") as f:
+            data = yaml.safe_load(f)
+    except yaml.YAMLError as exc:
+        # A syntax error in the rules file should degrade to "no rules"
+        # instead of raising on every alert that reaches the engine.
+        logger.error("alert_rules_file_invalid", path=str(RULES_FILE), error=str(exc))
+        return []
+
+    # safe_load returns None for an empty file, and `rules:` may be null.
+    raw_rules = data.get("rules") if isinstance(data, dict) else None
+    if not isinstance(raw_rules, list):
+        logger.warning("alert_rules_missing_rules_list", path=str(RULES_FILE))
+        return []
+
+    def _priority(rule: dict) -> float:
+        value = rule.get("priority", 999)
+        is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
+        return value if is_number else 999
+
+    # Drop entries that are not mappings so later .get() calls cannot fail.
+    rules = sorted((r for r in raw_rules if isinstance(r, dict)), key=_priority)
+    logger.info("alert_rules_loaded", count=len(rules), path=str(RULES_FILE))
+    return rules
+
+
+# ── 匹配邏輯 ────────────────────────────────────────────────
+
+
+def _as_keyword_list(value: Any) -> list[str]:
+    """Normalise a YAML match value (scalar, list or null) to non-empty strings."""
+    if value is None:
+        return []
+    if isinstance(value, (str, int, float)):
+        # A bare scalar means "a list with one entry"; without this a string
+        # would be iterated character by character.
+        value = [value]
+    return [str(item) for item in value if item is not None and str(item) != ""]
+
+
+def _matches(rule: dict, alertname: str, alert_type: str, message: str) -> bool:
+    """Return True when any match criterion of ``rule`` hits.
+
+    The criteria are OR-ed and checked in this order:
+    1. ``match.alertname``  - exact, case-sensitive equality (a list that is
+       exactly ``["*"]`` is the fallback marker and never matches here);
+    2. ``match.alert_type`` - case-insensitive substring of ``alert_type``;
+    3. ``match.message``    - case-insensitive substring of ``message``.
+
+    Args:
+        rule: One rule mapping; a missing or null ``match`` never matches.
+        alertname: Alert name to compare exactly.
+        alert_type: Alert type text; ``None`` is treated as empty.
+        message: Alert message text; ``None`` is treated as empty.
+    """
+    match = rule.get("match") or {}
+
+    names = _as_keyword_list(match.get("alertname"))
+    if names != ["*"] and alertname in names:
+        return True
+
+    type_lower = (alert_type or "").lower()
+    if any(kw.lower() in type_lower for kw in _as_keyword_list(match.get("alert_type"))):
+        return True
+
+    msg_lower = (message or "").lower()
+    return any(kw.lower() in msg_lower for kw in _as_keyword_list(match.get("message")))
+
+
+def _is_generic(rule: dict) -> bool:
+    """Return True when ``rule`` is the catch-all fallback.
+
+    A rule is the fallback when its ``match.alertname`` is exactly ``["*"]``
+    (or the bare scalar ``"*"``). A missing or null ``match`` is not generic.
+    """
+    alertnames = (rule.get("match") or {}).get("alertname")
+    return alertnames == ["*"] or alertnames == "*"
+
+
+# ── 公開 API ────────────────────────────────────────────────
+
+
+def match_rule(alert_context: dict) -> dict[str, Any] | None:
+    """Match an alert against the loaded rules and build the filled response.
+
+    Rules are tried in the order returned by ``_load_rules``. The first
+    non-generic rule that matches wins; if none matches, the first generic
+    (catch-all) rule is used.
+
+    Args:
+        alert_context: Alert payload. Keys read here: ``labels`` (its
+            ``alertname`` entry), ``alert_type``, ``message`` and
+            ``severity``; the template variables are derived from the same
+            payload by ``_extract_vars``.
+
+    Returns:
+        The response dict of the selected rule with its templates filled, or
+        ``None`` when no rule (not even a generic one) is available. Callers
+        must handle ``None`` themselves.
+    """
+    labels = alert_context.get("labels") or {}
+    alert_type = alert_context.get("alert_type") or "custom"
+    # Without an alertname label the alert_type doubles as the alertname.
+    alertname = labels.get("alertname", alert_type)
+    message = alert_context.get("message") or ""
+    severity = alert_context.get("severity", "warning")
+
+    rules = _load_rules()
+    template_vars = _extract_vars(alert_context)
+
+    # Single pass: remember the first generic rule while looking for a
+    # specific match, so the fallback needs no second scan.
+    matched_rule = None
+    fallback_rule = None
+    for rule in rules:
+        if _is_generic(rule):
+            if fallback_rule is None:
+                fallback_rule = rule
+            continue
+        if _matches(rule, alertname, alert_type, message):
+            matched_rule = rule
+            break
+
+    if matched_rule is None:
+        matched_rule = fallback_rule
+    if matched_rule is None:
+        return None
+
+    resp = matched_rule["response"]
+    risk = resp.get("risk", "medium")
+    # A critical alert escalates a medium-risk rule to critical.
+    if severity == "critical" and risk == "medium":
+        risk = "critical"
+
+    optimization = [
+        {
+            "type": o.get("type", ""),
+            "description": _fill(o.get("description", ""), template_vars),
+            "kubectl_or_config": _fill(o.get("command", ""), template_vars),
+        }
+        # `optimization:` may be present but null in the YAML.
+        for o in (resp.get("optimization") or [])
+    ]
+
+    target = template_vars["target"]
+    return {
+        "rule_id": matched_rule["id"],
+        "action_title": _fill(resp["action_title"], template_vars),
+        "description": _fill(resp["description"], template_vars),
+        "suggested_action": resp["suggested_action"],
+        "kubectl_command": _fill(resp["kubectl_command"], template_vars),
+        "target_resource": target,
+        "namespace": template_vars["namespace"],
+        "risk_level": risk,
+        "blast_radius": {
+            "affected_pods": 1,
+            "estimated_downtime": resp.get("estimated_downtime", "unknown"),
+            "related_services": [target],
+            "data_impact": "NONE",
+        },
+        "primary_responsibility": resp.get("responsibility", "COLLAB"),
+        "responsibility_reasoning": resp.get("responsibility_reasoning", ""),
+        # Copy the list: the rule objects are cached process-wide, so handing
+        # out the original would let a caller's mutation leak into the cache.
+        "secondary_teams": list(resp.get("secondary_teams") or []),
+        "optimization_suggestions": optimization,
+        "reasoning": _fill(resp.get("reasoning", ""), template_vars),
+        "deviation_analysis": "規則引擎觸發，監控指標偏離正常基準",
+        "confidence": 0.0,  # fixed at 0.0 for rule matches; never fabricate a value
+        "affected_services": [target],
+        "signoz_correlation": "",
+    }
--- a/apps/api/src/services/openclaw.py
+++ b/apps/api/src/services/openclaw.py
@@ -572,274 +572,65 @@ class OpenClawService:
        signoz_metrics: GoldMetrics | None = None,
    ) -> str:
        """
-        Mock LLM 回應生成器 - 智能降級 (v7.0 含 SignOz)
+        Mock LLM 回應生成器 - 規則引擎降級 (v8.0)

-        根據告警類型和 SignOz 數據動態產生合理的 RCA 分析結果
+        從 alert_rules.yaml 載入規則，取代硬編碼 if/elif。
+        新增規則只需修改 YAML，不需要改代碼重新部署。
+
+        2026-04-09 ogt: 重構為規則引擎，移除 if/elif 硬編碼
        """
+        from src.services.alert_rule_engine import match_rule
+
        time.sleep(random.uniform(0.3, 0.8))  # 模擬思考延遲

-        alert_type = alert_context.get("alert_type", "custom")
-        severity = alert_context.get("severity", "warning")
-        raw_target = alert_context.get("target_resource", "unknown-service")
-        raw_namespace = alert_context.get("namespace", "default")
-        message = alert_context.get("message", "")
-        metrics = alert_context.get("metrics", {})
-        labels = alert_context.get("labels", {})
-        alertname = labels.get("alertname", alert_type)
-
-        # Phase 18.1: 正規化資源名稱 (ADR-016)
-        # 確保 kubectl 指令使用有效的 K8s 名稱
-        normalized = normalize_resource_name(raw_target, raw_namespace)
-        if normalized.is_k8s_resource and normalized.normalized:
-            target = normalized.normalized
-            namespace = normalized.namespace or raw_namespace
-            logger.info(
-                "mock_response_resource_normalized",
-                original=raw_target,
-                normalized=target,
-                namespace=namespace,
-            )
-        else:
-            target = raw_target
-            namespace = raw_namespace
-
        # SignOz 數據整合
-        signoz_summary = ""
        signoz_correlation = "SignOz 數據擷取中..."
        if signoz_metrics:
-            signoz_summary = signoz_metrics.to_summary()
            signoz_correlation = (
                f"RPS={signoz_metrics.rps:.1f} ({signoz_metrics.rps_trend}), "
                f"Error={signoz_metrics.error_rate:.2f}%, "
                f"P99={signoz_metrics.p99_latency_ms:.0f}ms"
            )

-        # 生成調優指令
-        tuning = self.generate_auto_tuning_command(
-            alert_type=alert_type,
-            target_resource=target,
-            namespace=namespace,
-            metrics=signoz_metrics,
-        )
-
-        # 根據告警類型生成專業 RCA + 仲裁
-        # 🔴 2026-03-29 ogt: Mock 響應必須標示為規則匹配，不是 AI 仲裁
-        # 2026-04-09 ogt: 新增 DockerContainerUnhealthy / TargetDown 專屬規則
-        if alertname == "DockerContainerUnhealthy" or "unhealthy" in message.lower():
-            container_name = labels.get("name", target)
-            host = labels.get("instance", "").split(":")[0] or "192.168.0.188"
-            mock_response = {
-                "action_title": f"檢查 Docker 容器 {container_name} 健康狀態",
-                "description": f"⚙️ 規則匹配: Docker 容器 {container_name} ({host}) healthcheck 失敗。常見原因: 應用程式啟動慢、healthcheck 指令錯誤、依賴服務未就緒。{signoz_summary}",
-                "suggested_action": "RESTART_DEPLOYMENT",
-                "kubectl_command": f"ssh {host} 'docker inspect {container_name} --format=\"{{{{.State.Health.Status}}}}\" && docker restart {container_name}'",
-                "target_resource": container_name,
-                "namespace": namespace,
-                "risk_level": "medium",
-                "blast_radius": {
-                    "affected_pods": 1,
-                    "estimated_downtime": "~30s",
-                    "related_services": [container_name],
-                    "data_impact": "NONE",
-                },
-                "primary_responsibility": "INFRA",
-                "responsibility_reasoning": "Docker 容器健康檢查失敗屬基礎設施團隊責任，需確認 healthcheck 設定與容器狀態",
-                "secondary_teams": ["BE"],
-                "optimization_suggestions": [
-                    {
-                        "type": "HEALTHCHECK",
-                        "description": "確認 healthcheck 指令在容器內可執行 (mc/curl 是否存在)",
-                        "kubectl_or_config": f"ssh {host} 'docker exec {container_name} sh -c \"mc ready local || curl -f http://localhost:9000/minio/health/live\"'",
-                    }
-                ],
-                "reasoning": f"[規則匹配] Docker healthcheck 失敗先 restart 恢復服務，同時確認 healthcheck 指令正確。{signoz_correlation}",
-                "deviation_analysis": "容器健康檢查連續失敗，超出允許次數",
-                "confidence": 0.0,
-                "affected_services": [container_name],
-                "signoz_correlation": signoz_correlation,
-            }
-        elif alertname == "TargetDown" or (labels.get("job") and ":" in raw_target):
-            instance = labels.get("instance", raw_target)
-            job = labels.get("job", "exporter")
-            host = instance.split(":")[0]
-            mock_response = {
-                "action_title": f"檢查 {job} ({instance}) 服務存活",
-                "description": f"⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。常見原因: 主機離線、exporter crash、防火牆封鎖。{signoz_summary}",
-                "suggested_action": "RESTART_DEPLOYMENT",
-                "kubectl_command": f"ssh {host} 'systemctl status node_exporter 2>/dev/null || docker ps | grep exporter'",
-                "target_resource": instance,
-                "namespace": namespace,
-                "risk_level": "medium",
-                "blast_radius": {
-                    "affected_pods": 0,
-                    "estimated_downtime": "監控盲區持續中",
-                    "related_services": [job],
-                    "data_impact": "NONE",
-                },
-                "primary_responsibility": "INFRA",
-                "responsibility_reasoning": "Prometheus scrape 目標下線屬基礎設施監控範疇",
-                "secondary_teams": [],
-                "optimization_suggestions": [
-                    {
-                        "type": "MONITORING",
-                        "description": f"確認 {host} 上的 {job} exporter 是否正常運行",
-                        "kubectl_or_config": f"ssh {host} 'ps aux | grep exporter'",
-                    }
-                ],
-                "reasoning": f"[規則匹配] Prometheus target 下線，先 SSH 確認主機存活再重啟 exporter。{signoz_correlation}",
-                "deviation_analysis": "Prometheus scrape 失敗，監控數據中斷",
-                "confidence": 0.0,
-                "affected_services": [instance],
-                "signoz_correlation": signoz_correlation,
-            }
-        elif "oom" in message.lower() or "memory" in alert_type.lower():
-            mock_response = {
-                "action_title": f"刪除異常 Pod {target} (OOMKilled)",
-                "description": f"⚙️ 規則匹配: {target} 發生 OOMKilled，根因為 JVM Heap 配置與 K8s memory limit 不匹配或存在記憶體洩漏。{signoz_summary}",
-                "suggested_action": "DELETE_POD",
-                "kubectl_command": f"kubectl delete pod {target} -n {namespace}",
-                "target_resource": target,
-                "namespace": namespace,
-                "risk_level": "critical" if severity == "critical" else "medium",
-                "blast_radius": {
-                    "affected_pods": 1,
-                    "estimated_downtime": "~30s",
-                    "related_services": ["api-gateway", "downstream-service"],
-                    "data_impact": "NONE"
-                },
-                "primary_responsibility": "BE",
-                "responsibility_reasoning": "OOMKilled 通常源於應用程式記憶體配置不當，屬後端團隊責任範圍",
-                "secondary_teams": ["INFRA"],
-                "optimization_suggestions": [
-                    {
-                        "type": "RESOURCE_LIMIT",
-                        "description": "調整 memory limit 至 1Gi 並確保 JVM -Xmx 不超過 70%",
-                        "kubectl_or_config": f"kubectl set resources deployment/{target.rsplit('-', 2)[0]} -c {target.rsplit('-', 2)[0]} --limits=memory=1Gi -n {namespace}"
-                    },
-                    {
-                        "type": "HPA",
-                        "description": "啟用基於記憶體的 HPA 自動擴展",
-                        "kubectl_or_config": f"kubectl autoscale deployment {target.rsplit('-', 2)[0]} --memory-percent=80 --min=2 --max=5 -n {namespace}"
-                    }
-                ],
-                "reasoning": f"⚙️ Pod OOMKilled 後 ReplicaSet 將自動重建，但需同步修正資源配置防止復發。{signoz_correlation}",
-                "deviation_analysis": f"Memory 使用率 {metrics.get('memory_percent', 99)}%，超出基準線 60% 達 +6.5σ",
-                "confidence": 0.0,  # 🔴 規則匹配，非 AI 仲裁
-                "affected_services": [target, "api-gateway"],
-                "signoz_correlation": signoz_correlation,
-            }
-        elif "cpu" in alert_type.lower() or "high_cpu" in alert_type:
-            # 根據 SignOz RPS 調整策略
-            rps_context = ""
-            if signoz_metrics and signoz_metrics.rps > 50:
-                rps_context = f"SignOz 顯示 RPS={signoz_metrics.rps:.0f}，流量較高，建議配置 HPA。"
-
-            mock_response = {
-                "action_title": f"擴展 {target} 副本數 + 啟用 HPA",
-                "description": f"⚙️ 規則匹配: {target} CPU 使用率過高，根因為流量突增或計算密集任務未配置自動擴展。{rps_context}",
-                "suggested_action": "SCALE_DEPLOYMENT",
-                "kubectl_command": tuning["command"],
-                "target_resource": target,
-                "namespace": namespace,
-                "risk_level": "medium",
-                "blast_radius": {
-                    "affected_pods": 0,
-                    "estimated_downtime": "0",
-                    "related_services": [],
-                    "data_impact": "NONE"
-                },
-                "primary_responsibility": "INFRA",
-                "responsibility_reasoning": "自動擴展策略未配置或閾值過高，屬基礎設施團隊責任",
-                "secondary_teams": ["BE"],
-                "optimization_suggestions": [
-                    {
-                        "type": tuning["type"],
-                        "description": tuning["description"],
-                        "kubectl_or_config": tuning["command"],
-                    },
-                    {
-                        "type": "RESOURCE_LIMIT",
-                        "description": "增加 CPU request 確保 QoS 為 Guaranteed",
-                        "kubectl_or_config": f"kubectl set resources deployment/{target} --requests=cpu=500m --limits=cpu=2000m -n {namespace}"
-                    }
-                ],
-                "reasoning": f"[規則匹配] 水平擴展可即時分散負載，同時建議配置 HPA 防止復發。{signoz_correlation}",
-                "deviation_analysis": f"CPU 使用率 {metrics.get('cpu_percent', 95)}%，超出基準線 50% 達 +4.5σ",
-                "confidence": 0.0,  # 🔴 規則匹配不是 AI 仲裁，信心度設 0
-                "affected_services": [target],
-                "signoz_correlation": signoz_correlation,
-            }
-        elif "http" in alert_type.lower() or "5xx" in message.lower() or "502" in message.lower():
-            mock_response = {
-                "action_title": f"重啟 {target} + 檢查上游服務",
-                "description": f"⚙️ 規則匹配: {target} 產生 HTTP 5xx 錯誤，可能為應用程式例外或上游服務不可達。{signoz_summary}",
-                "suggested_action": "RESTART_DEPLOYMENT",
-                "kubectl_command": f"kubectl rollout restart deployment/{target} -n {namespace}",
-                "target_resource": target,
-                "namespace": namespace,
-                "risk_level": "critical",
-                "blast_radius": {
-                    "affected_pods": 3,
-                    "estimated_downtime": "~1 min",
-                    "related_services": ["nginx-ingress", "upstream-api"],
-                    "data_impact": "NONE"
-                },
-                "primary_responsibility": "COLLAB",
-                "responsibility_reasoning": "HTTP 5xx 可能源於前端路由、後端邏輯或基礎設施，需多團隊協同排查",
-                "secondary_teams": ["FE", "BE", "INFRA"],
-                "optimization_suggestions": [
-                    {
-                        "type": "CIRCUIT_BREAKER",
-                        "description": "配置熔斷器防止故障擴散",
-                        "kubectl_or_config": "# Istio VirtualService outlierDetection 配置"
-                    },
-                    {
-                        "type": "CACHE",
-                        "description": "增加 Redis 緩存減少上游壓力",
-                        "kubectl_or_config": "# 檢查 Redis 連線池配置，建議 maxTotal=50"
-                    }
-                ],
-                "reasoning": f"[規則匹配] HTTP 錯誤需協同排查，先重啟恢復服務同時通知相關團隊。{signoz_correlation}",
-                "deviation_analysis": "錯誤率 5%，超出基準線 0.1% 達 +50σ",
-                "confidence": 0.0,  # 🔴 規則匹配不是 AI 仲裁，信心度設 0
-                "affected_services": [target, "nginx-ingress", "upstream-api"],
-                "signoz_correlation": signoz_correlation,
-            }
-        else:
-            # 通用異常處理
+        mock_response = match_rule(alert_context)
+        if mock_response is None:
+            # match_rule 不應該回傳 None（有通用兜底），但防禦性處理
+            alert_type = alert_context.get("alert_type", "custom")
+            target = alert_context.get("target_resource", "unknown")
+            namespace = alert_context.get("namespace", "awoooi-prod")
            mock_response = {
                "action_title": f"重新啟動 {target} 服務",
-                "description": f"⚙️ 規則匹配: {target} 發生異常: {message[:80]}。需進一步診斷確認根因。{signoz_summary}",
+                "description": f"⚙️ 規則匹配: {target} 發生異常，需進一步診斷確認根因。",
                "suggested_action": "RESTART_DEPLOYMENT",
                "kubectl_command": f"kubectl rollout restart deployment/{target} -n {namespace}",
                "target_resource": target,
                "namespace": namespace,
-                "risk_level": "critical" if severity == "critical" else "medium",
+                "risk_level": "medium",
                "blast_radius": {
-                    "affected_pods": 3,
-                    "estimated_downtime": "~1 min",
-                    "related_services": ["dependent-services"],
-                    "data_impact": "NONE"
+                    "affected_pods": 1,
+                    "estimated_downtime": "5-15 min",
+                    "related_services": [target],
+                    "data_impact": "NONE",
                },
                "primary_responsibility": "COLLAB",
-                "responsibility_reasoning": "告警資訊不足以判定單一責任團隊，建議多團隊協同排查",
+                "responsibility_reasoning": "告警資訊不足，建議多團隊協同排查",
                "secondary_teams": ["BE", "INFRA"],
-                "optimization_suggestions": [
-                    {
-                        "type": tuning["type"],
-                        "description": tuning["description"],
-                        "kubectl_or_config": tuning["command"],
-                    }
-                ],
-                "reasoning": f"[規則匹配] 根據告警 {alert_type} 先重啟恢復服務，同時安排深入診斷。{signoz_correlation}",
+                "optimization_suggestions": [],
+                "reasoning": f"[規則匹配] 根據告警 {alert_type} 先重啟恢復服務。",
                "deviation_analysis": "監控指標顯示異常偏離基準線",
-                "confidence": 0.0,  # 🔴 規則匹配不是 AI 仲裁，信心度設 0
+                "confidence": 0.0,
                "affected_services": [target],
                "signoz_correlation": signoz_correlation,
            }

+        # 補充 SignOz 關聯資訊（規則引擎不持有 signoz_metrics）
+        mock_response["signoz_correlation"] = signoz_correlation
+        if signoz_metrics:
+            mock_response["description"] += f" {signoz_metrics.to_summary()}"
+
        logger.info(
            "mock_llm_response_generated",
+            rule_id=mock_response.get("rule_id", "unknown"),
            action_title=mock_response["action_title"],
            risk_level=mock_response["risk_level"],
            primary_responsibility=mock_response["primary_responsibility"],