---
# File: apps/api/alert_rules.yaml
#
# AWOOOI OpenClaw alert rule definitions
# ============================================================
# Format:
#   match.alertname  : exact match on the Prometheus alertname (list = OR)
#   match.alert_type : keyword contained in alert_type (list = OR, substring,
#                      case-insensitive)
#   match.message    : keyword contained in message (list = OR, substring,
#                      case-insensitive)
#   A rule matches as soon as ANY ONE of the three conditions matches.
#   response.*       : response templates; supported variables are
#                      {target} {host} {container} {instance} {job} {namespace}
#   responsibility   : FE / BE / INFRA / DB / COLLAB
#   risk             : low / medium / critical
#   confidence       : always 0.0 for rule matches (never fabricate a value)
#
# Changing rules : no redeployment needed; rules are loaded once per process
#                  and cached, so restart the API Pod to reload them.
# Adding rules   : append to the `rules` list; a smaller priority wins.
# 2026-04-09 ogt : first version, extracted from
#                  openclaw.py `_generate_mock_response`.
# ============================================================

version: '1.0.0'
updated_at: '2026-04-09'

rules:
  # ── Docker / host layer ─────────────────────────────────────

  - id: docker_container_unhealthy
    priority: 10
    description: 'Docker 容器 healthcheck 失敗'
    match:
      alertname:
        - DockerContainerUnhealthy
      message:
        - unhealthy
        - health check
        - healthcheck
    response:
      action_title: '檢查 Docker 容器 {container} 健康狀態'
      description: '⚙️ 規則匹配: Docker 容器 {container} ({host}) healthcheck 失敗。常見原因: 應用程式啟動慢、healthcheck 指令錯誤、依賴服務未就緒。'
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: "ssh {host} 'docker inspect {container} --format=\"{{.State.Health.Status}}\" && docker restart {container}'"
      estimated_downtime: '~30s'
      risk: medium
      responsibility: INFRA
      responsibility_reasoning: 'Docker 容器健康檢查失敗屬基礎設施團隊責任,需確認 healthcheck 設定與容器狀態'
      secondary_teams:
        - BE
      optimization:
        - type: HEALTHCHECK
          description: '確認 healthcheck 指令在容器內可執行 (mc/curl 是否存在)'
          command: "ssh {host} 'docker exec {container} sh -c \"mc ready local 2>/dev/null || curl -sf http://localhost:9000/minio/health/live\"'"
      reasoning: '[規則匹配] Docker healthcheck 失敗先 restart 恢復服務,同時確認 healthcheck 指令正確。'

  - id: target_down
    priority: 20
    description: 'Prometheus scrape target 下線'
    match:
      alertname:
        - TargetDown
        - InstanceDown
    response:
      action_title: '確認 {job} ({instance}) 服務存活'
      description: '⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。常見原因: 主機離線、exporter crash、防火牆封鎖。'
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: "ssh {host} 'systemctl status node_exporter 2>/dev/null || docker ps | grep exporter'"
      estimated_downtime: '監控盲區持續中'
      risk: medium
      responsibility: INFRA
      responsibility_reasoning: 'Prometheus scrape 目標下線屬基礎設施監控範疇'
      secondary_teams: []
      optimization:
        - type: MONITORING
          description: '確認 exporter 進程是否存活'
          command: "ssh {host} 'ps aux | grep exporter | grep -v grep'"
      reasoning: '[規則匹配] Prometheus target 下線,先 SSH 確認主機存活再重啟 exporter。'

  # ── Kubernetes Pod layer ────────────────────────────────────
  # NOTE(review): the engine resolves {target} to the `pod` label when one is
  # present, so `deployment/{target}` below may receive a pod name rather than
  # a deployment name. Confirm against real alert payloads.

  - id: oom_killed
    priority: 30
    description: 'Pod OOMKilled 記憶體不足'
    match:
      alert_type:
        - memory
      message:
        - oomkilled
        - oom
        - out of memory
    response:
      action_title: '刪除異常 Pod {target} (OOMKilled)'
      description: '⚙️ 規則匹配: {target} 發生 OOMKilled,根因為 JVM Heap 配置與 K8s memory limit 不匹配或存在記憶體洩漏。'
      suggested_action: DELETE_POD
      kubectl_command: 'kubectl delete pod {target} -n {namespace}'
      estimated_downtime: '~30s'
      risk: critical
      responsibility: BE
      responsibility_reasoning: 'OOMKilled 通常源於應用程式記憶體配置不當,屬後端團隊責任範圍'
      secondary_teams:
        - INFRA
      optimization:
        - type: RESOURCE_LIMIT
          description: '調整 memory limit 至 1Gi 並確保 JVM -Xmx 不超過 70%'
          command: 'kubectl set resources deployment/{target} -c {target} --limits=memory=1Gi -n {namespace}'
        - type: HPA
          description: '啟用基於記憶體的 HPA 自動擴展'
          command: 'kubectl autoscale deployment {target} --memory-percent=80 --min=2 --max=5 -n {namespace}'
      reasoning: '[規則匹配] Pod OOMKilled 後 ReplicaSet 將自動重建,但需同步修正資源配置防止復發。'

  - id: high_cpu
    priority: 40
    description: 'Pod CPU 使用率過高'
    match:
      alert_type:
        - cpu
        - high_cpu
    response:
      action_title: '擴展 {target} 副本數 + 啟用 HPA'
      description: '⚙️ 規則匹配: {target} CPU 使用率過高,根因為流量突增或計算密集任務未配置自動擴展。'
      suggested_action: SCALE_DEPLOYMENT
      kubectl_command: 'kubectl scale deployment {target} --replicas=3 -n {namespace}'
      estimated_downtime: '0'
      risk: medium
      responsibility: INFRA
      responsibility_reasoning: '自動擴展策略未配置或閾值過高,屬基礎設施團隊責任'
      secondary_teams:
        - BE
      optimization:
        - type: RESOURCE_LIMIT
          description: '增加 CPU request 確保 QoS 為 Guaranteed'
          command: 'kubectl set resources deployment/{target} --requests=cpu=500m --limits=cpu=2000m -n {namespace}'
      reasoning: '[規則匹配] 水平擴展可即時分散負載,同時建議配置 HPA 防止復發。'

  - id: http_5xx
    priority: 50
    description: 'HTTP 5xx 錯誤率過高'
    match:
      alert_type:
        - http
      message:
        # Quoted so the status codes stay strings instead of integers.
        - '5xx'
        - '502'
        - '503'
        - '500'
    response:
      action_title: '重啟 {target} + 檢查上游服務'
      description: '⚙️ 規則匹配: {target} 產生 HTTP 5xx 錯誤,可能為應用程式例外或上游服務不可達。'
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: 'kubectl rollout restart deployment/{target} -n {namespace}'
      estimated_downtime: '~1 min'
      risk: critical
      responsibility: COLLAB
      responsibility_reasoning: 'HTTP 5xx 可能源於前端路由、後端邏輯或基礎設施,需多團隊協同排查'
      secondary_teams:
        - FE
        - BE
        - INFRA
      optimization:
        - type: CIRCUIT_BREAKER
          description: '配置熔斷器防止故障擴散'
          command: '# Istio VirtualService outlierDetection 配置'
      reasoning: '[規則匹配] HTTP 錯誤需協同排查,先重啟恢復服務同時通知相關團隊。'

  - id: pod_crash
    priority: 60
    description: 'Pod CrashLoopBackOff'
    match:
      alert_type:
        - pod_crash
        - crash
      message:
        - crashloop
        - crash
        - backoff
    response:
      action_title: '診斷 {target} CrashLoop 根因'
      description: '⚙️ 規則匹配: {target} 進入 CrashLoopBackOff,需檢查啟動錯誤日誌。'
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: 'kubectl logs {target} -n {namespace} --previous --tail=50'
      estimated_downtime: '依根因而定'
      risk: critical
      responsibility: BE
      responsibility_reasoning: 'Pod crash 通常源於應用程式啟動錯誤,屬後端團隊責任'
      secondary_teams:
        - INFRA
      optimization:
        - type: LIVENESS_PROBE
          description: '調整 liveness probe 初始延遲防止誤殺'
          command: '# 調整 initialDelaySeconds >= 應用啟動時間'
      reasoning: '[規則匹配] 先查 previous log 確認 crash 原因,再決定修復策略。'
調整 initialDelaySeconds >= 應用啟動時間" + reasoning: "[規則匹配] 先查 previous log 確認 crash 原因,再決定修復策略。" + + # ── 通用兜底 ──────────────────────────────────────────────── + + - id: generic_fallback + priority: 999 + description: 通用兜底規則 (無法匹配的告警) + match: + alertname: + - "*" + response: + action_title: "重新啟動 {target} 服務" + description: "⚙️ 規則匹配: {target} 發生異常,需進一步診斷確認根因。" + suggested_action: RESTART_DEPLOYMENT + kubectl_command: "kubectl rollout restart deployment/{target} -n {namespace}" + estimated_downtime: "5-15 min" + risk: medium + responsibility: COLLAB + responsibility_reasoning: "告警資訊不足以判定單一責任團隊,建議多團隊協同排查" + secondary_teams: [BE, INFRA] + optimization: [] + reasoning: "[規則匹配] 根據告警先重啟恢復服務,同時安排深入診斷。" diff --git a/apps/api/src/services/alert_rule_engine.py b/apps/api/src/services/alert_rule_engine.py new file mode 100644 index 00000000..20d4c057 --- /dev/null +++ b/apps/api/src/services/alert_rule_engine.py @@ -0,0 +1,195 @@ +""" +OpenClaw 告警規則匹配引擎 +============================================================ +從 alert_rules.yaml 載入規則,取代 openclaw.py 中硬編碼的 if/elif 規則匹配。 + +設計原則: +- 規則在 YAML 定義,不需要改 Python 代碼 +- 匹配邏輯: alertname 完全匹配 > alert_type 部分匹配 > message 關鍵字 +- priority 越小越優先,999 = 通用兜底 +- 變數替換: {target} {host} {container} {instance} {job} {namespace} + +2026-04-09 ogt: 初版,從 openclaw.py _generate_mock_response if/elif 抽出 +""" + +from __future__ import annotations + +import re +from functools import lru_cache +from pathlib import Path +from typing import Any + +import structlog +import yaml + +logger = structlog.get_logger(__name__) + +RULES_FILE = Path(__file__).parent.parent.parent / "alert_rules.yaml" + +# ── 變數提取 ──────────────────────────────────────────────── + + +def _extract_vars(alert_context: dict) -> dict[str, str]: + """從 alert_context 提取模板變數""" + labels = alert_context.get("labels", {}) + raw_target = alert_context.get("target_resource", "unknown") + instance = labels.get("instance", raw_target) + host = instance.split(":")[0] if ":" in instance else instance 
+ container = labels.get("name", labels.get("container", raw_target)) + job = labels.get("job", "exporter") + namespace = alert_context.get("namespace", "awoooi-prod") + + # target: 優先用 pod label,否則用 raw_target(排除純 IP:port 和 alertname) + pod = labels.get("pod", "") + if pod: + target = pod + elif ":" in raw_target or raw_target == alert_context.get("labels", {}).get("alertname", ""): + # raw_target 是 IP:port 或 alertname — 用 job 或 container 代替 + target = container if container != raw_target else job + else: + target = raw_target + + return { + "target": target, + "host": host, + "container": container, + "instance": instance, + "job": job, + "namespace": namespace, + } + + +def _fill(template: str, vars: dict[str, str]) -> str: + """填充模板變數,保留未知變數原樣""" + try: + return template.format_map(vars) + except (KeyError, ValueError): + return template + + +# ── 規則載入 ──────────────────────────────────────────────── + + +@lru_cache(maxsize=1) +def _load_rules() -> list[dict]: + """載入並快取規則(進程內不重載,重啟 Pod 才更新)""" + if not RULES_FILE.exists(): + logger.warning("alert_rules_file_not_found", path=str(RULES_FILE)) + return [] + with RULES_FILE.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + rules = sorted(data.get("rules", []), key=lambda r: r.get("priority", 999)) + logger.info("alert_rules_loaded", count=len(rules), path=str(RULES_FILE)) + return rules + + +# ── 匹配邏輯 ──────────────────────────────────────────────── + + +def _matches(rule: dict, alertname: str, alert_type: str, message: str) -> bool: + """判斷規則是否匹配""" + match = rule.get("match", {}) + + # alertname 完全匹配 + alertnames = match.get("alertname", []) + if alertnames and alertnames != ["*"]: + if alertname in alertnames: + return True + + # alert_type 部分匹配 + for kw in match.get("alert_type", []): + if kw.lower() in alert_type.lower(): + return True + + # message 關鍵字匹配(不分大小寫) + msg_lower = message.lower() + for kw in match.get("message", []): + if kw.lower() in msg_lower: + return True + + return False + + 
+def _is_generic(rule: dict) -> bool: + alertnames = rule.get("match", {}).get("alertname", []) + return alertnames == ["*"] + + +# ── 公開 API ──────────────────────────────────────────────── + + +def match_rule(alert_context: dict) -> dict[str, Any] | None: + """ + 根據 alert_context 匹配規則,回傳填充後的 response dict。 + + Returns: + 匹配到的規則 response,未匹配則回傳 None。 + 呼叫方應自行處理 None(走 AI 分析或通用兜底)。 + """ + labels = alert_context.get("labels", {}) + alertname = labels.get("alertname", alert_context.get("alert_type", "custom")) + alert_type = alert_context.get("alert_type", "custom") + message = alert_context.get("message", "") + severity = alert_context.get("severity", "warning") + + rules = _load_rules() + vars = _extract_vars(alert_context) + + matched_rule = None + for rule in rules: + if _is_generic(rule): + continue # 通用兜底最後才用 + if _matches(rule, alertname, alert_type, message): + matched_rule = rule + break + + # 未匹配到具體規則 → 用通用兜底 + if matched_rule is None: + for rule in rules: + if _is_generic(rule): + matched_rule = rule + break + + if matched_rule is None: + return None + + resp = matched_rule["response"] + risk = resp.get("risk", "medium") + # severity=critical 強制升級風險等級 + if severity == "critical" and risk == "medium": + risk = "critical" + + optimization = [ + { + "type": o["type"], + "description": _fill(o.get("description", ""), vars), + "kubectl_or_config": _fill(o.get("command", ""), vars), + } + for o in resp.get("optimization", []) + ] + + return { + "rule_id": matched_rule["id"], + "action_title": _fill(resp["action_title"], vars), + "description": _fill(resp["description"], vars), + "suggested_action": resp["suggested_action"], + "kubectl_command": _fill(resp["kubectl_command"], vars), + "target_resource": vars["target"], + "namespace": vars["namespace"], + "risk_level": risk, + "blast_radius": { + "affected_pods": 1, + "estimated_downtime": resp.get("estimated_downtime", "unknown"), + "related_services": [vars["target"]], + "data_impact": "NONE", + }, + 
"primary_responsibility": resp.get("responsibility", "COLLAB"), + "responsibility_reasoning": resp.get("responsibility_reasoning", ""), + "secondary_teams": resp.get("secondary_teams", []), + "optimization_suggestions": optimization, + "reasoning": _fill(resp.get("reasoning", ""), vars), + "deviation_analysis": "規則引擎觸發,監控指標偏離正常基準", + "confidence": 0.0, # 🔴 規則匹配固定 0.0,禁止偽造 + "affected_services": [vars["target"]], + "signoz_correlation": "", + } diff --git a/apps/api/src/services/openclaw.py b/apps/api/src/services/openclaw.py index ba1e38e0..771debbe 100644 --- a/apps/api/src/services/openclaw.py +++ b/apps/api/src/services/openclaw.py @@ -572,274 +572,65 @@ class OpenClawService: signoz_metrics: GoldMetrics | None = None, ) -> str: """ - Mock LLM 回應生成器 - 智能降級 (v7.0 含 SignOz) + Mock LLM 回應生成器 - 規則引擎降級 (v8.0) - 根據告警類型和 SignOz 數據動態產生合理的 RCA 分析結果 + 從 alert_rules.yaml 載入規則,取代硬編碼 if/elif。 + 新增規則只需修改 YAML,不需要改代碼重新部署。 + + 2026-04-09 ogt: 重構為規則引擎,移除 if/elif 硬編碼 """ + from src.services.alert_rule_engine import match_rule + time.sleep(random.uniform(0.3, 0.8)) # 模擬思考延遲 - alert_type = alert_context.get("alert_type", "custom") - severity = alert_context.get("severity", "warning") - raw_target = alert_context.get("target_resource", "unknown-service") - raw_namespace = alert_context.get("namespace", "default") - message = alert_context.get("message", "") - metrics = alert_context.get("metrics", {}) - labels = alert_context.get("labels", {}) - alertname = labels.get("alertname", alert_type) - - # Phase 18.1: 正規化資源名稱 (ADR-016) - # 確保 kubectl 指令使用有效的 K8s 名稱 - normalized = normalize_resource_name(raw_target, raw_namespace) - if normalized.is_k8s_resource and normalized.normalized: - target = normalized.normalized - namespace = normalized.namespace or raw_namespace - logger.info( - "mock_response_resource_normalized", - original=raw_target, - normalized=target, - namespace=namespace, - ) - else: - target = raw_target - namespace = raw_namespace - # SignOz 數據整合 - signoz_summary = "" 
signoz_correlation = "SignOz 數據擷取中..." if signoz_metrics: - signoz_summary = signoz_metrics.to_summary() signoz_correlation = ( f"RPS={signoz_metrics.rps:.1f} ({signoz_metrics.rps_trend}), " f"Error={signoz_metrics.error_rate:.2f}%, " f"P99={signoz_metrics.p99_latency_ms:.0f}ms" ) - # 生成調優指令 - tuning = self.generate_auto_tuning_command( - alert_type=alert_type, - target_resource=target, - namespace=namespace, - metrics=signoz_metrics, - ) - - # 根據告警類型生成專業 RCA + 仲裁 - # 🔴 2026-03-29 ogt: Mock 響應必須標示為規則匹配,不是 AI 仲裁 - # 2026-04-09 ogt: 新增 DockerContainerUnhealthy / TargetDown 專屬規則 - if alertname == "DockerContainerUnhealthy" or "unhealthy" in message.lower(): - container_name = labels.get("name", target) - host = labels.get("instance", "").split(":")[0] or "192.168.0.188" - mock_response = { - "action_title": f"檢查 Docker 容器 {container_name} 健康狀態", - "description": f"⚙️ 規則匹配: Docker 容器 {container_name} ({host}) healthcheck 失敗。常見原因: 應用程式啟動慢、healthcheck 指令錯誤、依賴服務未就緒。{signoz_summary}", - "suggested_action": "RESTART_DEPLOYMENT", - "kubectl_command": f"ssh {host} 'docker inspect {container_name} --format=\"{{{{.State.Health.Status}}}}\" && docker restart {container_name}'", - "target_resource": container_name, - "namespace": namespace, - "risk_level": "medium", - "blast_radius": { - "affected_pods": 1, - "estimated_downtime": "~30s", - "related_services": [container_name], - "data_impact": "NONE", - }, - "primary_responsibility": "INFRA", - "responsibility_reasoning": "Docker 容器健康檢查失敗屬基礎設施團隊責任,需確認 healthcheck 設定與容器狀態", - "secondary_teams": ["BE"], - "optimization_suggestions": [ - { - "type": "HEALTHCHECK", - "description": "確認 healthcheck 指令在容器內可執行 (mc/curl 是否存在)", - "kubectl_or_config": f"ssh {host} 'docker exec {container_name} sh -c \"mc ready local || curl -f http://localhost:9000/minio/health/live\"'", - } - ], - "reasoning": f"[規則匹配] Docker healthcheck 失敗先 restart 恢復服務,同時確認 healthcheck 指令正確。{signoz_correlation}", - "deviation_analysis": "容器健康檢查連續失敗,超出允許次數", - 
"confidence": 0.0, - "affected_services": [container_name], - "signoz_correlation": signoz_correlation, - } - elif alertname == "TargetDown" or (labels.get("job") and ":" in raw_target): - instance = labels.get("instance", raw_target) - job = labels.get("job", "exporter") - host = instance.split(":")[0] - mock_response = { - "action_title": f"檢查 {job} ({instance}) 服務存活", - "description": f"⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。常見原因: 主機離線、exporter crash、防火牆封鎖。{signoz_summary}", - "suggested_action": "RESTART_DEPLOYMENT", - "kubectl_command": f"ssh {host} 'systemctl status node_exporter 2>/dev/null || docker ps | grep exporter'", - "target_resource": instance, - "namespace": namespace, - "risk_level": "medium", - "blast_radius": { - "affected_pods": 0, - "estimated_downtime": "監控盲區持續中", - "related_services": [job], - "data_impact": "NONE", - }, - "primary_responsibility": "INFRA", - "responsibility_reasoning": "Prometheus scrape 目標下線屬基礎設施監控範疇", - "secondary_teams": [], - "optimization_suggestions": [ - { - "type": "MONITORING", - "description": f"確認 {host} 上的 {job} exporter 是否正常運行", - "kubectl_or_config": f"ssh {host} 'ps aux | grep exporter'", - } - ], - "reasoning": f"[規則匹配] Prometheus target 下線,先 SSH 確認主機存活再重啟 exporter。{signoz_correlation}", - "deviation_analysis": "Prometheus scrape 失敗,監控數據中斷", - "confidence": 0.0, - "affected_services": [instance], - "signoz_correlation": signoz_correlation, - } - elif "oom" in message.lower() or "memory" in alert_type.lower(): - mock_response = { - "action_title": f"刪除異常 Pod {target} (OOMKilled)", - "description": f"⚙️ 規則匹配: {target} 發生 OOMKilled,根因為 JVM Heap 配置與 K8s memory limit 不匹配或存在記憶體洩漏。{signoz_summary}", - "suggested_action": "DELETE_POD", - "kubectl_command": f"kubectl delete pod {target} -n {namespace}", - "target_resource": target, - "namespace": namespace, - "risk_level": "critical" if severity == "critical" else "medium", - "blast_radius": { - "affected_pods": 1, - "estimated_downtime": "~30s", - 
"related_services": ["api-gateway", "downstream-service"], - "data_impact": "NONE" - }, - "primary_responsibility": "BE", - "responsibility_reasoning": "OOMKilled 通常源於應用程式記憶體配置不當,屬後端團隊責任範圍", - "secondary_teams": ["INFRA"], - "optimization_suggestions": [ - { - "type": "RESOURCE_LIMIT", - "description": "調整 memory limit 至 1Gi 並確保 JVM -Xmx 不超過 70%", - "kubectl_or_config": f"kubectl set resources deployment/{target.rsplit('-', 2)[0]} -c {target.rsplit('-', 2)[0]} --limits=memory=1Gi -n {namespace}" - }, - { - "type": "HPA", - "description": "啟用基於記憶體的 HPA 自動擴展", - "kubectl_or_config": f"kubectl autoscale deployment {target.rsplit('-', 2)[0]} --memory-percent=80 --min=2 --max=5 -n {namespace}" - } - ], - "reasoning": f"⚙️ Pod OOMKilled 後 ReplicaSet 將自動重建,但需同步修正資源配置防止復發。{signoz_correlation}", - "deviation_analysis": f"Memory 使用率 {metrics.get('memory_percent', 99)}%,超出基準線 60% 達 +6.5σ", - "confidence": 0.0, # 🔴 規則匹配,非 AI 仲裁 - "affected_services": [target, "api-gateway"], - "signoz_correlation": signoz_correlation, - } - elif "cpu" in alert_type.lower() or "high_cpu" in alert_type: - # 根據 SignOz RPS 調整策略 - rps_context = "" - if signoz_metrics and signoz_metrics.rps > 50: - rps_context = f"SignOz 顯示 RPS={signoz_metrics.rps:.0f},流量較高,建議配置 HPA。" - - mock_response = { - "action_title": f"擴展 {target} 副本數 + 啟用 HPA", - "description": f"⚙️ 規則匹配: {target} CPU 使用率過高,根因為流量突增或計算密集任務未配置自動擴展。{rps_context}", - "suggested_action": "SCALE_DEPLOYMENT", - "kubectl_command": tuning["command"], - "target_resource": target, - "namespace": namespace, - "risk_level": "medium", - "blast_radius": { - "affected_pods": 0, - "estimated_downtime": "0", - "related_services": [], - "data_impact": "NONE" - }, - "primary_responsibility": "INFRA", - "responsibility_reasoning": "自動擴展策略未配置或閾值過高,屬基礎設施團隊責任", - "secondary_teams": ["BE"], - "optimization_suggestions": [ - { - "type": tuning["type"], - "description": tuning["description"], - "kubectl_or_config": tuning["command"], - }, - { - "type": 
"RESOURCE_LIMIT", - "description": "增加 CPU request 確保 QoS 為 Guaranteed", - "kubectl_or_config": f"kubectl set resources deployment/{target} --requests=cpu=500m --limits=cpu=2000m -n {namespace}" - } - ], - "reasoning": f"[規則匹配] 水平擴展可即時分散負載,同時建議配置 HPA 防止復發。{signoz_correlation}", - "deviation_analysis": f"CPU 使用率 {metrics.get('cpu_percent', 95)}%,超出基準線 50% 達 +4.5σ", - "confidence": 0.0, # 🔴 規則匹配不是 AI 仲裁,信心度設 0 - "affected_services": [target], - "signoz_correlation": signoz_correlation, - } - elif "http" in alert_type.lower() or "5xx" in message.lower() or "502" in message.lower(): - mock_response = { - "action_title": f"重啟 {target} + 檢查上游服務", - "description": f"⚙️ 規則匹配: {target} 產生 HTTP 5xx 錯誤,可能為應用程式例外或上游服務不可達。{signoz_summary}", - "suggested_action": "RESTART_DEPLOYMENT", - "kubectl_command": f"kubectl rollout restart deployment/{target} -n {namespace}", - "target_resource": target, - "namespace": namespace, - "risk_level": "critical", - "blast_radius": { - "affected_pods": 3, - "estimated_downtime": "~1 min", - "related_services": ["nginx-ingress", "upstream-api"], - "data_impact": "NONE" - }, - "primary_responsibility": "COLLAB", - "responsibility_reasoning": "HTTP 5xx 可能源於前端路由、後端邏輯或基礎設施,需多團隊協同排查", - "secondary_teams": ["FE", "BE", "INFRA"], - "optimization_suggestions": [ - { - "type": "CIRCUIT_BREAKER", - "description": "配置熔斷器防止故障擴散", - "kubectl_or_config": "# Istio VirtualService outlierDetection 配置" - }, - { - "type": "CACHE", - "description": "增加 Redis 緩存減少上游壓力", - "kubectl_or_config": "# 檢查 Redis 連線池配置,建議 maxTotal=50" - } - ], - "reasoning": f"[規則匹配] HTTP 錯誤需協同排查,先重啟恢復服務同時通知相關團隊。{signoz_correlation}", - "deviation_analysis": "錯誤率 5%,超出基準線 0.1% 達 +50σ", - "confidence": 0.0, # 🔴 規則匹配不是 AI 仲裁,信心度設 0 - "affected_services": [target, "nginx-ingress", "upstream-api"], - "signoz_correlation": signoz_correlation, - } - else: - # 通用異常處理 + mock_response = match_rule(alert_context) + if mock_response is None: + # match_rule 不應該回傳 None(有通用兜底),但防禦性處理 + alert_type = 
alert_context.get("alert_type", "custom") + target = alert_context.get("target_resource", "unknown") + namespace = alert_context.get("namespace", "awoooi-prod") mock_response = { "action_title": f"重新啟動 {target} 服務", - "description": f"⚙️ 規則匹配: {target} 發生異常: {message[:80]}。需進一步診斷確認根因。{signoz_summary}", + "description": f"⚙️ 規則匹配: {target} 發生異常,需進一步診斷確認根因。", "suggested_action": "RESTART_DEPLOYMENT", "kubectl_command": f"kubectl rollout restart deployment/{target} -n {namespace}", "target_resource": target, "namespace": namespace, - "risk_level": "critical" if severity == "critical" else "medium", + "risk_level": "medium", "blast_radius": { - "affected_pods": 3, - "estimated_downtime": "~1 min", - "related_services": ["dependent-services"], - "data_impact": "NONE" + "affected_pods": 1, + "estimated_downtime": "5-15 min", + "related_services": [target], + "data_impact": "NONE", }, "primary_responsibility": "COLLAB", - "responsibility_reasoning": "告警資訊不足以判定單一責任團隊,建議多團隊協同排查", + "responsibility_reasoning": "告警資訊不足,建議多團隊協同排查", "secondary_teams": ["BE", "INFRA"], - "optimization_suggestions": [ - { - "type": tuning["type"], - "description": tuning["description"], - "kubectl_or_config": tuning["command"], - } - ], - "reasoning": f"[規則匹配] 根據告警 {alert_type} 先重啟恢復服務,同時安排深入診斷。{signoz_correlation}", + "optimization_suggestions": [], + "reasoning": f"[規則匹配] 根據告警 {alert_type} 先重啟恢復服務。", "deviation_analysis": "監控指標顯示異常偏離基準線", - "confidence": 0.0, # 🔴 規則匹配不是 AI 仲裁,信心度設 0 + "confidence": 0.0, "affected_services": [target], "signoz_correlation": signoz_correlation, } + # 補充 SignOz 關聯資訊(規則引擎不持有 signoz_metrics) + mock_response["signoz_correlation"] = signoz_correlation + if signoz_metrics: + mock_response["description"] += f" {signoz_metrics.to_summary()}" + logger.info( "mock_llm_response_generated", + rule_id=mock_response.get("rule_id", "unknown"), action_title=mock_response["action_title"], risk_level=mock_response["risk_level"], 
primary_responsibility=mock_response["primary_responsibility"],