feat(openclaw): 告警規則引擎 — alert_rules.yaml 取代硬編碼 if/elif
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

- 新增 alert_rules.yaml: 6 條規則 (docker/target_down/oom/cpu/5xx/crash) + 通用兜底
- 新增 alert_rule_engine.py: YAML 載入、匹配邏輯、變數填充
- openclaw.py _generate_mock_response: 重構為呼叫規則引擎 (v8.0)
- 新增規則只需修改 YAML,重啟 Pod 即可,不需改代碼
- 2026-04-09 ogt: 架構重構

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-09 09:05:23 +08:00
parent 7e327c806e
commit d1ede7f989
3 changed files with 423 additions and 238 deletions

199
apps/api/alert_rules.yaml Normal file
View File

@@ -0,0 +1,199 @@
# AWOOOI OpenClaw 告警規則匹配引擎
# ============================================================
# 格式說明:
# match.alertname : Prometheus alertname 完全匹配 (list = OR)
# match.alert_type : alert_type 關鍵字 (list = OR, 部分匹配)
# match.message : message 關鍵字 (list = OR, 部分匹配, 不分大小寫)
# response.* : 回應模板,支援變數 {target} {host} {container} {instance} {job} {namespace}
# responsibility : FE / BE / INFRA / DB / COLLAB
# risk : low / medium / critical
# confidence : 0.0 (規則匹配固定值,禁止偽造)
#
# 修改規則: 不需要改 Python 代碼,重啟 API Pod 後生效 (規則於進程內快取,並非熱載入)
# 新增規則: 在 rules 清單中加入即可,載入時會依 priority 排序;priority 越小越優先
# 2026-04-09 ogt: 初版,從 openclaw.py _generate_mock_response 抽出
# ============================================================
version: "1.0.0"
updated_at: "2026-04-09"
rules:
# ── Docker / Host 層 ────────────────────────────────────────
# Rule: Docker container healthcheck failure.
# Matches when alertname is exactly DockerContainerUnhealthy, or when the
# alert message contains any of the keywords below (case-insensitive substring).
- id: docker_container_unhealthy
  priority: 10
  description: Docker 容器 healthcheck 失敗
  match:
    alertname:
      - DockerContainerUnhealthy
    message:
      - unhealthy
      - health check
      - healthcheck
  response:
    action_title: "檢查 Docker 容器 {container} 健康狀態"
    description: "⚙️ 規則匹配: Docker 容器 {container} ({host}) healthcheck 失敗。常見原因: 應用程式啟動慢、healthcheck 指令錯誤、依賴服務未就緒。"
    suggested_action: RESTART_DEPLOYMENT
    # Templates are rendered with Python str.format_map, which collapses "{{"
    # to "{" and "}}" to "}". The Go-template delimiters that
    # `docker inspect --format` needs are therefore doubled once more so the
    # rendered command still contains a literal {{.State.Health.Status}}.
    kubectl_command: "ssh {host} 'docker inspect {container} --format=\"{{{{.State.Health.Status}}}}\" && docker restart {container}'"
    estimated_downtime: "~30s"
    risk: medium
    responsibility: INFRA
    responsibility_reasoning: "Docker 容器健康檢查失敗屬基礎設施團隊責任,需確認 healthcheck 設定與容器狀態"
    secondary_teams: [BE]
    optimization:
      - type: HEALTHCHECK
        description: "確認 healthcheck 指令在容器內可執行 (mc/curl 是否存在)"
        command: "ssh {host} 'docker exec {container} sh -c \"mc ready local 2>/dev/null || curl -sf http://localhost:9000/minio/health/live\"'"
    reasoning: "[規則匹配] Docker healthcheck 失敗先 restart 恢復服務,同時確認 healthcheck 指令正確。"
# Rule: Prometheus scrape target is down.
# Matches only on an exact alertname (TargetDown or InstanceDown); it defines
# no alert_type or message keywords.
- id: target_down
  priority: 20
  description: Prometheus scrape target 下線
  match:
    alertname:
      - TargetDown
      - InstanceDown
  response:
    action_title: '確認 {job} ({instance}) 服務存活'
    description: '⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。常見原因: 主機離線、exporter crash、防火牆封鎖。'
    suggested_action: RESTART_DEPLOYMENT
    # Contains single quotes, so it stays double-quoted.
    kubectl_command: "ssh {host} 'systemctl status node_exporter 2>/dev/null || docker ps | grep exporter'"
    estimated_downtime: '監控盲區持續中'
    risk: medium
    responsibility: INFRA
    responsibility_reasoning: 'Prometheus scrape 目標下線屬基礎設施監控範疇'
    secondary_teams: []
    optimization:
      - type: MONITORING
        description: '確認 exporter 進程是否存活'
        command: "ssh {host} 'ps aux | grep exporter | grep -v grep'"
    reasoning: '[規則匹配] Prometheus target 下線,先 SSH 確認主機存活再重啟 exporter。'
# ── K8s Pod 層 ──────────────────────────────────────────────
# Rule: Pod was OOMKilled.
# Matches when alert_type contains "memory", or when the message contains any
# keyword below (case-insensitive substring).
# NOTE(review): the engine resolves {target} to the `pod` label when one is
# present, so the deployment/{target} commands under `optimization` may be
# given a pod name rather than a Deployment name - confirm before relying on them.
- id: oom_killed
  priority: 30
  description: Pod OOMKilled 記憶體不足
  match:
    alert_type:
      - memory
    message:
      - oomkilled
      - oom
      - out of memory
  response:
    action_title: '刪除異常 Pod {target} (OOMKilled)'
    description: '⚙️ 規則匹配: {target} 發生 OOMKilled根因為 JVM Heap 配置與 K8s memory limit 不匹配或存在記憶體洩漏。'
    suggested_action: DELETE_POD
    kubectl_command: 'kubectl delete pod {target} -n {namespace}'
    estimated_downtime: '~30s'
    risk: critical
    responsibility: BE
    responsibility_reasoning: 'OOMKilled 通常源於應用程式記憶體配置不當,屬後端團隊責任範圍'
    secondary_teams:
      - INFRA
    optimization:
      - type: RESOURCE_LIMIT
        description: '調整 memory limit 至 1Gi 並確保 JVM -Xmx 不超過 70%'
        command: 'kubectl set resources deployment/{target} -c {target} --limits=memory=1Gi -n {namespace}'
      - type: HPA
        description: '啟用基於記憶體的 HPA 自動擴展'
        command: 'kubectl autoscale deployment {target} --memory-percent=80 --min=2 --max=5 -n {namespace}'
    reasoning: '[規則匹配] Pod OOMKilled 後 ReplicaSet 將自動重建,但需同步修正資源配置防止復發。'
# Rule: Pod CPU usage too high.
# Matches only on alert_type (case-insensitive substring); it defines no
# alertname or message keywords.
- id: high_cpu
  priority: 40
  description: Pod CPU 使用率過高
  match:
    alert_type:
      - cpu
      - high_cpu
  response:
    action_title: '擴展 {target} 副本數 + 啟用 HPA'
    description: '⚙️ 規則匹配: {target} CPU 使用率過高,根因為流量突增或計算密集任務未配置自動擴展。'
    suggested_action: SCALE_DEPLOYMENT
    kubectl_command: 'kubectl scale deployment {target} --replicas=3 -n {namespace}'
    # Quoted so the value stays the string "0" rather than an integer.
    estimated_downtime: '0'
    risk: medium
    responsibility: INFRA
    responsibility_reasoning: '自動擴展策略未配置或閾值過高,屬基礎設施團隊責任'
    secondary_teams:
      - BE
    optimization:
      - type: RESOURCE_LIMIT
        description: '增加 CPU request 確保 QoS 為 Guaranteed'
        command: 'kubectl set resources deployment/{target} --requests=cpu=500m --limits=cpu=2000m -n {namespace}'
    reasoning: '[規則匹配] 水平擴展可即時分散負載,同時建議配置 HPA 防止復發。'
# Rule: elevated HTTP 5xx error rate.
# Matches when alert_type contains "http", or when the message contains any
# keyword below (case-insensitive substring).
- id: http_5xx
  priority: 50
  description: HTTP 5xx 錯誤率過高
  match:
    alert_type:
      - http
    message:
      # Quoted so the status codes load as strings, not integers.
      - '5xx'
      - '502'
      - '503'
      - '500'
  response:
    action_title: '重啟 {target} + 檢查上游服務'
    description: '⚙️ 規則匹配: {target} 產生 HTTP 5xx 錯誤,可能為應用程式例外或上游服務不可達。'
    suggested_action: RESTART_DEPLOYMENT
    kubectl_command: 'kubectl rollout restart deployment/{target} -n {namespace}'
    estimated_downtime: '~1 min'
    risk: critical
    responsibility: COLLAB
    responsibility_reasoning: 'HTTP 5xx 可能源於前端路由、後端邏輯或基礎設施,需多團隊協同排查'
    secondary_teams:
      - FE
      - BE
      - INFRA
    optimization:
      - type: CIRCUIT_BREAKER
        description: '配置熔斷器防止故障擴散'
        command: '# Istio VirtualService outlierDetection 配置'
    reasoning: '[規則匹配] HTTP 錯誤需協同排查,先重啟恢復服務同時通知相關團隊。'
# Rule: Pod in CrashLoopBackOff.
# Matches when alert_type contains "pod_crash" or "crash", or when the message
# contains any keyword below (case-insensitive substring).
- id: pod_crash
  priority: 60
  description: Pod CrashLoopBackOff
  match:
    alert_type:
      - pod_crash
      - crash
    message:
      - crashloop
      - crash
      - backoff
  response:
    action_title: '診斷 {target} CrashLoop 根因'
    description: '⚙️ 規則匹配: {target} 進入 CrashLoopBackOff需檢查啟動錯誤日誌。'
    suggested_action: RESTART_DEPLOYMENT
    kubectl_command: 'kubectl logs {target} -n {namespace} --previous --tail=50'
    estimated_downtime: '依根因而定'
    risk: critical
    responsibility: BE
    responsibility_reasoning: 'Pod crash 通常源於應用程式啟動錯誤,屬後端團隊責任'
    secondary_teams:
      - INFRA
    optimization:
      - type: LIVENESS_PROBE
        description: '調整 liveness probe 初始延遲防止誤殺'
        command: '# 調整 initialDelaySeconds >= 應用啟動時間'
    reasoning: '[規則匹配] 先查 previous log 確認 crash 原因,再決定修復策略。'
# ── 通用兜底 ────────────────────────────────────────────────
# Rule: generic fallback.
# The engine recognises this rule by its alertname list being exactly ["*"]
# and uses it only when no other rule matched.
- id: generic_fallback
  priority: 999
  description: 通用兜底規則 (無法匹配的告警)
  match:
    alertname:
      # Must stay quoted: a bare * is YAML alias syntax.
      - '*'
  response:
    action_title: '重新啟動 {target} 服務'
    description: '⚙️ 規則匹配: {target} 發生異常,需進一步診斷確認根因。'
    suggested_action: RESTART_DEPLOYMENT
    kubectl_command: 'kubectl rollout restart deployment/{target} -n {namespace}'
    estimated_downtime: '5-15 min'
    risk: medium
    responsibility: COLLAB
    responsibility_reasoning: '告警資訊不足以判定單一責任團隊,建議多團隊協同排查'
    secondary_teams:
      - BE
      - INFRA
    optimization: []
    reasoning: '[規則匹配] 根據告警先重啟恢復服務,同時安排深入診斷。'

View File

@@ -0,0 +1,195 @@
"""
OpenClaw 告警規則匹配引擎
============================================================
從 alert_rules.yaml 載入規則,取代 openclaw.py 中硬編碼的 if/elif 規則匹配。
設計原則:
- 規則在 YAML 定義,不需要改 Python 代碼
- 匹配邏輯: 單一規則內 alertname 完全匹配、alert_type 部分匹配、message 關鍵字三者為 OR,任一命中即視為匹配
- 規則依 priority 由小到大逐條比對,取第一條命中者;priority 越小越優先,999 = 通用兜底
- 變數替換: {target} {host} {container} {instance} {job} {namespace}
2026-04-09 ogt: 初版,從 openclaw.py _generate_mock_response if/elif 抽出
"""
from __future__ import annotations
import re
from functools import lru_cache
from pathlib import Path
from typing import Any
import structlog
import yaml
logger = structlog.get_logger(__name__)
RULES_FILE = Path(__file__).parent.parent.parent / "alert_rules.yaml"
# ── 變數提取 ────────────────────────────────────────────────
def _extract_vars(alert_context: dict) -> dict[str, str]:
"""從 alert_context 提取模板變數"""
labels = alert_context.get("labels", {})
raw_target = alert_context.get("target_resource", "unknown")
instance = labels.get("instance", raw_target)
host = instance.split(":")[0] if ":" in instance else instance
container = labels.get("name", labels.get("container", raw_target))
job = labels.get("job", "exporter")
namespace = alert_context.get("namespace", "awoooi-prod")
# target: 優先用 pod label否則用 raw_target排除純 IP:port 和 alertname
pod = labels.get("pod", "")
if pod:
target = pod
elif ":" in raw_target or raw_target == alert_context.get("labels", {}).get("alertname", ""):
# raw_target 是 IP:port 或 alertname — 用 job 或 container 代替
target = container if container != raw_target else job
else:
target = raw_target
return {
"target": target,
"host": host,
"container": container,
"instance": instance,
"job": job,
"namespace": namespace,
}
def _fill(template: str, vars: dict[str, str]) -> str:
"""填充模板變數,保留未知變數原樣"""
try:
return template.format_map(vars)
except (KeyError, ValueError):
return template
# ── 規則載入 ────────────────────────────────────────────────
@lru_cache(maxsize=1)
def _load_rules() -> list[dict]:
    """Load the rules from ``RULES_FILE``, sorted by ascending priority.

    The result is cached for the lifetime of the process, so edits to the
    YAML file are only picked up after a restart.

    Returns:
        The rule dicts ordered by ``priority`` (lowest first; a missing or
        non-numeric priority sorts as 999). Returns an empty list when the
        file is missing, empty, or does not hold a ``rules`` list.
    """
    if not RULES_FILE.exists():
        logger.warning("alert_rules_file_not_found", path=str(RULES_FILE))
        return []

    with RULES_FILE.open("r", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # safe_load returns None for an empty document, and a bare `rules:` key
    # loads as None; neither should crash the caller.
    raw_rules = data.get("rules") if isinstance(data, dict) else None
    if raw_rules is None:
        raw_rules = []
    if not isinstance(raw_rules, list):
        logger.warning("alert_rules_file_malformed", path=str(RULES_FILE))
        return []

    def _priority(rule: dict) -> float:
        value = rule.get("priority")
        return value if isinstance(value, (int, float)) else 999

    # Skip entries that are not mappings; sorted() is stable, so rules that
    # share a priority keep their order from the file.
    rules = sorted((r for r in raw_rules if isinstance(r, dict)), key=_priority)
    logger.info("alert_rules_loaded", count=len(rules), path=str(RULES_FILE))
    return rules
# ── 匹配邏輯 ────────────────────────────────────────────────
def _matches(rule: dict, alertname: str, alert_type: str, message: str) -> bool:
"""判斷規則是否匹配"""
match = rule.get("match", {})
# alertname 完全匹配
alertnames = match.get("alertname", [])
if alertnames and alertnames != ["*"]:
if alertname in alertnames:
return True
# alert_type 部分匹配
for kw in match.get("alert_type", []):
if kw.lower() in alert_type.lower():
return True
# message 關鍵字匹配(不分大小寫)
msg_lower = message.lower()
for kw in match.get("message", []):
if kw.lower() in msg_lower:
return True
return False
def _is_generic(rule: dict) -> bool:
alertnames = rule.get("match", {}).get("alertname", [])
return alertnames == ["*"]
# ── 公開 API ────────────────────────────────────────────────
def match_rule(alert_context: dict) -> dict[str, Any] | None:
    """Pick the rule for an alert and render its response.

    Rules come from ``_load_rules()`` in the order it returns them. The first
    non-fallback rule accepted by ``_matches`` wins; if none is accepted, the
    first fallback rule (per ``_is_generic``) is used.

    Args:
        alert_context: Alert payload. The keys read here are ``labels``,
            ``alert_type``, ``message`` and ``severity``; the template
            variables come from ``_extract_vars``.

    Returns:
        The rendered response dict, or ``None`` when neither a specific rule
        nor a fallback rule is available. Callers must handle ``None``.
    """
    labels = alert_context.get("labels", {})
    alert_type = alert_context.get("alert_type", "custom")
    # alertname falls back to the alert type when the label is absent.
    alertname = labels.get("alertname", alert_type)
    message = alert_context.get("message", "")
    severity = alert_context.get("severity", "warning")

    rules = _load_rules()
    tpl_vars = _extract_vars(alert_context)

    # First pass: specific rules only; the fallback is held back.
    chosen = next(
        (
            candidate
            for candidate in rules
            if not _is_generic(candidate)
            and _matches(candidate, alertname, alert_type, message)
        ),
        None,
    )
    # Second pass: nothing specific matched, so take the fallback rule.
    if chosen is None:
        chosen = next((candidate for candidate in rules if _is_generic(candidate)), None)
    if chosen is None:
        return None

    resp = chosen["response"]

    # A critical alert severity upgrades a "medium" rule risk to "critical".
    risk = resp.get("risk", "medium")
    if risk == "medium" and severity == "critical":
        risk = "critical"

    suggestions = []
    for opt in resp.get("optimization", []):
        suggestions.append(
            {
                "type": opt["type"],
                "description": _fill(opt.get("description", ""), tpl_vars),
                "kubectl_or_config": _fill(opt.get("command", ""), tpl_vars),
            }
        )

    return {
        "rule_id": chosen["id"],
        "action_title": _fill(resp["action_title"], tpl_vars),
        "description": _fill(resp["description"], tpl_vars),
        "suggested_action": resp["suggested_action"],
        "kubectl_command": _fill(resp["kubectl_command"], tpl_vars),
        "target_resource": tpl_vars["target"],
        "namespace": tpl_vars["namespace"],
        "risk_level": risk,
        "blast_radius": {
            "affected_pods": 1,
            "estimated_downtime": resp.get("estimated_downtime", "unknown"),
            "related_services": [tpl_vars["target"]],
            "data_impact": "NONE",
        },
        "primary_responsibility": resp.get("responsibility", "COLLAB"),
        "responsibility_reasoning": resp.get("responsibility_reasoning", ""),
        "secondary_teams": resp.get("secondary_teams", []),
        "optimization_suggestions": suggestions,
        "reasoning": _fill(resp.get("reasoning", ""), tpl_vars),
        "deviation_analysis": "規則引擎觸發,監控指標偏離正常基準",
        # Rule matches always report 0.0 confidence; never fabricate a score.
        "confidence": 0.0,
        "affected_services": [tpl_vars["target"]],
        "signoz_correlation": "",
    }

View File

@@ -572,274 +572,65 @@ class OpenClawService:
signoz_metrics: GoldMetrics | None = None, signoz_metrics: GoldMetrics | None = None,
) -> str: ) -> str:
""" """
Mock LLM 回應生成器 - 智能降級 (v7.0 含 SignOz) Mock LLM 回應生成器 - 規則引擎降級 (v8.0)
根據告警類型和 SignOz 數據動態產生合理的 RCA 分析結果 從 alert_rules.yaml 載入規則,取代硬編碼 if/elif。
新增規則只需修改 YAML不需要改代碼重新部署。
2026-04-09 ogt: 重構為規則引擎,移除 if/elif 硬編碼
""" """
from src.services.alert_rule_engine import match_rule
time.sleep(random.uniform(0.3, 0.8)) # 模擬思考延遲 time.sleep(random.uniform(0.3, 0.8)) # 模擬思考延遲
alert_type = alert_context.get("alert_type", "custom")
severity = alert_context.get("severity", "warning")
raw_target = alert_context.get("target_resource", "unknown-service")
raw_namespace = alert_context.get("namespace", "default")
message = alert_context.get("message", "")
metrics = alert_context.get("metrics", {})
labels = alert_context.get("labels", {})
alertname = labels.get("alertname", alert_type)
# Phase 18.1: 正規化資源名稱 (ADR-016)
# 確保 kubectl 指令使用有效的 K8s 名稱
normalized = normalize_resource_name(raw_target, raw_namespace)
if normalized.is_k8s_resource and normalized.normalized:
target = normalized.normalized
namespace = normalized.namespace or raw_namespace
logger.info(
"mock_response_resource_normalized",
original=raw_target,
normalized=target,
namespace=namespace,
)
else:
target = raw_target
namespace = raw_namespace
# SignOz 數據整合 # SignOz 數據整合
signoz_summary = ""
signoz_correlation = "SignOz 數據擷取中..." signoz_correlation = "SignOz 數據擷取中..."
if signoz_metrics: if signoz_metrics:
signoz_summary = signoz_metrics.to_summary()
signoz_correlation = ( signoz_correlation = (
f"RPS={signoz_metrics.rps:.1f} ({signoz_metrics.rps_trend}), " f"RPS={signoz_metrics.rps:.1f} ({signoz_metrics.rps_trend}), "
f"Error={signoz_metrics.error_rate:.2f}%, " f"Error={signoz_metrics.error_rate:.2f}%, "
f"P99={signoz_metrics.p99_latency_ms:.0f}ms" f"P99={signoz_metrics.p99_latency_ms:.0f}ms"
) )
# 生成調優指令 mock_response = match_rule(alert_context)
tuning = self.generate_auto_tuning_command( if mock_response is None:
alert_type=alert_type, # match_rule 不應該回傳 None有通用兜底但防禦性處理
target_resource=target, alert_type = alert_context.get("alert_type", "custom")
namespace=namespace, target = alert_context.get("target_resource", "unknown")
metrics=signoz_metrics, namespace = alert_context.get("namespace", "awoooi-prod")
)
# 根據告警類型生成專業 RCA + 仲裁
# 🔴 2026-03-29 ogt: Mock 響應必須標示為規則匹配,不是 AI 仲裁
# 2026-04-09 ogt: 新增 DockerContainerUnhealthy / TargetDown 專屬規則
if alertname == "DockerContainerUnhealthy" or "unhealthy" in message.lower():
container_name = labels.get("name", target)
host = labels.get("instance", "").split(":")[0] or "192.168.0.188"
mock_response = {
"action_title": f"檢查 Docker 容器 {container_name} 健康狀態",
"description": f"⚙️ 規則匹配: Docker 容器 {container_name} ({host}) healthcheck 失敗。常見原因: 應用程式啟動慢、healthcheck 指令錯誤、依賴服務未就緒。{signoz_summary}",
"suggested_action": "RESTART_DEPLOYMENT",
"kubectl_command": f"ssh {host} 'docker inspect {container_name} --format=\"{{{{.State.Health.Status}}}}\" && docker restart {container_name}'",
"target_resource": container_name,
"namespace": namespace,
"risk_level": "medium",
"blast_radius": {
"affected_pods": 1,
"estimated_downtime": "~30s",
"related_services": [container_name],
"data_impact": "NONE",
},
"primary_responsibility": "INFRA",
"responsibility_reasoning": "Docker 容器健康檢查失敗屬基礎設施團隊責任,需確認 healthcheck 設定與容器狀態",
"secondary_teams": ["BE"],
"optimization_suggestions": [
{
"type": "HEALTHCHECK",
"description": "確認 healthcheck 指令在容器內可執行 (mc/curl 是否存在)",
"kubectl_or_config": f"ssh {host} 'docker exec {container_name} sh -c \"mc ready local || curl -f http://localhost:9000/minio/health/live\"'",
}
],
"reasoning": f"[規則匹配] Docker healthcheck 失敗先 restart 恢復服務,同時確認 healthcheck 指令正確。{signoz_correlation}",
"deviation_analysis": "容器健康檢查連續失敗,超出允許次數",
"confidence": 0.0,
"affected_services": [container_name],
"signoz_correlation": signoz_correlation,
}
elif alertname == "TargetDown" or (labels.get("job") and ":" in raw_target):
instance = labels.get("instance", raw_target)
job = labels.get("job", "exporter")
host = instance.split(":")[0]
mock_response = {
"action_title": f"檢查 {job} ({instance}) 服務存活",
"description": f"⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。常見原因: 主機離線、exporter crash、防火牆封鎖。{signoz_summary}",
"suggested_action": "RESTART_DEPLOYMENT",
"kubectl_command": f"ssh {host} 'systemctl status node_exporter 2>/dev/null || docker ps | grep exporter'",
"target_resource": instance,
"namespace": namespace,
"risk_level": "medium",
"blast_radius": {
"affected_pods": 0,
"estimated_downtime": "監控盲區持續中",
"related_services": [job],
"data_impact": "NONE",
},
"primary_responsibility": "INFRA",
"responsibility_reasoning": "Prometheus scrape 目標下線屬基礎設施監控範疇",
"secondary_teams": [],
"optimization_suggestions": [
{
"type": "MONITORING",
"description": f"確認 {host} 上的 {job} exporter 是否正常運行",
"kubectl_or_config": f"ssh {host} 'ps aux | grep exporter'",
}
],
"reasoning": f"[規則匹配] Prometheus target 下線,先 SSH 確認主機存活再重啟 exporter。{signoz_correlation}",
"deviation_analysis": "Prometheus scrape 失敗,監控數據中斷",
"confidence": 0.0,
"affected_services": [instance],
"signoz_correlation": signoz_correlation,
}
elif "oom" in message.lower() or "memory" in alert_type.lower():
mock_response = {
"action_title": f"刪除異常 Pod {target} (OOMKilled)",
"description": f"⚙️ 規則匹配: {target} 發生 OOMKilled根因為 JVM Heap 配置與 K8s memory limit 不匹配或存在記憶體洩漏。{signoz_summary}",
"suggested_action": "DELETE_POD",
"kubectl_command": f"kubectl delete pod {target} -n {namespace}",
"target_resource": target,
"namespace": namespace,
"risk_level": "critical" if severity == "critical" else "medium",
"blast_radius": {
"affected_pods": 1,
"estimated_downtime": "~30s",
"related_services": ["api-gateway", "downstream-service"],
"data_impact": "NONE"
},
"primary_responsibility": "BE",
"responsibility_reasoning": "OOMKilled 通常源於應用程式記憶體配置不當,屬後端團隊責任範圍",
"secondary_teams": ["INFRA"],
"optimization_suggestions": [
{
"type": "RESOURCE_LIMIT",
"description": "調整 memory limit 至 1Gi 並確保 JVM -Xmx 不超過 70%",
"kubectl_or_config": f"kubectl set resources deployment/{target.rsplit('-', 2)[0]} -c {target.rsplit('-', 2)[0]} --limits=memory=1Gi -n {namespace}"
},
{
"type": "HPA",
"description": "啟用基於記憶體的 HPA 自動擴展",
"kubectl_or_config": f"kubectl autoscale deployment {target.rsplit('-', 2)[0]} --memory-percent=80 --min=2 --max=5 -n {namespace}"
}
],
"reasoning": f"⚙️ Pod OOMKilled 後 ReplicaSet 將自動重建,但需同步修正資源配置防止復發。{signoz_correlation}",
"deviation_analysis": f"Memory 使用率 {metrics.get('memory_percent', 99)}%,超出基準線 60% 達 +6.5σ",
"confidence": 0.0, # 🔴 規則匹配,非 AI 仲裁
"affected_services": [target, "api-gateway"],
"signoz_correlation": signoz_correlation,
}
elif "cpu" in alert_type.lower() or "high_cpu" in alert_type:
# 根據 SignOz RPS 調整策略
rps_context = ""
if signoz_metrics and signoz_metrics.rps > 50:
rps_context = f"SignOz 顯示 RPS={signoz_metrics.rps:.0f},流量較高,建議配置 HPA。"
mock_response = {
"action_title": f"擴展 {target} 副本數 + 啟用 HPA",
"description": f"⚙️ 規則匹配: {target} CPU 使用率過高,根因為流量突增或計算密集任務未配置自動擴展。{rps_context}",
"suggested_action": "SCALE_DEPLOYMENT",
"kubectl_command": tuning["command"],
"target_resource": target,
"namespace": namespace,
"risk_level": "medium",
"blast_radius": {
"affected_pods": 0,
"estimated_downtime": "0",
"related_services": [],
"data_impact": "NONE"
},
"primary_responsibility": "INFRA",
"responsibility_reasoning": "自動擴展策略未配置或閾值過高,屬基礎設施團隊責任",
"secondary_teams": ["BE"],
"optimization_suggestions": [
{
"type": tuning["type"],
"description": tuning["description"],
"kubectl_or_config": tuning["command"],
},
{
"type": "RESOURCE_LIMIT",
"description": "增加 CPU request 確保 QoS 為 Guaranteed",
"kubectl_or_config": f"kubectl set resources deployment/{target} --requests=cpu=500m --limits=cpu=2000m -n {namespace}"
}
],
"reasoning": f"[規則匹配] 水平擴展可即時分散負載,同時建議配置 HPA 防止復發。{signoz_correlation}",
"deviation_analysis": f"CPU 使用率 {metrics.get('cpu_percent', 95)}%,超出基準線 50% 達 +4.5σ",
"confidence": 0.0, # 🔴 規則匹配不是 AI 仲裁,信心度設 0
"affected_services": [target],
"signoz_correlation": signoz_correlation,
}
elif "http" in alert_type.lower() or "5xx" in message.lower() or "502" in message.lower():
mock_response = {
"action_title": f"重啟 {target} + 檢查上游服務",
"description": f"⚙️ 規則匹配: {target} 產生 HTTP 5xx 錯誤,可能為應用程式例外或上游服務不可達。{signoz_summary}",
"suggested_action": "RESTART_DEPLOYMENT",
"kubectl_command": f"kubectl rollout restart deployment/{target} -n {namespace}",
"target_resource": target,
"namespace": namespace,
"risk_level": "critical",
"blast_radius": {
"affected_pods": 3,
"estimated_downtime": "~1 min",
"related_services": ["nginx-ingress", "upstream-api"],
"data_impact": "NONE"
},
"primary_responsibility": "COLLAB",
"responsibility_reasoning": "HTTP 5xx 可能源於前端路由、後端邏輯或基礎設施,需多團隊協同排查",
"secondary_teams": ["FE", "BE", "INFRA"],
"optimization_suggestions": [
{
"type": "CIRCUIT_BREAKER",
"description": "配置熔斷器防止故障擴散",
"kubectl_or_config": "# Istio VirtualService outlierDetection 配置"
},
{
"type": "CACHE",
"description": "增加 Redis 緩存減少上游壓力",
"kubectl_or_config": "# 檢查 Redis 連線池配置,建議 maxTotal=50"
}
],
"reasoning": f"[規則匹配] HTTP 錯誤需協同排查,先重啟恢復服務同時通知相關團隊。{signoz_correlation}",
"deviation_analysis": "錯誤率 5%,超出基準線 0.1% 達 +50σ",
"confidence": 0.0, # 🔴 規則匹配不是 AI 仲裁,信心度設 0
"affected_services": [target, "nginx-ingress", "upstream-api"],
"signoz_correlation": signoz_correlation,
}
else:
# 通用異常處理
mock_response = { mock_response = {
"action_title": f"重新啟動 {target} 服務", "action_title": f"重新啟動 {target} 服務",
"description": f"⚙️ 規則匹配: {target} 發生異常: {message[:80]}。需進一步診斷確認根因。{signoz_summary}", "description": f"⚙️ 規則匹配: {target} 發生異常,需進一步診斷確認根因。",
"suggested_action": "RESTART_DEPLOYMENT", "suggested_action": "RESTART_DEPLOYMENT",
"kubectl_command": f"kubectl rollout restart deployment/{target} -n {namespace}", "kubectl_command": f"kubectl rollout restart deployment/{target} -n {namespace}",
"target_resource": target, "target_resource": target,
"namespace": namespace, "namespace": namespace,
"risk_level": "critical" if severity == "critical" else "medium", "risk_level": "medium",
"blast_radius": { "blast_radius": {
"affected_pods": 3, "affected_pods": 1,
"estimated_downtime": "~1 min", "estimated_downtime": "5-15 min",
"related_services": ["dependent-services"], "related_services": [target],
"data_impact": "NONE" "data_impact": "NONE",
}, },
"primary_responsibility": "COLLAB", "primary_responsibility": "COLLAB",
"responsibility_reasoning": "告警資訊不足以判定單一責任團隊,建議多團隊協同排查", "responsibility_reasoning": "告警資訊不足,建議多團隊協同排查",
"secondary_teams": ["BE", "INFRA"], "secondary_teams": ["BE", "INFRA"],
"optimization_suggestions": [ "optimization_suggestions": [],
{ "reasoning": f"[規則匹配] 根據告警 {alert_type} 先重啟恢復服務。",
"type": tuning["type"],
"description": tuning["description"],
"kubectl_or_config": tuning["command"],
}
],
"reasoning": f"[規則匹配] 根據告警 {alert_type} 先重啟恢復服務,同時安排深入診斷。{signoz_correlation}",
"deviation_analysis": "監控指標顯示異常偏離基準線", "deviation_analysis": "監控指標顯示異常偏離基準線",
"confidence": 0.0, # 🔴 規則匹配不是 AI 仲裁,信心度設 0 "confidence": 0.0,
"affected_services": [target], "affected_services": [target],
"signoz_correlation": signoz_correlation, "signoz_correlation": signoz_correlation,
} }
# 補充 SignOz 關聯資訊(規則引擎不持有 signoz_metrics
mock_response["signoz_correlation"] = signoz_correlation
if signoz_metrics:
mock_response["description"] += f" {signoz_metrics.to_summary()}"
logger.info( logger.info(
"mock_llm_response_generated", "mock_llm_response_generated",
rule_id=mock_response.get("rule_id", "unknown"),
action_title=mock_response["action_title"], action_title=mock_response["action_title"],
risk_level=mock_response["risk_level"], risk_level=mock_response["risk_level"],
primary_responsibility=mock_response["primary_responsibility"], primary_responsibility=mock_response["primary_responsibility"],