feat(openclaw): 告警規則引擎 — alert_rules.yaml 取代硬編碼 if/elif
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
- 新增 alert_rules.yaml: 6 條規則 (docker/target_down/oom/cpu/5xx/crash) + 通用兜底 - 新增 alert_rule_engine.py: YAML 載入、匹配邏輯、變數填充 - openclaw.py _generate_mock_response: 重構為呼叫規則引擎 (v8.0) - 新增規則只需修改 YAML,重啟 Pod 即可,不需改代碼 - 2026-04-09 ogt: 架構重構 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
199
apps/api/alert_rules.yaml
Normal file
199
apps/api/alert_rules.yaml
Normal file
@@ -0,0 +1,199 @@
|
||||
---
# AWOOOI OpenClaw alert rule matching engine
# ============================================================
# Format:
#   match.alertname  : exact match on the Prometheus alertname (list = OR)
#   match.alert_type : alert_type keywords (list = OR, substring match)
#   match.message    : message keywords (list = OR, substring match,
#                      case-insensitive)
#   response.*       : response templates; supported variables are
#                      {target} {host} {container} {instance} {job} {namespace}
#   responsibility   : FE / BE / INFRA / DB / COLLAB
#   risk             : low / medium / critical
#   confidence       : 0.0 (fixed for rule matches; must never be fabricated)
#
# Changing rules: no redeploy is needed; restart the API Pod to reload.
# Adding rules: append to the end of the rules list; a smaller priority
#               value is evaluated first.
# 2026-04-09 ogt: initial version, extracted from openclaw.py
#                 _generate_mock_response
# ============================================================

version: "1.0.0"
updated_at: "2026-04-09"

rules:
  # -- Docker / host layer -------------------------------------

  - id: docker_container_unhealthy
    priority: 10
    description: Docker 容器 healthcheck 失敗
    match:
      alertname:
        - DockerContainerUnhealthy
      message:
        - unhealthy
        - health check
        - healthcheck
    response:
      action_title: "檢查 Docker 容器 {container} 健康狀態"
      description: "⚙️ 規則匹配: Docker 容器 {container} ({host}) healthcheck 失敗。常見原因: 應用程式啟動慢、healthcheck 指令錯誤、依賴服務未就緒。"
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: "ssh {host} 'docker inspect {container} --format=\"{{.State.Health.Status}}\" && docker restart {container}'"
      estimated_downtime: "~30s"
      risk: medium
      responsibility: INFRA
      responsibility_reasoning: "Docker 容器健康檢查失敗屬基礎設施團隊責任,需確認 healthcheck 設定與容器狀態"
      secondary_teams: [BE]
      optimization:
        - type: HEALTHCHECK
          description: "確認 healthcheck 指令在容器內可執行 (mc/curl 是否存在)"
          command: "ssh {host} 'docker exec {container} sh -c \"mc ready local 2>/dev/null || curl -sf http://localhost:9000/minio/health/live\"'"
      reasoning: "[規則匹配] Docker healthcheck 失敗先 restart 恢復服務,同時確認 healthcheck 指令正確。"

  - id: target_down
    priority: 20
    description: Prometheus scrape target 下線
    match:
      alertname:
        - TargetDown
        - InstanceDown
    response:
      action_title: "確認 {job} ({instance}) 服務存活"
      description: "⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。常見原因: 主機離線、exporter crash、防火牆封鎖。"
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: "ssh {host} 'systemctl status node_exporter 2>/dev/null || docker ps | grep exporter'"
      estimated_downtime: "監控盲區持續中"
      risk: medium
      responsibility: INFRA
      responsibility_reasoning: "Prometheus scrape 目標下線屬基礎設施監控範疇"
      secondary_teams: []
      optimization:
        - type: MONITORING
          description: "確認 exporter 進程是否存活"
          command: "ssh {host} 'ps aux | grep exporter | grep -v grep'"
      reasoning: "[規則匹配] Prometheus target 下線,先 SSH 確認主機存活再重啟 exporter。"

  # -- K8s Pod layer -------------------------------------------

  # NOTE(review): the engine resolves {target} to the pod label when one is
  # present, so the deployment-scoped commands below may receive a pod name
  # rather than a deployment name — confirm against real alert payloads.
  # NOTE(review): verify that `kubectl autoscale` accepts --memory-percent;
  # the flag commonly documented is --cpu-percent — confirm before relying
  # on the HPA suggestion.
  - id: oom_killed
    priority: 30
    description: Pod OOMKilled 記憶體不足
    match:
      alert_type:
        - memory
      message:
        - oomkilled
        - oom
        - out of memory
    response:
      action_title: "刪除異常 Pod {target} (OOMKilled)"
      description: "⚙️ 規則匹配: {target} 發生 OOMKilled,根因為 JVM Heap 配置與 K8s memory limit 不匹配或存在記憶體洩漏。"
      suggested_action: DELETE_POD
      kubectl_command: "kubectl delete pod {target} -n {namespace}"
      estimated_downtime: "~30s"
      risk: critical
      responsibility: BE
      responsibility_reasoning: "OOMKilled 通常源於應用程式記憶體配置不當,屬後端團隊責任範圍"
      secondary_teams: [INFRA]
      optimization:
        - type: RESOURCE_LIMIT
          description: "調整 memory limit 至 1Gi 並確保 JVM -Xmx 不超過 70%"
          command: "kubectl set resources deployment/{target} -c {target} --limits=memory=1Gi -n {namespace}"
        - type: HPA
          description: "啟用基於記憶體的 HPA 自動擴展"
          command: "kubectl autoscale deployment {target} --memory-percent=80 --min=2 --max=5 -n {namespace}"
      reasoning: "[規則匹配] Pod OOMKilled 後 ReplicaSet 將自動重建,但需同步修正資源配置防止復發。"

  - id: high_cpu
    priority: 40
    description: Pod CPU 使用率過高
    match:
      alert_type:
        - cpu
        - high_cpu
    response:
      action_title: "擴展 {target} 副本數 + 啟用 HPA"
      description: "⚙️ 規則匹配: {target} CPU 使用率過高,根因為流量突增或計算密集任務未配置自動擴展。"
      suggested_action: SCALE_DEPLOYMENT
      kubectl_command: "kubectl scale deployment {target} --replicas=3 -n {namespace}"
      estimated_downtime: "0"
      risk: medium
      responsibility: INFRA
      responsibility_reasoning: "自動擴展策略未配置或閾值過高,屬基礎設施團隊責任"
      secondary_teams: [BE]
      optimization:
        - type: RESOURCE_LIMIT
          description: "增加 CPU request 確保 QoS 為 Guaranteed"
          command: "kubectl set resources deployment/{target} --requests=cpu=500m --limits=cpu=2000m -n {namespace}"
      reasoning: "[規則匹配] 水平擴展可即時分散負載,同時建議配置 HPA 防止復發。"

  - id: http_5xx
    priority: 50
    description: HTTP 5xx 錯誤率過高
    match:
      alert_type:
        - http
      message:
        - "5xx"
        - "502"
        - "503"
        - "500"
    response:
      action_title: "重啟 {target} + 檢查上游服務"
      description: "⚙️ 規則匹配: {target} 產生 HTTP 5xx 錯誤,可能為應用程式例外或上游服務不可達。"
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: "kubectl rollout restart deployment/{target} -n {namespace}"
      estimated_downtime: "~1 min"
      risk: critical
      responsibility: COLLAB
      responsibility_reasoning: "HTTP 5xx 可能源於前端路由、後端邏輯或基礎設施,需多團隊協同排查"
      secondary_teams: [FE, BE, INFRA]
      optimization:
        - type: CIRCUIT_BREAKER
          description: "配置熔斷器防止故障擴散"
          command: "# Istio VirtualService outlierDetection 配置"
      reasoning: "[規則匹配] HTTP 錯誤需協同排查,先重啟恢復服務同時通知相關團隊。"

  - id: pod_crash
    priority: 60
    description: Pod CrashLoopBackOff
    match:
      alert_type:
        - pod_crash
        - crash
      message:
        - crashloop
        - crash
        - backoff
    response:
      action_title: "診斷 {target} CrashLoop 根因"
      description: "⚙️ 規則匹配: {target} 進入 CrashLoopBackOff,需檢查啟動錯誤日誌。"
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: "kubectl logs {target} -n {namespace} --previous --tail=50"
      estimated_downtime: "依根因而定"
      risk: critical
      responsibility: BE
      responsibility_reasoning: "Pod crash 通常源於應用程式啟動錯誤,屬後端團隊責任"
      secondary_teams: [INFRA]
      optimization:
        - type: LIVENESS_PROBE
          description: "調整 liveness probe 初始延遲防止誤殺"
          command: "# 調整 initialDelaySeconds >= 應用啟動時間"
      reasoning: "[規則匹配] 先查 previous log 確認 crash 原因,再決定修復策略。"

  # -- Generic fallback ----------------------------------------

  # alertname ["*"] marks the catch-all rule; the engine uses it only when
  # no specific rule matched.
  - id: generic_fallback
    priority: 999
    description: 通用兜底規則 (無法匹配的告警)
    match:
      alertname:
        - "*"
    response:
      action_title: "重新啟動 {target} 服務"
      description: "⚙️ 規則匹配: {target} 發生異常,需進一步診斷確認根因。"
      suggested_action: RESTART_DEPLOYMENT
      kubectl_command: "kubectl rollout restart deployment/{target} -n {namespace}"
      estimated_downtime: "5-15 min"
      risk: medium
      responsibility: COLLAB
      responsibility_reasoning: "告警資訊不足以判定單一責任團隊,建議多團隊協同排查"
      secondary_teams: [BE, INFRA]
      optimization: []
      reasoning: "[規則匹配] 根據告警先重啟恢復服務,同時安排深入診斷。"
|
||||
195
apps/api/src/services/alert_rule_engine.py
Normal file
195
apps/api/src/services/alert_rule_engine.py
Normal file
@@ -0,0 +1,195 @@
|
||||
"""
|
||||
OpenClaw 告警規則匹配引擎
|
||||
============================================================
|
||||
從 alert_rules.yaml 載入規則,取代 openclaw.py 中硬編碼的 if/elif 規則匹配。
|
||||
|
||||
設計原則:
|
||||
- 規則在 YAML 定義,不需要改 Python 代碼
|
||||
- 匹配邏輯: alertname 完全匹配 > alert_type 部分匹配 > message 關鍵字
|
||||
- priority 越小越優先,999 = 通用兜底
|
||||
- 變數替換: {target} {host} {container} {instance} {job} {namespace}
|
||||
|
||||
2026-04-09 ogt: 初版,從 openclaw.py _generate_mock_response if/elif 抽出
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
import yaml
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
RULES_FILE = Path(__file__).parent.parent.parent / "alert_rules.yaml"
|
||||
|
||||
# ── 變數提取 ────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _extract_vars(alert_context: dict) -> dict[str, str]:
|
||||
"""從 alert_context 提取模板變數"""
|
||||
labels = alert_context.get("labels", {})
|
||||
raw_target = alert_context.get("target_resource", "unknown")
|
||||
instance = labels.get("instance", raw_target)
|
||||
host = instance.split(":")[0] if ":" in instance else instance
|
||||
container = labels.get("name", labels.get("container", raw_target))
|
||||
job = labels.get("job", "exporter")
|
||||
namespace = alert_context.get("namespace", "awoooi-prod")
|
||||
|
||||
# target: 優先用 pod label,否則用 raw_target(排除純 IP:port 和 alertname)
|
||||
pod = labels.get("pod", "")
|
||||
if pod:
|
||||
target = pod
|
||||
elif ":" in raw_target or raw_target == alert_context.get("labels", {}).get("alertname", ""):
|
||||
# raw_target 是 IP:port 或 alertname — 用 job 或 container 代替
|
||||
target = container if container != raw_target else job
|
||||
else:
|
||||
target = raw_target
|
||||
|
||||
return {
|
||||
"target": target,
|
||||
"host": host,
|
||||
"container": container,
|
||||
"instance": instance,
|
||||
"job": job,
|
||||
"namespace": namespace,
|
||||
}
|
||||
|
||||
|
||||
def _fill(template: str, vars: dict[str, str]) -> str:
|
||||
"""填充模板變數,保留未知變數原樣"""
|
||||
try:
|
||||
return template.format_map(vars)
|
||||
except (KeyError, ValueError):
|
||||
return template
|
||||
|
||||
|
||||
# ── 規則載入 ────────────────────────────────────────────────
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def _load_rules() -> list[dict]:
    """Load the rules from ``RULES_FILE`` once and cache them.

    The result is cached for the lifetime of the process (``lru_cache``), so
    edits to the file are only picked up after the process restarts.

    An empty or malformed document no longer raises: ``yaml.safe_load``
    returns ``None`` for an empty file, and a bare ``rules:`` key parses to
    ``None``; both are treated as "no rules". Entries that are not mappings
    are skipped, and a missing or null ``priority`` sorts as 999.

    Returns:
        The rule mappings sorted by ascending ``priority``, or an empty list
        when the file is missing or contains no usable rules.
    """
    if not RULES_FILE.exists():
        logger.warning("alert_rules_file_not_found", path=str(RULES_FILE))
        return []

    with RULES_FILE.open("r", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    if not isinstance(data, dict):
        # Empty file (None) or a top-level list/scalar: nothing to load.
        logger.warning("alert_rules_file_invalid", path=str(RULES_FILE))
        return []

    def _priority(rule: dict):
        value = rule.get("priority")
        return 999 if value is None else value

    candidates = data.get("rules") or []
    rules = sorted(
        (rule for rule in candidates if isinstance(rule, dict)),
        key=_priority,
    )
    logger.info("alert_rules_loaded", count=len(rules), path=str(RULES_FILE))
    return rules
|
||||
|
||||
|
||||
# ── 匹配邏輯 ────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _matches(rule: dict, alertname: str, alert_type: str, message: str) -> bool:
|
||||
"""判斷規則是否匹配"""
|
||||
match = rule.get("match", {})
|
||||
|
||||
# alertname 完全匹配
|
||||
alertnames = match.get("alertname", [])
|
||||
if alertnames and alertnames != ["*"]:
|
||||
if alertname in alertnames:
|
||||
return True
|
||||
|
||||
# alert_type 部分匹配
|
||||
for kw in match.get("alert_type", []):
|
||||
if kw.lower() in alert_type.lower():
|
||||
return True
|
||||
|
||||
# message 關鍵字匹配(不分大小寫)
|
||||
msg_lower = message.lower()
|
||||
for kw in match.get("message", []):
|
||||
if kw.lower() in msg_lower:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def _is_generic(rule: dict) -> bool:
|
||||
alertnames = rule.get("match", {}).get("alertname", [])
|
||||
return alertnames == ["*"]
|
||||
|
||||
|
||||
# ── 公開 API ────────────────────────────────────────────────
|
||||
|
||||
|
||||
def match_rule(alert_context: dict) -> dict[str, Any] | None:
    """Match *alert_context* against the loaded rules and build the response.

    Rules are evaluated in the order returned by ``_load_rules``. The first
    non-generic rule accepted by ``_matches`` wins; if none matches, the
    first generic (``alertname: ["*"]``) rule is used as the fallback.

    Null-valued ``labels``, ``alert_type`` and ``message`` entries are
    treated as absent, and a bare ``optimization:`` / ``secondary_teams:``
    key in a rule is treated as an empty list. ``secondary_teams`` is copied
    so that callers mutating the returned dict cannot alter the cached rules.

    Args:
        alert_context: Alert payload. Reads ``labels``, ``alert_type``,
            ``message`` and ``severity`` here; ``_extract_vars`` reads
            further keys.

    Returns:
        The filled response dict of the selected rule, or ``None`` when no
        rule (not even a generic one) is available. Callers must handle
        ``None`` themselves (AI analysis or their own fallback).

    Raises:
        KeyError: If the selected rule lacks ``id``, ``response`` or one of
            the required response fields (``action_title``, ``description``,
            ``suggested_action``, ``kubectl_command``), or an optimization
            entry lacks ``type``.
    """
    labels = alert_context.get("labels") or {}
    alert_type = alert_context.get("alert_type")
    if alert_type is None:
        alert_type = "custom"
    # Without an alertname label, the alert_type doubles as the alertname.
    alertname = labels.get("alertname", alert_type)
    message = alert_context.get("message") or ""
    severity = alert_context.get("severity", "warning")

    vars = _extract_vars(alert_context)

    # Single pass: remember the first generic rule while looking for the
    # first specific match; the generic rule is only used as a last resort.
    matched_rule = None
    fallback_rule = None
    for rule in _load_rules():
        if _is_generic(rule):
            if fallback_rule is None:
                fallback_rule = rule
            continue
        if _matches(rule, alertname, alert_type, message):
            matched_rule = rule
            break

    if matched_rule is None:
        matched_rule = fallback_rule
    if matched_rule is None:
        return None

    resp = matched_rule["response"]

    # severity=critical escalates a medium-risk rule to critical.
    risk = resp.get("risk", "medium")
    if severity == "critical" and risk == "medium":
        risk = "critical"

    optimization = [
        {
            "type": entry["type"],
            "description": _fill(entry.get("description", ""), vars),
            "kubectl_or_config": _fill(entry.get("command", ""), vars),
        }
        for entry in (resp.get("optimization") or [])
    ]

    target = vars["target"]
    return {
        "rule_id": matched_rule["id"],
        "action_title": _fill(resp["action_title"], vars),
        "description": _fill(resp["description"], vars),
        "suggested_action": resp["suggested_action"],
        "kubectl_command": _fill(resp["kubectl_command"], vars),
        "target_resource": target,
        "namespace": vars["namespace"],
        "risk_level": risk,
        "blast_radius": {
            "affected_pods": 1,
            "estimated_downtime": resp.get("estimated_downtime", "unknown"),
            "related_services": [target],
            "data_impact": "NONE",
        },
        "primary_responsibility": resp.get("responsibility", "COLLAB"),
        "responsibility_reasoning": resp.get("responsibility_reasoning", ""),
        # Copy: the rule data is cached process-wide by _load_rules.
        "secondary_teams": list(resp.get("secondary_teams") or []),
        "optimization_suggestions": optimization,
        "reasoning": _fill(resp.get("reasoning", ""), vars),
        "deviation_analysis": "規則引擎觸發,監控指標偏離正常基準",
        # Rule matches always report 0.0 confidence; never fabricate a score.
        "confidence": 0.0,
        "affected_services": [target],
        "signoz_correlation": "",
    }
|
||||
@@ -572,274 +572,65 @@ class OpenClawService:
|
||||
signoz_metrics: GoldMetrics | None = None,
|
||||
) -> str:
|
||||
"""
|
||||
Mock LLM 回應生成器 - 智能降級 (v7.0 含 SignOz)
|
||||
Mock LLM 回應生成器 - 規則引擎降級 (v8.0)
|
||||
|
||||
根據告警類型和 SignOz 數據動態產生合理的 RCA 分析結果
|
||||
從 alert_rules.yaml 載入規則,取代硬編碼 if/elif。
|
||||
新增規則只需修改 YAML,不需要改代碼重新部署。
|
||||
|
||||
2026-04-09 ogt: 重構為規則引擎,移除 if/elif 硬編碼
|
||||
"""
|
||||
from src.services.alert_rule_engine import match_rule
|
||||
|
||||
time.sleep(random.uniform(0.3, 0.8)) # 模擬思考延遲
|
||||
|
||||
alert_type = alert_context.get("alert_type", "custom")
|
||||
severity = alert_context.get("severity", "warning")
|
||||
raw_target = alert_context.get("target_resource", "unknown-service")
|
||||
raw_namespace = alert_context.get("namespace", "default")
|
||||
message = alert_context.get("message", "")
|
||||
metrics = alert_context.get("metrics", {})
|
||||
labels = alert_context.get("labels", {})
|
||||
alertname = labels.get("alertname", alert_type)
|
||||
|
||||
# Phase 18.1: 正規化資源名稱 (ADR-016)
|
||||
# 確保 kubectl 指令使用有效的 K8s 名稱
|
||||
normalized = normalize_resource_name(raw_target, raw_namespace)
|
||||
if normalized.is_k8s_resource and normalized.normalized:
|
||||
target = normalized.normalized
|
||||
namespace = normalized.namespace or raw_namespace
|
||||
logger.info(
|
||||
"mock_response_resource_normalized",
|
||||
original=raw_target,
|
||||
normalized=target,
|
||||
namespace=namespace,
|
||||
)
|
||||
else:
|
||||
target = raw_target
|
||||
namespace = raw_namespace
|
||||
|
||||
# SignOz 數據整合
|
||||
signoz_summary = ""
|
||||
signoz_correlation = "SignOz 數據擷取中..."
|
||||
if signoz_metrics:
|
||||
signoz_summary = signoz_metrics.to_summary()
|
||||
signoz_correlation = (
|
||||
f"RPS={signoz_metrics.rps:.1f} ({signoz_metrics.rps_trend}), "
|
||||
f"Error={signoz_metrics.error_rate:.2f}%, "
|
||||
f"P99={signoz_metrics.p99_latency_ms:.0f}ms"
|
||||
)
|
||||
|
||||
# 生成調優指令
|
||||
tuning = self.generate_auto_tuning_command(
|
||||
alert_type=alert_type,
|
||||
target_resource=target,
|
||||
namespace=namespace,
|
||||
metrics=signoz_metrics,
|
||||
)
|
||||
|
||||
# 根據告警類型生成專業 RCA + 仲裁
|
||||
# 🔴 2026-03-29 ogt: Mock 響應必須標示為規則匹配,不是 AI 仲裁
|
||||
# 2026-04-09 ogt: 新增 DockerContainerUnhealthy / TargetDown 專屬規則
|
||||
if alertname == "DockerContainerUnhealthy" or "unhealthy" in message.lower():
|
||||
container_name = labels.get("name", target)
|
||||
host = labels.get("instance", "").split(":")[0] or "192.168.0.188"
|
||||
mock_response = {
|
||||
"action_title": f"檢查 Docker 容器 {container_name} 健康狀態",
|
||||
"description": f"⚙️ 規則匹配: Docker 容器 {container_name} ({host}) healthcheck 失敗。常見原因: 應用程式啟動慢、healthcheck 指令錯誤、依賴服務未就緒。{signoz_summary}",
|
||||
"suggested_action": "RESTART_DEPLOYMENT",
|
||||
"kubectl_command": f"ssh {host} 'docker inspect {container_name} --format=\"{{{{.State.Health.Status}}}}\" && docker restart {container_name}'",
|
||||
"target_resource": container_name,
|
||||
"namespace": namespace,
|
||||
"risk_level": "medium",
|
||||
"blast_radius": {
|
||||
"affected_pods": 1,
|
||||
"estimated_downtime": "~30s",
|
||||
"related_services": [container_name],
|
||||
"data_impact": "NONE",
|
||||
},
|
||||
"primary_responsibility": "INFRA",
|
||||
"responsibility_reasoning": "Docker 容器健康檢查失敗屬基礎設施團隊責任,需確認 healthcheck 設定與容器狀態",
|
||||
"secondary_teams": ["BE"],
|
||||
"optimization_suggestions": [
|
||||
{
|
||||
"type": "HEALTHCHECK",
|
||||
"description": "確認 healthcheck 指令在容器內可執行 (mc/curl 是否存在)",
|
||||
"kubectl_or_config": f"ssh {host} 'docker exec {container_name} sh -c \"mc ready local || curl -f http://localhost:9000/minio/health/live\"'",
|
||||
}
|
||||
],
|
||||
"reasoning": f"[規則匹配] Docker healthcheck 失敗先 restart 恢復服務,同時確認 healthcheck 指令正確。{signoz_correlation}",
|
||||
"deviation_analysis": "容器健康檢查連續失敗,超出允許次數",
|
||||
"confidence": 0.0,
|
||||
"affected_services": [container_name],
|
||||
"signoz_correlation": signoz_correlation,
|
||||
}
|
||||
elif alertname == "TargetDown" or (labels.get("job") and ":" in raw_target):
|
||||
instance = labels.get("instance", raw_target)
|
||||
job = labels.get("job", "exporter")
|
||||
host = instance.split(":")[0]
|
||||
mock_response = {
|
||||
"action_title": f"檢查 {job} ({instance}) 服務存活",
|
||||
"description": f"⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。常見原因: 主機離線、exporter crash、防火牆封鎖。{signoz_summary}",
|
||||
"suggested_action": "RESTART_DEPLOYMENT",
|
||||
"kubectl_command": f"ssh {host} 'systemctl status node_exporter 2>/dev/null || docker ps | grep exporter'",
|
||||
"target_resource": instance,
|
||||
"namespace": namespace,
|
||||
"risk_level": "medium",
|
||||
"blast_radius": {
|
||||
"affected_pods": 0,
|
||||
"estimated_downtime": "監控盲區持續中",
|
||||
"related_services": [job],
|
||||
"data_impact": "NONE",
|
||||
},
|
||||
"primary_responsibility": "INFRA",
|
||||
"responsibility_reasoning": "Prometheus scrape 目標下線屬基礎設施監控範疇",
|
||||
"secondary_teams": [],
|
||||
"optimization_suggestions": [
|
||||
{
|
||||
"type": "MONITORING",
|
||||
"description": f"確認 {host} 上的 {job} exporter 是否正常運行",
|
||||
"kubectl_or_config": f"ssh {host} 'ps aux | grep exporter'",
|
||||
}
|
||||
],
|
||||
"reasoning": f"[規則匹配] Prometheus target 下線,先 SSH 確認主機存活再重啟 exporter。{signoz_correlation}",
|
||||
"deviation_analysis": "Prometheus scrape 失敗,監控數據中斷",
|
||||
"confidence": 0.0,
|
||||
"affected_services": [instance],
|
||||
"signoz_correlation": signoz_correlation,
|
||||
}
|
||||
elif "oom" in message.lower() or "memory" in alert_type.lower():
|
||||
mock_response = {
|
||||
"action_title": f"刪除異常 Pod {target} (OOMKilled)",
|
||||
"description": f"⚙️ 規則匹配: {target} 發生 OOMKilled,根因為 JVM Heap 配置與 K8s memory limit 不匹配或存在記憶體洩漏。{signoz_summary}",
|
||||
"suggested_action": "DELETE_POD",
|
||||
"kubectl_command": f"kubectl delete pod {target} -n {namespace}",
|
||||
"target_resource": target,
|
||||
"namespace": namespace,
|
||||
"risk_level": "critical" if severity == "critical" else "medium",
|
||||
"blast_radius": {
|
||||
"affected_pods": 1,
|
||||
"estimated_downtime": "~30s",
|
||||
"related_services": ["api-gateway", "downstream-service"],
|
||||
"data_impact": "NONE"
|
||||
},
|
||||
"primary_responsibility": "BE",
|
||||
"responsibility_reasoning": "OOMKilled 通常源於應用程式記憶體配置不當,屬後端團隊責任範圍",
|
||||
"secondary_teams": ["INFRA"],
|
||||
"optimization_suggestions": [
|
||||
{
|
||||
"type": "RESOURCE_LIMIT",
|
||||
"description": "調整 memory limit 至 1Gi 並確保 JVM -Xmx 不超過 70%",
|
||||
"kubectl_or_config": f"kubectl set resources deployment/{target.rsplit('-', 2)[0]} -c {target.rsplit('-', 2)[0]} --limits=memory=1Gi -n {namespace}"
|
||||
},
|
||||
{
|
||||
"type": "HPA",
|
||||
"description": "啟用基於記憶體的 HPA 自動擴展",
|
||||
"kubectl_or_config": f"kubectl autoscale deployment {target.rsplit('-', 2)[0]} --memory-percent=80 --min=2 --max=5 -n {namespace}"
|
||||
}
|
||||
],
|
||||
"reasoning": f"⚙️ Pod OOMKilled 後 ReplicaSet 將自動重建,但需同步修正資源配置防止復發。{signoz_correlation}",
|
||||
"deviation_analysis": f"Memory 使用率 {metrics.get('memory_percent', 99)}%,超出基準線 60% 達 +6.5σ",
|
||||
"confidence": 0.0, # 🔴 規則匹配,非 AI 仲裁
|
||||
"affected_services": [target, "api-gateway"],
|
||||
"signoz_correlation": signoz_correlation,
|
||||
}
|
||||
elif "cpu" in alert_type.lower() or "high_cpu" in alert_type:
|
||||
# 根據 SignOz RPS 調整策略
|
||||
rps_context = ""
|
||||
if signoz_metrics and signoz_metrics.rps > 50:
|
||||
rps_context = f"SignOz 顯示 RPS={signoz_metrics.rps:.0f},流量較高,建議配置 HPA。"
|
||||
|
||||
mock_response = {
|
||||
"action_title": f"擴展 {target} 副本數 + 啟用 HPA",
|
||||
"description": f"⚙️ 規則匹配: {target} CPU 使用率過高,根因為流量突增或計算密集任務未配置自動擴展。{rps_context}",
|
||||
"suggested_action": "SCALE_DEPLOYMENT",
|
||||
"kubectl_command": tuning["command"],
|
||||
"target_resource": target,
|
||||
"namespace": namespace,
|
||||
"risk_level": "medium",
|
||||
"blast_radius": {
|
||||
"affected_pods": 0,
|
||||
"estimated_downtime": "0",
|
||||
"related_services": [],
|
||||
"data_impact": "NONE"
|
||||
},
|
||||
"primary_responsibility": "INFRA",
|
||||
"responsibility_reasoning": "自動擴展策略未配置或閾值過高,屬基礎設施團隊責任",
|
||||
"secondary_teams": ["BE"],
|
||||
"optimization_suggestions": [
|
||||
{
|
||||
"type": tuning["type"],
|
||||
"description": tuning["description"],
|
||||
"kubectl_or_config": tuning["command"],
|
||||
},
|
||||
{
|
||||
"type": "RESOURCE_LIMIT",
|
||||
"description": "增加 CPU request 確保 QoS 為 Guaranteed",
|
||||
"kubectl_or_config": f"kubectl set resources deployment/{target} --requests=cpu=500m --limits=cpu=2000m -n {namespace}"
|
||||
}
|
||||
],
|
||||
"reasoning": f"[規則匹配] 水平擴展可即時分散負載,同時建議配置 HPA 防止復發。{signoz_correlation}",
|
||||
"deviation_analysis": f"CPU 使用率 {metrics.get('cpu_percent', 95)}%,超出基準線 50% 達 +4.5σ",
|
||||
"confidence": 0.0, # 🔴 規則匹配不是 AI 仲裁,信心度設 0
|
||||
"affected_services": [target],
|
||||
"signoz_correlation": signoz_correlation,
|
||||
}
|
||||
elif "http" in alert_type.lower() or "5xx" in message.lower() or "502" in message.lower():
|
||||
mock_response = {
|
||||
"action_title": f"重啟 {target} + 檢查上游服務",
|
||||
"description": f"⚙️ 規則匹配: {target} 產生 HTTP 5xx 錯誤,可能為應用程式例外或上游服務不可達。{signoz_summary}",
|
||||
"suggested_action": "RESTART_DEPLOYMENT",
|
||||
"kubectl_command": f"kubectl rollout restart deployment/{target} -n {namespace}",
|
||||
"target_resource": target,
|
||||
"namespace": namespace,
|
||||
"risk_level": "critical",
|
||||
"blast_radius": {
|
||||
"affected_pods": 3,
|
||||
"estimated_downtime": "~1 min",
|
||||
"related_services": ["nginx-ingress", "upstream-api"],
|
||||
"data_impact": "NONE"
|
||||
},
|
||||
"primary_responsibility": "COLLAB",
|
||||
"responsibility_reasoning": "HTTP 5xx 可能源於前端路由、後端邏輯或基礎設施,需多團隊協同排查",
|
||||
"secondary_teams": ["FE", "BE", "INFRA"],
|
||||
"optimization_suggestions": [
|
||||
{
|
||||
"type": "CIRCUIT_BREAKER",
|
||||
"description": "配置熔斷器防止故障擴散",
|
||||
"kubectl_or_config": "# Istio VirtualService outlierDetection 配置"
|
||||
},
|
||||
{
|
||||
"type": "CACHE",
|
||||
"description": "增加 Redis 緩存減少上游壓力",
|
||||
"kubectl_or_config": "# 檢查 Redis 連線池配置,建議 maxTotal=50"
|
||||
}
|
||||
],
|
||||
"reasoning": f"[規則匹配] HTTP 錯誤需協同排查,先重啟恢復服務同時通知相關團隊。{signoz_correlation}",
|
||||
"deviation_analysis": "錯誤率 5%,超出基準線 0.1% 達 +50σ",
|
||||
"confidence": 0.0, # 🔴 規則匹配不是 AI 仲裁,信心度設 0
|
||||
"affected_services": [target, "nginx-ingress", "upstream-api"],
|
||||
"signoz_correlation": signoz_correlation,
|
||||
}
|
||||
else:
|
||||
# 通用異常處理
|
||||
mock_response = match_rule(alert_context)
|
||||
if mock_response is None:
|
||||
# match_rule 不應該回傳 None(有通用兜底),但防禦性處理
|
||||
alert_type = alert_context.get("alert_type", "custom")
|
||||
target = alert_context.get("target_resource", "unknown")
|
||||
namespace = alert_context.get("namespace", "awoooi-prod")
|
||||
mock_response = {
|
||||
"action_title": f"重新啟動 {target} 服務",
|
||||
"description": f"⚙️ 規則匹配: {target} 發生異常: {message[:80]}。需進一步診斷確認根因。{signoz_summary}",
|
||||
"description": f"⚙️ 規則匹配: {target} 發生異常,需進一步診斷確認根因。",
|
||||
"suggested_action": "RESTART_DEPLOYMENT",
|
||||
"kubectl_command": f"kubectl rollout restart deployment/{target} -n {namespace}",
|
||||
"target_resource": target,
|
||||
"namespace": namespace,
|
||||
"risk_level": "critical" if severity == "critical" else "medium",
|
||||
"risk_level": "medium",
|
||||
"blast_radius": {
|
||||
"affected_pods": 3,
|
||||
"estimated_downtime": "~1 min",
|
||||
"related_services": ["dependent-services"],
|
||||
"data_impact": "NONE"
|
||||
"affected_pods": 1,
|
||||
"estimated_downtime": "5-15 min",
|
||||
"related_services": [target],
|
||||
"data_impact": "NONE",
|
||||
},
|
||||
"primary_responsibility": "COLLAB",
|
||||
"responsibility_reasoning": "告警資訊不足以判定單一責任團隊,建議多團隊協同排查",
|
||||
"responsibility_reasoning": "告警資訊不足,建議多團隊協同排查",
|
||||
"secondary_teams": ["BE", "INFRA"],
|
||||
"optimization_suggestions": [
|
||||
{
|
||||
"type": tuning["type"],
|
||||
"description": tuning["description"],
|
||||
"kubectl_or_config": tuning["command"],
|
||||
}
|
||||
],
|
||||
"reasoning": f"[規則匹配] 根據告警 {alert_type} 先重啟恢復服務,同時安排深入診斷。{signoz_correlation}",
|
||||
"optimization_suggestions": [],
|
||||
"reasoning": f"[規則匹配] 根據告警 {alert_type} 先重啟恢復服務。",
|
||||
"deviation_analysis": "監控指標顯示異常偏離基準線",
|
||||
"confidence": 0.0, # 🔴 規則匹配不是 AI 仲裁,信心度設 0
|
||||
"confidence": 0.0,
|
||||
"affected_services": [target],
|
||||
"signoz_correlation": signoz_correlation,
|
||||
}
|
||||
|
||||
# 補充 SignOz 關聯資訊(規則引擎不持有 signoz_metrics)
|
||||
mock_response["signoz_correlation"] = signoz_correlation
|
||||
if signoz_metrics:
|
||||
mock_response["description"] += f" {signoz_metrics.to_summary()}"
|
||||
|
||||
logger.info(
|
||||
"mock_llm_response_generated",
|
||||
rule_id=mock_response.get("rule_id", "unknown"),
|
||||
action_title=mock_response["action_title"],
|
||||
risk_level=mock_response["risk_level"],
|
||||
primary_responsibility=mock_response["primary_responsibility"],
|
||||
|
||||
Reference in New Issue
Block a user