feat(aiops): ADR-070 全自動化方向 — 三大修復
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
1. auto_approve.py: 允許 high risk 自動執行 (low/medium/high 全開放)
- min_confidence 0.65→0.50 (信心門檻降低)
- 新增 DESTRUCTIVE_PATTERNS 攔截真正危險指令
(--replicas=0 / 任何 scale deployment, delete pod/deployment/pvc/namespace, drop table/database, truncate table)
- 核心: critical + 破壞性操作 → 人工; 其他 → 全自動
2. decision_manager.py: 新增 _collect_mcp_context()
- LLM 分析前先收集真實環境狀態 (SSH/K8s MCP)
- Host/Docker 告警 → ssh_get_container_status + ssh_get_top_processes
- K8s 告警 → k8s_get_events
- 注入 diagnosis_context "當前環境狀態 (MCP 實時查詢)" 區段
3. webhooks.py: 修復 target_resource 提取
- 新增 name/container/job label 提取
- DockerContainerUnhealthy 不再 target=alertname
- IP 位址自動排除 (192.x 開頭不作為 target)
🔴 Tier 3 紅區 — 需首席架構師批准
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1109,13 +1109,19 @@ async def alertmanager_webhook(
|
||||
"warning"
|
||||
)
|
||||
|
||||
# 優先用 component label(Docker 層告警用 component,如 SentryDown → "sentry")
|
||||
# 次優 pod(K8s 告警),再次 instance(blackbox probe),最後 alertname
|
||||
# (2026-04-09 Claude Sonnet 4.6 Asia/Taipei, Bug #5 修正 — affected_services 匹配 Playbook)
|
||||
# target_resource 提取優先順序 (2026-04-11 Claude Sonnet 4.6 全自動化修正)
|
||||
# component (Docker 服務名) > pod (K8s) > name/container (Cadvisor 容器名) > job > instance IP > alertname
|
||||
# 關鍵:Docker 告警 (DockerContainerUnhealthy/DockerContainerExited) 的容器名在 name label
|
||||
# 過去 fallback 直接用 alertname,導致 target_resource="DockerContainerUnhealthy" 污染整個修復流程
|
||||
_instance = alert.labels.get("instance", "")
|
||||
_instance_clean = _instance.split(":")[0] if _instance and ":" in _instance else _instance
|
||||
target_resource = (
|
||||
alert.labels.get("component")
|
||||
or alert.labels.get("pod")
|
||||
or alert.labels.get("instance")
|
||||
or alert.labels.get("name") # Cadvisor/cAdvisor 容器名
|
||||
or alert.labels.get("container") # K8s container name
|
||||
or alert.labels.get("job") # Prometheus job name(次優)
|
||||
or (_instance_clean if _instance_clean and not _instance_clean.startswith("192.") else None)
|
||||
or alertname
|
||||
)
|
||||
namespace = alert.labels.get("namespace", "default")
|
||||
|
||||
@@ -59,9 +59,11 @@ class AutoApproveConfig:
|
||||
"""自動執行配置"""
|
||||
|
||||
# 風險等級閾值
|
||||
# 2026-04-01 ogt: 開放 low + medium,讓常見 restart 操作可自動執行
|
||||
# 2026-04-11 Claude Sonnet 4.6: ADR-070 全自動化方向 — low/medium/high 全開放
|
||||
# 真正需要人工的由 DESTRUCTIVE_PATTERNS 攔截(scale=0, delete, drop)
|
||||
# 原: ["low", "medium"] → 導致所有 high risk 告警永遠走人工審核
|
||||
allowed_risk_levels: list[str] = field(
|
||||
default_factory=lambda: ["low", "medium"]
|
||||
default_factory=lambda: ["low", "medium", "high"]
|
||||
)
|
||||
|
||||
# 信任度閾值
|
||||
@@ -69,7 +71,9 @@ class AutoApproveConfig:
|
||||
# → 改為 0,讓 medium risk + confidence >= 0.65 的操作直接自動執行
|
||||
# 歷史原因: min_trust_score=1 導致所有告警永遠走審批,從未自動修復
|
||||
min_trust_score: int = 0 # 不要求執行歷史 (原: 1)
|
||||
min_confidence: float = 0.65 # AI 有合理把握即可 (原: 0.90)
|
||||
# 2026-04-11 Claude Sonnet 4.6: ADR-070 全自動化 — 0.5 即可執行
|
||||
# 真正風險由 DESTRUCTIVE_PATTERNS + risk_level=critical 把關
|
||||
min_confidence: float = 0.50 # AI 有基本把握即可 (原: 0.90, 後: 0.65)
|
||||
|
||||
# Playbook 閾值
|
||||
# 2026-04-01 ogt: 降低啟動門檻,1次成功記錄即可
|
||||
@@ -219,6 +223,31 @@ class AutoApprovePolicy:
|
||||
confidence=confidence,
|
||||
)
|
||||
|
||||
# 條件 1b: 破壞性指令攔截 (ADR-070: 2026-04-11 Claude Sonnet 4.6)
|
||||
# 即使是 low/medium risk,以下操作仍需人工確認
|
||||
# 原則: 可恢復操作 → 自動執行; 不可逆 / 業務衝擊 → 人工
|
||||
_DESTRUCTIVE_PATTERNS = [
|
||||
"--replicas=0", # scale to zero — 等同停機
|
||||
"scale deployment", # 任何 scale 操作需確認目標副本數
|
||||
"delete pod", # 強制刪除 pod
|
||||
"delete deployment", # 刪除 deployment (不是 restart)
|
||||
"delete pvc", # 刪除 PVC (資料丟失)
|
||||
"delete namespace", # 刪除 namespace
|
||||
"drop table", # DB DDL
|
||||
"drop database", # DB DDL
|
||||
"truncate table", # DB DDL
|
||||
]
|
||||
action_lower = action.lower()
|
||||
for pattern in _DESTRUCTIVE_PATTERNS:
|
||||
if pattern in action_lower:
|
||||
return self._reject(
|
||||
reason=AutoApproveReason.CRITICAL_OPERATION,
|
||||
detail=f"Destructive pattern detected: '{pattern}' in action — requires human approval",
|
||||
risk_level=risk_level,
|
||||
trust_score=trust_score,
|
||||
confidence=confidence,
|
||||
)
|
||||
|
||||
# 條件 2: 風險等級必須在允許列表中
|
||||
if risk_level not in self.config.allowed_risk_levels:
|
||||
return self._reject(
|
||||
|
||||
@@ -1384,20 +1384,91 @@ class DecisionManager:
|
||||
logger.error("kb_rag_unexpected_error", incident_id=incident.incident_id, error=str(e))
|
||||
return ""
|
||||
|
||||
async def _collect_mcp_context(self, incident: Incident) -> str:
    """
    Collect live environment state via MCP providers before analysis (ADR-070).

    Only the first signal's labels are inspected. Depending on the alert,
    one or both of the following probes run:

    - Host/Docker-style alerts (alertname starts with one of
      ``_HOST_ALERT_PREFIXES``) with a resolvable host: the SSH provider is
      asked for the container status and, for CpuLoad/Memory alerts, the
      top processes on the host.
    - K8s alerts (alertname starts with "Kube"/"K3s", or a ``pod`` label is
      present): the K8s provider is asked for events of that pod.

    Args:
        incident: Incident whose ``signals[0].labels`` and
            ``affected_services`` drive the probes.

    Returns:
        Newline-joined context lines (each provider output truncated to 300
        characters), or an empty string when there are no signals or no
        probe produced output.

    Provider failures are logged at debug level and never propagated, so
    context collection is strictly best-effort.
    """
    if not incident.signals:
        return ""

    labels = incident.signals[0].labels
    alertname = labels.get("alertname", "")
    # "instance" is usually "host:port"; fall back to the "host" label.
    host = labels.get("instance", "").split(":")[0] or labels.get("host", "")
    # The conditional expression binds looser than `or`, so the fallback
    # must be parenthesized. Without the parentheses an empty
    # affected_services list discarded the name/container labels entirely.
    container = (
        labels.get("name")
        or labels.get("container")
        or (incident.affected_services[0] if incident.affected_services else "")
    )
    ns = labels.get("namespace", "awoooi-prod")

    ctx_parts: list[str] = []

    # Host/Docker alerts -> SSH MCP diagnostics
    _HOST_ALERT_PREFIXES = ("Host", "Docker", "Sentry", "Harbor", "Ollama", "Backup")
    if alertname.startswith(_HOST_ALERT_PREFIXES) and host:
        try:
            # Imported lazily so the provider is only loaded when needed.
            from src.plugins.mcp.providers.ssh_provider import SSHProvider

            ssh = SSHProvider()
            # Only the two explicitly listed hosts are ever queried over SSH.
            if ssh.enabled and host in ("192.168.0.188", "192.168.0.110"):
                # Container status; skipped when the only "container" name we
                # have is really the alert name.
                if container and container != alertname:
                    status_result = await ssh.execute(
                        tool_name="ssh_get_container_status",
                        params={"host": host, "container_name": container},
                    )
                    if status_result.get("success"):
                        ctx_parts.append(f"[SSH] 容器 {container} 狀態: {status_result.get('output', '')[:300]}")
                # Host resource usage for CPU / memory alerts
                if "CpuLoad" in alertname or "Memory" in alertname:
                    top_result = await ssh.execute(
                        tool_name="ssh_get_top_processes",
                        params={"host": host, "top_n": 5},
                    )
                    if top_result.get("success"):
                        ctx_parts.append(f"[SSH] 主機 {host} Top processes: {top_result.get('output', '')[:300]}")
        except Exception as e:
            # Best-effort: a failed probe must not block the analysis.
            logger.debug("mcp_context_ssh_failed", alertname=alertname, error=str(e))

    # K8s alerts -> K8s MCP pod events
    if alertname.startswith(("Kube", "K3s")) or labels.get("pod"):
        try:
            from src.plugins.mcp.providers.k8s_provider import K8sProvider

            k8s = K8sProvider()
            if k8s.enabled:
                pod = labels.get("pod", "")
                if pod:
                    events_result = await k8s.execute(
                        tool_name="k8s_get_events",
                        params={"namespace": ns, "field_selector": f"involvedObject.name={pod}"},
                    )
                    if events_result.get("success"):
                        ctx_parts.append(f"[K8s] Pod {pod} 事件: {events_result.get('output', '')[:300]}")
        except Exception as e:
            # Best-effort: a failed probe must not block the analysis.
            logger.debug("mcp_context_k8s_failed", alertname=alertname, error=str(e))

    return "\n".join(ctx_parts)
|
||||
|
||||
async def _dual_engine_analyze(
|
||||
self,
|
||||
incident: Incident,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
三軌決策分析 (Phase 7.5 升級 + KB Phase 2 RAG 整合)
|
||||
三軌決策分析 (Phase 7.5 升級 + KB Phase 2 RAG 整合 + ADR-070 MCP 前置收集)
|
||||
|
||||
策略:
|
||||
1. 先檢查 Playbook 是否有高度匹配 (similarity >= 85%)
|
||||
2. Playbook 命中則直接使用 (最快、經驗驗證)
|
||||
3. 否則 LLM + Expert System 雙軌 + KB RAG context 注入
|
||||
1. MCP 前置收集真實環境狀態(ADR-070)
|
||||
2. 先檢查 Playbook 是否有高度匹配 (similarity >= 85%)
|
||||
3. Playbook 命中則直接使用 (最快、經驗驗證)
|
||||
4. 否則 LLM + Expert System 雙軌 + KB RAG context + MCP context 注入
|
||||
|
||||
優先順序: Playbook > LLM > Expert System
|
||||
"""
|
||||
# ADR-070: 分析前用 MCP 收集真實環境狀態
|
||||
mcp_context = await self._collect_mcp_context(incident)
|
||||
|
||||
# Phase 7.5: 先嘗試 Playbook 匹配
|
||||
playbook_result = await self._try_playbook_match(incident)
|
||||
if playbook_result:
|
||||
@@ -1418,13 +1489,19 @@ class DecisionManager:
|
||||
try:
|
||||
signals_dict = [s.model_dump() for s in incident.signals]
|
||||
|
||||
# 將 KB context 注入 expert_context 傳給 LLM
|
||||
# 將 KB context + MCP 實時狀態 注入 expert_context 傳給 LLM
|
||||
# ADR-070: MCP context 優先放最前面,讓 LLM 看到真實環境狀態再做決策
|
||||
llm_expert_context: dict[str, Any] = {**expert_result} if expert_result else {}
|
||||
existing = str(llm_expert_context.get("diagnosis_context", ""))
|
||||
context_parts = []
|
||||
if mcp_context:
|
||||
context_parts.append(f"## 當前環境狀態 (MCP 實時查詢)\n{mcp_context}")
|
||||
if kb_context:
|
||||
existing = str(llm_expert_context.get("diagnosis_context", ""))
|
||||
llm_expert_context["diagnosis_context"] = (
|
||||
f"{kb_context}\n\n{existing}" if existing else kb_context
|
||||
)
|
||||
context_parts.append(f"## 相關歷史知識\n{kb_context}")
|
||||
if existing:
|
||||
context_parts.append(existing)
|
||||
if context_parts:
|
||||
llm_expert_context["diagnosis_context"] = "\n\n".join(context_parts)
|
||||
|
||||
llm_result, provider, success = await self._openclaw.generate_incident_proposal_with_tools(
|
||||
incident_id=incident.incident_id,
|
||||
|
||||
Reference in New Issue
Block a user