feat(aiops): ADR-070 全自動化方向 — 三大修復
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

1. auto_approve.py: 允許 high risk 自動執行 (low/medium/high 全開放)
   - min_confidence 0.65→0.50 (信心門檻降低)
   - 新增 DESTRUCTIVE_PATTERNS 攔截真正危險指令
     (scale=0, delete deployment/pvc/namespace, drop table)
   - 核心: critical + 破壞性操作 → 人工; 其他 → 全自動

2. decision_manager.py: 新增 _collect_mcp_context()
   - LLM 分析前先收集真實環境狀態 (SSH/K8s MCP)
   - Host/Docker 告警 → ssh_get_container_status + ssh_get_top_processes
   - K8s 告警 → k8s_get_events
   - 注入 diagnosis_context "當前環境狀態 (MCP 實時查詢)" 區段

3. webhooks.py: 修復 target_resource 提取
   - 新增 name/container/job label 提取
   - DockerContainerUnhealthy 不再 target=alertname
   - IP 位址自動排除 (192.x 開頭不作為 target)

🔴 Tier 3 紅區 — 需首席架構師批准
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-11 21:39:52 +08:00
parent 99cc420429
commit c439277fc3
3 changed files with 128 additions and 16 deletions

View File

@@ -1109,13 +1109,19 @@ async def alertmanager_webhook(
"warning"
)
# 優先用 component labelDocker 層告警用 component如 SentryDown → "sentry"
# 次優 podK8s 告警),再次 instanceblackbox probe最後 alertname
# (2026-04-09 Claude Sonnet 4.6 Asia/Taipei, Bug #5 修正 — affected_services 匹配 Playbook)
# target_resource 提取優先順序 (2026-04-11 Claude Sonnet 4.6 全自動化修正)
# component (Docker 服務名) > pod (K8s) > name/container (Cadvisor 容器名) > job > instance IP > alertname
# 關鍵Docker 告警 (DockerContainerUnhealthy/DockerContainerExited) 的容器名在 name label
# 過去 fallback 直接用 alertname導致 target_resource="DockerContainerUnhealthy" 污染整個修復流程
_instance = alert.labels.get("instance", "")
_instance_clean = _instance.split(":")[0] if _instance and ":" in _instance else _instance
target_resource = (
alert.labels.get("component")
or alert.labels.get("pod")
or alert.labels.get("instance")
or alert.labels.get("name") # Cadvisor/cAdvisor 容器名
or alert.labels.get("container") # K8s container name
or alert.labels.get("job") # Prometheus job name次優
or (_instance_clean if _instance_clean and not _instance_clean.startswith("192.") else None)
or alertname
)
namespace = alert.labels.get("namespace", "default")

View File

@@ -59,9 +59,11 @@ class AutoApproveConfig:
"""自動執行配置"""
# 風險等級閾值
# 2026-04-01 ogt: 開放 low + medium,讓常見 restart 操作可自動執行
# 2026-04-11 Claude Sonnet 4.6: ADR-070 全自動化方向 — low/medium/high 全開放
# 真正需要人工的由 DESTRUCTIVE_PATTERNS 攔截scale=0, delete, drop
# 原: ["low", "medium"] → 導致所有 high risk 告警永遠走人工審核
allowed_risk_levels: list[str] = field(
default_factory=lambda: ["low", "medium"]
default_factory=lambda: ["low", "medium", "high"]
)
# 信任度閾值
@@ -69,7 +71,9 @@ class AutoApproveConfig:
# → 改為 0讓 medium risk + confidence >= 0.65 的操作直接自動執行
# 歷史原因: min_trust_score=1 導致所有告警永遠走審批,從未自動修復
min_trust_score: int = 0 # 不要求執行歷史 (原: 1)
min_confidence: float = 0.65 # AI 有合理把握即可 (原: 0.90)
# 2026-04-11 Claude Sonnet 4.6: ADR-070 全自動化 — 0.5 即可執行
# 真正風險由 DESTRUCTIVE_PATTERNS + risk_level=critical 把關
min_confidence: float = 0.50 # AI 有基本把握即可 (原: 0.90, 後: 0.65)
# Playbook 閾值
# 2026-04-01 ogt: 降低啟動門檻1次成功記錄即可
@@ -219,6 +223,31 @@ class AutoApprovePolicy:
confidence=confidence,
)
# 條件 1b: 破壞性指令攔截 (ADR-070: 2026-04-11 Claude Sonnet 4.6)
# 即使是 low/medium risk以下操作仍需人工確認
# 原則: 可恢復操作 → 自動執行; 不可逆 / 業務衝擊 → 人工
_DESTRUCTIVE_PATTERNS = [
"--replicas=0", # scale to zero — 等同停機
"scale deployment", # 任何 scale 操作需確認目標副本數
"delete pod", # 強制刪除 pod
"delete deployment", # 刪除 deployment (不是 restart)
"delete pvc", # 刪除 PVC (資料丟失)
"delete namespace", # 刪除 namespace
"drop table", # DB DDL
"drop database", # DB DDL
"truncate table", # DB DDL
]
action_lower = action.lower()
for pattern in _DESTRUCTIVE_PATTERNS:
if pattern in action_lower:
return self._reject(
reason=AutoApproveReason.CRITICAL_OPERATION,
detail=f"Destructive pattern detected: '{pattern}' in action — requires human approval",
risk_level=risk_level,
trust_score=trust_score,
confidence=confidence,
)
# 條件 2: 風險等級必須在允許列表中
if risk_level not in self.config.allowed_risk_levels:
return self._reject(

View File

@@ -1384,20 +1384,91 @@ class DecisionManager:
logger.error("kb_rag_unexpected_error", incident_id=incident.incident_id, error=str(e))
return ""
async def _collect_mcp_context(self, incident: Incident) -> str:
    """Collect live environment state through MCP providers before analysis (ADR-070).

    Only the labels of the first signal on the incident are inspected.
    Depending on the alert name, the SSH provider (host/Docker-style alerts)
    and/or the K8s provider (Kube/K3s alerts, or any alert carrying a ``pod``
    label) are queried, and each successful result is appended as one line.

    Args:
        incident: Incident whose ``signals[0].labels`` and
            ``affected_services`` are read.

    Returns:
        The collected context lines joined with newlines, or ``""`` when the
        incident has no signals or no query produced a successful result.

    Provider import errors and query failures are caught and logged at debug
    level, so this method is best-effort and does not propagate them.
    """
    if not incident.signals:
        return ""

    labels = incident.signals[0].labels
    alertname = labels.get("alertname", "")
    # "instance" is usually "host:port"; fall back to the "host" label.
    host = labels.get("instance", "").split(":")[0] or labels.get("host", "")
    # The conditional expression binds looser than ``or``. Without the
    # parentheses, an empty ``affected_services`` list would discard the
    # "name"/"container" labels and always yield "".
    container = (
        labels.get("name")
        or labels.get("container")
        or (incident.affected_services[0] if incident.affected_services else "")
    )
    ns = labels.get("namespace", "awoooi-prod")

    ctx_parts: list[str] = []

    # Host/Docker-style alerts -> diagnose through the SSH MCP provider.
    _HOST_ALERT_PREFIXES = ("Host", "Docker", "Sentry", "Harbor", "Ollama", "Backup")
    if alertname.startswith(_HOST_ALERT_PREFIXES) and host:
        try:
            from src.plugins.mcp.providers.ssh_provider import SSHProvider

            ssh = SSHProvider()
            # Only these two hosts are queried over SSH.
            if ssh.enabled and host in ("192.168.0.188", "192.168.0.110"):
                # Container status; skipped when the only "container" we have
                # is really the alert name (no usable target).
                if container and container != alertname:
                    status_result = await ssh.execute(
                        tool_name="ssh_get_container_status",
                        params={"host": host, "container_name": container},
                    )
                    if status_result.get("success"):
                        ctx_parts.append(
                            f"[SSH] 容器 {container} 狀態: {status_result.get('output', '')[:300]}"
                        )
                # Host resource usage, only for CPU-load / memory alerts.
                if "CpuLoad" in alertname or "Memory" in alertname:
                    top_result = await ssh.execute(
                        tool_name="ssh_get_top_processes",
                        params={"host": host, "top_n": 5},
                    )
                    if top_result.get("success"):
                        ctx_parts.append(
                            f"[SSH] 主機 {host} Top processes: {top_result.get('output', '')[:300]}"
                        )
        except Exception as e:
            # Best-effort: context collection must not break the analysis.
            logger.debug("mcp_context_ssh_failed", alertname=alertname, error=str(e))

    # K8s alerts -> fetch events for the affected pod through the K8s MCP provider.
    if alertname.startswith(("Kube", "K3s")) or labels.get("pod"):
        try:
            from src.plugins.mcp.providers.k8s_provider import K8sProvider

            k8s = K8sProvider()
            if k8s.enabled:
                pod = labels.get("pod", "")
                if pod:
                    events_result = await k8s.execute(
                        tool_name="k8s_get_events",
                        params={"namespace": ns, "field_selector": f"involvedObject.name={pod}"},
                    )
                    if events_result.get("success"):
                        ctx_parts.append(
                            f"[K8s] Pod {pod} 事件: {events_result.get('output', '')[:300]}"
                        )
        except Exception as e:
            # Best-effort: context collection must not break the analysis.
            logger.debug("mcp_context_k8s_failed", alertname=alertname, error=str(e))

    return "\n".join(ctx_parts)
async def _dual_engine_analyze(
self,
incident: Incident,
) -> dict[str, Any]:
"""
三軌決策分析 (Phase 7.5 升級 + KB Phase 2 RAG 整合)
三軌決策分析 (Phase 7.5 升級 + KB Phase 2 RAG 整合 + ADR-070 MCP 前置收集)
策略:
1. 先檢查 Playbook 是否有高度匹配 (similarity >= 85%)
2. Playbook 命中則直接使用 (最快、經驗驗證)
3. 否則 LLM + Expert System 雙軌 + KB RAG context 注入
1. MCP 前置收集真實環境狀態ADR-070
2. 先檢查 Playbook 是否有高度匹配 (similarity >= 85%)
3. Playbook 命中則直接使用 (最快、經驗驗證)
4. 否則 LLM + Expert System 雙軌 + KB RAG context + MCP context 注入
優先順序: Playbook > LLM > Expert System
"""
# ADR-070: 分析前用 MCP 收集真實環境狀態
mcp_context = await self._collect_mcp_context(incident)
# Phase 7.5: 先嘗試 Playbook 匹配
playbook_result = await self._try_playbook_match(incident)
if playbook_result:
@@ -1418,13 +1489,19 @@ class DecisionManager:
try:
signals_dict = [s.model_dump() for s in incident.signals]
# 將 KB context 注入 expert_context 傳給 LLM
# 將 KB context + MCP 實時狀態 注入 expert_context 傳給 LLM
# ADR-070: MCP context 優先放最前面,讓 LLM 看到真實環境狀態再做決策
llm_expert_context: dict[str, Any] = {**expert_result} if expert_result else {}
existing = str(llm_expert_context.get("diagnosis_context", ""))
context_parts = []
if mcp_context:
context_parts.append(f"## 當前環境狀態 (MCP 實時查詢)\n{mcp_context}")
if kb_context:
existing = str(llm_expert_context.get("diagnosis_context", ""))
llm_expert_context["diagnosis_context"] = (
f"{kb_context}\n\n{existing}" if existing else kb_context
)
context_parts.append(f"## 相關歷史知識\n{kb_context}")
if existing:
context_parts.append(existing)
if context_parts:
llm_expert_context["diagnosis_context"] = "\n\n".join(context_parts)
llm_result, provider, success = await self._openclaw.generate_incident_proposal_with_tools(
incident_id=incident.incident_id,