diff --git a/apps/api/src/services/drift_detector.py b/apps/api/src/services/drift_detector.py index 39c622ef..de16be4f 100644 --- a/apps/api/src/services/drift_detector.py +++ b/apps/api/src/services/drift_detector.py @@ -35,6 +35,23 @@ _DEFAULT_ALLOWLIST_FIELDS = frozenset([ "metadata.generation", "metadata.uid", "status", + # K8s 執行時自動填入的欄位(Git manifest 不指定,K8s 注入預設值或運行時資訊) + # 2026-04-16 ogt + Claude Sonnet 4.6: 修復 kubectl rollout restart 觸發假 drift 告警 + # 根因:awoooi-web rollout restart 後 restartedAt annotation 被偵測為 "medium" drift + "spec.template.metadata.annotations", # kubectl.kubernetes.io/restartedAt + "spec.template.metadata.creationTimestamp", + "spec.template.spec.restartPolicy", # K8s 預設: Always(Git 不指定) + "spec.template.spec.dnsPolicy", # K8s 預設: ClusterFirst + "spec.template.spec.terminationGracePeriodSeconds", # K8s 預設: 30 + "spec.template.spec.schedulerName", # K8s 預設: default-scheduler + "spec.strategy.rollingUpdate", # K8s 預設: maxSurge=25% + "spec.strategy.type", # K8s 預設: RollingUpdate + "spec.progressDeadlineSeconds", # K8s 預設: 600 + "spec.revisionHistoryLimit", # K8s 預設: 10 + "metadata.creationTimestamp", + "spec.template.spec.containers[*].terminationMessagePath", + "spec.template.spec.containers[*].terminationMessagePolicy", + "spec.template.spec.containers[*].imagePullPolicy", # K8s 預設: IfNotPresent ]) # 關鍵欄位(必須立即告警) diff --git a/apps/api/src/services/failure_watcher.py b/apps/api/src/services/failure_watcher.py index bf00d4d9..e70905de 100644 --- a/apps/api/src/services/failure_watcher.py +++ b/apps/api/src/services/failure_watcher.py @@ -299,12 +299,18 @@ class FailureWatcherService(IFailureWatcher): return llm_analysis # LLM 失敗,使用規則引擎結果 + # 2026-04-16 ogt + Claude Sonnet 4.6: 修復 root_cause 只顯示 "規則引擎分類: K8S_ERROR" + # 根因:LLM 分析失敗時未帶入實際 error_message,用戶看到的卡片無任何有用資訊 + _error_preview = (error_message[:200] if error_message else "未知錯誤").strip() return { "classification": classification, - "root_cause": f"規則引擎分類: {classification}", + "root_cause": ( + f"[{classification}] {operation_type} 操作在 {target_resource} 失敗\n" + f"錯誤:{_error_preview}" + ), "suggested_repair": self._suggest_repair(classification), "risk_level": risk_level, - "confidence": 0.5, # 規則引擎信心度較低 + "confidence": 0.4, } async def execute_auto_repair( diff --git a/apps/api/src/services/pre_decision_investigator.py b/apps/api/src/services/pre_decision_investigator.py index 9e728ce1..1a0a64f7 100644 --- a/apps/api/src/services/pre_decision_investigator.py +++ b/apps/api/src/services/pre_decision_investigator.py @@ -382,14 +382,30 @@ def _get_labels(incident: "Incident") -> dict[str, Any]: return {} +_SHORT_HOST_MAP: dict[str, str] = { + "110": "192.168.0.110", + "120": "192.168.0.120", + "121": "192.168.0.121", + "188": "192.168.0.188", +} +""" +Prometheus instance label 使用短主機名(如 "110:9100"), +SSH_MCP_ALLOWED_HOSTS 使用完整 IP(如 "192.168.0.110")。 +此映射表做轉換,避免 SSH 工具 "Host 'X' not in SSH_MCP_ALLOWED_HOSTS" 失敗。 +2026-04-16 ogt + Claude Sonnet 4.6: 修復 sensors 7/8 失敗根因 +""" + + def _build_tool_params(incident: "Incident") -> dict[str, Any]: """從 Incident 提取 MCP 工具呼叫所需的公共參數。""" labels = _get_labels(incident) + raw_host = labels.get("instance", "").split(":")[0] or labels.get("host", "") + host = _SHORT_HOST_MAP.get(raw_host, raw_host) # 短名 → 完整 IP return { "namespace": labels.get("namespace", "awoooi-prod"), "pod_name": labels.get("pod", labels.get("name", "")), "deployment": labels.get("deployment", ""), - "host": labels.get("instance", "").split(":")[0] or labels.get("host", ""), + "host": host, "container": labels.get("container", labels.get("name", "")), "alertname": labels.get("alertname", ""), }