fix(sensors+drift+repair-card): 全景修復三個節點問題
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 1h1m39s
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 1h1m39s
Fix 1: sensors 7/8 失敗 — SSH host 短名展開 (pre_decision_investigator.py)
根因: Prometheus instance label 為 "110:9100",split(":")[0]="110"
SSH_MCP_ALLOWED_HOSTS 存完整 IP "192.168.0.110" → 7 個 SSH 工具全部失敗
修復: 加入 _SHORT_HOST_MAP,"110"→"192.168.0.110",四台主機全覆蓋
Fix 2: Config Drift 誤報 — K8s 預設欄位加入白名單 (drift_detector.py)
根因: kubectl rollout restart 後 restartedAt annotation 被偵測為 "medium" drift
restartPolicy/dnsPolicy/terminationGracePeriodSeconds 等 K8s 自動填入欄位未白名單
修復: _DEFAULT_ALLOWLIST_FIELDS 加入 14 個 K8s 執行時自動填入欄位
Fix 3: 修復請求卡內容垃圾 — fallback 帶入真實 error context (failure_watcher.py)
根因: LLM 分析失敗時 root_cause = "規則引擎分類: K8S_ERROR"(無任何有用資訊)
修復: fallback 改為 "[{classification}] {operation_type} 操作在 {target_resource} 失敗\n錯誤:{error_message[:200]}"(error_message 為空時顯示「未知錯誤」),並將規則引擎 confidence 由 0.5 調降為 0.4
2026-04-16 ogt + Claude Sonnet 4.6(亞太)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -35,6 +35,23 @@ _DEFAULT_ALLOWLIST_FIELDS = frozenset([
|
||||
"metadata.generation",
|
||||
"metadata.uid",
|
||||
"status",
|
||||
# K8s 執行時自動填入的欄位(Git manifest 不指定,K8s 注入預設值或運行時資訊)
|
||||
# 2026-04-16 ogt + Claude Sonnet 4.6: 修復 kubectl rollout restart 觸發假 drift 告警
|
||||
# 根因:awoooi-web rollout restart 後 restartedAt annotation 被偵測為 "medium" drift
|
||||
"spec.template.metadata.annotations", # kubectl.kubernetes.io/restartedAt
|
||||
"spec.template.metadata.creationTimestamp",
|
||||
"spec.template.spec.restartPolicy", # K8s 預設: Always(Git 不指定)
|
||||
"spec.template.spec.dnsPolicy", # K8s 預設: ClusterFirst
|
||||
"spec.template.spec.terminationGracePeriodSeconds", # K8s 預設: 30
|
||||
"spec.template.spec.schedulerName", # K8s 預設: default-scheduler
|
||||
"spec.strategy.rollingUpdate", # K8s 預設: maxSurge=25%
|
||||
"spec.strategy.type", # K8s 預設: RollingUpdate
|
||||
"spec.progressDeadlineSeconds", # K8s 預設: 600
|
||||
"spec.revisionHistoryLimit", # K8s 預設: 10
|
||||
"metadata.creationTimestamp",
|
||||
"spec.template.spec.containers[*].terminationMessagePath",
|
||||
"spec.template.spec.containers[*].terminationMessagePolicy",
|
||||
"spec.template.spec.containers[*].imagePullPolicy", # K8s 預設: IfNotPresent
|
||||
])
|
||||
|
||||
# 關鍵欄位(必須立即告警)
|
||||
|
||||
@@ -299,12 +299,18 @@ class FailureWatcherService(IFailureWatcher):
|
||||
return llm_analysis
|
||||
|
||||
# LLM 失敗,使用規則引擎結果
|
||||
# 2026-04-16 ogt + Claude Sonnet 4.6: 修復 root_cause 只顯示 "規則引擎分類: K8S_ERROR"
|
||||
# 根因:LLM 分析失敗時未帶入實際 error_message,用戶看到的卡片無任何有用資訊
|
||||
_error_preview = (error_message[:200] if error_message else "未知錯誤").strip()
|
||||
return {
|
||||
"classification": classification,
|
||||
"root_cause": f"規則引擎分類: {classification}",
|
||||
"root_cause": (
|
||||
f"[{classification}] {operation_type} 操作在 {target_resource} 失敗\n"
|
||||
f"錯誤:{_error_preview}"
|
||||
),
|
||||
"suggested_repair": self._suggest_repair(classification),
|
||||
"risk_level": risk_level,
|
||||
"confidence": 0.5, # 規則引擎信心度較低
|
||||
"confidence": 0.4,
|
||||
}
|
||||
|
||||
async def execute_auto_repair(
|
||||
|
||||
@@ -382,14 +382,30 @@ def _get_labels(incident: "Incident") -> dict[str, Any]:
|
||||
return {}
|
||||
|
||||
|
||||
_SHORT_HOST_MAP: dict[str, str] = {
|
||||
"110": "192.168.0.110",
|
||||
"120": "192.168.0.120",
|
||||
"121": "192.168.0.121",
|
||||
"188": "192.168.0.188",
|
||||
}
|
||||
"""
|
||||
Prometheus instance label 使用短主機名(如 "110:9100"),
|
||||
SSH_MCP_ALLOWED_HOSTS 使用完整 IP(如 "192.168.0.110")。
|
||||
此映射表做轉換,避免 SSH 工具 "Host 'X' not in SSH_MCP_ALLOWED_HOSTS" 失敗。
|
||||
2026-04-16 ogt + Claude Sonnet 4.6: 修復 sensors 7/8 失敗根因
|
||||
"""
|
||||
|
||||
|
||||
def _build_tool_params(incident: "Incident") -> dict[str, Any]:
    """Extract the common parameters needed for MCP tool calls from an Incident.

    Args:
        incident: The incident whose labels (as returned by ``_get_labels``)
            supply the parameter values.

    Returns:
        A dict with the keys ``namespace``, ``pod_name``, ``deployment``,
        ``host``, ``container`` and ``alertname``. Missing labels fall back
        to an empty string, except ``namespace`` which defaults to
        ``"awoooi-prod"``.
    """
    labels = _get_labels(incident)

    # The ``instance`` label looks like "110:9100"; keep only the part before
    # the port. ``or ""`` guards against a label that is present but None.
    instance_host = (labels.get("instance") or "").split(":")[0]
    raw_host = instance_host or labels.get("host", "")

    # Short name -> full IP; hosts not in the map pass through unchanged.
    host = _SHORT_HOST_MAP.get(raw_host, raw_host)

    return {
        "namespace": labels.get("namespace", "awoooi-prod"),
        "pod_name": labels.get("pod", labels.get("name", "")),
        "deployment": labels.get("deployment", ""),
        "host": host,
        "container": labels.get("container", labels.get("name", "")),
        "alertname": labels.get("alertname", ""),
    }
|
||||
|
||||
Reference in New Issue
Block a user