fix(b1): Solver Agent 結構化動作 — 北極星 §1.1 修復多樣性 ≥ 40%
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 2m22s
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 2m22s
INC-20260425 衍生修復 — Solver 拒絕 rule-based mock 兜底: 原設計缺陷: - LLM 失敗時 → rule-based mock 推 RESTART 兜底 - 違反北極星 §1.1:修復多樣性 ≥ 40%(不能寫死同一指令) 新設計: - LLM 失敗 → graceful degraded(candidates=[], recommended_actions=[], degraded=True) - 禁止 rule-based mock / hardcode RESTART - 新增 recommended_actions 結構化 MCP 動作清單 · 供 B3 Telegram 按鈕動態生成 · YAML 規則庫驅動,非寫死 - 新增 yaml + Path import 載入動作模板庫 向下相容: - 既有 candidates / blast_radius 邏輯不變 - 新增欄位 recommended_actions 為 optional list Tests: 8 passed (solver 相關全綠) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-Authored-By: Claude Sonnet 4.6 (B1 北極星 §1.1) <noreply@anthropic.com>
This commit is contained in:
@@ -10,11 +10,14 @@ AWOOOI AIOps Phase 2 — Solver Agent(軍師)
|
||||
1. 每個 Hypothesis 至少產 1 個 CandidateAction
|
||||
2. blast_radius 評分影響 Reviewer 的審查嚴格度
|
||||
3. blast_radius > 50 → Reviewer 必須 request_revision
|
||||
4. 熔斷降級:LLM 失敗 → rule-based mock(基於 category 推 RESTART 為兜底動作)
|
||||
4. 熔斷降級:LLM 失敗 → graceful degraded(candidates=[], recommended_actions=[], degraded=True)
|
||||
禁止 rule-based mock / hardcode RESTART 兜底(北極星 §1.1 修復多樣性 ≥ 40%,禁止寫死規則)
|
||||
5. Solver 不直接觸碰執行層(Coordinator 的工作)
|
||||
6. recommended_actions:結構化 MCP 動作清單,供 B3 Telegram 按鈕動態生成
|
||||
|
||||
ADR-082: Phase 2 多 Agent 協作
|
||||
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 2 初始建立
|
||||
2026-04-27 Claude Sonnet 4.6: B1 — Solver 結構化動作 (北極星 §1.1 修復多樣性 ≥ 40%)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -24,9 +27,11 @@ import hashlib
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
import yaml
|
||||
|
||||
from src.agents.base import BaseAgent
|
||||
from src.agents.protocol import (
|
||||
@@ -35,6 +40,7 @@ from src.agents.protocol import (
|
||||
AgentVote,
|
||||
CandidateAction,
|
||||
DiagnosisReport,
|
||||
RecommendedAction,
|
||||
)
|
||||
from src.observability.agent_step_metrics import observe_agent_step
|
||||
from src.services.sanitization_service import sanitize
|
||||
@@ -62,7 +68,7 @@ PHASE2_STEP_TIMEOUT_SEC = AGENT_SOLVER_TIMEOUT_SEC
|
||||
# 1. 分隔符改為顯式 [ ](ASCII 空格),明確排除 \n\r\t\f\v
|
||||
# 2. 字元類改為 [A-Za-z0-9 \-=./:_,@](顯式空格,非 \s)
|
||||
# 3. 有界 quantifier {1,500} 防止無界回溯
|
||||
# 4. re.ASCII 旗標禁用 Unicode 空白匹配(如 等不可見字元)
|
||||
# 4. re.ASCII 旗標禁用 Unicode 空白匹配(如 等不可見字元)
|
||||
# 範圍:Nemo 路徑 + action_title 路徑 + 標準 candidates 路徑三層防護(C2)
|
||||
_KUBECTL_COMMAND_PATTERN = re.compile(
|
||||
r"^kubectl[ ][A-Za-z0-9 \-=./:_,@]{1,500}$",
|
||||
@@ -72,6 +78,47 @@ _KUBECTL_COMMAND_PATTERN = re.compile(
|
||||
# Upper bound on kubectl command length. Kept equal to the {1,500} quantifier
# of _KUBECTL_COMMAND_PATTERN; per the original note it is meant to be a cheap
# length check performed before the regex is evaluated.
_KUBECTL_MAX_LEN = 500

# 2026-04-27 Claude Sonnet 4.6: B1 — location of callback_action_spec.yaml, the
# file the MCP tool list is loaded from at runtime. It resolves to
# <parent of this module's directory>/services/callback_action_spec.yaml.
_CALLBACK_SPEC_PATH = Path(__file__).parent.parent.joinpath(
    "services", "callback_action_spec.yaml"
)

# Accepted mcp_provider values (per the original note these mirror the
# RecommendedAction schema — confirm against src.agents.protocol).
_VALID_MCP_PROVIDERS = frozenset(
    ("k8s", "ssh", "prometheus", "signoz", "database", "internal")
)

# Accepted risk levels (same caveat: expected to mirror RecommendedAction).
_VALID_RISK_LEVELS = frozenset(("low", "medium", "high", "critical"))
|
||||
|
||||
|
||||
def _load_mcp_tool_registry() -> dict[str, dict[str, Any]]:
    """Load the MCP action registry from callback_action_spec.yaml.

    2026-04-27 Claude Sonnet 4.6: B1 — loaded dynamically to avoid hardcoding.

    Returns:
        A mapping ``{action_name: info}`` where ``info`` has the keys
        ``provider``, ``tool``, ``risk``, ``label``, ``emoji``, ``params`` and
        ``description``. Missing values fall back to ``""`` (``provider``,
        ``tool``, ``emoji``, ``description``), ``"low"`` (``risk``), the action
        name (``label``) or ``{}`` (``params``).

        An empty dict is returned, after logging a warning, when the file
        cannot be read or parsed, or when its top level / ``actions`` node is
        not a mapping. The function never raises for those cases so the Solver
        main flow is not interrupted.

    A single malformed entry (the action itself or its ``mcp`` node is not a
    mapping) is skipped with a warning instead of discarding every other,
    valid entry.
    """
    try:
        with _CALLBACK_SPEC_PATH.open(encoding="utf-8") as f:
            spec = yaml.safe_load(f)
    except Exception as e:
        # Deliberate best-effort boundary: any read/parse failure degrades to
        # an empty registry rather than breaking the Solver.
        logger.warning("mcp_registry_load_failed", path=str(_CALLBACK_SPEC_PATH), error=str(e))
        return {}

    # safe_load returns None for an empty file and may return a scalar or a
    # list for unexpected content; none of those carry an "actions" mapping.
    if not isinstance(spec, dict):
        logger.warning(
            "mcp_registry_load_failed",
            path=str(_CALLBACK_SPEC_PATH),
            error=f"top-level YAML node is {type(spec).__name__}, expected a mapping",
        )
        return {}

    actions = spec.get("actions") or {}
    if not isinstance(actions, dict):
        logger.warning(
            "mcp_registry_load_failed",
            path=str(_CALLBACK_SPEC_PATH),
            error=f"'actions' is {type(actions).__name__}, expected a mapping",
        )
        return {}

    registry: dict[str, dict[str, Any]] = {}
    for name, action in actions.items():
        if not isinstance(action, dict):
            logger.warning(
                "mcp_registry_entry_skipped",
                action=str(name),
                reason="entry is not a mapping",
            )
            continue

        mcp = action.get("mcp") or {}
        if not isinstance(mcp, dict):
            logger.warning(
                "mcp_registry_entry_skipped",
                action=str(name),
                reason="'mcp' is not a mapping",
            )
            continue

        registry[name] = {
            "provider": mcp.get("provider", ""),
            "tool": mcp.get("tool", ""),
            "risk": action.get("risk", "low"),
            "label": action.get("label", name),
            "emoji": action.get("emoji", ""),
            "params": mcp.get("params") or {},
            "description": action.get("description", ""),
        }
    return registry
|
||||
|
||||
|
||||
def _is_safe_kubectl_command(cmd: str) -> bool:
|
||||
"""kubectl 命令白名單驗證。
|
||||
@@ -108,6 +155,107 @@ def _is_safe_kubectl_command(cmd: str) -> bool:
|
||||
return _KUBECTL_COMMAND_PATTERN.fullmatch(cmd) is not None
|
||||
|
||||
|
||||
def _validate_recommended_action(raw: Any) -> RecommendedAction | None:
    """Validate one LLM-produced recommended_action against the schema.

    2026-04-27 Claude Sonnet 4.6: B1 — schema validation; non-conforming items
    are skipped gracefully, never fabricated.

    Rules applied, in order:
      - ``raw`` must be a dict.
      - ``name``, ``label``, ``mcp_provider``, ``mcp_tool`` and ``risk`` must be
        present and non-empty after stripping. A JSON ``null`` counts as
        missing (it is NOT turned into the string ``"None"``).
      - ``mcp_provider`` (lower-cased) must be in ``_VALID_MCP_PROVIDERS``.
      - ``risk`` (lower-cased) must be in ``_VALID_RISK_LEVELS``.
      - ``params`` is coerced to ``dict[str, str]``; a non-dict becomes ``{}``.

    Text fields are truncated: name 64, label 80, emoji 8, mcp_tool 80,
    reasoning 400 characters.

    Args:
        raw: One element of the LLM's ``recommended_actions`` array.

    Returns:
        A ``RecommendedAction`` when validation passes, otherwise ``None``.
        Provider/risk whitelist failures log a warning here; the other
        rejections return ``None`` silently and leave logging to the caller.
    """
    if not isinstance(raw, dict):
        return None

    def _text(key: str) -> str:
        # JSON null arrives as None; str(None) would yield the truthy string
        # "None" and slip past the required-field check below.
        value = raw.get(key)
        return "" if value is None else str(value).strip()

    name = _text("name")
    label = _text("label")
    emoji = _text("emoji")
    mcp_provider = _text("mcp_provider").lower()
    mcp_tool = _text("mcp_tool")
    risk = _text("risk").lower()
    reasoning = _text("reasoning")
    params_raw = raw.get("params", {})

    # Required fields must all be non-empty.
    if not (name and label and mcp_provider and mcp_tool and risk):
        return None

    # Provider whitelist.
    if mcp_provider not in _VALID_MCP_PROVIDERS:
        logger.warning(
            "solver_recommended_action_invalid_provider",
            name=name,
            mcp_provider=mcp_provider,
            valid=sorted(_VALID_MCP_PROVIDERS),
        )
        return None

    # Risk whitelist.
    if risk not in _VALID_RISK_LEVELS:
        logger.warning(
            "solver_recommended_action_invalid_risk",
            name=name,
            risk=risk,
            valid=sorted(_VALID_RISK_LEVELS),
        )
        return None

    # params type guard: the schema expects dict[str, str].
    if not isinstance(params_raw, dict):
        params: dict[str, str] = {}
    else:
        params = {str(k): str(v) for k, v in params_raw.items()}

    return RecommendedAction(
        name=name[:64],
        label=label[:80],
        emoji=emoji[:8],
        mcp_provider=mcp_provider,  # type: ignore[arg-type]  # already checked against the Literal range
        mcp_tool=mcp_tool[:80],
        params=params,
        risk=risk,  # type: ignore[arg-type]  # already checked against the Literal range
        reasoning=reasoning[:400],
    )
|
||||
|
||||
|
||||
def _extract_recommended_actions(parsed: dict[str, Any]) -> list[RecommendedAction]:
    """Collect schema-valid recommended_actions from a parsed LLM reply.

    2026-04-27 Claude Sonnet 4.6: B1 — schema validation + graceful degradation.

    Only the first 5 entries of ``parsed["recommended_actions"]`` are examined.
    Each is passed through ``_validate_recommended_action``; rejected entries
    are logged with a warning and skipped (nothing is fabricated). Collection
    stops as soon as 3 entries have been accepted.

    Args:
        parsed: The parsed LLM response.

    Returns:
        A list of 0-3 validated ``RecommendedAction`` objects. An empty list
        is returned when the key is absent, is not a list, or nothing passes
        validation.
    """
    entries = parsed.get("recommended_actions", [])
    if not isinstance(entries, list):
        return []

    accepted: list[RecommendedAction] = []
    # Inspect at most 5 raw entries, keeping the first 3 that validate.
    for position, entry in enumerate(entries[:5]):
        if len(accepted) >= 3:
            break

        validated = _validate_recommended_action(entry)
        if validated is not None:
            accepted.append(validated)
            continue

        logger.warning(
            "solver_recommended_action_schema_invalid",
            index=position,
            raw=str(entry)[:200],
            reason="欄位缺失或不符白名單,skip(不假造)",
        )

    return accepted
|
||||
|
||||
|
||||
class SolverAgent(BaseAgent):
|
||||
"""
|
||||
Solver Agent — 修復方案軍師
|
||||
@@ -133,7 +281,7 @@ class SolverAgent(BaseAgent):
|
||||
timeout_sec: 已廢棄 (2026-04-16 ogt) — LLM 等完整回應,真實異常才降級
|
||||
|
||||
Returns:
|
||||
ActionPlan(真實異常時 degraded=True)
|
||||
ActionPlan(真實異常時 degraded=True,recommended_actions=[],不假造)
|
||||
"""
|
||||
start_ms = int(time.monotonic() * 1000)
|
||||
|
||||
@@ -146,6 +294,7 @@ class SolverAgent(BaseAgent):
|
||||
latency_ms=0,
|
||||
vote=AgentVote.ABSTAIN,
|
||||
degraded=diagnosis.degraded,
|
||||
recommended_actions=[],
|
||||
)
|
||||
|
||||
try:
|
||||
@@ -154,6 +303,7 @@ class SolverAgent(BaseAgent):
|
||||
logger.info(
|
||||
"solver_done",
|
||||
candidates=len(plan.candidates),
|
||||
recommended_actions=len(plan.recommended_actions),
|
||||
vote=plan.vote,
|
||||
latency_ms=plan.latency_ms,
|
||||
)
|
||||
@@ -173,6 +323,7 @@ class SolverAgent(BaseAgent):
|
||||
diagnosis_report=diagnosis,
|
||||
latency_ms=0,
|
||||
vote=AgentVote.ABSTAIN,
|
||||
recommended_actions=[],
|
||||
)
|
||||
|
||||
# 2026-04-17 ogt + Claude Sonnet 4.6 (Checkpoint-2 環境感知):
|
||||
@@ -181,11 +332,15 @@ class SolverAgent(BaseAgent):
|
||||
# 失敗無害:kubectl 超時或拒絕 → _k8s_inventory 為空 → prompt 仍正常但無鎖定效果
|
||||
_k8s_inventory = await _fetch_k8s_inventory(namespace="awoooi-prod")
|
||||
|
||||
# 2026-04-27 Claude Sonnet 4.6: B1 — 動態載入 MCP tool 清單注入 prompt
|
||||
mcp_registry = _load_mcp_tool_registry()
|
||||
|
||||
prompt = self._build_prompt({
|
||||
"hypothesis": top.description,
|
||||
"category": top.category,
|
||||
"confidence": top.confidence,
|
||||
"k8s_inventory": _k8s_inventory,
|
||||
"mcp_registry": mcp_registry,
|
||||
})
|
||||
|
||||
# 2026-04-16 ogt + Claude Sonnet 4.6: 傳遞 hypothesis 結構化資料給 OPENCLAW_NEMO
|
||||
@@ -227,6 +382,16 @@ class SolverAgent(BaseAgent):
|
||||
parsed = self._parse_response(sanitize(response_text, "solver_output"))
|
||||
candidates = _extract_candidates(parsed)
|
||||
|
||||
# 2026-04-27 Claude Sonnet 4.6: B1 — 提取 recommended_actions(schema 驗證)
|
||||
# LLM 輸出不合 schema → graceful skip(log warn + empty actions),不假造
|
||||
recommended_actions = _extract_recommended_actions(parsed)
|
||||
if not recommended_actions:
|
||||
logger.info(
|
||||
"solver_recommended_actions_empty",
|
||||
reason="LLM 未輸出合法 recommended_actions 或 schema 驗證失敗",
|
||||
snapshot_id=diagnosis.evidence_snapshot_id,
|
||||
)
|
||||
|
||||
# 2026-04-25 ogt + Claude Sonnet 4.6: 非 K8s target 後置過濾(P0 修復)
|
||||
# 根因:GiteaMemoryPressure 告警觸發 Solver → LLM 生成 "kubectl scale deployment gitea"
|
||||
# Gitea 在主機 docker-compose,不在 awoooi-prod namespace → 執行必然失敗
|
||||
@@ -247,6 +412,7 @@ class SolverAgent(BaseAgent):
|
||||
diagnosis_report=diagnosis,
|
||||
latency_ms=0,
|
||||
vote=AgentVote.APPROVE,
|
||||
recommended_actions=recommended_actions,
|
||||
)
|
||||
|
||||
def _build_prompt(self, context: dict[str, Any]) -> str:
|
||||
@@ -258,6 +424,7 @@ class SolverAgent(BaseAgent):
|
||||
# 修復:要求 action 必須是真實 kubectl 命令,並提供正確範例
|
||||
# 2026-04-17 ogt + Claude Sonnet 4.6 (Checkpoint-2): 注入 K8s 實際 Deployment 清單
|
||||
# LLM 必須從此清單選擇資源名稱,不可自行編造
|
||||
# 2026-04-27 Claude Sonnet 4.6: B1 — 注入 MCP tool 清單 + recommended_actions 要求
|
||||
_inventory = context.get("k8s_inventory", "")
|
||||
_inventory_section = (
|
||||
f"\n🔒 叢集實際 Deployment 清單(awoooi-prod)— 必須從此清單選擇資源名稱:\n{_inventory}\n"
|
||||
@@ -276,16 +443,40 @@ class SolverAgent(BaseAgent):
|
||||
if _inventory
|
||||
else ""
|
||||
)
|
||||
|
||||
# 2026-04-27 Claude Sonnet 4.6: B1 — 動態 MCP tool 清單注入
|
||||
# 只注入 provider/tool/risk/label,避免 params 模板混淆 LLM
|
||||
mcp_registry: dict[str, dict[str, Any]] = context.get("mcp_registry") or {}
|
||||
if mcp_registry:
|
||||
_mcp_lines = []
|
||||
for action_name, info in mcp_registry.items():
|
||||
_mcp_lines.append(
|
||||
f" - name: {action_name}"
|
||||
f" | provider: {info['provider']}"
|
||||
f" | tool: {info['tool']}"
|
||||
f" | risk: {info['risk']}"
|
||||
f" | label: {info['label']}"
|
||||
)
|
||||
_mcp_section = (
|
||||
"\n可用 MCP actions(請從此清單選擇 recommended_actions):\n"
|
||||
+ "\n".join(_mcp_lines)
|
||||
+ "\n"
|
||||
)
|
||||
else:
|
||||
_mcp_section = "\n(MCP action 清單暫不可用,recommended_actions 可留空陣列)\n"
|
||||
|
||||
return f"""你是 AWOOOI SRE 系統的軍師 Agent,專職修復方案設計。
|
||||
|
||||
根因假設:{context.get("hypothesis", "")}
|
||||
告警類別:{context.get("category", "")}
|
||||
診斷信心:{context.get("confidence", 0.0):.0%}
|
||||
{_inventory_section}{_non_k8s_warning}
|
||||
你的工作:為此根因提出 1-3 個修復候選方案。
|
||||
每個方案必須評估:
|
||||
- blast_radius(0-100):影響範圍(越高 = 風險越大)
|
||||
- rollback_cost(0-100):回滾難度(越高 = 越難還原)
|
||||
{_inventory_section}{_non_k8s_warning}{_mcp_section}
|
||||
你的工作:為此根因提出 1-3 個修復候選方案,同時輸出 0-3 個結構化 recommended_actions。
|
||||
|
||||
candidates 格式規則:
|
||||
- action 欄位必須是真實的 kubectl 命令(不可用自然語言描述)
|
||||
- 目標資源格式:deployment/<name>,命名空間統一用 awoooi-prod
|
||||
- 每個方案必須評估 blast_radius(0-100 影響範圍)和 rollback_cost(0-100 回滾難度)
|
||||
|
||||
blast_radius 參考:
|
||||
- kubectl rollout restart deployment = 10
|
||||
@@ -295,8 +486,12 @@ blast_radius 參考:
|
||||
- kubectl delete deployment = 75
|
||||
- kubectl delete pvc = 95
|
||||
|
||||
🔴 關鍵規則:action 欄位必須是真實的 kubectl 命令,不可用自然語言描述。
|
||||
目標資源格式:deployment/<name>,命名空間統一用 awoooi-prod。
|
||||
recommended_actions 規則(北極星 §1.1 修復多樣性):
|
||||
- 不要全部是 restart 類動作,至少 1 個是查看 log 或診斷類(低風險)
|
||||
- mcp_provider 必須是以下之一:k8s | ssh | prometheus | signoz | database | internal
|
||||
- risk 必須是以下之一:low | medium | high | critical
|
||||
- critical risk 的動作必須在 reasoning 說明原因
|
||||
- params 中可使用模板:{{labels.namespace}} / {{labels.pod}} / {{incident_id}}
|
||||
|
||||
以 JSON 回覆:
|
||||
{{
|
||||
@@ -307,6 +502,35 @@ blast_radius 參考:
|
||||
"rollback_cost": 5,
|
||||
"confidence": 0.8,
|
||||
"rationale": "重啟可清除 OOM 導致的記憶體碎片化"
|
||||
}},
|
||||
{{
|
||||
"action": "kubectl top pods -n awoooi-prod --sort-by=memory",
|
||||
"blast_radius": 0,
|
||||
"rollback_cost": 0,
|
||||
"confidence": 0.9,
|
||||
"rationale": "先確認哪個 Pod 記憶體使用最高再決定操作"
|
||||
}}
|
||||
],
|
||||
"recommended_actions": [
|
||||
{{
|
||||
"name": "check_pod_logs",
|
||||
"label": "查 Pod Log",
|
||||
"emoji": "📋",
|
||||
"mcp_provider": "k8s",
|
||||
"mcp_tool": "k8s_get_pod_logs",
|
||||
"params": {{"namespace": "awoooi-prod", "pod": "{{labels.pod}}", "tail_lines": "50"}},
|
||||
"risk": "low",
|
||||
"reasoning": "先查 log 確認 OOM 根因,避免盲目重啟"
|
||||
}},
|
||||
{{
|
||||
"name": "k8s_restart",
|
||||
"label": "重啟",
|
||||
"emoji": "🔄",
|
||||
"mcp_provider": "k8s",
|
||||
"mcp_tool": "kubectl_restart",
|
||||
"params": {{"namespace": "awoooi-prod", "deployment": "{{labels.deployment}}"}},
|
||||
"risk": "medium",
|
||||
"reasoning": "確認 OOM 後重啟清除記憶體碎片"
|
||||
}}
|
||||
]
|
||||
}}"""
|
||||
@@ -323,23 +547,29 @@ blast_radius 參考:
|
||||
latency_ms: int,
|
||||
reason: str = "unknown",
|
||||
) -> ActionPlan:
|
||||
"""熔斷降級:rule-based mock(依 category 推 RESTART 兜底)"""
|
||||
category = diagnosis.top_hypothesis.category if diagnosis.top_hypothesis else "Unknown"
|
||||
fallback_action = _default_action_for_category(category)
|
||||
"""
|
||||
熔斷降級:graceful degraded(candidates=[], recommended_actions=[], degraded=True)
|
||||
|
||||
2026-04-27 Claude Sonnet 4.6: B1 — 砍掉 rule-based mock RESTART 兜底
|
||||
舊行為:LLM 失敗 → category-based RESTART mock(hardcode 違反北極星 §1.1)
|
||||
新行為:candidates=[],recommended_actions=[],degraded=True,明確標記降級
|
||||
- 不假造任何動作(Coordinator 會因 candidates=[] 進入人工審核路徑)
|
||||
- Telegram 顯示「AI 降級,請人工研判」而非錯誤的 hardcode 建議
|
||||
|
||||
北極星 §1.1「禁止寫死規則」— 降級不等於兜底假造,是誠實的能力邊界聲明。
|
||||
"""
|
||||
logger.warning(
|
||||
"solver_degraded",
|
||||
reason=reason,
|
||||
snapshot_id=diagnosis.evidence_snapshot_id if diagnosis else None,
|
||||
)
|
||||
return ActionPlan(
|
||||
candidates=[
|
||||
CandidateAction(
|
||||
action=fallback_action,
|
||||
blast_radius=20,
|
||||
rollback_cost=5,
|
||||
confidence=0.2,
|
||||
rationale=f"[降級] LLM 分析失敗({reason}),使用類別 {category} 的預設兜底動作",
|
||||
)
|
||||
],
|
||||
candidates=[],
|
||||
diagnosis_report=diagnosis,
|
||||
latency_ms=latency_ms,
|
||||
vote=AgentVote.DEGRADED,
|
||||
degraded=True,
|
||||
recommended_actions=[],
|
||||
)
|
||||
|
||||
|
||||
@@ -627,7 +857,7 @@ def _extract_candidates(parsed: dict[str, Any]) -> list[CandidateAction]:
|
||||
rationale=f"[語意合成] Nemo 建議「{action_title[:80]}」→ 轉為 kubectl 指令",
|
||||
)]
|
||||
|
||||
# 缺乏資源名稱或無法合成 → return [](交由 _degraded_plan 輸出 category-based 調查指令)
|
||||
# 缺乏資源名稱或無法合成 → return [](交由 _degraded_plan 輸出 empty actions)
|
||||
if not _target and any(w in _at_lower for w in ("rollback", "undo", "restart", "重啟", "回滾", "還原", "重新啟動")):
|
||||
logger.warning(
|
||||
"solver_synthesis_insufficient_context",
|
||||
@@ -672,42 +902,6 @@ def _extract_candidates(parsed: dict[str, Any]) -> list[CandidateAction]:
|
||||
return candidates
|
||||
|
||||
|
||||
def _default_action_for_category(category: str) -> str:
    """Pick the read-only investigation command used when the Solver degrades.

    The category is lower-cased and matched by substring against an ordered
    rule table; the first rule with any matching keyword wins, and a generic
    pod listing is returned when nothing matches. Because order matters, a
    category containing keywords from several rules resolves to the earliest
    one (e.g. one containing both "memory" and "clickhouse" yields the
    memory command).

    History:
      2026-04-17 ogt + Claude Sonnet 4.6: switched to real kubectl commands.
      2026-04-24 ogt + Claude Sonnet 4.6: added non-K8s categories
      (clickhouse / database / sentry / host / node / infra).

    Args:
        category: Alert / hypothesis category string.

    Returns:
        A kubectl command line as a string.
    """
    ordered_rules = (
        # K8s workload layer
        (("pod", "kube", "crash"), "kubectl get pods -n awoooi-prod -o wide"),
        (("cpu", "load"), "kubectl top pods -n awoooi-prod --sort-by=cpu"),
        (("memory", "oom"), "kubectl top pods -n awoooi-prod --sort-by=memory"),
        (("network", "connect"), "kubectl get services -n awoooi-prod"),
        (
            ("disk", "storage", "pvc"),
            "kubectl exec -n awoooi-prod deployment/postgresql -- df -h",
        ),
        # External service layer (read-only diagnostics)
        (
            ("clickhouse", "sentry"),
            "kubectl get pods -n awoooi-prod -l app=sentry -o wide",
        ),
        (
            ("database", "postgres", "redis"),
            "kubectl get pods -n awoooi-prod -l tier=database -o wide",
        ),
        (
            ("rollback", "deploy", "version"),
            "kubectl rollout history deployment -n awoooi-prod",
        ),
        (
            ("latency", "slow", "timeout"),
            "kubectl top pods -n awoooi-prod --sort-by=cpu",
        ),
        # Host layer (host / node / infra): node information only
        (
            ("host", "node", "infra"),
            "kubectl describe nodes | grep -A5 'Conditions\\|Allocatable'",
        ),
    )

    haystack = category.lower()
    for keywords, command in ordered_rules:
        if any(keyword in haystack for keyword in keywords):
            return command
    return "kubectl get pods -n awoooi-prod -o wide"
|
||||
|
||||
|
||||
def compute_input_hash(diagnosis: DiagnosisReport) -> str:
|
||||
"""計算 Solver 輸入的 fingerprint。"""
|
||||
key = diagnosis.evidence_snapshot_id + (
|
||||
|
||||
Reference in New Issue
Block a user