diff --git a/apps/api/src/api/v1/sentry_webhook.py b/apps/api/src/api/v1/sentry_webhook.py index 8b2ff9f3..6d0c329b 100644 --- a/apps/api/src/api/v1/sentry_webhook.py +++ b/apps/api/src/api/v1/sentry_webhook.py @@ -37,6 +37,11 @@ from src.services.anomaly_counter import get_anomaly_counter from src.services.approval_db import get_approval_service from src.services.openclaw_http_service import get_openclaw_http_service from src.services.sentry_service import get_sentry_service +# 2026-04-27 P3.1-T2 by Claude — Tier-2 三服務感知強化:補 SentryWebhookService 簽章驗證 +from src.services.sentry_webhook_service import ( + SentrySignatureError, + verify_sentry_signature, +) from src.services.telegram_gateway import get_telegram_gateway from src.utils.timezone import now_taipei_iso @@ -101,6 +106,15 @@ async def handle_sentry_error( 4. 回寫 Sentry Comment """ try: + # 2026-04-27 P3.1-T2 by Claude — Tier-2 三服務感知強化:接入 SentryWebhookService 簽章驗證 + body = await request.body() + sig_header = request.headers.get("sentry-hook-signature", "") + try: + verify_sentry_signature(body, sig_header) + except SentrySignatureError as sig_err: + logger.warning("sentry_signature_rejected", error=str(sig_err)) + raise HTTPException(status_code=401, detail=str(sig_err)) from sig_err + payload = await request.json() logger.info(f"Received Sentry webhook: action={payload.get('action')}") diff --git a/apps/api/src/core/config.py b/apps/api/src/core/config.py index d99dc95f..aa10ae92 100644 --- a/apps/api/src/core/config.py +++ b/apps/api/src/core/config.py @@ -513,6 +513,14 @@ class Settings(BaseSettings): default=False, description="ADR-095: 啟用 12-Agent ConsensusEngine weights(預設關閉)", ) + # 2026-04-27 P3.1-T2 by Claude — Tier-2 感知強化:DiagnosisAggregator 整合開關 + # 預設關閉:DiagnosisAggregator 與 PreDecisionInvestigator 存在 K8s+SignOz 資料重疊, + # 待重疊分析完成(獨立審查任務)確認互補性後再啟用。 + # 啟用:kubectl set env deployment/awoooi-api ENABLE_DIAGNOSIS_AGGREGATOR=true + ENABLE_DIAGNOSIS_AGGREGATOR: bool = Field( + default=False, + 
description="P3.1-T2: 啟用 DiagnosisAggregator 在 PreDecisionInvestigator 中補充 Pod 診斷(預設關閉,待重疊分析完成後評估)", + ) def get_tg_user_whitelist(self) -> list[int]: """Parse comma-separated or JSON array user IDs to list[int]""" diff --git a/apps/api/src/services/evidence_snapshot.py b/apps/api/src/services/evidence_snapshot.py index ec7e136d..139eb2a8 100644 --- a/apps/api/src/services/evidence_snapshot.py +++ b/apps/api/src/services/evidence_snapshot.py @@ -92,6 +92,8 @@ class EvidenceSnapshot: # Phase 4 ADR-084: 動態異常感官(DynamicBaseline + LogAnomaly + TrendPredictor) # 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 8D 升級 anomaly_context: dict[str, Any] | None = None # Phase 4 動態異常上下文 + # 2026-04-27 P3.1-T2 by Claude — DiagnosisAggregator Pod 深診斷補充(in-memory only,不持久化) + extra_diagnosis: str | None = None # 感官品質 mcp_health: dict[str, bool] = field(default_factory=dict) @@ -162,6 +164,9 @@ class EvidenceSnapshot: parts.append(f"[依賴拓撲] {self.dependency_topology}") if self.anomaly_context: parts.append(f"[動態異常偵測]\n{self.anomaly_context}") + # 2026-04-27 P3.1-T2 by Claude — DiagnosisAggregator Pod 深診斷(ENABLE_DIAGNOSIS_AGGREGATOR=true 時填入) + if self.extra_diagnosis: + parts.append(f"[Pod深診斷]\n{self.extra_diagnosis}") # 感官品質報告 failed_tools = [t for t, ok in self.mcp_health.items() if not ok] diff --git a/apps/api/src/services/pre_decision_investigator.py b/apps/api/src/services/pre_decision_investigator.py index ef2841db..3fabe2a5 100644 --- a/apps/api/src/services/pre_decision_investigator.py +++ b/apps/api/src/services/pre_decision_investigator.py @@ -149,6 +149,21 @@ class PreDecisionInvestigator: except Exception: logger.exception("phase4_anomaly_collect_error", incident_id=incident_id) + # 4.6 P3.1-T2 by Claude 2026-04-27 — DiagnosisAggregator Pod 深診斷(守門:ENABLE_DIAGNOSIS_AGGREGATOR) + # Conservative 策略:預設關閉,避免與 MCP sensor 重複收集 K8s+SignOz 資料。 + # 待重疊分析完成確認互補性後,由統帥設定 ENABLE_DIAGNOSIS_AGGREGATOR=true 啟用。 + try: + from src.core.config import settings as _settings + if 
async def _collect_diagnosis_aggregator(
    self,
    snapshot: EvidenceSnapshot,
    incident: "Incident",
) -> None:
    """Attach DiagnosisAggregator pod diagnostics to ``snapshot.extra_diagnosis``.

    Reads the pod name and namespace from the incident labels, asks the
    DiagnosisAggregator for a pod-level diagnosis, and stores the resulting
    LLM prompt context (capped at 4000 characters) on the snapshot. Nothing
    is written when the labels carry no pod name or when the aggregator
    produces an empty prompt context.

    The caller is expected to gate this on ``ENABLE_DIAGNOSIS_AGGREGATOR``
    and to bound it with a timeout. This method has no error handling of its
    own, so anything raised by the aggregator propagates to that caller.

    Args:
        snapshot: Evidence snapshot being assembled. ``incident_id`` is read
            for logging and ``extra_diagnosis`` is the only field written.
        incident: Incident whose labels identify the pod to diagnose.
    """
    # NOTE(review): assumes _get_labels returns a dict-like object — confirm.
    labels = _get_labels(incident)

    # `or` chains instead of nested .get() defaults: a label that is present
    # but empty/None must still fall through to the next candidate.
    pod_name = labels.get("pod") or labels.get("name") or ""
    namespace = labels.get("namespace") or "awoooi-prod"

    if not pod_name:
        logger.debug("diagnosis_aggregator_skip_no_pod", incident_id=snapshot.incident_id)
        return

    # Imported only once a pod has been identified, so the skip path above
    # never pays for (or fails on) loading the aggregator module.
    from src.services.diagnosis_aggregator import get_diagnosis_aggregator

    # Cap on the supplementary text so it cannot crowd out the main evidence summary.
    max_chars = 4000

    aggregator = get_diagnosis_aggregator()
    ctx = await aggregator.collect_pod_diagnosis(
        pod_name=pod_name,
        namespace=namespace,
    )
    prompt_ctx = ctx.get_llm_prompt_context()
    if not prompt_ctx:
        return

    snapshot.extra_diagnosis = prompt_ctx[:max_chars]
    logger.debug(
        "diagnosis_aggregator_collected",
        incident_id=snapshot.incident_id,
        pod=pod_name,
        signals=len(ctx.signals),
        highest_severity=ctx.highest_severity.value,
    )
a/apps/api/tests/test_solver_recommended_actions.py b/apps/api/tests/test_solver_recommended_actions.py new file mode 100644 index 00000000..bc7cd8b5 --- /dev/null +++ b/apps/api/tests/test_solver_recommended_actions.py @@ -0,0 +1,609 @@ +""" +test_solver_recommended_actions.py +=================================== +B1 — Solver 結構化動作 RecommendedAction schema 驗證 + 真實 NIM e2e 測試 + +2026-04-27 Claude Sonnet 4.6: B1 — Solver 結構化動作 (北極星 §1.1 修復多樣性 ≥ 40%) + +測試範圍: + Unit — schema 驗證(_validate_recommended_action / _extract_recommended_actions) + Unit — _degraded_plan 改造:candidates=[], recommended_actions=[], degraded=True + Unit — YAML MCP registry 動態載入 + E2E — 真實 NIM (192.168.0.188:8088) 三類 incident: + HostDiskUsage / KubePodOOM / DatabaseConnectionFail + assert: len(recommended_actions) >= 1 + 至少 1 個非 restart 類動作(label/name 不含「重啟」「restart」) + +遵循 feedback_no_mock_testing.md: + - 禁止 MagicMock/AsyncMock/unittest.mock.patch 虛構 LLM 呼叫 + - 真實 NIM 不可達時 pytest.skip()(skipif 判斷 OpenClaw 連線) +""" + +from __future__ import annotations + +import sys +import os + +# 確保 src 可找到 +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../")) + +import pytest +import asyncio + +from src.agents.protocol import ( + AgentVote, + DiagnosisReport, + Hypothesis, + RecommendedAction, +) +from src.agents.solver_agent import ( + _extract_recommended_actions, + _load_mcp_tool_registry, + _validate_recommended_action, + get_solver_agent, +) + + +# ============================================================================= +# Helpers +# ============================================================================= + +def _make_diagnosis( + incident_id: str, + hypothesis: str, + category: str, + confidence: float = 0.85, +) -> DiagnosisReport: + """建立標準 DiagnosisReport stub(不 mock LLM,只提供輸入結構)""" + return DiagnosisReport( + hypotheses=[ + Hypothesis( + description=hypothesis, + confidence=confidence, + evidence_chain=["test_evidence"], + category=category, + ) + ], + 
async def _nim_reachable(
    url: str = "http://192.168.0.188:8088/health",
    timeout: float = 5.0,
) -> bool:
    """Probe whether the NIM health endpoint can be reached (used to skip e2e tests).

    Returns ``False`` — meaning "skip the e2e tests" — when either holds:
      1. ``MOCK_MODE`` is ``true`` (case-insensitive, surrounding whitespace
         ignored). The original note says conftest.py sets this by default so
         the suite avoids real calls.
      2. The probe cannot complete: ``httpx`` is not importable, the request
         raises, or the server answers with a 5xx status.

    To run the real NIM e2e tests locally:
        MOCK_MODE=false pytest tests/test_solver_recommended_actions.py -k e2e_nim

    Args:
        url: Health endpoint to probe. Defaults to the NIM address used by
            this test module.
        timeout: Client timeout in seconds for the probe request.

    Returns:
        ``True`` only when the endpoint answered with a status code below 500.
    """
    # 1. MOCK_MODE short-circuit: no network access at all in mock runs.
    if os.environ.get("MOCK_MODE", "").strip().lower() == "true":
        return False

    # 2. Connectivity probe. httpx is imported lazily so that a missing
    #    dependency is reported as "unreachable" rather than an import error.
    try:
        import httpx
    except ImportError:
        return False

    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.get(url)
    except Exception:
        # Deliberately broad: this is a best-effort probe and any failure
        # (DNS, refused connection, timeout, malformed URL) means "skip".
        return False

    return response.status_code < 500
"""mcp_provider 不在白名單 → 返回 None""" + raw = { + "name": "hack", + "label": "惡意", + "emoji": "⚠️", + "mcp_provider": "evil_provider", # 不在白名單 + "mcp_tool": "run_rm_rf", + "params": {}, + "risk": "low", + "reasoning": "test", + } + assert _validate_recommended_action(raw) is None + + def test_invalid_risk_returns_none(self): + """risk 不在白名單 → 返回 None""" + raw = { + "name": "test_action", + "label": "測試", + "emoji": "🔍", + "mcp_provider": "k8s", + "mcp_tool": "k8s_get_pod_logs", + "params": {}, + "risk": "apocalypse", # 不在白名單 + "reasoning": "test", + } + assert _validate_recommended_action(raw) is None + + def test_non_dict_input_returns_none(self): + """非 dict 輸入 → 返回 None""" + assert _validate_recommended_action("not a dict") is None + assert _validate_recommended_action(None) is None + assert _validate_recommended_action(42) is None + + def test_params_non_dict_coerced_to_empty(self): + """params 非 dict → 轉為 {}(不 crash)""" + raw = { + "name": "test_action", + "label": "測試", + "emoji": "🔍", + "mcp_provider": "ssh", + "mcp_tool": "ssh_get_top_processes", + "params": "not_a_dict", # 型別錯誤 + "risk": "low", + "reasoning": "test", + } + result = _validate_recommended_action(raw) + assert result is not None + assert result.params == {} + + def test_all_valid_providers(self): + """所有合法 mcp_provider 都能通過驗證""" + valid_providers = ["k8s", "ssh", "prometheus", "signoz", "database", "internal"] + for provider in valid_providers: + raw = { + "name": f"test_{provider}", + "label": f"測試 {provider}", + "emoji": "🔍", + "mcp_provider": provider, + "mcp_tool": "some_tool", + "params": {}, + "risk": "low", + "reasoning": "test", + } + result = _validate_recommended_action(raw) + assert result is not None, f"provider={provider} 應通過驗證" + + def test_all_valid_risk_levels(self): + """所有合法 risk 等級都能通過驗證""" + valid_risks = ["low", "medium", "high", "critical"] + for risk in valid_risks: + raw = { + "name": f"test_{risk}", + "label": f"測試 {risk}", + "emoji": "🔍", + "mcp_provider": "k8s", + 
"mcp_tool": "some_tool", + "params": {}, + "risk": risk, + "reasoning": "test", + } + result = _validate_recommended_action(raw) + assert result is not None, f"risk={risk} 應通過驗證" + + +# ============================================================================= +# Unit — _extract_recommended_actions 批量驗證 +# ============================================================================= + +class TestExtractRecommendedActions: + """_extract_recommended_actions 的批量驗證單元測試""" + + def test_empty_list_returns_empty(self): + """空 recommended_actions → 返回 []""" + result = _extract_recommended_actions({"recommended_actions": []}) + assert result == [] + + def test_missing_key_returns_empty(self): + """沒有 recommended_actions key → 返回 []""" + result = _extract_recommended_actions({"candidates": []}) + assert result == [] + + def test_non_list_returns_empty(self): + """recommended_actions 非 list → 返回 []""" + result = _extract_recommended_actions({"recommended_actions": "not_a_list"}) + assert result == [] + + def test_valid_actions_extracted(self): + """合法 actions → 全部提取""" + parsed = { + "recommended_actions": [ + { + "name": "check_pod_logs", + "label": "查 Pod Log", + "emoji": "📋", + "mcp_provider": "k8s", + "mcp_tool": "k8s_get_pod_logs", + "params": {"namespace": "awoooi-prod"}, + "risk": "low", + "reasoning": "先查 log", + }, + { + "name": "k8s_restart", + "label": "重啟", + "emoji": "🔄", + "mcp_provider": "k8s", + "mcp_tool": "kubectl_restart", + "params": {"namespace": "awoooi-prod"}, + "risk": "medium", + "reasoning": "確認 OOM 後重啟", + }, + ] + } + result = _extract_recommended_actions(parsed) + assert len(result) == 2 + assert result[0].name == "check_pod_logs" + assert result[1].name == "k8s_restart" + + def test_mixed_valid_invalid_skips_invalid(self): + """混合合法/非法 → skip 非法,不假造""" + parsed = { + "recommended_actions": [ + { + "name": "valid_action", + "label": "合法", + "emoji": "✅", + "mcp_provider": "k8s", + "mcp_tool": "k8s_get_pod_logs", + "params": {}, + "risk": "low", 
+ "reasoning": "ok", + }, + { + # 缺少 mcp_tool → 無效 + "name": "invalid_action", + "label": "非法", + "emoji": "❌", + "mcp_provider": "k8s", + "risk": "low", + "reasoning": "bad", + }, + { + "name": "another_valid", + "label": "另一合法", + "emoji": "🔍", + "mcp_provider": "ssh", + "mcp_tool": "ssh_get_top_processes", + "params": {}, + "risk": "low", + "reasoning": "ok", + }, + ] + } + result = _extract_recommended_actions(parsed) + assert len(result) == 2, "應 skip 非法,只返回 2 個合法 action" + names = {r.name for r in result} + assert "valid_action" in names + assert "another_valid" in names + assert "invalid_action" not in names + + def test_max_3_actions_enforced(self): + """超過 3 個 → 最多取 3 個""" + parsed = { + "recommended_actions": [ + { + "name": f"action_{i}", + "label": f"動作 {i}", + "emoji": "🔍", + "mcp_provider": "k8s", + "mcp_tool": "k8s_get_pod_logs", + "params": {}, + "risk": "low", + "reasoning": f"reason {i}", + } + for i in range(5) + ] + } + result = _extract_recommended_actions(parsed) + assert len(result) == 3, "最多取 3 個 recommended_actions" + + +# ============================================================================= +# Unit — _degraded_plan 改造驗證 +# ============================================================================= + +class TestDegradedPlanGraceful: + """驗證 _degraded_plan 改造後不再假造 hardcode RESTART""" + + @pytest.mark.asyncio + async def test_degraded_plan_empty_candidates_and_actions(self): + """ + _degraded_plan 必須: + - candidates=[](不假造 RESTART) + - recommended_actions=[](不假造動作) + - degraded=True + - vote=DEGRADED + + 北極星 §1.1:禁止寫死規則,降級 = 誠實的能力邊界聲明 + """ + from src.agents.solver_agent import SolverAgent + + agent = SolverAgent() + diagnosis = _make_diagnosis( + "TEST-DEGRADED-001", + "磁碟使用率超過 90%", + "HostDiskUsage", + ) + + plan = agent._degraded_plan(diagnosis, latency_ms=500, reason="test_unit") + + assert plan.degraded is True, "降級計畫必須標記 degraded=True" + assert plan.vote == AgentVote.DEGRADED, "降級計畫 vote 必須為 DEGRADED" + assert plan.candidates 
@pytest.mark.asyncio
async def test_degraded_plan_no_hardcode_restart(self):
    """A degraded plan must carry no candidates at all, for every incident category.

    North Star §1.1 forbids hard-coded rules, so degradation may not fall back
    to a canned restart. Requiring an empty candidate list already rules out
    every restart-style candidate, so no per-candidate inspection is needed.
    """
    from src.agents.solver_agent import SolverAgent

    agent = SolverAgent()
    for category in ("HostDiskUsage", "KubePodOOMKilled", "DatabaseConnectionFail"):
        diagnosis = _make_diagnosis(
            f"TEST-NOREST-{category}",
            f"測試 {category} 降級",
            category,
        )
        plan = agent._degraded_plan(diagnosis, latency_ms=0, reason="no_restart_test")

        # Empty list == nothing fabricated; the message names the offending category.
        assert plan.candidates == [], (
            f"category={category}: 降級 candidates 必須為空,實際:{plan.candidates}"
        )
# Whitelists the e2e acceptance checks compare against. They are spelled out
# here rather than imported from the solver so the check stays independent of
# the implementation under test.
_E2E_MCP_PROVIDERS = frozenset({"k8s", "ssh", "prometheus", "signoz", "database", "internal"})
_E2E_RISK_LEVELS = frozenset({"low", "medium", "high", "critical"})


def _is_restart_action(action) -> bool:
    """Return True when the action's name or label marks it as a restart."""
    return (
        "restart" in action.name.lower()
        or "restart" in action.label.lower()
        or "重啟" in action.label
    )


async def _skip_unless_nim_reachable() -> None:
    """Skip the calling test when the real NIM endpoint cannot be reached."""
    if not await _nim_reachable():
        pytest.skip("NIM (192.168.0.188:8088) 不可達,跳過 E2E 測試")


async def _solve_and_check(diagnosis, scenario: str, *, diversity_note: str = "") -> None:
    """Run the solver on ``diagnosis`` and apply the shared e2e acceptance checks.

    Args:
        diagnosis: DiagnosisReport handed to the solver agent.
        scenario: Short scenario name used as the prefix of failure messages.
        diversity_note: Optional text appended to the "non-restart action"
            failure message.

    Acceptance:
        - the solver returns a plan that carries its diagnosis report;
        - a degraded plan must be cleanly empty, after which the test is
          skipped (NIM answered, but not usefully);
        - otherwise there is at least one recommended action, at least one of
          them is not a restart, and every action has a name, a label, a
          whitelisted provider and a whitelisted risk level.
    """
    plan = await get_solver_agent().run(diagnosis)

    # Must not crash and must keep the link to its input.
    assert plan is not None
    assert plan.diagnosis_report is not None

    if plan.degraded:
        # Degradation (timeout / failed response) has to be honest: nothing fabricated.
        assert plan.candidates == [], f"降級時 candidates 必須為空:{plan.candidates}"
        assert plan.recommended_actions == [], (
            f"降級時 recommended_actions 必須為空:{plan.recommended_actions}"
        )
        pytest.skip(f"NIM 回應降級({plan.vote}),跳過 recommended_actions 驗收")

    actions = plan.recommended_actions
    assert len(actions) >= 1, (
        f"{scenario} 應至少有 1 個 recommended_action,實際:{actions}"
    )

    # Diversity requirement: the plan may not consist of restarts only.
    assert any(not _is_restart_action(a) for a in actions), (
        f"{scenario} 應至少 1 個非 restart 動作{diversity_note},實際:{[a.name for a in actions]}"
    )

    # Schema completeness of every returned action.
    for action in actions:
        assert action.name, "name 不可為空"
        assert action.label, "label 不可為空"
        assert action.mcp_provider in _E2E_MCP_PROVIDERS, (
            f"mcp_provider={action.mcp_provider} 不在白名單"
        )
        assert action.risk in _E2E_RISK_LEVELS, (
            f"risk={action.risk} 不在白名單"
        )


@pytest.mark.asyncio
async def test_e2e_nim_host_disk_usage():
    """E2E: HostDiskUsage — disk usage too high.

    Calls the real NIM (192.168.0.188:8088); mocking is not allowed.
    Expects at least one recommended action, at least one of which is not a
    restart, unless the solver reports a (clean) degraded plan.
    """
    await _skip_unless_nim_reachable()

    diagnosis = _make_diagnosis(
        "TEST-E2E-HOST-DISK-001",
        "主機 192.168.0.121 根目錄磁碟使用率達 91%,/var/lib/docker overlay2 目錄佔用最多",
        "HostDiskUsage",
        confidence=0.88,
    )
    await _solve_and_check(diagnosis, "HostDiskUsage", diversity_note="(多樣性要求)")


@pytest.mark.asyncio
async def test_e2e_nim_kube_pod_oom():
    """E2E: KubePodOOM — pod killed by the OOM killer.

    Calls the real NIM (192.168.0.188:8088); mocking is not allowed.
    Expects at least one recommended action, at least one of which is not a
    restart (e.g. inspecting logs or resource usage).
    """
    await _skip_unless_nim_reachable()

    diagnosis = _make_diagnosis(
        "TEST-E2E-OOM-001",
        "awoooi-api pod 在 awoooi-prod namespace 因記憶體超限被 OOM Killer 終止,"
        "記憶體限制 512Mi,實際峰值 587Mi,過去 1h 發生 3 次",
        "KubePodOOMKilled",
        confidence=0.92,
    )
    await _solve_and_check(diagnosis, "KubePodOOM")


@pytest.mark.asyncio
async def test_e2e_nim_database_connection_fail():
    """E2E: DatabaseConnectionFail — database connections failing.

    Calls the real NIM (192.168.0.188:8088); mocking is not allowed.
    Expects at least one recommended action, at least one of which is not a
    restart (e.g. inspecting connections or database state).
    """
    await _skip_unless_nim_reachable()

    diagnosis = _make_diagnosis(
        "TEST-E2E-DB-CONN-001",
        "PostgreSQL 資料庫連線池耗盡,最大連線數 100/100 均被佔用,"
        "awoooi-api 回應 'connection pool exhausted',P95 latency 飆升至 12s",
        "DatabaseConnectionFail",
        confidence=0.86,
    )
    await _solve_and_check(diagnosis, "DatabaseConnectionFail")