feat(p3.1-t2): Tier-2 三服務感知強化 — Sentry 簽章 + DiagnosisAggregator + Solver actions test
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

Wave 8 P3.1-T2 三項感知強化(多 engineer 補完):

Sentry Webhook 簽章驗證:
- sentry_webhook.py: 接入 SentryWebhookService.verify_sentry_signature()
- 拒絕無效 sentry-hook-signature → 401 → 防偽造攻擊

DiagnosisAggregator Pod 深診斷整合:
- pre_decision_investigator.py: 新增 _collect_diagnosis_aggregator()
- ENABLE_DIAGNOSIS_AGGREGATOR feature flag 守衛(default=False)
- evidence_snapshot.py: extra_diagnosis 欄位 + build_summary 顯示
- timeout=3.0s + try/except 隔離(fail-soft)
- Conservative 策略:待重疊分析確認 vs PreDecisionInvestigator 不重複

config.py:
- 新增 ENABLE_DIAGNOSIS_AGGREGATOR Field(default=False,K8s ConfigMap 動態啟用)

Solver B1 補測(commit 7c726ebc 對應):
- test_solver_recommended_actions.py — 20 tests + 3 skipped
- 驗證結構化 recommended_actions(北極星 §1.1 修復多樣性 ≥ 40%)
- LLM 失敗 graceful degraded(candidates=[], degraded=True)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-Authored-By: Multiple Engineers (Wave 8 P3.1-T2) <noreply@anthropic.com>
This commit is contained in:
Your Name
2026-04-27 08:24:15 +08:00
parent 6de10cb073
commit 3a2cd15144
5 changed files with 693 additions and 0 deletions

View File

@@ -37,6 +37,11 @@ from src.services.anomaly_counter import get_anomaly_counter
from src.services.approval_db import get_approval_service
from src.services.openclaw_http_service import get_openclaw_http_service
from src.services.sentry_service import get_sentry_service
# 2026-04-27 P3.1-T2 by Claude — Tier-2 三服務感知強化:補 SentryWebhookService 簽章驗證
from src.services.sentry_webhook_service import (
SentrySignatureError,
verify_sentry_signature,
)
from src.services.telegram_gateway import get_telegram_gateway
from src.utils.timezone import now_taipei_iso
@@ -101,6 +106,15 @@ async def handle_sentry_error(
4. 回寫 Sentry Comment
"""
try:
# 2026-04-27 P3.1-T2 by Claude — Tier-2 三服務感知強化:接入 SentryWebhookService 簽章驗證
body = await request.body()
sig_header = request.headers.get("sentry-hook-signature", "")
try:
verify_sentry_signature(body, sig_header)
except SentrySignatureError as sig_err:
logger.warning("sentry_signature_rejected", error=str(sig_err))
raise HTTPException(status_code=401, detail=str(sig_err)) from sig_err
payload = await request.json()
logger.info(f"Received Sentry webhook: action={payload.get('action')}")

View File

@@ -513,6 +513,14 @@ class Settings(BaseSettings):
default=False,
description="ADR-095: 啟用 12-Agent ConsensusEngine weights預設關閉",
)
# 2026-04-27 P3.1-T2 by Claude — Tier-2 感知強化:DiagnosisAggregator 整合開關
# 預設關閉:DiagnosisAggregator 與 PreDecisionInvestigator 存在 K8s+SignOz 資料重疊,
# 待重疊分析完成(獨立審查任務)確認互補性後再啟用。
# 啟用方式:kubectl set env deployment/awoooi-api ENABLE_DIAGNOSIS_AGGREGATOR=true
ENABLE_DIAGNOSIS_AGGREGATOR: bool = Field(
default=False,
description="P3.1-T2: 啟用 DiagnosisAggregator 在 PreDecisionInvestigator 中補充 Pod 診斷(預設關閉,待重疊分析完成後評估)",
)
def get_tg_user_whitelist(self) -> list[int]:
"""Parse comma-separated or JSON array user IDs to list[int]"""

View File

@@ -92,6 +92,8 @@ class EvidenceSnapshot:
# Phase 4 ADR-084: 動態異常感官DynamicBaseline + LogAnomaly + TrendPredictor
# 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 8D 升級
anomaly_context: dict[str, Any] | None = None # Phase 4 動態異常上下文
# 2026-04-27 P3.1-T2 by Claude — DiagnosisAggregator Pod 深診斷補充(in-memory only,不持久化)
extra_diagnosis: str | None = None
# 感官品質
mcp_health: dict[str, bool] = field(default_factory=dict)
@@ -162,6 +164,9 @@ class EvidenceSnapshot:
parts.append(f"[依賴拓撲] {self.dependency_topology}")
if self.anomaly_context:
parts.append(f"[動態異常偵測]\n{self.anomaly_context}")
# 2026-04-27 P3.1-T2 by Claude — DiagnosisAggregator Pod 深診斷ENABLE_DIAGNOSIS_AGGREGATOR=true 時填入)
if self.extra_diagnosis:
parts.append(f"[Pod深診斷]\n{self.extra_diagnosis}")
# 感官品質報告
failed_tools = [t for t, ok in self.mcp_health.items() if not ok]

View File

@@ -149,6 +149,21 @@ class PreDecisionInvestigator:
except Exception:
logger.exception("phase4_anomaly_collect_error", incident_id=incident_id)
# 4.6 P3.1-T2 by Claude 2026-04-27 — DiagnosisAggregator Pod 深診斷(守門:ENABLE_DIAGNOSIS_AGGREGATOR)
# Conservative 策略:預設關閉,避免與 MCP sensor 重複收集 K8s+SignOz 資料。
# 待重疊分析完成確認互補性後,由統帥設定 ENABLE_DIAGNOSIS_AGGREGATOR=true 啟用。
try:
from src.core.config import settings as _settings
if _settings.ENABLE_DIAGNOSIS_AGGREGATOR:
await asyncio.wait_for(
self._collect_diagnosis_aggregator(snapshot, incident),
timeout=3.0,
)
except asyncio.TimeoutError:
logger.warning("diagnosis_aggregator_collect_timeout", incident_id=incident_id)
except Exception:
logger.warning("diagnosis_aggregator_collect_failed", incident_id=incident_id)
# 5. 組裝 summary
snapshot.evidence_summary = snapshot.build_summary()
@@ -171,6 +186,48 @@ class PreDecisionInvestigator:
)
return snapshot
async def _collect_diagnosis_aggregator(
    self,
    snapshot: EvidenceSnapshot,
    incident: "Incident",
) -> None:
    """Attach DiagnosisAggregator pod diagnostics to ``snapshot.extra_diagnosis``.

    Added for P3.1-T2 (2026-04-27). The pod name is read from the incident
    labels (``pod``, falling back to ``name``) and the namespace from the
    ``namespace`` label (falling back to ``awoooi-prod``). When no pod name
    can be determined the method logs and returns without touching the
    snapshot.

    The result only supplements the snapshot: it writes ``extra_diagnosis``
    and nothing else, so evidence gathered by other collectors is never
    overwritten.

    Args:
        snapshot: Evidence snapshot being assembled; mutated in place.
        incident: Incident whose labels identify the target pod.

    Raises:
        Whatever ``get_diagnosis_aggregator`` / ``collect_pod_diagnosis``
        raise; this method does no exception handling of its own.
    """
    # Imported lazily, as in the original, so the aggregator module is only
    # loaded when this optional path actually runs.
    from src.services.diagnosis_aggregator import get_diagnosis_aggregator

    # Cap on the text copied into the snapshot so it cannot crowd out the
    # main evidence summary.
    max_diagnosis_chars = 4000

    labels = _get_labels(incident)
    # ``or`` chaining (rather than nested ``.get`` defaults) also falls back
    # when a label is present but empty/None.
    pod_name = labels.get("pod") or labels.get("name") or ""
    namespace = labels.get("namespace") or "awoooi-prod"

    if not pod_name:
        logger.debug("diagnosis_aggregator_skip_no_pod", incident_id=snapshot.incident_id)
        return

    aggregator = get_diagnosis_aggregator()
    ctx = await aggregator.collect_pod_diagnosis(
        pod_name=pod_name,
        namespace=namespace,
    )

    prompt_ctx = ctx.get_llm_prompt_context()
    if prompt_ctx:
        snapshot.extra_diagnosis = prompt_ctx[:max_diagnosis_chars]

    logger.debug(
        "diagnosis_aggregator_collected",
        incident_id=snapshot.incident_id,
        pod=pod_name,
        signals=len(ctx.signals),
        highest_severity=ctx.highest_severity.value,
    )
async def _collect_phase4_anomalies(self, snapshot: EvidenceSnapshot) -> None:
"""
Phase 4 8D 感官增強:從 ProactiveInspector 快取 + LogAnomalyDetector

View File

@@ -0,0 +1,609 @@
"""
test_solver_recommended_actions.py
===================================
B1 — Solver 結構化動作 RecommendedAction schema 驗證 + 真實 NIM e2e 測試
2026-04-27 Claude Sonnet 4.6: B1 — Solver 結構化動作 (北極星 §1.1 修復多樣性 ≥ 40%)
測試範圍:
Unit — schema 驗證(_validate_recommended_action / _extract_recommended_actions)
Unit — _degraded_plan 改造(candidates=[], recommended_actions=[], degraded=True)
Unit — YAML MCP registry 動態載入
E2E — 真實 NIM (192.168.0.188:8088) 三類 incident:
HostDiskUsage / KubePodOOM / DatabaseConnectionFail
assert: len(recommended_actions) >= 1
至少 1 個非 restart 類動作(label/name 不含「重啟」「restart」)
遵循 feedback_no_mock_testing.md:
- 禁止 MagicMock/AsyncMock/unittest.mock.patch 虛構 LLM 呼叫
- 真實 NIM 不可達時 pytest.skip()(以 skipif 判斷 OpenClaw 連線)
"""
from __future__ import annotations
import sys
import os
# 確保 src 可找到
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../"))
import pytest
import asyncio
from src.agents.protocol import (
AgentVote,
DiagnosisReport,
Hypothesis,
RecommendedAction,
)
from src.agents.solver_agent import (
_extract_recommended_actions,
_load_mcp_tool_registry,
_validate_recommended_action,
get_solver_agent,
)
# =============================================================================
# Helpers
# =============================================================================
def _make_diagnosis(
    incident_id: str,
    hypothesis: str,
    category: str,
    confidence: float = 0.85,
) -> DiagnosisReport:
    """Build a minimal, non-degraded DiagnosisReport to feed the solver.

    Nothing is mocked here: the report is a plain input structure holding a
    single hypothesis, with ``incident_id`` used as the evidence snapshot id.
    """
    sole_hypothesis = Hypothesis(
        description=hypothesis,
        confidence=confidence,
        evidence_chain=["test_evidence"],
        category=category,
    )
    report = DiagnosisReport(
        hypotheses=[sole_hypothesis],
        evidence_snapshot_id=incident_id,
        latency_ms=100,
        vote=AgentVote.APPROVE,
        degraded=False,
    )
    return report
async def _nim_reachable(
    health_url: str = "http://192.168.0.188:8088/health",
    timeout: float = 5.0,
) -> bool:
    """Report whether the real NIM endpoint should be used for E2E tests.

    Returns False (meaning: skip the E2E test) when either condition holds:

    1. The ``MOCK_MODE`` environment variable equals ``true``
       (case-insensitive). The original note says conftest.py sets this by
       default so the suite does not make real calls.
    2. The health probe fails for any reason (``httpx`` missing, network
       error, timeout) or answers with a 5xx status.

    To run the real E2E tests locally:
        MOCK_MODE=false pytest tests/test_solver_recommended_actions.py -k e2e_nim

    Args:
        health_url: Health endpoint to probe. Defaults to the NIM host the
            E2E tests target.
        timeout: Client timeout in seconds for the probe.

    Returns:
        True only when mock mode is off and the probe returned a status
        code below 500.
    """
    # 1. Mock mode wins: no network traffic at all.
    if os.environ.get("MOCK_MODE", "").lower() == "true":
        return False

    # 2. Connectivity probe. Any failure is treated as "not reachable"
    #    because the only consumer is a skip decision.
    try:
        import httpx  # lazy: a missing httpx simply skips the E2E tests

        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.get(health_url)
    except Exception:
        return False
    return response.status_code < 500
# =============================================================================
# Unit — _validate_recommended_action schema 驗證
# =============================================================================
class TestValidateRecommendedAction:
    """Schema-level unit tests for ``_validate_recommended_action``."""

    @staticmethod
    def _payload(**overrides):
        """Return a well-formed raw action dict with ``overrides`` applied.

        Key order matches the literal dicts these tests were written with.
        """
        payload = {
            "name": "test_action",
            "label": "測試",
            "emoji": "🔍",
            "mcp_provider": "k8s",
            "mcp_tool": "k8s_get_pod_logs",
            "params": {},
            "risk": "low",
            "reasoning": "test",
        }
        payload.update(overrides)
        return payload

    def test_valid_action_passes(self):
        """A fully valid dict is converted into a RecommendedAction."""
        candidate = self._payload(
            name="check_pod_logs",
            label="查 Pod Log",
            emoji="📋",
            params={"namespace": "awoooi-prod", "pod": "{labels.pod}"},
            reasoning="先查 log 確認根因",
        )
        action = _validate_recommended_action(candidate)
        assert action is not None
        assert isinstance(action, RecommendedAction)
        assert action.name == "check_pod_logs"
        assert action.mcp_provider == "k8s"
        assert action.risk == "low"

    def test_missing_name_returns_none(self):
        """A dict without ``name`` is rejected (nothing is fabricated)."""
        candidate = self._payload(
            label="查 Pod Log",
            emoji="📋",
            reasoning="先查 log",
        )
        del candidate["name"]
        assert _validate_recommended_action(candidate) is None

    def test_invalid_provider_returns_none(self):
        """An ``mcp_provider`` outside the whitelist is rejected."""
        candidate = self._payload(
            name="hack",
            label="惡意",
            emoji="⚠️",
            mcp_provider="evil_provider",  # not whitelisted
            mcp_tool="run_rm_rf",
        )
        assert _validate_recommended_action(candidate) is None

    def test_invalid_risk_returns_none(self):
        """A ``risk`` outside the whitelist is rejected."""
        candidate = self._payload(risk="apocalypse")  # not whitelisted
        assert _validate_recommended_action(candidate) is None

    def test_non_dict_input_returns_none(self):
        """Non-dict inputs are rejected."""
        for bogus in ("not a dict", None, 42):
            assert _validate_recommended_action(bogus) is None

    def test_params_non_dict_coerced_to_empty(self):
        """A non-dict ``params`` is coerced to ``{}`` instead of crashing."""
        candidate = self._payload(
            mcp_provider="ssh",
            mcp_tool="ssh_get_top_processes",
            params="not_a_dict",  # wrong type on purpose
        )
        action = _validate_recommended_action(candidate)
        assert action is not None
        assert action.params == {}

    def test_all_valid_providers(self):
        """Every whitelisted ``mcp_provider`` passes validation."""
        for provider in ("k8s", "ssh", "prometheus", "signoz", "database", "internal"):
            candidate = self._payload(
                name=f"test_{provider}",
                label=f"測試 {provider}",
                mcp_provider=provider,
                mcp_tool="some_tool",
            )
            action = _validate_recommended_action(candidate)
            assert action is not None, f"provider={provider} 應通過驗證"

    def test_all_valid_risk_levels(self):
        """Every whitelisted ``risk`` level passes validation."""
        for risk in ("low", "medium", "high", "critical"):
            candidate = self._payload(
                name=f"test_{risk}",
                label=f"測試 {risk}",
                mcp_tool="some_tool",
                risk=risk,
            )
            action = _validate_recommended_action(candidate)
            assert action is not None, f"risk={risk} 應通過驗證"
# =============================================================================
# Unit — _extract_recommended_actions 批量驗證
# =============================================================================
class TestExtractRecommendedActions:
    """Batch-validation unit tests for ``_extract_recommended_actions``."""

    @staticmethod
    def _action(name, label, emoji, provider, tool, params, risk, reasoning):
        """Assemble one raw action dict in the canonical key order."""
        return {
            "name": name,
            "label": label,
            "emoji": emoji,
            "mcp_provider": provider,
            "mcp_tool": tool,
            "params": params,
            "risk": risk,
            "reasoning": reasoning,
        }

    def test_empty_list_returns_empty(self):
        """An empty ``recommended_actions`` list yields ``[]``."""
        assert _extract_recommended_actions({"recommended_actions": []}) == []

    def test_missing_key_returns_empty(self):
        """A payload without the ``recommended_actions`` key yields ``[]``."""
        assert _extract_recommended_actions({"candidates": []}) == []

    def test_non_list_returns_empty(self):
        """A non-list ``recommended_actions`` value yields ``[]``."""
        assert _extract_recommended_actions({"recommended_actions": "not_a_list"}) == []

    def test_valid_actions_extracted(self):
        """All valid actions are extracted, in order."""
        log_check = self._action(
            "check_pod_logs", "查 Pod Log", "📋", "k8s", "k8s_get_pod_logs",
            {"namespace": "awoooi-prod"}, "low", "先查 log",
        )
        restart = self._action(
            "k8s_restart", "重啟", "🔄", "k8s", "kubectl_restart",
            {"namespace": "awoooi-prod"}, "medium", "確認 OOM 後重啟",
        )
        extracted = _extract_recommended_actions(
            {"recommended_actions": [log_check, restart]}
        )
        assert len(extracted) == 2
        assert extracted[0].name == "check_pod_logs"
        assert extracted[1].name == "k8s_restart"

    def test_mixed_valid_invalid_skips_invalid(self):
        """Invalid entries are skipped; nothing is fabricated in their place."""
        first_ok = self._action(
            "valid_action", "合法", "", "k8s", "k8s_get_pod_logs", {}, "low", "ok",
        )
        # No ``mcp_tool`` (and no ``params``) -> must be treated as invalid.
        broken = {
            "name": "invalid_action",
            "label": "非法",
            "emoji": "",
            "mcp_provider": "k8s",
            "risk": "low",
            "reasoning": "bad",
        }
        second_ok = self._action(
            "another_valid", "另一合法", "🔍", "ssh", "ssh_get_top_processes",
            {}, "low", "ok",
        )
        extracted = _extract_recommended_actions(
            {"recommended_actions": [first_ok, broken, second_ok]}
        )
        assert len(extracted) == 2, "應 skip 非法,只返回 2 個合法 action"
        seen_names = {item.name for item in extracted}
        assert "valid_action" in seen_names
        assert "another_valid" in seen_names
        assert "invalid_action" not in seen_names

    def test_max_3_actions_enforced(self):
        """More than three valid actions are capped at three."""
        five_actions = [
            self._action(
                f"action_{idx}", f"動作 {idx}", "🔍", "k8s", "k8s_get_pod_logs",
                {}, "low", f"reason {idx}",
            )
            for idx in range(5)
        ]
        extracted = _extract_recommended_actions({"recommended_actions": five_actions})
        assert len(extracted) == 3, "最多取 3 個 recommended_actions"
# =============================================================================
# Unit — _degraded_plan 改造驗證
# =============================================================================
class TestDegradedPlanGraceful:
    """Verify that ``_degraded_plan`` no longer fabricates a hard-coded RESTART."""

    @pytest.mark.asyncio
    async def test_degraded_plan_empty_candidates_and_actions(self):
        """A degraded plan must be empty and explicitly flagged.

        Expectations:
        - ``candidates == []`` (no fabricated RESTART)
        - ``recommended_actions == []`` (no fabricated actions)
        - ``degraded is True``
        - ``vote == AgentVote.DEGRADED``

        North Star §1.1: no hard-coded rules; degrading is an honest
        statement of the capability boundary.
        """
        from src.agents.solver_agent import SolverAgent

        agent = SolverAgent()
        diagnosis = _make_diagnosis(
            "TEST-DEGRADED-001",
            "磁碟使用率超過 90%",
            "HostDiskUsage",
        )
        plan = agent._degraded_plan(diagnosis, latency_ms=500, reason="test_unit")

        assert plan.degraded is True, "降級計畫必須標記 degraded=True"
        assert plan.vote == AgentVote.DEGRADED, "降級計畫 vote 必須為 DEGRADED"
        assert plan.candidates == [], (
            f"降級計畫 candidates 必須為空(不假造 RESTART實際{plan.candidates}"
        )
        assert plan.recommended_actions == [], (
            f"降級計畫 recommended_actions 必須為空(不假造動作),實際:{plan.recommended_actions}"
        )

    @pytest.mark.asyncio
    async def test_degraded_plan_no_hardcode_restart(self):
        """No degraded candidate may mention 'restart' / 'RESTART' / '重啟'.

        North Star §1.1: no hard-coded rules.
        """
        from src.agents.solver_agent import SolverAgent

        agent = SolverAgent()
        restart_markers = ("restart", "重啟")

        for category in ["HostDiskUsage", "KubePodOOMKilled", "DatabaseConnectionFail"]:
            diagnosis = _make_diagnosis(
                f"TEST-NOREST-{category}",
                f"測試 {category} 降級",
                category,
            )
            plan = agent._degraded_plan(diagnosis, latency_ms=0, reason="no_restart_test")

            # Checked before the emptiness assertion so that a fabricated
            # restart is reported with its own, more specific message. Every
            # restart spelling counts, including "rollout restart".
            for c in plan.candidates:
                action_lower = c.action.lower()
                assert not any(marker in action_lower for marker in restart_markers), (
                    f"category={category}: 降級不應有 hardcode restart 動作,實際:{c.action}"
                )

            # The key requirement: no candidates at all (nothing fabricated).
            assert plan.candidates == [], (
                f"category={category}: 降級 candidates 必須為空,實際:{plan.candidates}"
            )
# =============================================================================
# Unit — YAML MCP registry 動態載入
# =============================================================================
class TestMcpRegistryLoader:
    """Tests for the dynamic loading done by ``_load_mcp_tool_registry``."""

    def test_registry_loads_successfully(self):
        """The YAML loads and produces a non-empty dict."""
        loaded = _load_mcp_tool_registry()
        assert isinstance(loaded, dict)
        assert len(loaded) > 0, "callback_action_spec.yaml 應有 action 定義"

    def test_registry_contains_required_actions(self):
        """The required actions are present in the registry."""
        loaded = _load_mcp_tool_registry()
        for action in ("check_pod_logs", "k8s_restart", "describe_pod"):
            assert action in loaded, f"action '{action}' 應在 registry 中"

    def test_registry_entries_have_required_fields(self):
        """Every entry carries provider / tool / risk / label."""
        loaded = _load_mcp_tool_registry()
        mandatory_keys = ("provider", "tool", "risk", "label")
        for name, info in loaded.items():
            for key in mandatory_keys:
                assert key in info, f"action '{name}' 缺少 {key}"

    def test_registry_all_providers_valid(self):
        """Every entry's provider is a member of ``_VALID_MCP_PROVIDERS``."""
        from src.agents.solver_agent import _VALID_MCP_PROVIDERS

        loaded = _load_mcp_tool_registry()
        for name, info in loaded.items():
            provider = info.get("provider", "")
            assert provider in _VALID_MCP_PROVIDERS, (
                f"action '{name}' 的 provider='{provider}' 不在合法清單 {_VALID_MCP_PROVIDERS}"
            )
# =============================================================================
# E2E — 真實 NIM 三類 incident 測試
# =============================================================================
@pytest.mark.asyncio
async def test_e2e_nim_host_disk_usage():
    """E2E: HostDiskUsage (host disk usage too high), real NIM call, no mocks.

    Acceptance:
    - at least one recommended action
    - at least one action that is not a restart
    - a degraded plan must be cleanly empty, after which the test is skipped
    """
    nim_up = await _nim_reachable()
    if not nim_up:
        pytest.skip("NIM (192.168.0.188:8088) 不可達,跳過 E2E 測試")

    report = _make_diagnosis(
        "TEST-E2E-HOST-DISK-001",
        "主機 192.168.0.121 根目錄磁碟使用率達 91%/var/lib/docker overlay2 目錄佔用最多",
        "HostDiskUsage",
        confidence=0.88,
    )
    solver = get_solver_agent()
    plan = await solver.run(report)

    # Baseline: the run produced a plan that still references its diagnosis.
    assert plan is not None
    assert plan.diagnosis_report is not None

    if plan.degraded:
        # NIM degraded (timeout or failed response): the plan must be clean.
        assert plan.candidates == [], f"降級時 candidates 必須為空:{plan.candidates}"
        assert plan.recommended_actions == [], f"降級時 recommended_actions 必須為空:{plan.recommended_actions}"
        pytest.skip(f"NIM 回應降級({plan.vote}),跳過 recommended_actions 驗收")

    # Happy path.
    assert len(plan.recommended_actions) >= 1, (
        f"HostDiskUsage 應至少有 1 個 recommended_action實際{plan.recommended_actions}"
    )

    # Diversity: keep only actions whose name/label never mention a restart.
    diverse = []
    for act in plan.recommended_actions:
        mentions_restart = (
            "restart" in act.name.lower()
            or "restart" in act.label.lower()
            or "重啟" in act.label
        )
        if not mentions_restart:
            diverse.append(act)
    assert len(diverse) >= 1, (
        f"HostDiskUsage 應至少 1 個非 restart 動作(多樣性要求),實際:{[a.name for a in plan.recommended_actions]}"
    )

    # Schema completeness for every returned action.
    allowed_providers = {"k8s", "ssh", "prometheus", "signoz", "database", "internal"}
    allowed_risks = {"low", "medium", "high", "critical"}
    for action in plan.recommended_actions:
        assert action.name, "name 不可為空"
        assert action.label, "label 不可為空"
        assert action.mcp_provider in allowed_providers, (
            f"mcp_provider={action.mcp_provider} 不在白名單"
        )
        assert action.risk in allowed_risks, (
            f"risk={action.risk} 不在白名單"
        )
@pytest.mark.asyncio
async def test_e2e_nim_kube_pod_oom():
    """E2E: KubePodOOM (pod OOM-killed), real NIM call, no mocks.

    Acceptance:
    - at least one recommended action
    - at least one non-restart action (e.g. inspect logs or resource usage)
    """
    nim_up = await _nim_reachable()
    if not nim_up:
        pytest.skip("NIM (192.168.0.188:8088) 不可達,跳過 E2E 測試")

    report = _make_diagnosis(
        "TEST-E2E-OOM-001",
        "awoooi-api pod 在 awoooi-prod namespace 因記憶體超限被 OOM Killer 終止,"
        "記憶體限制 512Mi實際峰值 587Mi過去 1h 發生 3 次",
        "KubePodOOMKilled",
        confidence=0.92,
    )
    solver = get_solver_agent()
    plan = await solver.run(report)
    assert plan is not None

    if plan.degraded:
        # A degraded plan must be cleanly empty; acceptance is then skipped.
        assert plan.candidates == []
        assert plan.recommended_actions == []
        pytest.skip(f"NIM 回應降級({plan.vote}),跳過 recommended_actions 驗收")

    assert len(plan.recommended_actions) >= 1, (
        f"KubePodOOM 應至少有 1 個 recommended_action實際{plan.recommended_actions}"
    )

    # Diversity: keep only actions whose name/label never mention a restart.
    diverse = []
    for act in plan.recommended_actions:
        mentions_restart = (
            "restart" in act.name.lower()
            or "restart" in act.label.lower()
            or "重啟" in act.label
        )
        if not mentions_restart:
            diverse.append(act)
    assert len(diverse) >= 1, (
        f"KubePodOOM 應至少 1 個非 restart 動作,實際:{[a.name for a in plan.recommended_actions]}"
    )

    allowed_providers = {"k8s", "ssh", "prometheus", "signoz", "database", "internal"}
    allowed_risks = {"low", "medium", "high", "critical"}
    for action in plan.recommended_actions:
        assert action.mcp_provider in allowed_providers
        assert action.risk in allowed_risks
@pytest.mark.asyncio
async def test_e2e_nim_database_connection_fail():
    """E2E: DatabaseConnectionFail (DB connection failure), real NIM call, no mocks.

    Acceptance:
    - at least one recommended action
    - at least one non-restart action (e.g. inspect connections or DB state)
    """
    nim_up = await _nim_reachable()
    if not nim_up:
        pytest.skip("NIM (192.168.0.188:8088) 不可達,跳過 E2E 測試")

    report = _make_diagnosis(
        "TEST-E2E-DB-CONN-001",
        "PostgreSQL 資料庫連線池耗盡,最大連線數 100/100 均被佔用,"
        "awoooi-api 回應 'connection pool exhausted'P95 latency 飆升至 12s",
        "DatabaseConnectionFail",
        confidence=0.86,
    )
    solver = get_solver_agent()
    plan = await solver.run(report)
    assert plan is not None

    if plan.degraded:
        # A degraded plan must be cleanly empty; acceptance is then skipped.
        assert plan.candidates == []
        assert plan.recommended_actions == []
        pytest.skip(f"NIM 回應降級({plan.vote}),跳過 recommended_actions 驗收")

    assert len(plan.recommended_actions) >= 1, (
        f"DatabaseConnectionFail 應至少有 1 個 recommended_action實際{plan.recommended_actions}"
    )

    # Diversity: keep only actions whose name/label never mention a restart.
    diverse = []
    for act in plan.recommended_actions:
        mentions_restart = (
            "restart" in act.name.lower()
            or "restart" in act.label.lower()
            or "重啟" in act.label
        )
        if not mentions_restart:
            diverse.append(act)
    assert len(diverse) >= 1, (
        f"DatabaseConnectionFail 應至少 1 個非 restart 動作,實際:{[a.name for a in plan.recommended_actions]}"
    )

    allowed_providers = {"k8s", "ssh", "prometheus", "signoz", "database", "internal"}
    allowed_risks = {"low", "medium", "high", "critical"}
    for action in plan.recommended_actions:
        assert action.mcp_provider in allowed_providers
        assert action.risk in allowed_risks