feat(p3.1-t2): Tier-2 三服務感知強化 — Sentry 簽章 + DiagnosisAggregator + Solver actions test
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Wave 8 P3.1-T2 三項感知強化(多 engineer 補完):
Sentry Webhook 簽章驗證:
- sentry_webhook.py: 接入 SentryWebhookService.verify_sentry_signature()
- 拒絕無效 sentry-hook-signature → 401 → 防偽造攻擊
DiagnosisAggregator Pod 深診斷整合:
- pre_decision_investigator.py: 新增 _collect_diagnosis_aggregator()
- ENABLE_DIAGNOSIS_AGGREGATOR feature flag 守衛(default=False)
- evidence_snapshot.py: extra_diagnosis 欄位 + build_summary 顯示
- timeout=3.0s + try/except 隔離(fail-soft)
- Conservative 策略:預設關閉;待重疊分析確認 DiagnosisAggregator 與 PreDecisionInvestigator 既有感官資料不重複收集後再啟用
config.py:
- 新增 ENABLE_DIAGNOSIS_AGGREGATOR Field(default=False,K8s ConfigMap 動態啟用)
Solver B1 補測(commit 7c726ebc 對應):
- test_solver_recommended_actions.py — 20 tests + 3 skipped
- 驗證結構化 recommended_actions(北極星 §1.1 修復多樣性 ≥ 40%)
- LLM 失敗 graceful degraded(candidates=[], degraded=True)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-Authored-By: Multiple Engineers (Wave 8 P3.1-T2) <noreply@anthropic.com>
This commit is contained in:
@@ -37,6 +37,11 @@ from src.services.anomaly_counter import get_anomaly_counter
|
||||
from src.services.approval_db import get_approval_service
|
||||
from src.services.openclaw_http_service import get_openclaw_http_service
|
||||
from src.services.sentry_service import get_sentry_service
|
||||
# 2026-04-27 P3.1-T2 by Claude — Tier-2 三服務感知強化:補 SentryWebhookService 簽章驗證
|
||||
from src.services.sentry_webhook_service import (
|
||||
SentrySignatureError,
|
||||
verify_sentry_signature,
|
||||
)
|
||||
from src.services.telegram_gateway import get_telegram_gateway
|
||||
from src.utils.timezone import now_taipei_iso
|
||||
|
||||
@@ -101,6 +106,15 @@ async def handle_sentry_error(
|
||||
4. 回寫 Sentry Comment
|
||||
"""
|
||||
try:
|
||||
# 2026-04-27 P3.1-T2 by Claude — Tier-2 三服務感知強化:接入 SentryWebhookService 簽章驗證
|
||||
body = await request.body()
|
||||
sig_header = request.headers.get("sentry-hook-signature", "")
|
||||
try:
|
||||
verify_sentry_signature(body, sig_header)
|
||||
except SentrySignatureError as sig_err:
|
||||
logger.warning("sentry_signature_rejected", error=str(sig_err))
|
||||
raise HTTPException(status_code=401, detail=str(sig_err)) from sig_err
|
||||
|
||||
payload = await request.json()
|
||||
logger.info(f"Received Sentry webhook: action={payload.get('action')}")
|
||||
|
||||
|
||||
@@ -513,6 +513,14 @@ class Settings(BaseSettings):
|
||||
default=False,
|
||||
description="ADR-095: 啟用 12-Agent ConsensusEngine weights(預設關閉)",
|
||||
)
|
||||
# 2026-04-27 P3.1-T2 by Claude — Tier-2 感知強化:DiagnosisAggregator 整合開關
|
||||
# 預設關閉:DiagnosisAggregator 與 PreDecisionInvestigator 存在 K8s+SignOz 資料重疊,
|
||||
# 待重疊分析完成(獨立審查任務)確認互補性後再啟用。
|
||||
# 啟用:kubectl set env deployment/awoooi-api ENABLE_DIAGNOSIS_AGGREGATOR=true
|
||||
ENABLE_DIAGNOSIS_AGGREGATOR: bool = Field(
|
||||
default=False,
|
||||
description="P3.1-T2: 啟用 DiagnosisAggregator 在 PreDecisionInvestigator 中補充 Pod 診斷(預設關閉,待重疊分析完成後評估)",
|
||||
)
|
||||
|
||||
def get_tg_user_whitelist(self) -> list[int]:
|
||||
"""Parse comma-separated or JSON array user IDs to list[int]"""
|
||||
|
||||
@@ -92,6 +92,8 @@ class EvidenceSnapshot:
|
||||
# Phase 4 ADR-084: 動態異常感官(DynamicBaseline + LogAnomaly + TrendPredictor)
|
||||
# 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 8D 升級
|
||||
anomaly_context: dict[str, Any] | None = None # Phase 4 動態異常上下文
|
||||
# 2026-04-27 P3.1-T2 by Claude — DiagnosisAggregator Pod 深診斷補充(in-memory only,不持久化)
|
||||
extra_diagnosis: str | None = None
|
||||
|
||||
# 感官品質
|
||||
mcp_health: dict[str, bool] = field(default_factory=dict)
|
||||
@@ -162,6 +164,9 @@ class EvidenceSnapshot:
|
||||
parts.append(f"[依賴拓撲] {self.dependency_topology}")
|
||||
if self.anomaly_context:
|
||||
parts.append(f"[動態異常偵測]\n{self.anomaly_context}")
|
||||
# 2026-04-27 P3.1-T2 by Claude — DiagnosisAggregator Pod 深診斷(ENABLE_DIAGNOSIS_AGGREGATOR=true 時填入)
|
||||
if self.extra_diagnosis:
|
||||
parts.append(f"[Pod深診斷]\n{self.extra_diagnosis}")
|
||||
|
||||
# 感官品質報告
|
||||
failed_tools = [t for t, ok in self.mcp_health.items() if not ok]
|
||||
|
||||
@@ -149,6 +149,21 @@ class PreDecisionInvestigator:
|
||||
except Exception:
|
||||
logger.exception("phase4_anomaly_collect_error", incident_id=incident_id)
|
||||
|
||||
# 4.6 P3.1-T2 by Claude 2026-04-27 — DiagnosisAggregator Pod 深診斷(守門:ENABLE_DIAGNOSIS_AGGREGATOR)
|
||||
# Conservative 策略:預設關閉,避免與 MCP sensor 重複收集 K8s+SignOz 資料。
|
||||
# 待重疊分析完成確認互補性後,由統帥設定 ENABLE_DIAGNOSIS_AGGREGATOR=true 啟用。
|
||||
try:
|
||||
from src.core.config import settings as _settings
|
||||
if _settings.ENABLE_DIAGNOSIS_AGGREGATOR:
|
||||
await asyncio.wait_for(
|
||||
self._collect_diagnosis_aggregator(snapshot, incident),
|
||||
timeout=3.0,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning("diagnosis_aggregator_collect_timeout", incident_id=incident_id)
|
||||
except Exception:
|
||||
logger.warning("diagnosis_aggregator_collect_failed", incident_id=incident_id)
|
||||
|
||||
# 5. 組裝 summary
|
||||
snapshot.evidence_summary = snapshot.build_summary()
|
||||
|
||||
@@ -171,6 +186,48 @@ class PreDecisionInvestigator:
|
||||
)
|
||||
return snapshot
|
||||
|
||||
async def _collect_diagnosis_aggregator(
    self,
    snapshot: EvidenceSnapshot,
    incident: "Incident",
) -> None:
    """Attach DiagnosisAggregator's pod diagnosis to ``snapshot.extra_diagnosis``.

    Intended to run only when ``ENABLE_DIAGNOSIS_AGGREGATOR`` is true. The flag
    is NOT checked here; the caller is expected to guard the call.

    The pod name is read from the incident labels (``pod``, falling back to
    ``name``) and the namespace from ``namespace`` (falling back to
    ``awoooi-prod``). A label that is present but empty or ``None`` is treated
    the same as a missing label, so the fallback still applies.

    Per the original notes, DiagnosisAggregator overlaps with the MCP sensors
    (D1_K8S_STATE / D3_METRICS); its output is stored in a separate field as a
    supplement and never overwrites MCP results.

    Args:
        snapshot: Evidence snapshot to enrich in place.
        incident: Incident whose labels identify the pod to diagnose.

    Returns:
        None. On success ``snapshot.extra_diagnosis`` holds at most 4000
        characters of the aggregator's LLM prompt context.

    Raises:
        Nothing is caught here: any error from the aggregator propagates to
        the caller.
    """
    from src.services.diagnosis_aggregator import get_diagnosis_aggregator

    labels = _get_labels(incident)
    # `or` chains (instead of nested .get defaults) so that an empty/None
    # "pod" or "namespace" label falls through to the next candidate.
    pod_name = labels.get("pod") or labels.get("name") or ""
    namespace = labels.get("namespace") or "awoooi-prod"

    if not pod_name:
        logger.debug("diagnosis_aggregator_skip_no_pod", incident_id=snapshot.incident_id)
        return

    aggregator = get_diagnosis_aggregator()
    ctx = await aggregator.collect_pod_diagnosis(
        pod_name=pod_name,
        namespace=namespace,
    )
    prompt_ctx = ctx.get_llm_prompt_context()
    if prompt_ctx:
        # Cap at 4K characters so the supplement cannot crowd out the main
        # evidence summary.
        snapshot.extra_diagnosis = prompt_ctx[:4000]
        # NOTE(review): the scraped source lost its indentation; this log call
        # is assumed to belong to the `if prompt_ctx:` branch — confirm.
        logger.debug(
            "diagnosis_aggregator_collected",
            incident_id=snapshot.incident_id,
            pod=pod_name,
            signals=len(ctx.signals),
            highest_severity=ctx.highest_severity.value,
        )
|
||||
|
||||
async def _collect_phase4_anomalies(self, snapshot: EvidenceSnapshot) -> None:
|
||||
"""
|
||||
Phase 4 8D 感官增強:從 ProactiveInspector 快取 + LogAnomalyDetector
|
||||
|
||||
609
apps/api/tests/test_solver_recommended_actions.py
Normal file
609
apps/api/tests/test_solver_recommended_actions.py
Normal file
@@ -0,0 +1,609 @@
|
||||
"""
|
||||
test_solver_recommended_actions.py
|
||||
===================================
|
||||
B1 — Solver 結構化動作 RecommendedAction schema 驗證 + 真實 NIM e2e 測試
|
||||
|
||||
2026-04-27 Claude Sonnet 4.6: B1 — Solver 結構化動作 (北極星 §1.1 修復多樣性 ≥ 40%)
|
||||
|
||||
測試範圍:
|
||||
Unit — schema 驗證(_validate_recommended_action / _extract_recommended_actions)
|
||||
Unit — _degraded_plan 改造:candidates=[], recommended_actions=[], degraded=True
|
||||
Unit — YAML MCP registry 動態載入
|
||||
E2E — 真實 NIM (192.168.0.188:8088) 三類 incident:
|
||||
HostDiskUsage / KubePodOOM / DatabaseConnectionFail
|
||||
assert: len(recommended_actions) >= 1
|
||||
至少 1 個非 restart 類動作(label/name 不含「重啟」「restart」)
|
||||
|
||||
遵循 feedback_no_mock_testing.md:
|
||||
- 禁止 MagicMock/AsyncMock/unittest.mock.patch 虛構 LLM 呼叫
|
||||
- 真實 NIM 不可達或 MOCK_MODE=true 時,於測試內呼叫 pytest.skip()(由 _nim_reachable() 探測,非 skipif 裝飾器)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
# 確保 src 可找到
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../"))
|
||||
|
||||
import pytest
|
||||
import asyncio
|
||||
|
||||
from src.agents.protocol import (
|
||||
AgentVote,
|
||||
DiagnosisReport,
|
||||
Hypothesis,
|
||||
RecommendedAction,
|
||||
)
|
||||
from src.agents.solver_agent import (
|
||||
_extract_recommended_actions,
|
||||
_load_mcp_tool_registry,
|
||||
_validate_recommended_action,
|
||||
get_solver_agent,
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helpers
|
||||
# =============================================================================
|
||||
|
||||
def _make_diagnosis(
    incident_id: str,
    hypothesis: str,
    category: str,
    confidence: float = 0.85,
) -> DiagnosisReport:
    """Build a single-hypothesis DiagnosisReport to feed the solver.

    This only supplies input structure; no LLM call is mocked.

    Args:
        incident_id: Stored as the report's ``evidence_snapshot_id``.
        hypothesis: Description text of the only hypothesis.
        category: Category assigned to the hypothesis.
        confidence: Confidence assigned to the hypothesis.

    Returns:
        A non-degraded report with an APPROVE vote and a fixed 100 ms latency.
    """
    sole_hypothesis = Hypothesis(
        description=hypothesis,
        confidence=confidence,
        evidence_chain=["test_evidence"],
        category=category,
    )
    report = DiagnosisReport(
        hypotheses=[sole_hypothesis],
        evidence_snapshot_id=incident_id,
        latency_ms=100,
        vote=AgentVote.APPROVE,
        degraded=False,
    )
    return report
|
||||
|
||||
|
||||
async def _nim_reachable(
    url: str = "http://192.168.0.188:8088/health",
    timeout: float = 5.0,
) -> bool:
    """Report whether the real NIM endpoint should be used by the E2E tests.

    Returns ``False`` (meaning: skip the E2E test) when either
      1. the ``MOCK_MODE`` environment variable equals "true"
         (case-insensitive) — per the original note, conftest.py sets this by
         default so the suite does not make real calls; or
      2. the health probe fails for any reason (network error, timeout,
         ``httpx`` not importable) or answers with a 5xx status.

    To run the real NIM E2E locally:
        MOCK_MODE=false pytest tests/test_solver_recommended_actions.py -k e2e_nim

    Args:
        url: Health endpoint to probe. Defaults to the lab NIM instance so
            existing zero-argument callers behave as before.
        timeout: Client timeout in seconds for the probe.

    Returns:
        ``True`` only when MOCK_MODE is off and the probe returns a status
        code below 500.
    """
    # 1. MOCK_MODE short-circuits before any network access.
    if os.environ.get("MOCK_MODE", "").lower() == "true":
        return False

    # 2. Connectivity probe. This is a best-effort check, so every failure
    #    (including a missing httpx) simply means "not reachable".
    try:
        import httpx

        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.get(url)
    except Exception:
        return False
    return response.status_code < 500
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Unit — _validate_recommended_action schema 驗證
|
||||
# =============================================================================
|
||||
|
||||
class TestValidateRecommendedAction:
    """Unit tests for the schema checks in ``_validate_recommended_action``."""

    def test_valid_action_passes(self):
        """A well-formed action dict is converted into a RecommendedAction."""
        raw = dict(
            name="check_pod_logs",
            label="查 Pod Log",
            emoji="📋",
            mcp_provider="k8s",
            mcp_tool="k8s_get_pod_logs",
            params={"namespace": "awoooi-prod", "pod": "{labels.pod}"},
            risk="low",
            reasoning="先查 log 確認根因",
        )
        result = _validate_recommended_action(raw)
        assert result is not None
        assert isinstance(result, RecommendedAction)
        assert result.name == "check_pod_logs"
        assert result.mcp_provider == "k8s"
        assert result.risk == "low"

    def test_missing_name_returns_none(self):
        """A dict without ``name`` is rejected (nothing is fabricated)."""
        raw = dict(
            label="查 Pod Log",
            emoji="📋",
            mcp_provider="k8s",
            mcp_tool="k8s_get_pod_logs",
            params={},
            risk="low",
            reasoning="先查 log",
        )
        assert _validate_recommended_action(raw) is None

    def test_invalid_provider_returns_none(self):
        """An ``mcp_provider`` outside the whitelist is rejected."""
        raw = dict(
            name="hack",
            label="惡意",
            emoji="⚠️",
            mcp_provider="evil_provider",  # not whitelisted
            mcp_tool="run_rm_rf",
            params={},
            risk="low",
            reasoning="test",
        )
        assert _validate_recommended_action(raw) is None

    def test_invalid_risk_returns_none(self):
        """A ``risk`` outside the whitelist is rejected."""
        raw = dict(
            name="test_action",
            label="測試",
            emoji="🔍",
            mcp_provider="k8s",
            mcp_tool="k8s_get_pod_logs",
            params={},
            risk="apocalypse",  # not whitelisted
            reasoning="test",
        )
        assert _validate_recommended_action(raw) is None

    def test_non_dict_input_returns_none(self):
        """Any non-dict input is rejected."""
        for bogus in ("not a dict", None, 42):
            assert _validate_recommended_action(bogus) is None

    def test_params_non_dict_coerced_to_empty(self):
        """A non-dict ``params`` is coerced to ``{}`` instead of crashing."""
        raw = dict(
            name="test_action",
            label="測試",
            emoji="🔍",
            mcp_provider="ssh",
            mcp_tool="ssh_get_top_processes",
            params="not_a_dict",  # wrong type on purpose
            risk="low",
            reasoning="test",
        )
        result = _validate_recommended_action(raw)
        assert result is not None
        assert result.params == {}

    def test_all_valid_providers(self):
        """Every whitelisted ``mcp_provider`` passes validation."""
        for provider in ("k8s", "ssh", "prometheus", "signoz", "database", "internal"):
            raw = dict(
                name=f"test_{provider}",
                label=f"測試 {provider}",
                emoji="🔍",
                mcp_provider=provider,
                mcp_tool="some_tool",
                params={},
                risk="low",
                reasoning="test",
            )
            result = _validate_recommended_action(raw)
            assert result is not None, f"provider={provider} 應通過驗證"

    def test_all_valid_risk_levels(self):
        """Every whitelisted ``risk`` level passes validation."""
        for risk in ("low", "medium", "high", "critical"):
            raw = dict(
                name=f"test_{risk}",
                label=f"測試 {risk}",
                emoji="🔍",
                mcp_provider="k8s",
                mcp_tool="some_tool",
                params={},
                risk=risk,
                reasoning="test",
            )
            result = _validate_recommended_action(raw)
            assert result is not None, f"risk={risk} 應通過驗證"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Unit — _extract_recommended_actions 批量驗證
|
||||
# =============================================================================
|
||||
|
||||
class TestExtractRecommendedActions:
    """Unit tests for batch validation in ``_extract_recommended_actions``."""

    def test_empty_list_returns_empty(self):
        """An empty ``recommended_actions`` list yields ``[]``."""
        extracted = _extract_recommended_actions({"recommended_actions": []})
        assert extracted == []

    def test_missing_key_returns_empty(self):
        """A payload without the ``recommended_actions`` key yields ``[]``."""
        extracted = _extract_recommended_actions({"candidates": []})
        assert extracted == []

    def test_non_list_returns_empty(self):
        """A non-list ``recommended_actions`` value yields ``[]``."""
        extracted = _extract_recommended_actions({"recommended_actions": "not_a_list"})
        assert extracted == []

    def test_valid_actions_extracted(self):
        """All well-formed actions are extracted, in order."""
        check_logs = dict(
            name="check_pod_logs",
            label="查 Pod Log",
            emoji="📋",
            mcp_provider="k8s",
            mcp_tool="k8s_get_pod_logs",
            params={"namespace": "awoooi-prod"},
            risk="low",
            reasoning="先查 log",
        )
        restart = dict(
            name="k8s_restart",
            label="重啟",
            emoji="🔄",
            mcp_provider="k8s",
            mcp_tool="kubectl_restart",
            params={"namespace": "awoooi-prod"},
            risk="medium",
            reasoning="確認 OOM 後重啟",
        )
        result = _extract_recommended_actions({"recommended_actions": [check_logs, restart]})
        assert len(result) == 2
        assert result[0].name == "check_pod_logs"
        assert result[1].name == "k8s_restart"

    def test_mixed_valid_invalid_skips_invalid(self):
        """Invalid entries are dropped; valid ones survive untouched."""
        first_valid = dict(
            name="valid_action",
            label="合法",
            emoji="✅",
            mcp_provider="k8s",
            mcp_tool="k8s_get_pod_logs",
            params={},
            risk="low",
            reasoning="ok",
        )
        # No mcp_tool -> must be treated as invalid.
        broken = dict(
            name="invalid_action",
            label="非法",
            emoji="❌",
            mcp_provider="k8s",
            risk="low",
            reasoning="bad",
        )
        second_valid = dict(
            name="another_valid",
            label="另一合法",
            emoji="🔍",
            mcp_provider="ssh",
            mcp_tool="ssh_get_top_processes",
            params={},
            risk="low",
            reasoning="ok",
        )
        parsed = {"recommended_actions": [first_valid, broken, second_valid]}
        result = _extract_recommended_actions(parsed)
        assert len(result) == 2, "應 skip 非法,只返回 2 個合法 action"
        names = {r.name for r in result}
        assert "valid_action" in names
        assert "another_valid" in names
        assert "invalid_action" not in names

    def test_max_3_actions_enforced(self):
        """At most three actions are returned even if more are supplied."""
        five_actions = []
        for i in range(5):
            five_actions.append(
                dict(
                    name=f"action_{i}",
                    label=f"動作 {i}",
                    emoji="🔍",
                    mcp_provider="k8s",
                    mcp_tool="k8s_get_pod_logs",
                    params={},
                    risk="low",
                    reasoning=f"reason {i}",
                )
            )
        result = _extract_recommended_actions({"recommended_actions": five_actions})
        assert len(result) == 3, "最多取 3 個 recommended_actions"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Unit — _degraded_plan 改造驗證
|
||||
# =============================================================================
|
||||
|
||||
class TestDegradedPlanGraceful:
    """Verify that ``_degraded_plan`` no longer fabricates a hard-coded RESTART."""

    @pytest.mark.asyncio
    async def test_degraded_plan_empty_candidates_and_actions(self):
        """A degraded plan must be an honest "cannot help" declaration.

        Expected:
          - candidates == []            (no fabricated RESTART)
          - recommended_actions == []   (no fabricated actions)
          - degraded is True
          - vote == DEGRADED
        """
        from src.agents.solver_agent import SolverAgent

        agent = SolverAgent()
        diagnosis = _make_diagnosis(
            "TEST-DEGRADED-001",
            "磁碟使用率超過 90%",
            "HostDiskUsage",
        )

        plan = agent._degraded_plan(diagnosis, latency_ms=500, reason="test_unit")

        assert plan.degraded is True, "降級計畫必須標記 degraded=True"
        assert plan.vote == AgentVote.DEGRADED, "降級計畫 vote 必須為 DEGRADED"
        assert plan.candidates == [], (
            f"降級計畫 candidates 必須為空(不假造 RESTART),實際:{plan.candidates}"
        )
        assert plan.recommended_actions == [], (
            f"降級計畫 recommended_actions 必須為空(不假造動作),實際:{plan.recommended_actions}"
        )

    @pytest.mark.asyncio
    async def test_degraded_plan_no_hardcode_restart(self):
        """No degraded candidate may mention 'restart' / 'RESTART' / '重啟'.

        Checked for three incident categories. The per-candidate check runs
        first so that a regression reports the offending action; the final
        emptiness assertion is the actual contract.
        """
        from src.agents.solver_agent import SolverAgent

        agent = SolverAgent()
        for category in ["HostDiskUsage", "KubePodOOMKilled", "DatabaseConnectionFail"]:
            diagnosis = _make_diagnosis(
                f"TEST-NOREST-{category}",
                f"測試 {category} 降級",
                category,
            )
            plan = agent._degraded_plan(diagnosis, latency_ms=0, reason="no_restart_test")

            # Any restart wording is forbidden. The previous condition let
            # "rollout restart" through and never looked for the Chinese
            # keyword, contradicting this test's stated purpose.
            for c in plan.candidates:
                action_lower = c.action.lower()
                assert "restart" not in action_lower and "重啟" not in c.action, (
                    f"category={category}: 降級不應有 hardcode restart 動作,實際:{c.action}"
                )
            # Most important: candidates are empty (nothing fabricated).
            assert plan.candidates == [], (
                f"category={category}: 降級 candidates 必須為空,實際:{plan.candidates}"
            )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Unit — YAML MCP registry 動態載入
|
||||
# =============================================================================
|
||||
|
||||
class TestMcpRegistryLoader:
    """Tests for the dynamic loading done by ``_load_mcp_tool_registry``."""

    def test_registry_loads_successfully(self):
        """The registry loads as a non-empty dict."""
        loaded = _load_mcp_tool_registry()
        assert isinstance(loaded, dict)
        assert len(loaded) > 0, "callback_action_spec.yaml 應有 action 定義"

    def test_registry_contains_required_actions(self):
        """The must-have action names are present in the registry."""
        registry = _load_mcp_tool_registry()
        for action in ("check_pod_logs", "k8s_restart", "describe_pod"):
            assert action in registry, f"action '{action}' 應在 registry 中"

    def test_registry_entries_have_required_fields(self):
        """Every entry carries provider / tool / risk / label."""
        registry = _load_mcp_tool_registry()
        for name in registry:
            info = registry[name]
            assert "provider" in info, f"action '{name}' 缺少 provider"
            assert "tool" in info, f"action '{name}' 缺少 tool"
            assert "risk" in info, f"action '{name}' 缺少 risk"
            assert "label" in info, f"action '{name}' 缺少 label"

    def test_registry_all_providers_valid(self):
        """Every entry's provider is a member of ``_VALID_MCP_PROVIDERS``."""
        from src.agents.solver_agent import _VALID_MCP_PROVIDERS

        registry = _load_mcp_tool_registry()
        for name in registry:
            provider = registry[name].get("provider", "")
            assert provider in _VALID_MCP_PROVIDERS, (
                f"action '{name}' 的 provider='{provider}' 不在合法清單 {_VALID_MCP_PROVIDERS}"
            )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# E2E — 真實 NIM 三類 incident 測試
|
||||
# =============================================================================
|
||||
|
||||
# Whitelists the E2E tests expect every recommended action to respect. Kept as
# literals on purpose so the tests do not merely echo the solver's constants.
_E2E_VALID_PROVIDERS = frozenset({"k8s", "ssh", "prometheus", "signoz", "database", "internal"})
_E2E_VALID_RISKS = frozenset({"low", "medium", "high", "critical"})


def _skip_if_degraded(plan) -> None:
    """Skip the current test when the solver degraded, after checking it degraded cleanly."""
    if not plan.degraded:
        return
    # The solver degraded (e.g. timeout or failed response): the plan must be
    # empty rather than fabricated, then acceptance checks are skipped.
    assert plan.candidates == [], f"降級時 candidates 必須為空:{plan.candidates}"
    assert plan.recommended_actions == [], f"降級時 recommended_actions 必須為空:{plan.recommended_actions}"
    pytest.skip(f"NIM 回應降級({plan.vote}),跳過 recommended_actions 驗收")


def _assert_recommended_actions(plan, incident_label: str, diversity_note: str = "") -> None:
    """Apply the shared acceptance checks to a non-degraded plan.

    Checks that there is at least one recommended action, that at least one of
    them is not a restart (neither ``name`` nor ``label`` mentions "restart",
    and ``label`` does not contain 「重啟」), and that every action has a
    non-empty name/label and whitelisted provider/risk.

    Args:
        plan: Plan returned by the solver agent.
        incident_label: Incident name used as the prefix of failure messages.
        diversity_note: Optional text inserted into the non-restart failure
            message.
    """
    actions = plan.recommended_actions
    assert len(actions) >= 1, (
        f"{incident_label} 應至少有 1 個 recommended_action,實際:{actions}"
    )

    non_restart_actions = [
        a for a in actions
        if "restart" not in a.name.lower()
        and "restart" not in a.label.lower()
        and "重啟" not in a.label
    ]
    assert len(non_restart_actions) >= 1, (
        f"{incident_label} 應至少 1 個非 restart 動作{diversity_note},實際:{[a.name for a in actions]}"
    )

    # Schema completeness for every returned action.
    for action in actions:
        assert action.name, "name 不可為空"
        assert action.label, "label 不可為空"
        assert action.mcp_provider in _E2E_VALID_PROVIDERS, (
            f"mcp_provider={action.mcp_provider} 不在白名單"
        )
        assert action.risk in _E2E_VALID_RISKS, (
            f"risk={action.risk} 不在白名單"
        )


@pytest.mark.asyncio
async def test_e2e_nim_host_disk_usage():
    """E2E: HostDiskUsage (disk usage too high) against the real NIM, no mocks.

    Acceptance:
      - at least one recommended action
      - at least one non-restart action
      - all actions pass the schema checks
    Skipped when NIM is unreachable or the solver degrades.
    """
    if not await _nim_reachable():
        pytest.skip("NIM (192.168.0.188:8088) 不可達,跳過 E2E 測試")

    diagnosis = _make_diagnosis(
        "TEST-E2E-HOST-DISK-001",
        "主機 192.168.0.121 根目錄磁碟使用率達 91%,/var/lib/docker overlay2 目錄佔用最多",
        "HostDiskUsage",
        confidence=0.88,
    )

    agent = get_solver_agent()
    plan = await agent.run(diagnosis)

    # Basic sanity: the agent returned something usable.
    assert plan is not None
    assert plan.diagnosis_report is not None

    _skip_if_degraded(plan)
    _assert_recommended_actions(plan, "HostDiskUsage", diversity_note="(多樣性要求)")


@pytest.mark.asyncio
async def test_e2e_nim_kube_pod_oom():
    """E2E: KubePodOOM (pod OOM-killed) against the real NIM, no mocks.

    Acceptance:
      - at least one recommended action
      - at least one non-restart action (e.g. inspect logs or resource usage)
    Skipped when NIM is unreachable or the solver degrades.
    """
    if not await _nim_reachable():
        pytest.skip("NIM (192.168.0.188:8088) 不可達,跳過 E2E 測試")

    diagnosis = _make_diagnosis(
        "TEST-E2E-OOM-001",
        "awoooi-api pod 在 awoooi-prod namespace 因記憶體超限被 OOM Killer 終止,"
        "記憶體限制 512Mi,實際峰值 587Mi,過去 1h 發生 3 次",
        "KubePodOOMKilled",
        confidence=0.92,
    )

    agent = get_solver_agent()
    plan = await agent.run(diagnosis)

    assert plan is not None

    _skip_if_degraded(plan)
    _assert_recommended_actions(plan, "KubePodOOM")


@pytest.mark.asyncio
async def test_e2e_nim_database_connection_fail():
    """E2E: DatabaseConnectionFail against the real NIM, no mocks.

    Acceptance:
      - at least one recommended action
      - at least one non-restart action (e.g. inspect connections or DB state)
    Skipped when NIM is unreachable or the solver degrades.
    """
    if not await _nim_reachable():
        pytest.skip("NIM (192.168.0.188:8088) 不可達,跳過 E2E 測試")

    diagnosis = _make_diagnosis(
        "TEST-E2E-DB-CONN-001",
        "PostgreSQL 資料庫連線池耗盡,最大連線數 100/100 均被佔用,"
        "awoooi-api 回應 'connection pool exhausted',P95 latency 飆升至 12s",
        "DatabaseConnectionFail",
        confidence=0.86,
    )

    agent = get_solver_agent()
    plan = await agent.run(diagnosis)

    assert plan is not None

    _skip_if_degraded(plan)
    _assert_recommended_actions(plan, "DatabaseConnectionFail")
|
||||
Reference in New Issue
Block a user