Files
awoooi/apps/api/tests/test_learning_chain_e2e.py
Your Name 518a16e895
Some checks failed
Code Review / ai-code-review (push) Successful in 10s
CD Pipeline / tests (push) Successful in 1m10s
CD Pipeline / build-and-deploy (push) Failing after 3m16s
CD Pipeline / post-deploy-checks (push) Has been skipped
fix(awooop): persist auto repair verification fallback
2026-05-13 18:47:46 +08:00

417 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
飛輪閉環 E2E 測試 — auto_repair → PostExecutionVerifier → LearningService → EWMA
================================================================================
2026-04-26 Wave4 P1.3+P1.4 by Claude Engineer-B3 — 飛輪閉環最後一哩
測試範圍:
- execute_auto_repair 成功 → verifier 被呼叫 → record_verification_result 被呼叫
- execute_auto_repair 失敗 → verifier 不被呼叫(主 except 路徑)
- matched_playbook_id=None 的 record_verification_result → log warning 不 crash
- verifier 拋例外 → 修復仍回傳成功,trust 不更新
🔴 遵循 feedback_no_mock_testing.md:
- 禁止 MagicMock/AsyncMock/unittest.mock.patch
- 使用純 Python Stub 類別 + pytest monkeypatch(替換 module-level getter)
"""
from __future__ import annotations
import asyncio
import pytest
from src.models.incident import Incident, IncidentStatus, Severity, Signal
from src.models.playbook import (
ActionType,
Playbook,
PlaybookStatus,
RepairStep,
RiskLevel,
SymptomPattern,
)
from src.services.auto_repair_service import AutoRepairService
from src.utils.timezone import now_taipei
# =============================================================================
# Stubs
# =============================================================================
class StubVerifier:
    """Lightweight stand-in for PostExecutionVerifier.

    Every ``verify`` call is appended to ``calls``; the stub then either
    raises the configured exception or returns the configured result.
    ``warmup_sec`` is accepted for signature compatibility but never used,
    so the stub does not wait on anything.
    """

    def __init__(self, result: str = "success", raise_exc: Exception | None = None):
        # Call log first, then the canned outcome configuration.
        self.calls: list[dict] = []
        self.raise_exc = raise_exc
        self.result = result

    async def verify(
        self,
        incident,
        snapshot,
        action_taken: str,
        warmup_sec: float = 0.0,
    ) -> str:
        """Record the call, then raise ``raise_exc`` if set, else return ``result``."""
        record = dict(
            incident_id=incident.incident_id,
            snapshot=snapshot,
            action_taken=action_taken,
        )
        self.calls.append(record)

        failure = self.raise_exc
        if failure is None:
            return self.result
        # The call is logged before raising so tests can still count it.
        raise failure
class StubLearningService:
    """Lightweight stand-in for LearningService.

    Only ``record_verification_result`` is provided; each invocation is
    stored as a dict in ``verification_calls`` for later assertions.
    """

    def __init__(self) -> None:
        self.verification_calls: list[dict] = []

    async def record_verification_result(
        self,
        incident_id: str,
        action_taken: str,
        verification_result: str,
        matched_playbook_id: str | None = None,
    ) -> None:
        """Append the received arguments to ``verification_calls``."""
        entry = dict(
            incident_id=incident_id,
            action_taken=action_taken,
            verification_result=verification_result,
            matched_playbook_id=matched_playbook_id,
        )
        self.verification_calls.append(entry)
class StubPlaybookService:
    """In-memory stand-in for PlaybookService.

    Holds playbooks keyed by ``playbook_id`` plus a canned recommendation
    list, and supports ``get_recommendations``, ``get_by_id`` and
    ``record_execution``.
    """

    def __init__(self) -> None:
        self._recommendations: list = []
        self._playbooks: dict[str, Playbook] = {}

    def add_playbook(self, playbook: Playbook) -> None:
        """Register ``playbook`` under its own ``playbook_id``."""
        key = playbook.playbook_id
        self._playbooks[key] = playbook

    def set_recommendations(self, recommendations: list) -> None:
        """Replace the list handed back by ``get_recommendations``."""
        self._recommendations = recommendations

    async def get_recommendations(self, symptoms, top_k: int = 3) -> list:
        """Return the canned recommendations; both arguments are ignored."""
        return self._recommendations

    async def get_by_id(self, playbook_id: str) -> Playbook | None:
        """Return the registered playbook, or ``None`` when the id is unknown."""
        return self._playbooks.get(playbook_id)

    async def record_execution(self, playbook_id: str, success: bool) -> bool:
        """Bump the success/failure counter; return whether the playbook exists."""
        target = self._playbooks.get(playbook_id)
        if target is None:
            return False
        if success:
            target.success_count += 1
        else:
            target.failure_count += 1
        return True
class StubRecommendation:
    """Minimal recommendation record: a playbook plus its similarity score."""

    def __init__(self, playbook: Playbook, similarity_score: float = 0.9) -> None:
        self.playbook, self.similarity_score = playbook, similarity_score
# =============================================================================
# Factories
# =============================================================================
def _make_incident(
    incident_id: str = "INC-E2E-001",
    severity: Severity = Severity.P2,
) -> Incident:
    """Build an INVESTIGATING incident carrying a single Prometheus signal.

    Args:
        incident_id: Identifier assigned to the incident.
        severity: Severity applied to both the incident and its signal.

    Returns:
        An ``Incident`` affecting ``e2e-service`` with one ``TestAlert``
        signal fired at the current Taipei time.
    """
    test_signal = Signal(
        alert_name="TestAlert",
        severity=severity,
        source="prometheus",
        fired_at=now_taipei(),
        labels={"namespace": "awoooi-prod", "alertname": "TestAlert"},
    )
    return Incident(
        incident_id=incident_id,
        status=IncidentStatus.INVESTIGATING,
        severity=severity,
        affected_services=["e2e-service"],
        signals=[test_signal],
    )
def _make_playbook(
    playbook_id: str = "PB-E2E-001",
    trust_score: float = 0.5,
) -> Playbook:
    """Build an APPROVED playbook with one low-risk manual repair step.

    Args:
        playbook_id: Identifier assigned to the playbook.
        trust_score: Initial trust score stored on the playbook.

    Returns:
        A ``Playbook`` matching ``TestAlert`` on ``e2e-service`` at P2,
        pre-seeded with 5 successes and 1 failure.
    """
    pattern = SymptomPattern(
        alert_names=["TestAlert"],
        affected_services=["e2e-service"],
        severity_range=["P2"],
    )
    only_step = RepairStep(
        step_number=1,
        action_type=ActionType.MANUAL,
        command="echo test",
        risk_level=RiskLevel.LOW,
    )
    return Playbook(
        playbook_id=playbook_id,
        name="E2E 測試 Playbook",
        description="飛輪閉環 E2E 測試用",
        status=PlaybookStatus.APPROVED,
        symptom_pattern=pattern,
        repair_steps=[only_step],
        trust_score=trust_score,
        success_count=5,
        failure_count=1,
    )
async def _no_cooldown(*args, **kwargs) -> tuple[bool, str]:
    """Cooldown checker that always allows the repair; all arguments are ignored."""
    allowed, reason = True, "允許修復 (test bypass)"
    return allowed, reason
# =============================================================================
# Tests
# =============================================================================
@pytest.mark.asyncio
async def test_auto_repair_success_triggers_verify_and_learn(monkeypatch):
    """A successful repair must reach both the verifier and the learning service.

    Runs ``execute_auto_repair`` with the module-level verifier and learning
    singletons replaced by stubs, then checks that ``verify()`` and
    ``record_verification_result()`` were each invoked exactly once with the
    incident and playbook identifiers.
    """
    import src.services.auto_repair_service as _ars_mod
    import src.services.learning_service as _ls_mod
    import src.services.post_execution_verifier as _pev_mod

    verifier_stub = StubVerifier(result="success")
    learning_stub = StubLearningService()

    # Swap the module-level singletons for plain-Python stubs (no MagicMock).
    monkeypatch.setattr(_ars_mod, "_verifier_getter", None, raising=False)
    monkeypatch.setattr(_pev_mod, "_verifier", verifier_stub)
    monkeypatch.setattr(_ls_mod, "_learning_service", learning_stub)

    playbook = _make_playbook()
    playbooks = StubPlaybookService()
    playbooks.add_playbook(playbook)
    playbooks.set_recommendations([StubRecommendation(playbook)])

    repair_service = AutoRepairService(
        playbook_service=playbooks,
        cooldown_checker=_no_cooldown,
    )
    incident = _make_incident()

    outcome = await repair_service.execute_auto_repair(incident, playbook)
    assert outcome.success is True

    # Post-verification is scheduled as a background task (per the original
    # author's note); yield to the event loop so it can finish. The stub
    # ignores warmup_sec, so a short sleep is enough.
    await asyncio.sleep(0.05)

    verify_calls = verifier_stub.calls
    assert len(verify_calls) == 1, "verifier.verify() 應被呼叫一次"
    first_verify = verify_calls[0]
    assert first_verify["incident_id"] == incident.incident_id
    assert first_verify["snapshot"] is None

    learn_calls = learning_stub.verification_calls
    assert len(learn_calls) == 1, "record_verification_result 應被呼叫一次"
    recorded = learn_calls[0]
    assert recorded["incident_id"] == incident.incident_id
    assert recorded["verification_result"] == "success"
    assert recorded["matched_playbook_id"] == playbook.playbook_id
@pytest.mark.asyncio
async def test_auto_repair_can_delegate_post_verification(monkeypatch):
    """``run_post_verification=False`` must skip the built-in verification.

    Per the original author's note, the webhook path awaits
    PostExecutionVerifier itself, so the service layer has to be able to skip
    its own background verification; otherwise one repair would produce two
    verification runs and two Telegram escalations.
    """
    import src.services.learning_service as _ls_mod
    import src.services.post_execution_verifier as _pev_mod

    verifier_stub = StubVerifier(result="success")
    learning_stub = StubLearningService()
    monkeypatch.setattr(_pev_mod, "_verifier", verifier_stub)
    monkeypatch.setattr(_ls_mod, "_learning_service", learning_stub)

    playbook = _make_playbook()
    playbooks = StubPlaybookService()
    playbooks.add_playbook(playbook)

    repair_service = AutoRepairService(
        playbook_service=playbooks,
        cooldown_checker=_no_cooldown,
    )
    incident = _make_incident()

    outcome = await repair_service.execute_auto_repair(
        incident,
        playbook,
        run_post_verification=False,
    )
    assert outcome.success is True

    # Give any (unexpected) background task a chance to run before checking.
    await asyncio.sleep(0.05)

    assert verifier_stub.calls == []
    assert learning_stub.verification_calls == []
@pytest.mark.asyncio
async def test_auto_repair_failure_does_not_call_verifier(monkeypatch):
    """A failed repair must not trigger verification or learning.

    ``AutoRepairService._execute_step`` is replaced with a coroutine that
    always raises, which drives ``execute_auto_repair`` down its failure
    path. The test then checks that neither the verifier stub nor the
    learning stub received any call.
    """
    import src.services.learning_service as _ls_mod
    import src.services.post_execution_verifier as _pev_mod

    stub_verifier = StubVerifier(result="success")
    stub_learning = StubLearningService()
    monkeypatch.setattr(_pev_mod, "_verifier", stub_verifier)
    monkeypatch.setattr(_ls_mod, "_learning_service", stub_learning)

    class FailingPlaybookService(StubPlaybookService):
        """Playbook service used on the failure path.

        Despite the name, it never fails: ``record_execution`` just reports
        success without touching counters, so the only failure in this test
        comes from the patched ``_execute_step``.
        """

        async def record_execution(self, playbook_id: str, success: bool) -> bool:
            return True

    playbook = _make_playbook()
    pb_service = FailingPlaybookService()
    pb_service.add_playbook(playbook)

    async def _always_fail(self_inner, incident_arg, step_arg) -> str:
        raise RuntimeError("強制測試失敗")

    service = AutoRepairService(
        playbook_service=pb_service,
        cooldown_checker=_no_cooldown,
    )
    # Patch on the class (not the instance); monkeypatch restores the
    # original attribute on teardown, so no manual backup is needed.
    monkeypatch.setattr(AutoRepairService, "_execute_step", _always_fail)

    incident = _make_incident()
    result = await service.execute_auto_repair(incident, playbook)
    assert result.success is False

    # Give any (unexpected) background task a chance to run before checking.
    await asyncio.sleep(0.05)

    # The failure path must not enter the verify-and-learn section.
    assert len(stub_verifier.calls) == 0, "執行失敗時不應呼叫 verifier"
    assert len(stub_learning.verification_calls) == 0, "執行失敗時不應呼叫 record_verification_result"
@pytest.mark.asyncio
async def test_record_verification_result_no_playbook_id_does_not_crash():
    """``record_verification_result`` must tolerate ``matched_playbook_id=None``.

    A real ``LearningService`` is built on top of no-op repositories and
    called with no matched playbook. The test passes as long as the call
    returns without raising.
    """
    from src.services.learning_service import LearningService

    class NullLearningRepo:
        """No-op learning repository: accepts any arguments, stores nothing."""

        async def record_repair(self, **kwargs) -> bool:
            return True

        async def get_repair_stats(self, *a, **kw):
            return {}

        async def get_all_repair_stats(self, *a, **kw):
            return {}

        async def record_disposition(self, *a, **kw):
            return True

        async def get_dispositions(self, *a, **kw):
            return {}

    class NullTrustRepo:
        """No-op trust repository: nothing is saved and nothing is found."""

        async def save_trust_record(self, *a, **kw):
            pass

        async def load_trust_record(self, *a, **kw):
            return None

        async def get_all_trust_records(self, *a, **kw):
            return []

    svc = LearningService(
        repository=NullLearningRepo(),
        trust_repository=NullTrustRepo(),
    )

    # Must not raise. Per the original author's note, the service is expected
    # to log a warning and skip the playbook-stats update in this case.
    await svc.record_verification_result(
        incident_id="INC-NULL-PB-001",
        action_taken="auto_repair:none",
        verification_result="success",
        matched_playbook_id=None,
    )
@pytest.mark.asyncio
async def test_verifier_exception_does_not_block_repair(monkeypatch):
    """A verifier crash must not change the repair result.

    The verifier stub is set up to raise. ``execute_auto_repair`` must still
    report success, the verifier must have been invoked once, and the
    learning stub must receive no call.
    """
    import src.services.learning_service as _ls_mod
    import src.services.post_execution_verifier as _pev_mod

    simulated_failure = RuntimeError("verifier 模擬故障")
    verifier_stub = StubVerifier(result="success", raise_exc=simulated_failure)
    learning_stub = StubLearningService()
    monkeypatch.setattr(_pev_mod, "_verifier", verifier_stub)
    monkeypatch.setattr(_ls_mod, "_learning_service", learning_stub)

    playbook = _make_playbook()
    playbooks = StubPlaybookService()
    playbooks.add_playbook(playbook)

    repair_service = AutoRepairService(
        playbook_service=playbooks,
        cooldown_checker=_no_cooldown,
    )
    incident = _make_incident()

    outcome = await repair_service.execute_auto_repair(incident, playbook)

    # The main path still reports success.
    assert outcome.success is True

    # Let the background verification task run (and fail).
    await asyncio.sleep(0.05)

    # The verifier was invoked once, even though it raised.
    assert len(verifier_stub.calls) == 1
    # The verifier's exception must cut the chain before learning is reached.
    assert len(learning_stub.verification_calls) == 0, "verifier 拋例外後 learning 不應被呼叫"