Files
awoooi/apps/api/tests/test_auto_repair_service.py
Your Name 97be5dedd7
Some checks failed
CD Pipeline / tests (push) Successful in 1m27s
Code Review / ai-code-review (push) Successful in 29s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
fix(aiops): escalate failed host verification
2026-05-01 10:47:42 +08:00

504 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Auto Repair Service Tests - #8 自動升級決策
==========================================
測試自動修復服務層功能
版本: v1.0
建立: 2026-03-26 (台北時區)
建立者: Claude Code (#8 自動升級決策)
"""
import pytest
from src.models.incident import Incident, IncidentStatus, Severity, Signal
from src.models.playbook import (
ActionType,
Playbook,
PlaybookStatus,
RepairStep,
RiskLevel,
SymptomPattern,
)
from src.services.auto_repair_service import AutoRepairService
from src.utils.timezone import now_taipei
class MockPlaybookService:
    """In-memory stand-in for the playbook service used by these tests.

    Playbooks are stored in a dict keyed by ``playbook_id``; recommendations
    are a canned list returned verbatim regardless of the query.
    """

    def __init__(self):
        self._playbooks: dict[str, Playbook] = {}
        self._recommendations: list = []

    def add_playbook(self, playbook: Playbook):
        """Store *playbook* so it can later be looked up by its id."""
        self._playbooks[playbook.playbook_id] = playbook

    def set_recommendations(self, recommendations: list):
        """Replace the canned list handed back by ``get_recommendations``."""
        self._recommendations = recommendations

    async def get_recommendations(self, symptoms, top_k=3):
        """Return the canned recommendations; both arguments are ignored."""
        return self._recommendations

    async def get_by_id(self, playbook_id: str):
        """Return the stored playbook for *playbook_id*, or ``None``."""
        return self._playbooks.get(playbook_id)

    async def record_execution(self, playbook_id: str, success: bool):
        """Bump the success or failure counter of a stored playbook.

        Returns:
            ``True`` when a playbook with that id is stored, else ``False``.
        """
        found = self._playbooks.get(playbook_id)
        if not found:
            # Nothing to count; still report whether an entry exists.
            return found is not None
        if success:
            found.success_count += 1
        else:
            found.failure_count += 1
        return True
def create_test_incident(
    incident_id: str = "INC-TEST-001",
    severity: Severity = Severity.P2,
    alert_category: str | None = None,
    alert_name: str = "HighCPU",
) -> Incident:
    """Build an INVESTIGATING incident carrying exactly one Prometheus signal.

    Args:
        incident_id: Identifier assigned to the incident.
        severity: Severity applied to both the incident and its signal.
        alert_category: Optional category forwarded to the incident unchanged.
        alert_name: Alert name used for the signal and its ``alertname`` label.

    Returns:
        An ``Incident`` affecting ``test-service`` whose signal fired "now"
        according to ``now_taipei()``.
    """
    firing_signal = Signal(
        alert_name=alert_name,
        severity=severity,
        source="prometheus",
        fired_at=now_taipei(),
        labels={"namespace": "prod", "alertname": alert_name},
    )
    return Incident(
        incident_id=incident_id,
        status=IncidentStatus.INVESTIGATING,
        severity=severity,
        affected_services=["test-service"],
        alert_category=alert_category,
        signals=[firing_signal],
    )
def create_high_quality_playbook(
    playbook_id: str = "PB-TEST-001",
    risk_level: RiskLevel = RiskLevel.MEDIUM,
) -> Playbook:
    """Build an APPROVED playbook with a strong execution track record.

    The counters (20 successes, 1 failure) give a success rate of about
    95.2%, which the tests below expect to satisfy ``is_high_quality``.

    Args:
        playbook_id: Identifier assigned to the playbook.
        risk_level: Risk level of the single kubectl repair step.

    Returns:
        A ``Playbook`` matching ``HighCPU`` alerts on ``test-service``.
    """
    matching_pattern = SymptomPattern(
        alert_names=["HighCPU"],
        affected_services=["test-service"],
        severity_range=["P2"],
    )
    restart_step = RepairStep(
        step_number=1,
        action_type=ActionType.KUBECTL,
        command="kubectl rollout restart deployment/{target}",
        risk_level=risk_level,
    )
    return Playbook(
        playbook_id=playbook_id,
        name="HighCPU - test-service 修復劇本",
        description="High quality playbook for auto repair",
        status=PlaybookStatus.APPROVED,
        symptom_pattern=matching_pattern,
        repair_steps=[restart_step],
        success_count=20,  # at least 10 successes
        failure_count=1,  # 20 / 21 is roughly a 95.2% success rate
        ai_confidence=0.9,
    )
class MockPlaybookRecommendation:
    """Minimal recommendation record: a playbook paired with its similarity score."""

    def __init__(self, playbook: Playbook, similarity_score: float):
        # Plain attribute holder; no validation is performed on either value.
        self.playbook, self.similarity_score = playbook, similarity_score
async def _no_cooldown(*args, **kwargs) -> tuple[bool, str]:
"""單元測試用 cooldown: 永遠允許 (不需要 Redis)"""
return True, "允許自動修復 (test bypass)"
class TestAutoRepairService:
    """Unit tests for ``AutoRepairService`` auto-repair evaluation and escalation."""

    # ------------------------------------------------------------------
    # Fixtures and private helpers (names without the ``test`` prefix are
    # not collected by pytest).
    # ------------------------------------------------------------------

    @pytest.fixture
    def mock_playbook_service(self):
        """Provide a fresh in-memory playbook service for each test."""
        return MockPlaybookService()

    @pytest.fixture
    def service(self, mock_playbook_service):
        """Build the service under test on top of the mock playbook service."""
        # 2026-04-01 ogt: inject a no-op cooldown checker to isolate the Redis dependency.
        return AutoRepairService(
            playbook_service=mock_playbook_service,
            cooldown_checker=_no_cooldown,
        )

    @staticmethod
    def _recommend(
        mock_playbook_service: MockPlaybookService,
        playbook: Playbook,
        similarity_score: float,
    ) -> None:
        """Store *playbook* on the mock and make it the sole recommendation."""
        mock_playbook_service.add_playbook(playbook)
        mock_playbook_service.set_recommendations(
            [MockPlaybookRecommendation(playbook, similarity_score=similarity_score)]
        )

    @staticmethod
    def _backup_ssh_playbook(step_description: str | None = None) -> Playbook:
        """Build the APPROVED SSH-only backup diagnostics playbook.

        Args:
            step_description: Description for the single SSH step. When
                ``None`` the field is not passed at all, so the model's own
                default applies.
        """
        optional_step_fields = (
            {} if step_description is None else {"description": step_description}
        )
        return Playbook(
            playbook_id="PB-BACKUP-SSH",
            name="Backup SSH diagnostics",
            description="Read-only backup diagnosis",
            status=PlaybookStatus.APPROVED,
            symptom_pattern=SymptomPattern(
                alert_names=["HostBackupFailed"],
                affected_services=["test-service"],
            ),
            repair_steps=[
                RepairStep(
                    step_number=1,
                    action_type=ActionType.SSH_COMMAND,
                    command="ssh {host} 'tail -80 /var/log/backup.log'",
                    risk_level=RiskLevel.LOW,
                    **optional_step_fields,
                ),
            ],
            success_count=20,
            failure_count=1,
        )

    @staticmethod
    def _backup_failure_incident() -> Incident:
        """Build a P2 ``HostBackupFailed`` incident in the ``backup_failure`` category."""
        return create_test_incident(
            severity=Severity.P2,
            alert_category="backup_failure",
            alert_name="HostBackupFailed",
        )

    # ------------------------------------------------------------------
    # Severity gate
    # ------------------------------------------------------------------

    @pytest.mark.asyncio
    async def test_evaluate_blocks_p1_severity(self, service):
        """P1 incidents are blocked with ``HIGH_SEVERITY``."""
        incident = create_test_incident(severity=Severity.P1)

        decision = await service.evaluate_auto_repair(incident)

        assert decision.can_auto_repair is False
        assert decision.blocked_by == "HIGH_SEVERITY"

    @pytest.mark.asyncio
    async def test_evaluate_blocks_p0_severity(self, service):
        """P0 incidents are blocked with ``HIGH_SEVERITY``."""
        incident = create_test_incident(severity=Severity.P0)

        decision = await service.evaluate_auto_repair(incident)

        assert decision.can_auto_repair is False
        assert decision.blocked_by == "HIGH_SEVERITY"

    # ------------------------------------------------------------------
    # Playbook matching
    # ------------------------------------------------------------------

    @pytest.mark.asyncio
    async def test_evaluate_no_playbook_match(self, service, mock_playbook_service):
        """An empty recommendation list blocks with ``NO_MATCH``."""
        mock_playbook_service.set_recommendations([])
        incident = create_test_incident(severity=Severity.P2)

        decision = await service.evaluate_auto_repair(incident)

        assert decision.can_auto_repair is False
        assert decision.blocked_by == "NO_MATCH"

    @pytest.mark.asyncio
    async def test_evaluate_low_similarity(self, service, mock_playbook_service):
        """Low similarity no longer blocks auto-repair.

        2026-04-07: similarity threshold removed by commander directive; any
        matching APPROVED playbook is executed.
        2026-04-08 Claude Sonnet 4.6: expectations updated to the current design.
        """
        playbook = create_high_quality_playbook()
        # 0.5 is below the former 0.7 threshold.
        self._recommend(mock_playbook_service, playbook, 0.5)
        incident = create_test_incident(severity=Severity.P2)

        decision = await service.evaluate_auto_repair(incident)

        # Threshold removed: an APPROVED playbook passes even at low similarity.
        assert decision.can_auto_repair is True
        assert decision.blocked_by is None

    @pytest.mark.asyncio
    async def test_evaluate_not_high_quality(self, service, mock_playbook_service):
        """A low-quality playbook is now approved (gates removed 2026-04-07).

        2026-04-07: quality gate removed by commander directive; APPROVED
        status alone is sufficient.
        2026-04-08 Claude Sonnet 4.6: expectations updated to the current design.
        """
        playbook = Playbook(
            playbook_id="PB-LOW-QUALITY",
            name="Low quality playbook",
            description="Not enough executions",
            status=PlaybookStatus.APPROVED,
            symptom_pattern=SymptomPattern(
                alert_names=["HighCPU"],
                affected_services=["test-service"],
            ),
            repair_steps=[
                RepairStep(
                    step_number=1,
                    action_type=ActionType.KUBECTL,
                    command="kubectl rollout restart",
                    risk_level=RiskLevel.MEDIUM,
                    description="restart deployment",
                ),
            ],
            success_count=2,
            failure_count=0,
        )
        self._recommend(mock_playbook_service, playbook, 0.9)
        incident = create_test_incident(severity=Severity.P2)

        decision = await service.evaluate_auto_repair(incident)

        # Quality gate removed: an APPROVED playbook passes directly.
        assert decision.can_auto_repair is True
        assert decision.blocked_by is None

    # ------------------------------------------------------------------
    # Risk level (test names are historical; the gates were removed)
    # ------------------------------------------------------------------

    @pytest.mark.asyncio
    async def test_evaluate_high_risk_blocked(self, service, mock_playbook_service):
        """A HIGH risk playbook is now approved (gates removed 2026-04-07).

        2026-04-07: risk-level gate removed by commander directive; APPROVED
        status alone is sufficient.
        2026-04-08 Claude Sonnet 4.6: expectations updated to the current design.
        """
        playbook = create_high_quality_playbook(risk_level=RiskLevel.HIGH)
        self._recommend(mock_playbook_service, playbook, 0.9)
        incident = create_test_incident(severity=Severity.P2)

        decision = await service.evaluate_auto_repair(incident)

        # Risk gate removed: a HIGH risk APPROVED playbook passes too.
        assert decision.can_auto_repair is True
        assert decision.blocked_by is None

    @pytest.mark.asyncio
    async def test_evaluate_critical_risk_blocked(self, service, mock_playbook_service):
        """A CRITICAL risk playbook is now approved (gates removed 2026-04-07).

        2026-04-07: risk-level gate removed by commander directive.
        2026-04-08 Claude Sonnet 4.6: expectations updated to the current design.
        """
        playbook = create_high_quality_playbook(risk_level=RiskLevel.CRITICAL)
        self._recommend(mock_playbook_service, playbook, 0.9)
        incident = create_test_incident(severity=Severity.P2)

        decision = await service.evaluate_auto_repair(incident)

        # Risk gate removed: a CRITICAL risk APPROVED playbook passes too.
        assert decision.can_auto_repair is True
        assert decision.blocked_by is None

    @pytest.mark.asyncio
    async def test_evaluate_success(self, service, mock_playbook_service):
        """A matching APPROVED playbook yields a positive decision that carries it."""
        playbook = create_high_quality_playbook(risk_level=RiskLevel.MEDIUM)
        self._recommend(mock_playbook_service, playbook, 0.85)
        incident = create_test_incident(severity=Severity.P2)

        decision = await service.evaluate_auto_repair(incident)

        assert decision.can_auto_repair is True
        assert decision.playbook is not None
        assert decision.playbook.playbook_id == playbook.playbook_id
        assert decision.blocked_by is None

    # ------------------------------------------------------------------
    # Host / backup incidents
    # ------------------------------------------------------------------

    @pytest.mark.asyncio
    async def test_backup_failure_blocks_k8s_playbook(self, service, mock_playbook_service):
        """Backup/host incidents must not execute K8s rollout playbooks."""
        playbook = create_high_quality_playbook(risk_level=RiskLevel.MEDIUM)
        self._recommend(mock_playbook_service, playbook, 0.85)
        incident = self._backup_failure_incident()

        decision = await service.evaluate_auto_repair(incident)

        assert decision.can_auto_repair is False
        assert decision.blocked_by == "HOST_BACKUP_K8S_PLAYBOOK"

    @pytest.mark.asyncio
    async def test_backup_failure_allows_ssh_playbook(self, service, mock_playbook_service):
        """Backup/host incidents may still use SSH playbooks."""
        playbook = self._backup_ssh_playbook(step_description="collect backup logs")
        self._recommend(mock_playbook_service, playbook, 0.85)
        incident = self._backup_failure_incident()

        decision = await service.evaluate_auto_repair(incident)

        assert decision.can_auto_repair is True
        assert decision.blocked_by is None

    def test_failed_verification_escalates_for_host_backup_ssh_playbook(self, service):
        """Failed backup SSH diagnostics must not synthesize K8s rollback."""
        playbook = self._backup_ssh_playbook()
        incident = self._backup_failure_incident()

        assert service._should_escalate_failed_verification(incident, playbook) is True

    def test_failed_verification_allows_k8s_rollback_for_k8s_playbook(self, service):
        """K8s playbooks may still use the existing K8s rollback path."""
        playbook = create_high_quality_playbook(risk_level=RiskLevel.MEDIUM)
        incident = create_test_incident(severity=Severity.P2)

        assert service._should_escalate_failed_verification(incident, playbook) is False

    @pytest.mark.asyncio
    async def test_evaluate_low_risk_allowed(self, service, mock_playbook_service):
        """LOW risk actions are allowed and the decision reports that risk level."""
        playbook = create_high_quality_playbook(risk_level=RiskLevel.LOW)
        self._recommend(mock_playbook_service, playbook, 0.85)
        incident = create_test_incident(severity=Severity.P2)

        decision = await service.evaluate_auto_repair(incident)

        assert decision.can_auto_repair is True
        assert decision.risk_level == RiskLevel.LOW

    # ------------------------------------------------------------------
    # Playbook.is_high_quality (pure property checks; nothing is awaited,
    # so these are plain synchronous tests)
    # ------------------------------------------------------------------

    def test_is_high_quality_calculation(self):
        """An APPROVED playbook with 95%+ success rate and 10+ successes is high quality."""
        playbook = create_high_quality_playbook()

        assert playbook.is_high_quality is True
        assert playbook.success_rate >= 0.95
        assert playbook.success_count >= 10

    def test_not_high_quality_low_success_rate(self):
        """A playbook with a low success rate is not high quality."""
        playbook = Playbook(
            playbook_id="PB-LOW-RATE",
            name="Low success rate",
            description="Too many failures",
            status=PlaybookStatus.APPROVED,
            symptom_pattern=SymptomPattern(
                alert_names=["Test"],
                affected_services=["test"],
            ),
            repair_steps=[],
            success_count=15,
            failure_count=5,  # 15 / 20 is a 75% success rate
        )

        assert playbook.is_high_quality is False
        assert playbook.success_rate < 0.95
# =============================================================================
# B25/B26 — drain_pending_tasks
# 2026-04-27 Wave8-X3 by Claude — K8s rolling restart drain fix
# =============================================================================
class TestDrainPendingTasks:
    """Tests for gracefully draining background tasks via ``drain_pending_tasks``."""

    @pytest.fixture
    def service(self):
        """Build a service with a mock playbook service and no-op cooldown."""
        return AutoRepairService(
            playbook_service=MockPlaybookService(),
            cooldown_checker=_no_cooldown,
        )

    @staticmethod
    def _track(service, coro):
        """Schedule *coro* as a task and register it in ``service._pending_tasks``.

        The task removes itself from the pending set once it is done.
        Must be called from within a running event loop.

        Returns:
            The created ``asyncio.Task``.
        """
        import asyncio

        task = asyncio.create_task(coro)
        service._pending_tasks.add(task)
        task.add_done_callback(service._pending_tasks.discard)
        return task

    @pytest.mark.asyncio
    async def test_drain_no_pending_tasks_returns_zero(self, service):
        """With no pending tasks, drain reports drained=0 and no timeout."""
        result = await service.drain_pending_tasks(timeout=5.0)

        assert result["drained"] == 0
        assert result["timeout"] is False

    @pytest.mark.asyncio
    async def test_drain_waits_for_pending_tasks(self, service):
        """With one pending task, drain waits for it and reports the right count."""
        import asyncio

        completed = []

        async def quick_task():
            await asyncio.sleep(0.01)
            completed.append(1)

        self._track(service, quick_task())

        result = await service.drain_pending_tasks(timeout=5.0)

        assert result["drained"] == 1
        assert result.get("still_pending", 0) == 0
        assert result["timeout"] is False
        assert len(completed) == 1

    @pytest.mark.asyncio
    async def test_drain_timeout_reports_still_pending(self, service):
        """A task outliving the timeout yields timeout=True and still_pending > 0."""
        import asyncio
        import contextlib

        async def slow_task():
            await asyncio.sleep(10)  # far longer than the drain timeout

        task = self._track(service, slow_task())
        try:
            result = await service.drain_pending_tasks(timeout=0.05)

            assert result["timeout"] is True
            assert result.get("still_pending", 0) >= 1
        finally:
            # Cancel the sleeper even when an assertion above fails, so it
            # cannot leak into later tests.
            task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await task

    @pytest.mark.asyncio
    async def test_drain_multiple_tasks_all_complete(self, service):
        """With several tasks, all complete and drained equals the task count."""
        import asyncio

        async def quick():
            await asyncio.sleep(0.01)

        for _ in range(3):
            self._track(service, quick())

        result = await service.drain_pending_tasks(timeout=5.0)

        assert result["drained"] == 3
        assert result["timeout"] is False