Files
awoooi/apps/api/tests/test_repair_candidate_service.py
Your Name 1e08440cd0
Some checks failed
CD Pipeline / tests (push) Successful in 1m28s
Code Review / ai-code-review (push) Successful in 23s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
fix(api): 補修復候選 coverage gap 契約
2026-06-11 19:47:48 +08:00

352 lines
14 KiB
Python

from datetime import datetime, timezone
from uuid import uuid4
import pytest
from src.models.approval import ApprovalRequestCreate, BlastRadius, DataImpact, RiskLevel
from src.models.incident import Incident, Severity, Signal
from src.models.playbook import (
ActionType,
Playbook,
PlaybookSource,
PlaybookStatus,
RepairStep,
)
from src.models.playbook import RiskLevel as PlaybookRiskLevel
from src.services.approval_db import approval_request_to_record_data
from src.services.evidence_snapshot import EvidenceSnapshot
from src.services.repair_candidate_service import RepairCandidateService
class FakeInvestigator:
    """Investigator stand-in that replays a pre-seeded evidence snapshot."""

    def __init__(self, evidence: EvidenceSnapshot | None) -> None:
        # Kept verbatim; ``investigate`` returns it without looking at the incident.
        self.evidence = evidence

    async def investigate(self, incident: Incident) -> EvidenceSnapshot | None:
        """Return the stored evidence (possibly ``None``); ``incident`` is ignored."""
        canned_evidence = self.evidence
        return canned_evidence
class FakePlaybookRepository:
    """Single-entry playbook repository used in place of the real store."""

    def __init__(self, playbook: Playbook | None) -> None:
        # The only playbook this repository can ever return.
        self.playbook = playbook

    async def get_by_id(self, playbook_id: str) -> Playbook | None:
        """Return the stored playbook when its id equals ``playbook_id``, else ``None``."""
        stored = self.playbook
        # Nothing stored (or a falsy playbook) and id mismatches both yield None.
        if not stored or stored.playbook_id != playbook_id:
            return None
        return stored
class FakeIncidentService:
    """Incident service stub whose working-memory lookup always comes back empty."""

    async def get_from_working_memory(self, incident_id: str) -> None:
        """Return ``None`` for every ``incident_id``."""
        result = None
        return result
class FakeAutoRepairService:
    """Auto-repair stub whose SSH MCP write-route preview always answers ``True``."""

    def preview_write_ssh_mcp_route(self, incident: Incident, command: str) -> bool:
        """Return ``True`` regardless of ``incident`` and ``command``."""
        route_available = True
        return route_available
def _incident() -> Incident:
    """Build the P2 ``INC-TEST-REPAIR`` incident shared by the tests in this module.

    The incident carries a single ``NodeExporterDown`` alertmanager signal
    fired at the current UTC time and lists ``awoooi-api`` as the affected
    service.
    """
    signal_labels = {"namespace": "awoooi-prod", "deployment": "awoooi-api"}
    signal_annotations = {"summary": "node exporter down"}
    node_exporter_signal = Signal(
        alert_name="NodeExporterDown",
        severity=Severity.P2,
        source="alertmanager",
        fired_at=datetime.now(timezone.utc),
        labels=signal_labels,
        annotations=signal_annotations,
    )
    return Incident(
        incident_id="INC-TEST-REPAIR",
        severity=Severity.P2,
        signals=[node_exporter_signal],
        affected_services=["awoooi-api"],
    )
def _evidence(incident_id: str, *, sensors_succeeded: int = 2) -> EvidenceSnapshot:
    """Build an evidence snapshot for ``incident_id`` with three attempted sensors.

    ``sensors_succeeded`` drives the MCP health map: ``k8s`` is healthy when
    at least one sensor succeeded, ``prometheus`` when more than one did.
    """
    k8s_healthy = sensors_succeeded > 0
    prometheus_healthy = sensors_succeeded > 1
    health_map = {"k8s": k8s_healthy, "prometheus": prometheus_healthy}
    return EvidenceSnapshot(
        incident_id=incident_id,
        sensors_attempted=3,
        sensors_succeeded=sensors_succeeded,
        mcp_health=health_map,
        evidence_summary="k8s events and metrics collected",
    )
def _playbook(command: str, *, risk_level: PlaybookRiskLevel = PlaybookRiskLevel.LOW) -> Playbook:
    """Build the approved, manually sourced playbook ``PB-REPAIR-001``.

    The playbook has exactly one KUBECTL repair step that runs ``command`` at
    the given ``risk_level`` and requires approval.
    """
    only_step = RepairStep(
        step_number=1,
        action_type=ActionType.KUBECTL,
        command=command,
        expected_result="deployment restarted and rollout verified",
        risk_level=risk_level,
        requires_approval=True,
    )
    return Playbook(
        playbook_id="PB-REPAIR-001",
        name="重啟 API deployment",
        description="以 approved PlayBook 修復 API 工作負載",
        status=PlaybookStatus.APPROVED,
        source=PlaybookSource.MANUAL,
        trust_score=0.72,
        estimated_duration_minutes=4,
        repair_steps=[only_step],
    )
def _generic_fallback_playbook() -> Playbook:
    """Build a MEDIUM-risk templated playbook relabelled as the wildcard fallback.

    Starts from ``_playbook`` with a ``{target}``/``{namespace}`` restart
    command, then overrides the id, the name, and the symptom pattern's alert
    names (set to ``["*"]``).
    """
    templated_command = "kubectl rollout restart deployment/{target} -n {namespace}"
    fallback = _playbook(templated_command, risk_level=PlaybookRiskLevel.MEDIUM)
    fallback.playbook_id = "PB-GENERIC-FALLBACK"
    fallback.name = "通用兜底規則"
    fallback.symptom_pattern.alert_names = ["*"]
    return fallback
@pytest.mark.asyncio
async def test_build_candidate_from_mcp_evidence_and_approved_playbook() -> None:
    """Default evidence plus an approved restart playbook yields an approvable candidate."""
    restart_command = "kubectl rollout restart deployment/awoooi-api -n awoooi-prod"
    incident = _incident()
    investigator = FakeInvestigator(_evidence(incident.incident_id))
    repository = FakePlaybookRepository(_playbook(restart_command))
    service = RepairCandidateService(
        incident_service=FakeIncidentService(),
        investigator=investigator,
        playbook_repository=repository,
        auto_repair_service=FakeAutoRepairService(),
    )

    result = await service.build_from_incident(
        incident=incident,
        alertname="NodeExporterDown",
        target_resource="awoooi-api",
        namespace="awoooi-prod",
        message="node exporter is down",
        fallback_action="NO_ACTION - REPAIR_CANDIDATE_MISSING",
        matched_playbook_id="PB-REPAIR-001",
        severity="medium",
    )

    # The approval request must carry the playbook's command and id.
    assert result.candidate_found is True
    assert result.approval_request is not None
    approval = result.approval_request
    assert approval.action == restart_command
    assert approval.risk_level == RiskLevel.MEDIUM
    assert approval.matched_playbook_id == "PB-REPAIR-001"

    # Metadata must expose the status, evidence counters, trust and a verifier plan.
    metadata = result.metadata
    assert metadata["repair_candidate_status"] == "candidate_ready_for_approval"
    assert metadata["mcp_evidence"]["sensors_succeeded"] == 2
    assert metadata["playbook_trust"]["trust_score"] == 0.72
    assert metadata["verifier_plan"]
@pytest.mark.asyncio
async def test_candidate_blocked_when_mcp_evidence_missing() -> None:
    """With zero succeeded sensors the candidate is blocked on missing MCP evidence."""
    incident = _incident()
    empty_evidence = _evidence(incident.incident_id, sensors_succeeded=0)
    restart_playbook = _playbook(
        "kubectl rollout restart deployment/awoooi-api -n awoooi-prod"
    )
    service = RepairCandidateService(
        incident_service=FakeIncidentService(),
        investigator=FakeInvestigator(empty_evidence),
        playbook_repository=FakePlaybookRepository(restart_playbook),
        auto_repair_service=FakeAutoRepairService(),
    )
    build_kwargs = {
        "incident": incident,
        "alertname": "NodeExporterDown",
        "target_resource": "awoooi-api",
        "namespace": "awoooi-prod",
        "message": "node exporter is down",
        "fallback_action": "NO_ACTION - REPAIR_CANDIDATE_MISSING",
        "matched_playbook_id": "PB-REPAIR-001",
        "severity": "medium",
    }

    result = await service.build_from_incident(**build_kwargs)

    assert result.candidate_found is False
    assert "mcp_evidence_missing" in result.blockers
    assert result.metadata["repair_candidate_status"] == "blocked"
@pytest.mark.asyncio
async def test_candidate_blocked_when_playbook_is_observe_only() -> None:
    """A playbook whose only step is ``kubectl get pods`` is blocked as observe-only."""
    incident = _incident()
    read_only_playbook = _playbook("kubectl get pods -n awoooi-prod")
    service = RepairCandidateService(
        incident_service=FakeIncidentService(),
        investigator=FakeInvestigator(_evidence(incident.incident_id)),
        playbook_repository=FakePlaybookRepository(read_only_playbook),
        auto_repair_service=FakeAutoRepairService(),
    )
    build_kwargs = {
        "incident": incident,
        "alertname": "NodeExporterDown",
        "target_resource": "awoooi-api",
        "namespace": "awoooi-prod",
        "message": "node exporter is down",
        "fallback_action": "NO_ACTION - REPAIR_CANDIDATE_MISSING",
        "matched_playbook_id": "PB-REPAIR-001",
        "severity": "medium",
    }

    result = await service.build_from_incident(**build_kwargs)

    assert result.candidate_found is False
    assert "playbook_observe_only" in result.blockers
@pytest.mark.asyncio
async def test_candidate_blocked_when_playbook_is_generic_fallback() -> None:
    """The wildcard fallback playbook is rejected and a draft package is produced.

    Checks the blocker, the draft package (lane, required fields), its
    coverage-gap record and the attached AWOOOP work item.
    """
    incident = _incident()
    service = RepairCandidateService(
        incident_service=FakeIncidentService(),
        investigator=FakeInvestigator(_evidence(incident.incident_id)),
        playbook_repository=FakePlaybookRepository(_generic_fallback_playbook()),
        auto_repair_service=FakeAutoRepairService(),
    )
    build_kwargs = {
        "incident": incident,
        "alertname": "UnknownAlert",
        "target_resource": "awoooi-api",
        "namespace": "awoooi-prod",
        "message": "unknown alert",
        "fallback_action": "NO_ACTION - REPAIR_CANDIDATE_MISSING",
        "matched_playbook_id": "PB-GENERIC-FALLBACK",
        "severity": "medium",
    }

    result = await service.build_from_incident(**build_kwargs)

    # Blocked, with a human-readable summary and a request for a playbook draft.
    assert result.candidate_found is False
    assert "playbook_generic_fallback_not_repair" in result.blockers
    metadata = result.metadata
    assert "通用兜底" in metadata["repair_candidate_blocker_summary"]
    assert metadata["playbook_draft_required"] is True

    # Draft package header.
    package = metadata["repair_candidate_draft_package"]
    assert package["schema_version"] == "repair_candidate_draft_package_v1"
    assert package["lane"] == "create_service_specific_repair_playbook"
    assert "建立專屬 PlayBook 草案" in metadata["repair_candidate_next_step"]
    assert "repair_command" in package["required_fields"]

    # Coverage-gap record inside the package.
    gap = package["coverage_gap"]
    assert gap["schema_version"] == "repair_candidate_coverage_gap_v1"
    assert gap["coverage_key"] == "unknownalert:awoooi-api"
    assert gap["blocking_stage"] == "service_playbook_coverage"
    assert gap["next_owner_lane"] == "create_service_specific_repair_playbook"
    assert gap["mcp_evidence_ready"] is True
    assert gap["runtime_execution_authorized"] is False
    assert "recurrence_fingerprint" in gap["required_mcp_evidence_refs"]
    assert "repair_steps.command_or_ansible_ref" in gap["playbook_template_fields"]

    # Work item projected from the package.
    work_item = package["awooop_work_item"]
    assert work_item["schema_version"] == "awooop_repair_candidate_draft_work_item_v1"
    expected_id_prefix = "repair-candidate-draft:awoooi:INC-TEST-REPAIR:"
    assert work_item["work_item_id"].startswith(expected_id_prefix)
    assert work_item["status"] == "open"
    assert work_item["needs_human"] is True
    assert work_item["decision_effect"] == "none"
    assert work_item["writes_runtime_state"] is False
    assert work_item["coverage_gap"]["coverage_key"] == "unknownalert:awoooi-api"
    assert "/awooop/work-items?" in work_item["work_item_href"]
    assert "https://awoooi.wooo.work/zh-TW/awooop/work-items?" in work_item["work_item_url"]
@pytest.mark.asyncio
async def test_candidate_blocked_observe_only_prompts_repair_playbook_draft() -> None:
    """A diagnostic-only SSH playbook is blocked and routed to the promotion lane.

    The playbook's single step is switched to ``SSH_COMMAND`` and the
    auto-repair collaborator is replaced with one whose route preview
    returns ``False``.
    """

    class NoRouteAutoRepairService:
        """Auto-repair stub whose route preview always answers ``False``."""

        def preview_write_ssh_mcp_route(self, incident, command):
            return False

    incident = _incident()
    diagnostic_playbook = _playbook(
        "ssh 192.168.0.188 'uptime; ps aux --sort=-%cpu | head -20; docker stats --no-stream'",
        risk_level=PlaybookRiskLevel.LOW,
    )
    diagnostic_playbook.repair_steps[0].action_type = ActionType.SSH_COMMAND

    service = RepairCandidateService(
        incident_service=FakeIncidentService(),
        investigator=FakeInvestigator(_evidence(incident.incident_id)),
        playbook_repository=FakePlaybookRepository(diagnostic_playbook),
        auto_repair_service=FakeAutoRepairService(),
    )
    # NOTE(review): this swaps the service's private ``_auto_repair`` attribute
    # after construction rather than passing the stub to the constructor;
    # presumably equivalent, but the constructor is not visible here — confirm.
    service._auto_repair = NoRouteAutoRepairService()

    build_kwargs = {
        "incident": incident,
        "alertname": "NodeExporterDown",
        "target_resource": "node-exporter-188",
        "namespace": "awoooi-prod",
        "message": "node exporter is down",
        "fallback_action": "NO_ACTION - REPAIR_CANDIDATE_MISSING",
        "matched_playbook_id": "PB-REPAIR-001",
        "severity": "medium",
    }
    result = await service.build_from_incident(**build_kwargs)

    assert result.candidate_found is False
    assert "playbook_observe_only" in result.blockers

    metadata = result.metadata
    package = metadata["repair_candidate_draft_package"]
    assert package["lane"] == "promote_diagnostic_to_repair_playbook"

    # Coverage-gap record for the host-level target.
    gap = package["coverage_gap"]
    assert gap["coverage_key"] == "nodeexporterdown:node-exporter-188"
    assert gap["target_kind"] == "host_service"
    assert gap["blocking_stage"] == "service_playbook_coverage"
    assert gap["matched_playbook_id"] == "PB-REPAIR-001"
    assert "systemd_or_container_status" in gap["required_mcp_evidence_refs"]
    assert "診斷命令保留為 MCP evidence collector" in metadata["repair_candidate_next_step"]

    # Work item projected from the package.
    work_item = package["awooop_work_item"]
    assert work_item["target_resource"] == "node-exporter-188"
    assert work_item["lane"] == "promote_diagnostic_to_repair_playbook"
    assert work_item["safety_level"] == "read_only_work_item_projection"
    assert work_item["coverage_gap"]["next_owner_lane"] == "promote_diagnostic_to_repair_playbook"
@pytest.mark.asyncio
async def test_missing_mcp_evidence_records_collectable_coverage_gap() -> None:
    """With zero succeeded sensors the draft package records an MCP-evidence coverage gap.

    The inputs are identical to ``test_candidate_blocked_when_mcp_evidence_missing``;
    this test additionally inspects the ``coverage_gap`` record of the draft
    package.
    """
    incident = _incident()
    service = RepairCandidateService(
        incident_service=FakeIncidentService(),
        investigator=FakeInvestigator(_evidence(incident.incident_id, sensors_succeeded=0)),
        playbook_repository=FakePlaybookRepository(
            _playbook("kubectl rollout restart deployment/awoooi-api -n awoooi-prod")
        ),
        auto_repair_service=FakeAutoRepairService(),
    )

    result = await service.build_from_incident(
        incident=incident,
        alertname="NodeExporterDown",
        target_resource="awoooi-api",
        namespace="awoooi-prod",
        message="node exporter is down",
        fallback_action="NO_ACTION - REPAIR_CANDIDATE_MISSING",
        matched_playbook_id="PB-REPAIR-001",
        severity="medium",
    )

    # Pin the precondition first: if the candidate stops being blocked, fail
    # here with a readable assertion instead of a KeyError on the lookups below.
    assert result.candidate_found is False
    assert "mcp_evidence_missing" in result.blockers

    coverage_gap = result.metadata["repair_candidate_draft_package"]["coverage_gap"]
    assert coverage_gap["coverage_key"] == "nodeexporterdown:awoooi-api"
    assert coverage_gap["blocking_stage"] == "mcp_evidence"
    assert coverage_gap["mcp_evidence_ready"] is False
    assert coverage_gap["target_kind"] == "k8s_workload"
    assert "mcp_health_snapshot" in coverage_gap["required_mcp_evidence_refs"]
    assert coverage_gap["runtime_execution_authorized"] is False
def test_approval_record_data_uses_preallocated_id_without_leaking_metadata() -> None:
    """The preallocated approval id becomes the record id and is absent from extra metadata."""
    preallocated_id = str(uuid4())
    request_metadata = {"preallocated_approval_id": preallocated_id, "source": "test"}
    write_blast_radius = BlastRadius(data_impact=DataImpact.WRITE)
    request = ApprovalRequestCreate(
        action="kubectl rollout restart deployment/awoooi-api -n awoooi-prod",
        description="candidate",
        risk_level=RiskLevel.MEDIUM,
        requested_by="repair-candidate-test",
        blast_radius=write_blast_radius,
        metadata=request_metadata,
    )

    record = approval_request_to_record_data(request, RiskLevel.MEDIUM, required_sigs=1)

    assert record["id"] == preallocated_id
    # Only the non-reserved metadata key may survive into the record.
    assert record["extra_metadata"] == {"source": "test"}