352 lines
14 KiB
Python
352 lines
14 KiB
Python
from datetime import datetime, timezone
|
|
from uuid import uuid4
|
|
|
|
import pytest
|
|
|
|
from src.models.approval import ApprovalRequestCreate, BlastRadius, DataImpact, RiskLevel
|
|
from src.models.incident import Incident, Severity, Signal
|
|
from src.models.playbook import (
|
|
ActionType,
|
|
Playbook,
|
|
PlaybookSource,
|
|
PlaybookStatus,
|
|
RepairStep,
|
|
)
|
|
from src.models.playbook import RiskLevel as PlaybookRiskLevel
|
|
from src.services.approval_db import approval_request_to_record_data
|
|
from src.services.evidence_snapshot import EvidenceSnapshot
|
|
from src.services.repair_candidate_service import RepairCandidateService
|
|
|
|
|
|
class FakeInvestigator:
    """Test double whose ``investigate`` hands back a pre-set evidence snapshot."""

    def __init__(self, evidence: EvidenceSnapshot | None) -> None:
        # Kept as-is; every ``investigate`` call returns this same object.
        self.evidence = evidence

    async def investigate(self, incident: Incident) -> EvidenceSnapshot | None:
        """Return the stored evidence; the ``incident`` argument is not inspected."""
        canned_evidence = self.evidence
        return canned_evidence
|
|
|
|
|
|
class FakePlaybookRepository:
    """Test double holding at most one playbook, looked up by its id."""

    def __init__(self, playbook: Playbook | None) -> None:
        self.playbook = playbook

    async def get_by_id(self, playbook_id: str) -> Playbook | None:
        """Return the stored playbook when its ``playbook_id`` matches, else ``None``."""
        stored = self.playbook
        if not stored:
            # Nothing registered (or a falsy playbook): no lookup can succeed.
            return None
        return stored if stored.playbook_id == playbook_id else None
|
|
|
|
|
|
class FakeIncidentService:
    """Test double whose working-memory lookup never finds anything."""

    async def get_from_working_memory(self, incident_id: str) -> None:
        """Always return ``None``, regardless of ``incident_id``."""
        return
|
|
|
|
|
|
class FakeAutoRepairService:
    """Test double whose SSH/MCP write-route preview always answers ``True``."""

    def preview_write_ssh_mcp_route(self, incident: Incident, command: str) -> bool:
        """Return ``True`` for any incident/command pair; neither argument is inspected."""
        route_available = True
        return route_available
|
|
|
|
|
|
def _incident() -> Incident:
    """Build the P2 test incident carrying a single ``NodeExporterDown`` signal."""
    # The one alertmanager signal attached to the incident, stamped with the current UTC time.
    node_exporter_signal = Signal(
        alert_name="NodeExporterDown",
        severity=Severity.P2,
        source="alertmanager",
        fired_at=datetime.now(timezone.utc),
        labels={"namespace": "awoooi-prod", "deployment": "awoooi-api"},
        annotations={"summary": "node exporter down"},
    )
    return Incident(
        incident_id="INC-TEST-REPAIR",
        severity=Severity.P2,
        signals=[node_exporter_signal],
        affected_services=["awoooi-api"],
    )
|
|
|
|
|
|
def _evidence(incident_id: str, *, sensors_succeeded: int = 2) -> EvidenceSnapshot:
    """Build an evidence snapshot for ``incident_id`` with 3 attempted sensors.

    ``sensors_succeeded`` also drives the health map: ``k8s`` is healthy when
    at least one sensor succeeded, ``prometheus`` when at least two did.
    """
    health_by_source = {
        "k8s": sensors_succeeded > 0,
        "prometheus": sensors_succeeded > 1,
    }
    return EvidenceSnapshot(
        incident_id=incident_id,
        sensors_attempted=3,
        sensors_succeeded=sensors_succeeded,
        mcp_health=health_by_source,
        evidence_summary="k8s events and metrics collected",
    )
|
|
|
|
|
|
def _playbook(command: str, *, risk_level: PlaybookRiskLevel = PlaybookRiskLevel.LOW) -> Playbook:
    """Build an approved, manually sourced playbook with one KUBECTL repair step.

    ``command`` becomes the step's command and ``risk_level`` its risk level;
    the step is always flagged as requiring approval.
    """
    only_step = RepairStep(
        step_number=1,
        action_type=ActionType.KUBECTL,
        command=command,
        expected_result="deployment restarted and rollout verified",
        risk_level=risk_level,
        requires_approval=True,
    )
    return Playbook(
        playbook_id="PB-REPAIR-001",
        name="重啟 API deployment",
        description="以 approved PlayBook 修復 API 工作負載",
        status=PlaybookStatus.APPROVED,
        source=PlaybookSource.MANUAL,
        trust_score=0.72,
        estimated_duration_minutes=4,
        repair_steps=[only_step],
    )
|
|
|
|
|
|
def _generic_fallback_playbook() -> Playbook:
    """Build a MEDIUM-risk templated restart playbook re-labelled as the wildcard fallback."""
    fallback = _playbook(
        "kubectl rollout restart deployment/{target} -n {namespace}",
        risk_level=PlaybookRiskLevel.MEDIUM,
    )
    # Overwrite the identity fields and make the symptom pattern match every alert name.
    fallback.playbook_id = "PB-GENERIC-FALLBACK"
    fallback.name = "通用兜底規則"
    fallback.symptom_pattern.alert_names = ["*"]
    return fallback
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_build_candidate_from_mcp_evidence_and_approved_playbook() -> None:
    """Evidence with succeeded sensors plus a restart playbook yields an approvable candidate."""
    restart_command = "kubectl rollout restart deployment/awoooi-api -n awoooi-prod"
    incident = _incident()
    evidence = _evidence(incident.incident_id)
    repository = FakePlaybookRepository(_playbook(restart_command))
    service = RepairCandidateService(
        incident_service=FakeIncidentService(),
        investigator=FakeInvestigator(evidence),
        playbook_repository=repository,
        auto_repair_service=FakeAutoRepairService(),
    )

    result = await service.build_from_incident(
        incident=incident,
        alertname="NodeExporterDown",
        target_resource="awoooi-api",
        namespace="awoooi-prod",
        message="node exporter is down",
        fallback_action="NO_ACTION - REPAIR_CANDIDATE_MISSING",
        matched_playbook_id="PB-REPAIR-001",
        severity="medium",
    )

    assert result.candidate_found is True

    # The approval request must carry the playbook command and id through unchanged.
    approval = result.approval_request
    assert approval is not None
    assert approval.action == restart_command
    assert approval.risk_level == RiskLevel.MEDIUM
    assert approval.matched_playbook_id == "PB-REPAIR-001"

    metadata = result.metadata
    assert metadata["repair_candidate_status"] == "candidate_ready_for_approval"
    assert metadata["mcp_evidence"]["sensors_succeeded"] == 2
    assert metadata["playbook_trust"]["trust_score"] == 0.72
    assert metadata["verifier_plan"]
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_candidate_blocked_when_mcp_evidence_missing() -> None:
    """With zero succeeded sensors the candidate is blocked as ``mcp_evidence_missing``."""
    incident = _incident()
    empty_evidence = _evidence(incident.incident_id, sensors_succeeded=0)
    restart_playbook = _playbook("kubectl rollout restart deployment/awoooi-api -n awoooi-prod")
    service = RepairCandidateService(
        incident_service=FakeIncidentService(),
        investigator=FakeInvestigator(empty_evidence),
        playbook_repository=FakePlaybookRepository(restart_playbook),
        auto_repair_service=FakeAutoRepairService(),
    )

    result = await service.build_from_incident(
        incident=incident,
        alertname="NodeExporterDown",
        target_resource="awoooi-api",
        namespace="awoooi-prod",
        message="node exporter is down",
        fallback_action="NO_ACTION - REPAIR_CANDIDATE_MISSING",
        matched_playbook_id="PB-REPAIR-001",
        severity="medium",
    )

    assert result.candidate_found is False
    assert "mcp_evidence_missing" in result.blockers
    assert result.metadata["repair_candidate_status"] == "blocked"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_candidate_blocked_when_playbook_is_observe_only() -> None:
    """A playbook whose only step is ``kubectl get pods`` is blocked as observe-only."""
    incident = _incident()
    evidence = _evidence(incident.incident_id)
    read_only_playbook = _playbook("kubectl get pods -n awoooi-prod")
    service = RepairCandidateService(
        incident_service=FakeIncidentService(),
        investigator=FakeInvestigator(evidence),
        playbook_repository=FakePlaybookRepository(read_only_playbook),
        auto_repair_service=FakeAutoRepairService(),
    )

    result = await service.build_from_incident(
        incident=incident,
        alertname="NodeExporterDown",
        target_resource="awoooi-api",
        namespace="awoooi-prod",
        message="node exporter is down",
        fallback_action="NO_ACTION - REPAIR_CANDIDATE_MISSING",
        matched_playbook_id="PB-REPAIR-001",
        severity="medium",
    )

    assert result.candidate_found is False
    assert "playbook_observe_only" in result.blockers
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_candidate_blocked_when_playbook_is_generic_fallback() -> None:
    """The wildcard fallback playbook is blocked and a draft package is emitted instead."""
    incident = _incident()
    evidence = _evidence(incident.incident_id)
    service = RepairCandidateService(
        incident_service=FakeIncidentService(),
        investigator=FakeInvestigator(evidence),
        playbook_repository=FakePlaybookRepository(_generic_fallback_playbook()),
        auto_repair_service=FakeAutoRepairService(),
    )

    result = await service.build_from_incident(
        incident=incident,
        alertname="UnknownAlert",
        target_resource="awoooi-api",
        namespace="awoooi-prod",
        message="unknown alert",
        fallback_action="NO_ACTION - REPAIR_CANDIDATE_MISSING",
        matched_playbook_id="PB-GENERIC-FALLBACK",
        severity="medium",
    )

    assert result.candidate_found is False
    assert "playbook_generic_fallback_not_repair" in result.blockers

    metadata = result.metadata
    assert "通用兜底" in metadata["repair_candidate_blocker_summary"]
    assert metadata["playbook_draft_required"] is True

    # Draft package: top-level fields.
    draft_package = metadata["repair_candidate_draft_package"]
    assert draft_package["schema_version"] == "repair_candidate_draft_package_v1"
    assert draft_package["lane"] == "create_service_specific_repair_playbook"
    assert "建立專屬 PlayBook 草案" in metadata["repair_candidate_next_step"]
    assert "repair_command" in draft_package["required_fields"]

    # Draft package: coverage-gap section.
    coverage_gap = draft_package["coverage_gap"]
    assert coverage_gap["schema_version"] == "repair_candidate_coverage_gap_v1"
    assert coverage_gap["coverage_key"] == "unknownalert:awoooi-api"
    assert coverage_gap["blocking_stage"] == "service_playbook_coverage"
    assert coverage_gap["next_owner_lane"] == "create_service_specific_repair_playbook"
    assert coverage_gap["mcp_evidence_ready"] is True
    assert coverage_gap["runtime_execution_authorized"] is False
    assert "recurrence_fingerprint" in coverage_gap["required_mcp_evidence_refs"]
    assert "repair_steps.command_or_ansible_ref" in coverage_gap["playbook_template_fields"]

    # Draft package: work-item projection.
    work_item = draft_package["awooop_work_item"]
    assert work_item["schema_version"] == "awooop_repair_candidate_draft_work_item_v1"
    expected_id_prefix = "repair-candidate-draft:awoooi:INC-TEST-REPAIR:"
    assert work_item["work_item_id"].startswith(expected_id_prefix)
    assert work_item["status"] == "open"
    assert work_item["needs_human"] is True
    assert work_item["decision_effect"] == "none"
    assert work_item["writes_runtime_state"] is False
    assert work_item["coverage_gap"]["coverage_key"] == "unknownalert:awoooi-api"
    assert "/awooop/work-items?" in work_item["work_item_href"]
    assert "https://awoooi.wooo.work/zh-TW/awooop/work-items?" in work_item["work_item_url"]
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_candidate_blocked_observe_only_prompts_repair_playbook_draft() -> None:
    """A diagnostic-only SSH playbook is blocked and routed to the promote-to-repair lane."""
    incident = _incident()
    diagnostic_playbook = _playbook(
        "ssh 192.168.0.188 'uptime; ps aux --sort=-%cpu | head -20; docker stats --no-stream'",
        risk_level=PlaybookRiskLevel.LOW,
    )
    # Retag the single step as an SSH command instead of the helper's KUBECTL default.
    first_step = diagnostic_playbook.repair_steps[0]
    first_step.action_type = ActionType.SSH_COMMAND

    evidence = _evidence(incident.incident_id)
    service = RepairCandidateService(
        incident_service=FakeIncidentService(),
        investigator=FakeInvestigator(evidence),
        playbook_repository=FakePlaybookRepository(diagnostic_playbook),
        auto_repair_service=FakeAutoRepairService(),
    )
    # Swap in an ad-hoc stub whose route preview always answers False.
    no_route_cls = type(
        "NoRouteAutoRepairService",
        (),
        {"preview_write_ssh_mcp_route": lambda self, incident, command: False},
    )
    service._auto_repair = no_route_cls()

    result = await service.build_from_incident(
        incident=incident,
        alertname="NodeExporterDown",
        target_resource="node-exporter-188",
        namespace="awoooi-prod",
        message="node exporter is down",
        fallback_action="NO_ACTION - REPAIR_CANDIDATE_MISSING",
        matched_playbook_id="PB-REPAIR-001",
        severity="medium",
    )

    assert result.candidate_found is False
    assert "playbook_observe_only" in result.blockers

    metadata = result.metadata
    draft_package = metadata["repair_candidate_draft_package"]
    assert draft_package["lane"] == "promote_diagnostic_to_repair_playbook"

    coverage_gap = draft_package["coverage_gap"]
    assert coverage_gap["coverage_key"] == "nodeexporterdown:node-exporter-188"
    assert coverage_gap["target_kind"] == "host_service"
    assert coverage_gap["blocking_stage"] == "service_playbook_coverage"
    assert coverage_gap["matched_playbook_id"] == "PB-REPAIR-001"
    assert "systemd_or_container_status" in coverage_gap["required_mcp_evidence_refs"]
    assert "診斷命令保留為 MCP evidence collector" in metadata["repair_candidate_next_step"]

    work_item = draft_package["awooop_work_item"]
    assert work_item["target_resource"] == "node-exporter-188"
    assert work_item["lane"] == "promote_diagnostic_to_repair_playbook"
    assert work_item["safety_level"] == "read_only_work_item_projection"
    assert work_item["coverage_gap"]["next_owner_lane"] == "promote_diagnostic_to_repair_playbook"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_missing_mcp_evidence_records_collectable_coverage_gap() -> None:
    """With zero succeeded sensors the coverage gap reports the ``mcp_evidence`` stage."""
    incident = _incident()
    empty_evidence = _evidence(incident.incident_id, sensors_succeeded=0)
    restart_playbook = _playbook("kubectl rollout restart deployment/awoooi-api -n awoooi-prod")
    service = RepairCandidateService(
        incident_service=FakeIncidentService(),
        investigator=FakeInvestigator(empty_evidence),
        playbook_repository=FakePlaybookRepository(restart_playbook),
        auto_repair_service=FakeAutoRepairService(),
    )

    result = await service.build_from_incident(
        incident=incident,
        alertname="NodeExporterDown",
        target_resource="awoooi-api",
        namespace="awoooi-prod",
        message="node exporter is down",
        fallback_action="NO_ACTION - REPAIR_CANDIDATE_MISSING",
        matched_playbook_id="PB-REPAIR-001",
        severity="medium",
    )

    draft_package = result.metadata["repair_candidate_draft_package"]
    coverage_gap = draft_package["coverage_gap"]
    assert coverage_gap["coverage_key"] == "nodeexporterdown:awoooi-api"
    assert coverage_gap["blocking_stage"] == "mcp_evidence"
    assert coverage_gap["mcp_evidence_ready"] is False
    assert coverage_gap["target_kind"] == "k8s_workload"
    assert "mcp_health_snapshot" in coverage_gap["required_mcp_evidence_refs"]
    assert coverage_gap["runtime_execution_authorized"] is False
|
|
|
|
|
|
def test_approval_record_data_uses_preallocated_id_without_leaking_metadata() -> None:
    """The preallocated id becomes the record ``id`` and is absent from ``extra_metadata``."""
    preallocated_id = str(uuid4())
    request_metadata = {"preallocated_approval_id": preallocated_id, "source": "test"}
    create_request = ApprovalRequestCreate(
        action="kubectl rollout restart deployment/awoooi-api -n awoooi-prod",
        description="candidate",
        risk_level=RiskLevel.MEDIUM,
        requested_by="repair-candidate-test",
        blast_radius=BlastRadius(data_impact=DataImpact.WRITE),
        metadata=request_metadata,
    )

    record = approval_request_to_record_data(create_request, RiskLevel.MEDIUM, required_sigs=1)

    assert record["id"] == preallocated_id
    # Only the non-id metadata key is expected to remain.
    assert record["extra_metadata"] == {"source": "test"}
|