All checks were successful
CD Pipeline / tests (push) Successful in 1m34s
Code Review / ai-code-review (push) Successful in 28s
Type Sync Check / check-type-sync (push) Successful in 1m10s
CD Pipeline / build-and-deploy (push) Successful in 10m19s
CD Pipeline / post-deploy-checks (push) Successful in 3m1s
269 lines
9.9 KiB
Python
269 lines
9.9 KiB
Python
from src.jobs.playbook_generation_governance_job import run_playbook_generation_governance_once
|
|
from src.models.incident import Incident, IncidentOutcome, IncidentStatus, Severity, Signal
|
|
from src.models.playbook import (
|
|
ActionType,
|
|
Playbook,
|
|
PlaybookRecommendation,
|
|
PlaybookStatus,
|
|
RepairStep,
|
|
RiskLevel,
|
|
SymptomPattern,
|
|
generate_playbook_id,
|
|
)
|
|
from src.services.playbook_generator import LLMPlaybookGenerator
|
|
from src.utils.timezone import now_taipei
|
|
|
|
|
|
class InMemoryPlaybookService:
    """Dictionary-backed stand-in for the playbook service used by these tests.

    Playbooks are stored in ``self.items`` keyed by their ``playbook_id``.
    Nothing is persisted; every instance starts empty.
    """

    def __init__(self):
        # playbook_id -> playbook object, in insertion order.
        self.items = {}

    async def create(self, playbook):
        """Store ``playbook`` under its ``playbook_id`` and return it."""
        self.items[playbook.playbook_id] = playbook
        return playbook

    async def get_by_id(self, playbook_id):
        """Return the stored playbook, or ``None`` when the id is unknown."""
        return self.items.get(playbook_id)

    async def get_recommendations(self, symptoms, top_k=3, use_rag=True):
        """Recommend approved playbooks that share an alert AND a service.

        Args:
            symptoms: object exposing ``alert_names`` and ``affected_services``.
            top_k: maximum number of recommendations returned.
            use_rag: accepted for interface compatibility; ignored here.

        Returns:
            Up to ``top_k`` ``PlaybookRecommendation`` objects, each with a
            fixed ``similarity_score`` of 1.0, in storage order.
        """
        # Loop-invariant: build the lookup sets once instead of per playbook.
        wanted_alerts = set(symptoms.alert_names)
        wanted_services = set(symptoms.affected_services)

        recommendations = []
        for playbook in self.items.values():
            if playbook.status != PlaybookStatus.APPROVED:
                continue
            pattern = playbook.symptom_pattern
            alert_match = wanted_alerts & set(pattern.alert_names)
            service_match = wanted_services & set(pattern.affected_services)
            if not (alert_match and service_match):
                continue
            recommendations.append(
                PlaybookRecommendation(
                    playbook=playbook,
                    similarity_score=1.0,
                    # Sets of str iterate in a hash-seed-dependent order;
                    # sort so the reported matches are stable across runs.
                    matched_symptoms=[
                        *(f"Alert: {name}" for name in sorted(alert_match)),
                        *(f"Service: {name}" for name in sorted(service_match)),
                    ],
                    reason="test exact match",
                )
            )
        return recommendations[:top_k]

    async def create_new_version(self, base_playbook_id, candidate, reason):
        """Register ``candidate`` as the next version of an existing playbook.

        The candidate is mutated in place: it receives a freshly generated
        id, ``base.version + 1``, lineage pointers back to the base, the
        given ``reason``, and zeroed success/failure counters.

        Returns:
            The stored candidate, or ``None`` when ``base_playbook_id`` is
            not known to this service.
        """
        base = self.items.get(base_playbook_id)
        if base is None:
            return None

        candidate.playbook_id = generate_playbook_id()
        candidate.version = base.version + 1
        # Keep pointing at the root of the lineage when the base is itself
        # a derived version; otherwise the base is the root.
        candidate.parent_playbook_id = base.parent_playbook_id or base.playbook_id
        candidate.supersedes_playbook_id = base.playbook_id
        candidate.version_reason = reason
        candidate.success_count = 0
        candidate.failure_count = 0

        self.items[candidate.playbook_id] = candidate
        return candidate

    async def list_playbooks(self, status=None, tags=None, limit=20, offset=0):
        """Return ``(page, total)`` for the stored playbooks.

        Args:
            status: when not ``None``, keep only playbooks with this status.
            tags: accepted for interface compatibility; NOT applied as a
                filter by this in-memory implementation.
            limit: page size.
            offset: index of the first playbook in the page.

        Returns:
            A tuple of the requested page and the number of playbooks that
            matched the status filter before paging.
        """
        values = list(self.items.values())
        if status is not None:
            values = [pb for pb in values if pb.status == status]
        return values[offset : offset + limit], len(values)

    async def update_with_validation(self, playbook_id, update_data):
        """Apply ``update_data`` to a stored playbook via ``setattr``.

        A string ``"status"`` value is converted to ``PlaybookStatus`` first.
        The stored object is mutated in place and returned.

        Raises:
            KeyError: if ``playbook_id`` is not stored.
        """
        playbook = self.items[playbook_id]
        for key, value in update_data.items():
            if key == "status" and isinstance(value, str):
                value = PlaybookStatus(value)
            setattr(playbook, key, value)
        return playbook
|
|
|
|
|
|
def make_approved_api_playbook() -> Playbook:
    """Build the pre-existing APPROVED playbook for the API error-rate alert.

    The playbook matches the ``ApiErrorRateHigh`` alert on ``awoooi-api`` and
    carries a single kubectl rollout-restart repair step.
    """
    pattern = SymptomPattern(
        alert_names=["ApiErrorRateHigh"],
        affected_services=["awoooi-api"],
        severity_range=["P2"],
    )
    restart_step = RepairStep(
        step_number=1,
        action_type=ActionType.KUBECTL,
        command="kubectl rollout restart deployment/awoooi-api -n awoooi-prod",
        expected_result="new pods become ready",
        risk_level=RiskLevel.MEDIUM,
    )
    return Playbook(
        playbook_id="PB-APPROVED-API",
        name="Approved API recovery",
        description="Existing approved recovery for API error rate",
        status=PlaybookStatus.APPROVED,
        symptom_pattern=pattern,
        repair_steps=[restart_step],
        ai_confidence=0.91,
        success_count=12,
        failure_count=1,
        tags=["api", "rollout"],
    )
|
|
|
|
|
|
def make_resolved_incident(action: str = "kubectl rollout restart deployment/awoooi-api -n awoooi-prod") -> Incident:
    """Build a RESOLVED P2 incident with one ``ApiErrorRateHigh`` signal.

    Args:
        action: text stored as the outcome's ``learning_notes``.
    """
    error_rate_signal = Signal(
        alert_name="ApiErrorRateHigh",
        severity=Severity.P2,
        source="alertmanager",
        fired_at=now_taipei(),
        labels={"namespace": "awoooi-prod", "deployment": "awoooi-api"},
        annotations={"summary": "API error rate high"},
    )
    successful_outcome = IncidentOutcome(
        proposal_executed=True,
        execution_success=True,
        effectiveness_score=5,
        learning_notes=action,
    )
    return Incident(
        incident_id="INC-20260430-PLAYBK",
        status=IncidentStatus.RESOLVED,
        severity=Severity.P2,
        signals=[error_rate_signal],
        affected_services=["awoooi-api"],
        outcome=successful_outcome,
    )
|
|
|
|
|
|
async def local_llm_ok(_prompt, _context):
    """Fake LLM callable returning a safe rollout-restart playbook as JSON.

    Both arguments are ignored. Returns a 3-tuple of the JSON text, the
    string ``"ollama"`` and ``True`` (presumably provider name and success
    flag; confirm against ``LLMPlaybookGenerator``).
    """
    response_text = """
        {
          "name": "API error rate recovery",
          "description": "Restart the affected API deployment after error-rate alert confirmation.",
          "alert_names": ["ApiErrorRateHigh"],
          "affected_services": ["awoooi-api"],
          "severity_range": ["P2"],
          "keywords": ["error rate", "api"],
          "repair_steps": [
            {
              "action_type": "kubectl",
              "command": "kubectl rollout restart deployment/awoooi-api -n awoooi-prod",
              "expected_result": "new pods become ready",
              "risk_level": "MEDIUM"
            }
          ],
          "estimated_duration_minutes": 5,
          "confidence": 0.86,
          "tags": ["api", "rollout"]
        }
        """
    return (response_text, "ollama", True)
|
|
|
|
|
|
async def local_llm_unsafe(_prompt, _context):
    """Fake LLM callable whose JSON proposes a destructive kubectl command.

    Both arguments are ignored. The single repair step deletes the
    ``awoooi-prod`` namespace. Returns a 3-tuple of the JSON text, the
    string ``"ollama"`` and ``True`` (presumably provider name and success
    flag; confirm against ``LLMPlaybookGenerator``).
    """
    response_text = """
        {
          "name": "Unsafe namespace cleanup",
          "description": "Bad suggestion should be gated.",
          "alert_names": ["ApiErrorRateHigh"],
          "affected_services": ["awoooi-api"],
          "repair_steps": [
            {
              "action_type": "kubectl",
              "command": "kubectl delete namespace awoooi-prod",
              "risk_level": "CRITICAL"
            }
          ],
          "confidence": 0.95,
          "tags": ["unsafe"]
        }
        """
    return (response_text, "ollama", True)
|
|
|
|
|
|
async def test_llm_playbook_generator_creates_review_playbook():
    """A well-formed LLM reply produces a REVIEW playbook with the kubectl step."""
    store = InMemoryPlaybookService()
    incident = make_resolved_incident()
    generator = LLMPlaybookGenerator(playbook_service=store, llm_callable=local_llm_ok)

    result = await generator.generate_from_incident(incident)

    assert result.outcome == "success"
    assert result.playbook is not None
    generated = result.playbook
    assert generated.status == PlaybookStatus.REVIEW
    assert generated.source.value == "llm_generated"

    first_step = generated.repair_steps[0]
    assert first_step.action_type == ActionType.KUBECTL
    assert first_step.command == "kubectl rollout restart deployment/awoooi-api -n awoooi-prod"
|
|
|
|
|
|
async def test_llm_playbook_generator_creates_lineage_version_for_similar_playbook():
    """With a matching approved playbook stored, the result is version 2 of it."""
    store = InMemoryPlaybookService()
    approved = await store.create(make_approved_api_playbook())
    generator = LLMPlaybookGenerator(playbook_service=store, llm_callable=local_llm_ok)

    result = await generator.generate_from_incident(make_resolved_incident())

    assert result.outcome == "success"
    assert result.playbook is not None
    successor = result.playbook
    approved_id = approved.playbook_id

    # The new playbook is a distinct record linked back to the approved one.
    assert successor.playbook_id != approved_id
    assert successor.version == 2
    assert successor.parent_playbook_id == approved_id
    assert successor.supersedes_playbook_id == approved_id
    assert successor.status == PlaybookStatus.REVIEW
|
|
|
|
|
|
async def test_llm_playbook_generator_downgrades_unsafe_kubectl_to_manual():
    """A namespace-deleting kubectl step becomes a HIGH-risk MANUAL step needing approval."""
    store = InMemoryPlaybookService()
    generator = LLMPlaybookGenerator(playbook_service=store, llm_callable=local_llm_unsafe)

    result = await generator.generate_from_incident(make_resolved_incident())

    assert result.playbook is not None
    gated_step = result.playbook.repair_steps[0]
    assert gated_step.action_type == ActionType.MANUAL
    assert gated_step.requires_approval is True
    assert gated_step.risk_level == RiskLevel.HIGH
    # The command text still mentions the namespace it targeted.
    assert "namespace" in gated_step.command
|
|
|
|
|
|
async def test_playbook_generation_governance_promotes_review_to_approved(monkeypatch):
    """One governance pass approves the generated playbook once its confidence is 0.93."""
    store = InMemoryPlaybookService()
    generator = LLMPlaybookGenerator(playbook_service=store, llm_callable=local_llm_ok)
    result = await generator.generate_from_incident(make_resolved_incident())
    assert result.playbook is not None
    # Overwrite the confidence on the generated playbook before the job runs.
    result.playbook.ai_confidence = 0.93

    class FakeSettings:
        ENABLE_PLAYBOOK_DRAFT_GOVERNANCE_JOB = True

    import src.jobs.playbook_generation_governance_job as job

    # Route the job to the fake settings and to the in-memory service.
    monkeypatch.setattr(job, "settings", FakeSettings())
    monkeypatch.setattr("src.services.playbook_service.get_playbook_service", lambda: store)

    report = await run_playbook_generation_governance_once()

    assert report.approved_count == 1
    promoted = store.items[result.playbook.playbook_id]
    assert promoted.status == PlaybookStatus.APPROVED
|
|
|
|
|
|
async def test_playbook_generation_governance_deprecates_superseded_version(monkeypatch):
    """Approving a new version leaves the playbook it supersedes DEPRECATED."""
    store = InMemoryPlaybookService()
    approved = await store.create(make_approved_api_playbook())
    generator = LLMPlaybookGenerator(playbook_service=store, llm_callable=local_llm_ok)
    result = await generator.generate_from_incident(make_resolved_incident())
    assert result.playbook is not None
    # Overwrite the confidence on the generated playbook before the job runs.
    result.playbook.ai_confidence = 0.93

    class FakeSettings:
        ENABLE_PLAYBOOK_DRAFT_GOVERNANCE_JOB = True

    import src.jobs.playbook_generation_governance_job as job

    # Route the job to the fake settings and to the in-memory service.
    monkeypatch.setattr(job, "settings", FakeSettings())
    monkeypatch.setattr("src.services.playbook_service.get_playbook_service", lambda: store)

    report = await run_playbook_generation_governance_once()

    assert report.approved_count == 1
    promoted = store.items[result.playbook.playbook_id]
    assert promoted.status == PlaybookStatus.APPROVED
    superseded = store.items[approved.playbook_id]
    assert superseded.status == PlaybookStatus.DEPRECATED
|