fix(auto_repair): playbook_seed_service — 從 alert_rules.yaml 初始化 APPROVED Playbook
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

根本原因: playbooks 表空 → NO_MATCH → 永遠走審批,從不自動修復
修復: startup 時從 alert_rules.yaml seed APPROVED Playbook(冪等)
確保自動修復鏈路有規則可用

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-10 21:52:38 +08:00
parent cdccc7e826
commit f33d514391
2 changed files with 113 additions and 0 deletions

View File

@@ -286,6 +286,15 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
# 2026-04-10 Claude Sonnet 4.6 Asia/Taipei
# 目的: 確保 playbook_embeddings 表有最新向量,供語義相似度查詢
# 使用 asyncio.create_task 非阻塞 — 不影響 API 啟動速度
# Phase ADR-068 2026-04-10: 從 alert_rules.yaml seed Playbook(冪等)
# 必須在 embedding indexing 之前排程,確保 playbook 表有資料
# 注意: asyncio.create_task 僅排程背景任務,不保證 seed 在 embedding indexing 開始前完成
try:
from src.services.playbook_seed_service import seed_playbooks_from_rules
asyncio.create_task(seed_playbooks_from_rules())
logger.info("playbook_seed_scheduled")
except Exception as e:
logger.warning("playbook_seed_schedule_failed", error=str(e))
try:
from src.services.playbook_embedding_service import ensure_playbook_embeddings_indexed
asyncio.create_task(ensure_playbook_embeddings_indexed())

View File

@@ -0,0 +1,104 @@
"""
Playbook Seed Service — 從 alert_rules.yaml 初始化 Playbook 資料
=================================================================
職責:
- 啟動時讀取 alert_rules.yaml
- 將每條規則轉換為 APPROVED Playbook 寫入 DB(冪等:已存在則跳過)
- 確保自動修復鏈路有資料可用
呼叫方: main.py lifespan (asyncio.create_task — 非阻塞)
2026-04-10 Claude Sonnet 4.6 Asia/Taipei
"""
from __future__ import annotations
from pathlib import Path
import structlog
import yaml
logger = structlog.get_logger(__name__)
_RULES_PATH = Path(__file__).parent.parent.parent / "alert_rules.yaml"
async def seed_playbooks_from_rules() -> None:
    """Seed APPROVED playbooks from ``alert_rules.yaml`` (idempotent, best-effort).

    Reads the YAML file at ``_RULES_PATH`` and, for every entry under the
    top-level ``rules`` key that has a non-empty ``response.kubectl_command``,
    creates one single-step ``Playbook`` with status ``APPROVED``. Each seeded
    playbook is tagged with ``source="alert_rule:<id>"``; rules whose source
    key already appears among the existing APPROVED playbooks are skipped.

    The function never raises: a missing rules file is logged as a warning,
    a failure on a single rule is logged and the loop moves on to the next
    rule, and any other error is logged as ``playbook_seed_error``.

    Returns:
        None. Results are reported only through structured log events
        (``playbook_seeded``, ``playbook_seed_failed``,
        ``playbook_seed_complete``, ``playbook_seed_error``).
    """
    try:
        if not _RULES_PATH.exists():
            logger.warning("playbook_seed_rules_not_found", path=str(_RULES_PATH))
            return

        # Decode explicitly as UTF-8 so parsing does not depend on the
        # platform's default locale encoding.
        data = yaml.safe_load(_RULES_PATH.read_text(encoding="utf-8"))
        # safe_load() yields None for an empty document and may yield a
        # non-mapping for unexpected content; treat both as "no rules".
        rules = data.get("rules") if isinstance(data, dict) else None
        if not rules:
            return

        # Imported lazily inside the function, as in the original code.
        # NOTE(review): the original also imported PlaybookSource without
        # using it; ``source`` is given a free-form string key below —
        # confirm the Playbook model accepts that.
        from src.models.playbook import (
            ActionType,
            Playbook,
            PlaybookStatus,
            RepairStep,
            RiskLevel,
            SymptomPattern,
        )
        from src.repositories.playbook_repository import get_playbook_repository

        repo = get_playbook_repository()

        # Collect the source keys of existing playbooks to avoid duplicates.
        # NOTE(review): only APPROVED playbooks (at most 500) are inspected,
        # so a seeded playbook whose status later changes, or one beyond the
        # limit, would presumably be re-created — confirm against the
        # repository API before widening this query.
        existing = await repo.list_playbooks(status=PlaybookStatus.APPROVED, limit=500)
        existing_sources = {p.source for p in existing if p.source}

        # Built once, outside the loop. "high" was previously missing and
        # silently fell back to MEDIUM; "critical" keeps mapping to HIGH.
        risk_by_name = {
            "low": RiskLevel.LOW,
            "medium": RiskLevel.MEDIUM,
            "high": RiskLevel.HIGH,
            "critical": RiskLevel.HIGH,
        }

        seeded = 0
        for rule in rules:
            if not isinstance(rule, dict):
                # Malformed entry (e.g. a bare string); nothing to seed.
                continue

            rule_id = rule.get("id", "")
            source_key = f"alert_rule:{rule_id}"
            if source_key in existing_sources:
                continue

            # Everything that can fail for a single rule lives inside this
            # try, so one bad rule cannot abort seeding of the others.
            try:
                # ``or {}`` also covers keys that are present but null.
                resp = rule.get("response") or {}
                command = str(resp.get("kubectl_command") or "").strip()
                if not command:
                    continue

                risk_name = str(resp.get("risk") or "medium").lower()
                risk = risk_by_name.get(risk_name, RiskLevel.MEDIUM)

                # ``alertname`` may be written as a scalar or a list in YAML;
                # always hand a list to SymptomPattern.
                raw_names = (rule.get("match") or {}).get("alertname") or []
                alert_names = [raw_names] if isinstance(raw_names, str) else list(raw_names)

                action_type = (
                    ActionType.SSH_COMMAND
                    if command.startswith("ssh")
                    else ActionType.KUBECTL
                )

                playbook = Playbook(
                    name=rule.get("description", rule_id),
                    description=resp.get("description", rule.get("description", "")),
                    status=PlaybookStatus.APPROVED,
                    source=source_key,
                    symptom_pattern=SymptomPattern(
                        alert_names=alert_names,
                        affected_services=[],
                        severity_range=["P2", "P3"],
                    ),
                    repair_steps=[
                        RepairStep(
                            step_number=1,
                            action_type=action_type,
                            command=command,
                            expected_result=resp.get("action_title", ""),
                            risk_level=risk,
                            requires_approval=False,
                        )
                    ],
                    ai_confidence=1.0,
                    approved_by="alert_rules_yaml",
                )
                await repo.create(playbook)
            except Exception as e:
                logger.warning("playbook_seed_failed", rule_id=rule_id, error=str(e))
            else:
                seeded += 1
                if rule_id:
                    # Prevent a duplicated rule id in the same file from
                    # being seeded twice within this run. Rules without an
                    # id keep their original first-run behavior.
                    existing_sources.add(source_key)
                logger.info("playbook_seeded", rule_id=rule_id, name=playbook.name)

        logger.info("playbook_seed_complete", seeded=seeded, total=len(rules))
    except Exception as e:
        # Top-level boundary of a background startup task: log, never raise.
        logger.error("playbook_seed_error", error=str(e))