fix(playbook): C1-C4 全流程串接 — evolver保護+seeder復活+規則即時建立+watchdog W-4
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

C1: playbook_evolver — yaml_rule source playbooks 加 YAML_RULE guard,
    evolver 不再封存 seeder 建立的 APPROVED playbook,保護自動修復鏈路

C2: playbook_seed_service — idempotency SQL 排除 DEPRECATED 記錄,
    evolver 封存後重啟可復活 yaml_rule playbooks

C3: alert_rule_engine — AI 自動生成規則成功後立即呼叫 seed_playbooks_from_rules(),
    不等下次重啟即可建立對應 APPROVED Playbook

C4: ai_slo_watchdog_job — 新增 W-4 APPROVED playbook 數量為 0 告警,
    鏈路斷裂立即 TYPE-8M;total checks 由 3 升為 4

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Your Name
2026-04-20 20:18:03 +08:00
parent 7ca6d12ce2
commit de2d34d4cd
4 changed files with 32 additions and 3 deletions

View File

@@ -79,8 +79,16 @@ async def _check_once() -> None:
except Exception as e:
logger.warning("watchdog_w3_flywheel_check_failed", error=str(e))
# W-4: 無 APPROVED Playbook — 自動修復鏈路斷裂
try:
approved_count = await _count_approved_playbooks()
if approved_count == 0:
violations.append("無 APPROVED Playbook — 自動修復鏈路斷裂evolver 可能全部封存)")
except Exception as e:
logger.warning("watchdog_w4_playbook_check_failed", error=str(e))
if not violations:
logger.debug("ai_slo_watchdog_all_ok", checks=3)
logger.debug("ai_slo_watchdog_all_ok", checks=4)
return
# 去重:violations 相同內容 1 小時內不重複發
@@ -146,3 +154,13 @@ async def _count_pending_no_tg_sent() -> int:
silent += 1
return silent
async def _count_approved_playbooks() -> int:
    """Return how many playbooks currently have the ``approved`` status.

    A result of 0 means the auto-remediation chain is broken.

    Returns:
        The row count reported by the database, or 0 when the query
        yields no (or a falsy) scalar value.
    """
    from sqlalchemy import text as sa_text

    count_query = sa_text("SELECT COUNT(*) FROM playbooks WHERE status = 'approved'")
    async with get_db_context() as db:
        cursor = await db.execute(count_query)
        approved_total = cursor.scalar()
        # Normalise a missing/None scalar to 0 so callers always get an int.
        if not approved_total:
            return 0
        return approved_total

View File

@@ -716,6 +716,10 @@ async def auto_generate_rule(
success = _append_rule_to_yaml(yaml_block, alertname_safe)
if success:
logger.info("auto_rule_success", alertname=alertname_safe, rule_id=rule_id)
# 立即為新規則建立 APPROVED Playbook,不等下次重啟
import asyncio as _asyncio
from src.services.playbook_seed_service import seed_playbooks_from_rules
_asyncio.create_task(seed_playbooks_from_rules())
else:
logger.warning("auto_rule_failed_validation", alertname=alertname_safe)

View File

@@ -28,7 +28,7 @@ from datetime import timedelta
import structlog
from src.models.playbook import Playbook, PlaybookStatus
from src.models.playbook import Playbook, PlaybookSource, PlaybookStatus
from src.utils.timezone import now_taipei
logger = structlog.get_logger(__name__)
@@ -129,6 +129,8 @@ async def _archive_low_trust(playbooks: list[Playbook], report: EvolverReport) -
for pb in playbooks:
if pb.status == PlaybookStatus.DEPRECATED:
continue
if pb.source == PlaybookSource.YAML_RULE:
continue # yaml_rule playbooks 由 seeder 管理,不受 trust 封存,保護自動修復鏈路
if pb.trust_score < TRUST_ARCHIVE_THRESHOLD:
try:
await service.update_with_validation(
@@ -164,6 +166,8 @@ async def _archive_dormant(playbooks: list[Playbook], report: EvolverReport) ->
for pb in playbooks:
if pb.status == PlaybookStatus.DEPRECATED:
continue
if pb.source == PlaybookSource.YAML_RULE:
continue # yaml_rule playbooks 由 seeder 管理,不受休眠封存,保護自動修復鏈路
if pb.last_used_at is None:
# 從未使用過 — 只在 trust 低於閾值時封存
if pb.trust_score >= DORMANT_TRUST_THRESHOLD:

View File

@@ -49,7 +49,10 @@ async def seed_playbooks_from_rules() -> None:
from sqlalchemy import text as sa_text
async with get_db_context() as db:
rows = await db.execute(
sa_text("SELECT name FROM playbooks WHERE source = 'yaml_rule'")
sa_text(
"SELECT name FROM playbooks WHERE source = 'yaml_rule'"
" AND status != 'deprecated'"
)
)
existing_names = {r[0] for r in rows.fetchall()}