""" AWOOOI AIOps Phase 3 — Playbook Evolver Agent(知識演化官) ========================================================== 職責:Playbook 自動合併、低信任封存、知識庫精化 觸發方式:定時(每日凌晨)或手動呼叫 `run_evolver()` 三大功能: 1. 低信任封存 — trust_score < 0.1 → DEPRECATED(自動退場) 2. 休眠封存 — 30 天未使用 AND trust_score < 0.5 → DEPRECATED 3. 相似合併 — Jaccard 症狀相似度 > 0.9 → 合併為一(保留 trust 較高者) 設計原則: - 純靜態規則(不依賴 LLM)— 保證確定性 - 熔斷保護:單筆操作失敗不影響其他 Playbook - best-effort 審計:合併/封存寫 logger.info - feature flag 保護:AIOPS_P3_EVOLVER_ENABLED=False 時靜默跳過 ADR-083: Phase 3 學習閉環重建 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3 初始建立 """ from __future__ import annotations import asyncio from dataclasses import dataclass, field from datetime import timedelta import structlog from src.models.playbook import Playbook, PlaybookSource, PlaybookStatus from src.utils.timezone import now_taipei logger = structlog.get_logger(__name__) # ── 閾值常數 ──────────────────────────────────────────────────────────────── TRUST_ARCHIVE_THRESHOLD = 0.1 # trust_score < 此值 → 封存 DORMANT_TRUST_THRESHOLD = 0.5 # 休眠封存:trust < 此值 AND 30d 未用 DORMANT_DAYS = 30 # 休眠天數 MERGE_SIMILARITY_THRESHOLD = 0.9 # 症狀相似度 > 此值 → 合併候選 MAX_MERGE_PER_RUN = 10 # 單次合併上限(防止操作過多) # ───────────────────────────────────────────────────────────────────────────── # Result Types # ───────────────────────────────────────────────────────────────────────────── @dataclass class EvolverReport: """Evolver 執行報告""" archived_count: int = 0 merged_count: int = 0 skipped_count: int = 0 archived_ids: list[str] = field(default_factory=list) merged_pairs: list[tuple[str, str]] = field(default_factory=list) # (deprecated_id, kept_id) errors: list[str] = field(default_factory=list) @property def total_affected(self) -> int: return self.archived_count + self.merged_count # ───────────────────────────────────────────────────────────────────────────── # Main Entry Point # ───────────────────────────────────────────────────────────────────────────── async def run_evolver(force: bool = False) -> EvolverReport: """ 執行 
Evolver Agent 全流程。 Args: force: True 時跳過 feature flag 檢查(供管理員手動觸發端點使用) Returns: EvolverReport(包含所有操作結果) Raises: 不拋出 — 所有錯誤內部吸收,最壞情況返回空報告 """ from src.core.feature_flags import aiops_flags if not force and not aiops_flags.AIOPS_P3_EVOLVER_ENABLED: logger.debug("evolver_skipped", reason="AIOPS_P3_EVOLVER_ENABLED=False") return EvolverReport() report = EvolverReport() try: playbooks = await _fetch_all_active_playbooks() if not playbooks: logger.info("evolver_no_playbooks") return report # Step 1: 低信任封存 await _archive_low_trust(playbooks, report) # Step 2: 休眠封存(剩餘 APPROVED/DRAFT 中挑) remaining = [p for p in playbooks if p.status not in (PlaybookStatus.DEPRECATED,)] await _archive_dormant(remaining, report) # Step 3: 相似合併 still_active = [p for p in remaining if p.status not in (PlaybookStatus.DEPRECATED,)] await _merge_similar(still_active, report) logger.info( "evolver_done", archived=report.archived_count, merged=report.merged_count, skipped=report.skipped_count, errors=len(report.errors), ) except Exception: logger.exception("evolver_fatal") return report # ───────────────────────────────────────────────────────────────────────────── # Step Implementations # ───────────────────────────────────────────────────────────────────────────── async def _archive_low_trust(playbooks: list[Playbook], report: EvolverReport) -> None: """Step 1: trust_score < 0.1 → DEPRECATED(自動退場)""" from src.services.playbook_service import get_playbook_service service = get_playbook_service() for pb in playbooks: if pb.status == PlaybookStatus.DEPRECATED: continue if pb.source == PlaybookSource.YAML_RULE: continue # yaml_rule playbooks 由 seeder 管理,不受 trust 封存,保護自動修復鏈路 if pb.trust_score < TRUST_ARCHIVE_THRESHOLD: try: await service.update_with_validation( pb.playbook_id, {"status": PlaybookStatus.DEPRECATED.value}, ) logger.info( "evolver_archived_low_trust", playbook_id=pb.playbook_id, playbook_name=pb.name, trust_score=pb.trust_score, ) report.archived_count += 1 
report.archived_ids.append(pb.playbook_id) # 原地更新 status 避免後續步驟重複處理 pb.status = PlaybookStatus.DEPRECATED except Exception as e: report.errors.append(f"archive_low_trust:{pb.playbook_id}:{e}") logger.warning( "evolver_archive_failed", playbook_id=pb.playbook_id, error=str(e), ) async def _archive_dormant(playbooks: list[Playbook], report: EvolverReport) -> None: """Step 2: 30d 未使用 AND trust < 0.5 → DEPRECATED(休眠退場)""" from src.services.playbook_service import get_playbook_service service = get_playbook_service() cutoff = now_taipei() - timedelta(days=DORMANT_DAYS) for pb in playbooks: if pb.status == PlaybookStatus.DEPRECATED: continue if pb.source == PlaybookSource.YAML_RULE: continue # yaml_rule playbooks 由 seeder 管理,不受休眠封存,保護自動修復鏈路 if pb.last_used_at is None: # 從未使用過 — 只在 trust 低於閾值時封存 if pb.trust_score >= DORMANT_TRUST_THRESHOLD: report.skipped_count += 1 continue elif pb.last_used_at > cutoff: # 30 天內有使用 — 不封存 report.skipped_count += 1 continue if pb.trust_score >= DORMANT_TRUST_THRESHOLD: # trust 夠高 — 保留休眠 Playbook report.skipped_count += 1 continue try: await service.update_with_validation( pb.playbook_id, {"status": PlaybookStatus.DEPRECATED.value}, ) logger.info( "evolver_archived_dormant", playbook_id=pb.playbook_id, playbook_name=pb.name, trust_score=pb.trust_score, last_used_at=str(pb.last_used_at), ) report.archived_count += 1 report.archived_ids.append(pb.playbook_id) pb.status = PlaybookStatus.DEPRECATED except Exception as e: report.errors.append(f"archive_dormant:{pb.playbook_id}:{e}") logger.warning( "evolver_dormant_archive_failed", playbook_id=pb.playbook_id, error=str(e), ) async def _merge_similar(playbooks: list[Playbook], report: EvolverReport) -> None: """ Step 3: 症狀 Jaccard 相似度 > 0.9 → 合併為一 策略:保留 trust_score 較高的那筆,將較差的標記 DEPRECATED 合併次數上限 MAX_MERGE_PER_RUN,避免單次操作影響太多。 """ from src.services.playbook_service import get_playbook_service service = get_playbook_service() merged_set: set[str] = set() # 已合併(或被合併)的 playbook_id active = [p for p in 
playbooks if p.status not in (PlaybookStatus.DEPRECATED,)] merge_count = 0 for i, pb_a in enumerate(active): if pb_a.playbook_id in merged_set: continue if merge_count >= MAX_MERGE_PER_RUN: break for pb_b in active[i + 1:]: if pb_b.playbook_id in merged_set: continue if merge_count >= MAX_MERGE_PER_RUN: break sim = _compute_symptom_similarity(pb_a, pb_b) if sim < MERGE_SIMILARITY_THRESHOLD: continue # 相似度 >= 0.9 → 合併 # 保留 trust 較高的,封存較差的 keep, drop = ( (pb_a, pb_b) if pb_a.trust_score >= pb_b.trust_score else (pb_b, pb_a) ) try: # 把 drop 的來源 incident 合入 keep merged_source_ids = list( set(keep.source_incident_ids) | set(drop.source_incident_ids) ) await service.update_with_validation( keep.playbook_id, {"source_incident_ids": merged_source_ids}, ) # 封存被合併的 await service.update_with_validation( drop.playbook_id, {"status": PlaybookStatus.DEPRECATED.value}, ) logger.info( "evolver_merged", kept_id=keep.playbook_id, dropped_id=drop.playbook_id, similarity=f"{sim:.2f}", kept_trust=keep.trust_score, dropped_trust=drop.trust_score, ) merged_set.add(drop.playbook_id) report.merged_count += 1 report.merged_pairs.append((drop.playbook_id, keep.playbook_id)) drop.status = PlaybookStatus.DEPRECATED merge_count += 1 except Exception as e: report.errors.append(f"merge:{keep.playbook_id}+{drop.playbook_id}:{e}") logger.warning( "evolver_merge_failed", keep_id=keep.playbook_id, drop_id=drop.playbook_id, error=str(e), ) # ───────────────────────────────────────────────────────────────────────────── # Helpers # ───────────────────────────────────────────────────────────────────────────── async def _fetch_all_active_playbooks() -> list[Playbook]: """ 抓取所有非 DEPRECATED 的 Playbook(用於 Evolver 掃描)。 2026-04-24 ogt + Claude Sonnet 4.6: 改為分兩次 query(DRAFT + APPROVED) 原本 list_playbooks(limit=500) 不傳 status → 把 294 筆 deprecated 全載入記憶體, 隨 deprecated 累積效能惡化;改用兩次 status 過濾查詢,只拉需要的資料。 """ try: from src.models.playbook import PlaybookStatus from src.services.playbook_service import 
get_playbook_service service = get_playbook_service() draft_playbooks, _ = await service.list_playbooks( status=PlaybookStatus.DRAFT, limit=500 ) approved_playbooks, _ = await service.list_playbooks( status=PlaybookStatus.APPROVED, limit=500 ) return draft_playbooks + approved_playbooks except Exception as e: logger.warning("evolver_fetch_playbooks_failed", error=str(e)) return [] def _compute_symptom_similarity(pb_a: Playbook, pb_b: Playbook) -> float: """ 計算兩個 Playbook 症狀模式的 Jaccard 相似度。 組合三維度: - alert_names Jaccard(權重 0.5) - keywords Jaccard(權重 0.3) - affected_services Jaccard(權重 0.2) """ from src.utils.similarity import calculate_jaccard_similarity sp_a = pb_a.symptom_pattern sp_b = pb_b.symptom_pattern alert_sim = calculate_jaccard_similarity( set(sp_a.alert_names), set(sp_b.alert_names) ) keyword_sim = calculate_jaccard_similarity( set(sp_a.keywords), set(sp_b.keywords) ) # affected_services 為空 = 通用型 Playbook → 視為完全相符 if not sp_a.affected_services and not sp_b.affected_services: service_sim = 1.0 elif not sp_a.affected_services or not sp_b.affected_services: service_sim = 0.5 # 一個通用一個特定 → 中等 else: service_sim = calculate_jaccard_similarity( set(sp_a.affected_services), set(sp_b.affected_services) ) return 0.5 * alert_sim + 0.3 * keyword_sim + 0.2 * service_sim # ───────────────────────────────────────────────────────────────────────────── # Singleton / Scheduling Hook # ───────────────────────────────────────────────────────────────────────────── async def schedule_daily_evolver() -> None: """ 供 startup 或 APScheduler 呼叫的每日 Evolver 觸發點。 呼叫方式(main.py lifespan 或 scheduler): asyncio.create_task(schedule_daily_evolver()) """ from src.core.feature_flags import aiops_flags if not aiops_flags.AIOPS_P3_EVOLVER_ENABLED: return logger.info("evolver_daily_scheduled") try: report = await run_evolver() logger.info( "evolver_daily_done", archived=report.archived_count, merged=report.merged_count, ) except Exception: logger.exception("evolver_daily_failed") 
DAILY_INTERVAL_SEC = 86_400  # pause between two Evolver runs: 24 hours, in seconds


async def run_evolver_loop() -> None:
    """
    Run the Evolver Agent forever, once every 24 hours.

    Meant to be mounted from main.py startup via asyncio.create_task. The
    first run starts as soon as the task is scheduled; each later run
    follows a DAILY_INTERVAL_SEC sleep. Any ``Exception`` raised by a run is
    logged and the loop carries on with the next sleep.

    ADR-083 Phase 3: daily Evolver scan (L4×D3 + L5×D2)
    2026-04-15 ogt + Claude Sonnet 4.6 (APAC): Phase 3 initial creation
    """
    while True:
        try:
            await schedule_daily_evolver()
        except Exception as exc:
            # Keep the loop alive: one failed run must not stop the schedule.
            failure = str(exc)
            logger.error("evolver_loop_error", error=failure)

        await asyncio.sleep(DAILY_INTERVAL_SEC)