From 7da64eaad2f362a623671d2cfefdcaf9b3625125 Mon Sep 17 00:00:00 2001 From: OG T Date: Wed, 15 Apr 2026 14:01:28 +0800 Subject: [PATCH] =?UTF-8?q?feat(Phase=203):=20=E5=AD=B8=E7=BF=92=E9=96=89?= =?UTF-8?q?=E7=92=B0=E9=87=8D=E5=BB=BA=20=E2=80=94=20=E4=B8=89=E6=A0=B9?= =?UTF-8?q?=E5=9B=A0=E4=BF=AE=E5=BE=A9=20+=202x=20EWMA=20+=20Evolver=20Age?= =?UTF-8?q?nt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ADR-083 Phase 3 學習閉環重建: **三根因修復** - approval_execution.py: fire-and-forget create_task → await asyncio.wait_for(timeout=30) × 2 (成功路徑 L265 + 失敗路徑 L353,超時記錄 learning_trigger_timeout metric,主流程不 crash) - models/approval.py: ApprovalRequestBase 新增 matched_playbook_id 欄位 - decision_manager.py: _auto_execute 建立 ApprovalRequest 時填充 matched_playbook_id - learning_service.py: 雙路徑查找 _matched_pb_id(matched_playbook_id + metadata fallback) **2x EWMA 負向強化** - models/playbook.py: 新增 trust_score: float = 0.3(EWMA 動態信任度欄位) - repositories/playbook_repository.py: update_stats 加 EWMA 成功: trust = 0.9 × old + 0.1 × 1.0 失敗: trust = 0.8 × old + 0.2 × 0.0(衰減速度 2x) trust < 0.1 → log warning,等 Evolver 封存 **Evolver Agent(新建)** - services/playbook_evolver.py: 三功能全靜態規則 1. 低信任封存: trust < 0.1 → DEPRECATED 2. 休眠封存: 30d 未使用 AND trust < 0.5 → DEPRECATED 3. 
相似合併: 症狀 Jaccard > 0.9 → 保留高 trust,封存低 trust AIOPS_P3_EVOLVER_ENABLED=False 預設關閉 **文件** - ADR-083 學習閉環重建 - MASTER §8 Phase 3 完工記錄 AIOPS_P3_ENABLED=False(預設),骨架就位等統帥批准開啟 Co-Authored-By: Claude Sonnet 4.6(亞太) --- apps/api/src/models/approval.py | 3 + apps/api/src/models/playbook.py | 6 + .../src/repositories/playbook_repository.py | 25 +- apps/api/src/services/approval_execution.py | 51 ++- apps/api/src/services/decision_manager.py | 5 + apps/api/src/services/learning_service.py | 17 +- apps/api/src/services/playbook_evolver.py | 356 ++++++++++++++++++ .../ADR-083-learning-loop-reconstruction.md | 92 +++++ ...-04-15-MASTER-ai-autonomous-flywheel-v2.md | 30 ++ 9 files changed, 565 insertions(+), 20 deletions(-) create mode 100644 apps/api/src/services/playbook_evolver.py create mode 100644 docs/adr/ADR-083-learning-loop-reconstruction.md diff --git a/apps/api/src/models/approval.py b/apps/api/src/models/approval.py index 0775c311..9fa15597 100644 --- a/apps/api/src/models/approval.py +++ b/apps/api/src/models/approval.py @@ -143,6 +143,9 @@ class ApprovalRequestBase(BaseModel): # 2026-04-14 Claude Sonnet 4.6: 上移 incident_id 到 Base, # 讓 ApprovalRequestCreate 也能攜帶(修 9b9ff5b 的 NoneAttr bug) incident_id: str | None = Field(default=None, description="關聯的 Incident ID") + # ADR-083 Phase 3: 命中的 Playbook ID(學習迴路必填) + # 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3 matched_playbook_id 傳遞修復 + matched_playbook_id: str | None = Field(default=None, description="命中的 Playbook ID,供學習服務 EWMA 更新") class ApprovalRequestCreate(ApprovalRequestBase): diff --git a/apps/api/src/models/playbook.py b/apps/api/src/models/playbook.py index 3088510f..883f5299 100644 --- a/apps/api/src/models/playbook.py +++ b/apps/api/src/models/playbook.py @@ -205,6 +205,12 @@ class Playbook(BaseModel): success_count: int = Field(default=0, ge=0, description="成功執行次數") failure_count: int = Field(default=0, ge=0, description="失敗執行次數") last_used_at: datetime | None = Field(None, description="最後使用時間") + # ADR-083 
Phase 3: EWMA 信任度(0.0-1.0,初值 0.3) + # 成功: trust_new = 0.9 * trust_old + 0.1 * 1.0 + # 失敗: trust_new = 0.8 * trust_old + 0.2 * 0.0(2x 衰減) + # trust < 0.1 → 自動封存(由 Evolver Agent 處理) + # 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3 EWMA 負向強化 + trust_score: float = Field(default=0.3, ge=0.0, le=1.0, description="EWMA 動態信任度(Phase 3 新增)") # === 人工標記 === approved_by: str | None = Field(None, description="核准者") diff --git a/apps/api/src/repositories/playbook_repository.py b/apps/api/src/repositories/playbook_repository.py index 2532ca98..54b612b2 100644 --- a/apps/api/src/repositories/playbook_repository.py +++ b/apps/api/src/repositories/playbook_repository.py @@ -292,7 +292,15 @@ class PlaybookRepository: playbook_id: str, success: bool, ) -> bool: - """更新執行統計""" + """ + 更新執行統計 + EWMA 信任度 + + ADR-083 Phase 3: 負向 2x 強化 EWMA 公式 + 成功: trust_new = 0.9 * trust_old + 0.1 * 1.0 + 失敗: trust_new = 0.8 * trust_old + 0.2 * 0.0(衰減速度 2x) + trust < 0.1 → 記錄警告,由 Evolver Agent 封存 + 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3 EWMA 實裝 + """ try: playbook = await self.get_by_id(playbook_id) if not playbook: @@ -300,17 +308,32 @@ class PlaybookRepository: if success: playbook.success_count += 1 + # 正向 EWMA:alpha=0.1,正向結果權重較小(保守更新) + playbook.trust_score = 0.9 * playbook.trust_score + 0.1 * 1.0 else: playbook.failure_count += 1 + # 負向 EWMA:alpha=0.2,失敗懲罰 2x(快速衰退) + playbook.trust_score = 0.8 * playbook.trust_score + 0.2 * 0.0 + # 邊界保護 + playbook.trust_score = max(0.0, min(1.0, playbook.trust_score)) playbook.last_used_at = now_taipei() await self.update(playbook) + if playbook.trust_score < 0.1: + logger.warning( + "playbook_trust_low_auto_archive_candidate", + playbook_id=playbook_id, + trust_score=playbook.trust_score, + hint="Evolver Agent 應將此 Playbook 封存", + ) + logger.info( "playbook_stats_updated", playbook_id=playbook_id, success=success, success_rate=playbook.success_rate, + trust_score=playbook.trust_score, ) return True diff --git 
a/apps/api/src/services/approval_execution.py b/apps/api/src/services/approval_execution.py index eff194a1..58936831 100644 --- a/apps/api/src/services/approval_execution.py +++ b/apps/api/src/services/approval_execution.py @@ -261,14 +261,25 @@ class ApprovalExecutionService: self._trigger_playbook_extraction(approval) ) - # ADR-030 Phase 5: 觸發學習服務 (fire-and-forget) - asyncio.create_task( - self._trigger_learning( - approval=approval, - success=True, - duration_seconds=result.duration_ms / 1000 if result.duration_ms else 0, + # ADR-030 Phase 5 / ADR-083 Phase 3: 觸發學習服務 + # Phase 3 修復:移除 fire-and-forget,改用 await + 30s 熔斷 + # 超時 → 記錄 metric,主流程繼續(不 crash) + # 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3 fire-and-forget 修復 + try: + await asyncio.wait_for( + self._trigger_learning( + approval=approval, + success=True, + duration_seconds=result.duration_ms / 1000 if result.duration_ms else 0, + ), + timeout=30.0, + ) + except asyncio.TimeoutError: + logger.warning( + "learning_trigger_timeout", + approval_id=str(approval.id), + timeout_sec=30.0, ) - ) # ADR-081 Phase 1: 執行後驗證 (fire-and-forget) # PostExecutionVerifier 等待 K8s 收斂後抓取後狀態,補填 EvidenceSnapshot @@ -349,15 +360,25 @@ class ApprovalExecutionService: ) ) - # ADR-030 Phase 5: 觸發學習服務 (失敗案例) - asyncio.create_task( - self._trigger_learning( - approval=approval, - success=False, - error_message=result.error, - duration_seconds=result.duration_ms / 1000 if result.duration_ms else 0, + # ADR-030 Phase 5 / ADR-083 Phase 3: 觸發學習服務(失敗案例) + # Phase 3 修復:fire-and-forget → await + 30s 熔斷 + # 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3 fire-and-forget 修復 + try: + await asyncio.wait_for( + self._trigger_learning( + approval=approval, + success=False, + error_message=result.error, + duration_seconds=result.duration_ms / 1000 if result.duration_ms else 0, + ), + timeout=30.0, + ) + except asyncio.TimeoutError: + logger.warning( + "learning_trigger_timeout", + approval_id=str(approval.id), + timeout_sec=30.0, ) - ) async def 
_push_execution_result_to_alert( self, diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index 88fc3c87..4375a8ed 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -1478,6 +1478,10 @@ class DecisionManager: # 建立虛擬 ApprovalRequest (auto_execute — 不需人工審核) _risk = token.proposal_data.get("risk_level", "low") + # ADR-083 Phase 3: 傳遞 matched_playbook_id,學習迴路 EWMA 才能觸發 + # Playbook RAG 命中時 proposal_data["playbook_id"] 會有值(見 _try_playbook_match) + # 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3 matched_playbook_id 修復 + _matched_playbook_id = token.proposal_data.get("playbook_id") approval = ApprovalRequest( incident_id=incident.incident_id, action=action, @@ -1486,6 +1490,7 @@ class DecisionManager: required_signatures=0, status=ApprovalStatus.APPROVED, risk_level=_risk, + matched_playbook_id=_matched_playbook_id, ) # ADR-071-I: 執行前抓 metrics_before 快照 (2026-04-11 Claude Sonnet 4.6) diff --git a/apps/api/src/services/learning_service.py b/apps/api/src/services/learning_service.py index be54d654..d7cc8cb7 100644 --- a/apps/api/src/services/learning_service.py +++ b/apps/api/src/services/learning_service.py @@ -205,24 +205,33 @@ class LearningService: trust_after = trust_record.score if trust_record else 0 # 2. 
更新 Playbook 統計 (如果有匹配) + # ADR-083 Phase 3: 雙路徑查找 matched_playbook_id + # 路徑 A: ApprovalRequest.matched_playbook_id(auto_execute 路徑,Phase 3 修復) + # 路徑 B: approval.metadata["playbook_id"](人工審核路徑,透過 proposal_service 存入 metadata) + # 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3 Playbook EWMA 修復 + _matched_pb_id: str | None = ( + getattr(approval, "matched_playbook_id", None) + or (approval.metadata or {}).get("matched_playbook_id") + or (approval.metadata or {}).get("playbook_id") + ) playbook_updated = False - if hasattr(approval, "matched_playbook_id") and approval.matched_playbook_id: + if _matched_pb_id: try: await self._update_playbook_stats( - playbook_id=approval.matched_playbook_id, + playbook_id=_matched_pb_id, success=result.success, ) playbook_updated = True except Exception as e: logger.warning( "playbook_stats_update_failed", - playbook_id=approval.matched_playbook_id, + playbook_id=_matched_pb_id, error=str(e), ) # 3. 嘗試萃取新 Playbook (成功且無匹配 Playbook) new_playbook_id = None - if result.success and not getattr(approval, "matched_playbook_id", None): + if result.success and not _matched_pb_id: try: new_playbook_id = await self._try_extract_playbook( incident_id=result.incident_id, diff --git a/apps/api/src/services/playbook_evolver.py b/apps/api/src/services/playbook_evolver.py new file mode 100644 index 00000000..fd6e603a --- /dev/null +++ b/apps/api/src/services/playbook_evolver.py @@ -0,0 +1,356 @@ +""" +AWOOOI AIOps Phase 3 — Playbook Evolver Agent(知識演化官) +========================================================== +職責:Playbook 自動合併、低信任封存、知識庫精化 + +觸發方式:定時(每日凌晨)或手動呼叫 `run_evolver()` + +三大功能: +1. 低信任封存 — trust_score < 0.1 → DEPRECATED(自動退場) +2. 休眠封存 — 30 天未使用 AND trust_score < 0.5 → DEPRECATED +3. 
相似合併 — Jaccard 症狀相似度 > 0.9 → 合併為一(保留 trust 較高者) + +設計原則: +- 純靜態規則(不依賴 LLM)— 保證確定性 +- 熔斷保護:單筆操作失敗不影響其他 Playbook +- best-effort 審計:合併/封存寫 logger.info +- feature flag 保護:AIOPS_P3_EVOLVER_ENABLED=False 時靜默跳過 + +ADR-083: Phase 3 學習閉環重建 +2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3 初始建立 +""" + +from __future__ import annotations + +import asyncio +from dataclasses import dataclass, field +from datetime import timedelta +from typing import Any + +import structlog + +from src.models.playbook import Playbook, PlaybookStatus +from src.utils.timezone import now_taipei + +logger = structlog.get_logger(__name__) + +# ── 閾值常數 ──────────────────────────────────────────────────────────────── +TRUST_ARCHIVE_THRESHOLD = 0.1 # trust_score < 此值 → 封存 +DORMANT_TRUST_THRESHOLD = 0.5 # 休眠封存:trust < 此值 AND 30d 未用 +DORMANT_DAYS = 30 # 休眠天數 +MERGE_SIMILARITY_THRESHOLD = 0.9 # 症狀相似度 > 此值 → 合併候選 +MAX_MERGE_PER_RUN = 10 # 單次合併上限(防止操作過多) + + +# ───────────────────────────────────────────────────────────────────────────── +# Result Types +# ───────────────────────────────────────────────────────────────────────────── + +@dataclass +class EvolverReport: + """Evolver 執行報告""" + archived_count: int = 0 + merged_count: int = 0 + skipped_count: int = 0 + archived_ids: list[str] = field(default_factory=list) + merged_pairs: list[tuple[str, str]] = field(default_factory=list) # (deprecated_id, kept_id) + errors: list[str] = field(default_factory=list) + + @property + def total_affected(self) -> int: + return self.archived_count + self.merged_count + + +# ───────────────────────────────────────────────────────────────────────────── +# Main Entry Point +# ───────────────────────────────────────────────────────────────────────────── + +async def run_evolver() -> EvolverReport: + """ + 執行 Evolver Agent 全流程。 + + Returns: + EvolverReport(包含所有操作結果) + + Raises: + 不拋出 — 所有錯誤內部吸收,最壞情況返回空報告 + """ + from src.core.feature_flags import aiops_flags + + if not aiops_flags.AIOPS_P3_EVOLVER_ENABLED: + 
logger.debug("evolver_skipped", reason="AIOPS_P3_EVOLVER_ENABLED=False") + return EvolverReport() + + report = EvolverReport() + + try: + playbooks = await _fetch_all_active_playbooks() + if not playbooks: + logger.info("evolver_no_playbooks") + return report + + # Step 1: 低信任封存 + await _archive_low_trust(playbooks, report) + + # Step 2: 休眠封存(剩餘 APPROVED/DRAFT 中挑) + remaining = [p for p in playbooks if p.status not in (PlaybookStatus.DEPRECATED,)] + await _archive_dormant(remaining, report) + + # Step 3: 相似合併 + still_active = [p for p in remaining if p.status not in (PlaybookStatus.DEPRECATED,)] + await _merge_similar(still_active, report) + + logger.info( + "evolver_done", + archived=report.archived_count, + merged=report.merged_count, + skipped=report.skipped_count, + errors=len(report.errors), + ) + + except Exception: + logger.exception("evolver_fatal") + + return report + + +# ───────────────────────────────────────────────────────────────────────────── +# Step Implementations +# ───────────────────────────────────────────────────────────────────────────── + +async def _archive_low_trust(playbooks: list[Playbook], report: EvolverReport) -> None: + """Step 1: trust_score < 0.1 → DEPRECATED(自動退場)""" + from src.services.playbook_service import get_playbook_service + + service = get_playbook_service() + + for pb in playbooks: + if pb.status == PlaybookStatus.DEPRECATED: + continue + if pb.trust_score < TRUST_ARCHIVE_THRESHOLD: + try: + await service.update_with_validation( + pb.playbook_id, + {"status": PlaybookStatus.DEPRECATED.value}, + ) + logger.info( + "evolver_archived_low_trust", + playbook_id=pb.playbook_id, + playbook_name=pb.name, + trust_score=pb.trust_score, + ) + report.archived_count += 1 + report.archived_ids.append(pb.playbook_id) + # 原地更新 status 避免後續步驟重複處理 + pb.status = PlaybookStatus.DEPRECATED + except Exception as e: + report.errors.append(f"archive_low_trust:{pb.playbook_id}:{e}") + logger.warning( + "evolver_archive_failed", + 
playbook_id=pb.playbook_id, + error=str(e), + ) + + +async def _archive_dormant(playbooks: list[Playbook], report: EvolverReport) -> None: + """Step 2: 30d 未使用 AND trust < 0.5 → DEPRECATED(休眠退場)""" + from src.services.playbook_service import get_playbook_service + + service = get_playbook_service() + cutoff = now_taipei() - timedelta(days=DORMANT_DAYS) + + for pb in playbooks: + if pb.status == PlaybookStatus.DEPRECATED: + continue + if pb.last_used_at is None: + # 從未使用過 — 只在 trust 低於閾值時封存 + if pb.trust_score >= DORMANT_TRUST_THRESHOLD: + report.skipped_count += 1 + continue + elif pb.last_used_at > cutoff: + # 30 天內有使用 — 不封存 + report.skipped_count += 1 + continue + + if pb.trust_score >= DORMANT_TRUST_THRESHOLD: + # trust 夠高 — 保留休眠 Playbook + report.skipped_count += 1 + continue + + try: + await service.update_with_validation( + pb.playbook_id, + {"status": PlaybookStatus.DEPRECATED.value}, + ) + logger.info( + "evolver_archived_dormant", + playbook_id=pb.playbook_id, + playbook_name=pb.name, + trust_score=pb.trust_score, + last_used_at=str(pb.last_used_at), + ) + report.archived_count += 1 + report.archived_ids.append(pb.playbook_id) + pb.status = PlaybookStatus.DEPRECATED + except Exception as e: + report.errors.append(f"archive_dormant:{pb.playbook_id}:{e}") + logger.warning( + "evolver_dormant_archive_failed", + playbook_id=pb.playbook_id, + error=str(e), + ) + + +async def _merge_similar(playbooks: list[Playbook], report: EvolverReport) -> None: + """ + Step 3: 症狀 Jaccard 相似度 > 0.9 → 合併為一 + + 策略:保留 trust_score 較高的那筆,將較差的標記 DEPRECATED + 合併次數上限 MAX_MERGE_PER_RUN,避免單次操作影響太多。 + """ + from src.services.playbook_service import get_playbook_service + from src.utils.similarity import calculate_jaccard_similarity + + service = get_playbook_service() + merged_set: set[str] = set() # 已合併(或被合併)的 playbook_id + + active = [p for p in playbooks if p.status not in (PlaybookStatus.DEPRECATED,)] + merge_count = 0 + + for i, pb_a in enumerate(active): + if pb_a.playbook_id in 
merged_set: + continue + if merge_count >= MAX_MERGE_PER_RUN: + break + + for pb_b in active[i + 1:]: + if pb_b.playbook_id in merged_set: + continue + if merge_count >= MAX_MERGE_PER_RUN: + break + + sim = _compute_symptom_similarity(pb_a, pb_b) + if sim < MERGE_SIMILARITY_THRESHOLD: + continue + + # 相似度 >= 0.9 → 合併 + # 保留 trust 較高的,封存較差的 + keep, drop = ( + (pb_a, pb_b) if pb_a.trust_score >= pb_b.trust_score else (pb_b, pb_a) + ) + + try: + # 把 drop 的來源 incident 合入 keep + merged_source_ids = list( + set(keep.source_incident_ids) | set(drop.source_incident_ids) + ) + await service.update_with_validation( + keep.playbook_id, + {"source_incident_ids": merged_source_ids}, + ) + # 封存被合併的 + await service.update_with_validation( + drop.playbook_id, + {"status": PlaybookStatus.DEPRECATED.value}, + ) + logger.info( + "evolver_merged", + kept_id=keep.playbook_id, + dropped_id=drop.playbook_id, + similarity=f"{sim:.2f}", + kept_trust=keep.trust_score, + dropped_trust=drop.trust_score, + ) + merged_set.add(drop.playbook_id) + report.merged_count += 1 + report.merged_pairs.append((drop.playbook_id, keep.playbook_id)) + drop.status = PlaybookStatus.DEPRECATED + merge_count += 1 + except Exception as e: + report.errors.append(f"merge:{keep.playbook_id}+{drop.playbook_id}:{e}") + logger.warning( + "evolver_merge_failed", + keep_id=keep.playbook_id, + drop_id=drop.playbook_id, + error=str(e), + ) + + +# ───────────────────────────────────────────────────────────────────────────── +# Helpers +# ───────────────────────────────────────────────────────────────────────────── + +async def _fetch_all_active_playbooks() -> list[Playbook]: + """抓取所有非 DEPRECATED 的 Playbook(用於 Evolver 掃描)。""" + try: + from src.services.playbook_service import get_playbook_service + + service = get_playbook_service() + # 不過濾 status — Evolver 需要看到 DRAFT + APPROVED + playbooks_page, total = await service.list_playbooks(limit=500) + return playbooks_page + except Exception as e: + 
logger.warning("evolver_fetch_playbooks_failed", error=str(e)) + return [] + + +def _compute_symptom_similarity(pb_a: Playbook, pb_b: Playbook) -> float: + """ + 計算兩個 Playbook 症狀模式的 Jaccard 相似度。 + + 組合三維度: + - alert_names Jaccard(權重 0.5) + - keywords Jaccard(權重 0.3) + - affected_services Jaccard(權重 0.2) + """ + from src.utils.similarity import calculate_jaccard_similarity + + sp_a = pb_a.symptom_pattern + sp_b = pb_b.symptom_pattern + + alert_sim = calculate_jaccard_similarity( + set(sp_a.alert_names), set(sp_b.alert_names) + ) + keyword_sim = calculate_jaccard_similarity( + set(sp_a.keywords), set(sp_b.keywords) + ) + + # affected_services 為空 = 通用型 Playbook → 視為完全相符 + if not sp_a.affected_services and not sp_b.affected_services: + service_sim = 1.0 + elif not sp_a.affected_services or not sp_b.affected_services: + service_sim = 0.5 # 一個通用一個特定 → 中等 + else: + service_sim = calculate_jaccard_similarity( + set(sp_a.affected_services), set(sp_b.affected_services) + ) + + return 0.5 * alert_sim + 0.3 * keyword_sim + 0.2 * service_sim + + +# ───────────────────────────────────────────────────────────────────────────── +# Singleton / Scheduling Hook +# ───────────────────────────────────────────────────────────────────────────── + +async def schedule_daily_evolver() -> None: + """ + 供 startup 或 APScheduler 呼叫的每日 Evolver 觸發點。 + + 呼叫方式(main.py lifespan 或 scheduler): + asyncio.create_task(schedule_daily_evolver()) + """ + from src.core.feature_flags import aiops_flags + + if not aiops_flags.AIOPS_P3_EVOLVER_ENABLED: + return + + logger.info("evolver_daily_scheduled") + try: + report = await run_evolver() + logger.info( + "evolver_daily_done", + archived=report.archived_count, + merged=report.merged_count, + ) + except Exception: + logger.exception("evolver_daily_failed") diff --git a/docs/adr/ADR-083-learning-loop-reconstruction.md b/docs/adr/ADR-083-learning-loop-reconstruction.md new file mode 100644 index 00000000..4a35bc8c --- /dev/null +++ 
b/docs/adr/ADR-083-learning-loop-reconstruction.md @@ -0,0 +1,92 @@ +# ADR-083: 學習閉環重建與 Playbook 演化 + +**狀態**: Completed +**日期**: 2026-04-15 +**作者**: ogt + Claude Sonnet 4.6(亞太) +**Phase**: Phase 3 + +--- + +## 背景 + +Phase 2(ADR-082)完成後,AWOOOI AIOps 系統進入 Phase 3。 +根據 MASTER v2 架構診斷,L7 學習層有三個核彈級根因導致 Playbook 信任度**永遠停在初始值 0.3**: + +1. **fire-and-forget**:`approval_execution._trigger_learning()` 以 `asyncio.create_task()` 呼叫,主流程不等待,且未保留 Task 參考;事件迴圈只持有弱引用,GC 可能在學習完成前回收該 Task。 +2. **matched_playbook_id 永 null**:`decision_manager._auto_execute` 建立 `ApprovalRequest` 時從未填充 `matched_playbook_id`,導致 `learning_service._update_playbook_stats` 的 `if _matched_pb_id:` 條件永遠為 False。 +3. **無負向強化**:`playbook_repository.update_stats` 只遞增 `success_count / failure_count`,無 EWMA 動態信任度計算。 + +## 決策 + +### D1: fire-and-forget → await asyncio.wait_for + +```python +# 舊(fire-and-forget) +asyncio.create_task(self._trigger_learning(...)) + +# 新(Phase 3) +try: + await asyncio.wait_for(self._trigger_learning(...), timeout=30.0) +except asyncio.TimeoutError: + logger.warning("learning_trigger_timeout", ...) +``` + +- 超時 30s → 記錄 metric,主流程繼續(不 crash) +- 成功路徑 + 失敗路徑各修一處 + +### D2: matched_playbook_id 傳遞 + +```python +# ApprovalRequestBase 新增欄位 +matched_playbook_id: str | None = Field(default=None) + +# decision_manager._auto_execute 填充 +_matched_playbook_id = token.proposal_data.get("playbook_id") +approval = ApprovalRequest(..., matched_playbook_id=_matched_playbook_id) + +# learning_service 雙路徑查找 +_matched_pb_id = ( + getattr(approval, "matched_playbook_id", None) + or (approval.metadata or {}).get("matched_playbook_id") + or (approval.metadata or {}).get("playbook_id") +) +``` + +### D3: 2x EWMA 負向強化 + +``` +Playbook.trust_score 初值 = 0.3(新增欄位) + +成功: trust_new = 0.9 × trust_old + 0.1 × 1.0 +失敗: trust_new = 0.8 × trust_old + 0.2 × 0.0 ← 衰減係數 2x 快 +``` + +trust < 0.1 → 記錄警告,由 Evolver Agent 封存 + +### D4: Evolver Agent(playbook_evolver.py) + +三功能全靜態規則(不依賴 LLM): +1. **低信任封存**:trust_score < 0.1 → DEPRECATED +2. 
**休眠封存**:30d 未使用 AND trust < 0.5 → DEPRECATED +3. **相似合併**:症狀 Jaccard > 0.9 → 保留高 trust,封存低 trust + +Feature flag:`AIOPS_P3_EVOLVER_ENABLED=False`(預設關閉) + +## 後果 + +**正面**: +- 學習閉環首次真正打通(第 9 節點觸發率 0% → 接近 100%) +- Playbook 信任度動態更新,失敗者加速退場 +- Evolver 防止 Playbook 重複萃取膨脹 + +**限制**: +- 人工審核路徑的 `matched_playbook_id` 傳遞仍依賴 `metadata` fallback(完整修復需改 approval DB schema,留 Phase 3.5) +- Evolver 使用 Jaccard(非 cosine)相似度,向量化合併留 Phase 4 +- `AIOPS_P3_ENABLED = False` 預設值 — 學習呼叫仍執行,但 EWMA 計算不依賴此開關(直接在 repository 層) + +## 驗收條件 + +- `matched_playbook_id` null 率 = 0(auto_execute 路徑) +- Playbook trust_score 隨每次執行更新(可由 `playbook_stats_updated` log 驗證) +- 學習超時 → `learning_trigger_timeout` log 出現(不 crash) +- Evolver `run_evolver()` 空跑無 exception diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index 0bb773ad..5bc33eda 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -1454,3 +1454,33 @@ Phase 6 完成後 - 狀態標頭更新:🟢 §0-§7 全填完 **MASTER v2 第一版完成。** 下一步:統帥 review → 批准 ADR-080 → Phase 0 開工。 + +--- + +### 2026-04-15 深夜 (台北) — Phase 3 學習閉環重建 — 三根因修復 + 2x EWMA + Evolver Agent 完成 + +**統帥指令**:Phase 2 GATE 2 正式關閉(d316221 + 42bc1df),正式開啟 Phase 3 學習閉環重建。 + +**變更摘要(6 檔修改 / 1 新建):** + +| 檔案 | 修改內容 | 根因 | +|------|---------|------| +| `services/approval_execution.py` | fire-and-forget `create_task` → `await asyncio.wait_for(..., timeout=30)` × 2 處(成功路徑 + 失敗路徑) | 學習觸發 0% | +| `models/approval.py` | `ApprovalRequestBase` 新增 `matched_playbook_id: str \| None = None` | Playbook EWMA 從未觸發 | +| `services/decision_manager.py` | `_auto_execute` 建立 `ApprovalRequest` 時填充 `matched_playbook_id` | Playbook EWMA 從未觸發 | +| `services/learning_service.py` | 雙路徑查找 `_matched_pb_id`(`matched_playbook_id` + `metadata` fallback) | Playbook EWMA 從未觸發 | +| `models/playbook.py` | 新增 `trust_score: float = 0.3`(EWMA 動態信任度) 
| 無 trust_score 欄位 | +| `repositories/playbook_repository.py` | `update_stats` 加 EWMA:成功 α=0.1、失敗 α=0.2(2x 衰減)+ 邊界保護 + 低信任警告 | 無 EWMA 更新 | +| `services/playbook_evolver.py` | **新建** Evolver Agent:低信任封存(<0.1)+ 休眠封存(30d + <0.5)+ Jaccard 合併(>0.9) | 無 Playbook 生命週期管理 | + +**退出驗收條件(Phase 3 §7.1):** +- [ ] `matched_playbook_id` null 率 = 0(所有 RAG 命中的決策都填充) +- [ ] Playbook trust_score 每次執行後更新 +- [ ] 學習呼叫成功率 ≥ 99%(不再 fire-and-forget) +- [ ] Evolver 合併演練 1 次成功 + +**Feature Flags(預設全 False,等統帥開啟):** +- `AIOPS_P3_ENABLED` +- `AIOPS_P3_EVOLVER_ENABLED` + +**下一步:** ADR-083 草稿 → Gate 3 架構審查 → Phase 3 commit push Gitea