Files
awoooi/apps/api/src/services/learning_service.py
Your Name 6e04fe9c8a
Some checks failed
CD Pipeline / tests (push) Successful in 1m28s
Code Review / ai-code-review (push) Successful in 29s
Type Sync Check / check-type-sync (push) Failing after 2m41s
CD Pipeline / build-and-deploy (push) Successful in 8m40s
CD Pipeline / post-deploy-checks (push) Successful in 3m10s
feat(playbook): generate drafts with local llm
2026-04-30 23:04:58 +08:00

1342 lines
50 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Learning Service - Phase 5 持續學習迴圈
======================================
ADR-030: 智能自動修復系統
Phase D-G P0 修正: 符合 leWOOOgo 積木化原則
從執行結果中學習,持續優化決策:
1. 更新 Playbook 統計 (成功率/執行次數)
2. 調整信任度 (成功 +分 / 失敗 -分)
3. 萃取新 Playbook (成功案例自動萃取)
4. 處理人工反饋 (有效性評分)
5. 🆕 Redis 持久化學習數據 (透過 Repository)
6. 🆕 修復推薦 (基於歷史成功率)
設計原則:
- 非同步執行,不阻塞主流程
- 失敗容忍,學習失敗不影響執行結果
- 完整審計追蹤
- 🆕 Service 不直接存取 Redis (透過 ILearningRepository)
版本: v1.1
建立: 2026-03-26 (台北時區)
更新: 2026-03-29 (台北時區) - P0 修正: 新增 Repository 層
"""
from dataclasses import dataclass, field
from datetime import UTC, datetime
from enum import Enum
from typing import Any
import structlog
from src.models.approval import ApprovalRequest
from src.models.incident import IncidentStatus
from src.repositories.interfaces import ILearningRepository, ITrustRepository
from src.repositories.learning_repository import get_learning_repository
from src.repositories.trust_repository import get_trust_repository
from src.services.trust_engine import get_trust_manager
logger = structlog.get_logger(__name__)
# =============================================================================
# Constants
# =============================================================================
class FeedbackType(str, Enum):
    """Kinds of feedback signal the learning service can record.

    Members are also ``str`` instances, so ``.value`` is the wire/log form.
    """

    EXECUTION_SUCCESS = "execution_success"  # execution succeeded
    EXECUTION_FAILURE = "execution_failure"  # execution failed
    HUMAN_APPROVE = "human_approve"  # a human approved
    HUMAN_REJECT = "human_reject"  # a human rejected
    HUMAN_OVERRIDE = "human_override"  # a human overrode the AI decision
    EFFECTIVENESS_RATING = "effectiveness_rating"  # effectiveness score
# Trust adjustment parameters.
# NOTE(review): none of these three constants is referenced by the code
# visible in this file; the trust manager is called without them — confirm
# whether they are still used elsewhere.
TRUST_SUCCESS_BOOST = 1  # success: +1 point
TRUST_FAILURE_PENALTY = 2  # failure: -2 points (or reset to zero)
TRUST_HUMAN_REJECT_PENALTY = 1  # human rejection: -1 point
# =============================================================================
# Data Models
# =============================================================================
@dataclass
class ExecutionResult:
"""執行結果"""
approval_id: str
incident_id: str
action: str
success: bool
error_message: str | None = None
duration_seconds: float = 0.0
executed_at: datetime = field(default_factory=lambda: datetime.now(UTC))
def to_dict(self) -> dict[str, Any]:
return {
"approval_id": self.approval_id,
"incident_id": self.incident_id,
"action": self.action,
"success": self.success,
"error_message": self.error_message,
"duration_seconds": self.duration_seconds,
"executed_at": self.executed_at.isoformat(),
}
@dataclass
class FeedbackRequest:
    """Feedback submitted by a human for one incident."""

    incident_id: str
    feedback_type: FeedbackType
    effectiveness_score: int | None = None  # 1-5 rating
    learning_notes: str | None = None  # free-form learning notes
    submitted_by: str | None = None
    # Timezone-aware UTC timestamp taken when the instance is created.
    submitted_at: datetime = field(default_factory=lambda: datetime.now(UTC))
@dataclass
class LearningRecord:
    """What one learning pass changed: trust before/after and playbook effects.

    ``learned_at`` defaults to the timezone-aware UTC time at which the
    instance is created.
    """

    incident_id: str
    feedback_type: FeedbackType
    action_pattern: str
    trust_before: int
    trust_after: int
    playbook_updated: bool = False
    new_playbook_id: str | None = None
    learned_at: datetime = field(default_factory=lambda: datetime.now(UTC))

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a dict; the enum becomes its value and
        ``learned_at`` becomes ISO-8601 text."""
        return dict(
            incident_id=self.incident_id,
            feedback_type=self.feedback_type.value,
            action_pattern=self.action_pattern,
            trust_before=self.trust_before,
            trust_after=self.trust_after,
            playbook_updated=self.playbook_updated,
            new_playbook_id=self.new_playbook_id,
            learned_at=self.learned_at.isoformat(),
        )
# =============================================================================
# Learning Service
# =============================================================================
class LearningService:
"""
持續學習服務
職責:
1. 處理執行結果 → 更新 Playbook + 信任度
2. 處理人工反饋 → 調整 Playbook 有效性
3. 萃取新 Playbook (成功案例)
4. 🆕 Redis 持久化學習數據 (透過 Repository)
5. 🆕 修復推薦 (基於歷史成功率)
2026-03-29 P0 修正: 符合 leWOOOgo 積木化原則
- 透過 ILearningRepository 存取 Redis
- 不直接依賴 Redis Client
"""
# 推薦門檻
MIN_SAMPLES = 5 # 最少需要 N 次數據才能推薦
SUCCESS_RATE_THRESHOLD = 0.6 # 成功率門檻
def __init__(
    self,
    repository: ILearningRepository | None = None,
    trust_repository: ITrustRepository | None = None,
):
    """Wire up collaborators, falling back to the module-level factories.

    Args:
        repository: Learning repository; when falsy the result of
            ``get_learning_repository()`` is used.
        trust_repository: Trust repository; when falsy the result of
            ``get_trust_repository()`` is used.
    """
    self._trust_manager = get_trust_manager()

    if not repository:
        repository = get_learning_repository()
    self._repository = repository

    if not trust_repository:
        trust_repository = get_trust_repository()
    self._trust_repo = trust_repository
async def process_execution_result(
    self,
    approval: ApprovalRequest,
    result: ExecutionResult,
) -> LearningRecord:
    """Learn from the outcome of an executed approval.

    Steps, in order:
      1. Record an approval (success) or rejection (failure) for the action
         pattern in the trust manager, then persist it via ``_persist_trust``.
      2. Update the statistics of the matched playbook, if one is known.
      3. On success with no matched playbook, try to extract a new playbook.

    Args:
        approval: The approval request that was executed.
        result: Outcome of that execution.

    Returns:
        A ``LearningRecord`` holding the trust score before/after and the
        playbook side effects.
    """
    action_pattern = self._extract_action_pattern(approval.action)

    # Trust score before any adjustment (0 when no record exists yet).
    trust_record = self._trust_manager.get_trust_record(action_pattern)
    trust_before = trust_record.score if trust_record else 0

    # 1. Adjust trust.
    if result.success:
        self._trust_manager.record_approval(
            action_pattern=action_pattern,
            user_role="system",
            user_id="auto_learning",
        )
        feedback_type = FeedbackType.EXECUTION_SUCCESS
    else:
        self._trust_manager.record_rejection(
            action_pattern=action_pattern,
            user_role="system",
            user_id="auto_learning",
            reason=result.error_message,
        )
        feedback_type = FeedbackType.EXECUTION_FAILURE

    # ADR-088: persist the trust score so it is not lost on restart.
    await self._persist_trust(action_pattern)

    trust_record = self._trust_manager.get_trust_record(action_pattern)
    trust_after = trust_record.score if trust_record else 0

    # 2. Update playbook statistics when a playbook was matched.
    # ADR-083 Phase 3: the id is looked up in three places, first hit wins:
    # the ``matched_playbook_id`` attribute on the approval, then
    # ``metadata["matched_playbook_id"]``, then ``metadata["playbook_id"]``.
    metadata = approval.metadata or {}
    matched_playbook_id: str | None = (
        getattr(approval, "matched_playbook_id", None)
        or metadata.get("matched_playbook_id")
        or metadata.get("playbook_id")
    )

    playbook_updated = False
    if matched_playbook_id:
        try:
            await self._update_playbook_stats(
                playbook_id=matched_playbook_id,
                success=result.success,
            )
            # NOTE(review): _update_playbook_stats catches and logs its own
            # exceptions, so this flag can be True even when the update
            # failed — confirm whether that is acceptable.
            playbook_updated = True
        except Exception as e:
            logger.warning(
                "playbook_stats_update_failed",
                playbook_id=matched_playbook_id,
                error=str(e),
            )

    # 3. Successful run without a matched playbook: try to extract one.
    new_playbook_id = None
    if result.success and not matched_playbook_id:
        try:
            new_playbook_id = await self._try_extract_playbook(
                incident_id=result.incident_id,
                action=approval.action,
            )
        except Exception as e:
            logger.warning(
                "playbook_extraction_failed",
                incident_id=result.incident_id,
                error=str(e),
            )

    record = LearningRecord(
        incident_id=result.incident_id,
        feedback_type=feedback_type,
        action_pattern=action_pattern,
        trust_before=trust_before,
        trust_after=trust_after,
        playbook_updated=playbook_updated,
        new_playbook_id=new_playbook_id,
    )
    logger.info(
        "learning_completed",
        incident_id=result.incident_id,
        success=result.success,
        # A separator is required: without it 1 then 2 is logged as "12".
        trust_change=f"{trust_before}->{trust_after}",
        playbook_updated=playbook_updated,
        new_playbook=new_playbook_id,
    )
    return record
async def process_human_feedback(
    self,
    feedback: FeedbackRequest,
) -> LearningRecord:
    """Apply a human feedback submission to trust and playbook confidence.

    Handling by feedback type:
      * ``HUMAN_APPROVE``: record an approval.
      * ``HUMAN_REJECT``: record a rejection.
      * ``EFFECTIVENESS_RATING``: score >= 4 records an approval and promotes
        the incident's playbooks; score <= 2 records a rejection and demotes
        them; a score of 3 or a missing score changes nothing.
      * any other type: no trust change.

    The trust record is persisted via ``_persist_trust`` in every case.

    Args:
        feedback: The feedback submission.

    Returns:
        A ``LearningRecord`` holding the trust score before/after and whether
        any playbook was updated.
    """
    # NOTE(review): the original comment says the action pattern should be
    # looked up from the incident; for now trust is keyed by incident id.
    action_pattern = f"incident:{feedback.incident_id}"

    trust_record = self._trust_manager.get_trust_record(action_pattern)
    trust_before = trust_record.score if trust_record else 0
    playbook_updated = False

    if feedback.feedback_type == FeedbackType.HUMAN_APPROVE:
        self._trust_manager.record_approval(
            action_pattern=action_pattern,
            user_role="human",
            user_id=feedback.submitted_by,
        )
    elif feedback.feedback_type == FeedbackType.HUMAN_REJECT:
        self._trust_manager.record_rejection(
            action_pattern=action_pattern,
            user_role="human",
            user_id=feedback.submitted_by,
            reason="Human rejected",
        )
    elif feedback.feedback_type == FeedbackType.EFFECTIVENESS_RATING:
        if feedback.effectiveness_score is not None:
            if feedback.effectiveness_score >= 4:
                # High rating: raise trust and playbook confidence.
                self._trust_manager.record_approval(
                    action_pattern=action_pattern,
                    user_role="feedback",
                    user_id=feedback.submitted_by,
                )
                playbook_updated = await self._promote_playbook(feedback.incident_id)
            elif feedback.effectiveness_score <= 2:
                # Low rating: lower trust and playbook confidence.
                self._trust_manager.record_rejection(
                    action_pattern=action_pattern,
                    user_role="feedback",
                    user_id=feedback.submitted_by,
                    reason=f"Low effectiveness score: {feedback.effectiveness_score}",
                )
                playbook_updated = await self._demote_playbook(feedback.incident_id)

    # ADR-088: persist the trust score so it is not lost on restart.
    await self._persist_trust(action_pattern)

    trust_record = self._trust_manager.get_trust_record(action_pattern)
    trust_after = trust_record.score if trust_record else 0

    record = LearningRecord(
        incident_id=feedback.incident_id,
        feedback_type=feedback.feedback_type,
        action_pattern=action_pattern,
        trust_before=trust_before,
        trust_after=trust_after,
        playbook_updated=playbook_updated,
    )
    logger.info(
        "human_feedback_processed",
        incident_id=feedback.incident_id,
        feedback_type=feedback.feedback_type.value,
        effectiveness_score=feedback.effectiveness_score,
        # A separator is required: without it 1 then 2 is logged as "12".
        trust_change=f"{trust_before}->{trust_after}",
    )
    return record
# =========================================================================
# Private Methods
# =========================================================================
def _extract_action_pattern(self, action: str) -> str:
"""從 action 字串提取 pattern"""
if not action:
return "unknown"
parts = action.split()
if len(parts) < 3:
return "unknown"
verb = parts[1] if len(parts) > 1 else "unknown"
resource_part = parts[2] if len(parts) > 2 else ""
if "/" in resource_part:
resource_name = resource_part.split("/")[-1]
else:
resource_name = resource_part
# 移除 pod hash suffix
resource_parts = resource_name.split("-")
if len(resource_parts) >= 3:
resource_name = "-".join(resource_parts[:-2]) + "-*"
return f"{verb}:{resource_name}"
async def _update_playbook_stats(
    self,
    playbook_id: str,
    success: bool,
) -> None:
    """Record one execution outcome against a playbook.

    After the execution is recorded, and only when
    ``settings.ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP`` is on (W2 PR-L1, logic 2),
    the playbook's symptom-pattern hash is computed and passed to
    ``_check_and_mark_playbook_review``.

    Never raises: every failure is logged as a warning and swallowed.

    Args:
        playbook_id: Playbook whose statistics are updated.
        success: Whether the execution succeeded.
    """
    try:
        from src.services.playbook_service import get_playbook_service

        await get_playbook_service().record_execution(playbook_id, success)

        from src.core.config import settings

        if not settings.ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP:
            return

        # The review check is best-effort and has its own error boundary so
        # a failure here is reported separately from a stats failure.
        try:
            from src.repositories.playbook_repository import get_playbook_repository
            from src.models.playbook import SymptomPattern

            found = await get_playbook_repository().get_by_id(playbook_id)
            if not found or not found.symptom_pattern:
                return

            pattern = found.symptom_pattern
            # The pattern may arrive as a plain dict; validate it into the
            # model so that compute_hash() is available.
            if isinstance(pattern, dict):
                pattern = SymptomPattern.model_validate(pattern)

            await self._check_and_mark_playbook_review(pattern.compute_hash())
        except Exception as review_error:
            logger.warning(
                "playbook_review_check_failed",
                playbook_id=playbook_id,
                error=str(review_error),
            )
    except Exception as stats_error:
        logger.warning(
            "playbook_stats_update_error",
            playbook_id=playbook_id,
            error=str(stats_error),
        )
async def _try_extract_playbook(
    self,
    incident_id: str,
    action: str,
) -> str | None:
    """Try to derive a playbook from a successfully handled incident.

    Only incidents whose status is RESOLVED or CLOSED are considered. When
    ``settings.ENABLE_LLM_PLAYBOOK_GENERATION`` is on (ADR-104) the playbook
    generator is tried first; if it yields no playbook, or the flag is off,
    the playbook service's extraction is used with ``auto_approve=False``.

    Never raises: failures are logged as a warning and ``None`` is returned.

    Args:
        incident_id: Incident to extract from.
        action: The executed action, forwarded to the generator.

    Returns:
        The new playbook's id, or ``None`` when nothing was produced.
    """
    try:
        from src.repositories.incident_repository import get_incident_repository
        from src.services.playbook_service import get_playbook_service
        from src.core.config import settings

        incident = await get_incident_repository().get_by_id(incident_id)
        if not incident:
            return None

        extractable = (IncidentStatus.RESOLVED, IncidentStatus.CLOSED)
        if incident.status not in extractable:
            return None

        if settings.ENABLE_LLM_PLAYBOOK_GENERATION:
            from src.services.playbook_generator import get_playbook_generator

            generator = get_playbook_generator()
            generated = await generator.generate_from_incident(
                incident=incident,
                action=action,
                persist=True,
            )
            if generated.playbook:
                logger.info(
                    "playbook_llm_generated",
                    incident_id=incident_id,
                    playbook_id=generated.playbook.playbook_id,
                    outcome=generated.outcome,
                    provider=generated.provider,
                )
                return generated.playbook.playbook_id

        # Legacy extraction path; the result still needs human review.
        extracted = await get_playbook_service().extract_from_incident(
            incident=incident,
            auto_approve=False,
        )
        if not extracted:
            return None

        logger.info(
            "playbook_auto_extracted",
            incident_id=incident_id,
            playbook_id=extracted.playbook_id,
        )
        return extracted.playbook_id
    except Exception as error:
        logger.warning(
            "playbook_extraction_error",
            incident_id=incident_id,
            error=str(error),
        )
        return None
async def _promote_playbook(self, incident_id: str) -> bool:
    """Raise the confidence of playbooks sourced from an incident.

    Every playbook returned by ``find_by_source_incident`` gets
    ``adjust_confidence(delta=+0.1)``; each successful adjustment also writes
    a KM evolution entry via ``_write_playbook_evolution_km`` (W2 PR-L1).

    Never raises: failures are logged as a warning and ``False`` is returned.

    Args:
        incident_id: Incident that received the high effectiveness rating.

    Returns:
        ``True`` when at least one playbook was adjusted.
    """
    try:
        from src.repositories.playbook_repository import get_playbook_repository

        repo = get_playbook_repository()
        playbooks = await repo.find_by_source_incident(incident_id)
        if not playbooks:
            logger.debug(
                "playbook_promote_no_match",
                incident_id=incident_id,
            )
            return False

        confidence_boost = 0.1
        updated_count = 0
        for playbook in playbooks:
            previous_trust = playbook.trust_score
            result = await repo.adjust_confidence(
                playbook_id=playbook.playbook_id,
                delta=confidence_boost,
                reason=f"High effectiveness rating from incident {incident_id}",
            )
            if not result:
                continue
            updated_count += 1

            # ``playbook`` was loaded before the adjustment, so reporting it
            # would record a stale "new" state. The return type of
            # adjust_confidence is not visible here; use it only when it
            # looks like a playbook, otherwise keep the original object.
            is_playbook = all(
                hasattr(result, attr)
                for attr in ("playbook_id", "name", "trust_score")
            )
            current = result if is_playbook else playbook

            await self._write_playbook_evolution_km(
                playbook=current,
                previous_trust=previous_trust,
                evolution_type="promote",
                incident_id=incident_id,
            )

        logger.info(
            "playbook_promoted",
            incident_id=incident_id,
            updated_count=updated_count,
            total_playbooks=len(playbooks),
        )
        return updated_count > 0
    except Exception as e:
        logger.warning(
            "playbook_promote_failed",
            incident_id=incident_id,
            error=str(e),
        )
        return False
async def _demote_playbook(self, incident_id: str) -> bool:
    """Lower the confidence of playbooks sourced from an incident.

    Every playbook returned by ``find_by_source_incident`` gets
    ``adjust_confidence(delta=-0.15)``; the penalty is deliberately larger
    than the promotion boost. Each successful adjustment writes a KM
    evolution entry, and when the playbook's status is DEPRECATED the
    matching alert-rule catalog entries are demoted as well (W2 PR-L1,
    logic 3).

    Never raises: failures are logged as a warning and ``False`` is returned.

    Args:
        incident_id: Incident that received the low effectiveness rating.

    Returns:
        ``True`` when at least one playbook was adjusted.
    """
    try:
        from src.models.playbook import PlaybookStatus
        from src.repositories.playbook_repository import get_playbook_repository

        repo = get_playbook_repository()
        playbooks = await repo.find_by_source_incident(incident_id)
        if not playbooks:
            logger.debug(
                "playbook_demote_no_match",
                incident_id=incident_id,
            )
            return False

        confidence_penalty = -0.15
        updated_count = 0
        for playbook in playbooks:
            previous_trust = playbook.trust_score
            result = await repo.adjust_confidence(
                playbook_id=playbook.playbook_id,
                delta=confidence_penalty,
                reason=f"Low effectiveness rating from incident {incident_id}",
            )
            if not result:
                continue
            updated_count += 1

            # ``playbook`` was loaded before the adjustment, so its status
            # and trust are stale: a playbook deprecated by this very call
            # would be missed below. The return type of adjust_confidence is
            # not visible here; use it only when it looks like a playbook.
            is_playbook = all(
                hasattr(result, attr)
                for attr in ("playbook_id", "name", "trust_score", "status")
            )
            current = result if is_playbook else playbook

            await self._write_playbook_evolution_km(
                playbook=current,
                previous_trust=previous_trust,
                evolution_type="demote",
                incident_id=incident_id,
            )

            if current.status == PlaybookStatus.DEPRECATED:
                await self._demote_alert_rule_catalog_confidence(current)

        logger.info(
            "playbook_demoted",
            incident_id=incident_id,
            updated_count=updated_count,
            total_playbooks=len(playbooks),
        )
        return updated_count > 0
    except Exception as e:
        logger.warning(
            "playbook_demote_failed",
            incident_id=incident_id,
            error=str(e),
        )
        return False
# =========================================================================
# W2 PR-L1: KM → Playbook 互饋回路私有方法
# 飛輪斷鏈 C3 + C4 修復
# 2026-04-28 ogt + Claude Sonnet 4.6
# =========================================================================
async def _write_playbook_evolution_km(
    self,
    playbook: Any,
    previous_trust: float,
    evolution_type: str,
    incident_id: str,
) -> None:
    """Write a KM entry describing a playbook promote/demote event.

    Logic 1 of W2 PR-L1. Does nothing unless
    ``settings.ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP`` is on. The entry's
    ``path_type`` is ``playbook_evolution:<playbook_id>`` so that evolutions
    of different playbooks for the same incident stay distinct.

    Errors raised while building or writing the entry are logged as a
    warning and swallowed.

    Args:
        playbook: Object exposing ``playbook_id`` and ``name``;
            ``trust_score``, ``success_count`` and ``failure_count`` are
            read when present.
        previous_trust: Trust score before the adjustment.
        evolution_type: Label of the event, e.g. ``"promote"`` / ``"demote"``.
        incident_id: Incident that triggered the event.
    """
    from src.core.config import settings

    if not settings.ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP:
        return

    try:
        from src.services.km_writer import KMWritePayload, km_write_with_flag
        from src.utils.timezone import now_taipei

        new_trust = getattr(playbook, "trust_score", previous_trust)
        success_count = getattr(playbook, "success_count", 0)
        failure_count = getattr(playbook, "failure_count", 0)
        path_type = f"playbook_evolution:{playbook.playbook_id}"
        # A separator is required: without it 0.500 then 0.600 renders as
        # "0.5000.600" in both the KM content and the log.
        trust_change = f"{previous_trust:.3f} -> {new_trust:.3f}"

        payload = KMWritePayload(
            path_type=path_type,
            incident_id=incident_id,
            entry_create_kwargs={
                "title": f"Playbook {evolution_type}: {playbook.name} [{playbook.playbook_id}]",
                "content": (
                    f"Playbook {evolution_type} 事件記錄\n"
                    f"Playbook ID: {playbook.playbook_id}\n"
                    f"名稱: {playbook.name}\n"
                    f"trust_score 變化: {trust_change}\n"
                    f"成功次數: {success_count} / 失敗次數: {failure_count}\n"
                    f"觸發來源: incident {incident_id}\n"
                    f"記錄時間: {now_taipei().isoformat()}"
                ),
                "entry_type": "best_practice",
                "category": "AI系統",
                "tags": ["playbook_evolution", evolution_type, playbook.playbook_id],
                "source": "ai_extracted",
                "related_playbook_id": playbook.playbook_id,
                "related_incident_id": incident_id,
                "path_type": path_type,
            },
            metadata={
                "playbook_id": playbook.playbook_id,
                "previous_trust": previous_trust,
                "new_trust": new_trust,
                "success_count": success_count,
                "failure_count": failure_count,
                "evolution_type": evolution_type,
            },
        )
        await km_write_with_flag(payload)
        logger.info(
            "playbook_evolution_km_written",
            playbook_id=playbook.playbook_id,
            evolution_type=evolution_type,
            trust_change=trust_change,
        )
    except Exception as e:
        logger.warning(
            "playbook_evolution_km_write_failed",
            playbook_id=getattr(playbook, "playbook_id", "unknown"),
            evolution_type=evolution_type,
            error=str(e),
        )
async def _check_and_mark_playbook_review(self, symptoms_hash: str) -> None:
    """Flag playbooks for review once enough KM entries share a symptom hash.

    Logic 2 of W2 PR-L1. Counts ``knowledge_entries`` rows with the given
    ``symptoms_hash``; when the count reaches
    ``settings.KM_PLAYBOOK_REVIEW_THRESHOLD``, sets ``review_required = true``
    on every playbook that is linked to such an entry through
    ``related_playbook_id`` and is not flagged yet.

    Does nothing when ``settings.ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP`` is off or
    the hash is empty. Database errors are logged as a warning and swallowed.

    Args:
        symptoms_hash: Hash of the symptom pattern to check.
    """
    from src.core.config import settings

    if not settings.ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP or not symptoms_hash:
        return

    try:
        from sqlalchemy import text as sa_text
        from src.db.base import get_db_context

        params = {"hash": symptoms_hash}
        count_stmt = sa_text(
            "SELECT COUNT(*) FROM knowledge_entries "
            "WHERE symptoms_hash = :hash"
        )
        # Playbooks have no hash column of their own, so the link goes
        # through knowledge_entries.related_playbook_id.
        mark_stmt = sa_text(
            "UPDATE playbooks pb "
            "SET review_required = true, updated_at = NOW() "
            "FROM knowledge_entries ke "
            "WHERE ke.symptoms_hash = :hash "
            " AND ke.related_playbook_id = pb.playbook_id "
            " AND pb.review_required = false "
            "RETURNING pb.playbook_id"
        )

        async with get_db_context() as db:
            km_count = (await db.execute(count_stmt, params)).scalar() or 0
            if km_count < settings.KM_PLAYBOOK_REVIEW_THRESHOLD:
                return

            marked = await db.execute(mark_stmt, params)
            flagged_ids = [row[0] for row in marked.fetchall()]
            await db.commit()

            if flagged_ids:
                logger.info(
                    "playbook_review_required_marked",
                    symptoms_hash=symptoms_hash,
                    km_count=km_count,
                    threshold=settings.KM_PLAYBOOK_REVIEW_THRESHOLD,
                    playbook_ids=flagged_ids,
                )
    except Exception as error:
        logger.warning(
            "playbook_review_mark_failed",
            symptoms_hash=symptoms_hash,
            error=str(error),
        )
async def _demote_alert_rule_catalog_confidence(self, playbook: Any) -> None:
    """Halve the confidence of alert rules tied to a deprecated playbook.

    Logic 3 of W2 PR-L1. For every name in the playbook's
    ``symptom_pattern.alert_names`` (trailing ``*`` stripped), rules whose
    ``rule_name`` equals or starts with that name get ``confidence`` halved
    (floor 0.01; NULL becomes 0.5) and ``review_status`` set to ``'draft'``.
    Rules already ``deprecated`` or ``retired`` are left alone.

    ``'draft'`` is used because, per the original note, the CHECK constraint
    on ``review_status`` only allows draft / approved / deprecated / retired.

    Does nothing when ``settings.ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP`` is off.
    Errors are logged as a warning and swallowed so the demote flow that
    calls this is not affected.

    Args:
        playbook: Object exposing ``playbook_id`` and ``symptom_pattern``
            (a model with ``alert_names`` or a dict with that key).
    """
    from src.core.config import settings

    if not settings.ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP:
        return

    try:
        from sqlalchemy import text as sa_text
        from src.db.base import get_db_context

        symptom = getattr(playbook, "symptom_pattern", None)
        if symptom is None:
            return

        # symptom_pattern may be a model object or a plain dict.
        if hasattr(symptom, "alert_names"):
            alert_names: list[str] = symptom.alert_names or []
        elif isinstance(symptom, dict):
            alert_names = symptom.get("alert_names") or []
        else:
            return

        if not alert_names:
            logger.debug(
                "playbook_demote_no_alert_names",
                playbook_id=playbook.playbook_id,
            )
            return

        async with get_db_context() as db:
            updated_count = 0
            for alert_name in alert_names:
                prefix = alert_name.rstrip("*")
                if not prefix:
                    # "" or "*" would turn into LIKE '%' and demote every
                    # rule in the catalog; skip it instead.
                    logger.debug(
                        "alert_rule_catalog_demote_skip_wildcard",
                        playbook_id=playbook.playbook_id,
                        alert_name=alert_name,
                    )
                    continue

                # Escape LIKE metacharacters so "_" and "%" inside a name
                # match literally. Backslash is PostgreSQL's default LIKE
                # escape character; the rest of this SQL (GREATEST, NOW(),
                # RETURNING) already targets PostgreSQL.
                escaped = (
                    prefix.replace("\\", "\\\\")
                    .replace("%", "\\%")
                    .replace("_", "\\_")
                )

                result = await db.execute(
                    sa_text(
                        "UPDATE alert_rule_catalog "
                        "SET confidence = CASE "
                        " WHEN confidence IS NOT NULL "
                        " THEN GREATEST(0.01, confidence * 0.5) "
                        " ELSE 0.5 "
                        " END, "
                        " review_status = 'draft', "
                        " updated_at = NOW() "
                        "WHERE rule_name LIKE :pattern "
                        " AND (review_status IS NULL OR review_status NOT IN "
                        " ('deprecated', 'retired')) "
                        "RETURNING rule_id"
                    ),
                    # Trailing % keeps the exact-or-prefix match.
                    {"pattern": f"{escaped}%"},
                )
                updated_count += result.rowcount or 0

            await db.commit()

        if updated_count > 0:
            logger.info(
                "alert_rule_catalog_confidence_demoted",
                playbook_id=playbook.playbook_id,
                alert_names=alert_names,
                rules_updated=updated_count,
            )
    except Exception as e:
        logger.warning(
            "alert_rule_catalog_demote_failed",
            playbook_id=getattr(playbook, "playbook_id", "unknown"),
            error=str(e),
        )
# =========================================================================
# 🆕 Phase D-G P0 修正: 新增方法
# =========================================================================
async def record_repair_result(
self,
anomaly_key: str,
repair_action: str,
success: bool,
root_cause: str | None = None,
fix_description: str | None = None,
execution_time_seconds: float | None = None,
) -> bool:
"""
記錄修復結果到 Repository (Redis 持久化)
2026-03-29 P0 修正: 透過 Repository 存取 Redis
Args:
anomaly_key: 異常 key
repair_action: 修復動作
success: 是否成功
root_cause: 根因 (如果找到)
fix_description: 修復說明
execution_time_seconds: 執行時間
Returns:
bool: 是否成功記錄
"""
return await self._repository.record_repair(
anomaly_key=anomaly_key,
repair_action=repair_action,
success=success,
root_cause=root_cause,
fix_description=fix_description,
execution_time_seconds=execution_time_seconds,
)
async def record_declarative_outcome(
    self,
    incident_id: str,
    action: str,
    blast_radius_score: int,
    blast_radius_tier: str,
    success: bool,
    rollback_triggered: bool = False,
    execution_time_seconds: float | None = None,
) -> bool:
    """Record the outcome of a DeclarativeSpec execution (Phase 5, ADR-086).

    The outcome is stored through the repository under the key
    ``declarative:<incident_id>``. The blast-radius details are serialised
    as JSON into the fix description and the action is truncated to 200
    characters.

    Never raises: any failure, including a failing import, is logged as a
    warning and reported as ``False``.

    Args:
        incident_id: Related incident id.
        action: The command that was executed.
        blast_radius_score: Blast-radius score.
        blast_radius_tier: Execution tier label.
        success: Whether the execution succeeded.
        rollback_triggered: Whether a rollback was triggered.
        execution_time_seconds: How long the execution took.

    Returns:
        The repository's result, or ``False`` on failure.
    """
    try:
        # Imported inside the try so an import failure is handled like any
        # other failure instead of escaping this best-effort method.
        import json

        from src.utils.timezone import now_taipei

        fix_desc = json.dumps(
            {
                "blast_radius_score": blast_radius_score,
                "blast_radius_tier": blast_radius_tier,
                "rollback_triggered": rollback_triggered,
                "recorded_at": now_taipei().isoformat(),
            },
            ensure_ascii=False,
        )
        return await self._repository.record_repair(
            anomaly_key=f"declarative:{incident_id}",
            repair_action=action[:200],
            success=success,
            root_cause=f"blast_radius_tier={blast_radius_tier}",
            fix_description=fix_desc,
            execution_time_seconds=execution_time_seconds,
        )
    except Exception as e:
        # Use the module-level logger like every other method in this class.
        logger.warning(
            "record_declarative_outcome_failed",
            incident_id=incident_id,
            error=str(e),
        )
        return False
async def record_diagnosis_outcome(
    self,
    incident_id: str,
    matched_playbook_id: str | None,
    was_correct: bool,
    actual_fix: str | None = None,
) -> None:
    """Record whether an AI diagnosis turned out to be correct.

    ADR-083 Phase 3. The outcome is stored through the repository under the
    key ``diag:<incident_id>``; a repository failure is logged and ignored.
    When the diagnosis was wrong and a playbook had been matched, a failed
    execution is recorded against that playbook.

    Args:
        incident_id: Related incident id.
        matched_playbook_id: Playbook used for the diagnosis, if any.
        was_correct: ``False`` means misdiagnosis.
        actual_fix: The fix that actually worked, if known.
    """
    # 1. Store the outcome (the "diag:" prefix keeps it apart from the
    #    "verify:" and other record kinds).
    repair_action = actual_fix or "unknown"
    try:
        await self._repository.record_repair(
            anomaly_key=f"diag:{incident_id}",
            repair_action=repair_action,
            success=was_correct,
            fix_description=f"diagnosis_correct={was_correct}",
        )
    except Exception as error:
        logger.warning(
            "record_diagnosis_to_repo_failed",
            incident_id=incident_id,
            error=str(error),
        )

    # 2. Misdiagnosis with a matched playbook: feed back a negative signal.
    misdiagnosed = not was_correct
    if misdiagnosed and matched_playbook_id:
        await self._update_playbook_stats(
            playbook_id=matched_playbook_id,
            success=False,
        )

    logger.info(
        "diagnosis_outcome_recorded",
        incident_id=incident_id,
        was_correct=was_correct,
        matched_playbook_id=matched_playbook_id,
    )
async def record_verification_result(
    self,
    incident_id: str,
    action_taken: str,
    verification_result: str,
    matched_playbook_id: str | None = None,
) -> None:
    """Record a post-execution environment verification result.

    ADR-083 Phase 3. Only the exact string ``"success"`` counts as success.
    The result is stored through the repository under the key
    ``verify:<incident_id>``; a repository failure is logged and ignored.
    When a playbook was matched, its statistics are updated with the same
    success flag.

    Args:
        incident_id: Related incident id.
        action_taken: Description of the action that was executed.
        verification_result: Verification outcome label.
        matched_playbook_id: Matched playbook id, if any.
    """
    verified_ok = verification_result == "success"

    # 1. Store the verification result (the "verify:" prefix keeps it apart
    #    from other record kinds).
    try:
        await self._repository.record_repair(
            anomaly_key=f"verify:{incident_id}",
            repair_action=action_taken,
            success=verified_ok,
            fix_description=verification_result,
        )
    except Exception as error:
        logger.warning(
            "record_verification_to_repo_failed",
            incident_id=incident_id,
            error=str(error),
        )

    # 2. Feed the verification signal into the playbook statistics.
    if matched_playbook_id:
        await self._update_playbook_stats(
            playbook_id=matched_playbook_id,
            success=verified_ok,
        )

    logger.info(
        "verification_result_recorded",
        incident_id=incident_id,
        verification_result=verification_result,
        matched_playbook_id=matched_playbook_id,
    )
async def record_agent_session(
    self,
    session_id: str,
    incident_id: str,
    final_confidence: float,
    requires_human_approval: bool,
    all_agents_degraded: bool,
    critic_challenge_count: int,
    matched_playbook_id: str | None = None,
) -> None:
    """Record learning signals from a multi-agent debate session.

    ADR-083 Phase 3. The session is stored through the repository under the
    key ``debate:<incident_id>``, counted as a success when no human
    approval was required; a repository failure is logged and ignored.
    When the critic raised at least one challenge and a playbook was
    matched, a failed execution is recorded against that playbook.

    Args:
        session_id: Debate session id; only its first 8 characters are stored.
        incident_id: Related incident id.
        final_confidence: Final confidence score.
        requires_human_approval: Whether the decision was escalated to a human.
        all_agents_degraded: Whether every agent ran degraded.
        critic_challenge_count: Number of challenges raised by the critic.
        matched_playbook_id: Playbook matched for this decision, if any.
    """
    # 1. Store the debate quality record.
    summary = f"confidence={final_confidence:.2f},degraded={all_agents_degraded},critic_challenges={critic_challenge_count}"
    try:
        await self._repository.record_repair(
            anomaly_key=f"debate:{incident_id}",
            repair_action=f"session:{session_id[:8]}",
            success=not requires_human_approval,
            fix_description=summary,
        )
    except Exception as error:
        logger.warning(
            "record_agent_session_to_repo_failed",
            session_id=session_id,
            error=str(error),
        )

    # 2. Critic challenges against a matched playbook count as a negative
    #    signal for that playbook.
    challenged = critic_challenge_count > 0
    if challenged and matched_playbook_id:
        await self._update_playbook_stats(
            playbook_id=matched_playbook_id,
            success=False,
        )

    logger.info(
        "agent_session_recorded",
        session_id=session_id,
        incident_id=incident_id,
        final_confidence=final_confidence,
        requires_human_approval=requires_human_approval,
        all_agents_degraded=all_agents_degraded,
        critic_challenge_count=critic_challenge_count,
    )
async def get_recommended_fix(self, anomaly_key: str) -> dict:
    """Recommend a repair action from recorded repair statistics.

    An action qualifies when it has at least ``MIN_SAMPLES`` samples and a
    success rate of at least ``SUCCESS_RATE_THRESHOLD``. Qualifying actions
    are scored as ``success_rate * log(total + 1)`` and ranked by score
    (descending), then by tier (ascending). The average execution time is
    taken over the truthy ``execution_time`` values of the last 20 history
    entries.

    ``confidence`` is always 0.0 because the recommendation comes from
    history rather than from AI analysis; the raw ``success_rate`` is
    returned alongside it.

    Returns:
        A dict with ``action``, ``confidence``, ``tier``, ``based_on``,
        ``avg_execution_time``, ``success_rate`` and up to two
        ``alternatives``; or the default recommendation when there are no
        statistics or no action qualifies.
    """
    import math

    stats_by_action = await self._repository.get_all_repair_stats(anomaly_key)
    if not stats_by_action:
        return self._default_recommendation()

    candidates = []
    for name, stat in stats_by_action.items():
        if stat["total"] < self.MIN_SAMPLES:
            continue
        rate = stat["success_rate"]
        if rate < self.SUCCESS_RATE_THRESHOLD:
            continue

        weighted = rate * math.log(stat["total"] + 1)
        runs = await self._repository.get_repair_history(
            anomaly_key, name, limit=20
        )
        durations = [run["execution_time"] for run in runs if run.get("execution_time")]
        mean_duration = sum(durations) / len(durations) if durations else 0.0

        candidates.append({
            "action": name,
            "score": weighted,
            "success_rate": rate,
            "total_samples": stat["total"],
            "tier": self._get_action_tier(name),
            "avg_execution_time": mean_duration,
        })

    if not candidates:
        return self._default_recommendation()

    ranked = sorted(candidates, key=lambda c: (-c["score"], c["tier"]))
    top, runners_up = ranked[0], ranked[1:3]

    return {
        "action": top["action"],
        "confidence": 0.0,
        "tier": top["tier"],
        "based_on": f"{top['total_samples']} 次歷史數據",
        "avg_execution_time": top["avg_execution_time"],
        "success_rate": top["success_rate"],
        "alternatives": [
            {
                "action": alt["action"],
                "confidence": 0.0,
                "success_rate": alt["success_rate"],
                "tier": alt["tier"],
            }
            for alt in runners_up
        ],
    }
async def get_learning_summary(self, anomaly_key: str) -> dict:
    """Summarise what has been learned for one anomaly key.

    Raw per-action statistics come from the repository; the aggregation is
    done here in the service layer (Phase 22 P2).

    * ``best_action`` is the action with the highest success rate among
      those with at least 3 samples, or ``None`` when none has a rate
      above 0.
    * ``learning_status`` is ``"insufficient"`` below 3 total attempts,
      ``"learning"`` below 10, and otherwise ``"excellent"`` when the
      overall success rate is at least 0.8, else ``"sufficient"``.

    Returns:
        A dict with ``anomaly_key``, ``total_repair_attempts``,
        ``overall_success_rate``, ``actions_tried``, ``best_action`` and
        ``learning_status``.
    """
    stats_by_action = await self._repository.get_all_repair_stats(anomaly_key)

    summary = {
        "anomaly_key": anomaly_key,
        "total_repair_attempts": 0,
        "overall_success_rate": 0.0,
        "actions_tried": [],
        "best_action": None,
        "learning_status": "insufficient",
    }
    if not stats_by_action:
        return summary

    attempts = sum(stat["total"] for stat in stats_by_action.values())
    successes = sum(stat["success"] for stat in stats_by_action.values())
    rate = successes / attempts if attempts > 0 else 0.0

    # Best action: strictly highest success rate among actions with >= 3
    # samples; on a tie the first one encountered wins.
    leader = None
    leader_rate = 0.0
    for name, stat in stats_by_action.items():
        if stat["total"] < 3:
            continue
        if stat["success_rate"] > leader_rate:
            leader_rate = stat["success_rate"]
            leader = {"action": name, "success_rate": leader_rate}

    if attempts >= 10:
        status = "excellent" if rate >= 0.8 else "sufficient"
    elif attempts >= 3:
        status = "learning"
    else:
        status = "insufficient"

    summary["total_repair_attempts"] = attempts
    summary["overall_success_rate"] = rate
    summary["actions_tried"] = list(stats_by_action.keys())
    summary["best_action"] = leader
    summary["learning_status"] = status
    return summary
async def _persist_trust(self, action_pattern: str) -> None:
"""
将内存中的 TrustRecord 持久化到 PostgreSQL。
ADR-088: 每次 approve/reject 後同步寫入 DB
確保 Pod 重啟後信任分數不歸零。
2026-04-17 ogt + Claude Sonnet 4.6(亞太): Phase 4 信任持久化
"""
record = self._trust_manager.get_trust_record(action_pattern)
if not record:
return
try:
await self._trust_repo.upsert(
action_pattern=action_pattern,
score=record.score,
total_approvals=record.total_approvals,
total_rejections=record.total_rejections,
last_approval_by=record.last_approval_by,
last_approval_at=record.last_approval_at,
last_rejection_by=record.last_rejection_by,
last_rejection_at=record.last_rejection_at,
)
except Exception as e:
logger.warning(
"trust_persist_failed",
action_pattern=action_pattern,
error=str(e),
)
def _get_action_tier(self, action: str) -> int:
"""取得動作的 Tier"""
tier_actions = {
1: ["restart_pod", "restart_container", "delete_pod"],
2: ["scale_up", "increase_memory", "increase_cpu", "adjust_limits"],
3: ["apply_hotfix", "update_config", "patch_deployment", "rollback"],
4: ["create_issue", "notify_team", "schedule_fix", "manual_intervention"],
}
for tier, actions in tier_actions.items():
if action in actions:
return tier
return 1 # 預設 Tier 1
def _default_recommendation(self) -> dict:
"""預設推薦 (無歷史數據時)"""
return {
"action": "restart_pod",
"confidence": 0.0, # 🔴 預設推薦不是 AI 分析,信心度設 0
"tier": 1,
"based_on": "無歷史數據,使用預設",
"avg_execution_time": 30.0,
"alternatives": [
{"action": "delete_pod", "confidence": 0.0, "tier": 1},
],
}
# =============================================================================
# Singleton
# =============================================================================
_learning_service: LearningService | None = None


def get_learning_service() -> LearningService:
    """Return the module-wide ``LearningService``, creating it on first use."""
    global _learning_service
    if _learning_service is not None:
        return _learning_service
    _learning_service = LearningService()
    return _learning_service