Some checks failed
CD Pipeline / tests (push) Successful in 1m28s
Code Review / ai-code-review (push) Successful in 29s
Type Sync Check / check-type-sync (push) Failing after 2m41s
CD Pipeline / build-and-deploy (push) Successful in 8m40s
CD Pipeline / post-deploy-checks (push) Successful in 3m10s
1342 lines
50 KiB
Python
1342 lines
50 KiB
Python
"""
|
||
Learning Service - Phase 5 持續學習迴圈
|
||
======================================
|
||
ADR-030: 智能自動修復系統
|
||
Phase D-G P0 修正: 符合 leWOOOgo 積木化原則
|
||
|
||
從執行結果中學習,持續優化決策:
|
||
1. 更新 Playbook 統計 (成功率/執行次數)
|
||
2. 調整信任度 (成功 +分 / 失敗 -分)
|
||
3. 萃取新 Playbook (成功案例自動萃取)
|
||
4. 處理人工反饋 (有效性評分)
|
||
5. 🆕 Redis 持久化學習數據 (透過 Repository)
|
||
6. 🆕 修復推薦 (基於歷史成功率)
|
||
|
||
設計原則:
|
||
- 非同步執行,不阻塞主流程
|
||
- 失敗容忍,學習失敗不影響執行結果
|
||
- 完整審計追蹤
|
||
- 🆕 Service 不直接存取 Redis (透過 ILearningRepository)
|
||
|
||
版本: v1.1
|
||
建立: 2026-03-26 (台北時區)
|
||
更新: 2026-03-29 (台北時區) - P0 修正: 新增 Repository 層
|
||
"""
|
||
|
||
from dataclasses import dataclass, field
|
||
from datetime import UTC, datetime
|
||
from enum import Enum
|
||
from typing import Any
|
||
|
||
import structlog
|
||
|
||
from src.models.approval import ApprovalRequest
|
||
from src.models.incident import IncidentStatus
|
||
from src.repositories.interfaces import ILearningRepository, ITrustRepository
|
||
from src.repositories.learning_repository import get_learning_repository
|
||
from src.repositories.trust_repository import get_trust_repository
|
||
from src.services.trust_engine import get_trust_manager
|
||
|
||
# Module-level structlog logger, named after this module.
logger = structlog.get_logger(__name__)
|
||
|
||
|
||
# =============================================================================
|
||
# Constants
|
||
# =============================================================================
|
||
|
||
|
||
class FeedbackType(str, Enum):
    """Kinds of feedback signal handled by the learning service."""

    # Outcomes reported by automated execution.
    EXECUTION_SUCCESS = "execution_success"
    EXECUTION_FAILURE = "execution_failure"

    # Decisions taken by a human reviewer.
    HUMAN_APPROVE = "human_approve"
    HUMAN_REJECT = "human_reject"
    HUMAN_OVERRIDE = "human_override"  # a human overrode the AI decision

    # Effectiveness rating supplied as feedback.
    EFFECTIVENESS_RATING = "effectiveness_rating"
|
||
|
||
|
||
# Trust-score adjustment parameters (points applied per feedback event).
TRUST_SUCCESS_BOOST: int = 1  # success: +1 point
TRUST_FAILURE_PENALTY: int = 2  # failure: -2 points (or reset to zero)
TRUST_HUMAN_REJECT_PENALTY: int = 1  # human rejection: -1 point
|
||
|
||
|
||
# =============================================================================
|
||
# Data Models
|
||
# =============================================================================
|
||
|
||
|
||
@dataclass
|
||
class ExecutionResult:
|
||
"""執行結果"""
|
||
|
||
approval_id: str
|
||
incident_id: str
|
||
action: str
|
||
success: bool
|
||
error_message: str | None = None
|
||
duration_seconds: float = 0.0
|
||
executed_at: datetime = field(default_factory=lambda: datetime.now(UTC))
|
||
|
||
def to_dict(self) -> dict[str, Any]:
|
||
return {
|
||
"approval_id": self.approval_id,
|
||
"incident_id": self.incident_id,
|
||
"action": self.action,
|
||
"success": self.success,
|
||
"error_message": self.error_message,
|
||
"duration_seconds": self.duration_seconds,
|
||
"executed_at": self.executed_at.isoformat(),
|
||
}
|
||
|
||
|
||
@dataclass
class FeedbackRequest:
    """A piece of human feedback submitted for an incident."""

    incident_id: str
    feedback_type: FeedbackType
    # Rating on a 1-5 scale (per the original field note); None when not rated.
    effectiveness_score: int | None = None
    # Free-form learning notes.
    learning_notes: str | None = None
    submitted_by: str | None = None
    # Defaults to the timezone-aware UTC instant at which the object is built.
    submitted_at: datetime = field(default_factory=lambda: datetime.now(UTC))
|
||
|
||
|
||
@dataclass
class LearningRecord:
    """Summary of what a single learning pass changed."""

    incident_id: str
    feedback_type: FeedbackType
    action_pattern: str
    trust_before: int
    trust_after: int
    playbook_updated: bool = False
    new_playbook_id: str | None = None
    # Defaults to the timezone-aware UTC instant at which the object is built.
    learned_at: datetime = field(default_factory=lambda: datetime.now(UTC))

    def to_dict(self) -> dict[str, Any]:
        """Return a plain dict; the enum is reduced to its value, the timestamp to ISO-8601."""
        serialized: dict[str, Any] = {"incident_id": self.incident_id}
        serialized["feedback_type"] = self.feedback_type.value
        for name in (
            "action_pattern",
            "trust_before",
            "trust_after",
            "playbook_updated",
            "new_playbook_id",
        ):
            serialized[name] = getattr(self, name)
        serialized["learned_at"] = self.learned_at.isoformat()
        return serialized
|
||
|
||
|
||
# =============================================================================
|
||
# Learning Service
|
||
# =============================================================================
|
||
|
||
|
||
class LearningService:
|
||
"""
|
||
持續學習服務
|
||
|
||
職責:
|
||
1. 處理執行結果 → 更新 Playbook + 信任度
|
||
2. 處理人工反饋 → 調整 Playbook 有效性
|
||
3. 萃取新 Playbook (成功案例)
|
||
4. 🆕 Redis 持久化學習數據 (透過 Repository)
|
||
5. 🆕 修復推薦 (基於歷史成功率)
|
||
|
||
2026-03-29 P0 修正: 符合 leWOOOgo 積木化原則
|
||
- 透過 ILearningRepository 存取 Redis
|
||
- 不直接依賴 Redis Client
|
||
"""
|
||
|
||
# 推薦門檻
|
||
MIN_SAMPLES = 5 # 最少需要 N 次數據才能推薦
|
||
SUCCESS_RATE_THRESHOLD = 0.6 # 成功率門檻
|
||
|
||
def __init__(
    self,
    repository: ILearningRepository | None = None,
    trust_repository: ITrustRepository | None = None,
):
    """
    Wire up the collaborators used by the service.

    Args:
        repository: Learning repository; when falsy (e.g. None) the one
            returned by ``get_learning_repository()`` is used.
        trust_repository: Trust repository; when falsy (e.g. None) the one
            returned by ``get_trust_repository()`` is used.
    """
    self._trust_manager = get_trust_manager()
    self._repository = repository if repository else get_learning_repository()
    self._trust_repo = (
        trust_repository if trust_repository else get_trust_repository()
    )
|
||
|
||
async def process_execution_result(
    self,
    approval: ApprovalRequest,
    result: ExecutionResult,
) -> LearningRecord:
    """
    Learn from the outcome of one executed approval.

    In order: adjust the trust record for the action pattern, persist it via
    ``_persist_trust``, update the statistics of the matched playbook (if
    any) and, for a successful run without a matched playbook, try to
    extract a new playbook from the incident.

    Args:
        approval: The approval request that was executed.
        result: The execution outcome.

    Returns:
        LearningRecord: Trust score before/after, whether playbook statistics
        were updated, and the id of a newly extracted playbook (if any).
    """
    pattern = self._extract_action_pattern(approval.action)

    # Trust score before this learning pass (0 when no record exists).
    record_before = self._trust_manager.get_trust_record(pattern)
    trust_before = record_before.score if record_before else 0

    # 1. Adjust trust: a failed run is recorded as a rejection, a successful
    #    one as an approval, both attributed to the "auto_learning" system user.
    if not result.success:
        self._trust_manager.record_rejection(
            action_pattern=pattern,
            user_role="system",
            user_id="auto_learning",
            reason=result.error_message,
        )
        feedback_type = FeedbackType.EXECUTION_FAILURE
    else:
        self._trust_manager.record_approval(
            action_pattern=pattern,
            user_role="system",
            user_id="auto_learning",
        )
        feedback_type = FeedbackType.EXECUTION_SUCCESS

    # ADR-088 (original note): persist the trust score so that it survives
    # pod restarts.
    await self._persist_trust(pattern)

    # Trust score after the adjustment.
    record_after = self._trust_manager.get_trust_record(pattern)
    trust_after = record_after.score if record_after else 0

    # 2. Resolve the matched playbook id (ADR-083 Phase 3, two lookup paths):
    #    A: the ``matched_playbook_id`` attribute on the approval itself;
    #    B: the approval metadata ("matched_playbook_id", then "playbook_id").
    playbook_id: str | None = getattr(approval, "matched_playbook_id", None)
    if not playbook_id:
        meta = approval.metadata or {}
        playbook_id = meta.get("matched_playbook_id") or meta.get("playbook_id")

    playbook_updated = False
    if playbook_id:
        # NOTE(review): _update_playbook_stats in this file logs and swallows
        # its own exceptions, so this handler looks unreachable and
        # playbook_updated may be True even if the update failed - confirm.
        try:
            await self._update_playbook_stats(
                playbook_id=playbook_id,
                success=result.success,
            )
        except Exception as exc:
            logger.warning(
                "playbook_stats_update_failed",
                playbook_id=playbook_id,
                error=str(exc),
            )
        else:
            playbook_updated = True

    # 3. Successful run with no matched playbook: try to extract a new one.
    new_playbook_id = None
    if result.success and not playbook_id:
        try:
            new_playbook_id = await self._try_extract_playbook(
                incident_id=result.incident_id,
                action=approval.action,
            )
        except Exception as exc:
            logger.warning(
                "playbook_extraction_failed",
                incident_id=result.incident_id,
                error=str(exc),
            )

    learning_record = LearningRecord(
        incident_id=result.incident_id,
        feedback_type=feedback_type,
        action_pattern=pattern,
        trust_before=trust_before,
        trust_after=trust_after,
        playbook_updated=playbook_updated,
        new_playbook_id=new_playbook_id,
    )

    logger.info(
        "learning_completed",
        incident_id=result.incident_id,
        success=result.success,
        trust_change=f"{trust_before} → {trust_after}",
        playbook_updated=playbook_updated,
        new_playbook=new_playbook_id,
    )

    return learning_record
|
||
|
||
async def process_human_feedback(
    self,
    feedback: FeedbackRequest,
) -> LearningRecord:
    """
    Apply a piece of human feedback to the trust records and playbooks.

    Args:
        feedback: The submitted feedback.

    Returns:
        LearningRecord: Trust score before/after and whether a playbook was
        promoted or demoted as a result.
    """
    # Key derived from the incident id. The original note says the real
    # action pattern still has to be looked up from the incident.
    pattern = f"incident:{feedback.incident_id}"

    record_before = self._trust_manager.get_trust_record(pattern)
    trust_before = record_before.score if record_before else 0

    playbook_updated = False
    kind = feedback.feedback_type
    score = feedback.effectiveness_score

    if kind == FeedbackType.HUMAN_APPROVE:
        # Human approval is recorded as an approval on the trust record.
        self._trust_manager.record_approval(
            action_pattern=pattern,
            user_role="human",
            user_id=feedback.submitted_by,
        )
    elif kind == FeedbackType.HUMAN_REJECT:
        # Human rejection is recorded as a rejection on the trust record.
        self._trust_manager.record_rejection(
            action_pattern=pattern,
            user_role="human",
            user_id=feedback.submitted_by,
            reason="Human rejected",
        )
    elif kind == FeedbackType.EFFECTIVENESS_RATING and score is not None:
        if score >= 4:
            # High rating: approve and promote the related playbooks.
            self._trust_manager.record_approval(
                action_pattern=pattern,
                user_role="feedback",
                user_id=feedback.submitted_by,
            )
            playbook_updated = await self._promote_playbook(feedback.incident_id)
        elif score <= 2:
            # Low rating: reject and demote the related playbooks.
            self._trust_manager.record_rejection(
                action_pattern=pattern,
                user_role="feedback",
                user_id=feedback.submitted_by,
                reason=f"Low effectiveness score: {score}",
            )
            playbook_updated = await self._demote_playbook(feedback.incident_id)
        # A score of exactly 3 changes nothing.

    # ADR-088 (original note): persist the trust score so that it survives
    # pod restarts. This runs for every feedback type.
    await self._persist_trust(pattern)

    record_after = self._trust_manager.get_trust_record(pattern)
    trust_after = record_after.score if record_after else 0

    learning_record = LearningRecord(
        incident_id=feedback.incident_id,
        feedback_type=feedback.feedback_type,
        action_pattern=pattern,
        trust_before=trust_before,
        trust_after=trust_after,
        playbook_updated=playbook_updated,
    )

    logger.info(
        "human_feedback_processed",
        incident_id=feedback.incident_id,
        feedback_type=feedback.feedback_type.value,
        effectiveness_score=feedback.effectiveness_score,
        trust_change=f"{trust_before} → {trust_after}",
    )

    return learning_record
|
||
|
||
# =========================================================================
|
||
# Private Methods
|
||
# =========================================================================
|
||
|
||
def _extract_action_pattern(self, action: str) -> str:
|
||
"""從 action 字串提取 pattern"""
|
||
if not action:
|
||
return "unknown"
|
||
|
||
parts = action.split()
|
||
if len(parts) < 3:
|
||
return "unknown"
|
||
|
||
verb = parts[1] if len(parts) > 1 else "unknown"
|
||
resource_part = parts[2] if len(parts) > 2 else ""
|
||
|
||
if "/" in resource_part:
|
||
resource_name = resource_part.split("/")[-1]
|
||
else:
|
||
resource_name = resource_part
|
||
|
||
# 移除 pod hash suffix
|
||
resource_parts = resource_name.split("-")
|
||
if len(resource_parts) >= 3:
|
||
resource_name = "-".join(resource_parts[:-2]) + "-*"
|
||
|
||
return f"{verb}:{resource_name}"
|
||
|
||
async def _update_playbook_stats(
    self,
    playbook_id: str,
    success: bool,
) -> None:
    """
    Record one execution outcome against a playbook.

    W2 PR-L1: after the statistics update, and only when
    ``settings.ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP`` is on, the playbook's
    symptom-pattern hash is computed and handed to
    ``_check_and_mark_playbook_review``.

    Every failure is logged as a warning and swallowed; nothing is raised
    to the caller.

    Args:
        playbook_id: Playbook whose statistics are updated.
        success: Outcome to record.
    """
    try:
        from src.services.playbook_service import get_playbook_service

        playbook_service = get_playbook_service()
        await playbook_service.record_execution(playbook_id, success)

        from src.core.config import settings

        if not settings.ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP:
            return

        # The review check has its own handler so that its failure is
        # reported under a separate event name.
        try:
            from src.repositories.playbook_repository import get_playbook_repository
            from src.models.playbook import SymptomPattern

            playbook_repo = get_playbook_repository()
            playbook = await playbook_repo.get_by_id(playbook_id)
            if playbook and playbook.symptom_pattern:
                pattern = playbook.symptom_pattern
                # A dict is validated into a SymptomPattern model first
                # (original note: the ORM may load it as a dict).
                if isinstance(pattern, dict):
                    pattern = SymptomPattern.model_validate(pattern)
                await self._check_and_mark_playbook_review(pattern.compute_hash())
        except Exception as review_exc:
            logger.warning(
                "playbook_review_check_failed",
                playbook_id=playbook_id,
                error=str(review_exc),
            )

    except Exception as exc:
        logger.warning(
            "playbook_stats_update_error",
            playbook_id=playbook_id,
            error=str(exc),
        )
|
||
|
||
async def _try_extract_playbook(
    self,
    incident_id: str,
    action: str,
) -> str | None:
    """
    Try to derive a playbook from a successfully handled incident.

    Args:
        incident_id: Incident to learn from.
        action: Action that was executed; passed to the LLM generator.

    Returns:
        The id of the generated or extracted playbook, or None when the
        incident is missing, is neither RESOLVED nor CLOSED, no playbook
        was produced, or any error occurred (errors are logged, not raised).
    """
    try:
        from src.repositories.incident_repository import get_incident_repository
        from src.services.playbook_service import get_playbook_service
        from src.core.config import settings

        incident_repo = get_incident_repository()
        incident = await incident_repo.get_by_id(incident_id)
        if not incident:
            return None

        # Only incidents that are RESOLVED or CLOSED are eligible.
        eligible_statuses = (IncidentStatus.RESOLVED, IncidentStatus.CLOSED)
        if incident.status not in eligible_statuses:
            return None

        # ADR-104: prefer the LLM playbook generator when the feature flag
        # is on. If it yields no playbook, execution falls through to the
        # legacy extractor below.
        if settings.ENABLE_LLM_PLAYBOOK_GENERATION:
            from src.services.playbook_generator import get_playbook_generator

            generator = get_playbook_generator()
            generated = await generator.generate_from_incident(
                incident=incident,
                action=action,
                persist=True,
            )
            if generated.playbook:
                logger.info(
                    "playbook_llm_generated",
                    incident_id=incident_id,
                    playbook_id=generated.playbook.playbook_id,
                    outcome=generated.outcome,
                    provider=generated.provider,
                )
                return generated.playbook.playbook_id

        # Legacy extraction path; the result still requires human review.
        extracted = await get_playbook_service().extract_from_incident(
            incident=incident,
            auto_approve=False,
        )
        if not extracted:
            return None

        logger.info(
            "playbook_auto_extracted",
            incident_id=incident_id,
            playbook_id=extracted.playbook_id,
        )
        return extracted.playbook_id

    except Exception as exc:
        logger.warning(
            "playbook_extraction_error",
            incident_id=incident_id,
            error=str(exc),
        )
        return None
|
||
|
||
async def _promote_playbook(self, incident_id: str) -> bool:
    """
    Raise the confidence of playbooks sourced from an incident (high rating).

    For every playbook returned by ``find_by_source_incident`` the
    repository's ``adjust_confidence`` is called with a delta of +0.1; each
    successful adjustment is followed by a KM evolution entry (W2 PR-L1).

    The capping at 1.0 and the DRAFT -> APPROVED upgrade at >= 0.9 described
    in the original notes are not implemented in this method; presumably
    they live in ``adjust_confidence`` - verify.

    Returns:
        True when at least one playbook was adjusted; False when none
        matched, none was adjusted, or an error occurred (logged).
    """
    try:
        from src.repositories.playbook_repository import get_playbook_repository

        playbook_repo = get_playbook_repository()
        candidates = await playbook_repo.find_by_source_incident(incident_id)

        if not candidates:
            logger.debug(
                "playbook_promote_no_match",
                incident_id=incident_id,
            )
            return False

        boost = 0.1  # confidence delta applied per playbook
        adjusted = 0

        for candidate in candidates:
            trust_before = candidate.trust_score
            outcome = await playbook_repo.adjust_confidence(
                playbook_id=candidate.playbook_id,
                delta=boost,
                reason=f"High effectiveness rating from incident {incident_id}",
            )
            if not outcome:
                continue

            adjusted += 1
            # NOTE(review): the object passed on is the one fetched before
            # adjust_confidence ran; unless the repository mutates it in
            # place, its trust_score may be stale - confirm.
            await self._write_playbook_evolution_km(
                playbook=candidate,
                previous_trust=trust_before,
                evolution_type="promote",
                incident_id=incident_id,
            )

        logger.info(
            "playbook_promoted",
            incident_id=incident_id,
            updated_count=adjusted,
            total_playbooks=len(candidates),
        )

        return adjusted > 0

    except Exception as exc:
        logger.warning(
            "playbook_promote_failed",
            incident_id=incident_id,
            error=str(exc),
        )
        return False
|
||
|
||
async def _demote_playbook(self, incident_id: str) -> bool:
    """
    Lower the confidence of playbooks sourced from an incident (low rating).

    For every playbook returned by ``find_by_source_incident`` the
    repository's ``adjust_confidence`` is called with a delta of -0.15. Each
    successful adjustment is followed by a KM evolution entry and, when the
    playbook's status is DEPRECATED, by a confidence demotion in
    alert_rule_catalog (W2 PR-L1).

    The flooring at 0.0 and the automatic downgrade to DEPRECATED described
    in the original notes are not implemented in this method; presumably
    they live in ``adjust_confidence`` - verify.

    Returns:
        True when at least one playbook was adjusted; False when none
        matched, none was adjusted, or an error occurred (logged).
    """
    try:
        from src.repositories.playbook_repository import get_playbook_repository

        playbook_repo = get_playbook_repository()
        candidates = await playbook_repo.find_by_source_incident(incident_id)

        if not candidates:
            logger.debug(
                "playbook_demote_no_match",
                incident_id=incident_id,
            )
            return False

        # The penalty is heavier than the promotion boost so that
        # low-quality playbooks do not accumulate (original note).
        penalty = -0.15
        adjusted = 0

        for candidate in candidates:
            trust_before = candidate.trust_score
            outcome = await playbook_repo.adjust_confidence(
                playbook_id=candidate.playbook_id,
                delta=penalty,
                reason=f"Low effectiveness rating from incident {incident_id}",
            )
            if not outcome:
                continue

            adjusted += 1
            await self._write_playbook_evolution_km(
                playbook=candidate,
                previous_trust=trust_before,
                evolution_type="demote",
                incident_id=incident_id,
            )

            # NOTE(review): status is read from the object fetched before
            # adjust_confidence ran; a deprecation triggered by this very
            # call may not be visible here - confirm.
            from src.models.playbook import PlaybookStatus

            if candidate.status == PlaybookStatus.DEPRECATED:
                await self._demote_alert_rule_catalog_confidence(candidate)

        logger.info(
            "playbook_demoted",
            incident_id=incident_id,
            updated_count=adjusted,
            total_playbooks=len(candidates),
        )

        return adjusted > 0

    except Exception as exc:
        logger.warning(
            "playbook_demote_failed",
            incident_id=incident_id,
            error=str(exc),
        )
        return False
|
||
|
||
# =========================================================================
|
||
# W2 PR-L1: KM → Playbook 互饋回路私有方法
|
||
# 飛輪斷鏈 C3 + C4 修復
|
||
# 2026-04-28 ogt + Claude Sonnet 4.6
|
||
# =========================================================================
|
||
|
||
async def _write_playbook_evolution_km(
    self,
    playbook: Any,
    previous_trust: float,
    evolution_type: str,
    incident_id: str,
) -> None:
    """
    Write a KM entry describing a playbook promote/demote event.

    Does nothing unless ``settings.ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP`` is on.
    The ``path_type`` is ``"playbook_evolution:<playbook_id>"`` so that
    evolutions of different playbooks triggered by the same incident stay
    distinguishable. The payload metadata carries: playbook_id,
    previous_trust, new_trust, success_count, failure_count, evolution_type.

    Failures are logged as warnings and never raised.

    Args:
        playbook: Playbook object; ``playbook_id`` and ``name`` are read
            directly, ``trust_score`` / ``success_count`` /
            ``failure_count`` via ``getattr`` with defaults.
        previous_trust: Trust score captured by the caller before the change.
        evolution_type: Label used in title, tags and metadata (the callers
            in this file pass "promote" or "demote").
        incident_id: Incident that triggered the change.
    """
    from src.core.config import settings

    if not settings.ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP:
        return

    try:
        from src.services.km_writer import KMWritePayload, km_write_with_flag
        from src.utils.timezone import now_taipei

        # Fall back to the previous value when the object has no trust_score.
        new_trust = getattr(playbook, "trust_score", previous_trust)
        success_count = getattr(playbook, "success_count", 0)
        failure_count = getattr(playbook, "failure_count", 0)

        path_type = f"playbook_evolution:{playbook.playbook_id}"

        payload = KMWritePayload(
            path_type=path_type,
            incident_id=incident_id,
            entry_create_kwargs={
                "title": f"Playbook {evolution_type}: {playbook.name} [{playbook.playbook_id}]",
                "content": (
                    f"Playbook {evolution_type} 事件記錄\n"
                    f"Playbook ID: {playbook.playbook_id}\n"
                    f"名稱: {playbook.name}\n"
                    f"trust_score 變化: {previous_trust:.3f} → {new_trust:.3f}\n"
                    f"成功次數: {success_count} / 失敗次數: {failure_count}\n"
                    f"觸發來源: incident {incident_id}\n"
                    f"記錄時間: {now_taipei().isoformat()}"
                ),
                "entry_type": "best_practice",
                "category": "AI系統",
                "tags": ["playbook_evolution", evolution_type, playbook.playbook_id],
                "source": "ai_extracted",
                "related_playbook_id": playbook.playbook_id,
                "related_incident_id": incident_id,
                "path_type": path_type,
            },
            metadata={
                "playbook_id": playbook.playbook_id,
                "previous_trust": previous_trust,
                "new_trust": new_trust,
                "success_count": success_count,
                "failure_count": failure_count,
                "evolution_type": evolution_type,
            },
        )
        await km_write_with_flag(payload)
        logger.info(
            "playbook_evolution_km_written",
            playbook_id=playbook.playbook_id,
            evolution_type=evolution_type,
            trust_change=f"{previous_trust:.3f} → {new_trust:.3f}",
        )
    except Exception as e:
        logger.warning(
            "playbook_evolution_km_write_failed",
            playbook_id=getattr(playbook, "playbook_id", "unknown"),
            evolution_type=evolution_type,
            error=str(e),
        )
|
||
|
||
async def _check_and_mark_playbook_review(self, symptoms_hash: str) -> None:
    """
    Flag playbooks for review once enough KM entries share a symptoms hash.

    Counts ``knowledge_entries`` rows with the given ``symptoms_hash``. When
    the count reaches ``settings.KM_PLAYBOOK_REVIEW_THRESHOLD``, every
    playbook linked to such an entry through ``related_playbook_id`` and
    not yet flagged gets ``review_required = true``.

    Does nothing when ``settings.ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP`` is off
    or the hash is empty. Failures are logged as warnings, never raised.

    Args:
        symptoms_hash: Hash of the symptom pattern to check.
    """
    from src.core.config import settings

    if not (settings.ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP and symptoms_hash):
        return

    try:
        from sqlalchemy import text as sa_text
        from src.db.base import get_db_context

        bind = {"hash": symptoms_hash}

        count_stmt = sa_text(
            "SELECT COUNT(*) FROM knowledge_entries "
            "WHERE symptoms_hash = :hash"
        )
        # The playbooks table has no hash column of its own, so the link
        # goes through knowledge_entries.related_playbook_id.
        mark_stmt = sa_text(
            "UPDATE playbooks pb "
            "SET review_required = true, updated_at = NOW() "
            "FROM knowledge_entries ke "
            "WHERE ke.symptoms_hash = :hash "
            " AND ke.related_playbook_id = pb.playbook_id "
            " AND pb.review_required = false "
            "RETURNING pb.playbook_id"
        )

        async with get_db_context() as db:
            counted = await db.execute(count_stmt, bind)
            km_count = counted.scalar() or 0

            threshold = settings.KM_PLAYBOOK_REVIEW_THRESHOLD
            if km_count < threshold:
                return

            marked = await db.execute(mark_stmt, bind)
            marked_ids = [row[0] for row in marked.fetchall()]
            await db.commit()

        if marked_ids:
            logger.info(
                "playbook_review_required_marked",
                symptoms_hash=symptoms_hash,
                km_count=km_count,
                threshold=settings.KM_PLAYBOOK_REVIEW_THRESHOLD,
                playbook_ids=marked_ids,
            )
    except Exception as exc:
        logger.warning(
            "playbook_review_mark_failed",
            symptoms_hash=symptoms_hash,
            error=str(exc),
        )
|
||
|
||
async def _demote_alert_rule_catalog_confidence(self, playbook: Any) -> None:
    """
    Lower the confidence of alert rules tied to a DEPRECATED playbook.

    For each name in the playbook's ``symptom_pattern.alert_names`` every
    ``alert_rule_catalog`` row whose ``rule_name`` starts with that name
    (trailing ``*`` removed) and whose ``review_status`` is not
    'deprecated' / 'retired' is updated:

    - ``confidence`` is halved but never drops below 0.01; a NULL
      confidence becomes 0.5;
    - ``review_status`` becomes 'draft' (original note: the CHECK constraint
      only allows draft / approved / deprecated / retired, so 'draft' stands
      in for "needs review").

    The alert name is matched literally: LIKE metacharacters (``%``, ``_``)
    inside it are escaped, and a name that is empty after stripping ``*`` is
    skipped because its pattern would match every rule in the catalog.

    Does nothing unless ``settings.ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP`` is on.
    Failures are logged as warnings and never raised, so the demote flow of
    the caller is unaffected.

    Args:
        playbook: Playbook object; ``symptom_pattern`` may be an object with
            an ``alert_names`` attribute or a dict with an "alert_names" key.
    """
    from src.core.config import settings

    if not settings.ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP:
        return

    try:
        from sqlalchemy import text as sa_text
        from src.db.base import get_db_context

        symptom = getattr(playbook, "symptom_pattern", None)
        if symptom is None:
            return

        # symptom_pattern may be a model object or a plain dict.
        if hasattr(symptom, "alert_names"):
            alert_names: list[str] = symptom.alert_names or []
        elif isinstance(symptom, dict):
            alert_names = symptom.get("alert_names") or []
        else:
            return

        if not alert_names:
            logger.debug(
                "playbook_demote_no_alert_names",
                playbook_id=playbook.playbook_id,
            )
            return

        # Built once: the statement does not depend on the alert name.
        # '!' is declared as the LIKE escape character so that it works
        # regardless of the server's backslash-handling settings.
        demote_stmt = sa_text(
            "UPDATE alert_rule_catalog "
            "SET confidence = CASE "
            " WHEN confidence IS NOT NULL "
            " THEN GREATEST(0.01, confidence * 0.5) "
            " ELSE 0.5 "
            " END, "
            " review_status = 'draft', "
            " updated_at = NOW() "
            "WHERE rule_name LIKE :pattern ESCAPE '!' "
            " AND (review_status IS NULL OR review_status NOT IN "
            " ('deprecated', 'retired')) "
            "RETURNING rule_id"
        )

        async with get_db_context() as db:
            updated_count = 0
            for alert_name in alert_names:
                prefix = alert_name.rstrip("*")
                if not prefix:
                    # "" or "*" would yield the pattern "%" and demote
                    # the entire catalog.
                    continue
                # Escape the escape character first, then the wildcards.
                escaped = (
                    prefix.replace("!", "!!")
                    .replace("%", "!%")
                    .replace("_", "!_")
                )
                result = await db.execute(
                    demote_stmt,
                    {"pattern": f"{escaped}%"},
                )
                updated_count += result.rowcount or 0
            await db.commit()

        if updated_count > 0:
            logger.info(
                "alert_rule_catalog_confidence_demoted",
                playbook_id=playbook.playbook_id,
                alert_names=alert_names,
                rules_updated=updated_count,
            )
    except Exception as e:
        logger.warning(
            "alert_rule_catalog_demote_failed",
            playbook_id=getattr(playbook, "playbook_id", "unknown"),
            error=str(e),
        )
|
||
|
||
# =========================================================================
|
||
# 🆕 Phase D-G P0 修正: 新增方法
|
||
# =========================================================================
|
||
|
||
async def record_repair_result(
|
||
self,
|
||
anomaly_key: str,
|
||
repair_action: str,
|
||
success: bool,
|
||
root_cause: str | None = None,
|
||
fix_description: str | None = None,
|
||
execution_time_seconds: float | None = None,
|
||
) -> bool:
|
||
"""
|
||
記錄修復結果到 Repository (Redis 持久化)
|
||
|
||
2026-03-29 P0 修正: 透過 Repository 存取 Redis
|
||
|
||
Args:
|
||
anomaly_key: 異常 key
|
||
repair_action: 修復動作
|
||
success: 是否成功
|
||
root_cause: 根因 (如果找到)
|
||
fix_description: 修復說明
|
||
execution_time_seconds: 執行時間
|
||
|
||
Returns:
|
||
bool: 是否成功記錄
|
||
"""
|
||
return await self._repository.record_repair(
|
||
anomaly_key=anomaly_key,
|
||
repair_action=repair_action,
|
||
success=success,
|
||
root_cause=root_cause,
|
||
fix_description=fix_description,
|
||
execution_time_seconds=execution_time_seconds,
|
||
)
|
||
|
||
async def record_declarative_outcome(
    self,
    incident_id: str,
    action: str,
    blast_radius_score: int,
    blast_radius_tier: str,
    success: bool,
    rollback_triggered: bool = False,
    execution_time_seconds: float | None = None,
) -> bool:
    """
    Store the outcome of a DeclarativeSpec execution (Phase 5, ADR-086).

    The record is written through the learning repository under the key
    ``"declarative:<incident_id>"``. The blast-radius details are packed as
    JSON into ``fix_description`` and the tier is also placed in
    ``root_cause``.

    Args:
        incident_id: Related incident id.
        action: Executed command; only the first 200 characters are stored.
        blast_radius_score: Blast-radius score (0-100 per the original docs).
        blast_radius_tier: Execution tier (auto/human/dual/blocked per the
            original docs).
        success: Whether the execution succeeded.
        rollback_triggered: Whether a rollback was triggered.
        execution_time_seconds: Execution duration.

    Returns:
        bool: The repository's result, or False when recording raised
        (the error is logged as a warning).
    """
    import json
    from src.utils.timezone import now_taipei

    try:
        anomaly_key = f"declarative:{incident_id}"
        fix_desc = json.dumps(
            {
                "blast_radius_score": blast_radius_score,
                "blast_radius_tier": blast_radius_tier,
                "rollback_triggered": rollback_triggered,
                "recorded_at": now_taipei().isoformat(),
            },
            ensure_ascii=False,
        )

        return await self._repository.record_repair(
            anomaly_key=anomaly_key,
            repair_action=action[:200],
            success=success,
            root_cause=f"blast_radius_tier={blast_radius_tier}",
            fix_description=fix_desc,
            execution_time_seconds=execution_time_seconds,
        )
    except Exception as e:
        # Use the module-level logger, like every other method in this file.
        logger.warning(
            "record_declarative_outcome_failed",
            incident_id=incident_id,
            error=str(e),
        )
        return False
|
||
|
||
async def record_diagnosis_outcome(
    self,
    incident_id: str,
    matched_playbook_id: str | None,
    was_correct: bool,
    actual_fix: str | None = None,
) -> None:
    """
    Record whether an AI diagnosis turned out to be correct.

    ADR-083 Phase 3: misdiagnosis feedback. The outcome is stored under the
    key ``"diag:<incident_id>"``; when the diagnosis was wrong and a
    playbook had been matched, a failed execution is additionally recorded
    against that playbook.

    Args:
        incident_id: Related incident id.
        matched_playbook_id: Playbook used for the diagnosis, if any.
        was_correct: False means the diagnosis was wrong.
        actual_fix: The fix that actually worked; stored as the repair
            action ("unknown" when empty).
    """
    # 1. Store the outcome. A repository failure is logged and does not stop
    #    the playbook update below.
    try:
        await self._repository.record_repair(
            anomaly_key=f"diag:{incident_id}",
            repair_action=actual_fix or "unknown",
            success=was_correct,
            fix_description=f"diagnosis_correct={was_correct}",
        )
    except Exception as exc:
        logger.warning(
            "record_diagnosis_to_repo_failed",
            incident_id=incident_id,
            error=str(exc),
        )

    # 2. A wrong diagnosis counts as a failure of the matched playbook.
    misdiagnosed = not was_correct
    if matched_playbook_id and misdiagnosed:
        await self._update_playbook_stats(
            playbook_id=matched_playbook_id,
            success=False,
        )

    logger.info(
        "diagnosis_outcome_recorded",
        incident_id=incident_id,
        was_correct=was_correct,
        matched_playbook_id=matched_playbook_id,
    )
|
||
|
||
async def record_verification_result(
    self,
    incident_id: str,
    action_taken: str,
    verification_result: str,
    matched_playbook_id: str | None = None,
) -> None:
    """
    Record the result of a post-execution environment verification.

    ADR-083 Phase 3: the verification is stored as its own record under the
    key ``"verify:<incident_id>"`` and, when a playbook was matched, is also
    recorded against that playbook's statistics. Only the exact value
    "success" counts as success.

    Args:
        incident_id: Related incident id.
        action_taken: Description of the executed action.
        verification_result: Verification verdict (the original docs list
            "success", "degraded", "failed" and "timeout").
        matched_playbook_id: Matched playbook, if any.
    """
    verified_ok = verification_result == "success"

    # 1. Store the verification. A repository failure is logged and does not
    #    stop the playbook update below.
    try:
        await self._repository.record_repair(
            anomaly_key=f"verify:{incident_id}",
            repair_action=action_taken,
            success=verified_ok,
            fix_description=verification_result,
        )
    except Exception as exc:
        logger.warning(
            "record_verification_to_repo_failed",
            incident_id=incident_id,
            error=str(exc),
        )

    # 2. Feed the verdict into the matched playbook's statistics.
    if matched_playbook_id:
        await self._update_playbook_stats(
            playbook_id=matched_playbook_id,
            success=verified_ok,
        )

    logger.info(
        "verification_result_recorded",
        incident_id=incident_id,
        verification_result=verification_result,
        matched_playbook_id=matched_playbook_id,
    )
|
||
|
||
async def record_agent_session(
|
||
self,
|
||
session_id: str,
|
||
incident_id: str,
|
||
final_confidence: float,
|
||
requires_human_approval: bool,
|
||
all_agents_degraded: bool,
|
||
critic_challenge_count: int,
|
||
matched_playbook_id: str | None = None,
|
||
) -> None:
|
||
"""
|
||
記錄 5-Agent 辯證 Session 的學習訊號。
|
||
|
||
ADR-083 Phase 3: AgentSession 學習接線(L7×D2)。
|
||
DB 持久化由 agent_orchestrator._write_agent_turn() 負責,
|
||
此方法負責從辯證結果提取學習訊號(Redis analytics + Playbook EWMA)。
|
||
|
||
信號邏輯:
|
||
- all_agents_degraded=True → 系統品質警報(記錄到 Redis governance)
|
||
- critic_challenge_count > 0 AND matched_playbook_id → 輕度負向 EWMA(Critic 質疑 Playbook)
|
||
- requires_human_approval=True → 信心不足,系統整體品質下降紀錄
|
||
|
||
Args:
|
||
session_id: 辯證 Session ID
|
||
incident_id: 關聯 Incident ID
|
||
final_confidence: Coordinator 最終信心分數(0-1)
|
||
requires_human_approval: 是否升級人工審核
|
||
all_agents_degraded: 是否全部 Agent 降級
|
||
critic_challenge_count: Critic 提出的質疑數量
|
||
matched_playbook_id: 本次決策匹配的 Playbook(若有)
|
||
|
||
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3 AgentSession 學習接線
|
||
"""
|
||
# 1. 記錄辯證品質到 Redis(供 SLO 監控 + Governance 分析)
|
||
try:
|
||
await self._repository.record_repair(
|
||
anomaly_key=f"debate:{incident_id}",
|
||
repair_action=f"session:{session_id[:8]}",
|
||
success=not requires_human_approval,
|
||
fix_description=f"confidence={final_confidence:.2f},degraded={all_agents_degraded},critic_challenges={critic_challenge_count}",
|
||
)
|
||
except Exception as e:
|
||
logger.warning(
|
||
"record_agent_session_to_repo_failed",
|
||
session_id=session_id,
|
||
error=str(e),
|
||
)
|
||
|
||
# 2. Critic 有重大質疑 + 有匹配 Playbook → 輕度負向 EWMA(Playbook 可能不準確)
|
||
if critic_challenge_count > 0 and matched_playbook_id:
|
||
await self._update_playbook_stats(
|
||
playbook_id=matched_playbook_id,
|
||
success=False,
|
||
)
|
||
|
||
logger.info(
|
||
"agent_session_recorded",
|
||
session_id=session_id,
|
||
incident_id=incident_id,
|
||
final_confidence=final_confidence,
|
||
requires_human_approval=requires_human_approval,
|
||
all_agents_degraded=all_agents_degraded,
|
||
critic_challenge_count=critic_challenge_count,
|
||
)
|
||
|
||
async def get_recommended_fix(self, anomaly_key: str) -> dict:
    """
    Recommend a repair action for an anomaly from its recorded repair stats.

    2026-03-29 P0 fix: statistics are obtained through the repository.

    An action qualifies only when it has at least ``MIN_SAMPLES`` attempts and
    a success rate of at least ``SUCCESS_RATE_THRESHOLD``. Qualifying actions
    are ranked by ``success_rate * log(total + 1)`` (descending), ties broken
    by lower tier. When there are no stats, or nothing qualifies, the default
    recommendation is returned instead.

    Returns:
        {
            'action': 'scale_up',
            'confidence': 0.0,            # always 0.0, see note below
            'tier': 2,
            'based_on': '<N> 次歷史數據',
            'avg_execution_time': 45.2,   # mean of truthy execution_time values
            'success_rate': 0.85,
            'alternatives': [...]         # up to two runner-up actions
        }
    """
    import math

    stats_by_action = await self._repository.get_all_repair_stats(anomaly_key)
    if not stats_by_action:
        return self._default_recommendation()

    candidates = []
    for action_name, action_stats in stats_by_action.items():
        # Skip actions without enough samples.
        if not (action_stats["total"] >= self.MIN_SAMPLES):
            continue
        rate = action_stats["success_rate"]
        # Skip actions that do not succeed often enough.
        if not (rate >= self.SUCCESS_RATE_THRESHOLD):
            continue

        # Weight the success rate by log(sample count).
        weighted_score = rate * math.log(action_stats["total"] + 1)

        # Average execution time over the most recent history entries;
        # entries with a missing or falsy execution_time are ignored.
        recent_runs = await self._repository.get_repair_history(
            anomaly_key, action_name, limit=20
        )
        durations = [
            run["execution_time"]
            for run in recent_runs
            if run.get("execution_time")
        ]
        mean_duration = sum(durations) / len(durations) if durations else 0.0

        candidates.append(
            {
                "action": action_name,
                "score": weighted_score,
                "success_rate": rate,
                "total_samples": action_stats["total"],
                "tier": self._get_action_tier(action_name),
                "avg_execution_time": mean_duration,
            }
        )

    if not candidates:
        return self._default_recommendation()

    # Highest score first; among equal scores the lower tier wins.
    ranked = sorted(candidates, key=lambda item: (-item["score"], item["tier"]))
    winner = ranked[0]
    runners_up = ranked[1:3]

    return {
        "action": winner["action"],
        # Historical learning is not an AI analysis, so confidence stays 0.
        "confidence": 0.0,
        "tier": winner["tier"],
        "based_on": f"{winner['total_samples']} 次歷史數據",
        "avg_execution_time": winner["avg_execution_time"],
        # The raw success rate is kept for reference.
        "success_rate": winner["success_rate"],
        "alternatives": [
            {
                "action": other["action"],
                "confidence": 0.0,
                "success_rate": other["success_rate"],
                "tier": other["tier"],
            }
            for other in runners_up
        ],
    }
|
||
|
||
async def get_learning_summary(self, anomaly_key: str) -> dict:
    """
    Summarise what has been learned so far for one anomaly key.

    Phase 22 P2: business logic moved into the Service layer.
    2026-03-31 Claude Code (chief-architect tech-debt fix)

    Raw per-action statistics come from the repository; the best action and
    the learning status are computed here.

    - best_action: the action with the highest success rate among those with
      at least 3 attempts (first one wins on ties; a 0.0 rate never qualifies).
    - learning_status: "insufficient" below 3 total attempts, "learning"
      below 10, otherwise "excellent" when the overall success rate is at
      least 0.8 and "sufficient" when it is lower.

    Returns:
        {
            'anomaly_key': 'abc123',
            'total_repair_attempts': 8,
            'overall_success_rate': 0.625,
            'actions_tried': ['restart_pod', 'scale_up'],
            'best_action': {'action': 'scale_up', 'success_rate': 0.75},
            'learning_status': 'learning',
        }
    """
    all_stats = await self._repository.get_all_repair_stats(anomaly_key)

    # Start from the "nothing learned yet" summary and fill it in below.
    summary = {
        "anomaly_key": anomaly_key,
        "total_repair_attempts": 0,
        "overall_success_rate": 0.0,
        "actions_tried": [],
        "best_action": None,
        "learning_status": "insufficient",
    }
    if not all_stats:
        return summary

    # Aggregate attempts and successes across every action.
    attempts = 0
    successes = 0
    for entry in all_stats.values():
        attempts += entry["total"]
    for entry in all_stats.values():
        successes += entry["success"]
    overall = successes / attempts if attempts > 0 else 0.0

    # Best action: needs at least 3 samples and a strictly higher rate.
    top_action = None
    top_rate = 0.0
    for name, entry in all_stats.items():
        if not (entry["total"] >= 3):
            continue
        rate = entry["success_rate"]
        if rate > top_rate:
            top_rate = rate
            top_action = {"action": name, "success_rate": rate}

    # Classify how much evidence has been collected.
    if attempts < 3:
        status = "insufficient"
    elif attempts < 10:
        status = "learning"
    else:
        status = "excellent" if overall >= 0.8 else "sufficient"

    summary.update(
        {
            "total_repair_attempts": attempts,
            "overall_success_rate": overall,
            "actions_tried": list(all_stats.keys()),
            "best_action": top_action,
            "learning_status": status,
        }
    )
    return summary
|
||
|
||
async def _persist_trust(self, action_pattern: str) -> None:
|
||
"""
|
||
将内存中的 TrustRecord 持久化到 PostgreSQL。
|
||
|
||
ADR-088: 每次 approve/reject 後同步寫入 DB,
|
||
確保 Pod 重啟後信任分數不歸零。
|
||
|
||
2026-04-17 ogt + Claude Sonnet 4.6(亞太): Phase 4 信任持久化
|
||
"""
|
||
record = self._trust_manager.get_trust_record(action_pattern)
|
||
if not record:
|
||
return
|
||
try:
|
||
await self._trust_repo.upsert(
|
||
action_pattern=action_pattern,
|
||
score=record.score,
|
||
total_approvals=record.total_approvals,
|
||
total_rejections=record.total_rejections,
|
||
last_approval_by=record.last_approval_by,
|
||
last_approval_at=record.last_approval_at,
|
||
last_rejection_by=record.last_rejection_by,
|
||
last_rejection_at=record.last_rejection_at,
|
||
)
|
||
except Exception as e:
|
||
logger.warning(
|
||
"trust_persist_failed",
|
||
action_pattern=action_pattern,
|
||
error=str(e),
|
||
)
|
||
|
||
def _get_action_tier(self, action: str) -> int:
|
||
"""取得動作的 Tier"""
|
||
tier_actions = {
|
||
1: ["restart_pod", "restart_container", "delete_pod"],
|
||
2: ["scale_up", "increase_memory", "increase_cpu", "adjust_limits"],
|
||
3: ["apply_hotfix", "update_config", "patch_deployment", "rollback"],
|
||
4: ["create_issue", "notify_team", "schedule_fix", "manual_intervention"],
|
||
}
|
||
for tier, actions in tier_actions.items():
|
||
if action in actions:
|
||
return tier
|
||
return 1 # 預設 Tier 1
|
||
|
||
def _default_recommendation(self) -> dict:
|
||
"""預設推薦 (無歷史數據時)"""
|
||
return {
|
||
"action": "restart_pod",
|
||
"confidence": 0.0, # 🔴 預設推薦不是 AI 分析,信心度設 0
|
||
"tier": 1,
|
||
"based_on": "無歷史數據,使用預設",
|
||
"avg_execution_time": 30.0,
|
||
"alternatives": [
|
||
{"action": "delete_pod", "confidence": 0.0, "tier": 1},
|
||
],
|
||
}
|
||
|
||
|
||
# =============================================================================
|
||
# Singleton
|
||
# =============================================================================
|
||
|
||
# Module-level cache for the shared LearningService instance (created lazily).
_learning_service: LearningService | None = None


def get_learning_service() -> LearningService:
    """Return the shared LearningService, constructing it on first call."""
    global _learning_service
    if _learning_service is not None:
        return _learning_service
    _learning_service = LearningService()
    return _learning_service
|