Wave 2 Deployment: - Worker HPA: min:1 max:3, CPU 70%, Memory 80% - 前置條件: XCLAIM + terminationGracePeriodSeconds:90 (Wave 1 ✅) - 比 API/Web 更保守的擴縮策略 (120s up, 600s down) Confidence Fix: - 非 AI 分析來源 (fallback/playbook/historical/consensus) 設 confidence=0.0 - 避免混淆 AI 信心度與其他指標 (成功率/相似度) - 涉及: github_webhook, decision_manager, intent_classifier, learning_service Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
616 lines
21 KiB
Python
616 lines
21 KiB
Python
"""
|
|
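Learning Service - Phase 5 continuous learning loop
===================================================
ADR-030: Intelligent auto-remediation system
Phase D-G P0 fix: conforms to the leWOOOgo building-block principle

Learns from execution results to keep improving decisions:
1. Update Playbook statistics (success rate / execution count)
2. Adjust trust (success raises the score / failure lowers it)
3. Extract new Playbooks (automatically, from successful cases)
4. Process human feedback (effectiveness ratings)
5. 🆕 Persist learning data in Redis (through a Repository)
6. 🆕 Recommend repairs (based on historical success rate)

Design principles:
- Runs asynchronously and does not block the main flow
- Failure tolerant: a learning failure does not affect the execution result
- Full audit trail
- 🆕 The service never accesses Redis directly (it goes through ILearningRepository)

Version: v1.1
Created: 2026-03-26 (Taipei time zone)
Updated: 2026-03-29 (Taipei time zone) - P0 fix: added the Repository layer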
|
|
"""
|
|
|
|
from dataclasses import dataclass, field
|
|
from datetime import UTC, datetime
|
|
from enum import Enum
|
|
from typing import Any
|
|
|
|
import structlog
|
|
|
|
from src.models.approval import ApprovalRequest
|
|
from src.models.incident import IncidentStatus
|
|
from src.repositories.interfaces import ILearningRepository
|
|
from src.repositories.learning_repository import get_learning_repository
|
|
from src.services.trust_engine import get_trust_manager
|
|
|
|
# Module-level structlog logger; this file logs an event name plus keyword fields.
logger = structlog.get_logger(__name__)
|
|
|
|
|
|
# =============================================================================
|
|
# Constants
|
|
# =============================================================================
|
|
|
|
|
|
class FeedbackType(str, Enum):
    """Kinds of feedback the learning loop can receive.

    Members are also ``str`` instances, so a member compares equal to its
    raw string value.
    """

    # Outcomes reported by automated execution
    EXECUTION_SUCCESS = "execution_success"  # the action ran successfully
    EXECUTION_FAILURE = "execution_failure"  # the action failed

    # Decisions and ratings submitted by people
    HUMAN_APPROVE = "human_approve"  # a human approved
    HUMAN_REJECT = "human_reject"  # a human rejected
    HUMAN_OVERRIDE = "human_override"  # a human overrode the AI decision
    EFFECTIVENESS_RATING = "effectiveness_rating"  # effectiveness score
|
|
|
|
|
|
# Trust-adjustment parameters (point deltas per outcome).
# NOTE(review): nothing in the visible part of this module reads these
# constants; the actual score changes are made by the trust manager. Confirm
# they are still used elsewhere before relying on the values.
TRUST_SUCCESS_BOOST = 1  # success: +1 point
TRUST_FAILURE_PENALTY = 2  # failure: -2 points (or reset to zero)
TRUST_HUMAN_REJECT_PENALTY = 1  # human rejection: -1 point
|
|
|
|
|
|
# =============================================================================
|
|
# Data Models
|
|
# =============================================================================
|
|
|
|
|
|
@dataclass
|
|
class ExecutionResult:
|
|
"""執行結果"""
|
|
|
|
approval_id: str
|
|
incident_id: str
|
|
action: str
|
|
success: bool
|
|
error_message: str | None = None
|
|
duration_seconds: float = 0.0
|
|
executed_at: datetime = field(default_factory=lambda: datetime.now(UTC))
|
|
|
|
def to_dict(self) -> dict[str, Any]:
|
|
return {
|
|
"approval_id": self.approval_id,
|
|
"incident_id": self.incident_id,
|
|
"action": self.action,
|
|
"success": self.success,
|
|
"error_message": self.error_message,
|
|
"duration_seconds": self.duration_seconds,
|
|
"executed_at": self.executed_at.isoformat(),
|
|
}
|
|
|
|
|
|
@dataclass
|
|
class FeedbackRequest:
|
|
"""人工反饋請求"""
|
|
|
|
incident_id: str
|
|
feedback_type: FeedbackType
|
|
effectiveness_score: int | None = None # 1-5 分
|
|
learning_notes: str | None = None # 學習筆記
|
|
submitted_by: str | None = None
|
|
submitted_at: datetime = field(default_factory=lambda: datetime.now(UTC))
|
|
|
|
|
|
@dataclass
|
|
class LearningRecord:
|
|
"""學習記錄"""
|
|
|
|
incident_id: str
|
|
feedback_type: FeedbackType
|
|
action_pattern: str
|
|
trust_before: int
|
|
trust_after: int
|
|
playbook_updated: bool = False
|
|
new_playbook_id: str | None = None
|
|
learned_at: datetime = field(default_factory=lambda: datetime.now(UTC))
|
|
|
|
def to_dict(self) -> dict[str, Any]:
|
|
return {
|
|
"incident_id": self.incident_id,
|
|
"feedback_type": self.feedback_type.value,
|
|
"action_pattern": self.action_pattern,
|
|
"trust_before": self.trust_before,
|
|
"trust_after": self.trust_after,
|
|
"playbook_updated": self.playbook_updated,
|
|
"new_playbook_id": self.new_playbook_id,
|
|
"learned_at": self.learned_at.isoformat(),
|
|
}
|
|
|
|
|
|
# =============================================================================
|
|
# Learning Service
|
|
# =============================================================================
|
|
|
|
|
|
class LearningService:
    """
    Continuous learning service.

    Responsibilities:
    1. Process execution results -> adjust trust and update Playbook statistics
    2. Process human feedback -> adjust trust / Playbook effectiveness
    3. Extract new Playbooks from successful cases
    4. 🆕 Persist learning data (through the Repository)
    5. 🆕 Recommend repairs (based on historical success rate)

    2026-03-29 P0 fix: conforms to the leWOOOgo building-block principle
    - Persistence goes through ILearningRepository
    - No direct dependency on a Redis client
    """

    # Recommendation thresholds
    MIN_SAMPLES = 5  # an action needs at least this many recorded attempts
    SUCCESS_RATE_THRESHOLD = 0.6  # minimum historical success rate

    # Tier of each known repair action; unlisted actions use _DEFAULT_TIER.
    _ACTION_TIERS: dict[str, int] = {
        "restart_pod": 1,
        "restart_container": 1,
        "delete_pod": 1,
        "scale_up": 2,
        "increase_memory": 2,
        "increase_cpu": 2,
        "adjust_limits": 2,
        "apply_hotfix": 3,
        "update_config": 3,
        "patch_deployment": 3,
        "rollback": 3,
        "create_issue": 4,
        "notify_team": 4,
        "schedule_fix": 4,
        "manual_intervention": 4,
    }
    _DEFAULT_TIER = 1

    def __init__(
        self,
        repository: ILearningRepository | None = None,
    ):
        """
        Args:
            repository: Learning repository to use. When omitted, the shared
                instance from ``get_learning_repository()`` is used.
        """
        self._trust_manager = get_trust_manager()
        # Compare against None explicitly so an injected repository is never
        # discarded just because it happens to evaluate as falsy.
        self._repository = (
            repository if repository is not None else get_learning_repository()
        )

    async def process_execution_result(
        self,
        approval: ApprovalRequest,
        result: ExecutionResult,
    ) -> LearningRecord:
        """
        Learn from the outcome of an executed action.

        Args:
            approval: The original approval request.
            result: The execution result.

        Returns:
            LearningRecord: Trust score before/after and what happened to
            playbooks. ``playbook_updated`` is True only when the statistics
            update actually succeeded.
        """
        action_pattern = self._extract_action_pattern(approval.action)
        trust_before = self._current_trust_score(action_pattern)

        # 1. Adjust trust. The trust manager decides the actual score change
        #    (original notes: +1 on approval, reset to zero on rejection).
        if result.success:
            self._trust_manager.record_approval(
                action_pattern=action_pattern,
                user_role="system",
                user_id="auto_learning",
            )
            feedback_type = FeedbackType.EXECUTION_SUCCESS
        else:
            self._trust_manager.record_rejection(
                action_pattern=action_pattern,
                user_role="system",
                user_id="auto_learning",
                reason=result.error_message,
            )
            feedback_type = FeedbackType.EXECUTION_FAILURE

        trust_after = self._current_trust_score(action_pattern)

        # Not every approval object carries this attribute, hence getattr.
        matched_playbook_id = getattr(approval, "matched_playbook_id", None)

        # 2. Update Playbook statistics when a playbook was matched.
        playbook_updated = False
        if matched_playbook_id:
            try:
                playbook_updated = bool(
                    await self._update_playbook_stats(
                        playbook_id=matched_playbook_id,
                        success=result.success,
                    )
                )
            except Exception as e:
                # Learning failures must never affect the execution flow.
                logger.warning(
                    "playbook_stats_update_failed",
                    playbook_id=matched_playbook_id,
                    error=str(e),
                )

        # 3. Try to extract a new Playbook (success with no matched playbook).
        new_playbook_id = None
        if result.success and not matched_playbook_id:
            try:
                new_playbook_id = await self._try_extract_playbook(
                    incident_id=result.incident_id,
                    action=approval.action,
                )
            except Exception as e:
                logger.warning(
                    "playbook_extraction_failed",
                    incident_id=result.incident_id,
                    error=str(e),
                )

        record = LearningRecord(
            incident_id=result.incident_id,
            feedback_type=feedback_type,
            action_pattern=action_pattern,
            trust_before=trust_before,
            trust_after=trust_after,
            playbook_updated=playbook_updated,
            new_playbook_id=new_playbook_id,
        )

        logger.info(
            "learning_completed",
            incident_id=result.incident_id,
            success=result.success,
            trust_change=f"{trust_before} → {trust_after}",
            playbook_updated=playbook_updated,
            new_playbook=new_playbook_id,
        )

        return record

    async def process_human_feedback(
        self,
        feedback: FeedbackRequest,
    ) -> LearningRecord:
        """
        Learn from feedback submitted by a person.

        HUMAN_APPROVE records an approval and HUMAN_REJECT a rejection. An
        EFFECTIVENESS_RATING of 4 or more records an approval and promotes
        the playbook; 2 or less records a rejection and demotes it; 3 (or no
        score) changes nothing. Other feedback types change nothing.

        Args:
            feedback: The feedback request.

        Returns:
            LearningRecord: Trust score before/after for the feedback's key.
        """
        # NOTE(review): placeholder key. The original comment says the real
        # action pattern should be looked up from the incident. Until then,
        # feedback adjusts an "incident:<id>" trust record, not the
        # "<verb>:<resource>" record used by process_execution_result.
        action_pattern = f"incident:{feedback.incident_id}"

        trust_before = self._current_trust_score(action_pattern)
        playbook_updated = False

        if feedback.feedback_type == FeedbackType.HUMAN_APPROVE:
            self._trust_manager.record_approval(
                action_pattern=action_pattern,
                user_role="human",
                user_id=feedback.submitted_by,
            )

        elif feedback.feedback_type == FeedbackType.HUMAN_REJECT:
            self._trust_manager.record_rejection(
                action_pattern=action_pattern,
                user_role="human",
                user_id=feedback.submitted_by,
                reason="Human rejected",
            )

        elif feedback.feedback_type == FeedbackType.EFFECTIVENESS_RATING:
            score = feedback.effectiveness_score
            if score is not None:
                if score >= 4:
                    # High rating: raise trust and promote the playbook.
                    self._trust_manager.record_approval(
                        action_pattern=action_pattern,
                        user_role="feedback",
                        user_id=feedback.submitted_by,
                    )
                    playbook_updated = await self._promote_playbook(feedback.incident_id)
                elif score <= 2:
                    # Low rating: lower trust and demote the playbook.
                    self._trust_manager.record_rejection(
                        action_pattern=action_pattern,
                        user_role="feedback",
                        user_id=feedback.submitted_by,
                        reason=f"Low effectiveness score: {score}",
                    )
                    playbook_updated = await self._demote_playbook(feedback.incident_id)

        trust_after = self._current_trust_score(action_pattern)

        record = LearningRecord(
            incident_id=feedback.incident_id,
            feedback_type=feedback.feedback_type,
            action_pattern=action_pattern,
            trust_before=trust_before,
            trust_after=trust_after,
            playbook_updated=playbook_updated,
        )

        logger.info(
            "human_feedback_processed",
            incident_id=feedback.incident_id,
            feedback_type=feedback.feedback_type.value,
            effectiveness_score=feedback.effectiveness_score,
            trust_change=f"{trust_before} → {trust_after}",
        )

        return record

    # =========================================================================
    # Private Methods
    # =========================================================================

    def _current_trust_score(self, action_pattern: str) -> int:
        """Return the stored trust score for a pattern, or 0 when none exists."""
        trust_record = self._trust_manager.get_trust_record(action_pattern)
        return trust_record.score if trust_record else 0

    def _extract_action_pattern(self, action: str) -> str:
        """
        Derive a ``"<verb>:<resource>"`` trust key from an action string.

        The string is split on whitespace; the second token is the verb and
        the third the resource (presumably a kubectl-style command - confirm
        against callers). Anything before the last "/" of the resource is
        dropped. Returns "unknown" for empty input or fewer than three tokens.
        """
        if not action:
            return "unknown"

        parts = action.split()
        if len(parts) < 3:
            return "unknown"

        verb, resource_part = parts[1], parts[2]
        resource_name = resource_part.rsplit("/", 1)[-1]

        # Strip what is assumed to be a pod hash suffix: with three or more
        # "-" separated segments the last two are replaced by "*".
        # NOTE(review): this also collapses names that merely contain two
        # hyphens (e.g. "my-web-app" -> "my-*").
        segments = resource_name.split("-")
        if len(segments) >= 3:
            resource_name = "-".join(segments[:-2]) + "-*"

        return f"{verb}:{resource_name}"

    async def _update_playbook_stats(
        self,
        playbook_id: str,
        success: bool,
    ) -> bool:
        """
        Record one execution outcome against a playbook.

        Failures are logged and swallowed so learning never breaks the caller.

        Returns:
            bool: True when the execution was recorded, False when it failed.
        """
        try:
            # Imported lazily (presumably to avoid a circular import).
            from src.services.playbook_service import get_playbook_service

            service = get_playbook_service()
            await service.record_execution(playbook_id, success)
        except Exception as e:
            logger.warning(
                "playbook_stats_update_error",
                playbook_id=playbook_id,
                error=str(e),
            )
            return False
        return True

    async def _try_extract_playbook(
        self,
        incident_id: str,
        action: str,
    ) -> str | None:
        """
        Try to extract a Playbook from a successfully handled incident.

        Extraction only happens when the incident exists and is RESOLVED or
        CLOSED. The playbook is created with ``auto_approve=False``. Any
        error is logged and swallowed.

        Args:
            incident_id: Incident to extract from.
            action: The executed action (currently unused by this method).

        Returns:
            The new playbook id, or None when nothing was extracted.
        """
        try:
            from src.repositories.incident_repository import get_incident_repository
            from src.services.playbook_service import get_playbook_service

            repo = get_incident_repository()
            incident = await repo.get_by_id(incident_id)
            if not incident:
                return None

            if incident.status not in (IncidentStatus.RESOLVED, IncidentStatus.CLOSED):
                return None

            service = get_playbook_service()
            playbook = await service.extract_from_incident(
                incident=incident,
                auto_approve=False,  # requires human review
            )

            if playbook:
                logger.info(
                    "playbook_auto_extracted",
                    incident_id=incident_id,
                    playbook_id=playbook.playbook_id,
                )
                return playbook.playbook_id

            return None

        except Exception as e:
            logger.warning(
                "playbook_extraction_error",
                incident_id=incident_id,
                error=str(e),
            )
            return None

    async def _promote_playbook(self, incident_id: str) -> bool:
        """Raise Playbook confidence after a high rating (not implemented yet)."""
        # TODO: implement the Playbook confidence promotion logic.
        # NOTE(review): returns True although nothing is updated, so callers
        # report playbook_updated=True for a no-op.
        logger.debug("playbook_promoted", incident_id=incident_id)
        return True

    async def _demote_playbook(self, incident_id: str) -> bool:
        """Lower Playbook confidence after a low rating (not implemented yet)."""
        # TODO: implement the Playbook confidence demotion logic.
        # NOTE(review): returns True although nothing is updated, so callers
        # report playbook_updated=True for a no-op.
        logger.debug("playbook_demoted", incident_id=incident_id)
        return True

    # =========================================================================
    # 🆕 Phase D-G P0 fix: added methods
    # =========================================================================

    async def record_repair_result(
        self,
        anomaly_key: str,
        repair_action: str,
        success: bool,
        root_cause: str | None = None,
        fix_description: str | None = None,
        execution_time_seconds: float | None = None,
    ) -> bool:
        """
        Record a repair outcome through the repository.

        2026-03-29 P0 fix: persistence goes through the Repository.

        Args:
            anomaly_key: Anomaly key.
            repair_action: Repair action that was taken.
            success: Whether the repair succeeded.
            root_cause: Root cause, if one was found.
            fix_description: Description of the fix.
            execution_time_seconds: Execution time.

        Returns:
            bool: Whatever the repository reports (whether it was recorded).
        """
        return await self._repository.record_repair(
            anomaly_key=anomaly_key,
            repair_action=repair_action,
            success=success,
            root_cause=root_cause,
            fix_description=fix_description,
            execution_time_seconds=execution_time_seconds,
        )

    async def get_recommended_fix(self, anomaly_key: str) -> dict[str, Any]:
        """
        Recommend the best repair action from recorded history.

        2026-03-29 P0 fix: statistics come from the Repository.

        An action qualifies with at least ``MIN_SAMPLES`` attempts and a
        success rate of at least ``SUCCESS_RATE_THRESHOLD``. Qualifying
        actions are ranked by ``success_rate * log(total + 1)``, ties broken
        by lower tier. With no qualifying action the default recommendation
        is returned.

        Returns:
            {
                'action': 'scale_up',
                'confidence': 0.0,
                'tier': 2,
                'based_on': '12 次歷史數據',
                'avg_execution_time': 45.2,
                'success_rate': 0.85,
                'alternatives': [...]
            }
        """
        import math

        all_stats = await self._repository.get_all_repair_stats(anomaly_key)
        if not all_stats:
            return self._default_recommendation()

        scored_actions = []
        for action, stats in all_stats.items():
            if stats["total"] < self.MIN_SAMPLES:
                continue
            success_rate = stats["success_rate"]
            if success_rate < self.SUCCESS_RATE_THRESHOLD:
                continue

            # Weight the success rate by log(sample count) so well-tested
            # actions outrank equally successful but rarely tried ones.
            score = success_rate * math.log(stats["total"] + 1)

            # Average execution time over the fetched history; entries with
            # a missing or zero "execution_time" are ignored.
            history = await self._repository.get_repair_history(
                anomaly_key, action, limit=20
            )
            times = [h["execution_time"] for h in history if h.get("execution_time")]
            avg_time = sum(times) / len(times) if times else 0.0

            scored_actions.append({
                "action": action,
                "score": score,
                "success_rate": success_rate,
                "total_samples": stats["total"],
                "tier": self._get_action_tier(action),
                "avg_execution_time": avg_time,
            })

        if not scored_actions:
            return self._default_recommendation()

        # Highest weighted score first; ties go to the lower tier.
        scored_actions.sort(key=lambda x: (-x["score"], x["tier"]))

        best = scored_actions[0]
        alternatives = scored_actions[1:3]  # up to two runners-up

        return {
            "action": best["action"],
            "confidence": 0.0,  # 🔴 historical learning is not AI analysis, so confidence is 0
            "tier": best["tier"],
            "based_on": f"{best['total_samples']} 次歷史數據",
            "avg_execution_time": best["avg_execution_time"],
            "success_rate": best["success_rate"],  # raw success rate kept for reference
            "alternatives": [
                {
                    "action": a["action"],
                    "confidence": 0.0,
                    "success_rate": a["success_rate"],
                    "tier": a["tier"],
                }
                for a in alternatives
            ],
        }

    async def get_learning_summary(self, anomaly_key: str) -> dict:
        """
        Return the learning summary for an anomaly.

        2026-03-29 P0 fix: delegated to the Repository.

        Returns (example shape from the original notes; the repository
        defines the actual content):
            {
                'anomaly_key': 'abc123',
                'total_repair_attempts': 8,
                'overall_success_rate': 0.625,
                'actions_tried': ['restart_pod', 'scale_up'],
                'best_action': {'action': 'scale_up', 'success_rate': 0.75},
                'learning_status': 'sufficient',
            }
        """
        return await self._repository.get_learning_summary(anomaly_key)

    def _get_action_tier(self, action: str) -> int:
        """Return the tier of a repair action; unknown actions are Tier 1."""
        return self._ACTION_TIERS.get(action, self._DEFAULT_TIER)

    def _default_recommendation(self) -> dict:
        """Return the fallback recommendation used when no history qualifies."""
        return {
            "action": "restart_pod",
            "confidence": 0.0,  # 🔴 the default is not AI analysis, so confidence is 0
            "tier": 1,
            "based_on": "無歷史數據,使用預設",
            "avg_execution_time": 30.0,
            "alternatives": [
                {"action": "delete_pod", "confidence": 0.0, "tier": 1},
            ],
        }
|
|
|
|
|
|
# =============================================================================
|
|
# Singleton
|
|
# =============================================================================
|
|
|
|
# Lazily created, process-wide LearningService instance.
_learning_service: LearningService | None = None


def get_learning_service() -> LearningService:
    """Return the shared LearningService, creating it on first use."""
    global _learning_service
    if _learning_service is not None:
        return _learning_service
    _learning_service = LearningService()
    return _learning_service
|