Wave 8 P3.2 模型版本追蹤 + ADR-100 SLO 自我治理 + 配套: P3.2 — Model Version Tracking: - model_version_probe.py (268 行) — 探測 Ollama / OpenRouter 等 provider 的 model version - model_version_tracker.py (101 行) — 對齊 PG provider_version_history 表 - migrations/p3_2_provider_version_history.sql + rollback — 25 行 schema - db/models.py +32 行 — ProviderVersionHistory ORM ADR-100 — AI 自主化 SLO: - docs/adr/ADR-100-ai-autonomous-slo.md (167 行) — 飛輪 SLO 設計與閾值 - ops/monitoring/slo-rules.yml (254 行) — Prometheus SLO recording rules + alerts - ops/monitoring/tests/test_slo_rules.yaml (242 行) — promtool unit tests 整合修改: - main.py +72 行 — Lifespan 啟動 model_version_probe + KB rot cleaner schedule - gitea_webhook.py +45 行 — webhook 接收 model 版本變化通知 - ci_auto_repair.py / evidence_snapshot.py / pre_decision_investigator.py — 配合接線 新測試: - test_kb_rot_cleaner_schedule.py (120 行) — 9 tests pass - test_slo_rules.yaml — promtool 驗收 Tests: 9 passed (test_kb_rot_cleaner_schedule) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-Authored-By: Multiple Engineers (P3.2 + ADR-100) <noreply@anthropic.com>
470 lines
16 KiB
Python
470 lines
16 KiB
Python
"""
|
||
CI Auto-Repair Service - Phase 13.1 #78
|
||
========================================
|
||
CI 失敗自動修復服務,根據風險分級決定執行策略
|
||
|
||
策略:
|
||
- LOW: 自動執行修復 (如重啟 Runner、清理快取)
|
||
- MEDIUM: 發送 Telegram 確認,快速批准後執行
|
||
- HIGH: 建立 Approval,等待人工審核
|
||
- CRITICAL: 禁止自動修復,僅通知
|
||
|
||
整合:
|
||
- Intent Classifier: 判斷修復意圖類型
|
||
- Complexity Scorer: 評估修復複雜度
|
||
- AI Router: 選擇最適 AI 進行分析
|
||
|
||
版本: v1.0
|
||
建立: 2026-03-26 16:50 (台北時區)
|
||
建立者: Claude Code
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from dataclasses import dataclass
|
||
from enum import Enum
|
||
|
||
import structlog
|
||
|
||
from src.services.complexity_scorer import get_complexity_scorer
|
||
from src.services.intent_classifier import IntentType, RiskLevel, get_intent_classifier
|
||
|
||
# Module-level structlog logger, named after this module.
logger = structlog.get_logger(__name__)
|
||
|
||
|
||
# =============================================================================
|
||
# Types
|
||
# =============================================================================
|
||
|
||
|
||
class RepairAction(Enum):
    """Kinds of repair action that can be recommended for a failed CI run."""

    # Infrastructure-level actions.
    RESTART_RUNNER = "restart_runner"
    CLEAR_CACHE = "clear_cache"
    RETRY_WORKFLOW = "retry_workflow"

    # Actions that change repository or deployment state.
    ROLLBACK_COMMIT = "rollback_commit"
    FIX_CONFIG = "fix_config"
    FIX_DEPENDENCY = "fix_dependency"
    SCALE_RESOURCE = "scale_resource"

    # No automated action applies; a human has to investigate.
    MANUAL_REQUIRED = "manual_required"
|
||
|
||
|
||
class ExecutionDecision(Enum):
    """How a recommended repair is allowed to be carried out."""

    # Run the repair immediately, without asking anyone.
    AUTO_EXECUTE = "auto_execute"
    # Ask for a quick confirmation over Telegram first.
    TELEGRAM_CONFIRM = "telegram_confirm"
    # Create an Approval and wait for it to be reviewed.
    APPROVAL_REQUIRED = "approval_required"
    # Never execute; notify only.
    BLOCKED = "blocked"
|
||
|
||
|
||
@dataclass
class RepairRecommendation:
    """A single suggested repair together with its risk assessment.

    Attributes:
        action: The kind of repair being suggested.
        command: Shell command implementing the repair, or None when no
            fixed command exists for the action.
        reason: Human-readable explanation of why the action is suggested.
        risk_level: Risk tier assigned to this action.
        execution_decision: Execution policy derived from ``risk_level``.
        confidence: Confidence value attached to the suggestion.
        estimated_duration_seconds: Rough duration of the repair, in seconds.
        rollback_command: Command that undoes the repair, if one is known.
    """

    action: RepairAction
    command: str | None
    reason: str
    risk_level: RiskLevel
    execution_decision: ExecutionDecision
    confidence: float
    estimated_duration_seconds: int
    rollback_command: str | None = None
|
||
|
||
|
||
@dataclass
class CIRepairDecision:
    """Outcome of evaluating one CI failure.

    Attributes:
        should_repair: Whether any repair may proceed at all.
        execution_decision: Execution policy that applies to the repair.
        recommendations: Suggested repairs for the failure.
        risk_level: Overall risk tier of the failure.
        complexity_score: Score reported by the complexity scorer.
        intent_type: Intent reported by the intent classifier.
        reason: Human-readable explanation of the decision.
        metadata: Additional details about how the decision was reached.
    """

    should_repair: bool
    execution_decision: ExecutionDecision
    recommendations: list[RepairRecommendation]
    risk_level: RiskLevel
    complexity_score: int
    intent_type: IntentType
    reason: str
    metadata: dict
|
||
|
||
|
||
# =============================================================================
|
||
# Repair Strategy Mapping
|
||
# =============================================================================
|
||
|
||
|
||
# Error type -> candidate repair actions.
# Keys are lower-case error-type names; "unknown" maps to manual handling only.
ERROR_TYPE_REPAIR_MAP: dict[str, list[RepairAction]] = {
    "build": [
        RepairAction.CLEAR_CACHE,
        RepairAction.FIX_DEPENDENCY,
    ],
    "test": [
        RepairAction.RETRY_WORKFLOW,
        RepairAction.FIX_CONFIG,
    ],
    "lint": [
        RepairAction.RETRY_WORKFLOW,
    ],
    "deploy": [
        RepairAction.ROLLBACK_COMMIT,
        RepairAction.FIX_CONFIG,
    ],
    "timeout": [
        RepairAction.RESTART_RUNNER,
        RepairAction.SCALE_RESOURCE,
    ],
    "runner": [
        RepairAction.RESTART_RUNNER,
    ],
    "dependency": [
        RepairAction.CLEAR_CACHE,
        RepairAction.FIX_DEPENDENCY,
    ],
    "unknown": [
        RepairAction.MANUAL_REQUIRED,
    ],
}
|
||
|
||
|
||
# Repair action -> risk level, grouped by tier (insertion order: LOW, MEDIUM,
# HIGH, CRITICAL).
ACTION_RISK_MAP: dict[RepairAction, RiskLevel] = {
    **dict.fromkeys(
        (
            RepairAction.RETRY_WORKFLOW,
            RepairAction.CLEAR_CACHE,
            RepairAction.RESTART_RUNNER,
        ),
        RiskLevel.LOW,
    ),
    **dict.fromkeys(
        (
            RepairAction.FIX_CONFIG,
            RepairAction.FIX_DEPENDENCY,
            RepairAction.SCALE_RESOURCE,
        ),
        RiskLevel.MEDIUM,
    ),
    RepairAction.ROLLBACK_COMMIT: RiskLevel.HIGH,
    RepairAction.MANUAL_REQUIRED: RiskLevel.CRITICAL,
}
|
||
|
||
|
||
# Risk level -> execution decision: the higher the risk, the more human
# involvement is required before a repair may run.
RISK_EXECUTION_MAP: dict[RiskLevel, ExecutionDecision] = {
    # Safe enough to run unattended.
    RiskLevel.LOW: ExecutionDecision.AUTO_EXECUTE,
    # Needs a quick Telegram confirmation.
    RiskLevel.MEDIUM: ExecutionDecision.TELEGRAM_CONFIRM,
    # Needs a formal approval.
    RiskLevel.HIGH: ExecutionDecision.APPROVAL_REQUIRED,
    # Never repaired automatically.
    RiskLevel.CRITICAL: ExecutionDecision.BLOCKED,
}
|
||
|
||
|
||
# =============================================================================
|
||
# CI Auto-Repair Service
|
||
# =============================================================================
|
||
|
||
|
||
class CIAutoRepairService:
    """Evaluate failed CI runs and decide how (or whether) to repair them.

    Integrates the smart-routing components (Phase 13.3) - the intent
    classifier and the complexity scorer - for risk assessment and repair
    decisions.
    """

    def __init__(self):
        """Obtain the intent classifier and complexity scorer from their factories."""
        self._intent_classifier = get_intent_classifier()
        self._complexity_scorer = get_complexity_scorer()

    @staticmethod
    def _risk_rank(level: RiskLevel, default: int = 99) -> int:
        """Return the ordering rank of a risk level (LOW=0 ... CRITICAL=3).

        Args:
            level: Risk level to rank.
            default: Rank returned when ``level`` is not one of the four
                known tiers.

        Returns:
            The integer rank; a lower rank means a lower risk.
        """
        ranks = {
            RiskLevel.LOW: 0,
            RiskLevel.MEDIUM: 1,
            RiskLevel.HIGH: 2,
            RiskLevel.CRITICAL: 3,
        }
        return ranks.get(level, default)

    async def evaluate_repair(
        self,
        error_type: str,
        workflow_name: str,
        repo: str,
        failure_context: dict,
        diagnosis_summary: str | None = None,
    ) -> CIRepairDecision:
        """Evaluate the repair strategy for a CI failure.

        Args:
            error_type: Error type (build/test/lint/deploy/timeout/...).
                Looked up case-insensitively in ``ERROR_TYPE_REPAIR_MAP``.
            workflow_name: Name of the failed workflow.
            repo: Repository name.
            failure_context: Extra failure details, merged into the context
                handed to the complexity scorer. ``None`` is treated as empty.
            diagnosis_summary: Optional AI diagnosis summary.

        Returns:
            CIRepairDecision: The repair decision.
        """
        logger.info(
            "ci_repair_evaluation_started",
            error_type=error_type,
            workflow_name=workflow_name,
            repo=repo,
        )

        # 1. Build the text that is fed to the classifier and the scorer.
        analysis_text = self._build_analysis_text(
            error_type=error_type,
            workflow_name=workflow_name,
            diagnosis_summary=diagnosis_summary,
        )

        # 2. Intent classification.
        # 2026-04-27 P3.1-T3 by Claude: classify is an async method and must
        # be awaited (the await was previously missing).
        intent_result = await self._intent_classifier.classify(analysis_text)

        # 3. Complexity scoring. failure_context is unpacked last, so its
        # keys take precedence over the three base keys.
        complexity_result = self._complexity_scorer.score(
            text=analysis_text,
            context={
                "error_type": error_type,
                "workflow_name": workflow_name,
                "repo": repo,
                **(failure_context or {}),
            },
        )

        # 4. Candidate actions; unrecognised error types need a human.
        possible_actions = ERROR_TYPE_REPAIR_MAP.get(
            error_type.lower(),
            [RepairAction.MANUAL_REQUIRED],
        )

        # 5. Turn the candidate actions into recommendations.
        recommendations = self._generate_recommendations(
            possible_actions=possible_actions,
            error_type=error_type,
            workflow_name=workflow_name,
            complexity_score=complexity_result.score,
        )

        # 6. Overall risk level and the execution policy it implies.
        overall_risk = self._determine_overall_risk(
            recommendations=recommendations,
            intent_risk=intent_result.risk_level,
            complexity_score=complexity_result.score,
        )
        execution_decision = RISK_EXECUTION_MAP.get(
            overall_risk,
            ExecutionDecision.APPROVAL_REQUIRED,
        )

        # 7. Special-rule overrides (these only ever make the policy stricter).
        execution_decision = self._apply_special_rules(
            execution_decision=execution_decision,
            error_type=error_type,
            workflow_name=workflow_name,
            repo=repo,
        )

        decision = CIRepairDecision(
            should_repair=execution_decision != ExecutionDecision.BLOCKED,
            execution_decision=execution_decision,
            recommendations=recommendations,
            risk_level=overall_risk,
            complexity_score=complexity_result.score,
            intent_type=intent_result.intent,
            reason=self._generate_decision_reason(
                execution_decision=execution_decision,
                overall_risk=overall_risk,
                error_type=error_type,
            ),
            metadata={
                "intent_confidence": intent_result.confidence,
                "complexity_factors": complexity_result.factors,
                "workflow_name": workflow_name,
                "repo": repo,
            },
        )

        logger.info(
            "ci_repair_evaluation_completed",
            should_repair=decision.should_repair,
            execution_decision=execution_decision.value,
            risk_level=overall_risk.value,
            recommendations_count=len(recommendations),
        )

        return decision

    def _build_analysis_text(
        self,
        error_type: str,
        workflow_name: str,
        diagnosis_summary: str | None,
    ) -> str:
        """Build the analysis text used for intent classification.

        Returns:
            The workflow name, error type and (when non-empty) the diagnosis
            summary, joined with ``". "``.
        """
        parts = [
            f"CI workflow '{workflow_name}' failed",
            f"Error type: {error_type}",
        ]
        if diagnosis_summary:
            parts.append(f"Diagnosis: {diagnosis_summary}")
        return ". ".join(parts)

    def _generate_recommendations(
        self,
        possible_actions: list[RepairAction],
        error_type: str,
        workflow_name: str,
        complexity_score: int,
    ) -> list[RepairRecommendation]:
        """Build one recommendation per candidate action, lowest risk first.

        Args:
            possible_actions: Candidate repair actions.
            error_type: Error type, used in the reason text.
            workflow_name: Workflow name, used in the repair command.
            complexity_score: Complexity score; at 4 or above a MEDIUM-risk
                action is raised to HIGH.

        Returns:
            Recommendations sorted by ascending risk level.
        """
        recommendations = []

        for action in possible_actions:
            # Actions missing from the map are treated as HIGH risk.
            risk = ACTION_RISK_MAP.get(action, RiskLevel.HIGH)

            # Adjust the risk by complexity: only MEDIUM is raised here.
            if complexity_score >= 4 and risk == RiskLevel.MEDIUM:
                risk = RiskLevel.HIGH

            command, rollback = self._get_repair_command(
                action=action,
                workflow_name=workflow_name,
            )

            recommendations.append(
                RepairRecommendation(
                    action=action,
                    command=command,
                    reason=self._get_action_reason(action, error_type),
                    risk_level=risk,
                    execution_decision=RISK_EXECUTION_MAP.get(
                        risk, ExecutionDecision.APPROVAL_REQUIRED
                    ),
                    confidence=self._calculate_confidence(action, error_type),
                    estimated_duration_seconds=self._estimate_duration(action),
                    rollback_command=rollback,
                )
            )

        # Sort by risk level (low risk first); list.sort is stable, so
        # equally risky actions keep their original relative order.
        recommendations.sort(key=lambda rec: self._risk_rank(rec.risk_level))

        return recommendations

    def _get_repair_command(
        self,
        action: RepairAction,
        workflow_name: str,
    ) -> tuple[str | None, str | None]:
        """Return the repair command and the rollback command for an action.

        Args:
            action: Repair action to look up.
            workflow_name: Workflow name, shell-quoted before it is inserted
                into the retry command.

        Returns:
            ``(command, rollback_command)``; either element is None when no
            fixed command exists.
        """
        import shlex

        # The workflow name originates outside this service; quote it so that
        # names containing spaces or shell metacharacters stay one argument.
        # Names made only of shell-safe characters are left unchanged.
        safe_workflow = shlex.quote(workflow_name)

        commands: dict[RepairAction, tuple[str | None, str | None]] = {
            RepairAction.RETRY_WORKFLOW: (
                f"gh workflow run {safe_workflow}",
                None,
            ),
            RepairAction.CLEAR_CACHE: (
                "gh cache delete --all",
                None,
            ),
            RepairAction.RESTART_RUNNER: (
                "sudo systemctl restart actions.runner.*",
                None,
            ),
            RepairAction.SCALE_RESOURCE: (
                "kubectl scale deployment/actions-runner --replicas=3",
                "kubectl scale deployment/actions-runner --replicas=2",
            ),
            # Reverting the revert commit restores the original commit, so
            # the rollback is the same command run a second time.
            RepairAction.ROLLBACK_COMMIT: (
                "git revert HEAD --no-edit && git push",
                "git revert HEAD --no-edit && git push",
            ),
            RepairAction.FIX_CONFIG: (
                None,  # the concrete command has to be generated by AI
                None,
            ),
            RepairAction.FIX_DEPENDENCY: (
                "pnpm install --force && uv sync",
                None,
            ),
            RepairAction.MANUAL_REQUIRED: (
                None,
                None,
            ),
        }
        return commands.get(action, (None, None))

    def _get_action_reason(self, action: RepairAction, error_type: str) -> str:
        """Return the explanation text for a repair action.

        Returns:
            The reason string, or ``"Unknown action"`` for an unmapped action.
        """
        reasons = {
            RepairAction.RETRY_WORKFLOW: f"Retry workflow to recover from transient {error_type} failure",
            RepairAction.CLEAR_CACHE: "Clear build/dependency cache to resolve potential cache corruption",
            RepairAction.RESTART_RUNNER: "Restart GitHub Actions runner to recover from runner issues",
            RepairAction.SCALE_RESOURCE: "Scale runner resources to handle timeout issues",
            RepairAction.ROLLBACK_COMMIT: "Rollback recent commit that may have introduced the failure",
            RepairAction.FIX_CONFIG: "Fix configuration that may be causing the failure",
            RepairAction.FIX_DEPENDENCY: "Update or fix dependencies to resolve compatibility issues",
            RepairAction.MANUAL_REQUIRED: "Manual investigation required due to complex failure",
        }
        return reasons.get(action, "Unknown action")

    def _calculate_confidence(self, _action: RepairAction, _error_type: str) -> float:
        """Return the confidence of a repair recommendation.

        2026-03-29 fix: rule matching is not AI analysis, so this uniformly
        returns 0.0, following the feedback_confidence_truthfulness.md rule.
        Both arguments are ignored.
        """
        # Rule matching / rule-engine judgement, not AI analysis.
        return 0.0

    def _estimate_duration(self, action: RepairAction) -> int:
        """Estimate how long a repair takes, in seconds.

        Returns:
            The estimate for ``action``, or 300 for an unmapped action.
        """
        durations = {
            RepairAction.RETRY_WORKFLOW: 300,  # 5 minutes
            RepairAction.CLEAR_CACHE: 30,
            RepairAction.RESTART_RUNNER: 60,
            RepairAction.SCALE_RESOURCE: 120,
            RepairAction.ROLLBACK_COMMIT: 180,
            RepairAction.FIX_CONFIG: 600,
            RepairAction.FIX_DEPENDENCY: 300,
            RepairAction.MANUAL_REQUIRED: 3600,
        }
        return durations.get(action, 300)

    def _determine_overall_risk(
        self,
        recommendations: list[RepairRecommendation],
        intent_risk: RiskLevel,
        complexity_score: int,
    ) -> RiskLevel:
        """Determine the overall risk level of a failure.

        Args:
            recommendations: Recommendations produced for the failure.
            intent_risk: Risk level reported by the intent classifier.
            complexity_score: Score reported by the complexity scorer.

        Returns:
            CRITICAL when there are no recommendations; otherwise the lowest
            recommendation risk, raised by complexity, or the intent risk
            when that is higher.
        """
        if not recommendations:
            return RiskLevel.CRITICAL

        # Use the lowest-risk recommendation as the baseline.
        min_risk = min(
            recommendations,
            key=lambda rec: self._risk_rank(rec.risk_level),
        ).risk_level

        # High complexity raises the baseline by at most one tier: the two
        # branches are mutually exclusive, so LOW never jumps to HIGH here.
        if complexity_score >= 4 and min_risk == RiskLevel.LOW:
            min_risk = RiskLevel.MEDIUM
        elif complexity_score >= 5 and min_risk == RiskLevel.MEDIUM:
            min_risk = RiskLevel.HIGH

        # If the intent classification reports a higher risk, take that one.
        # Unknown levels rank as 0 in this comparison.
        if self._risk_rank(intent_risk, 0) > self._risk_rank(min_risk, 0):
            return intent_risk

        return min_risk

    def _apply_special_rules(
        self,
        execution_decision: ExecutionDecision,
        error_type: str,
        workflow_name: str,
        repo: str,
    ) -> ExecutionDecision:
        """Apply the special-rule overrides to an execution decision.

        The rules are applied in sequence and only ever tighten the decision,
        so a deploy failure on a production workflow ends at
        APPROVAL_REQUIRED instead of stopping at TELEGRAM_CONFIRM.

        Args:
            execution_decision: Decision derived from the overall risk level.
            error_type: Error type, compared case-insensitively.
            workflow_name: Workflow name, scanned case-insensitively for
                production keywords.
            repo: Repository name (not used by the current rules).

        Returns:
            The possibly stricter execution decision.
        """
        decision = execution_decision

        # Production-related workflows are never repaired unattended.
        production_keywords = ["prod", "production", "release", "deploy"]
        lowered_workflow = workflow_name.lower()
        if decision == ExecutionDecision.AUTO_EXECUTE and any(
            kw in lowered_workflow for kw in production_keywords
        ):
            decision = ExecutionDecision.TELEGRAM_CONFIRM

        # Deploy failures (whose repair may be a rollback) always need
        # approval. Compared case-insensitively to match the lookup in
        # ERROR_TYPE_REPAIR_MAP.
        if error_type.lower() == "deploy" and decision in (
            ExecutionDecision.AUTO_EXECUTE,
            ExecutionDecision.TELEGRAM_CONFIRM,
        ):
            decision = ExecutionDecision.APPROVAL_REQUIRED

        return decision

    def _generate_decision_reason(
        self,
        execution_decision: ExecutionDecision,
        overall_risk: RiskLevel,
        error_type: str,
    ) -> str:
        """Generate the explanation text for a decision.

        The wording is selected by ``execution_decision`` alone;
        ``overall_risk`` is accepted but not used.

        Returns:
            The reason string, or ``"Unknown decision"`` for an unmapped
            decision.
        """
        reasons = {
            ExecutionDecision.AUTO_EXECUTE: f"Low risk {error_type} failure, safe for auto-repair",
            ExecutionDecision.TELEGRAM_CONFIRM: f"Medium risk {error_type} failure, quick Telegram confirmation recommended",
            ExecutionDecision.APPROVAL_REQUIRED: f"High risk {error_type} failure, human approval required before repair",
            ExecutionDecision.BLOCKED: f"Critical {error_type} failure, auto-repair blocked for safety",
        }
        return reasons.get(execution_decision, "Unknown decision")
|
||
|
||
|
||
# =============================================================================
|
||
# Singleton
|
||
# =============================================================================
|
||
|
||
|
||
# Module-level holder for the shared service; stays None until first requested.
_ci_auto_repair_service: CIAutoRepairService | None = None


def get_ci_auto_repair_service() -> CIAutoRepairService:
    """Return the shared CI Auto-Repair Service, creating it on first use."""
    global _ci_auto_repair_service
    if _ci_auto_repair_service is not None:
        return _ci_auto_repair_service
    _ci_auto_repair_service = CIAutoRepairService()
    return _ci_auto_repair_service
|