Files
awoooi/apps/api/src/services/ci_auto_repair.py
Your Name 025a493f06
Some checks failed
run-migration / migrate (push) Failing after 12s
CD Pipeline / build-and-deploy (push) Has been cancelled
feat(p3.2+adr-100): Model Version Tracker + SLO 自治 + KB rot cleaner
Wave 8 P3.2 模型版本追蹤 + ADR-100 SLO 自我治理 + 配套:

P3.2 — Model Version Tracking:
- model_version_probe.py (268 行) — 探測 Ollama / OpenRouter 等 provider 的 model version
- model_version_tracker.py (101 行) — 對齊 PG provider_version_history 表
- migrations/p3_2_provider_version_history.sql + rollback — 25 行 schema
- db/models.py +32 行 — ProviderVersionHistory ORM

ADR-100 — AI 自主化 SLO:
- docs/adr/ADR-100-ai-autonomous-slo.md (167 行) — 飛輪 SLO 設計與閾值
- ops/monitoring/slo-rules.yml (254 行) — Prometheus SLO recording rules + alerts
- ops/monitoring/tests/test_slo_rules.yaml (242 行) — promtool unit tests

整合修改:
- main.py +72 行 — Lifespan 啟動 model_version_probe + KB rot cleaner schedule
- gitea_webhook.py +45 行 — webhook 接收 model 版本變化通知
- ci_auto_repair.py / evidence_snapshot.py / pre_decision_investigator.py — 配合接線

新測試:
- test_kb_rot_cleaner_schedule.py (120 行) — 9 tests pass
- test_slo_rules.yaml — promtool 驗收

Tests: 9 passed (test_kb_rot_cleaner_schedule)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-Authored-By: Multiple Engineers (P3.2 + ADR-100) <noreply@anthropic.com>
2026-04-27 14:54:19 +08:00

470 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
CI Auto-Repair Service - Phase 13.1 #78
========================================
CI 失敗自動修復服務,根據風險分級決定執行策略
策略:
- LOW: 自動執行修復 (如重啟 Runner、清理快取)
- MEDIUM: 發送 Telegram 確認,快速批准後執行
- HIGH: 建立 Approval,等待人工審核
- CRITICAL: 禁止自動修復,僅通知
整合:
- Intent Classifier: 判斷修復意圖類型
- Complexity Scorer: 評估修復複雜度
- AI Router: 選擇最適 AI 進行分析
版本: v1.0
建立: 2026-03-26 16:50 (台北時區)
建立者: Claude Code
"""
from __future__ import annotations

import shlex
from dataclasses import dataclass
from enum import Enum

import structlog

from src.services.complexity_scorer import get_complexity_scorer
from src.services.intent_classifier import IntentType, RiskLevel, get_intent_classifier
logger = structlog.get_logger(__name__)
# =============================================================================
# Types
# =============================================================================
class RepairAction(Enum):
    """Kinds of repair the service can recommend for a failed CI run.

    The string values are the stable identifiers of each action.
    """

    # Runner / cache / re-run level actions.
    RESTART_RUNNER = "restart_runner"
    CLEAR_CACHE = "clear_cache"
    RETRY_WORKFLOW = "retry_workflow"

    # Actions that change repository state, configuration or capacity.
    ROLLBACK_COMMIT = "rollback_commit"
    FIX_CONFIG = "fix_config"
    FIX_DEPENDENCY = "fix_dependency"
    SCALE_RESOURCE = "scale_resource"

    # No automated action available; a human has to investigate.
    MANUAL_REQUIRED = "manual_required"
class ExecutionDecision(Enum):
    """How a recommended repair is allowed to be carried out."""

    # Run the repair immediately, with no human in the loop.
    AUTO_EXECUTE = "auto_execute"
    # Ask for a quick confirmation over Telegram before running.
    TELEGRAM_CONFIRM = "telegram_confirm"
    # Create an Approval and wait for it to be reviewed.
    APPROVAL_REQUIRED = "approval_required"
    # Never execute automatically; notify only.
    BLOCKED = "blocked"
@dataclass
class RepairRecommendation:
    """A single proposed repair action together with its risk assessment."""

    # Which repair to perform.
    action: RepairAction
    # Shell command implementing the repair; None when no fixed command exists.
    command: str | None
    # Human-readable justification for proposing this action.
    reason: str
    # Risk assigned to this particular action.
    risk_level: RiskLevel
    # Execution policy derived from ``risk_level``.
    execution_decision: ExecutionDecision
    # Confidence attached to the recommendation.
    confidence: float
    # Rough expected run time of the repair, in seconds.
    estimated_duration_seconds: int
    # Command that undoes the repair, when one is known.
    rollback_command: str | None = None
@dataclass
class CIRepairDecision:
    """Overall outcome of evaluating one CI failure."""

    # False when automatic repair is blocked.
    should_repair: bool
    # Execution policy for the failure as a whole.
    execution_decision: ExecutionDecision
    # Candidate repairs for this failure.
    recommendations: list[RepairRecommendation]
    # Aggregated risk level of the failure.
    risk_level: RiskLevel
    # Score reported by the complexity scorer.
    complexity_score: int
    # Intent reported by the intent classifier.
    intent_type: IntentType
    # Human-readable explanation of the decision.
    reason: str
    # Extra diagnostic details attached to the decision.
    metadata: dict
# =============================================================================
# Repair Strategy Mapping
# =============================================================================
# CI error type -> candidate repair actions.
# Keys are lower-case; the "unknown" entry offers manual handling only.
ERROR_TYPE_REPAIR_MAP: dict[str, list[RepairAction]] = {
    "build": [RepairAction.CLEAR_CACHE, RepairAction.FIX_DEPENDENCY],
    "test": [RepairAction.RETRY_WORKFLOW, RepairAction.FIX_CONFIG],
    "lint": [RepairAction.RETRY_WORKFLOW],
    "deploy": [RepairAction.ROLLBACK_COMMIT, RepairAction.FIX_CONFIG],
    "timeout": [RepairAction.RESTART_RUNNER, RepairAction.SCALE_RESOURCE],
    "runner": [RepairAction.RESTART_RUNNER],
    "dependency": [RepairAction.CLEAR_CACHE, RepairAction.FIX_DEPENDENCY],
    "unknown": [RepairAction.MANUAL_REQUIRED],
}
# Repair action -> intrinsic risk level of performing it.
ACTION_RISK_MAP: dict[RepairAction, RiskLevel] = {
    # Low risk
    RepairAction.RETRY_WORKFLOW: RiskLevel.LOW,
    RepairAction.CLEAR_CACHE: RiskLevel.LOW,
    RepairAction.RESTART_RUNNER: RiskLevel.LOW,
    # Medium risk
    RepairAction.FIX_CONFIG: RiskLevel.MEDIUM,
    RepairAction.FIX_DEPENDENCY: RiskLevel.MEDIUM,
    RepairAction.SCALE_RESOURCE: RiskLevel.MEDIUM,
    # High risk
    RepairAction.ROLLBACK_COMMIT: RiskLevel.HIGH,
    # Critical: no automated action exists
    RepairAction.MANUAL_REQUIRED: RiskLevel.CRITICAL,
}
# Risk level -> execution policy applied to repairs of that risk.
RISK_EXECUTION_MAP: dict[RiskLevel, ExecutionDecision] = {
    RiskLevel.LOW: ExecutionDecision.AUTO_EXECUTE,
    RiskLevel.MEDIUM: ExecutionDecision.TELEGRAM_CONFIRM,
    RiskLevel.HIGH: ExecutionDecision.APPROVAL_REQUIRED,
    RiskLevel.CRITICAL: ExecutionDecision.BLOCKED,
}
# =============================================================================
# CI Auto-Repair Service
# =============================================================================
class CIAutoRepairService:
    """Evaluate failed CI runs and decide how (or whether) to repair them.

    The rule tables (error type -> actions -> risk -> execution policy) are
    combined with the intent classifier's risk level and the complexity
    scorer's score to produce a single ``CIRepairDecision``.
    """

    def __init__(self):
        self._intent_classifier = get_intent_classifier()
        self._complexity_scorer = get_complexity_scorer()

    async def evaluate_repair(
        self,
        error_type: str,
        workflow_name: str,
        repo: str,
        failure_context: dict,
        diagnosis_summary: str | None = None,
    ) -> CIRepairDecision:
        """Work out the repair strategy for one CI failure.

        Args:
            error_type: Failure category (build/test/lint/deploy/timeout/
                runner/dependency); anything else is treated as requiring
                manual handling. Matching is case-insensitive.
            workflow_name: Name of the failed workflow.
            repo: Repository name.
            failure_context: Extra context passed to the complexity scorer.
                Its keys are merged last, so they override ``error_type``,
                ``workflow_name`` and ``repo`` in the scorer context.
                ``None`` is tolerated and treated as empty.
            diagnosis_summary: Optional AI diagnosis summary.

        Returns:
            The resulting ``CIRepairDecision``.
        """
        logger.info(
            "ci_repair_evaluation_started",
            error_type=error_type,
            workflow_name=workflow_name,
            repo=repo,
        )

        # 1. Build the text fed to the classifier and the scorer.
        analysis_text = self._build_analysis_text(
            error_type=error_type,
            workflow_name=workflow_name,
            diagnosis_summary=diagnosis_summary,
        )

        # 2. Intent classification. classify() is an async method and must be
        #    awaited (missing await fixed 2026-04-27, P3.1-T3).
        intent_result = await self._intent_classifier.classify(analysis_text)

        # 3. Complexity scoring.
        complexity_result = self._complexity_scorer.score(
            text=analysis_text,
            context={
                "error_type": error_type,
                "workflow_name": workflow_name,
                "repo": repo,
                # Guard against callers passing None instead of a dict.
                **(failure_context or {}),
            },
        )

        # 4. Candidate actions for this error type.
        possible_actions = ERROR_TYPE_REPAIR_MAP.get(
            error_type.lower(),
            [RepairAction.MANUAL_REQUIRED],
        )

        # 5. Turn the actions into concrete recommendations.
        recommendations = self._generate_recommendations(
            possible_actions=possible_actions,
            error_type=error_type,
            workflow_name=workflow_name,
            complexity_score=complexity_result.score,
        )

        # 6. Aggregate risk and map it to an execution policy.
        overall_risk = self._determine_overall_risk(
            recommendations=recommendations,
            intent_risk=intent_result.risk_level,
            complexity_score=complexity_result.score,
        )
        execution_decision = RISK_EXECUTION_MAP.get(
            overall_risk,
            ExecutionDecision.APPROVAL_REQUIRED,
        )

        # 7. Special-case overrides (production workflows, deploy failures).
        execution_decision = self._apply_special_rules(
            execution_decision=execution_decision,
            error_type=error_type,
            workflow_name=workflow_name,
            repo=repo,
        )

        decision = CIRepairDecision(
            should_repair=execution_decision != ExecutionDecision.BLOCKED,
            execution_decision=execution_decision,
            recommendations=recommendations,
            risk_level=overall_risk,
            complexity_score=complexity_result.score,
            intent_type=intent_result.intent,
            reason=self._generate_decision_reason(
                execution_decision=execution_decision,
                overall_risk=overall_risk,
                error_type=error_type,
            ),
            metadata={
                "intent_confidence": intent_result.confidence,
                "complexity_factors": complexity_result.factors,
                "workflow_name": workflow_name,
                "repo": repo,
            },
        )
        logger.info(
            "ci_repair_evaluation_completed",
            should_repair=decision.should_repair,
            execution_decision=execution_decision.value,
            risk_level=overall_risk.value,
            recommendations_count=len(recommendations),
        )
        return decision

    @staticmethod
    def _risk_rank(level: RiskLevel, default: int) -> int:
        """Return the ordering rank of ``level`` (LOW=0 ... CRITICAL=3).

        ``default`` is returned for any value that is not one of the four
        known levels, so each call site chooses how unknown levels compare.
        """
        order = {
            RiskLevel.LOW: 0,
            RiskLevel.MEDIUM: 1,
            RiskLevel.HIGH: 2,
            RiskLevel.CRITICAL: 3,
        }
        return order.get(level, default)

    def _build_analysis_text(
        self,
        error_type: str,
        workflow_name: str,
        diagnosis_summary: str | None,
    ) -> str:
        """Build the sentence-style text used for intent classification."""
        parts = [
            f"CI workflow '{workflow_name}' failed",
            f"Error type: {error_type}",
        ]
        if diagnosis_summary:
            parts.append(f"Diagnosis: {diagnosis_summary}")
        return ". ".join(parts)

    def _generate_recommendations(
        self,
        possible_actions: list[RepairAction],
        error_type: str,
        workflow_name: str,
        complexity_score: int,
    ) -> list[RepairRecommendation]:
        """Create one recommendation per action, sorted lowest risk first."""
        recommendations = []
        for action in possible_actions:
            risk = ACTION_RISK_MAP.get(action, RiskLevel.HIGH)
            # Complex failures promote MEDIUM-risk actions to HIGH.
            if complexity_score >= 4 and risk == RiskLevel.MEDIUM:
                risk = RiskLevel.HIGH
            command, rollback = self._get_repair_command(
                action=action,
                workflow_name=workflow_name,
            )
            recommendations.append(RepairRecommendation(
                action=action,
                command=command,
                reason=self._get_action_reason(action, error_type),
                risk_level=risk,
                execution_decision=RISK_EXECUTION_MAP.get(
                    risk, ExecutionDecision.APPROVAL_REQUIRED
                ),
                confidence=self._calculate_confidence(action, error_type),
                estimated_duration_seconds=self._estimate_duration(action),
                rollback_command=rollback,
            ))
        # Stable sort: lowest risk first, unknown levels last.
        recommendations.sort(key=lambda rec: self._risk_rank(rec.risk_level, 99))
        return recommendations

    def _get_repair_command(
        self,
        action: RepairAction,
        workflow_name: str,
    ) -> tuple[str | None, str | None]:
        """Return ``(repair_command, rollback_command)`` for ``action``.

        Either element is ``None`` when no fixed command exists. The workflow
        name is shell-quoted before interpolation so that names containing
        spaces or shell metacharacters yield a single, safe argument.
        """
        safe_workflow = shlex.quote(workflow_name)
        commands: dict[RepairAction, tuple[str | None, str | None]] = {
            RepairAction.RETRY_WORKFLOW: (
                f"gh workflow run {safe_workflow}",
                None,
            ),
            RepairAction.CLEAR_CACHE: (
                "gh cache delete --all",
                None,
            ),
            RepairAction.RESTART_RUNNER: (
                "sudo systemctl restart actions.runner.*",
                None,
            ),
            RepairAction.SCALE_RESOURCE: (
                "kubectl scale deployment/actions-runner --replicas=3",
                "kubectl scale deployment/actions-runner --replicas=2",
            ),
            RepairAction.ROLLBACK_COMMIT: (
                "git revert HEAD --no-edit && git push",
                # Undoing a revert is a revert of the new HEAD, hence the same
                # command (assuming nothing else was pushed in between).
                "git revert HEAD --no-edit && git push",
            ),
            RepairAction.FIX_CONFIG: (
                None,  # the concrete command has to be generated by AI
                None,
            ),
            RepairAction.FIX_DEPENDENCY: (
                "pnpm install --force && uv sync",
                None,
            ),
            RepairAction.MANUAL_REQUIRED: (
                None,
                None,
            ),
        }
        return commands.get(action, (None, None))

    def _get_action_reason(self, action: RepairAction, error_type: str) -> str:
        """Return the human-readable justification for ``action``."""
        reasons = {
            RepairAction.RETRY_WORKFLOW: f"Retry workflow to recover from transient {error_type} failure",
            RepairAction.CLEAR_CACHE: "Clear build/dependency cache to resolve potential cache corruption",
            RepairAction.RESTART_RUNNER: "Restart GitHub Actions runner to recover from runner issues",
            RepairAction.SCALE_RESOURCE: "Scale runner resources to handle timeout issues",
            RepairAction.ROLLBACK_COMMIT: "Rollback recent commit that may have introduced the failure",
            RepairAction.FIX_CONFIG: "Fix configuration that may be causing the failure",
            RepairAction.FIX_DEPENDENCY: "Update or fix dependencies to resolve compatibility issues",
            RepairAction.MANUAL_REQUIRED: "Manual investigation required due to complex failure",
        }
        return reasons.get(action, "Unknown action")

    def _calculate_confidence(self, _action: RepairAction, _error_type: str) -> float:
        """Return the confidence of a rule-based recommendation.

        Always 0.0: these recommendations come from rule matching, not AI
        analysis, so no confidence is claimed (changed 2026-03-29 per the
        feedback_confidence_truthfulness.md rule).
        """
        return 0.0

    def _estimate_duration(self, action: RepairAction) -> int:
        """Return the estimated repair time in seconds (300 if unknown)."""
        durations = {
            RepairAction.RETRY_WORKFLOW: 300,  # 5 minutes
            RepairAction.CLEAR_CACHE: 30,
            RepairAction.RESTART_RUNNER: 60,
            RepairAction.SCALE_RESOURCE: 120,
            RepairAction.ROLLBACK_COMMIT: 180,
            RepairAction.FIX_CONFIG: 600,
            RepairAction.FIX_DEPENDENCY: 300,
            RepairAction.MANUAL_REQUIRED: 3600,
        }
        return durations.get(action, 300)

    def _determine_overall_risk(
        self,
        recommendations: list[RepairRecommendation],
        intent_risk: RiskLevel,
        complexity_score: int,
    ) -> RiskLevel:
        """Aggregate the per-action, complexity and intent risk signals.

        The lowest-risk recommendation is the baseline; it is raised one
        step for complex failures and then replaced by the intent
        classifier's risk if that one ranks higher. With no recommendations
        the result is CRITICAL.
        """
        if not recommendations:
            return RiskLevel.CRITICAL

        # Baseline: the least risky of the recommended actions.
        base_risk = min(
            recommendations,
            key=lambda rec: self._risk_rank(rec.risk_level, 99),
        ).risk_level

        # High complexity bumps the baseline by (at most) one level.
        if complexity_score >= 4 and base_risk == RiskLevel.LOW:
            base_risk = RiskLevel.MEDIUM
        elif complexity_score >= 5 and base_risk == RiskLevel.MEDIUM:
            base_risk = RiskLevel.HIGH

        # The intent classifier can only raise the risk, never lower it.
        # Levels outside the known four rank as 0 here.
        if self._risk_rank(intent_risk, 0) > self._risk_rank(base_risk, 0):
            return intent_risk
        return base_risk

    def _apply_special_rules(
        self,
        execution_decision: ExecutionDecision,
        error_type: str,
        workflow_name: str,
        repo: str,
    ) -> ExecutionDecision:
        """Apply the hard-coded policy overrides to ``execution_decision``.

        The rules are applied in sequence, so a later rule sees the result
        of an earlier one:

        1. Workflows whose name contains a production keyword are never
           auto-executed; AUTO_EXECUTE becomes TELEGRAM_CONFIRM.
        2. Deploy failures (case-insensitive, matching the action lookup)
           always need at least APPROVAL_REQUIRED.

        ``repo`` is accepted for interface compatibility and is not used.
        """
        decision = execution_decision

        production_keywords = ["prod", "production", "release", "deploy"]
        workflow_lower = workflow_name.lower()
        if decision == ExecutionDecision.AUTO_EXECUTE and any(
            kw in workflow_lower for kw in production_keywords
        ):
            decision = ExecutionDecision.TELEGRAM_CONFIRM

        # No early return above: a production workflow that failed on deploy
        # must still be escalated to a full approval.
        if error_type.lower() == "deploy" and decision in (
            ExecutionDecision.AUTO_EXECUTE,
            ExecutionDecision.TELEGRAM_CONFIRM,
        ):
            decision = ExecutionDecision.APPROVAL_REQUIRED

        return decision

    def _generate_decision_reason(
        self,
        execution_decision: ExecutionDecision,
        overall_risk: RiskLevel,
        error_type: str,
    ) -> str:
        """Return the explanation text for ``execution_decision``.

        The wording is selected by the execution decision alone;
        ``overall_risk`` is currently not used.
        """
        reasons = {
            ExecutionDecision.AUTO_EXECUTE: f"Low risk {error_type} failure, safe for auto-repair",
            ExecutionDecision.TELEGRAM_CONFIRM: f"Medium risk {error_type} failure, quick Telegram confirmation recommended",
            ExecutionDecision.APPROVAL_REQUIRED: f"High risk {error_type} failure, human approval required before repair",
            ExecutionDecision.BLOCKED: f"Critical {error_type} failure, auto-repair blocked for safety",
        }
        return reasons.get(execution_decision, "Unknown decision")
# =============================================================================
# Singleton
# =============================================================================
# Process-wide instance, created on first request.
_ci_auto_repair_service: CIAutoRepairService | None = None


def get_ci_auto_repair_service() -> CIAutoRepairService:
    """Return the shared CIAutoRepairService, constructing it on first use."""
    global _ci_auto_repair_service
    service = _ci_auto_repair_service
    if service is None:
        service = CIAutoRepairService()
        _ci_auto_repair_service = service
    return service