feat(api): Phase 13 智能路由 + CI/CD 整合 (#74-88)
Phase 13.1 CI/CD Integration: - #76 workflow_run handler for CI failure diagnosis - #77 SignOz log query (query_logs, error_logs_summary MCP) - #78 CIAutoRepairService with risk-based execution decisions Phase 13.3 Smart Routing: - #85 Intent Classifier v2.0 (rule engine + LLM fallback) - #86 Complexity Scorer (9-dimension scoring) - #87 AI Router v3.0 (routing decision matrix) - #88 Token Counter (OTEL + Langfuse integration) New files: - services/ci_auto_repair.py (risk stratification) - services/model_registry.py (centralized model config) - services/token_counter.py (677 lines) - Skill 08: Model Router Expert - Skill 09: Strangler Pattern Expert - ADR-023: Smart Routing Architecture - ADR-024: API Layer Architecture Tests: - phase11-conversational.spec.ts (E2E tests) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,16 +1,21 @@
|
||||
"""
|
||||
AWOOOI API - GitHub Webhook Handler
|
||||
====================================
|
||||
Phase 13.1: GitHub PR/Push → OpenClaw AI 代碼審查整合
|
||||
Phase 13.1: GitHub PR/Push/CI → OpenClaw AI 整合
|
||||
|
||||
整合流程:
|
||||
1. GitHub Webhook (PR/Push) → AWOOOI API
|
||||
1. GitHub Webhook (PR/Push/Workflow) → AWOOOI API
|
||||
2. HMAC-SHA256 簽章驗證 (X-Hub-Signature-256)
|
||||
3. 解析 PR diff / Push commits
|
||||
4. 呼叫 OpenClaw 進行 AI 代碼審查
|
||||
3. 解析 PR diff / Push commits / Workflow failure
|
||||
4. 呼叫 OpenClaw 進行 AI 代碼審查 / CI 失敗診斷
|
||||
5. 儲存審查結果到 Redis
|
||||
6. 發送 Telegram 通知
|
||||
7. (可選) 回寫 GitHub PR Comment
|
||||
7. (可選) 建立 Approval 等待人工確認
|
||||
|
||||
支援事件:
|
||||
- pull_request: PR 代碼審查 (#74-75)
|
||||
- push: 主分支推送審查 (#74-75)
|
||||
- workflow_run: CI 失敗診斷 (#76)
|
||||
|
||||
安全要求 (feedback_openclaw_security.md):
|
||||
- HMAC 簽章驗證 (X-Hub-Signature-256)
|
||||
@@ -19,6 +24,11 @@ Phase 13.1: GitHub PR/Push → OpenClaw AI 代碼審查整合
|
||||
- 倉庫白名單驗證
|
||||
|
||||
🔴 HARD RULE: 時間顯示使用 Asia/Taipei (UTC+8)
|
||||
|
||||
版本: v2.0
|
||||
最後修改: 2026-03-26 16:30 (台北時區)
|
||||
修改者: Claude Code
|
||||
變更: Phase 13.1 #76 CI 失敗診斷
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
@@ -109,6 +119,48 @@ class GitHubCommit(BaseModel):
|
||||
modified: list[str] = []
|
||||
|
||||
|
||||
class GitHubWorkflowRun(BaseModel):
    """Fields of a GitHub ``workflow_run`` webhook object (Phase 13.1 #76).

    Declares the attributes read by the CI failure diagnosis flow.
    """

    id: int
    name: str
    # Lifecycle state: queued, in_progress, completed
    status: str
    # Outcome once completed: success, failure, cancelled, skipped, timed_out
    conclusion: str | None = None
    html_url: str
    run_number: int
    run_attempt: int = 1
    head_sha: str
    head_branch: str | None = None
    # Event that triggered the run: push, pull_request, schedule, workflow_dispatch
    event: str
    created_at: str
    updated_at: str
    # API URL for the run logs (requires auth)
    logs_url: str | None = None
|
||||
|
||||
|
||||
class GitHubWorkflowJob(BaseModel):
    """Fields of a GitHub workflow job object carried by a webhook payload."""

    id: int
    name: str
    status: str
    conclusion: str | None = None
    started_at: str | None = None
    completed_at: str | None = None
    # Raw step dictionaries exactly as delivered; no per-step schema is enforced here.
    steps: list[dict] = []
|
||||
|
||||
|
||||
class CIFailureDiagnosis(BaseModel):
    """Result of a CI failure diagnosis (Phase 13.1 #76).

    ``summary``, ``root_cause``, ``error_type``, ``analyzed_by`` and
    ``confidence`` are required; ``confidence`` is validated to lie in [0, 1].
    """

    summary: str = Field(..., description="失敗摘要")
    root_cause: str = Field(..., description="根本原因分析")
    failed_step: str | None = Field(None, description="失敗的步驟名稱")
    error_type: str = Field(..., description="錯誤類型 (build/test/lint/deploy/timeout)")
    suggestions: list[str] = Field(default=[], description="修復建議")
    auto_fixable: bool = Field(False, description="是否可自動修復")
    fix_command: str | None = Field(None, description="自動修復指令 (如可自動修復)")
    risk_level: str = Field("medium", description="風險等級 (low/medium/high/critical)")
    analyzed_by: str = Field(..., description="分析模型")
    confidence: float = Field(..., ge=0, le=1, description="信心度")
|
||||
|
||||
|
||||
class GitHubWebhookPayload(BaseModel):
|
||||
"""GitHub Webhook Payload (通用)"""
|
||||
action: str | None = None # PR: opened, synchronize, etc.
|
||||
@@ -122,6 +174,9 @@ class GitHubWebhookPayload(BaseModel):
|
||||
after: str | None = None # current commit SHA
|
||||
commits: list[GitHubCommit] | None = None
|
||||
pusher: dict | None = None
|
||||
# Workflow Run 事件 (Phase 13.1 #76)
|
||||
workflow_run: GitHubWorkflowRun | None = None
|
||||
workflow_job: GitHubWorkflowJob | None = None
|
||||
|
||||
|
||||
class CodeReviewResult(BaseModel):
|
||||
@@ -355,6 +410,14 @@ async def handle_github_webhook(
|
||||
delivery_id=x_github_delivery,
|
||||
)
|
||||
|
||||
# Workflow Run 事件 (Phase 13.1 #76 CI 失敗診斷)
|
||||
elif x_github_event == "workflow_run":
|
||||
return await handle_workflow_run(
|
||||
payload=payload,
|
||||
background_tasks=background_tasks,
|
||||
delivery_id=x_github_delivery,
|
||||
)
|
||||
|
||||
# Ping 事件 (GitHub 測試連線)
|
||||
elif x_github_event == "ping":
|
||||
return GitHubWebhookResponse(
|
||||
@@ -505,6 +568,70 @@ async def handle_push(
|
||||
)
|
||||
|
||||
|
||||
async def handle_workflow_run(
    payload: GitHubWebhookPayload,
    background_tasks: BackgroundTasks,
    delivery_id: str | None,
) -> GitHubWebhookResponse:
    """Handle a ``workflow_run`` webhook event (Phase 13.1 #76, CI failure diagnosis).

    Only runs whose status is ``completed`` and whose conclusion is
    ``failure`` or ``timed_out`` are diagnosed; every other delivery is
    answered with an ``ignored`` response.

    Args:
        payload: Webhook payload; ``payload.workflow_run`` may be missing.
        background_tasks: Queue the diagnosis job is added to.
        delivery_id: GitHub delivery ID (not used by this handler).

    Returns:
        An ``ignored`` response carrying the skip reason, or an ``accepted``
        response whose ``review_id`` is the generated diagnosis ID.
    """
    run = payload.workflow_run

    # Decide whether this delivery is skipped, and with which message.
    skip_reason: str | None = None
    if not run:
        skip_reason = "No workflow_run in payload"
    elif run.status != "completed":
        skip_reason = f"Workflow status '{run.status}' not completed"
    elif run.conclusion not in ("failure", "timed_out"):
        skip_reason = f"Workflow conclusion '{run.conclusion}' is not failure"

    if skip_reason is not None:
        return GitHubWebhookResponse(
            status="ignored",
            message=skip_reason,
            event_type="workflow_run",
        )

    # The random suffix keeps IDs distinct across repeated deliveries of one run.
    diagnosis_id = f"gh-ci-{payload.repository.id}-{run.id}-{uuid.uuid4().hex[:8]}"

    # Run the diagnosis in the background so the webhook can answer immediately.
    background_tasks.add_task(
        diagnose_ci_failure,
        repo=payload.repository,
        workflow_run=run,
        sender=payload.sender,
        diagnosis_id=diagnosis_id,
    )

    logger.info(
        "github_ci_failure_diagnosis_scheduled",
        diagnosis_id=diagnosis_id,
        repo=payload.repository.full_name,
        workflow_name=run.name,
        workflow_id=run.id,
        conclusion=run.conclusion,
        head_sha=run.head_sha[:8],
    )

    return GitHubWebhookResponse(
        status="accepted",
        message=f"CI failure diagnosis scheduled for '{run.name}'",
        event_type="workflow_run",
        review_id=diagnosis_id,
    )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Background Tasks: AI Review
|
||||
# =============================================================================
|
||||
@@ -691,6 +818,143 @@ async def review_push(
|
||||
)
|
||||
|
||||
|
||||
async def diagnose_ci_failure(
    repo: GitHubRepository,
    workflow_run: GitHubWorkflowRun,
    sender: GitHubUser,
    diagnosis_id: str,
):
    """Background task: diagnose a failed CI workflow run (Phase 13.1 #76).

    Steps:
      1. Collect the workflow failure context.
      2. Ask OpenClaw for a root-cause diagnosis.
      3. Evaluate the auto-repair strategy (Phase 13.1 #78).
      4. Store the result (including the repair decision) via the webhook service.
      5. Send the Telegram alert.
      6. Create an Approval or log auto-repair eligibility, per the decision.

    Repair evaluation (step 3) and persistence (step 4) are isolated: a
    failure in either is logged and the task carries on, so the Telegram
    alert is still sent when those supporting stages are unavailable.

    Args:
        repo: Repository the workflow belongs to.
        workflow_run: The failed workflow run.
        sender: User that triggered the event.
        diagnosis_id: ID under which the result is stored and reported.

    Returns:
        None. Exceptions are logged, never propagated to the caller.
    """
    try:
        logger.info(
            "github_ci_failure_diagnosis_started",
            diagnosis_id=diagnosis_id,
            repo=repo.full_name,
            workflow_name=workflow_run.name,
            workflow_id=workflow_run.id,
        )

        # 1. Collect failure information
        failure_context = {
            "workflow_name": workflow_run.name,
            "workflow_id": workflow_run.id,
            "run_number": workflow_run.run_number,
            "run_attempt": workflow_run.run_attempt,
            "conclusion": workflow_run.conclusion,
            "head_sha": workflow_run.head_sha,
            "head_branch": workflow_run.head_branch,
            "event_trigger": workflow_run.event,
            "html_url": workflow_run.html_url,
            "created_at": workflow_run.created_at,
            "updated_at": workflow_run.updated_at,
        }

        # 2. Ask OpenClaw for the CI failure diagnosis
        diagnosis = await call_openclaw_ci_diagnosis(
            repo_name=repo.full_name,
            failure_context=failure_context,
        )

        # 3. Evaluate the auto-repair strategy (Phase 13.1 #78).
        #    A failure here must not suppress the alert, so it is contained.
        repair_decision = None
        if diagnosis:
            try:
                from src.services.ci_auto_repair import get_ci_auto_repair_service

                repair_service = get_ci_auto_repair_service()
                repair_decision = await repair_service.evaluate_repair(
                    error_type=diagnosis.error_type,
                    workflow_name=workflow_run.name,
                    repo=repo.full_name,
                    failure_context=failure_context,
                    diagnosis_summary=diagnosis.summary,
                )
            except Exception as e:
                logger.exception(
                    "ci_auto_repair_evaluation_failed",
                    diagnosis_id=diagnosis_id,
                    error=str(e),
                )

        # 4. Persist the result (with the repair decision).
        #    A storage failure is logged but does not block the notification.
        try:
            service = get_github_webhook_service()
            await service.save_review_result(
                review_id=diagnosis_id,
                result={
                    "event_type": "workflow_run",
                    "repo": repo.full_name,
                    "target": f"CI: {workflow_run.name}",
                    "diagnosis": diagnosis.model_dump() if diagnosis else None,
                    "repair_decision": {
                        "should_repair": repair_decision.should_repair,
                        "execution_decision": repair_decision.execution_decision.value,
                        "risk_level": repair_decision.risk_level.value,
                        "reason": repair_decision.reason,
                        "recommendations": [
                            {"action": r.action.value, "command": r.command, "confidence": r.confidence}
                            for r in repair_decision.recommendations[:3]
                        ],
                    } if repair_decision else None,
                    "failure_context": failure_context,
                    "reviewed_at": now_taipei_iso(),
                },
                ttl=GITHUB_REVIEW_TTL_SECONDS,
            )
        except Exception as e:
            logger.exception(
                "github_ci_failure_diagnosis_save_failed",
                diagnosis_id=diagnosis_id,
                error=str(e),
            )

        # 5. Send the Telegram alert (with repair suggestions)
        await send_ci_failure_telegram_alert(
            diagnosis_id=diagnosis_id,
            repo=repo.full_name,
            workflow_name=workflow_run.name,
            workflow_url=workflow_run.html_url,
            sender=sender.login,
            diagnosis=diagnosis,
            repair_decision=repair_decision,
        )

        # 6. Create an Approval or note auto-repair eligibility, per the decision
        if repair_decision:
            from src.services.ci_auto_repair import ExecutionDecision

            if repair_decision.execution_decision == ExecutionDecision.APPROVAL_REQUIRED:
                await create_ci_failure_approval(
                    diagnosis_id=diagnosis_id,
                    repo=repo.full_name,
                    workflow_run=workflow_run,
                    diagnosis=diagnosis,
                )
            elif repair_decision.execution_decision == ExecutionDecision.AUTO_EXECUTE:
                # TODO: actually execute the repair command (later Phase 13.1 iteration)
                logger.info(
                    "ci_auto_repair_eligible",
                    diagnosis_id=diagnosis_id,
                    action=repair_decision.recommendations[0].action.value if repair_decision.recommendations else None,
                )
        # NOTE(review): assumed to pair with `if repair_decision` (fallback when no
        # decision exists, e.g. evaluation failed) — confirm against the original nesting.
        elif diagnosis and diagnosis.risk_level in ("high", "critical"):
            await create_ci_failure_approval(
                diagnosis_id=diagnosis_id,
                repo=repo.full_name,
                workflow_run=workflow_run,
                diagnosis=diagnosis,
            )

        logger.info(
            "github_ci_failure_diagnosis_completed",
            diagnosis_id=diagnosis_id,
            root_cause=diagnosis.root_cause if diagnosis else None,
            auto_fixable=diagnosis.auto_fixable if diagnosis else False,
            risk_level=diagnosis.risk_level if diagnosis else None,
            repair_decision=repair_decision.execution_decision.value if repair_decision else None,
        )

    except Exception as e:
        logger.exception(
            "github_ci_failure_diagnosis_failed",
            diagnosis_id=diagnosis_id,
            error=str(e),
        )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helper Functions
|
||||
# =============================================================================
|
||||
@@ -820,6 +1084,240 @@ async def call_openclaw_push_review(
|
||||
return None
|
||||
|
||||
|
||||
async def call_openclaw_ci_diagnosis(
    repo_name: str,
    failure_context: dict,
) -> CIFailureDiagnosis | None:
    """Ask OpenClaw to diagnose a CI failure (Phase 13.1 #76).

    Posts the failure context to ``/api/v1/analyze/ci-failure`` with a
    120 second timeout.

    Args:
        repo_name: Full repository name sent as ``repo``.
        failure_context: Workflow run details; the keys read here are
            ``workflow_name``, ``conclusion``, ``head_sha``, ``head_branch``,
            ``event_trigger``, ``run_number``, ``run_attempt`` and ``html_url``.

    Returns:
        The diagnosis parsed from a 200 response; a low-confidence fallback
        diagnosis on a non-200 response or on timeout; ``None`` on any other
        error (which is logged).
    """
    forwarded_keys = (
        "workflow_name",
        "conclusion",
        "head_sha",
        "head_branch",
        "event_trigger",
        "run_number",
        "run_attempt",
    )
    try:
        request_body = {"repo": repo_name}
        request_body.update((key, failure_context.get(key)) for key in forwarded_keys)
        request_body["workflow_url"] = failure_context.get("html_url")
        request_body["prefer_local"] = True  # prefer Ollama

        async with httpx.AsyncClient(timeout=120.0) as http:
            resp = await http.post(
                f"{OPENCLAW_URL}/api/v1/analyze/ci-failure",
                json=request_body,
            )

            if resp.status_code != 200:
                logger.warning(
                    "openclaw_ci_diagnosis_failed",
                    status=resp.status_code,
                    response=resp.text[:200],
                )
                # Basic diagnosis used as a fallback when the API call fails
                return CIFailureDiagnosis(
                    summary=f"CI workflow '{failure_context.get('workflow_name')}' failed",
                    root_cause="OpenClaw API unavailable, manual investigation required",
                    error_type="unknown",
                    suggestions=["Check workflow logs manually", "Verify runner status"],
                    auto_fixable=False,
                    risk_level="medium",
                    analyzed_by="fallback",
                    confidence=0.3,
                )

            return CIFailureDiagnosis(**resp.json())

    except httpx.TimeoutException:
        logger.warning("openclaw_ci_diagnosis_timeout")
        # NOTE(review): error_type "timeout" here describes the diagnosis call,
        # not the CI run; confirm downstream repair logic does not read it as a
        # CI timeout.
        return CIFailureDiagnosis(
            summary="CI diagnosis timeout",
            root_cause="OpenClaw API timeout",
            error_type="timeout",
            suggestions=["Check OpenClaw service status"],
            auto_fixable=False,
            risk_level="low",
            analyzed_by="fallback",
            confidence=0.1,
        )
    except Exception as e:
        logger.exception("openclaw_ci_diagnosis_error", error=str(e))
        return None
|
||||
|
||||
|
||||
# Characters that must be backslash-escaped outside entities in Telegram's
# legacy "Markdown" parse mode.
_TELEGRAM_MARKDOWN_ESCAPES = str.maketrans(
    {"_": "\\_", "*": "\\*", "`": "\\`", "[": "\\["}
)


def _escape_telegram_markdown(text) -> str:
    """Escape *text* so it is rendered literally in a legacy-Markdown message."""
    return str(text).translate(_TELEGRAM_MARKDOWN_ESCAPES)


def _telegram_code_safe(text) -> str:
    """Make *text* safe inside a backtick code span by replacing backticks."""
    return str(text).replace("`", "'")


async def send_ci_failure_telegram_alert(
    diagnosis_id: str,
    repo: str,
    workflow_name: str,
    workflow_url: str,
    sender: str,
    diagnosis: CIFailureDiagnosis | None,
    repair_decision=None,  # Phase 13.1 #78: CIRepairDecision
):
    """Send the CI failure diagnosis alert to Telegram (Phase 13.1 #76-78).

    The message is sent with ``parse_mode="Markdown"``. All dynamic text
    (repository, workflow name, sender, diagnosis fields, suggestions) is
    escaped first, because an unescaped ``_``, ``*``, backtick or ``[`` in a
    value such as ``org/my_repo`` can make the message unparseable.

    Args:
        diagnosis_id: Diagnosis ID shown at the bottom of the message.
        repo: Repository full name.
        workflow_name: Name of the failed workflow.
        workflow_url: Link target for the workflow run.
        sender: Login of the user that triggered the run.
        diagnosis: Diagnosis to render; when ``None`` only the header is sent.
        repair_decision: Optional repair decision; its ``execution_decision``
            and ``recommendations`` are rendered when present.

    Returns:
        None. Any failure is logged and swallowed (best-effort notification).
    """
    try:
        telegram = get_telegram_gateway()
        esc = _escape_telegram_markdown

        # Risk badge; defaults to the "medium" badge when there is no diagnosis
        risk_emoji = {
            "low": "🟢",
            "medium": "🟡",
            "high": "🟠",
            "critical": "🔴",
        }
        emoji = risk_emoji.get(diagnosis.risk_level if diagnosis else "medium", "🟡")

        # Repair decision status
        decision_text = "❓ 待評估"
        if repair_decision:
            decision_map = {
                "auto_execute": "🤖 自動修復中",
                "telegram_confirm": "📱 等待確認",
                "approval_required": "📋 需人工審核",
                "blocked": "🚫 禁止自動修復",
            }
            decision_text = decision_map.get(repair_decision.execution_decision.value, "❓ 未知")

        # NOTE(review): legacy Telegram Markdown uses single asterisks for bold;
        # the existing "**" markup is kept as-is — confirm the gateway's dialect.
        message_lines = [
            f"{emoji} **CI 失敗診斷** | {esc(repo)}",
            "",
            f"📋 **Workflow**: {esc(workflow_name)}",
            f"👤 **觸發者**: {esc(sender)}",
            f"🔗 [查看 Workflow]({workflow_url})",
            "",
        ]

        if diagnosis:
            message_lines.extend([
                f"**📝 摘要**: {esc(diagnosis.summary)}",
                f"**🔍 根因**: {esc(diagnosis.root_cause)}",
                f"**⚠️ 錯誤類型**: {esc(diagnosis.error_type)}",
                f"**🎯 風險等級**: {esc(diagnosis.risk_level.upper())}",
                f"**🔧 修復決策**: {decision_text}",
                "",
            ])

            if diagnosis.suggestions:
                message_lines.append("**💡 AI 建議**:")
                for i, suggestion in enumerate(diagnosis.suggestions[:3], 1):
                    message_lines.append(f" {i}. {esc(suggestion)}")

        # Repair options (Phase 13.1 #78)
        if repair_decision and repair_decision.recommendations:
            message_lines.extend(["", "**🔨 修復選項**:"])
            for i, rec in enumerate(repair_decision.recommendations[:2], 1):
                confidence_pct = int(rec.confidence * 100)
                action = _telegram_code_safe(rec.action.value)
                message_lines.append(
                    f" {i}. `{action}` ({confidence_pct}% 信心)"
                )
                if rec.command:
                    # Truncate long commands to 50 characters before rendering
                    if len(rec.command) > 50:
                        shown = _telegram_code_safe(rec.command[:50]) + "..."
                    else:
                        shown = _telegram_code_safe(rec.command)
                    message_lines.append(f" `{shown}`")

        message_lines.extend([
            "",
            f"🆔 `{_telegram_code_safe(diagnosis_id)}`",
        ])

        message = "\n".join(message_lines)

        await telegram.send_message(
            message=message,
            parse_mode="Markdown",
        )

        logger.info(
            "ci_failure_telegram_alert_sent",
            diagnosis_id=diagnosis_id,
            repo=repo,
            repair_decision=repair_decision.execution_decision.value if repair_decision else None,
        )

    except Exception as e:
        logger.exception(
            "ci_failure_telegram_alert_failed",
            diagnosis_id=diagnosis_id,
            error=str(e),
        )
|
||||
|
||||
|
||||
async def create_ci_failure_approval(
    diagnosis_id: str,
    repo: str,
    workflow_run: GitHubWorkflowRun,
    diagnosis: CIFailureDiagnosis,
) -> str:
    """Create an Approval record for a CI repair that needs human review (Phase 13.1 #76).

    Args:
        diagnosis_id: ID of the CI diagnosis the approval refers to.
        repo: Repository full name, used as the approval's target resource.
        workflow_run: The failed workflow run (name, id, URL and SHA are stored).
        diagnosis: Diagnosis providing root cause, suggestions and risk level.

    Returns:
        The new approval ID. If creation fails, the error is logged and a
        placeholder ID of the form ``temp-<8 hex chars>`` is returned instead.
    """
    try:
        approvals = get_approval_service()

        # Translate the diagnosis' textual risk level; unknown values map to MEDIUM
        severity_by_name = {
            "low": RiskLevel.LOW,
            "medium": RiskLevel.MEDIUM,
            "high": RiskLevel.HIGH,
            "critical": RiskLevel.CRITICAL,
        }
        mapped_risk = severity_by_name.get(diagnosis.risk_level, RiskLevel.MEDIUM)

        # Prefer the concrete fix command; otherwise join the first two suggestions
        if diagnosis.fix_command:
            suggestion_text = diagnosis.fix_command
        else:
            suggestion_text = "; ".join(diagnosis.suggestions[:2])

        if diagnosis.auto_fixable:
            radius = BlastRadius.NAMESPACE
        else:
            radius = BlastRadius.SERVICE

        extra = {
            "ci_diagnosis_id": diagnosis_id,
            "repo": repo,
            "workflow_name": workflow_run.name,
            "workflow_id": workflow_run.id,
            "workflow_url": workflow_run.html_url,
            "head_sha": workflow_run.head_sha,
            "error_type": diagnosis.error_type,
            "auto_fixable": diagnosis.auto_fixable,
            "fix_command": diagnosis.fix_command,
        }

        # Assemble the approval request
        request = ApprovalRequestCreate(
            source="github",
            alert_type="ci_failure_repair",
            target_resource=repo,
            namespace="github-actions",
            risk_level=mapped_risk,
            root_cause=diagnosis.root_cause,
            suggestion=suggestion_text,
            blast_radius=radius,
            data_impact=DataImpact.NONE,
            dry_run_check=DryRunCheck.SKIPPED,
            llm_provider=diagnosis.analyzed_by,
            llm_confidence=diagnosis.confidence,
            metadata=extra,
        )

        # Create the approval under a freshly generated ID
        new_id = str(uuid.uuid4())
        await approvals.create_approval(
            approval_id=new_id,
            request=request,
        )

        logger.info(
            "ci_failure_approval_created",
            approval_id=new_id,
            diagnosis_id=diagnosis_id,
            risk_level=mapped_risk.value,
        )

        return new_id

    except Exception as e:
        logger.exception("ci_failure_approval_creation_failed", error=str(e))
        return f"temp-{uuid.uuid4().hex[:8]}"
|
||||
|
||||
|
||||
async def save_review_result(
|
||||
review_id: str,
|
||||
event_type: str,
|
||||
|
||||
@@ -6,10 +6,17 @@ SignOz MCP Tool Provider - ADR-015 模組化架構
|
||||
- gold_metrics: 取得 Gold Metrics (RPS, Error Rate, P99)
|
||||
- trace_url: 生成 Trace 查詢 URL
|
||||
- system_metrics: 取得系統指標 (CPU/Disk)
|
||||
- query_logs: 查詢日誌 (Phase 13.1 #77)
|
||||
- error_logs_summary: 錯誤日誌摘要 (Phase 13.1 #77)
|
||||
|
||||
透過 DI 注入 SignOzClient,不直接 import services。
|
||||
|
||||
@see docs/adr/ADR-015-mcp-modular-architecture.md
|
||||
|
||||
版本: v1.1
|
||||
最後修改: 2026-03-26 16:45 (台北時區)
|
||||
修改者: Claude Code
|
||||
變更: Phase 13.1 #77 新增 query_logs, error_logs_summary
|
||||
"""
|
||||
|
||||
import uuid
|
||||
@@ -84,6 +91,34 @@ class SignOzProvider(MCPToolProvider):
|
||||
},
|
||||
server_name=self.name,
|
||||
),
|
||||
MCPTool(
|
||||
name="query_logs",
|
||||
description="Query logs from SignOz (Phase 13.1 #77). Use for CI failure diagnosis or service debugging.",
|
||||
input_schema={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"service_name": {"type": "string", "description": "Service name (e.g., awoooi-api, awoooi-worker)"},
|
||||
"severity": {"type": "string", "description": "Log severity filter (ERROR, WARN, INFO, DEBUG). Comma-separated for multiple."},
|
||||
"search_text": {"type": "string", "description": "Text to search in log messages"},
|
||||
"time_window_minutes": {"type": "integer", "description": "Time window in minutes (default: 30)"},
|
||||
"limit": {"type": "integer", "description": "Max logs to return (default: 100)"},
|
||||
},
|
||||
},
|
||||
server_name=self.name,
|
||||
),
|
||||
MCPTool(
|
||||
name="error_logs_summary",
|
||||
description="Get error logs summary with counts and sample messages. Useful for quick diagnosis.",
|
||||
input_schema={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"service_name": {"type": "string", "description": "Service name (required)"},
|
||||
"time_window_minutes": {"type": "integer", "description": "Time window (default: 60)"},
|
||||
},
|
||||
"required": ["service_name"],
|
||||
},
|
||||
server_name=self.name,
|
||||
),
|
||||
]
|
||||
|
||||
async def execute(
|
||||
@@ -101,6 +136,10 @@ class SignOzProvider(MCPToolProvider):
|
||||
output = self._trace_url(client, parameters)
|
||||
elif tool_name == "system_metrics":
|
||||
output = await self._system_metrics(client, parameters)
|
||||
elif tool_name == "query_logs":
|
||||
output = await self._query_logs(client, parameters)
|
||||
elif tool_name == "error_logs_summary":
|
||||
output = await self._error_logs_summary(client, parameters)
|
||||
else:
|
||||
return MCPToolResult(
|
||||
success=False,
|
||||
@@ -184,6 +223,48 @@ class SignOzProvider(MCPToolProvider):
|
||||
"time_range": metrics.get("time_range", {}),
|
||||
}
|
||||
|
||||
async def _query_logs(self, client, parameters: dict) -> dict:
    """Query logs from SignOz (Phase 13.1 #77).

    Args:
        client: SignOz client exposing an awaitable ``get_logs(...)``.
        parameters: Tool arguments. Optional keys: ``service_name``,
            ``severity``, ``search_text``, ``time_window_minutes``
            (default 30) and ``limit`` (default 100).

    Returns:
        A dict with the fetched ``logs``, their ``count`` and the ``filters``
        that were applied.
    """
    service_name = parameters.get("service_name")
    severity = parameters.get("severity")
    search_text = parameters.get("search_text")

    # ``dict.get(key, default)`` only falls back when the key is absent; a
    # caller passing an explicit null must get the default too.
    time_window = parameters.get("time_window_minutes")
    if time_window is None:
        time_window = 30
    limit = parameters.get("limit")
    if limit is None:
        limit = 100

    logs = await client.get_logs(
        service_name=service_name,
        severity=severity,
        search_text=search_text,
        time_window_minutes=time_window,
        limit=limit,
    )

    return {
        "logs": logs,
        "count": len(logs),
        "filters": {
            "service_name": service_name,
            "severity": severity,
            "search_text": search_text,
            "time_window_minutes": time_window,
        },
    }
|
||||
|
||||
async def _error_logs_summary(self, client, parameters: dict) -> dict:
    """Get an error logs summary for one service (Phase 13.1 #77).

    Args:
        client: SignOz client exposing an awaitable
            ``get_error_logs_summary(...)``.
        parameters: Tool arguments. ``service_name`` is required;
            ``time_window_minutes`` is optional (default 60).

    Returns:
        The summary returned by the client, or ``{"error": ...}`` when
        ``service_name`` is missing or empty.
    """
    service_name = parameters.get("service_name")
    if not service_name:
        return {"error": "Missing 'service_name' parameter"}

    # An explicit null must fall back to the default just like a missing key.
    time_window = parameters.get("time_window_minutes")
    if time_window is None:
        time_window = 60

    summary = await client.get_error_logs_summary(
        service_name=service_name,
        time_window_minutes=time_window,
    )

    return summary
|
||||
|
||||
async def health_check(self) -> bool:
|
||||
"""Check if SignOz is accessible"""
|
||||
try:
|
||||
|
||||
@@ -1,15 +1,41 @@
|
||||
"""
|
||||
AI Router - Phase 13.3 #87
|
||||
==========================
|
||||
動態模型選擇器,整合意圖分類和複雜度評分
|
||||
智能 AI 路由器,根據意圖和複雜度動態選擇 AI Provider
|
||||
|
||||
目標: 根據請求特性自動選擇最適模型
|
||||
策略: Intent + Complexity → Model Selection
|
||||
策略: Intent Classifier + Complexity Scorer → Routing Decision
|
||||
延遲目標: < 50ms (規則引擎優先)
|
||||
|
||||
Phase 13.3 (2026-03-26): 初始實作
|
||||
路由決策矩陣 (ADR-023):
|
||||
┌─────────────────┬───────────────┬──────────────────────────────┐
|
||||
│ 複雜度 + 風險 │ Provider │ 備註 │
|
||||
├─────────────────┼───────────────┼──────────────────────────────┤
|
||||
│ 1-2 + LOW │ Ollama │ 快速本地處理 │
|
||||
│ 3 + MEDIUM │ Ollama │ fallback → Gemini │
|
||||
│ 4-5 + HIGH │ Gemini │ fallback → Claude │
|
||||
│ DELETE/CRITICAL │ Claude │ 強制使用最強模型 │
|
||||
└─────────────────┴───────────────┴──────────────────────────────┘
|
||||
|
||||
版本: v3.0
|
||||
建立: 2026-03-26 (台北時區)
|
||||
建立者: Claude Code
|
||||
最後修改: 2026-03-26 (台北時區)
|
||||
修改者: Claude Code
|
||||
|
||||
變更紀錄:
|
||||
| 版本 | 日期 | 執行者 | 變更內容 |
|
||||
|------|------|--------|----------|
|
||||
| v1.0 | 2026-03-26 | Claude Code | 初始實作 |
|
||||
| v2.0 | 2026-03-26 | Claude Code | 支援 IntentResult + 新意圖類型 |
|
||||
| v3.0 | 2026-03-26 | Claude Code | Phase 13.3 #87 完整路由決策矩陣 |
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
|
||||
import structlog
|
||||
|
||||
@@ -18,58 +44,169 @@ from src.services.complexity_scorer import (
|
||||
get_complexity_scorer,
|
||||
)
|
||||
from src.services.intent_classifier import (
|
||||
IntentResult,
|
||||
IntentType,
|
||||
RiskLevel,
|
||||
get_intent_classifier,
|
||||
normalize_intent,
|
||||
)
|
||||
from src.services.model_registry import get_model_registry
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Provider 定義
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class AIProvider(Enum):
    """AI providers a request can be routed to."""

    OLLAMA = "ollama"
    GEMINI = "gemini"
    CLAUDE = "claude"
|
||||
|
||||
|
||||
# Latency budget per provider, in milliseconds.
PROVIDER_LATENCY_BUDGET: dict[AIProvider, int] = {
    # Local: a longer processing time is allowed
    AIProvider.OLLAMA: 60_000,
    # Cloud: lower latency
    AIProvider.GEMINI: 30_000,
    # Cloud: lower latency
    AIProvider.CLAUDE: 30_000,
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class RoutingDecision:
|
||||
"""路由決策結果"""
|
||||
"""
|
||||
路由決策結果 (Phase 13.3 #87)
|
||||
|
||||
model: str # 選擇的模型
|
||||
intent: IntentType # 意圖分類
|
||||
包含完整的路由資訊,供 OpenClaw 主流程使用
|
||||
"""
|
||||
|
||||
# 核心決策
|
||||
selected_provider: AIProvider # 選擇的 AI Provider
|
||||
selected_model: str # 選擇的模型名稱
|
||||
fallback_chain: list[tuple[AIProvider, str]] # 備援鏈 [(provider, model), ...]
|
||||
routing_reason: str # 路由決策原因
|
||||
latency_budget_ms: int # 延遲預算 (毫秒)
|
||||
|
||||
# 分類結果
|
||||
intent: IntentType # 意圖分類 (正規化後)
|
||||
intent_result: IntentResult # 完整 Intent 分類結果
|
||||
complexity: ComplexityScore # 複雜度評分
|
||||
reason: str # 選擇原因
|
||||
fallback_models: list[str] # 備援模型列表
|
||||
risk_level: RiskLevel = field(default=RiskLevel.MEDIUM) # 風險等級
|
||||
|
||||
# 路由 metadata
|
||||
routing_latency_ms: float = 0.0 # 路由決策耗時 (ms)
|
||||
|
||||
# 向後相容 (deprecated)
|
||||
model: str = "" # -> selected_model
|
||||
reason: str = "" # -> routing_reason
|
||||
fallback_models: list[str] = field(default_factory=list) # -> fallback_chain
|
||||
|
||||
def __post_init__(self):
|
||||
"""初始化後設定衍生欄位"""
|
||||
self.risk_level = self.intent_result.risk_level
|
||||
# 向後相容
|
||||
self.model = self.selected_model
|
||||
self.reason = self.routing_reason
|
||||
self.fallback_models = [model for _, model in self.fallback_chain]
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""轉換為字典 (API 回應用)"""
|
||||
return {
|
||||
"selected_provider": self.selected_provider.value,
|
||||
"selected_model": self.selected_model,
|
||||
"fallback_chain": [
|
||||
{"provider": p.value, "model": m} for p, m in self.fallback_chain
|
||||
],
|
||||
"routing_reason": self.routing_reason,
|
||||
"latency_budget_ms": self.latency_budget_ms,
|
||||
"intent": self.intent.value,
|
||||
"risk_level": self.risk_level.value,
|
||||
"complexity_score": self.complexity.score,
|
||||
"routing_latency_ms": round(self.routing_latency_ms, 2),
|
||||
}
|
||||
|
||||
|
||||
class AIRouter:
|
||||
"""
|
||||
AI 路由器
|
||||
AI 路由器 (Phase 13.3 #87)
|
||||
|
||||
整合 IntentClassifier 和 ComplexityScorer,
|
||||
動態選擇最適合的模型。
|
||||
動態選擇最適合的 AI Provider 和模型。
|
||||
|
||||
路由策略:
|
||||
1. 意圖優先覆寫 (某些意圖強制使用特定模型)
|
||||
2. 複雜度導向選擇
|
||||
3. 成本/延遲平衡
|
||||
路由決策矩陣:
|
||||
┌─────────────────┬───────────────┬──────────────────────────────┐
|
||||
│ 複雜度 + 風險 │ Provider │ 備註 │
|
||||
├─────────────────┼───────────────┼──────────────────────────────┤
|
||||
│ 1-2 + LOW │ Ollama │ 快速本地處理 │
|
||||
│ 3 + MEDIUM │ Ollama │ fallback → Gemini │
|
||||
│ 4-5 + HIGH │ Gemini │ fallback → Claude │
|
||||
│ DELETE/CRITICAL │ Claude │ 強制使用最強模型 │
|
||||
└─────────────────┴───────────────┴──────────────────────────────┘
|
||||
|
||||
路由策略 (按優先級):
|
||||
1. CRITICAL 風險強制使用 Claude
|
||||
2. DELETE 意圖強制使用 Claude
|
||||
3. HIGH 風險或複雜度 4-5 → Gemini
|
||||
4. 其他情況 → Ollama (成本優先)
|
||||
"""
|
||||
|
||||
# 意圖強制覆寫
|
||||
INTENT_OVERRIDES: dict[IntentType, str | None] = {
|
||||
IntentType.CODE_REVIEW: "qwen2.5:7b-instruct", # 程式碼審查需要強模型
|
||||
IntentType.DEPLOYMENT: None, # 不覆寫,依複雜度
|
||||
IntentType.ALERT_TRIAGE: None,
|
||||
IntentType.QUERY: "llama3.2:3b", # 查詢用快速模型
|
||||
IntentType.MAINTENANCE: None,
|
||||
IntentType.UNKNOWN: None,
|
||||
}
|
||||
|
||||
# Fallback 順序
|
||||
FALLBACK_ORDER = [
|
||||
"qwen2.5:7b-instruct", # 本地主力
|
||||
"llama3.2:3b", # 本地備援
|
||||
"gemini", # 雲端備援
|
||||
"claude", # 最終備援
|
||||
]
|
||||
|
||||
def __init__(self):
|
||||
self._intent_classifier = get_intent_classifier()
|
||||
self._complexity_scorer = get_complexity_scorer()
|
||||
self._model_registry = get_model_registry()
|
||||
|
||||
# 從 ModelRegistry 取得模型配置
|
||||
self._ollama_default = self._model_registry.get_model("ollama", "default")
|
||||
self._ollama_summary = self._model_registry.get_model("ollama", "summary")
|
||||
self._gemini_default = self._model_registry.get_model("gemini", "default")
|
||||
self._claude_default = self._model_registry.get_model("claude", "default")
|
||||
|
||||
# Provider 對應模型映射
|
||||
self._provider_models: dict[AIProvider, str] = {
|
||||
AIProvider.OLLAMA: self._ollama_default,
|
||||
AIProvider.GEMINI: self._gemini_default,
|
||||
AIProvider.CLAUDE: self._claude_default,
|
||||
}
|
||||
|
||||
# 完整 Fallback 鏈 (Provider, Model)
|
||||
self._full_fallback_chain: list[tuple[AIProvider, str]] = [
|
||||
(AIProvider.OLLAMA, self._ollama_default),
|
||||
(AIProvider.GEMINI, self._gemini_default),
|
||||
(AIProvider.CLAUDE, self._claude_default),
|
||||
]
|
||||
|
||||
# 意圖對應 Provider 強制覆寫 (None = 依複雜度決定)
|
||||
self._intent_provider_overrides: dict[IntentType, AIProvider | None] = {
|
||||
# 四大核心意圖
|
||||
IntentType.RESTART: None, # 依複雜度
|
||||
IntentType.SCALE: None, # 依複雜度
|
||||
IntentType.CONFIG: None, # 依複雜度 (但 HIGH 會升級)
|
||||
IntentType.DIAGNOSE: AIProvider.OLLAMA, # 診斷優先本地 (隱私)
|
||||
# 輔助意圖
|
||||
IntentType.DELETE: AIProvider.CLAUDE, # CRITICAL → 強制 Claude
|
||||
IntentType.ROLLBACK: None, # 依複雜度
|
||||
IntentType.UNKNOWN: None,
|
||||
# 舊版兼容
|
||||
IntentType.CODE_REVIEW: None,
|
||||
IntentType.DEPLOYMENT: None,
|
||||
IntentType.ALERT_TRIAGE: AIProvider.OLLAMA,
|
||||
IntentType.QUERY: AIProvider.OLLAMA,
|
||||
IntentType.MAINTENANCE: None,
|
||||
}
|
||||
|
||||
# 向後相容
|
||||
self._default_model = self._ollama_default
|
||||
self._summary_model = self._ollama_summary
|
||||
self._fallback_order = [
|
||||
self._ollama_default,
|
||||
self._ollama_summary,
|
||||
"gemini",
|
||||
"claude",
|
||||
]
|
||||
|
||||
async def route(
|
||||
self,
|
||||
@@ -77,78 +214,203 @@ class AIRouter:
|
||||
context: dict | None = None,
|
||||
) -> RoutingDecision:
|
||||
"""
|
||||
路由請求到最適模型
|
||||
路由請求到最適 AI Provider 和模型
|
||||
|
||||
延遲目標: < 50ms (規則引擎優先,LLM 分類時可能稍長)
|
||||
|
||||
Args:
|
||||
text: 用戶輸入或告警內容
|
||||
context: 額外上下文 (服務、指標等)
|
||||
|
||||
Returns:
|
||||
RoutingDecision: 路由決策
|
||||
RoutingDecision: 完整路由決策
|
||||
"""
|
||||
start_time = time.perf_counter()
|
||||
context = context or {}
|
||||
|
||||
# Step 1: 意圖分類
|
||||
intent = await self._intent_classifier.classify(text)
|
||||
# Step 1: 意圖分類 (返回 IntentResult, 規則引擎 < 10ms)
|
||||
intent_result = await self._intent_classifier.classify(text)
|
||||
intent = normalize_intent(intent_result.intent)
|
||||
|
||||
# Step 2: 複雜度評分
|
||||
# Step 2: 複雜度評分 (< 10ms)
|
||||
complexity = self._complexity_scorer.score(context)
|
||||
|
||||
# Step 3: 模型選擇
|
||||
model, reason = self._select_model(intent, complexity)
|
||||
# Step 3: Provider + Model 選擇 (< 1ms)
|
||||
provider, model, reason = self._select_provider_and_model(
|
||||
intent, intent_result, complexity
|
||||
)
|
||||
|
||||
# Step 4: 建立 Fallback 列表
|
||||
fallbacks = self._build_fallback_list(model)
|
||||
# Step 4: 建立 Fallback 鏈
|
||||
fallback_chain = self._build_fallback_chain(provider)
|
||||
|
||||
# Step 5: 計算延遲預算
|
||||
latency_budget = PROVIDER_LATENCY_BUDGET.get(provider, 30000)
|
||||
|
||||
# 計算路由決策耗時
|
||||
routing_latency = (time.perf_counter() - start_time) * 1000
|
||||
|
||||
decision = RoutingDecision(
|
||||
model=model,
|
||||
selected_provider=provider,
|
||||
selected_model=model,
|
||||
fallback_chain=fallback_chain,
|
||||
routing_reason=reason,
|
||||
latency_budget_ms=latency_budget,
|
||||
intent=intent,
|
||||
intent_result=intent_result,
|
||||
complexity=complexity,
|
||||
reason=reason,
|
||||
fallback_models=fallbacks,
|
||||
routing_latency_ms=routing_latency,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"ai_routing_decision",
|
||||
provider=provider.value,
|
||||
model=model,
|
||||
intent=intent.value,
|
||||
intent_confidence=intent_result.confidence,
|
||||
risk_level=intent_result.risk_level.value,
|
||||
complexity_score=complexity.score,
|
||||
reason=reason,
|
||||
latency_budget_ms=latency_budget,
|
||||
routing_latency_ms=round(routing_latency, 2),
|
||||
fallback_count=len(fallback_chain),
|
||||
)
|
||||
|
||||
return decision
|
||||
|
||||
def _select_provider_and_model(
    self,
    intent: IntentType,
    intent_result: IntentResult,
    complexity: ComplexityScore,
) -> tuple[AIProvider, str, str]:
    """Pick the AI provider and model for a request (Phase 13.3 #87).

    The rules are evaluated top to bottom and the first match wins:

    1. CRITICAL risk           -> Claude
    2. DELETE intent           -> Claude
    3. per-intent override     -> the provider configured for that intent
    4. score >= 4 or HIGH risk -> Gemini
    5. score == 3              -> Ollama (default model)
    6. everything else         -> Ollama (summary model when score <= 1)

    Args:
        intent: Normalized intent.
        intent_result: Full classifier result; only ``risk_level`` is read.
        complexity: Complexity score; only ``score`` is read.

    Returns:
        ``(provider, model, reason)`` where ``reason`` is the human-readable
        explanation of the rule that fired.
    """
    risk_level = intent_result.risk_level
    level = complexity.score

    # Rule 1: CRITICAL risk always goes to Claude (highest priority).
    if risk_level == RiskLevel.CRITICAL:
        return (
            AIProvider.CLAUDE,
            self._claude_default,
            f"CRITICAL 風險 ({intent.value}) 強制使用 Claude",
        )

    # Rule 2: DELETE is treated as irreversible, so it is forced to Claude too.
    if intent == IntentType.DELETE:
        return (
            AIProvider.CLAUDE,
            self._claude_default,
            "DELETE 意圖 (不可逆) 強制使用 Claude",
        )

    # Rule 3: an explicit per-intent override beats the complexity rules below.
    forced = self._intent_provider_overrides.get(intent)
    if forced is not None:
        return (
            forced,
            self._provider_models[forced],
            f"意圖 {intent.value} 指定使用 {forced.value}",
        )

    # Rule 4: high complexity or HIGH risk -> Gemini.
    if level >= 4 or risk_level == RiskLevel.HIGH:
        return (
            AIProvider.GEMINI,
            self._gemini_default,
            f"複雜度={level}/5, 風險={risk_level.value} → Gemini (fallback Claude)",
        )

    # Rule 5: medium complexity stays local on the default Ollama model.
    if level == 3:
        return (
            AIProvider.OLLAMA,
            self._ollama_default,
            f"複雜度={level}/5, 風險={risk_level.value} → Ollama (fallback Gemini)",
        )

    # Rule 6: low complexity stays local; the lightest requests use the
    # summary model for a faster answer.
    if level <= 1:
        chosen_model = self._ollama_summary
    else:
        chosen_model = self._ollama_default
    return (
        AIProvider.OLLAMA,
        chosen_model,
        f"複雜度={level}/5, 風險={risk_level.value} → Ollama (成本優先)",
    )
|
||||
|
||||
def _select_model(
|
||||
self,
|
||||
intent: IntentType,
|
||||
intent_result: IntentResult,
|
||||
complexity: ComplexityScore,
|
||||
) -> tuple[str, str]:
|
||||
"""
|
||||
選擇模型
|
||||
選擇模型 (向後相容方法)
|
||||
|
||||
Deprecated: 請使用 _select_provider_and_model
|
||||
|
||||
Args:
|
||||
intent: 正規化後的意圖
|
||||
intent_result: 完整分類結果
|
||||
complexity: 複雜度評分
|
||||
|
||||
Returns:
|
||||
(model_name, reason)
|
||||
"""
|
||||
# 檢查意圖覆寫
|
||||
override = self.INTENT_OVERRIDES.get(intent)
|
||||
if override:
|
||||
return override, f"意圖 {intent.value} 強制使用 {override}"
|
||||
|
||||
# 依複雜度選擇
|
||||
model = complexity.recommended_model
|
||||
reason = f"複雜度 {complexity.score}/5 → {model}"
|
||||
|
||||
# 特殊情況調整
|
||||
if intent == IntentType.ALERT_TRIAGE and complexity.score >= 4:
|
||||
# 高複雜度告警優先用雲端
|
||||
model = "gemini"
|
||||
reason = f"高複雜度告警 (score={complexity.score}) → 使用雲端模型"
|
||||
|
||||
_, model, reason = self._select_provider_and_model(
|
||||
intent, intent_result, complexity
|
||||
)
|
||||
return model, reason
|
||||
|
||||
def _build_fallback_chain(
|
||||
self, selected_provider: AIProvider
|
||||
) -> list[tuple[AIProvider, str]]:
|
||||
"""
|
||||
建立 Fallback 鏈 (排除已選 Provider)
|
||||
|
||||
Fallback 順序: Ollama → Gemini → Claude
|
||||
|
||||
Args:
|
||||
selected_provider: 已選擇的 Provider
|
||||
|
||||
Returns:
|
||||
Fallback 鏈 [(provider, model), ...]
|
||||
"""
|
||||
fallback_chain: list[tuple[AIProvider, str]] = []
|
||||
|
||||
for provider, model in self._full_fallback_chain:
|
||||
if provider != selected_provider:
|
||||
fallback_chain.append((provider, model))
|
||||
|
||||
return fallback_chain
|
||||
|
||||
def _build_fallback_list(self, selected_model: str) -> list[str]:
|
||||
"""建立 Fallback 列表 (排除已選模型)"""
|
||||
fallbacks = [m for m in self.FALLBACK_ORDER if m != selected_model]
|
||||
"""建立 Fallback 列表 (向後相容)"""
|
||||
fallbacks = [m for m in self._fallback_order if m != selected_model]
|
||||
return fallbacks
|
||||
|
||||
def route_sync(
|
||||
@@ -156,22 +418,113 @@ class AIRouter:
|
||||
text: str,
|
||||
context: dict | None = None,
|
||||
) -> RoutingDecision:
|
||||
"""同步版本 (僅關鍵字匹配)"""
|
||||
"""
|
||||
同步版本路由 (僅關鍵字匹配,保證 < 50ms)
|
||||
|
||||
適用場景: 需要快速決策,不需要 LLM 分類的情況
|
||||
|
||||
Args:
|
||||
text: 用戶輸入或告警內容
|
||||
context: 額外上下文
|
||||
|
||||
Returns:
|
||||
RoutingDecision: 路由決策
|
||||
"""
|
||||
start_time = time.perf_counter()
|
||||
context = context or {}
|
||||
|
||||
intent = self._intent_classifier.classify_sync(text)
|
||||
# 同步分類 (僅規則引擎, < 10ms)
|
||||
intent_result = self._intent_classifier.classify_sync(text)
|
||||
intent = normalize_intent(intent_result.intent)
|
||||
|
||||
# 複雜度評分 (< 10ms)
|
||||
complexity = self._complexity_scorer.score(context)
|
||||
model, reason = self._select_model(intent, complexity)
|
||||
fallbacks = self._build_fallback_list(model)
|
||||
|
||||
# Provider + Model 選擇
|
||||
provider, model, reason = self._select_provider_and_model(
|
||||
intent, intent_result, complexity
|
||||
)
|
||||
|
||||
# 建立 Fallback 鏈
|
||||
fallback_chain = self._build_fallback_chain(provider)
|
||||
|
||||
# 延遲預算
|
||||
latency_budget = PROVIDER_LATENCY_BUDGET.get(provider, 30000)
|
||||
|
||||
# 計算路由決策耗時
|
||||
routing_latency = (time.perf_counter() - start_time) * 1000
|
||||
|
||||
return RoutingDecision(
|
||||
model=model,
|
||||
selected_provider=provider,
|
||||
selected_model=model,
|
||||
fallback_chain=fallback_chain,
|
||||
routing_reason=reason,
|
||||
latency_budget_ms=latency_budget,
|
||||
intent=intent,
|
||||
intent_result=intent_result,
|
||||
complexity=complexity,
|
||||
reason=reason,
|
||||
fallback_models=fallbacks,
|
||||
routing_latency_ms=routing_latency,
|
||||
)
|
||||
|
||||
# =========================================================================
|
||||
# 便捷方法
|
||||
# =========================================================================
|
||||
|
||||
def get_provider_for_intent(self, intent: IntentType) -> AIProvider:
    """Return the provider pinned to ``intent``, ignoring complexity.

    Falls back to Ollama when the intent has no (truthy) override.
    """
    pinned = self._intent_provider_overrides.get(intent)
    if pinned:
        return pinned
    return AIProvider.OLLAMA
|
||||
|
||||
def get_model_for_provider(self, provider: AIProvider) -> str:
    """Return the model mapped to ``provider``; unmapped providers get the default Ollama model."""
    fallback_model = self._ollama_default
    return self._provider_models.get(provider, fallback_model)
|
||||
|
||||
def get_routing_matrix(self) -> list[dict]:
    """Describe the routing rules as data (for API docs or debugging).

    Returns:
        One dict per rule, in evaluation order, with the keys ``rule``
        (1-based number), ``condition``, ``provider`` and ``reason``.
    """
    # (condition, provider, reason) in evaluation order; the rule number is
    # derived from the position in this table.
    rule_table = (
        ("CRITICAL risk", "claude", "不可逆/高風險操作強制最強模型"),
        ("DELETE intent", "claude", "刪除操作強制最強模型"),
        ("Intent override", "depends", "特定意圖有預設 Provider"),
        ("complexity >= 4 OR HIGH risk", "gemini", "高複雜度需要雲端能力"),
        ("complexity == 3", "ollama", "中等複雜度本地處理"),
        ("complexity 1-2", "ollama", "低複雜度快速處理"),
    )
    return [
        {
            "rule": number,
            "condition": condition,
            "provider": provider,
            "reason": why,
        }
        for number, (condition, provider, why) in enumerate(rule_table, start=1)
    ]
|
||||
|
||||
|
||||
# 單例
|
||||
_router: AIRouter | None = None
|
||||
@@ -183,3 +536,9 @@ def get_ai_router() -> AIRouter:
|
||||
if _router is None:
|
||||
_router = AIRouter()
|
||||
return _router
|
||||
|
||||
|
||||
def reset_ai_router() -> None:
    """Drop the module-level router singleton (intended for tests)."""
    global _router
    _router = None
|
||||
|
||||
483
apps/api/src/services/ci_auto_repair.py
Normal file
483
apps/api/src/services/ci_auto_repair.py
Normal file
@@ -0,0 +1,483 @@
|
||||
"""
|
||||
CI Auto-Repair Service - Phase 13.1 #78
|
||||
========================================
|
||||
CI 失敗自動修復服務,根據風險分級決定執行策略
|
||||
|
||||
策略:
|
||||
- LOW: 自動執行修復 (如重啟 Runner、清理快取)
|
||||
- MEDIUM: 發送 Telegram 確認,快速批准後執行
|
||||
- HIGH: 建立 Approval,等待人工審核
|
||||
- CRITICAL: 禁止自動修復,僅通知
|
||||
|
||||
整合:
|
||||
- Intent Classifier: 判斷修復意圖類型
|
||||
- Complexity Scorer: 評估修復複雜度
|
||||
- AI Router: 選擇最適 AI 進行分析
|
||||
|
||||
版本: v1.0
|
||||
建立: 2026-03-26 16:50 (台北時區)
|
||||
建立者: Claude Code
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
|
||||
import structlog
|
||||
|
||||
from src.services.intent_classifier import IntentType, RiskLevel, get_intent_classifier
|
||||
from src.services.complexity_scorer import get_complexity_scorer
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Types
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class RepairAction(Enum):
    """Kinds of repair action that can be recommended for a failed CI run."""

    RESTART_RUNNER = "restart_runner"
    CLEAR_CACHE = "clear_cache"
    RETRY_WORKFLOW = "retry_workflow"
    ROLLBACK_COMMIT = "rollback_commit"
    FIX_CONFIG = "fix_config"
    FIX_DEPENDENCY = "fix_dependency"
    SCALE_RESOURCE = "scale_resource"
    MANUAL_REQUIRED = "manual_required"
|
||||
|
||||
|
||||
class ExecutionDecision(Enum):
    """How a recommended repair is allowed to be carried out."""

    # Run the repair immediately without asking anyone.
    AUTO_EXECUTE = "auto_execute"
    # Ask for a quick confirmation over Telegram first.
    TELEGRAM_CONFIRM = "telegram_confirm"
    # Create an Approval and wait for a human review.
    APPROVAL_REQUIRED = "approval_required"
    # Never execute; notify only.
    BLOCKED = "blocked"
|
||||
|
||||
|
||||
@dataclass
class RepairRecommendation:
    """A single proposed repair action together with its risk assessment."""

    # What kind of repair is proposed.
    action: RepairAction
    # Shell command implementing the repair; None when no fixed command exists.
    command: str | None
    # Human-readable justification for the action.
    reason: str
    # Risk assigned to this action.
    risk_level: RiskLevel
    # Execution policy derived from the risk level.
    execution_decision: ExecutionDecision
    # Confidence that the action resolves the failure.
    confidence: float
    # Rough duration estimate in seconds.
    estimated_duration_seconds: int
    # Command that undoes the repair, when one exists.
    rollback_command: str | None = None
|
||||
|
||||
|
||||
@dataclass
class CIRepairDecision:
    """Outcome of evaluating a CI failure for automatic repair."""

    # False when the execution decision blocks any repair.
    should_repair: bool
    # Overall execution policy for this failure.
    execution_decision: ExecutionDecision
    # Candidate repairs for this failure.
    recommendations: list[RepairRecommendation]
    # Overall risk level of the failure.
    risk_level: RiskLevel
    # Complexity score reported by the complexity scorer.
    complexity_score: int
    # Intent reported by the intent classifier.
    intent_type: IntentType
    # Human-readable explanation of the decision.
    reason: str
    # Free-form diagnostic details.
    metadata: dict
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Repair Strategy Mapping
|
||||
# =============================================================================
|
||||
|
||||
|
||||
# Error type -> candidate repair actions.
ERROR_TYPE_REPAIR_MAP: dict[str, list[RepairAction]] = {
    "build": [RepairAction.CLEAR_CACHE, RepairAction.FIX_DEPENDENCY],
    "test": [RepairAction.RETRY_WORKFLOW, RepairAction.FIX_CONFIG],
    "lint": [RepairAction.RETRY_WORKFLOW],
    "deploy": [RepairAction.ROLLBACK_COMMIT, RepairAction.FIX_CONFIG],
    "timeout": [RepairAction.RESTART_RUNNER, RepairAction.SCALE_RESOURCE],
    "runner": [RepairAction.RESTART_RUNNER],
    "dependency": [RepairAction.CLEAR_CACHE, RepairAction.FIX_DEPENDENCY],
    "unknown": [RepairAction.MANUAL_REQUIRED],
}

# Repair action -> baseline risk level.
ACTION_RISK_MAP: dict[RepairAction, RiskLevel] = {
    RepairAction.RETRY_WORKFLOW: RiskLevel.LOW,
    RepairAction.CLEAR_CACHE: RiskLevel.LOW,
    RepairAction.RESTART_RUNNER: RiskLevel.LOW,
    RepairAction.FIX_CONFIG: RiskLevel.MEDIUM,
    RepairAction.FIX_DEPENDENCY: RiskLevel.MEDIUM,
    RepairAction.SCALE_RESOURCE: RiskLevel.MEDIUM,
    RepairAction.ROLLBACK_COMMIT: RiskLevel.HIGH,
    RepairAction.MANUAL_REQUIRED: RiskLevel.CRITICAL,
}

# Risk level -> execution policy.
RISK_EXECUTION_MAP: dict[RiskLevel, ExecutionDecision] = {
    RiskLevel.LOW: ExecutionDecision.AUTO_EXECUTE,
    RiskLevel.MEDIUM: ExecutionDecision.TELEGRAM_CONFIRM,
    RiskLevel.HIGH: ExecutionDecision.APPROVAL_REQUIRED,
    RiskLevel.CRITICAL: ExecutionDecision.BLOCKED,
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CI Auto-Repair Service
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class CIAutoRepairService:
    """Evaluate a failed CI run and decide how its repair may be executed.

    The service combines three signals into one ``CIRepairDecision``: the
    risk level reported by the intent classifier, the complexity score, and
    the static risk of each candidate repair action.  Nothing in this class
    executes a repair command; it only produces the decision.
    """

    def __init__(self):
        self._intent_classifier = get_intent_classifier()
        self._complexity_scorer = get_complexity_scorer()

    async def evaluate_repair(
        self,
        error_type: str,
        workflow_name: str,
        repo: str,
        failure_context: dict,
        diagnosis_summary: str | None = None,
    ) -> CIRepairDecision:
        """Work out the repair strategy for a CI failure.

        Args:
            error_type: Failure category (build/test/lint/deploy/timeout/...).
                Matched case-insensitively.
            workflow_name: Name of the failed workflow.
            repo: Repository name.
            failure_context: Extra context merged into the complexity-scorer
                input; its keys override the built-in ones.
            diagnosis_summary: Optional AI diagnosis text.

        Returns:
            The repair decision, including every recommendation.
        """
        logger.info(
            "ci_repair_evaluation_started",
            error_type=error_type,
            workflow_name=workflow_name,
            repo=repo,
        )

        # 1. Text fed to the intent classifier.
        analysis_text = self._build_analysis_text(
            error_type=error_type,
            workflow_name=workflow_name,
            diagnosis_summary=diagnosis_summary,
        )

        # 2. Intent classification.  classify() is awaited here exactly as
        #    AIRouter.route does; without the await the attribute accesses
        #    below would be made on a coroutine object.
        intent_result = await self._intent_classifier.classify(analysis_text)

        # 3. Complexity scoring.  The scorer interface is score(context), so
        #    the context dict is the only argument.
        complexity_result = self._complexity_scorer.score(
            {
                "error_type": error_type,
                "workflow_name": workflow_name,
                "repo": repo,
                **(failure_context or {}),
            }
        )

        # 4. Candidate actions for this error type.
        possible_actions = ERROR_TYPE_REPAIR_MAP.get(
            error_type.lower(),
            [RepairAction.MANUAL_REQUIRED],
        )

        # 5. Recommendations.
        recommendations = self._generate_recommendations(
            possible_actions=possible_actions,
            error_type=error_type,
            workflow_name=workflow_name,
            complexity_score=complexity_result.score,
        )

        # 6. Overall risk and the execution policy that follows from it.
        overall_risk = self._determine_overall_risk(
            recommendations=recommendations,
            intent_risk=intent_result.risk_level,
            complexity_score=complexity_result.score,
        )
        execution_decision = RISK_EXECUTION_MAP.get(
            overall_risk,
            ExecutionDecision.APPROVAL_REQUIRED,
        )

        # 7. Special-case overrides (production workflows, deploy failures).
        execution_decision = self._apply_special_rules(
            execution_decision=execution_decision,
            error_type=error_type,
            workflow_name=workflow_name,
            repo=repo,
        )

        decision = CIRepairDecision(
            should_repair=execution_decision != ExecutionDecision.BLOCKED,
            execution_decision=execution_decision,
            recommendations=recommendations,
            risk_level=overall_risk,
            complexity_score=complexity_result.score,
            intent_type=intent_result.intent,
            reason=self._generate_decision_reason(
                execution_decision=execution_decision,
                overall_risk=overall_risk,
                error_type=error_type,
            ),
            metadata={
                "intent_confidence": intent_result.confidence,
                # ComplexityScore exposes ``features``; it has no ``factors``.
                "complexity_factors": complexity_result.features,
                "workflow_name": workflow_name,
                "repo": repo,
            },
        )

        logger.info(
            "ci_repair_evaluation_completed",
            should_repair=decision.should_repair,
            execution_decision=execution_decision.value,
            risk_level=overall_risk.value,
            recommendations_count=len(recommendations),
        )

        return decision

    @staticmethod
    def _risk_rank(level: RiskLevel, default: int = 99) -> int:
        """Map a risk level to its sort position (LOW=0 ... CRITICAL=3)."""
        order = {
            RiskLevel.LOW: 0,
            RiskLevel.MEDIUM: 1,
            RiskLevel.HIGH: 2,
            RiskLevel.CRITICAL: 3,
        }
        return order.get(level, default)

    def _build_analysis_text(
        self,
        error_type: str,
        workflow_name: str,
        diagnosis_summary: str | None,
    ) -> str:
        """Build the sentence list handed to the intent classifier."""
        parts = [
            f"CI workflow '{workflow_name}' failed",
            f"Error type: {error_type}",
        ]
        if diagnosis_summary:
            parts.append(f"Diagnosis: {diagnosis_summary}")
        return ". ".join(parts)

    def _generate_recommendations(
        self,
        possible_actions: list[RepairAction],
        error_type: str,
        workflow_name: str,
        complexity_score: int,
    ) -> list[RepairRecommendation]:
        """Turn candidate actions into recommendations, lowest risk first."""
        recommendations = []

        for action in possible_actions:
            risk = ACTION_RISK_MAP.get(action, RiskLevel.HIGH)

            # High complexity promotes MEDIUM-risk actions to HIGH.
            if complexity_score >= 4 and risk == RiskLevel.MEDIUM:
                risk = RiskLevel.HIGH

            command, rollback = self._get_repair_command(
                action=action,
                workflow_name=workflow_name,
            )

            recommendations.append(
                RepairRecommendation(
                    action=action,
                    command=command,
                    reason=self._get_action_reason(action, error_type),
                    risk_level=risk,
                    execution_decision=RISK_EXECUTION_MAP.get(
                        risk, ExecutionDecision.APPROVAL_REQUIRED
                    ),
                    confidence=self._calculate_confidence(action, error_type),
                    estimated_duration_seconds=self._estimate_duration(action),
                    rollback_command=rollback,
                )
            )

        # Stable sort: equal-risk actions keep their mapping order.
        recommendations.sort(key=lambda rec: self._risk_rank(rec.risk_level))
        return recommendations

    def _get_repair_command(
        self,
        action: RepairAction,
        workflow_name: str,
    ) -> tuple[str | None, str | None]:
        """Return ``(repair_command, rollback_command)`` for an action.

        Either element is None when no fixed command exists.
        """
        import shlex

        # The workflow name ends up inside a shell command line, so it is
        # quoted: names with spaces or shell metacharacters stay one argument.
        quoted_workflow = shlex.quote(workflow_name)

        commands: dict[RepairAction, tuple[str | None, str | None]] = {
            RepairAction.RETRY_WORKFLOW: (
                f"gh workflow run {quoted_workflow}",
                None,
            ),
            RepairAction.CLEAR_CACHE: (
                "gh cache delete --all",
                None,
            ),
            RepairAction.RESTART_RUNNER: (
                "sudo systemctl restart actions.runner.*",
                None,
            ),
            RepairAction.SCALE_RESOURCE: (
                "kubectl scale deployment/actions-runner --replicas=3",
                "kubectl scale deployment/actions-runner --replicas=2",
            ),
            # Undoing a revert is itself a revert of the new HEAD.
            RepairAction.ROLLBACK_COMMIT: (
                "git revert HEAD --no-edit && git push",
                "git revert HEAD --no-edit && git push",
            ),
            # The concrete command has to be generated by the AI.
            RepairAction.FIX_CONFIG: (
                None,
                None,
            ),
            RepairAction.FIX_DEPENDENCY: (
                "pnpm install --force && uv sync",
                None,
            ),
            RepairAction.MANUAL_REQUIRED: (
                None,
                None,
            ),
        }
        return commands.get(action, (None, None))

    def _get_action_reason(self, action: RepairAction, error_type: str) -> str:
        """Return the human-readable justification for an action."""
        reasons = {
            RepairAction.RETRY_WORKFLOW: f"Retry workflow to recover from transient {error_type} failure",
            RepairAction.CLEAR_CACHE: "Clear build/dependency cache to resolve potential cache corruption",
            RepairAction.RESTART_RUNNER: "Restart GitHub Actions runner to recover from runner issues",
            RepairAction.SCALE_RESOURCE: "Scale runner resources to handle timeout issues",
            RepairAction.ROLLBACK_COMMIT: "Rollback recent commit that may have introduced the failure",
            RepairAction.FIX_CONFIG: "Fix configuration that may be causing the failure",
            RepairAction.FIX_DEPENDENCY: "Update or fix dependencies to resolve compatibility issues",
            RepairAction.MANUAL_REQUIRED: "Manual investigation required due to complex failure",
        }
        return reasons.get(action, "Unknown action")

    def _calculate_confidence(self, action: RepairAction, error_type: str) -> float:
        """Return the confidence (capped at 1.0) that ``action`` fixes the failure."""
        base_confidence = {
            RepairAction.RETRY_WORKFLOW: 0.6,
            RepairAction.CLEAR_CACHE: 0.7,
            RepairAction.RESTART_RUNNER: 0.8,
            RepairAction.SCALE_RESOURCE: 0.5,
            RepairAction.ROLLBACK_COMMIT: 0.4,
            RepairAction.FIX_CONFIG: 0.3,
            RepairAction.FIX_DEPENDENCY: 0.5,
            RepairAction.MANUAL_REQUIRED: 0.1,
        }
        confidence = base_confidence.get(action, 0.5)

        # Bonus for well-matched pairs.  The error type is lowercased here
        # because evaluate_repair also looks actions up case-insensitively.
        normalized_type = error_type.lower()
        if normalized_type == "timeout" and action == RepairAction.RESTART_RUNNER:
            confidence += 0.2
        elif normalized_type == "build" and action == RepairAction.CLEAR_CACHE:
            confidence += 0.15

        return min(confidence, 1.0)

    def _estimate_duration(self, action: RepairAction) -> int:
        """Return the estimated repair duration in seconds (300 if unknown)."""
        durations = {
            RepairAction.RETRY_WORKFLOW: 300,  # 5 minutes
            RepairAction.CLEAR_CACHE: 30,
            RepairAction.RESTART_RUNNER: 60,
            RepairAction.SCALE_RESOURCE: 120,
            RepairAction.ROLLBACK_COMMIT: 180,
            RepairAction.FIX_CONFIG: 600,
            RepairAction.FIX_DEPENDENCY: 300,
            RepairAction.MANUAL_REQUIRED: 3600,
        }
        return durations.get(action, 300)

    def _determine_overall_risk(
        self,
        recommendations: list[RepairRecommendation],
        intent_risk: RiskLevel,
        complexity_score: int,
    ) -> RiskLevel:
        """Derive the overall risk level for the failure.

        Starts from the lowest-risk recommendation, bumps it one step for
        high complexity, then takes the classifier's risk when that is
        higher.  With no recommendations the result is CRITICAL.
        """
        if not recommendations:
            return RiskLevel.CRITICAL

        baseline = min(
            recommendations,
            key=lambda rec: self._risk_rank(rec.risk_level),
        ).risk_level

        # High complexity raises the baseline by one step.
        if complexity_score >= 4 and baseline == RiskLevel.LOW:
            baseline = RiskLevel.MEDIUM
        elif complexity_score >= 5 and baseline == RiskLevel.MEDIUM:
            baseline = RiskLevel.HIGH

        # The intent classifier can only raise the risk, never lower it.
        if self._risk_rank(intent_risk, 0) > self._risk_rank(baseline, 0):
            return intent_risk

        return baseline

    def _apply_special_rules(
        self,
        execution_decision: ExecutionDecision,
        error_type: str,
        workflow_name: str,
        repo: str,
    ) -> ExecutionDecision:
        """Tighten the execution decision for sensitive situations.

        The rules only ever escalate, and they are applied in sequence so
        that a production workflow that also failed at deploy receives both
        escalations instead of stopping after the first one.
        """
        # Production-related workflows are never auto-executed.
        production_keywords = ("prod", "production", "release", "deploy")
        workflow_lower = workflow_name.lower()
        if (
            execution_decision == ExecutionDecision.AUTO_EXECUTE
            and any(kw in workflow_lower for kw in production_keywords)
        ):
            execution_decision = ExecutionDecision.TELEGRAM_CONFIRM

        # Deploy failures always need a full approval (case-insensitive,
        # matching the lookup in evaluate_repair).
        if error_type.lower() == "deploy" and execution_decision in (
            ExecutionDecision.AUTO_EXECUTE,
            ExecutionDecision.TELEGRAM_CONFIRM,
        ):
            execution_decision = ExecutionDecision.APPROVAL_REQUIRED

        return execution_decision

    def _generate_decision_reason(
        self,
        execution_decision: ExecutionDecision,
        overall_risk: RiskLevel,
        error_type: str,
    ) -> str:
        """Return the human-readable explanation for an execution decision."""
        reasons = {
            ExecutionDecision.AUTO_EXECUTE: f"Low risk {error_type} failure, safe for auto-repair",
            ExecutionDecision.TELEGRAM_CONFIRM: f"Medium risk {error_type} failure, quick Telegram confirmation recommended",
            ExecutionDecision.APPROVAL_REQUIRED: f"High risk {error_type} failure, human approval required before repair",
            ExecutionDecision.BLOCKED: f"Critical {error_type} failure, auto-repair blocked for safety",
        }
        return reasons.get(execution_decision, "Unknown decision")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Singleton
|
||||
# =============================================================================
|
||||
|
||||
|
||||
_ci_auto_repair_service: CIAutoRepairService | None = None
|
||||
|
||||
|
||||
def get_ci_auto_repair_service() -> CIAutoRepairService:
    """Return the process-wide CIAutoRepairService, creating it on first use."""
    global _ci_auto_repair_service
    if _ci_auto_repair_service is not None:
        return _ci_auto_repair_service
    _ci_auto_repair_service = CIAutoRepairService()
    return _ci_auto_repair_service
|
||||
@@ -7,139 +7,415 @@ Complexity Scorer - Phase 13.3 #86
|
||||
策略: 基於特徵提取的加權評分
|
||||
|
||||
Phase 13.3 (2026-03-26): 初始實作
|
||||
Phase 13.3 (2026-03-26): 增強版 - 9 維度完整評分系統 (ADR-023)
|
||||
|
||||
版本: v2.0
|
||||
建立: 2026-03-26 (台北時區)
|
||||
建立者: Claude Code
|
||||
最後修改: 2026-03-26 (台北時區)
|
||||
修改者: Claude Code
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import Protocol
|
||||
|
||||
import structlog
|
||||
|
||||
from src.services.model_registry import get_model_registry
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Enums
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class DataImpact(Enum):
    """Data impact level of an operation (ADR-023)."""

    # No data is touched.
    NONE = "none"
    # Read-only operation.
    READ_ONLY = "read_only"
    # Write operation.
    WRITE = "write"
    # Destructive operation (delete, DROP).
    DESTRUCTIVE = "destructive"
|
||||
|
||||
|
||||
class BusinessCriticality(Enum):
    """Business criticality level of a service."""

    # Non-critical service.
    NON_CRITICAL = "non_critical"
    # Supporting service.
    SUPPORTING = "supporting"
    # Important service.
    IMPORTANT = "important"
    # Core service.
    CRITICAL = "critical"
    # Lifeline of the business.
    MISSION_CRITICAL = "mission_critical"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Interface (支援 DI 測試)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class IComplexityScorer(Protocol):
    """Structural interface for complexity scorers (enables DI in tests)."""

    def score(self, context: dict) -> "ComplexityScore":
        """Compute the complexity score for ``context``."""
        ...

    def get_dimension_weights(self) -> dict[str, float]:
        """Return the configured weight of each scoring dimension."""
        ...
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Data Classes
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _get_default_model() -> str:
    """Look up the default Ollama model in the ModelRegistry."""
    registry = get_model_registry()
    return registry.get_model("ollama", "default")
|
||||
|
||||
|
||||
@dataclass
class DimensionScore:
    """Score of a single complexity dimension."""

    # Dimension name.
    name: str
    # Value as found in the input, before normalization.
    raw_value: int | float | str | bool
    # Normalized score (1-5).
    normalized_score: int
    # Weight of this dimension.
    weight: float
    # Score after the weight has been applied.
    weighted_score: float
    # Explanation of the score.
    reason: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class ComplexityScore:
|
||||
"""複雜度評分結果"""
|
||||
|
||||
score: int # 1-5 (1=簡單, 5=極複雜)
|
||||
features: dict[str, int] = field(default_factory=dict)
|
||||
recommended_model: str = "qwen2.5:7b-instruct"
|
||||
features: dict[str, int] = field(default_factory=dict) # 向後相容
|
||||
recommended_model: str = "" # 由 ComplexityScorer 填入
|
||||
reasoning: str = ""
|
||||
|
||||
# v2.0 新增欄位
|
||||
dimensions: list[DimensionScore] = field(default_factory=list)
|
||||
raw_weighted_sum: float = 0.0 # 加權總分 (正規化前)
|
||||
total_weight: float = 0.0 # 總權重
|
||||
|
||||
# 模型映射 (依複雜度)
|
||||
MODEL_BY_COMPLEXITY = {
|
||||
1: "llama3.2:3b", # 簡單任務,快速回應
|
||||
2: "qwen2.5:7b-instruct", # 中等任務
|
||||
3: "qwen2.5:7b-instruct", # 複雜任務
|
||||
4: "gemini", # 需要雲端能力
|
||||
5: "claude", # 極複雜,需要最強模型
|
||||
}
|
||||
def __post_init__(self):
|
||||
"""初始化後設定預設模型"""
|
||||
if not self.recommended_model:
|
||||
self.recommended_model = _get_default_model()
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""轉換為字典 (API 回應用)"""
|
||||
return {
|
||||
"score": self.score,
|
||||
"recommended_model": self.recommended_model,
|
||||
"reasoning": self.reasoning,
|
||||
"dimensions": [
|
||||
{
|
||||
"name": d.name,
|
||||
"raw_value": d.raw_value if not isinstance(d.raw_value, Enum) else d.raw_value.value,
|
||||
"normalized_score": d.normalized_score,
|
||||
"weight": d.weight,
|
||||
"weighted_score": round(d.weighted_score, 3),
|
||||
"reason": d.reason,
|
||||
}
|
||||
for d in self.dimensions
|
||||
],
|
||||
"raw_weighted_sum": round(self.raw_weighted_sum, 3),
|
||||
"total_weight": round(self.total_weight, 3),
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Complexity Scorer Implementation
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class ComplexityScorer:
|
||||
"""
|
||||
複雜度評分器
|
||||
複雜度評分器 (v2.0)
|
||||
|
||||
基於規則的複雜度評估,無 LLM 依賴,確保 < 10ms
|
||||
|
||||
評分維度:
|
||||
1. 服務數量 (affected_services)
|
||||
2. 指標數量 (metrics)
|
||||
3. 是否需要程式碼分析 (requires_code_analysis)
|
||||
4. 是否跨系統 (cross_system)
|
||||
5. 是否有歷史關聯 (has_history)
|
||||
6. 嚴重程度 (severity)
|
||||
評分維度 (9 個,ADR-023):
|
||||
1. 資源數量 (resource_count)
|
||||
2. 跨命名空間 (cross_namespace)
|
||||
3. 有狀態資源 (stateful_resource)
|
||||
4. 資料影響 (data_impact)
|
||||
5. 服務依賴 (service_dependencies)
|
||||
6. 回滾難度 (rollback_difficulty)
|
||||
7. 停機時間 (downtime_estimate)
|
||||
8. 安全敏感度 (security_sensitivity)
|
||||
9. 業務關鍵度 (business_criticality)
|
||||
|
||||
權重配置說明:
|
||||
- 權重越高,對最終分數影響越大
|
||||
- 總權重 = 所有啟用維度權重之和
|
||||
- 最終分數 = 加權平均 (1-5)
|
||||
"""
|
||||
|
||||
# 權重配置
|
||||
WEIGHTS = {
|
||||
"service_count": 0.5, # 每增加一個服務 +0.5
|
||||
"metric_count": 0.3, # 每增加一個指標 +0.3
|
||||
"code_analysis": 1.5, # 需要代碼分析 +1.5
|
||||
"cross_system": 1.0, # 跨系統 +1.0
|
||||
"has_history": -0.5, # 有歷史案例 -0.5 (降低複雜度)
|
||||
"critical_severity": 1.0, # CRITICAL 告警 +1.0
|
||||
# ==========================================================================
|
||||
# 權重配置 (可透過 models.json 覆寫)
|
||||
# ==========================================================================
|
||||
|
||||
DEFAULT_WEIGHTS = {
|
||||
# 維度名稱: 權重
|
||||
"resource_count": 1.0, # 資源數量
|
||||
"cross_namespace": 1.5, # 跨命名空間 (風險較高)
|
||||
"stateful_resource": 2.0, # 有狀態資源 (最高風險)
|
||||
"data_impact": 2.0, # 資料影響 (最高風險)
|
||||
"service_dependencies": 1.0, # 服務依賴
|
||||
"rollback_difficulty": 1.5, # 回滾難度
|
||||
"downtime_estimate": 1.0, # 停機時間
|
||||
"security_sensitivity": 1.5, # 安全敏感度
|
||||
"business_criticality": 1.5, # 業務關鍵度
|
||||
# 降低複雜度的維度 (負權重)
|
||||
"has_playbook": -0.5, # 有歷史 Playbook
|
||||
"has_history": -0.5, # 有歷史案例
|
||||
}
|
||||
|
||||
# ==========================================================================
|
||||
# 評分閾值
|
||||
# ==========================================================================
|
||||
|
||||
# 資源數量閾值
|
||||
RESOURCE_COUNT_THRESHOLDS = {
|
||||
1: 1, # 1 個資源 = 分數 1
|
||||
2: 2, # 2 個資源 = 分數 2
|
||||
3: 3, # 3-4 個資源 = 分數 3
|
||||
5: 4, # 5-9 個資源 = 分數 4
|
||||
10: 5, # 10+ 個資源 = 分數 5
|
||||
}
|
||||
|
||||
# 服務依賴閾值
|
||||
SERVICE_DEPENDENCY_THRESHOLDS = {
|
||||
0: 1, # 獨立服務 = 分數 1
|
||||
1: 2, # 1 個依賴 = 分數 2
|
||||
2: 3, # 2 個依賴 = 分數 3
|
||||
4: 4, # 4 個依賴 = 分數 4
|
||||
6: 5, # 6+ 個依賴 = 分數 5
|
||||
}
|
||||
|
||||
# 停機時間閾值 (分鐘)
|
||||
DOWNTIME_THRESHOLDS = {
|
||||
0: 1, # 0 分鐘 = 分數 1
|
||||
1: 2, # 1-4 分鐘 = 分數 2
|
||||
5: 3, # 5-14 分鐘 = 分數 3
|
||||
15: 4, # 15-29 分鐘 = 分數 4
|
||||
30: 5, # 30+ 分鐘 = 分數 5
|
||||
}
|
||||
|
||||
# 資料影響對應分數
|
||||
DATA_IMPACT_SCORES = {
|
||||
DataImpact.NONE: 1,
|
||||
DataImpact.READ_ONLY: 2,
|
||||
DataImpact.WRITE: 4,
|
||||
DataImpact.DESTRUCTIVE: 5,
|
||||
}
|
||||
|
||||
# 業務關鍵度對應分數
|
||||
BUSINESS_CRITICALITY_SCORES = {
|
||||
BusinessCriticality.NON_CRITICAL: 1,
|
||||
BusinessCriticality.SUPPORTING: 2,
|
||||
BusinessCriticality.IMPORTANT: 3,
|
||||
BusinessCriticality.CRITICAL: 4,
|
||||
BusinessCriticality.MISSION_CRITICAL: 5,
|
||||
}
|
||||
|
||||
def __init__(self, weights: dict[str, float] | None = None):
    """Create a scorer with the given dimension weights.

    Args:
        weights: Custom mapping of dimension name to weight. ``None`` or an
            empty mapping selects a copy of ``DEFAULT_WEIGHTS``.
    """
    # Copy in both branches: the scorer must own its configuration so that
    # a later mutation of the caller's dict (or of DEFAULT_WEIGHTS) cannot
    # silently change scoring results.
    self._weights = dict(weights) if weights else self.DEFAULT_WEIGHTS.copy()
|
||||
|
||||
def get_dimension_weights(self) -> dict[str, float]:
    """Return the dimension-weight configuration.

    A shallow copy is returned, so modifying the result does not affect
    the weights this scorer uses.
    """
    return self._weights.copy()
|
||||
|
||||
def score(self, context: dict) -> ComplexityScore:
|
||||
"""
|
||||
計算複雜度分數
|
||||
|
||||
Args:
|
||||
context: 上下文資訊,包含:
|
||||
- affected_services: list[str]
|
||||
- metrics: list[str]
|
||||
context: 上下文資訊,包含 (全部可選):
|
||||
# 基本維度
|
||||
- resource_count: int (受影響資源數量)
|
||||
- affected_services: list[str] (受影響服務清單,向後相容)
|
||||
- metrics: list[str] (相關指標,向後相容)
|
||||
|
||||
# 命名空間與資源類型
|
||||
- namespaces: list[str] (涉及的命名空間)
|
||||
- cross_namespace: bool (是否跨命名空間)
|
||||
- stateful_resources: list[str] (有狀態資源清單)
|
||||
- has_statefulset: bool (是否有 StatefulSet)
|
||||
- has_pvc: bool (是否有 PVC)
|
||||
|
||||
# 資料影響
|
||||
- data_impact: str | DataImpact (資料影響等級)
|
||||
|
||||
# 服務依賴
|
||||
- service_dependencies: list[str] (服務依賴清單)
|
||||
- dependency_count: int (依賴數量)
|
||||
|
||||
# 回滾
|
||||
- rollback_difficulty: int (1-5)
|
||||
- can_rollback_immediately: bool (是否可立即回滾)
|
||||
- irreversible: bool (是否不可逆)
|
||||
|
||||
# 停機時間
|
||||
- downtime_minutes: int (預估停機時間)
|
||||
- zero_downtime: bool (是否零停機)
|
||||
|
||||
# 安全
|
||||
- involves_secrets: bool (是否涉及 Secret)
|
||||
- involves_rbac: bool (是否涉及 RBAC)
|
||||
- security_sensitive: bool (是否安全敏感)
|
||||
|
||||
# 業務
|
||||
- business_criticality: str | BusinessCriticality (業務關鍵度)
|
||||
- is_core_service: bool (是否核心服務)
|
||||
|
||||
# 歷史
|
||||
- has_playbook: bool (是否有 Playbook)
|
||||
- has_history: bool (是否有歷史案例)
|
||||
|
||||
# 其他 (向後相容)
|
||||
- requires_code_analysis: bool
|
||||
- cross_system: bool
|
||||
- has_history: bool
|
||||
- severity: str
|
||||
|
||||
Returns:
|
||||
ComplexityScore: 評分結果
|
||||
"""
|
||||
raw_score = 1.0 # 基準分
|
||||
features: dict[str, int] = {}
|
||||
reasons: list[str] = []
|
||||
dimensions: list[DimensionScore] = []
|
||||
features: dict[str, int] = {} # 向後相容
|
||||
|
||||
# 特徵 1: 服務數量
|
||||
services = context.get("affected_services", [])
|
||||
service_count = len(services)
|
||||
if service_count > 1:
|
||||
delta = (service_count - 1) * self.WEIGHTS["service_count"]
|
||||
raw_score += delta
|
||||
features["service_count"] = service_count
|
||||
reasons.append(f"涉及 {service_count} 個服務")
|
||||
# =======================================================================
|
||||
# 評估各維度
|
||||
# =======================================================================
|
||||
|
||||
# 特徵 2: 指標數量
|
||||
metrics = context.get("metrics", [])
|
||||
metric_count = len(metrics)
|
||||
if metric_count > 2:
|
||||
delta = (metric_count - 2) * self.WEIGHTS["metric_count"]
|
||||
raw_score += delta
|
||||
features["metric_count"] = metric_count
|
||||
reasons.append(f"涉及 {metric_count} 個指標")
|
||||
# 維度 1: 資源數量
|
||||
dim1 = self._score_resource_count(context)
|
||||
if dim1:
|
||||
dimensions.append(dim1)
|
||||
features["resource_count"] = dim1.normalized_score
|
||||
|
||||
# 特徵 3: 是否需要程式碼分析
|
||||
if context.get("requires_code_analysis", False):
|
||||
raw_score += self.WEIGHTS["code_analysis"]
|
||||
features["code_analysis"] = 1
|
||||
reasons.append("需要程式碼分析")
|
||||
# 維度 2: 跨命名空間
|
||||
dim2 = self._score_cross_namespace(context)
|
||||
if dim2:
|
||||
dimensions.append(dim2)
|
||||
features["cross_namespace"] = dim2.normalized_score
|
||||
|
||||
# 特徵 4: 是否跨系統
|
||||
if context.get("cross_system", False):
|
||||
raw_score += self.WEIGHTS["cross_system"]
|
||||
features["cross_system"] = 1
|
||||
reasons.append("跨系統問題")
|
||||
# 維度 3: 有狀態資源
|
||||
dim3 = self._score_stateful_resource(context)
|
||||
if dim3:
|
||||
dimensions.append(dim3)
|
||||
features["stateful_resource"] = dim3.normalized_score
|
||||
|
||||
# 特徵 5: 是否有歷史關聯
|
||||
if context.get("has_history", False):
|
||||
raw_score += self.WEIGHTS["has_history"] # 負數,降低複雜度
|
||||
# 維度 4: 資料影響
|
||||
dim4 = self._score_data_impact(context)
|
||||
if dim4:
|
||||
dimensions.append(dim4)
|
||||
features["data_impact"] = dim4.normalized_score
|
||||
|
||||
# 維度 5: 服務依賴
|
||||
dim5 = self._score_service_dependencies(context)
|
||||
if dim5:
|
||||
dimensions.append(dim5)
|
||||
features["service_dependencies"] = dim5.normalized_score
|
||||
|
||||
# 維度 6: 回滾難度
|
||||
dim6 = self._score_rollback_difficulty(context)
|
||||
if dim6:
|
||||
dimensions.append(dim6)
|
||||
features["rollback_difficulty"] = dim6.normalized_score
|
||||
|
||||
# 維度 7: 停機時間
|
||||
dim7 = self._score_downtime(context)
|
||||
if dim7:
|
||||
dimensions.append(dim7)
|
||||
features["downtime_estimate"] = dim7.normalized_score
|
||||
|
||||
# 維度 8: 安全敏感度
|
||||
dim8 = self._score_security_sensitivity(context)
|
||||
if dim8:
|
||||
dimensions.append(dim8)
|
||||
features["security_sensitivity"] = dim8.normalized_score
|
||||
|
||||
# 維度 9: 業務關鍵度
|
||||
dim9 = self._score_business_criticality(context)
|
||||
if dim9:
|
||||
dimensions.append(dim9)
|
||||
features["business_criticality"] = dim9.normalized_score
|
||||
|
||||
# 降低複雜度的維度
|
||||
dim_playbook = self._score_has_playbook(context)
|
||||
if dim_playbook:
|
||||
dimensions.append(dim_playbook)
|
||||
features["has_playbook"] = 1
|
||||
|
||||
dim_history = self._score_has_history(context)
|
||||
if dim_history:
|
||||
dimensions.append(dim_history)
|
||||
features["has_history"] = 1
|
||||
reasons.append("有歷史案例參考")
|
||||
|
||||
# 特徵 6: 嚴重程度
|
||||
severity = context.get("severity", "").upper()
|
||||
if severity == "CRITICAL":
|
||||
raw_score += self.WEIGHTS["critical_severity"]
|
||||
features["severity"] = 4
|
||||
reasons.append("CRITICAL 嚴重程度")
|
||||
elif severity == "HIGH":
|
||||
raw_score += 0.5
|
||||
features["severity"] = 3
|
||||
# =======================================================================
|
||||
# 計算加權平均
|
||||
# =======================================================================
|
||||
|
||||
# 正規化到 1-5
|
||||
final_score = max(1, min(5, round(raw_score)))
|
||||
if not dimensions:
|
||||
# 無維度資料,返回基本分數
|
||||
final_score = 1
|
||||
raw_weighted_sum = 1.0
|
||||
total_weight = 1.0
|
||||
reasoning = "基本複雜度 (無足夠資訊)"
|
||||
else:
|
||||
# 計算加權總分
|
||||
weighted_sum = sum(d.weighted_score for d in dimensions)
|
||||
total_weight = sum(abs(d.weight) for d in dimensions)
|
||||
|
||||
# 選擇推薦模型
|
||||
recommended_model = MODEL_BY_COMPLEXITY.get(
|
||||
final_score, "qwen2.5:7b-instruct"
|
||||
)
|
||||
# 加權平均
|
||||
if total_weight > 0:
|
||||
avg_score = weighted_sum / total_weight
|
||||
else:
|
||||
avg_score = 1.0
|
||||
|
||||
# 正規化到 1-5
|
||||
final_score = max(1, min(5, round(avg_score)))
|
||||
raw_weighted_sum = weighted_sum
|
||||
|
||||
# 生成 reasoning
|
||||
high_impact_dims = [d for d in dimensions if d.normalized_score >= 4]
|
||||
if high_impact_dims:
|
||||
reasons = [d.reason for d in high_impact_dims[:3]] # 最多 3 個
|
||||
reasoning = "; ".join(reasons)
|
||||
else:
|
||||
reasons = [d.reason for d in dimensions if d.normalized_score >= 2][:3]
|
||||
reasoning = "; ".join(reasons) if reasons else "基本複雜度"
|
||||
|
||||
# =======================================================================
|
||||
# 從 ModelRegistry 取得推薦模型
|
||||
# =======================================================================
|
||||
|
||||
registry = get_model_registry()
|
||||
recommended_model = registry.get_model_by_complexity(final_score)
|
||||
|
||||
result = ComplexityScore(
|
||||
score=final_score,
|
||||
features=features,
|
||||
recommended_model=recommended_model,
|
||||
reasoning="; ".join(reasons) if reasons else "基本複雜度",
|
||||
reasoning=reasoning,
|
||||
dimensions=dimensions,
|
||||
raw_weighted_sum=raw_weighted_sum,
|
||||
total_weight=total_weight,
|
||||
)
|
||||
|
||||
logger.debug(
|
||||
@@ -147,12 +423,361 @@ class ComplexityScorer:
|
||||
score=final_score,
|
||||
features=features,
|
||||
model=recommended_model,
|
||||
dimension_count=len(dimensions),
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
# ==========================================================================
|
||||
# 維度評分方法
|
||||
# ==========================================================================
|
||||
|
||||
def _score_resource_count(self, context: dict) -> DimensionScore | None:
    """Dimension 1: number of affected resources.

    Reads ``resource_count`` from ``context``; when that key is absent (or
    ``None``) the length of ``affected_services`` is used instead.

    Returns:
        A ``DimensionScore``, or ``None`` when no count of at least 1 can be
        determined.
    """
    resource_total = context.get("resource_count")
    if resource_total is None:
        # Backward-compatible fallback: count the affected services.
        affected = context.get("affected_services", [])
        if not affected:
            return None
        resource_total = len(affected)

    if resource_total < 1:
        return None

    # Walk the thresholds in ascending order; the highest threshold that the
    # count reaches decides the score.
    level = 1
    for lower_bound in sorted(self.RESOURCE_COUNT_THRESHOLDS):
        if resource_total >= lower_bound:
            level = self.RESOURCE_COUNT_THRESHOLDS[lower_bound]

    dim_weight = self._weights.get("resource_count", 1.0)

    if resource_total <= 5:
        description = f"{resource_total} 個資源"
    else:
        description = f"{resource_total} 個資源 (大規模)"

    return DimensionScore(
        name="resource_count",
        raw_value=resource_total,
        normalized_score=level,
        weight=dim_weight,
        weighted_score=level * dim_weight,
        reason=description,
    )
|
||||
|
||||
def _score_cross_namespace(self, context: dict) -> DimensionScore | None:
    """Dimension 2: operation spans more than one namespace.

    The dimension applies when any of these holds, checked in order:
    ``cross_namespace`` is truthy, ``namespaces`` has more than one entry,
    or the legacy ``cross_system`` flag is truthy.

    Returns:
        A ``DimensionScore``, or ``None`` when none of the signals is set.
    """
    namespaces = context.get("namespaces", [])

    # ``or`` short-circuits, so the namespace list is only measured when the
    # explicit flag is not set, and the legacy flag is consulted last.
    spans_namespaces = (
        context.get("cross_namespace", False)
        or len(namespaces) > 1
        or context.get("cross_system", False)
    )
    if not spans_namespaces:
        return None

    # Without a namespace list, assume the minimal cross-namespace case (2).
    ns_count = len(namespaces) if namespaces else 2

    # Base score is 3; more namespaces raise it to 4 or 5.
    if ns_count <= 2:
        level = 3
    elif ns_count <= 4:
        level = 4
    else:
        level = 5

    dim_weight = self._weights.get("cross_namespace", 1.5)

    if ns_count > 1:
        description = f"跨 {ns_count} 個命名空間"
    else:
        description = "跨命名空間操作"

    return DimensionScore(
        name="cross_namespace",
        raw_value=True,
        normalized_score=level,
        weight=dim_weight,
        weighted_score=level * dim_weight,
        reason=description,
    )
|
||||
|
||||
def _score_stateful_resource(self, context: dict) -> DimensionScore | None:
    """Dimension 3: stateful resources (StatefulSet, PVC).

    Reads ``stateful_resources``, ``has_statefulset`` and ``has_pvc`` from
    ``context``.

    Returns:
        A ``DimensionScore``, or ``None`` when none of the three inputs is
        set.
    """
    stateful_resources = context.get("stateful_resources", [])
    has_sts = context.get("has_statefulset", False)
    has_pvc = context.get("has_pvc", False)

    if not (stateful_resources or has_sts or has_pvc):
        return None

    # The resource list is searched as lower-cased text, so a match is a
    # plain substring match on its string representation.
    listing = str(stateful_resources).lower()

    if has_pvc or "pvc" in listing:
        level = 5  # PVC: highest risk
        description = "涉及 PVC (資料持久化)"
    elif has_sts or "statefulset" in listing:
        level = 4  # StatefulSet: high risk
        description = "涉及 StatefulSet (有序部署)"
    else:
        level = 3
        description = f"涉及 {len(stateful_resources)} 個有狀態資源"

    dim_weight = self._weights.get("stateful_resource", 2.0)

    return DimensionScore(
        name="stateful_resource",
        raw_value=stateful_resources or [has_sts, has_pvc],
        normalized_score=level,
        weight=dim_weight,
        weighted_score=level * dim_weight,
        reason=description,
    )
|
||||
|
||||
def _score_data_impact(self, context: dict) -> DimensionScore | None:
    """Dimension 4: data impact.

    ``context["data_impact"]`` may be a ``DataImpact`` member or a string
    that is lower-cased and converted to one.

    Returns:
        A ``DimensionScore``, or ``None`` when the value is missing, of an
        unsupported type, not a valid ``DataImpact`` value, or equal to
        ``DataImpact.NONE``.
    """
    supplied = context.get("data_impact")

    # Normalize to the enum. Strings are tried first, then enum members;
    # everything else (including None) yields no dimension.
    if isinstance(supplied, str):
        try:
            impact = DataImpact(supplied.lower())
        except ValueError:
            return None
    elif isinstance(supplied, DataImpact):
        impact = supplied
    else:
        return None

    if impact == DataImpact.NONE:
        return None  # no impact, nothing to score

    descriptions = {
        DataImpact.READ_ONLY: "只讀操作",
        DataImpact.WRITE: "寫入操作 (資料變更)",
        DataImpact.DESTRUCTIVE: "破壞性操作 (不可恢復)",
    }

    level = self.DATA_IMPACT_SCORES.get(impact, 1)
    dim_weight = self._weights.get("data_impact", 2.0)

    return DimensionScore(
        name="data_impact",
        raw_value=impact,
        normalized_score=level,
        weight=dim_weight,
        weighted_score=level * dim_weight,
        reason=descriptions.get(impact, "資料影響"),
    )
|
||||
|
||||
def _score_service_dependencies(self, context: dict) -> DimensionScore | None:
    """Dimension 5: service dependencies.

    Uses ``dependency_count`` when present; otherwise the length of
    ``service_dependencies``.

    Returns:
        A ``DimensionScore``, or ``None`` when the dependency count is 0.
    """
    dependency_list = context.get("service_dependencies", [])
    dep_total = context.get("dependency_count")

    if dep_total is None:
        dep_total = len(dependency_list) if dependency_list else 0

    if dep_total == 0:
        return None

    # The highest threshold reached (ascending order) decides the score.
    level = 1
    for lower_bound in sorted(self.SERVICE_DEPENDENCY_THRESHOLDS):
        if dep_total >= lower_bound:
            level = self.SERVICE_DEPENDENCY_THRESHOLDS[lower_bound]

    dim_weight = self._weights.get("service_dependencies", 1.0)

    return DimensionScore(
        name="service_dependencies",
        raw_value=dep_total,
        normalized_score=level,
        weight=dim_weight,
        weighted_score=level * dim_weight,
        reason=f"依賴 {dep_total} 個服務",
    )
|
||||
|
||||
def _score_rollback_difficulty(self, context: dict) -> DimensionScore | None:
    """Dimension 6: rollback difficulty.

    An explicit ``rollback_difficulty`` wins. Otherwise the value is
    inferred: ``irreversible`` gives 5, ``can_rollback_immediately``
    (default ``True``) gives no dimension, anything else gives 3.

    Returns:
        A ``DimensionScore``, or ``None`` when the difficulty is below 2 or
        an immediate rollback is possible.
    """
    difficulty = context.get("rollback_difficulty")

    if difficulty is None:
        # Infer from the auxiliary flags.
        if context.get("irreversible", False):
            difficulty = 5
        elif context.get("can_rollback_immediately", True):
            return None  # immediate rollback adds no complexity
        else:
            difficulty = 3  # medium by default

    if difficulty < 2:
        return None

    # Clamp into the 1-5 scale.
    level = max(1, min(5, difficulty))
    dim_weight = self._weights.get("rollback_difficulty", 1.5)

    descriptions = {
        2: "回滾需要額外步驟",
        3: "回滾難度中等",
        4: "回滾困難 (需人工介入)",
        5: "不可逆操作",
    }
    description = descriptions.get(level, f"回滾難度 {level}")

    return DimensionScore(
        name="rollback_difficulty",
        raw_value=difficulty,
        normalized_score=level,
        weight=dim_weight,
        weighted_score=level * dim_weight,
        reason=description,
    )
|
||||
|
||||
def _score_downtime(self, context: dict) -> DimensionScore | None:
    """Dimension 7: estimated downtime in minutes.

    Returns:
        A ``DimensionScore``, or ``None`` when ``zero_downtime`` is truthy
        or ``downtime_minutes`` is missing or 0.
    """
    if context.get("zero_downtime", False):
        return None  # zero-downtime operations add nothing

    minutes = context.get("downtime_minutes")
    if minutes is None or minutes == 0:
        return None

    # The highest threshold reached (ascending order) decides the score.
    level = 1
    for lower_bound in sorted(self.DOWNTIME_THRESHOLDS):
        if minutes >= lower_bound:
            level = self.DOWNTIME_THRESHOLDS[lower_bound]

    dim_weight = self._weights.get("downtime_estimate", 1.0)

    # Longer outages get a qualifier appended to the reason text.
    if minutes < 5:
        qualifier = ""
    elif minutes < 15:
        qualifier = " (中等)"
    else:
        qualifier = " (長時間)"
    description = f"預估停機 {minutes} 分鐘{qualifier}"

    return DimensionScore(
        name="downtime_estimate",
        raw_value=minutes,
        normalized_score=level,
        weight=dim_weight,
        weighted_score=level * dim_weight,
        reason=description,
    )
|
||||
|
||||
def _score_security_sensitivity(self, context: dict) -> DimensionScore | None:
    """Dimension 8: security sensitivity (Secret / RBAC).

    Reads ``involves_secrets``, ``involves_rbac`` and ``security_sensitive``
    from ``context``. RBAC takes precedence over secrets, which take
    precedence over the generic flag.

    Returns:
        A ``DimensionScore``, or ``None`` when none of the flags is set.
    """
    involves_secrets = context.get("involves_secrets", False)
    involves_rbac = context.get("involves_rbac", False)
    security_sensitive = context.get("security_sensitive", False)

    if not (involves_secrets or involves_rbac or security_sensitive):
        return None

    if involves_rbac:
        level, description = 5, "涉及 RBAC 權限變更"  # most sensitive
    elif involves_secrets:
        level, description = 4, "涉及 Secret 操作"  # highly sensitive
    else:
        level, description = 3, "安全敏感操作"

    dim_weight = self._weights.get("security_sensitivity", 1.5)

    return DimensionScore(
        name="security_sensitivity",
        raw_value={"secrets": involves_secrets, "rbac": involves_rbac},
        normalized_score=level,
        weight=dim_weight,
        weighted_score=level * dim_weight,
        reason=description,
    )
|
||||
|
||||
def _score_business_criticality(self, context: dict) -> DimensionScore | None:
    """Dimension 9: business criticality.

    ``context["business_criticality"]`` may be a ``BusinessCriticality``
    member or a string. Strings are lower-cased and converted to the enum;
    if that fails, the aliases ``low`` / ``medium`` / ``high`` are tried.
    When the key is absent, a truthy ``is_core_service`` is treated as
    ``BusinessCriticality.CRITICAL``.

    Returns:
        A ``DimensionScore``, or ``None`` when no criticality can be
        resolved or it resolves to ``NON_CRITICAL``.
    """
    level_value = context.get("business_criticality")

    if level_value is None:
        # Infer from the core-service flag.
        if not context.get("is_core_service", False):
            return None
        level_value = BusinessCriticality.CRITICAL

    # Normalize to the enum.
    if isinstance(level_value, str):
        lowered = level_value.lower()
        try:
            level_value = BusinessCriticality(lowered)
        except ValueError:
            # Fall back to commonly used aliases.
            aliases = {
                "low": BusinessCriticality.NON_CRITICAL,
                "medium": BusinessCriticality.IMPORTANT,
                "high": BusinessCriticality.CRITICAL,
            }
            level_value = aliases.get(lowered)
            if level_value is None:
                return None
    elif not isinstance(level_value, BusinessCriticality):
        return None

    if level_value == BusinessCriticality.NON_CRITICAL:
        return None  # non-critical services add nothing

    descriptions = {
        BusinessCriticality.SUPPORTING: "支援服務",
        BusinessCriticality.IMPORTANT: "重要服務",
        BusinessCriticality.CRITICAL: "核心服務",
        BusinessCriticality.MISSION_CRITICAL: "業務命脈 (最高優先)",
    }

    level = self.BUSINESS_CRITICALITY_SCORES.get(level_value, 1)
    dim_weight = self._weights.get("business_criticality", 1.5)

    return DimensionScore(
        name="business_criticality",
        raw_value=level_value,
        normalized_score=level,
        weight=dim_weight,
        weighted_score=level * dim_weight,
        reason=descriptions.get(level_value, "業務關鍵度"),
    )
|
||||
|
||||
def _score_has_playbook(self, context: dict) -> DimensionScore | None:
    """Complexity reducer: a playbook exists.

    Returns:
        A ``DimensionScore`` with normalized score 1 and the configured
        ``has_playbook`` weight (default -0.5), or ``None`` when
        ``has_playbook`` is not truthy in ``context``.
    """
    playbook_available = context.get("has_playbook", False)
    if not playbook_available:
        return None

    # With the default negative weight the weighted score is negative, which
    # lowers the weighted sum this dimension contributes to.
    dim_weight = self._weights.get("has_playbook", -0.5)
    unit_score = 1

    return DimensionScore(
        name="has_playbook",
        raw_value=True,
        normalized_score=unit_score,
        weight=dim_weight,
        weighted_score=unit_score * dim_weight,
        reason="有歷史 Playbook (降低複雜度)",
    )
|
||||
|
||||
def _score_has_history(self, context: dict) -> DimensionScore | None:
    """Complexity reducer: historical cases exist.

    Returns:
        A ``DimensionScore`` with normalized score 1 and the configured
        ``has_history`` weight (default -0.5), or ``None`` when
        ``has_history`` is not truthy in ``context``.
    """
    history_available = context.get("has_history", False)
    if not history_available:
        return None

    # With the default negative weight the weighted score is negative, which
    # lowers the weighted sum this dimension contributes to.
    dim_weight = self._weights.get("has_history", -0.5)
    unit_score = 1

    return DimensionScore(
        name="has_history",
        raw_value=True,
        normalized_score=unit_score,
        weight=dim_weight,
        weighted_score=unit_score * dim_weight,
        reason="有歷史案例參考 (降低複雜度)",
    )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Singleton
|
||||
# =============================================================================
|
||||
|
||||
# 單例
|
||||
_scorer: ComplexityScorer | None = None
|
||||
|
||||
|
||||
@@ -162,3 +787,19 @@ def get_complexity_scorer() -> ComplexityScorer:
|
||||
if _scorer is None:
|
||||
_scorer = ComplexityScorer()
|
||||
return _scorer
|
||||
|
||||
|
||||
def reset_complexity_scorer() -> None:
    """Reset the module-level scorer singleton (intended for tests)."""
    global _scorer
    # Clearing the cached instance means the next accessor call has to
    # build a fresh scorer.
    _scorer = None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Convenience Functions
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def score_complexity(context: dict) -> ComplexityScore:
    """Convenience wrapper: score ``context`` with the shared scorer.

    Args:
        context: Context dictionary passed unchanged to the scorer's
            ``score`` method.

    Returns:
        The ``ComplexityScore`` produced by the shared ``ComplexityScorer``.
    """
    return get_complexity_scorer().score(context)
|
||||
|
||||
@@ -1,141 +1,600 @@
|
||||
"""
|
||||
Intent Classifier - Phase 13.3 #85
|
||||
===================================
|
||||
快速意圖分類,用於智能路由
|
||||
K8s 操作意圖分類器,用於智能路由模型選擇
|
||||
|
||||
目標: < 100ms 延遲
|
||||
策略: 關鍵字優先 → 小模型備援
|
||||
目標: < 100ms 延遲 (規則引擎 < 10ms)
|
||||
策略: 方案 A (規則引擎) → 方案 B (LLM 備援)
|
||||
|
||||
Phase 13.3 (2026-03-26): 初始實作
|
||||
版本: v2.0
|
||||
建立: 2026-03-26 (台北時區)
|
||||
建立者: Claude Code
|
||||
最後修改: 2026-03-26 (台北時區)
|
||||
修改者: Claude Code
|
||||
|
||||
變更紀錄:
|
||||
| 版本 | 日期 | 執行者 | 變更內容 |
|
||||
|------|------|--------|----------|
|
||||
| v1.0 | 2026-03-26 | Claude Code | 初始實作 (舊版 IntentType) |
|
||||
| v2.0 | 2026-03-26 | Claude Code | Phase 13.3 #85 升級 (四大核心+輔助意圖) |
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import Protocol, runtime_checkable
|
||||
|
||||
import structlog
|
||||
|
||||
from src.services.model_registry import get_model_registry
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 意圖類型定義 (Phase 13.3 #85)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class IntentType(Enum):
|
||||
"""意圖類型"""
|
||||
"""
|
||||
K8s 操作意圖類型
|
||||
|
||||
ALERT_TRIAGE = "alert_triage" # 告警分流/處理
|
||||
DEPLOYMENT = "deployment" # 部署操作 (kubectl, rollout)
|
||||
QUERY = "query" # 資訊查詢 (狀態, 日誌)
|
||||
MAINTENANCE = "maintenance" # 維運操作 (重啟, 擴容)
|
||||
CODE_REVIEW = "code_review" # 程式碼審查
|
||||
UNKNOWN = "unknown"
|
||||
四大核心意圖:
|
||||
- RESTART: 重啟 Pod/Deployment/StatefulSet
|
||||
- SCALE: 擴縮容、HPA 調整
|
||||
- CONFIG: ConfigMap/Secret/ENV 變更
|
||||
- DIAGNOSE: 日誌查詢、健康檢查、RCA
|
||||
|
||||
輔助意圖:
|
||||
- DELETE: 刪除資源(高風險)
|
||||
- ROLLBACK: 回滾版本
|
||||
- UNKNOWN: 無法判斷
|
||||
|
||||
舊版兼容 (已棄用,映射到新意圖):
|
||||
- ALERT_TRIAGE → DIAGNOSE
|
||||
- DEPLOYMENT → CONFIG
|
||||
- QUERY → DIAGNOSE
|
||||
- MAINTENANCE → RESTART
|
||||
- CODE_REVIEW → DIAGNOSE
|
||||
"""
|
||||
|
||||
# 四大核心意圖
|
||||
RESTART = "restart" # 重啟 Pod/Deployment/StatefulSet
|
||||
SCALE = "scale" # 擴縮容、HPA 調整
|
||||
CONFIG = "config" # ConfigMap/Secret/ENV 變更
|
||||
DIAGNOSE = "diagnose" # 日誌查詢、健康檢查、RCA
|
||||
|
||||
# 輔助意圖
|
||||
DELETE = "delete" # 刪除資源(高風險)
|
||||
ROLLBACK = "rollback" # 回滾版本
|
||||
UNKNOWN = "unknown" # 無法判斷
|
||||
|
||||
# 舊版兼容 (棄用,保留向後兼容)
|
||||
ALERT_TRIAGE = "alert_triage" # → DIAGNOSE
|
||||
DEPLOYMENT = "deployment" # → CONFIG
|
||||
QUERY = "query" # → DIAGNOSE
|
||||
MAINTENANCE = "maintenance" # → RESTART
|
||||
CODE_REVIEW = "code_review" # → DIAGNOSE
|
||||
|
||||
|
||||
# 關鍵字映射 (優先匹配,0ms)
|
||||
# Mapping from legacy (deprecated) intents to their current equivalents.
LEGACY_INTENT_MAP: dict[IntentType, IntentType] = {
    IntentType.ALERT_TRIAGE: IntentType.DIAGNOSE,
    IntentType.DEPLOYMENT: IntentType.CONFIG,
    IntentType.QUERY: IntentType.DIAGNOSE,
    IntentType.MAINTENANCE: IntentType.RESTART,
    IntentType.CODE_REVIEW: IntentType.DIAGNOSE,
}
|
||||
|
||||
|
||||
def normalize_intent(intent: IntentType) -> IntentType:
    """Normalize an intent by mapping legacy intents to current ones.

    Args:
        intent: The original intent.

    Returns:
        The mapped intent when ``intent`` is a key of ``LEGACY_INTENT_MAP``;
        otherwise ``intent`` itself, unchanged.
    """
    return LEGACY_INTENT_MAP.get(intent, intent)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 風險等級定義
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class RiskLevel(Enum):
    """Risk level associated with an intent."""

    LOW = "low"  # read-only operations (DIAGNOSE)
    MEDIUM = "medium"  # reversible operations (RESTART, SCALE, ROLLBACK)
    HIGH = "high"  # configuration changes (CONFIG)
    CRITICAL = "critical"  # irreversible operations (DELETE)
|
||||
|
||||
|
||||
# Risk level assigned to each intent.
INTENT_RISK_MAP: dict[IntentType, RiskLevel] = {
    IntentType.DIAGNOSE: RiskLevel.LOW,
    IntentType.RESTART: RiskLevel.MEDIUM,
    IntentType.SCALE: RiskLevel.MEDIUM,
    IntentType.ROLLBACK: RiskLevel.MEDIUM,
    IntentType.CONFIG: RiskLevel.HIGH,
    IntentType.DELETE: RiskLevel.CRITICAL,
    IntentType.UNKNOWN: RiskLevel.MEDIUM,
    # Legacy intents (kept for backward compatibility); each carries the same
    # risk level as the intent it maps to in LEGACY_INTENT_MAP.
    IntentType.ALERT_TRIAGE: RiskLevel.LOW,
    IntentType.DEPLOYMENT: RiskLevel.HIGH,
    IntentType.QUERY: RiskLevel.LOW,
    IntentType.MAINTENANCE: RiskLevel.MEDIUM,
    IntentType.CODE_REVIEW: RiskLevel.LOW,
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 關鍵字規則引擎 (方案 A, < 10ms)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
# 核心意圖關鍵字映射
|
||||
INTENT_KEYWORDS: dict[IntentType, list[str]] = {
|
||||
IntentType.ALERT_TRIAGE: [
|
||||
"alert", "告警", "警報", "異常", "error", "critical", "warning",
|
||||
"高負載", "high cpu", "memory", "oom", "crash", "down",
|
||||
# 四大核心意圖
|
||||
IntentType.RESTART: [
|
||||
# 英文
|
||||
"restart",
|
||||
"reboot",
|
||||
"recreate",
|
||||
"kill",
|
||||
"delete pod",
|
||||
"rollout restart",
|
||||
# 中文
|
||||
"重啟",
|
||||
"重新啟動",
|
||||
"重建",
|
||||
"刪除 pod",
|
||||
"殺掉",
|
||||
],
|
||||
IntentType.DEPLOYMENT: [
|
||||
"deploy", "部署", "rollout", "kubectl apply", "helm", "release",
|
||||
"版本", "upgrade", "更新", "上線",
|
||||
IntentType.SCALE: [
|
||||
# 英文
|
||||
"scale",
|
||||
"replica",
|
||||
"hpa",
|
||||
"autoscale",
|
||||
"scale up",
|
||||
"scale down",
|
||||
"horizontal pod autoscaler",
|
||||
# 中文
|
||||
"擴容",
|
||||
"縮容",
|
||||
"擴縮",
|
||||
"副本",
|
||||
"水平擴展",
|
||||
],
|
||||
IntentType.QUERY: [
|
||||
"查詢", "狀態", "status", "describe", "get", "list", "日誌", "log",
|
||||
"哪個", "什麼", "how many", "多少",
|
||||
IntentType.CONFIG: [
|
||||
# 英文
|
||||
"configmap",
|
||||
"secret",
|
||||
"env",
|
||||
"environment",
|
||||
"config",
|
||||
"setting",
|
||||
"configuration",
|
||||
"kubectl apply",
|
||||
"helm upgrade",
|
||||
# 中文
|
||||
"配置",
|
||||
"設定",
|
||||
"環境變數",
|
||||
"部署",
|
||||
"更新配置",
|
||||
],
|
||||
IntentType.MAINTENANCE: [
|
||||
"restart", "重啟", "scale", "擴容", "縮容", "rollback", "回滾",
|
||||
"維護", "maintenance", "patch", "修補",
|
||||
IntentType.DIAGNOSE: [
|
||||
# 英文
|
||||
"log",
|
||||
"logs",
|
||||
"describe",
|
||||
"get",
|
||||
"status",
|
||||
"health",
|
||||
"check",
|
||||
"debug",
|
||||
"trace",
|
||||
"diagnose",
|
||||
"rca",
|
||||
"root cause",
|
||||
"investigate",
|
||||
"why",
|
||||
"what happened",
|
||||
# 中文
|
||||
"日誌",
|
||||
"查看",
|
||||
"檢查",
|
||||
"狀態",
|
||||
"健康",
|
||||
"診斷",
|
||||
"原因",
|
||||
"為什麼",
|
||||
"什麼問題",
|
||||
"分析",
|
||||
],
|
||||
IntentType.CODE_REVIEW: [
|
||||
"review", "審查", "pr", "pull request", "commit", "diff",
|
||||
"程式碼", "code", "merge",
|
||||
# 輔助意圖
|
||||
IntentType.DELETE: [
|
||||
# 英文
|
||||
"delete",
|
||||
"remove",
|
||||
"destroy",
|
||||
"kubectl delete",
|
||||
"helm uninstall",
|
||||
"drop",
|
||||
# 中文
|
||||
"刪除",
|
||||
"移除",
|
||||
"銷毀",
|
||||
"清除",
|
||||
],
|
||||
IntentType.ROLLBACK: [
|
||||
# 英文
|
||||
"rollback",
|
||||
"rollout undo",
|
||||
"revert",
|
||||
"previous version",
|
||||
"last version",
|
||||
# 中文
|
||||
"回滾",
|
||||
"回復",
|
||||
"還原",
|
||||
"上一版",
|
||||
"前一版",
|
||||
],
|
||||
}
|
||||
|
||||
# 告警關鍵字 (強化 DIAGNOSE 分類)
|
||||
ALERT_KEYWORDS: list[str] = [
|
||||
"alert",
|
||||
"alerting",
|
||||
"firing",
|
||||
"告警",
|
||||
"警報",
|
||||
"異常",
|
||||
"error",
|
||||
"critical",
|
||||
"warning",
|
||||
"high cpu",
|
||||
"high memory",
|
||||
"oom",
|
||||
"crash",
|
||||
"down",
|
||||
"timeout",
|
||||
"failed",
|
||||
"unhealthy",
|
||||
]
|
||||
|
||||
# 資源類型關鍵字 (用於上下文判斷)
|
||||
RESOURCE_KEYWORDS: dict[str, list[str]] = {
|
||||
"pod": ["pod", "pods", "po"],
|
||||
"deployment": ["deployment", "deployments", "deploy"],
|
||||
"statefulset": ["statefulset", "statefulsets", "sts"],
|
||||
"daemonset": ["daemonset", "daemonsets", "ds"],
|
||||
"service": ["service", "services", "svc"],
|
||||
"configmap": ["configmap", "configmaps", "cm"],
|
||||
"secret": ["secret", "secrets"],
|
||||
"ingress": ["ingress", "ingresses", "ing"],
|
||||
"namespace": ["namespace", "namespaces", "ns"],
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 分類結果
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@dataclass
class IntentResult:
    """Outcome of classifying one piece of text.

    ``risk_level`` is always recomputed from ``intent`` after
    initialisation, so a value passed to the constructor is overwritten.
    """

    intent: IntentType  # classified intent
    confidence: float  # confidence (0.0-1.0)
    method: str  # classification method (keyword/llm)
    risk_level: RiskLevel = RiskLevel.MEDIUM
    matched_keywords: list[str] = field(default_factory=list)
    detected_resources: list[str] = field(default_factory=list)
    reasoning: str = ""

    def __post_init__(self):
        """Derive the risk level from the intent (MEDIUM when unmapped)."""
        derived_risk = INTENT_RISK_MAP.get(self.intent, RiskLevel.MEDIUM)
        self.risk_level = derived_risk
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Protocol 介面 (支援 DI)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@runtime_checkable
class IIntentClassifier(Protocol):
    """Structural interface for intent classifiers (enables dependency injection)."""

    async def classify(self, text: str) -> IntentResult:
        """Classify ``text`` asynchronously and return the result."""
        ...

    def classify_sync(self, text: str) -> IntentResult:
        """Classify ``text`` synchronously and return the result."""
        ...
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 實作
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class IntentClassifier:
    """
    Intent classifier for K8s operations.

    Two-stage classification strategy:
        1. Plan A: rule engine (keyword matching, target < 10ms)
        2. Plan B: lightweight LLM (qwen2.5:1b, target < 100ms) as fallback

    Usage:
        classifier = get_intent_classifier()
        result = await classifier.classify("重啟 api-server pod")
        # IntentResult(intent=RESTART, confidence=0.95, method='keyword')
    """

    # Small, low-latency model; used when the model registry lookup fails.
    MODEL = "qwen2.5:1b"
    # LLM fallback model, resolved lazily through the model registry.
    _llm_model: str | None = None

    def __init__(self):
        # Keyword-classification results keyed by the normalized input text.
        self._keyword_cache: dict[str, IntentResult] = {}
        self._cache_max_size = 1000  # maximum number of cached entries

    @property
    def llm_model(self) -> str:
        """Return the LLM fallback model name, resolving it on first access."""
        if self._llm_model is None:
            try:
                registry = get_model_registry()
                self._llm_model = registry.get_model("ollama", "intent")
            except Exception as exc:
                # Best effort: the classifier must stay usable without the
                # registry, but the failure should be visible in the logs.
                logger.warning(
                    "intent_llm_model_lookup_failed",
                    error=str(exc),
                    fallback=self.MODEL,
                )
                self._llm_model = self.MODEL
        return self._llm_model

    async def classify(self, text: str) -> IntentResult:
        """
        Classify the intent of ``text`` (async).

        Args:
            text: User input or alert content.

        Returns:
            IntentResult: The keyword result when its confidence is at least
            0.7; otherwise whichever of the LLM result and the keyword result
            has the higher confidence (the keyword result wins ties).
        """
        text_lower = text.lower().strip()

        # Stage 1: fast rule-engine match (< 10ms)
        result = self._keyword_classify(text_lower)
        if result.confidence >= 0.7:  # confidence threshold
            logger.debug(
                "intent_classified_by_keyword",
                intent=result.intent.value,
                confidence=result.confidence,
                matched_keywords=result.matched_keywords,
                text_preview=text[:50],
            )
            return result

        # Stage 2: LLM classification (< 100ms)
        llm_result = await self._llm_classify(text_lower)
        if llm_result.confidence > result.confidence:
            logger.debug(
                "intent_classified_by_llm",
                intent=llm_result.intent.value,
                confidence=llm_result.confidence,
                text_preview=text[:50],
            )
            return llm_result

        # Fall back to the rule-engine result.
        logger.debug(
            "intent_classified_fallback",
            intent=result.intent.value,
            confidence=result.confidence,
            text_preview=text[:50],
        )
        return result

    def classify_sync(self, text: str) -> IntentResult:
        """
        Classify the intent of ``text`` synchronously (keyword matching only).

        Args:
            text: User input or alert content.

        Returns:
            IntentResult: The rule-engine classification result.
        """
        return self._keyword_classify(text.lower().strip())

    @staticmethod
    def _mentions_resource(text: str, alias: str) -> bool:
        """Return True when ``alias`` appears in the lower-cased ``text``.

        Aliases of three characters or fewer ("po", "ds", "ing", ...) must be
        standalone ASCII tokens; plain substring matching would report a
        daemonset for "pods" and an ingress for any word ending in "ing".
        Longer aliases keep substring matching.
        """
        if len(alias) > 3:
            return alias in text
        # Only ASCII letters/digits count as token characters, so an alias
        # directly adjacent to CJK text (e.g. "重啟pod") still matches.
        pattern = rf"(?<![a-z0-9]){re.escape(alias)}(?![a-z0-9])"
        return re.search(pattern, text) is not None

    def _keyword_classify(self, text: str) -> IntentResult:
        """
        Rule-engine classification (plan A).

        Target latency: < 10ms

        Args:
            text: Input text, already lower-cased.

        Returns:
            IntentResult: Classification result. The same object is stored in
            the cache and returned again for identical input.
        """
        # Cache lookup. The key is the full text: a truncated key would make
        # distinct inputs that share a prefix return each other's results.
        cache_key = text
        if cache_key in self._keyword_cache:
            return self._keyword_cache[cache_key]

        # Score every intent: one point per keyword found, plus one bonus
        # point when the keyword also matches as a whole word.
        scores: dict[IntentType, tuple[int, list[str]]] = {}
        for intent, keywords in INTENT_KEYWORDS.items():
            score = 0
            matched: list[str] = []
            for keyword in keywords:
                if keyword in text:
                    score += 1
                    matched.append(keyword)
                    if re.search(rf"\b{re.escape(keyword)}\b", text):
                        score += 1
            if score > 0:
                scores[intent] = (score, matched)

        # Alert content strengthens DIAGNOSE when nothing else suggested it.
        is_alert = any(kw in text for kw in ALERT_KEYWORDS)
        if is_alert and IntentType.DIAGNOSE not in scores:
            scores[IntentType.DIAGNOSE] = (1, ["(alert_detected)"])

        # Detect which resource kinds the text mentions.
        detected_resources: list[str] = [
            resource_type
            for resource_type, aliases in RESOURCE_KEYWORDS.items()
            if any(self._mentions_resource(text, alias) for alias in aliases)
        ]

        if not scores:
            result = IntentResult(
                intent=IntentType.UNKNOWN,
                confidence=0.0,
                method="keyword",
                matched_keywords=[],
                detected_resources=detected_resources,
                reasoning="無匹配關鍵字",
            )
        else:
            # Pick the highest-scoring intent.
            best_intent = max(scores, key=lambda k: scores[k][0])
            best_score, matched_keywords = scores[best_intent]

            # Confidence: 0.5 base plus the fraction of the maximum possible
            # score (two points per keyword), capped at 1.0.
            max_possible = len(INTENT_KEYWORDS.get(best_intent, [])) * 2
            confidence = min(1.0, best_score / max(max_possible, 1) + 0.5)

            # A close runner-up means the text is ambiguous: lower confidence.
            if len(scores) > 1:
                second_best_score = sorted(
                    [s[0] for s in scores.values()], reverse=True
                )[1]
                if second_best_score > best_score * 0.7:
                    confidence *= 0.8

            result = IntentResult(
                intent=best_intent,
                confidence=round(confidence, 2),
                method="keyword",
                matched_keywords=matched_keywords,
                detected_resources=detected_resources,
                reasoning=f"匹配關鍵字: {', '.join(matched_keywords)}",
            )

        # Bounded cache: when full, drop the oldest half (insertion order).
        if len(self._keyword_cache) >= self._cache_max_size:
            keys = list(self._keyword_cache.keys())
            for k in keys[: len(keys) // 2]:
                del self._keyword_cache[k]

        self._keyword_cache[cache_key] = result
        return result

    async def _llm_classify(self, text: str) -> IntentResult:
        """
        LLM classification (plan B).

        Target latency: < 100ms (using qwen2.5:1b)

        Args:
            text: Input text, already lower-cased.

        Returns:
            IntentResult: Classification result.

        Note:
            Currently always returns UNKNOWN with confidence 0.3; to be
            enabled once Ollama qwen2.5:1b is deployed.
        """
        # TODO: integrate Ollama qwen2.5:1b (Phase 13.4); the plan is to send
        # `text` to the Ollama API. Until then return a low-confidence UNKNOWN.
        del text  # reserved for the LLM call; avoids an unused-parameter warning
        return IntentResult(
            intent=IntentType.UNKNOWN,
            confidence=0.3,
            method="llm",
            matched_keywords=[],
            detected_resources=[],
            reasoning="LLM 分類尚未啟用",
        )

    def get_supported_intents(self) -> list[dict]:
        """
        List the supported intents.

        Returns:
            One dict per intent with its value, description, risk level and
            up to five sample keywords (none for UNKNOWN).
        """
        catalog = [
            (IntentType.RESTART, "重啟 Pod/Deployment/StatefulSet", RiskLevel.MEDIUM),
            (IntentType.SCALE, "擴縮容、HPA 調整", RiskLevel.MEDIUM),
            (IntentType.CONFIG, "ConfigMap/Secret/ENV 變更", RiskLevel.HIGH),
            (IntentType.DIAGNOSE, "日誌查詢、健康檢查、RCA", RiskLevel.LOW),
            (IntentType.DELETE, "刪除資源(高風險)", RiskLevel.CRITICAL),
            (IntentType.ROLLBACK, "回滾版本", RiskLevel.MEDIUM),
            (IntentType.UNKNOWN, "無法判斷意圖", RiskLevel.MEDIUM),
        ]
        intents = [
            {
                "intent": intent.value,
                "description": description,
                "risk_level": risk.value,
                "keywords_sample": (
                    []
                    if intent is IntentType.UNKNOWN
                    else INTENT_KEYWORDS[intent][:5]
                ),
            }
            for intent, description, risk in catalog
        ]
        return intents
|
||||
|
||||
|
||||
# 單例
|
||||
# =============================================================================
|
||||
# Singleton
|
||||
# =============================================================================
|
||||
|
||||
_classifier: IntentClassifier | None = None
|
||||
|
||||
|
||||
@@ -145,3 +604,29 @@ def get_intent_classifier() -> IntentClassifier:
|
||||
if _classifier is None:
|
||||
_classifier = IntentClassifier()
|
||||
return _classifier
|
||||
|
||||
|
||||
def reset_intent_classifier() -> None:
    """Discard the module-level classifier singleton (intended for tests)."""
    global _classifier
    _classifier = None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Convenience Functions
|
||||
# =============================================================================
|
||||
|
||||
|
||||
async def classify_intent(text: str) -> IntentResult:
    """Convenience wrapper: classify ``text`` with the shared classifier (async)."""
    classifier = get_intent_classifier()
    return await classifier.classify(text)
|
||||
|
||||
|
||||
def classify_intent_sync(text: str) -> IntentResult:
    """Convenience wrapper: classify ``text`` with the shared classifier (sync)."""
    classifier = get_intent_classifier()
    return classifier.classify_sync(text)
|
||||
|
||||
|
||||
def get_intent_risk(intent: IntentType) -> RiskLevel:
    """Convenience wrapper: look up the risk level of ``intent`` (MEDIUM if unmapped)."""
    risk = INTENT_RISK_MAP.get(intent, RiskLevel.MEDIUM)
    return risk
|
||||
|
||||
264
apps/api/src/services/model_registry.py
Normal file
264
apps/api/src/services/model_registry.py
Normal file
@@ -0,0 +1,264 @@
|
||||
"""
|
||||
Model Registry - Phase 12 P1 修復
|
||||
=================================
|
||||
集中管理 AI 模型配置,消除 hardcode 模型名稱
|
||||
|
||||
功能:
|
||||
- 從 models.json 讀取配置
|
||||
- 提供 get_model(provider, purpose) 方法
|
||||
- Singleton 模式
|
||||
- 支援依賴注入測試
|
||||
|
||||
版本: v1.0
|
||||
建立: 2026-03-26 23:00 (台北時區)
|
||||
建立者: Claude Code
|
||||
最後修改: 2026-03-26 23:00 (台北時區)
|
||||
修改者: Claude Code
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Protocol
|
||||
|
||||
import structlog
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Interface (支援 DI 測試)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class IModelRegistry(Protocol):
    """Structural interface of the model registry (enables dependency injection)."""

    def get_model(self, provider: str, purpose: str = "default") -> str:
        """Return the model name for ``provider`` and ``purpose``."""
        ...

    def get_fallback_order(self) -> list[str]:
        """Return the provider fallback order."""
        ...

    def get_model_by_complexity(self, complexity: int) -> str:
        """Return the recommended model for a complexity score."""
        ...

    def get_provider_config(self, provider: str) -> dict:
        """Return the full configuration of ``provider``."""
        ...
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Implementation
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class ModelRegistry:
    """
    Model registry implementation.

    Reads models.json and exposes a single lookup interface for model names.
    When the file is missing, unreadable or malformed, a built-in default
    configuration is used instead.

    Usage:
        registry = get_model_registry()
        model = registry.get_model("ollama", "rca")  # -> "qwen2.5:7b-instruct"
    """

    # Last-resort model per provider; single source for every built-in fallback.
    _FALLBACK_MODELS = {
        "ollama": "qwen2.5:7b-instruct",
        "gemini": "gemini-1.5-flash",
        "claude": "claude-3-haiku-20240307",
    }
    # Built-in ollama model for the "summary" purpose.
    _FALLBACK_SUMMARY_MODEL = "llama3.2:3b"
    # Built-in provider fallback order.
    _FALLBACK_ORDER = ("ollama", "gemini", "claude")

    def __init__(self, config_path: Path | str | None = None):
        """
        Initialize the registry.

        Args:
            config_path: Path to models.json; None selects the default path
                three directories above this file.
        """
        if config_path is None:
            # Default path: apps/api/models.json
            config_path = Path(__file__).parent.parent.parent / "models.json"
        elif isinstance(config_path, str):
            config_path = Path(config_path)

        self._config_path = config_path
        self._config: dict = {}
        self._load_config()

        # Complexity -> model mapping (from the config, else built-in defaults).
        self._complexity_map = self._build_complexity_map()

    def _load_config(self) -> None:
        """Load models.json, falling back to the built-in defaults on any failure."""
        path = str(self._config_path)
        try:
            # models.json may contain non-ASCII text; do not depend on the locale.
            with open(self._config_path, encoding="utf-8") as f:
                loaded = json.load(f)
        except FileNotFoundError:
            logger.warning(
                "models_json_not_found",
                path=path,
                using="fallback_defaults",
            )
            self._config = self._get_default_config()
            return
        except ValueError as e:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            logger.error("models_json_parse_error", path=path, error=str(e))
            self._config = self._get_default_config()
            return
        except OSError as e:
            # Permission denied, path is a directory, etc.
            logger.error("models_json_read_error", path=path, error=str(e))
            self._config = self._get_default_config()
            return

        if not isinstance(loaded, dict):
            # Valid JSON with a non-object root would break every .get() below.
            logger.error(
                "models_json_parse_error",
                path=path,
                error=f"expected a JSON object, got {type(loaded).__name__}",
            )
            self._config = self._get_default_config()
            return

        self._config = loaded
        logger.info(
            "model_registry_loaded",
            path=path,
            providers=list(self._config.get("providers", {}).keys()),
        )

    def _get_default_config(self) -> dict:
        """Return the built-in fallback configuration."""

        def _uniform(model: str) -> dict:
            # Same model for every purpose.
            return {"models": {"default": model, "rca": model, "summary": model}}

        ollama = _uniform(self._FALLBACK_MODELS["ollama"])
        ollama["models"]["summary"] = self._FALLBACK_SUMMARY_MODEL
        return {
            "default_provider": "ollama",
            "fallback_order": list(self._FALLBACK_ORDER),
            "providers": {
                "ollama": ollama,
                "gemini": _uniform(self._FALLBACK_MODELS["gemini"]),
                "claude": _uniform(self._FALLBACK_MODELS["claude"]),
            },
        }

    def _build_complexity_map(self) -> dict[int, str]:
        """Build the complexity -> model mapping.

        Levels 1-3 map to ollama model names; levels 4 and 5 map to the
        provider names "gemini" and "claude".
        """
        ollama_models = self._config.get("providers", {}).get("ollama", {}).get("models", {})
        default_model = ollama_models.get("default", self._FALLBACK_MODELS["ollama"])
        summary_model = ollama_models.get("summary", self._FALLBACK_SUMMARY_MODEL)

        return {
            1: summary_model,  # simple task, fast response
            2: default_model,  # medium task
            3: default_model,  # complex task
            4: "gemini",  # needs cloud capability
            5: "claude",  # extremely complex, needs the strongest model
        }

    def get_model(self, provider: str, purpose: str = "default") -> str:
        """
        Return a model name.

        Args:
            provider: Provider name (ollama, gemini, claude).
            purpose: Purpose key (default, rca, summary).

        Returns:
            The model configured for the purpose, else the provider's
            "default" model, else the built-in fallback for the provider.
            For an unknown provider the provider name itself is returned.
        """
        providers = self._config.get("providers", {})
        provider_config = providers.get(provider, {})
        models = provider_config.get("models", {})

        # Prefer the purpose-specific entry, then the provider default.
        model = models.get(purpose) or models.get("default")

        if not model:
            # Last resort
            model = self._FALLBACK_MODELS.get(provider, provider)
            logger.warning(
                "model_not_found_using_fallback",
                provider=provider,
                purpose=purpose,
                fallback=model,
            )

        return model

    def get_fallback_order(self) -> list[str]:
        """Return the provider fallback order as a new list."""
        # Copy so callers cannot mutate the loaded configuration.
        return list(self._config.get("fallback_order", self._FALLBACK_ORDER))

    def get_model_by_complexity(self, complexity: int) -> str:
        """
        Return the recommended model for a complexity score.

        Args:
            complexity: Complexity score (1-5); values outside the range are clamped.

        Returns:
            The mapped entry for the clamped score.
        """
        level = max(1, min(5, complexity))
        model = self._complexity_map.get(level)
        if model is None:
            # Only reachable for non-integer scores. Resolved lazily so the
            # normal path never triggers get_model()'s fallback warning.
            model = self.get_model("ollama", "default")
        return model

    def get_provider_config(self, provider: str) -> dict:
        """Return the full configuration of ``provider`` ({} when unknown)."""
        return self._config.get("providers", {}).get(provider, {})

    def get_default_provider(self) -> str:
        """Return the default provider ("ollama" when not configured)."""
        return self._config.get("default_provider", "ollama")

    def get_provider_options(self, provider: str) -> dict:
        """Return the "options" section of ``provider`` ({} when absent)."""
        provider_config = self.get_provider_config(provider)
        return provider_config.get("options", {})

    def get_provider_timeout(self, provider: str) -> int:
        """Return the timeout of ``provider`` in seconds (30 when not configured)."""
        provider_config = self.get_provider_config(provider)
        return provider_config.get("timeout_seconds", 30)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Singleton
|
||||
# =============================================================================
|
||||
|
||||
_registry: ModelRegistry | None = None
|
||||
|
||||
|
||||
def get_model_registry() -> ModelRegistry:
    """Return the shared ModelRegistry, creating it on first use."""
    global _registry
    if _registry is not None:
        return _registry
    _registry = ModelRegistry()
    return _registry
|
||||
|
||||
|
||||
def reset_model_registry() -> None:
    """Discard the module-level registry singleton (intended for tests)."""
    global _registry
    _registry = None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Convenience Functions
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def get_model(provider: str, purpose: str = "default") -> str:
    """Convenience wrapper: look up a model name in the shared registry."""
    registry = get_model_registry()
    return registry.get_model(provider, purpose)
|
||||
|
||||
|
||||
def get_model_by_complexity(complexity: int) -> str:
    """Convenience wrapper: pick a model by complexity via the shared registry."""
    registry = get_model_registry()
    return registry.get_model_by_complexity(complexity)
|
||||
@@ -418,6 +418,140 @@ class SignOzClient:
|
||||
},
|
||||
}
|
||||
|
||||
# =========================================================================
|
||||
# Log Query (Phase 13.1 #77)
|
||||
# =========================================================================
|
||||
|
||||
async def get_logs(
    self,
    service_name: str | None = None,
    severity: str | None = None,
    search_text: str | None = None,
    time_window_minutes: int = 30,
    limit: int = 100,
) -> list[dict]:
    """
    Query logs from SignOz/ClickHouse (Phase 13.1 #77).

    SignOz stores logs in the signoz_logs.distributed_logs table.
    Schema: timestamp, severity_text, body, resources, attributes

    Args:
        service_name: Service name (filters resources['service.name']).
        severity: Log level (ERROR, WARN, INFO, DEBUG); several levels may be
            given comma-separated, e.g. 'ERROR,WARN'.
        search_text: Text to search for in the log body. ``%`` and ``_`` act
            as LIKE wildcards.
        time_window_minutes: Look-back window in minutes.
        limit: Maximum number of rows returned.

    Returns:
        list[dict]: Log records, newest first.

    Raises:
        ValueError / TypeError: If ``limit`` is not convertible to an integer.
    """

    def _sql_literal(value: str) -> str:
        """Render ``value`` as a single-quoted ClickHouse string literal."""
        # ClickHouse honours backslash escapes inside string literals, so the
        # backslash must be escaped before the quote; doubling quotes alone
        # can be defeated by a trailing backslash.
        escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"

    now = datetime.now(UTC)
    start_ns = int((now - timedelta(minutes=time_window_minutes)).timestamp() * 1_000_000_000)
    end_ns = int(now.timestamp() * 1_000_000_000)

    # Build the WHERE conditions.
    conditions = [
        f"timestamp >= {start_ns}",
        f"timestamp <= {end_ns}",
    ]

    if service_name:
        # SignOz stores service.name in the resources column.
        conditions.append(f"resources['service.name'] = {_sql_literal(service_name)}")

    if severity:
        # Multiple levels are supported (e.g. 'ERROR,WARN'); blank items
        # such as a trailing comma are ignored.
        levels = [s.strip().upper() for s in severity.split(",")]
        levels = [s for s in levels if s]
        if levels:
            severity_list = ", ".join(_sql_literal(s) for s in levels)
            conditions.append(f"severity_text IN ({severity_list})")

    if search_text:
        # Escape the LIKE escape character first so a backslash in the search
        # text matches literally, then quote the whole pattern as a literal.
        like_text = search_text.replace("\\", "\\\\")
        conditions.append(f"body LIKE {_sql_literal(f'%{like_text}%')}")

    where_clause = " AND ".join(conditions)

    # Interpolate only a validated non-negative integer as the row limit.
    row_limit = max(0, int(limit))

    query = f"""
        SELECT
            timestamp,
            severity_text,
            body,
            resources,
            attributes,
            trace_id,
            span_id
        FROM signoz_logs.distributed_logs
        WHERE {where_clause}
        ORDER BY timestamp DESC
        LIMIT {row_limit}
    """

    results = await self._query_clickhouse(query)

    # Format the rows.
    formatted_logs = []
    for row in results:
        # A null or non-mapping resources value must not break formatting.
        resources = row.get("resources")
        if not isinstance(resources, dict):
            resources = {}
        formatted_logs.append({
            "timestamp": row.get("timestamp"),
            "severity": row.get("severity_text", "UNKNOWN"),
            "message": row.get("body", ""),
            "service": resources.get("service.name", "unknown"),
            "trace_id": row.get("trace_id", ""),
            "span_id": row.get("span_id", ""),
            "attributes": row.get("attributes", {}),
        })

    logger.info(
        "signoz_logs_query_completed",
        service_name=service_name,
        severity=severity,
        result_count=len(formatted_logs),
        time_window_minutes=time_window_minutes,
    )

    return formatted_logs
|
||||
|
||||
async def get_error_logs_summary(
|
||||
self,
|
||||
service_name: str,
|
||||
time_window_minutes: int = 60,
|
||||
) -> dict:
|
||||
"""
|
||||
取得錯誤日誌摘要 (Phase 13.1 #77 - CI 診斷用)
|
||||
|
||||
統計各類錯誤的出現次數和代表性訊息
|
||||
"""
|
||||
now = datetime.now(UTC)
|
||||
start_ns = int((now - timedelta(minutes=time_window_minutes)).timestamp() * 1_000_000_000)
|
||||
end_ns = int(now.timestamp() * 1_000_000_000)
|
||||
|
||||
query = f"""
|
||||
SELECT
|
||||
severity_text,
|
||||
count() as count,
|
||||
any(body) as sample_message
|
||||
FROM signoz_logs.distributed_logs
|
||||
WHERE
|
||||
timestamp >= {start_ns}
|
||||
AND timestamp <= {end_ns}
|
||||
AND resources['service.name'] = '{service_name}'
|
||||
AND severity_text IN ('ERROR', 'FATAL', 'CRITICAL')
|
||||
GROUP BY severity_text
|
||||
ORDER BY count DESC
|
||||
LIMIT 10
|
||||
"""
|
||||
|
||||
results = await self._query_clickhouse(query)
|
||||
|
||||
return {
|
||||
"service_name": service_name,
|
||||
"time_window_minutes": time_window_minutes,
|
||||
"error_summary": results,
|
||||
"total_errors": sum(r.get("count", 0) for r in results),
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Singleton
|
||||
|
||||
676
apps/api/src/services/token_counter.py
Normal file
676
apps/api/src/services/token_counter.py
Normal file
@@ -0,0 +1,676 @@
|
||||
"""
|
||||
Token Counter Service - Phase 13.3 #88 AI Token Dashboard
|
||||
=========================================================
|
||||
Token 用量監控,整合 SignOz OTEL Metrics + Langfuse
|
||||
|
||||
功能:
|
||||
- 記錄每次 LLM 呼叫的 input/output tokens
|
||||
- 按 provider 分類統計
|
||||
- 成本估算 (Gemini/Claude 有成本,Ollama 免費)
|
||||
- 每日/每月 Token 預算監控
|
||||
- 超標時通知切換到本地模型
|
||||
|
||||
SignOz 指標:
|
||||
- llm.tokens.input (Counter) - 輸入 Token 數
|
||||
- llm.tokens.output (Counter) - 輸出 Token 數
|
||||
- llm.cost.usd (Counter) - 累計成本
|
||||
- llm.latency.ms (Histogram) - 延遲分佈
|
||||
- llm.requests.total (Counter) - 總請求數
|
||||
- llm.requests.failed (Counter) - 失敗請求數
|
||||
|
||||
版本: v1.0
|
||||
建立: 2026-03-26 14:30 (台北時區)
|
||||
建立者: Claude Code
|
||||
最後修改: 2026-03-26 14:30 (台北時區)
|
||||
修改者: Claude Code
|
||||
|
||||
變更紀錄:
|
||||
| 版本 | 日期 | 執行者 | 變更內容 |
|
||||
|------|------|--------|----------|
|
||||
| v1.0 | 2026-03-26 | Claude Code | Phase 13.3 #88 初始實作 |
|
||||
"""
|
||||
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import UTC, datetime, timedelta
|
||||
from typing import Protocol
|
||||
|
||||
import structlog
|
||||
from opentelemetry import metrics
|
||||
from opentelemetry.metrics import Counter, Histogram, Meter
|
||||
|
||||
from src.core.config import settings
|
||||
from src.services.langfuse_client import get_langfuse
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Constants - Cost Per 1K Tokens (USD)
|
||||
# =============================================================================
|
||||
|
||||
# Cost definitions (from models.json), in USD per 1K tokens.
COST_PER_1K_TOKENS = dict(
    ollama=0.0,  # local, free
    gemini=0.001,  # Gemini 1.5 Flash
    claude=0.008,  # Claude 3 Haiku
)

# Budget thresholds (from models.json monitoring.alerts).
DAILY_COST_THRESHOLD_USD = 5.0
MONTHLY_COST_THRESHOLD_USD = 10.0

# Token budget per provider and day.
DAILY_TOKEN_BUDGET = dict(
    gemini=100_000,  # 100K tokens per day
    claude=50_000,  # 50K tokens per day
)

# Token budget per provider and month.
MONTHLY_TOKEN_BUDGET = dict(
    gemini=2_000_000,  # 2M tokens per month
    claude=500_000,  # 500K tokens per month
)

ALERT_THRESHOLD_PERCENT = 70  # early warning at 70%
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Data Classes
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@dataclass
|
||||
class TokenUsage:
|
||||
"""單次 LLM 呼叫的 Token 使用量"""
|
||||
|
||||
input_tokens: int
|
||||
output_tokens: int
|
||||
total_tokens: int = field(init=False)
|
||||
provider: str
|
||||
model: str
|
||||
latency_ms: float = 0.0
|
||||
success: bool = True
|
||||
error_message: str | None = None
|
||||
timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))
|
||||
|
||||
def __post_init__(self):
|
||||
self.total_tokens = self.input_tokens + self.output_tokens
|
||||
|
||||
@property
|
||||
def estimated_cost_usd(self) -> float:
|
||||
"""估算成本 (USD)"""
|
||||
cost_per_1k = COST_PER_1K_TOKENS.get(self.provider.lower(), 0.0)
|
||||
return (self.total_tokens / 1000) * cost_per_1k
|
||||
|
||||
|
||||
@dataclass
|
||||
class ProviderStats:
|
||||
"""Provider 統計"""
|
||||
|
||||
provider: str
|
||||
total_input_tokens: int = 0
|
||||
total_output_tokens: int = 0
|
||||
total_requests: int = 0
|
||||
failed_requests: int = 0
|
||||
total_latency_ms: float = 0.0
|
||||
total_cost_usd: float = 0.0
|
||||
period_start: datetime = field(default_factory=lambda: datetime.now(UTC))
|
||||
|
||||
@property
|
||||
def total_tokens(self) -> int:
|
||||
return self.total_input_tokens + self.total_output_tokens
|
||||
|
||||
@property
|
||||
def success_rate(self) -> float:
|
||||
if self.total_requests == 0:
|
||||
return 100.0
|
||||
return ((self.total_requests - self.failed_requests) / self.total_requests) * 100
|
||||
|
||||
@property
|
||||
def avg_latency_ms(self) -> float:
|
||||
if self.total_requests == 0:
|
||||
return 0.0
|
||||
return self.total_latency_ms / self.total_requests
|
||||
|
||||
|
||||
@dataclass
class BudgetStatus:
    """Budget status of one provider."""

    provider: str
    # Daily figures
    daily_tokens_used: int
    daily_tokens_budget: int
    daily_cost_usd: float
    # Monthly figures
    monthly_tokens_used: int
    monthly_tokens_budget: int
    monthly_cost_usd: float
    # Evaluation flags and advice
    is_over_budget: bool = False
    alert_triggered: bool = False
    recommendation: str = ""
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Interface (Protocol for DI)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class ITokenCounter(Protocol):
    """Structural interface of the token counter (enables dependency injection)."""

    def record_usage(self, usage: TokenUsage) -> None:
        """Record the token usage of one LLM call."""
        ...

    def get_provider_stats(self, provider: str) -> ProviderStats:
        """Return the statistics of ``provider``."""
        ...

    def get_budget_status(self, provider: str) -> BudgetStatus:
        """Return the budget status of ``provider``."""
        ...

    def should_fallback_to_local(self, provider: str) -> tuple[bool, str]:
        """Tell whether calls should fall back to the local model."""
        ...
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Token Counter Implementation
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TokenCounter:
|
||||
"""
|
||||
Token 計數器 - OTEL Metrics + Langfuse 整合
|
||||
|
||||
使用 OpenTelemetry Metrics API 將指標送到 SignOz,
|
||||
同時整合 Langfuse 記錄詳細的 LLM trace。
|
||||
|
||||
Usage:
|
||||
counter = get_token_counter()
|
||||
counter.record_usage(TokenUsage(
|
||||
input_tokens=500,
|
||||
output_tokens=200,
|
||||
provider="ollama",
|
||||
model="qwen2.5:7b-instruct",
|
||||
latency_ms=1500,
|
||||
))
|
||||
"""
|
||||
|
||||
def __init__(self):
    """Create empty statistics, anchor the reset boundaries, set up OTEL metrics."""
    # In-memory aggregates keyed by provider: all-time, current day, current month.
    self._provider_stats: dict[str, ProviderStats] = {}
    self._daily_stats: dict[str, ProviderStats] = {}
    self._monthly_stats: dict[str, ProviderStats] = {}

    # Start of the current UTC day / month; compared against on every reset check.
    day_floor = datetime.now(UTC).replace(hour=0, minute=0, second=0, microsecond=0)
    self._last_daily_reset: datetime = day_floor
    month_floor = datetime.now(UTC).replace(
        day=1, hour=0, minute=0, second=0, microsecond=0
    )
    self._last_monthly_reset: datetime = month_floor

    # OTEL instrument handles; they stay None when metrics are not initialized.
    self._meter: Meter | None = None
    self._input_tokens_counter: Counter | None = None
    self._output_tokens_counter: Counter | None = None
    self._cost_counter: Counter | None = None
    self._latency_histogram: Histogram | None = None
    self._request_counter: Counter | None = None
    self._failed_counter: Counter | None = None

    self._init_metrics()
|
||||
|
||||
def _init_metrics(self) -> None:
    """Create the OTEL meter and instruments.

    Does nothing when OTEL is disabled or mock mode is on. Any failure during
    creation is logged as a warning and not raised.
    """
    if not settings.OTEL_ENABLED or settings.MOCK_MODE:
        logger.info("otel_metrics_disabled", reason="OTEL_ENABLED=false or MOCK_MODE=true")
        return

    # (attribute, metric name, description, unit), in creation order.
    counter_specs = (
        ("_input_tokens_counter", "llm.tokens.input", "LLM input tokens count", "tokens"),
        ("_output_tokens_counter", "llm.tokens.output", "LLM output tokens count", "tokens"),
        ("_cost_counter", "llm.cost.usd", "Estimated LLM cost in USD", "USD"),
        ("_request_counter", "llm.requests.total", "Total LLM requests", "requests"),
        ("_failed_counter", "llm.requests.failed", "Failed LLM requests", "requests"),
    )

    try:
        # Obtain the meter from the configured MeterProvider.
        self._meter = metrics.get_meter(
            name="awoooi.llm",
            version=settings.VERSION,
        )

        # Counters
        for attr_name, metric_name, description, unit in counter_specs:
            instrument = self._meter.create_counter(
                name=metric_name,
                description=description,
                unit=unit,
            )
            setattr(self, attr_name, instrument)

        # Histogram (latency distribution)
        self._latency_histogram = self._meter.create_histogram(
            name="llm.latency.ms",
            description="LLM request latency in milliseconds",
            unit="ms",
        )

        logger.info(
            "otel_llm_metrics_initialized",
            meter_name="awoooi.llm",
        )

    except Exception as e:
        logger.warning(
            "otel_metrics_init_failed",
            error=str(e),
        )
|
||||
|
||||
def _reset_if_needed(self) -> None:
|
||||
"""檢查並重置每日/每月統計"""
|
||||
now = datetime.now(UTC)
|
||||
|
||||
# 每日重置
|
||||
today_start = now.replace(hour=0, minute=0, second=0, microsecond=0)
|
||||
if today_start > self._last_daily_reset:
|
||||
logger.info(
|
||||
"daily_stats_reset",
|
||||
previous_date=self._last_daily_reset.isoformat(),
|
||||
)
|
||||
self._daily_stats = {}
|
||||
self._last_daily_reset = today_start
|
||||
|
||||
# 每月重置
|
||||
month_start = now.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
|
||||
if month_start > self._last_monthly_reset:
|
||||
logger.info(
|
||||
"monthly_stats_reset",
|
||||
previous_month=self._last_monthly_reset.isoformat(),
|
||||
)
|
||||
self._monthly_stats = {}
|
||||
self._last_monthly_reset = month_start
|
||||
|
||||
def _get_or_create_stats(
|
||||
self, provider: str, stats_dict: dict[str, ProviderStats]
|
||||
) -> ProviderStats:
|
||||
"""取得或建立 Provider 統計"""
|
||||
if provider not in stats_dict:
|
||||
stats_dict[provider] = ProviderStats(provider=provider)
|
||||
return stats_dict[provider]
|
||||
|
||||
def record_usage(self, usage: TokenUsage) -> None:
    """
    Record the token usage of a single LLM call.

    In order, this:
    1. Rolls the daily/monthly buckets over if a new period has started.
    2. Adds the call to the in-memory aggregates (all-time, daily, monthly).
    3. Emits OTEL counters and the latency histogram for every instrument
       that is set on this object.
    4. Logs a ``token_usage_recorded`` event and runs the budget alert check.

    No Langfuse reporting happens in this method.

    Args:
        usage: The usage record; its provider name is lower-cased before use.
    """
    self._reset_if_needed()

    provider = usage.provider.lower()
    labels = {
        "provider": provider,
        "model": usage.model,
        "environment": settings.ENVIRONMENT,
    }
    failed = not usage.success

    # In-memory aggregates: the same increments go into all three buckets.
    for bucket in (self._provider_stats, self._daily_stats, self._monthly_stats):
        entry = self._get_or_create_stats(provider, bucket)
        entry.total_input_tokens += usage.input_tokens
        entry.total_output_tokens += usage.output_tokens
        entry.total_requests += 1
        entry.total_latency_ms += usage.latency_ms
        entry.total_cost_usd += usage.estimated_cost_usd
        if failed:
            entry.failed_requests += 1

    # OTEL counters: (instrument, amount, extra condition for emitting).
    # Cost is added as a float in USD (not scaled to micro-USD) and only when positive.
    counter_updates = (
        (self._input_tokens_counter, usage.input_tokens, True),
        (self._output_tokens_counter, usage.output_tokens, True),
        (self._cost_counter, usage.estimated_cost_usd, usage.estimated_cost_usd > 0),
        (self._request_counter, 1, True),
        (self._failed_counter, 1, failed),
    )
    for instrument, amount, enabled in counter_updates:
        if instrument and enabled:
            instrument.add(amount, labels)

    # Latency distribution; non-positive latencies are not recorded.
    if self._latency_histogram and usage.latency_ms > 0:
        self._latency_histogram.record(usage.latency_ms, labels)

    # Structured log entry for this call
    logger.info(
        "token_usage_recorded",
        provider=provider,
        model=usage.model,
        input_tokens=usage.input_tokens,
        output_tokens=usage.output_tokens,
        total_tokens=usage.total_tokens,
        latency_ms=round(usage.latency_ms, 2),
        cost_usd=round(usage.estimated_cost_usd, 6),
        success=usage.success,
    )

    # Budget alert check
    self._check_budget_alert(provider)
|
||||
|
||||
def _check_budget_alert(self, provider: str) -> None:
|
||||
"""檢查預算告警"""
|
||||
status = self.get_budget_status(provider)
|
||||
|
||||
if status.alert_triggered:
|
||||
logger.warning(
|
||||
"llm_budget_alert",
|
||||
provider=provider,
|
||||
daily_usage_percent=round(
|
||||
(status.daily_tokens_used / status.daily_tokens_budget * 100)
|
||||
if status.daily_tokens_budget > 0
|
||||
else 0,
|
||||
1,
|
||||
),
|
||||
monthly_usage_percent=round(
|
||||
(status.monthly_tokens_used / status.monthly_tokens_budget * 100)
|
||||
if status.monthly_tokens_budget > 0
|
||||
else 0,
|
||||
1,
|
||||
),
|
||||
recommendation=status.recommendation,
|
||||
)
|
||||
|
||||
if status.is_over_budget:
|
||||
logger.error(
|
||||
"llm_budget_exceeded",
|
||||
provider=provider,
|
||||
daily_tokens_used=status.daily_tokens_used,
|
||||
monthly_tokens_used=status.monthly_tokens_used,
|
||||
recommendation=status.recommendation,
|
||||
)
|
||||
|
||||
def get_provider_stats(self, provider: str) -> ProviderStats:
    """Return the all-time stats entry for ``provider`` (name is lower-cased).

    An entry is created in the all-time bucket if none exists yet.
    """
    key = provider.lower()
    return self._get_or_create_stats(key, self._provider_stats)
|
||||
|
||||
def get_daily_stats(self, provider: str) -> ProviderStats:
    """Return today's stats entry for ``provider`` (name is lower-cased).

    An entry is created in the daily bucket if none exists yet.
    """
    # The rollover check may swap in a fresh daily bucket, so it runs before the lookup.
    self._reset_if_needed()
    key = provider.lower()
    return self._get_or_create_stats(key, self._daily_stats)
|
||||
|
||||
def get_monthly_stats(self, provider: str) -> ProviderStats:
    """Return this month's stats entry for ``provider`` (name is lower-cased).

    An entry is created in the monthly bucket if none exists yet.
    """
    # The rollover check may swap in a fresh monthly bucket, so it runs before the lookup.
    self._reset_if_needed()
    key = provider.lower()
    return self._get_or_create_stats(key, self._monthly_stats)
|
||||
|
||||
def get_budget_status(self, provider: str) -> BudgetStatus:
    """Build the current token-budget snapshot for ``provider``.

    Usage percentages are computed from the daily and monthly token totals
    against ``DAILY_TOKEN_BUDGET`` / ``MONTHLY_TOKEN_BUDGET``. A provider with
    no configured budget (or a budget of 0) is treated as 0% used, so it never
    triggers an alert.

    Args:
        provider: Provider name; it is lower-cased before lookup.

    Returns:
        A ``BudgetStatus`` where ``alert_triggered`` is set when either
        percentage reaches ``ALERT_THRESHOLD_PERCENT`` and ``is_over_budget``
        is set when either percentage reaches 100.
    """
    self._reset_if_needed()
    provider = provider.lower()

    daily_stats = self.get_daily_stats(provider)
    monthly_stats = self.get_monthly_stats(provider)

    daily_budget = DAILY_TOKEN_BUDGET.get(provider, 0)
    monthly_budget = MONTHLY_TOKEN_BUDGET.get(provider, 0)

    # Usage percentages (0 when no budget is configured)
    daily_usage_percent = (
        (daily_stats.total_tokens / daily_budget * 100) if daily_budget > 0 else 0
    )
    monthly_usage_percent = (
        (monthly_stats.total_tokens / monthly_budget * 100) if monthly_budget > 0 else 0
    )

    # Alert state
    alert_triggered = (
        daily_usage_percent >= ALERT_THRESHOLD_PERCENT
        or monthly_usage_percent >= ALERT_THRESHOLD_PERCENT
    )
    is_over_budget = daily_usage_percent >= 100 or monthly_usage_percent >= 100

    # Recommendation text; over-budget takes precedence over the near-limit hint.
    recommendation = ""
    if is_over_budget:
        # Plain literal: this message has no placeholders, so no f-string is needed.
        recommendation = "建議切換到本地模型 (Ollama) 以節省成本"
    elif alert_triggered:
        recommendation = f"接近預算上限 ({max(daily_usage_percent, monthly_usage_percent):.1f}%),考慮減少 {provider} 呼叫"

    return BudgetStatus(
        provider=provider,
        daily_tokens_used=daily_stats.total_tokens,
        daily_tokens_budget=daily_budget,
        daily_cost_usd=daily_stats.total_cost_usd,
        monthly_tokens_used=monthly_stats.total_tokens,
        monthly_tokens_budget=monthly_budget,
        monthly_cost_usd=monthly_stats.total_cost_usd,
        is_over_budget=is_over_budget,
        alert_triggered=alert_triggered,
        recommendation=recommendation,
    )
|
||||
|
||||
def should_fallback_to_local(self, provider: str) -> tuple[bool, str]:
    """
    Decide whether calls to ``provider`` should fall back to the local model.

    Returns:
        ``(should_fallback, reason)``. ``should_fallback`` is True only when
        the provider is over budget; a triggered alert below that level yields
        False with an advisory reason. The local provider ("ollama",
        case-insensitive) never falls back and is answered without a budget lookup.
    """
    if provider.lower() == "ollama":
        return False, "Already using local model"

    status = self.get_budget_status(provider)
    should_fallback = bool(status.is_over_budget)

    if should_fallback:
        reason = f"Budget exceeded for {provider}: {status.recommendation}"
    elif status.alert_triggered:
        # Alert raised but not over budget: advisory only, no forced fallback.
        reason = f"Near budget threshold for {provider}: {status.recommendation}"
    else:
        reason = "Budget OK"

    return should_fallback, reason
|
||||
|
||||
def get_all_stats_summary(self) -> dict:
|
||||
"""取得所有 Provider 統計摘要"""
|
||||
self._reset_if_needed()
|
||||
|
||||
summary = {
|
||||
"timestamp": datetime.now(UTC).isoformat(),
|
||||
"providers": {},
|
||||
"total": {
|
||||
"input_tokens": 0,
|
||||
"output_tokens": 0,
|
||||
"cost_usd": 0.0,
|
||||
"requests": 0,
|
||||
},
|
||||
}
|
||||
|
||||
for provider in ["ollama", "gemini", "claude"]:
|
||||
daily = self.get_daily_stats(provider)
|
||||
monthly = self.get_monthly_stats(provider)
|
||||
budget = self.get_budget_status(provider)
|
||||
|
||||
summary["providers"][provider] = {
|
||||
"daily": {
|
||||
"input_tokens": daily.total_input_tokens,
|
||||
"output_tokens": daily.total_output_tokens,
|
||||
"total_tokens": daily.total_tokens,
|
||||
"cost_usd": round(daily.total_cost_usd, 4),
|
||||
"requests": daily.total_requests,
|
||||
"success_rate": round(daily.success_rate, 1),
|
||||
"avg_latency_ms": round(daily.avg_latency_ms, 1),
|
||||
},
|
||||
"monthly": {
|
||||
"input_tokens": monthly.total_input_tokens,
|
||||
"output_tokens": monthly.total_output_tokens,
|
||||
"total_tokens": monthly.total_tokens,
|
||||
"cost_usd": round(monthly.total_cost_usd, 4),
|
||||
"requests": monthly.total_requests,
|
||||
},
|
||||
"budget": {
|
||||
"daily_budget": budget.daily_tokens_budget,
|
||||
"daily_usage_percent": round(
|
||||
(budget.daily_tokens_used / budget.daily_tokens_budget * 100)
|
||||
if budget.daily_tokens_budget > 0
|
||||
else 0,
|
||||
1,
|
||||
),
|
||||
"monthly_budget": budget.monthly_tokens_budget,
|
||||
"monthly_usage_percent": round(
|
||||
(budget.monthly_tokens_used / budget.monthly_tokens_budget * 100)
|
||||
if budget.monthly_tokens_budget > 0
|
||||
else 0,
|
||||
1,
|
||||
),
|
||||
"is_over_budget": budget.is_over_budget,
|
||||
"alert_triggered": budget.alert_triggered,
|
||||
},
|
||||
}
|
||||
|
||||
# 累計總計
|
||||
summary["total"]["input_tokens"] += daily.total_input_tokens
|
||||
summary["total"]["output_tokens"] += daily.total_output_tokens
|
||||
summary["total"]["cost_usd"] += daily.total_cost_usd
|
||||
summary["total"]["requests"] += daily.total_requests
|
||||
|
||||
summary["total"]["cost_usd"] = round(summary["total"]["cost_usd"], 4)
|
||||
|
||||
return summary
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helper: Usage Tracker Context Manager
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class UsageTracker:
    """
    Context manager that times an LLM call and records its token usage.

    Works with both ``with`` and ``async with``. On exit, the elapsed time and
    the token counts set via ``set_tokens`` are sent to the token counter; an
    exception leaving the block marks the call as failed. Exceptions are never
    suppressed.

    Usage:
        async with UsageTracker("ollama", "qwen2.5:7b-instruct") as tracker:
            result = await call_llm(prompt)
            tracker.set_tokens(input_tokens=500, output_tokens=200)
    """

    def __init__(self, provider: str, model: str):
        self.provider = provider
        self.model = model
        # perf_counter() reading taken on entry; 0 until the context is entered.
        self.start_time: float = 0
        self.input_tokens: int = 0
        self.output_tokens: int = 0
        self.success: bool = True
        self.error_message: str | None = None
        self._counter = get_token_counter()

    def __enter__(self):
        """Start the timer and return the tracker itself."""
        self.start_time = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop the timer and record the usage; returns None so exceptions propagate."""
        elapsed_ms = (time.perf_counter() - self.start_time) * 1000

        # An exception leaving the block overrides any earlier success/failure state.
        if exc_type is not None:
            self.success = False
            self.error_message = str(exc_val)

        self._counter.record_usage(
            TokenUsage(
                input_tokens=self.input_tokens,
                output_tokens=self.output_tokens,
                provider=self.provider,
                model=self.model,
                latency_ms=elapsed_ms,
                success=self.success,
                error_message=self.error_message,
            )
        )

    async def __aenter__(self):
        """Async entry point; delegates to ``__enter__``."""
        return self.__enter__()

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async exit point; delegates to ``__exit__``."""
        return self.__exit__(exc_type, exc_val, exc_tb)

    def set_tokens(self, input_tokens: int, output_tokens: int) -> None:
        """Store the token counts to be reported on exit."""
        self.input_tokens = input_tokens
        self.output_tokens = output_tokens

    def mark_failed(self, error_message: str) -> None:
        """Flag the call as failed with the given error message."""
        self.success = False
        self.error_message = error_message
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Singleton
|
||||
# =============================================================================
|
||||
|
||||
# Process-wide TokenCounter instance; created lazily by get_token_counter().
_token_counter: TokenCounter | None = None


def get_token_counter() -> TokenCounter:
    """Return the shared TokenCounter, creating it on first use."""
    global _token_counter
    counter = _token_counter
    if counter is None:
        counter = _token_counter = TokenCounter()
    return counter
|
||||
|
||||
|
||||
def reset_token_counter() -> None:
    """Drop the shared TokenCounter so the next lookup builds a new one (for tests)."""
    global _token_counter
    _token_counter = None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Convenience Functions
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def record_token_usage(
    provider: str,
    model: str,
    input_tokens: int,
    output_tokens: int,
    latency_ms: float = 0.0,
    success: bool = True,
    error_message: str | None = None,
) -> None:
    """Convenience wrapper: build a TokenUsage and record it on the shared counter.

    Args:
        provider: Provider name of the call.
        model: Model identifier of the call.
        input_tokens: Number of input tokens.
        output_tokens: Number of output tokens.
        latency_ms: Call latency in milliseconds (defaults to 0.0).
        success: Whether the call succeeded.
        error_message: Optional error description for failed calls.
    """
    entry = TokenUsage(
        provider=provider,
        model=model,
        input_tokens=input_tokens,
        output_tokens=output_tokens,
        latency_ms=latency_ms,
        success=success,
        error_message=error_message,
    )
    counter = get_token_counter()
    counter.record_usage(entry)
|
||||
|
||||
|
||||
def should_use_local_model(provider: str) -> tuple[bool, str]:
    """Convenience wrapper: ask the shared counter whether to fall back to the local model.

    Returns:
        The ``(should_fallback, reason)`` tuple from
        ``TokenCounter.should_fallback_to_local``.
    """
    counter = get_token_counter()
    return counter.should_fallback_to_local(provider)
|
||||
Reference in New Issue
Block a user