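"""
Auto Repair Service - #8 Automatic Escalation Decision
======================================================
Automatic repair execution driven by high-quality Playbooks.

Phase 8: automation layer implementation
Created: 2026-03-26 17:30 (Taipei time zone)
Author: Claude Code (#8 Automatic Escalation Decision)

Follows the leWOOOgo building-block principles:
- The service layer depends only on Repository/Service interfaces
- No direct Redis/DB access
- Encapsulates all auto-repair logic

Trigger conditions (AND), as originally designed:
1. A matching high-quality Playbook exists (is_high_quality = True)
2. The risk level of the actions in the Playbook is <= MEDIUM
3. The Incident severity is <= P2

Safety boundaries, as originally designed:
- HIGH/CRITICAL risk actions always require human review
- P0/P1 severity Incidents require human confirmation

NOTE: according to the 2026-04-07 comments on the AutoRepairService constants,
the similarity / quality / risk thresholds were later removed and only the
P0/P1 severity block is still enforced.
"""
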
import asyncio
import time
from collections.abc import Callable
from dataclasses import dataclass
from typing import Protocol

import structlog

from src.models.incident import Incident, Severity
from src.models.playbook import (
    ActionType,
    Playbook,
    PlaybookStatus,
    RiskLevel,
    SymptomPattern,
)
from src.services.anomaly_counter import AnomalyFrequency, get_anomaly_counter
from src.services.executor import get_executor
from src.services.global_repair_cooldown import (
    check_global_repair_cooldown,
    record_global_repair_action,
)
# Sprint 5.1: Service Registry Guardrail (ADR-062)
from src.services.service_registry import StatefulLevel, get_service_registry
from src.services.playbook_service import IPlaybookService, get_playbook_service

# Module-level structlog logger, named after this module.
logger = structlog.get_logger(__name__)


# =============================================================================
# Types
# =============================================================================


@dataclass
class AutoRepairDecision:
    """Outcome of evaluating whether an Incident may be repaired automatically."""

    # True when every gate passed and the Playbook may be executed.
    can_auto_repair: bool
    # Best-matching Playbook, when one was found.
    playbook: Playbook | None = None
    # Human-readable explanation of the decision.
    reason: str = ""
    risk_level: RiskLevel = RiskLevel.MEDIUM
    # Code of the gate that blocked the repair (e.g. HIGH_RISK, P1_SEVERITY).
    blocked_by: str | None = None
    # 2026-04-07 Claude Code: Sprint 4 B2 — tracks first-time ("cold start") trust.
    is_cold_start: bool = False
    # 2026-04-08 Claude Code: forwarded to execute_auto_repair for the DB record.
    similarity_score: float | None = None


@dataclass
|
||
class AutoRepairResult:
|
||
"""自動修復執行結果"""
|
||
|
||
success: bool
|
||
playbook_id: str
|
||
incident_id: str
|
||
executed_steps: list[str]
|
||
error: str | None = None
|
||
execution_time_ms: int = 0
|
||
|
||
|
||
# =============================================================================
# Auto Repair Service Interface
# =============================================================================


class IAutoRepairService(Protocol):
    """Structural interface of the auto-repair service."""

    async def evaluate_auto_repair(
        self,
        incident: Incident,
    ) -> AutoRepairDecision:
        """
        Evaluate whether the incident may be repaired automatically.

        Args:
            incident: The Incident to handle.

        Returns:
            AutoRepairDecision: The decision.
        """
        ...

    async def execute_auto_repair(
        self,
        incident: Incident,
        playbook: Playbook,
        is_cold_start: bool = False,
        similarity_score: float | None = None,
    ) -> AutoRepairResult:
        """
        Execute the automatic repair.

        The two optional parameters mirror AutoRepairService.execute_auto_repair
        so callers typed against this interface can forward the values carried
        by AutoRepairDecision.

        Args:
            incident: The Incident to handle.
            playbook: The Playbook to execute.
            is_cold_start: Whether the run is a first-time ("cold start") trust run.
            similarity_score: Match score of the Playbook, if known.

        Returns:
            AutoRepairResult: The execution result.
        """
        ...


# =============================================================================
# Auto Repair Service Implementation
# =============================================================================


class AutoRepairService:
    """
    Concrete auto-repair service.

    Responsibilities:
    - Decide whether an Incident may be repaired automatically
    - Execute the matched Playbook
    - Update execution statistics
    """

    # === Safety-boundary constants ===
    # 2026-04-07 Claude Code: commander directive "switch everything to auto repair".
    # The similarity / quality / risk thresholds were removed; only the P0/P1
    # severity block is kept.
    MAX_AUTO_REPAIR_RISK = RiskLevel.MEDIUM  # kept for future reference, no longer blocks
    MAX_AUTO_REPAIR_SEVERITY = Severity.P2  # P0/P1 still require human review
    MIN_SIMILARITY_SCORE = 0.0  # threshold removed
    COLD_START_TRUST_MAX_EXECUTIONS = 3  # kept for reference
    COLD_START_TRUST_DAILY_LIMIT = 5  # kept for reference

    # Namespace used when synthesising a rollback or an emergency escalation.
    _DEFAULT_NAMESPACE = "awoooi-prod"

    def __init__(
        self,
        playbook_service: IPlaybookService | None = None,
        cooldown_checker: Callable | None = None,
    ):
        """
        Args:
            playbook_service: Playbook service; defaults to get_playbook_service().
            cooldown_checker: Async callable awaited with ``incident_id`` and
                ``affected_services`` that returns ``(can_repair, reason)``;
                defaults to check_global_repair_cooldown.
        """
        # 2026-04-01 ogt: cooldown_checker is injectable for test isolation (DI).
        self._playbook_service = playbook_service or get_playbook_service()
        self._cooldown_checker = cooldown_checker or check_global_repair_cooldown
        # 2026-04-04 Claude Code: Phase 25 P1 — hold references to background
        # tasks so they are not garbage-collected before they finish.
        self._pending_tasks: set[asyncio.Task] = set()

    def _track_task(self, task: asyncio.Task) -> None:
        """Keep a strong reference to a fire-and-forget task until it completes."""
        pending = getattr(self, "_pending_tasks", None)
        if pending is None:
            # The previous code guarded with hasattr(), so the attribute may be
            # missing (presumably instances created without __init__); create it
            # lazily so the task is still tracked.
            pending = self._pending_tasks = set()
        pending.add(task)
        task.add_done_callback(pending.discard)

    async def drain_pending_tasks(self, timeout: float = 60.0) -> dict:
        """Wait for all tracked background tasks, up to ``timeout`` seconds.

        # 2026-04-27 Wave8-X3 by Claude — B25/B26 drain fix
        Intended to be called from the lifespan shutdown so fire-and-forget
        tasks (_verify_and_learn / runbook generation) get a chance to finish
        after SIGTERM.

        Returns:
            A dict that always has ``drained``, ``still_pending`` and
            ``timeout``; ``error`` is added when waiting itself failed.
        """
        # Snapshot: done-callbacks remove tasks from the live set while we wait.
        pending = set(getattr(self, "_pending_tasks", ()))
        if not pending:
            return {"drained": 0, "still_pending": 0, "timeout": False}

        pending_count = len(pending)
        logger.info(
            "auto_repair_draining_pending_tasks",
            count=pending_count,
            timeout=timeout,
        )

        try:
            done, still_pending = await asyncio.wait(
                pending,
                timeout=timeout,
                return_when=asyncio.ALL_COMPLETED,
            )
        except Exception as e:
            logger.exception("drain_pending_tasks_failed", error=str(e))
            return {
                "drained": 0,
                "still_pending": pending_count,
                "timeout": False,
                "error": str(e),
            }

        return {
            "drained": len(done),
            "still_pending": len(still_pending),
            "timeout": len(still_pending) > 0,
        }

    async def evaluate_auto_repair(
        self,
        incident: Incident,
    ) -> AutoRepairDecision:
        """
        Decide whether ``incident`` may be repaired automatically.

        Gates, in order (the first one that blocks wins):
        0.   Global repair cooldown / circuit breaker (ADR-039)
        0.5  Service Registry guardrail (ADR-062); registry errors also block
        1.   Incident severity: P0/P1 require human review
        2.   Anti-pattern gate: failures recorded for the same symptom hash
             in the last 7 days (errors in this gate do not block)
        3.   Playbook match: the top recommendation must exist and be APPROVED
        4.   Host/backup incidents must not run Playbooks with K8s steps

        The similarity, is_high_quality, cold-start and risk-level gates were
        removed on 2026-04-07; the maximum step risk is still reported on the
        returned decision.
        """
        logger.info(
            "auto_repair_evaluate_start",
            incident_id=incident.incident_id,
            severity=incident.severity.value if incident.severity else None,
        )

        # 0. Global circuit breaker (ADR-039) has top priority.
        can_repair, cooldown_reason = await self._cooldown_checker(
            incident_id=incident.incident_id,
            affected_services=incident.affected_services or [],
        )
        if not can_repair:
            logger.warning(
                "auto_repair_blocked_global_cooldown",
                incident_id=incident.incident_id,
                reason=cooldown_reason,
            )
            return AutoRepairDecision(
                can_auto_repair=False,
                reason=cooldown_reason,
                blocked_by="GLOBAL_GUARDRAIL",
            )

        # 0.5 Service Registry guardrail, after the breaker and before severity.
        guardrail_block = self._check_service_guardrail(incident)
        if guardrail_block is not None:
            return guardrail_block

        # 1. Incident severity.
        if incident.severity and incident.severity.value in ("P0", "P1"):
            logger.info(
                "auto_repair_blocked_severity",
                incident_id=incident.incident_id,
                severity=incident.severity.value,
            )
            return AutoRepairDecision(
                can_auto_repair=False,
                reason=f"Incident 嚴重度 {incident.severity.value} 需要人工審核",
                blocked_by="HIGH_SEVERITY",
            )

        # 2. Extract the symptom pattern and run the anti-pattern gate.
        symptoms = self._extract_symptoms(incident)
        anti_pattern_block = await self._check_anti_pattern(incident, symptoms)
        if anti_pattern_block is not None:
            return anti_pattern_block

        # 3. Look up matching Playbooks.
        recommendations = await self._playbook_service.get_recommendations(
            symptoms=symptoms,
            top_k=3,
        )
        if not recommendations:
            logger.info(
                "auto_repair_no_playbook_match",
                incident_id=incident.incident_id,
            )
            return AutoRepairDecision(
                can_auto_repair=False,
                reason="未找到匹配的 Playbook",
                blocked_by="NO_MATCH",
            )

        # 4. Only the best match is considered.
        best_match = recommendations[0]

        # 2026-04-07 Claude Code: the similarity threshold, is_high_quality
        # threshold, cold-start mechanism and risk-level threshold were removed.
        # A matching APPROVED Playbook is executed directly.
        max_risk = self._get_max_risk_level(best_match.playbook)
        _is_cold_start = False

        if best_match.playbook.status != PlaybookStatus.APPROVED:
            return AutoRepairDecision(
                can_auto_repair=False,
                playbook=best_match.playbook,
                reason=f"Playbook 狀態為 {best_match.playbook.status.value},必須是 APPROVED",
                blocked_by="NOT_APPROVED",
            )

        if self._is_host_or_backup_incident(incident) and self._playbook_has_k8s_steps(best_match.playbook):
            logger.warning(
                "auto_repair_blocked_host_backup_k8s_playbook",
                incident_id=incident.incident_id,
                playbook_id=best_match.playbook.playbook_id,
                alert_category=getattr(incident, "alert_category", None),
            )
            return AutoRepairDecision(
                can_auto_repair=False,
                playbook=best_match.playbook,
                reason=(
                    "主機/備份類告警禁止執行 K8s Playbook;"
                    "需改走 SSH 診斷或緊急介入"
                ),
                blocked_by="HOST_BACKUP_K8S_PLAYBOOK",
            )

        # 5. Auto repair is allowed.
        logger.info(
            "auto_repair_approved",
            incident_id=incident.incident_id,
            playbook_id=best_match.playbook.playbook_id,
            similarity=best_match.similarity_score,
            success_rate=best_match.playbook.success_rate,
        )

        return AutoRepairDecision(
            can_auto_repair=True,
            playbook=best_match.playbook,
            reason=f"匹配 Playbook: {best_match.playbook.name} (相似度 {best_match.similarity_score:.0%})",
            risk_level=max_risk,
            is_cold_start=_is_cold_start,
            similarity_score=best_match.similarity_score,
        )

    def _check_service_guardrail(self, incident: Incident) -> AutoRepairDecision | None:
        """Return a blocking decision when the Service Registry forbids auto repair.

        Sprint 5.1 guardrail (2026-04-08, ADR-062). Conservative by design:
        any error while consulting the registry also blocks. Returns None when
        the service is not at BLOCK level.
        """
        try:
            registry = get_service_registry()
            # Prefer the incident's target resource, else the first affected service.
            service_name = getattr(incident, "target_resource", None) or ""
            if not service_name and incident.affected_services:
                service_name = incident.affected_services[0]

            if registry.get_stateful_level(service_name) != StatefulLevel.BLOCK:
                return None

            logger.warning(
                "auto_repair_blocked_guardrail",
                incident_id=incident.incident_id,
                service_name=service_name,
                stateful_level="BLOCK",
            )
            return AutoRepairDecision(
                can_auto_repair=False,
                reason=f"GUARDRAIL_BLOCK: 服務 '{service_name}' 屬於禁止自動修復清單(資料安全,見 service-registry.yaml)",
                blocked_by="SERVICE_REGISTRY_BLOCK",
            )
        except Exception as guardrail_err:
            # S1-3 fix (ADR-062 review, 2026-04-08): a registry failure must
            # refuse rather than let the repair through.
            logger.error("guardrail_check_failed", error=str(guardrail_err))
            return AutoRepairDecision(
                can_auto_repair=False,
                reason="Guardrail Service Registry 讀取異常,保守拒絕自動修復",
                blocked_by="GUARDRAIL_ERROR",
            )

    async def _check_anti_pattern(
        self,
        incident: Incident,
        symptoms: SymptomPattern,
    ) -> AutoRepairDecision | None:
        """Return a blocking decision when a recent failure matches the symptoms.

        2026-04-04 Claude Code: Phase 25 P1 — anti-pattern gate. The symptom
        hash is checked against failure cases of the last 7 days. A failure of
        the gate itself is only logged and does not block (returns None).
        """
        try:
            from src.services.knowledge_service import get_knowledge_service

            symptoms_hash = symptoms.compute_hash()
            anti_patterns = await get_knowledge_service().check_anti_pattern(
                symptoms_hash, days=7
            )
            if not anti_patterns:
                return None

            ap = anti_patterns[0]
            logger.warning(
                "auto_repair_blocked_anti_pattern",
                incident_id=incident.incident_id,
                symptoms_hash=symptoms_hash,
                anti_pattern_id=ap.id,
                anti_pattern_title=ap.title,
            )
            return AutoRepairDecision(
                can_auto_repair=False,
                reason=f"過去 7 天有失敗案例: {ap.title}",
                blocked_by="ANTI_PATTERN",
            )
        except Exception as _ap_e:
            logger.warning("anti_pattern_gate_error", error=str(_ap_e))
            return None

    async def execute_auto_repair(
        self,
        incident: Incident,
        playbook: Playbook,
        is_cold_start: bool = False,
        similarity_score: float | None = None,
    ) -> AutoRepairResult:
        """
        Execute ``playbook`` against ``incident``.

        Flow:
        1. Run the Playbook's repair_steps in order; a step that reports
           ``FAILED`` (or raises) aborts the run and marks it as failed
        2. Update the Playbook statistics (best effort)
        3. Write the execution to the DB (auto_repair_executions) and metrics
        4. On success: record the disposition type (Sprint 4 B1/B2), then
           start runbook generation and post-execution verification in the
           background. On failure: start anti-pattern generation instead.

        Args:
            incident: The Incident being repaired.
            playbook: The Playbook to execute.
            is_cold_start: Marks the run as "cold_start_trust" in the records.
            similarity_score: Match score stored with the DB record.

        Returns:
            AutoRepairResult describing the run; step failures are reported
            through ``success=False`` / ``error`` rather than raised.
        """
        start_time = time.perf_counter()
        executed_steps: list[str] = []

        logger.info(
            "auto_repair_execute_start",
            incident_id=incident.incident_id,
            playbook_id=playbook.playbook_id,
            steps_count=len(playbook.repair_steps),
        )

        # ADR-039: count this repair for the global circuit breaker.
        await record_global_repair_action()

        error: str | None = None
        try:
            await self._run_steps(incident, playbook, executed_steps)
        except Exception as e:
            error = str(e)
        success = error is None

        # Bookkeeping must never change the outcome of the repair itself.
        await self._record_playbook_execution(playbook, success)

        execution_time = int((time.perf_counter() - start_time) * 1000)

        if success:
            logger.info(
                "auto_repair_execute_success",
                incident_id=incident.incident_id,
                playbook_id=playbook.playbook_id,
                executed_steps=len(executed_steps),
                execution_time_ms=execution_time,
            )
        else:
            logger.error(
                "auto_repair_execute_failed",
                incident_id=incident.incident_id,
                playbook_id=playbook.playbook_id,
                error=error,
            )

        result = AutoRepairResult(
            success=success,
            playbook_id=playbook.playbook_id,
            incident_id=incident.incident_id,
            executed_steps=executed_steps,
            error=error,
            execution_time_ms=execution_time,
        )

        # 2026-04-08 Claude Code: every operation, successful or not, is
        # written to the database.
        await self._persist_execution(
            incident,
            playbook,
            result,
            is_cold_start=is_cold_start,
            similarity_score=similarity_score,
        )
        self._record_auto_repair_metric(playbook, success=success)

        if success:
            await self._record_disposition(incident, is_cold_start)
            # 2026-04-04 Phase 25 P1: fire-and-forget AUTO_RUNBOOK generation.
            self._spawn_knowledge_task(incident, playbook, result)
            # 2026-04-26 Wave4 P1.3+P1.4: fire-and-forget verification + learning.
            self._spawn_verify_and_learn(incident, playbook)
        else:
            # 2026-04-04 Phase 25 P1: fire-and-forget ANTI_PATTERN generation.
            self._spawn_knowledge_task(incident, playbook, result)

        return result

    async def _run_steps(
        self,
        incident: Incident,
        playbook: Playbook,
        executed_steps: list[str],
    ) -> None:
        """Run the repair steps in order, appending one line per step.

        2026-04-07 Claude Code: there is no step-level risk gate; every step
        is executed. SKIPPED / UNKNOWN_ACTION_TYPE results stay non-fatal.

        Raises:
            RuntimeError: When a step reports a ``FAILED`` result; remaining
                steps are not executed.
        """
        for step in playbook.repair_steps:
            step_result = await self._execute_step(incident, step)
            executed_steps.append(
                f"Step {step.step_number}: {step.command[:50]}... -> {step_result}"
            )
            if step_result.startswith("FAILED"):
                raise RuntimeError(f"Step {step.step_number} failed: {step_result}")

    async def _record_playbook_execution(self, playbook: Playbook, success: bool) -> None:
        """Update the Playbook statistics; failures are logged, not raised."""
        try:
            await self._playbook_service.record_execution(
                playbook_id=playbook.playbook_id,
                success=success,
            )
        except Exception as exc:
            logger.warning(
                "auto_repair_record_execution_failed",
                playbook_id=playbook.playbook_id,
                success=success,
                error=str(exc),
            )

    async def _persist_execution(
        self,
        incident: Incident,
        playbook: Playbook,
        result: AutoRepairResult,
        *,
        is_cold_start: bool,
        similarity_score: float | None,
    ) -> None:
        """Write the execution record to the audit repository (best effort)."""
        try:
            from src.repositories.audit_log_repository import get_auto_repair_execution_repository

            max_risk = self._get_max_risk_level(playbook)
            record = {
                "incident_id": incident.incident_id,
                "playbook_id": playbook.playbook_id,
                "playbook_name": playbook.name,
                "success": result.success,
                "executed_steps": result.executed_steps,
                "triggered_by": "cold_start_trust" if is_cold_start else "auto_repair",
                "similarity_score": similarity_score,
                "risk_level": max_risk.value if max_risk else None,
                "execution_time_ms": result.execution_time_ms,
            }
            if not result.success:
                # error_message is only supplied for failed runs, as before.
                record["error_message"] = result.error
            await get_auto_repair_execution_repository().create(**record)
        except Exception as _db_e:
            logger.error("auto_repair_db_write_failed", error=str(_db_e))

    async def _record_disposition(self, incident: Incident, is_cold_start: bool) -> None:
        """Record the disposition type on the anomaly counter (best effort).

        2026-04-07 Claude Code: Sprint 4 B1/B2. The key comes from
        _derive_anomaly_key; nothing is recorded when no key can be derived.
        """
        try:
            from src.services.anomaly_counter import get_anomaly_counter

            counter = get_anomaly_counter()
            anomaly_key = self._derive_anomaly_key(incident)
            if anomaly_key:
                disposition_type = "cold_start_trust" if is_cold_start else "auto_repair"
                await counter.record_disposition(anomaly_key, disposition_type)
        except Exception as _disp_e:
            logger.warning("disposition_record_failed", error=str(_disp_e))

    def _spawn_knowledge_task(
        self,
        incident: Incident,
        playbook: Playbook,
        result: AutoRepairResult,
    ) -> None:
        """Start background knowledge generation for a finished run.

        A successful run generates a runbook, a failed run generates an
        anti-pattern. The task is tracked so it is neither garbage-collected
        nor missed by drain_pending_tasks. Setup errors are only logged.
        """
        failure_event = (
            "runbook_generator_task_failed" if result.success else "anti_pattern_task_failed"
        )
        try:
            from src.services.runbook_generator import get_runbook_generator

            symptoms_hash = self._extract_symptoms(incident).compute_hash()
            gen = get_runbook_generator()
            produce = gen.generate_runbook if result.success else gen.generate_anti_pattern
            self._track_task(
                asyncio.create_task(produce(incident, playbook, result, symptoms_hash))
            )
        except Exception as exc:
            logger.warning(failure_event, error=str(exc))

    def _spawn_verify_and_learn(self, incident: Incident, playbook: Playbook) -> None:
        """Start post-execution verification and learning in the background.

        Per the original note the verifier has a warm-up and a timeout, so it
        must not block the main path. Setup errors are only logged.
        """
        try:
            from src.services.post_execution_verifier import get_post_execution_verifier
            from src.services.learning_service import get_learning_service

            verifier = get_post_execution_verifier()
            learning = get_learning_service()
            self._track_task(
                asyncio.create_task(
                    self._verify_and_learn(incident, playbook, verifier, learning)
                )
            )
        except Exception as _vl_e:
            logger.warning("auto_repair_verifier_setup_failed", error=str(_vl_e))

    async def _verify_and_learn(self, incident: Incident, playbook: Playbook, verifier, learning) -> None:
        """Verify the repair, feed the result to learning, then react to failure.

        2026-04-27 P3.1-T1 by Claude: a "failed" or "degraded" verification
        either escalates to the emergency channel (when a synthesised K8s
        rollback would be unsafe) or triggers an automatic rollback.
        All errors are logged and swallowed because this runs detached.
        """
        action_taken = f"auto_repair:{playbook.playbook_id}"
        try:
            verification_result = await verifier.verify(
                incident=incident,
                snapshot=None,
                action_taken=action_taken,
            )
            await learning.record_verification_result(
                incident_id=incident.incident_id,
                action_taken=action_taken,
                verification_result=verification_result,
                matched_playbook_id=playbook.playbook_id,
            )
            logger.info(
                "auto_repair_verify_and_learn_done",
                incident_id=incident.incident_id,
                playbook_id=playbook.playbook_id,
                verification_result=verification_result,
            )

            if verification_result not in ("failed", "degraded"):
                return

            if self._should_escalate_failed_verification(incident, playbook):
                await self._escalate_failed_verification(
                    incident=incident,
                    playbook=playbook,
                    verification_result=verification_result,
                )
                return

            await self._auto_rollback(incident, verification_result)
        except Exception as _inner_e:
            logger.warning(
                "auto_repair_verify_and_learn_failed",
                incident_id=incident.incident_id,
                error=str(_inner_e),
            )

    async def _auto_rollback(self, incident: Incident, verification_result) -> None:
        """Trigger an automatic rollback after a failed verification (best effort).

        Target, namespace and action are derived from the Incident: the first
        affected service (or "unknown") in the default namespace.
        """
        try:
            from src.services.rollback_manager import get_rollback_manager
            from src.services.declarative_remediation import DeclarativeRemediation
            from src.core.metrics import ROLLBACK_EXECUTED_TOTAL

            target = (incident.affected_services or ["unknown"])[0]
            namespace = self._DEFAULT_NAMESPACE
            action = f"kubectl rollout restart deployment/{target} -n {namespace}"
            spec = DeclarativeRemediation().evaluate(
                action=action,
                target=target,
                namespace=namespace,
            )
            rollback_result = await get_rollback_manager().trigger(
                incident_id=incident.incident_id,
                spec=spec,
                verification_result=verification_result,
            )

            status = "success" if rollback_result.success else "failed"
            if rollback_result.convergence_confirmed:
                reason = "converged"
            elif "revision" in (rollback_result.error or ""):
                reason = "no_previous_revision"
            else:
                reason = "error"
            ROLLBACK_EXECUTED_TOTAL.labels(status=status, reason=reason).inc()

            logger.info(
                "auto_rollback_triggered",
                incident_id=incident.incident_id,
                rollback_success=rollback_result.success,
                convergence_confirmed=rollback_result.convergence_confirmed,
                rollback_error=rollback_result.error,
            )
        except Exception as _rb_e:
            logger.exception(
                "auto_rollback_failed",
                incident_id=incident.incident_id,
                error=str(_rb_e),
            )

    # === Private Helpers ===

    @staticmethod
    def _derive_anomaly_key(incident: Incident) -> str | None:
        """
        Derive the anomaly_key for an Incident.

        2026-04-07 Claude Code: I1+S1 fix — delegates to
        AnomalyCounter.derive_key_from_incident().
        """
        from src.services.anomaly_counter import AnomalyCounter

        return AnomalyCounter.derive_key_from_incident(incident)

    def _extract_symptoms(self, incident: Incident) -> SymptomPattern:
        """Build the SymptomPattern used for Playbook matching and hashing.

        Alert names prefer ``labels["alertname"]`` and fall back to
        ``signal.alert_name`` (2026-04-09, L7 E2E fix). Keywords are string
        annotation values shorter than 50 characters, capped at 10. The
        severity range defaults to ``["P2"]`` when the Incident has none.
        """
        alert_names = []
        keywords = []

        for signal in incident.signals or []:
            raw_alertname = signal.labels.get("alertname") if signal.labels else None
            alert_names.append(raw_alertname or signal.alert_name)
            if signal.annotations:
                keywords.extend(
                    value
                    for value in signal.annotations.values()
                    if isinstance(value, str) and len(value) < 50
                )

        return SymptomPattern(
            alert_names=alert_names,
            affected_services=incident.affected_services or [],
            severity_range=[incident.severity.value] if incident.severity else ["P2"],
            keywords=keywords[:10],
        )

    def _get_max_risk_level(self, playbook: Playbook) -> RiskLevel:
        """Return the highest risk level among the Playbook's steps.

        Returns RiskLevel.LOW for a Playbook without steps; levels missing
        from the ranking table rank like LOW and never replace it.
        """
        risk_order = {
            RiskLevel.LOW: 0,
            RiskLevel.MEDIUM: 1,
            RiskLevel.HIGH: 2,
            RiskLevel.CRITICAL: 3,
        }
        # max() keeps the first of equally ranked items, so the leading LOW
        # wins every tie at rank 0.
        candidates = [RiskLevel.LOW, *(step.risk_level for step in playbook.repair_steps)]
        return max(candidates, key=lambda level: risk_order.get(level, 0))

    def _record_auto_repair_metric(self, playbook: Playbook, success: bool) -> None:
        """Record an executed auto repair in the Prometheus metrics (best effort).

        2026-05-06 ogt + Codex: the DB already had auto_repair_executions, but
        core.metrics.record_auto_repair() had no caller, so Prometheus-based
        governance made the flywheel look idle. The label is the action type
        of the first step; playbook_id is avoided to keep cardinality low.
        """
        try:
            from src.core.metrics import record_auto_repair

            first_step = playbook.repair_steps[0] if playbook.repair_steps else None
            action = first_step.action_type.value if first_step else "unknown"
            max_risk = self._get_max_risk_level(playbook)
            tier = {
                RiskLevel.LOW: 1,
                RiskLevel.MEDIUM: 2,
                RiskLevel.HIGH: 3,
                RiskLevel.CRITICAL: 4,
            }.get(max_risk, 0)
            record_auto_repair(action=action, tier=tier, success=success)
        except Exception as e:
            logger.warning(
                "auto_repair_metric_record_failed",
                playbook_id=playbook.playbook_id,
                success=success,
                error=str(e),
            )

    def _is_host_or_backup_incident(self, incident: Incident) -> bool:
        """Tell whether the incident is a host- or backup-class event.

        Such events may only use SSH / read-only diagnostics and must not run
        K8s rollout-style repairs. True when ``alert_category`` is
        ``host_resource`` / ``backup_failure`` or any alert name starts with
        ``Host`` (which also covers ``HostBackup...``).
        """
        category = (getattr(incident, "alert_category", None) or "").lower()
        if category in {"host_resource", "backup_failure"}:
            return True

        for signal in incident.signals or []:
            labels = signal.labels or {}
            alertname = str(labels.get("alertname") or signal.alert_name or "")
            if alertname.startswith("Host"):
                return True
        return False

    def _playbook_has_k8s_steps(self, playbook: Playbook) -> bool:
        """Tell whether any step is a kubectl action or a ``kubectl `` command."""
        for step in playbook.repair_steps:
            command = (step.command or "").strip().lower()
            if step.action_type == ActionType.KUBECTL or command.startswith("kubectl "):
                return True
        return False

    def _should_escalate_failed_verification(self, incident: Incident, playbook: Playbook) -> bool:
        """Tell whether a failed verification must escalate instead of rolling back.

        A synthesised K8s rollback is forbidden for host/backup incidents and
        for Playbooks without K8s steps.
        """
        return self._is_host_or_backup_incident(incident) or not self._playbook_has_k8s_steps(playbook)

    async def _escalate_failed_verification(
        self,
        *,
        incident: Incident,
        playbook: Playbook,
        verification_result: str,
    ) -> None:
        """Post-verification failed but rollback is unsafe; notify emergency channel.

        Failures of the escalation call itself are logged and swallowed.
        """
        target = (incident.affected_services or ["unknown"])[0]
        namespace = self._DEFAULT_NAMESPACE
        alert_type = self._incident_alert_type(incident)
        reason = (
            f"auto repair playbook {playbook.playbook_id} verification={verification_result}; "
            "rollback is unsafe for host/backup or non-K8s remediation"
        )
        logger.warning(
            "auto_repair_verification_failed_emergency",
            incident_id=incident.incident_id,
            playbook_id=playbook.playbook_id,
            verification_result=verification_result,
            target=target,
        )
        try:
            from src.services.emergency_escalation_service import (
                escalate_auto_repair_unavailable,
            )

            await escalate_auto_repair_unavailable(
                incident_id=incident.incident_id,
                approval_id=None,
                alert_type=alert_type,
                target_resource=target,
                namespace=namespace,
                failure_reason=reason,
                attempted_actions=(
                    f"auto_repair:{playbook.playbook_id} -> verifier:{verification_result} "
                    "-> emergency_intervention"
                ),
            )
        except Exception as exc:
            logger.warning(
                "auto_repair_verification_emergency_failed",
                incident_id=incident.incident_id,
                playbook_id=playbook.playbook_id,
                error=str(exc),
            )

    def _incident_alert_type(self, incident: Incident) -> str:
        """Best-effort alertname for emergency cards.

        Returns the first non-empty alert name found on the signals, or
        ``"AutoRepairVerificationFailed"`` when there is none.
        """
        for signal in incident.signals or []:
            labels = signal.labels or {}
            alertname = labels.get("alertname") or signal.alert_name
            if alertname:
                return str(alertname)
        return "AutoRepairVerificationFailed"

    def _risk_exceeds_threshold(self, risk: RiskLevel) -> bool:
        """Tell whether ``risk`` is HIGH or CRITICAL."""
        return risk in {RiskLevel.HIGH, RiskLevel.CRITICAL}

    async def _check_cold_start_daily_limit(self) -> bool:
        """
        Check whether today's cold-start auto repairs are still within the limit.

        Increments a Redis counter whose key contains the Taipei date.
        Returns False when the limit is exceeded, Redis is unavailable, or the
        check fails (conservative refusal).
        2026-04-07 Claude Code: option C — daily cap for cold-start trust.
        """
        try:
            from src.core.redis_client import get_redis

            redis = await get_redis()
            if redis is None:
                # Redis unavailable: refuse conservatively.
                return False

            from src.utils.timezone import now_taipei

            today_key = f"cold_start_trust:{now_taipei().strftime('%Y-%m-%d')}"
            count = await redis.incr(today_key)

            # First increment creates the key: expire after 25 hours so it is
            # cleaned up after the day rolls over.
            if count == 1:
                await redis.expire(today_key, 90000)

            if count > self.COLD_START_TRUST_DAILY_LIMIT:
                logger.warning(
                    "cold_start_daily_limit_reached",
                    today_key=today_key,
                    count=count,
                    limit=self.COLD_START_TRUST_DAILY_LIMIT,
                )
                return False

            return True
        except Exception as e:
            logger.warning("cold_start_daily_limit_check_failed", error=str(e))
            # Safe degradation: a failed check refuses.
            return False

    async def _execute_step(self, incident: Incident, step) -> str:
        """
        Execute a single repair step and return a status string.

        Dispatch by ``step.action_type``:
        - MANUAL: skipped (needs a human)
        - KUBECTL: run through the executor; ``{target}`` in the command is
          replaced by the first affected service
        - SSH_COMMAND: run through HostRepairAgent.repair_by_uri
        - anything else: ``"UNKNOWN_ACTION_TYPE"``

        Returns:
            A string starting with ``SUCCESS``, ``FAILED`` or ``SKIPPED``, or
            ``"UNKNOWN_ACTION_TYPE"``.
        """
        if step.action_type == ActionType.MANUAL:
            return "SKIPPED (manual step)"

        if step.action_type == ActionType.KUBECTL:
            try:
                executor = get_executor()

                command = step.command
                if "{target}" in command:
                    if not incident.affected_services:
                        # Never send an unresolved placeholder to the cluster.
                        return "FAILED: no affected service available to substitute for {target}"
                    command = command.replace("{target}", incident.affected_services[0])

                result = await executor.execute_kubectl_command(command)
                return "SUCCESS" if result.success else f"FAILED: {result.error}"
            except ImportError:
                logger.warning("action_executor_not_available")
                return "SKIPPED (executor not available)"

        # 2026-04-06 Claude Code: Sprint 3 — repair_by_uri (URI scheme routing).
        if step.action_type == ActionType.SSH_COMMAND:
            from src.services.host_repair_agent import HostRepairAgent

            agent = HostRepairAgent()
            # Steps flagged requires_approval are passed on as not approved.
            approved = not getattr(step, "requires_approval", False)
            result = await agent.repair_by_uri(step.command, approved=approved)
            if result.success:
                return f"SUCCESS: {result.output}"
            return f"FAILED: {result.error}"

        return "UNKNOWN_ACTION_TYPE"

    # === ADR-037: Tier-based Repair (2026-03-29) ===

    # Actions available per repair tier.
    TIER_ACTIONS = {
        1: ["restart_pod", "restart_container"],  # temporary fix
        2: ["scale_up", "increase_memory", "adjust_limits"],  # mitigation
        3: ["apply_hotfix", "update_config", "patch_deployment"],  # root-cause fix
        4: ["create_issue", "notify_team", "schedule_fix"],  # architectural fix
    }

    async def determine_repair_tier(
        self,
        anomaly_key: str,
        frequency: AnomalyFrequency,
    ) -> int:
        """
        Choose the repair tier from anomaly frequency and repair history (ADR-037).

        Commander directive (2026-03-29): restarting only treats the symptom;
        anomalies that recur too often must be fixed for good.

        Returns:
            1: temporary fix (restart)
            2: mitigation (scale up)
            3: root-cause fix (configuration change)
            4: architectural fix (needs development)
        """
        counter = get_anomaly_counter()
        stats = await counter.get_all_repair_stats(anomaly_key)

        # Restarts attempted so far, across both restart actions.
        restart_count = sum(
            stats.get(action, {}).get("total", 0)
            for action in ("restart_pod", "restart_container")
        )

        if frequency.permanent_fix_applied:
            # A permanent fix exists but the anomaly recurs.
            tier, reason = 4, "permanent_fix_still_failing"
        elif frequency.escalation_level == "PERMANENT_FIX":
            # Original note: >= 10 occurrences within 24h.
            tier, reason = 3, "escalation_permanent_fix"
        elif frequency.escalation_level == "ESCALATE":
            # Original note: >= 5 occurrences within 24h.
            tier, reason = 2, "escalation_escalate"
        elif restart_count >= 2:
            # Already restarted twice: move up to mitigation.
            tier, reason = 2, f"restart_count_{restart_count}"
        else:
            # Default: temporary fix (not logged, as before).
            return 1

        logger.info(
            "tier_decision",
            anomaly_key=anomaly_key,
            tier=tier,
            reason=reason,
        )
        return tier

    def get_tier_actions(self, tier: int) -> list[str]:
        """
        Return the repair actions of ``tier`` (ADR-037).

        Unknown tiers fall back to the tier-1 actions.
        """
        return self.TIER_ACTIONS.get(tier, self.TIER_ACTIONS[1])

    async def record_repair_result(
        self,
        anomaly_key: str,
        action: str,
        success: bool,
        tier: int = 1,
    ) -> None:
        """
        Record a repair attempt on the AnomalyCounter (ADR-037).

        Args:
            anomaly_key: Anomaly key.
            action: Repair action that was attempted.
            success: Whether the attempt succeeded.
            tier: Repair tier of the action.
        """
        counter = get_anomaly_counter()
        await counter.record_repair_attempt(anomaly_key, action, success)

        # A successful repair at tier 3 or above is marked as a permanent fix.
        if tier >= 3 and success:
            await counter.mark_permanent_fix_applied(
                anomaly_key=anomaly_key,
                fix_description=f"Tier {tier} repair: {action}",
            )

        logger.info(
            "repair_result_recorded",
            anomaly_key=anomaly_key,
            action=action,
            success=success,
            tier=tier,
        )


# =============================================================================
# Singleton
# =============================================================================

# Process-wide instance; created lazily by get_auto_repair_service().
_service: AutoRepairService | None = None


def get_auto_repair_service() -> IAutoRepairService:
    """Return the AutoRepairService singleton, creating it on first use."""
    global _service
    if _service is None:
        _service = AutoRepairService()
    return _service


def set_auto_repair_service(service: AutoRepairService | None) -> None:
    """Replace the AutoRepairService singleton (for DI or tests).

    Passing None clears it, so the next get_auto_repair_service() call
    builds a fresh instance.
    """
    global _service
    _service = service