Files
awoooi/apps/api/src/services/auto_repair_service.py
Your Name a2c4b3d47e
Some checks failed
Code Review / ai-code-review (push) Has been cancelled
CD Pipeline / tests (push) Successful in 2m22s
CD Pipeline / build-and-deploy (push) Successful in 3m54s
CD Pipeline / post-deploy-checks (push) Successful in 1m17s
fix(awooop): align console with flywheel execution metrics
2026-05-06 00:46:08 +08:00

1045 lines
41 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Auto Repair Service - #8 自動升級決策
=====================================
高品質 Playbook 自動修復執行
Phase 8: 自動化層實作
建立時間: 2026-03-26 17:30 (台北時區)
建立者: Claude Code (#8 自動升級決策)
遵循 leWOOOgo 積木化原則:
- Service 層只依賴 Repository/Service Interface
- 不直接存取 Redis/DB
- 封裝所有自動修復邏輯
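Trigger conditions as originally designed (AND):
1. A matching high-quality Playbook exists (is_high_quality = True)
2. The risk level of the Playbook's actions is <= MEDIUM
3. Incident severity <= P2
Safety boundaries as originally designed:
- HIGH/CRITICAL risk actions always require human review
- P0/P1 severity Incidents require human confirmation
NOTE: since 2026-04-07 the code no longer enforces the high-quality and
risk-level conditions (see the comments on the AutoRepairService constants);
the P0/P1 severity block is still enforced.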
"""
from dataclasses import dataclass
from collections.abc import Callable
from typing import Protocol
import structlog
from src.models.incident import Incident, Severity
from src.models.playbook import (
ActionType,
Playbook,
PlaybookStatus,
RiskLevel,
SymptomPattern,
)
from src.services.anomaly_counter import AnomalyFrequency, get_anomaly_counter
from src.services.executor import get_executor
from src.services.global_repair_cooldown import (
check_global_repair_cooldown,
record_global_repair_action,
)
# Sprint 5.1: Service Registry Guardrail (ADR-062)
from src.services.service_registry import StatefulLevel, get_service_registry
from src.services.playbook_service import IPlaybookService, get_playbook_service
logger = structlog.get_logger(__name__)
# =============================================================================
# Types
# =============================================================================
@dataclass
class AutoRepairDecision:
    """Result of deciding whether an incident may be repaired automatically."""

    # Whether the repair may run without a human in the loop.
    can_auto_repair: bool
    # The playbook the decision refers to, when one was matched.
    playbook: Playbook | None = None
    # Human-readable explanation of the decision.
    reason: str = ""
    risk_level: RiskLevel = RiskLevel.MEDIUM
    # Identifier of the gate that refused the repair (e.g. HIGH_RISK, P1_SEVERITY).
    blocked_by: str | None = None
    # Sprint 4 B2 (2026-04-07): tracks first-time ("cold start") trust.
    is_cold_start: bool = False
    # 2026-04-08: handed to execute_auto_repair so it can be stored in the DB.
    similarity_score: float | None = None
@dataclass
class AutoRepairResult:
    """Outcome of one automatic-repair execution."""

    success: bool
    playbook_id: str
    incident_id: str
    # One human-readable entry per step that was attempted.
    executed_steps: list[str]
    # Error text when the execution failed; None otherwise.
    error: str | None = None
    # Elapsed time of the execution in milliseconds.
    execution_time_ms: int = 0
# =============================================================================
# Auto Repair Service Interface
# =============================================================================
class IAutoRepairService(Protocol):
    """Structural interface of the automatic-repair service."""

    async def evaluate_auto_repair(self, incident: Incident) -> AutoRepairDecision:
        """Decide whether ``incident`` can be repaired automatically.

        Args:
            incident: The incident awaiting handling.

        Returns:
            The decision.
        """
        ...

    async def execute_auto_repair(
        self,
        incident: Incident,
        playbook: Playbook,
    ) -> AutoRepairResult:
        """Run an automatic repair.

        Args:
            incident: The incident awaiting handling.
            playbook: The playbook to execute.

        Returns:
            The execution result.
        """
        ...
# =============================================================================
# Auto Repair Service Implementation
# =============================================================================
class AutoRepairService:
    """Concrete automatic-repair service.

    Responsibilities:
    - Evaluate whether an Incident can be repaired automatically.
    - Execute the matched Playbook.
    - Update execution statistics.
    """

    # === Safety-boundary constants ===
    # 2026-04-07 Claude Code: commander's directive "switch everything to auto repair".
    # The similarity / quality / risk thresholds were removed; only the P0/P1
    # severity block is kept.
    MAX_AUTO_REPAIR_RISK = RiskLevel.MEDIUM  # kept for future reference; no longer used as a gate
    MAX_AUTO_REPAIR_SEVERITY = Severity.P2  # P0/P1 still require human review
    MIN_SIMILARITY_SCORE = 0.0  # threshold cancelled
    COLD_START_TRUST_MAX_EXECUTIONS = 3  # kept for reference
    COLD_START_TRUST_DAILY_LIMIT = 5  # kept for reference
def __init__(
    self,
    playbook_service: IPlaybookService | None = None,
    cooldown_checker: Callable | None = None,
):
    """Wire up collaborators, falling back to the module-level defaults.

    Args:
        playbook_service: Playbook service to use; when falsy the result of
            ``get_playbook_service()`` is used.
        cooldown_checker: Callable used for the global cooldown gate; when
            falsy ``check_global_repair_cooldown`` is used. Injectable so
            tests can isolate the gate (2026-04-01 ogt, DI principle).
    """
    import asyncio

    if playbook_service:
        self._playbook_service = playbook_service
    else:
        self._playbook_service = get_playbook_service()

    if cooldown_checker:
        self._cooldown_checker = cooldown_checker
    else:
        self._cooldown_checker = check_global_repair_cooldown

    # Phase 25 P1 (2026-04-04): hold references to background tasks such as
    # the runbook generator so they are not garbage-collected mid-flight.
    self._pending_tasks: set[asyncio.Task] = set()
async def drain_pending_tasks(self, timeout: float = 60.0) -> dict:
    """Wait for the tracked background tasks to finish.

    Intended to be called from the lifespan shutdown hook (K8s rolling
    restart) so fire-and-forget tasks such as ``_verify_and_learn`` and the
    runbook generator still get a chance to write their results after
    SIGTERM (2026-04-27 Wave8-X3, B25/B26 drain fix).

    Args:
        timeout: Maximum number of seconds to wait for all tasks.

    Returns:
        A dict that always carries ``drained`` (tasks that finished),
        ``still_pending`` (tasks not finished when waiting stopped) and
        ``timeout`` (True when the wait ended with unfinished tasks). An
        ``error`` key with the message is added when waiting itself failed.
    """
    import asyncio

    # Work on a snapshot: done-callbacks remove tasks from the live set
    # while we are waiting.
    pending = set(self._pending_tasks)
    if not pending:
        return {"drained": 0, "still_pending": 0, "timeout": False}

    pending_count = len(pending)
    logger.info(
        "auto_repair_draining_pending_tasks",
        count=pending_count,
        timeout=timeout,
    )
    try:
        done, still_pending = await asyncio.wait(
            pending,
            timeout=timeout,
            return_when=asyncio.ALL_COMPLETED,
        )
    except Exception as e:
        logger.exception("drain_pending_tasks_failed", error=str(e))
        return {
            "drained": 0,
            "still_pending": pending_count,
            "timeout": False,
            "error": str(e),
        }
    return {
        "drained": len(done),
        "still_pending": len(still_pending),
        "timeout": len(still_pending) > 0,
    }
async def evaluate_auto_repair(
    self,
    incident: Incident,
) -> AutoRepairDecision:
    """Run the gate pipeline that decides whether ``incident`` may be auto-repaired.

    Gates are applied in this order; the first refusal is returned:
      0.   Global cooldown / circuit breaker (ADR-039).
      0.5  Service Registry guardrail (ADR-062): services whose stateful
           level is BLOCK are refused, and any exception raised while
           consulting the registry is treated as a refusal as well.
      1.   Severity: P0/P1 incidents require human review.
      2.   Anti-pattern gate: refuse when the symptom hash matches a failure
           recorded in the last 7 days. Errors inside this gate are logged
           and do not block the flow.
      3.   At least one playbook recommendation must exist.
      4.   The best recommendation must be in APPROVED status.
      5.   Host/backup incidents must not run a playbook with K8s steps.

    Since 2026-04-07 there is no similarity, quality, cold-start or risk
    threshold any more; the maximum step risk is only reported back.

    Args:
        incident: The incident awaiting handling.

    Returns:
        The decision, including the matched playbook where one was found.
    """
    incident_id = incident.incident_id
    logger.info(
        "auto_repair_evaluate_start",
        incident_id=incident_id,
        severity=incident.severity.value if incident.severity else None,
    )

    # Gate 0: global circuit breaker (ADR-039) has top priority.
    allowed, cooldown_reason = await self._cooldown_checker(
        incident_id=incident_id,
        affected_services=incident.affected_services or [],
    )
    if not allowed:
        logger.warning(
            "auto_repair_blocked_global_cooldown",
            incident_id=incident_id,
            reason=cooldown_reason,
        )
        return AutoRepairDecision(
            can_auto_repair=False,
            reason=cooldown_reason,
            blocked_by="GLOBAL_GUARDRAIL",
        )

    # Gate 0.5: Sprint 5.1 Service Registry guardrail (ADR-062, 2026-04-08).
    # Runs after the circuit breaker and before the severity check.
    try:
        registry = get_service_registry()
        service_name = getattr(incident, "target_resource", None) or ""
        if not service_name and incident.affected_services:
            service_name = incident.affected_services[0]
        if registry.get_stateful_level(service_name) == StatefulLevel.BLOCK:
            logger.warning(
                "auto_repair_blocked_guardrail",
                incident_id=incident_id,
                service_name=service_name,
                stateful_level="BLOCK",
            )
            return AutoRepairDecision(
                can_auto_repair=False,
                reason=f"GUARDRAIL_BLOCK: 服務 '{service_name}' 屬於禁止自動修復清單(資料安全,見 service-registry.yaml",
                blocked_by="SERVICE_REGISTRY_BLOCK",
            )
    except Exception as guardrail_err:
        # S1-3 fix (ADR-062 review, 2026-04-08): a registry failure must not
        # let the request through, so refuse conservatively.
        logger.error("guardrail_check_failed", error=str(guardrail_err))
        return AutoRepairDecision(
            can_auto_repair=False,
            reason="Guardrail Service Registry 讀取異常,保守拒絕自動修復",
            blocked_by="GUARDRAIL_ERROR",
        )

    # Gate 1: incident severity.
    severity = incident.severity
    if severity and severity.value in ("P0", "P1"):
        logger.info(
            "auto_repair_blocked_severity",
            incident_id=incident_id,
            severity=incident.severity.value,
        )
        return AutoRepairDecision(
            can_auto_repair=False,
            reason=f"Incident 嚴重度 {incident.severity.value} 需要人工審核",
            blocked_by="HIGH_SEVERITY",
        )

    # Gate 2: derive the symptom pattern, then consult the anti-pattern gate
    # (Phase 25 P1, 2026-04-04) which compares a deterministic hash against
    # failures from the last 7 days.
    symptoms = self._extract_symptoms(incident)
    try:
        from src.services.knowledge_service import get_knowledge_service

        symptoms_hash = symptoms.compute_hash()
        known_failures = await get_knowledge_service().check_anti_pattern(
            symptoms_hash, days=7
        )
        if known_failures:
            first_failure = known_failures[0]
            logger.warning(
                "auto_repair_blocked_anti_pattern",
                incident_id=incident_id,
                symptoms_hash=symptoms_hash,
                anti_pattern_id=first_failure.id,
                anti_pattern_title=first_failure.title,
            )
            return AutoRepairDecision(
                can_auto_repair=False,
                reason=f"過去 7 天有失敗案例: {first_failure.title}",
                blocked_by="ANTI_PATTERN",
            )
    except Exception as gate_err:
        # A broken anti-pattern gate only gets logged; it does not block.
        logger.warning("anti_pattern_gate_error", error=str(gate_err))

    # Gate 3: look up matching playbooks.
    recommendations = await self._playbook_service.get_recommendations(
        symptoms=symptoms,
        top_k=3,
    )
    if not recommendations:
        logger.info(
            "auto_repair_no_playbook_match",
            incident_id=incident_id,
        )
        return AutoRepairDecision(
            can_auto_repair=False,
            reason="未找到匹配的 Playbook",
            blocked_by="NO_MATCH",
        )

    # Gate 4: only the top recommendation is considered.
    best_match = recommendations[0]
    candidate = best_match.playbook
    max_risk = self._get_max_risk_level(candidate)
    if candidate.status != PlaybookStatus.APPROVED:
        return AutoRepairDecision(
            can_auto_repair=False,
            playbook=candidate,
            reason=f"Playbook 狀態為 {candidate.status.value},必須是 APPROVED",
            blocked_by="NOT_APPROVED",
        )

    # Gate 5: host/backup alerts must never trigger K8s playbooks.
    if self._is_host_or_backup_incident(incident) and self._playbook_has_k8s_steps(candidate):
        logger.warning(
            "auto_repair_blocked_host_backup_k8s_playbook",
            incident_id=incident_id,
            playbook_id=candidate.playbook_id,
            alert_category=getattr(incident, "alert_category", None),
        )
        return AutoRepairDecision(
            can_auto_repair=False,
            playbook=candidate,
            reason=(
                "主機/備份類告警禁止執行 K8s Playbook"
                "需改走 SSH 診斷或緊急介入"
            ),
            blocked_by="HOST_BACKUP_K8S_PLAYBOOK",
        )

    # Every gate passed: approve.
    logger.info(
        "auto_repair_approved",
        incident_id=incident_id,
        playbook_id=candidate.playbook_id,
        similarity=best_match.similarity_score,
        success_rate=candidate.success_rate,
    )
    return AutoRepairDecision(
        can_auto_repair=True,
        playbook=candidate,
        reason=f"匹配 Playbook: {candidate.name} (相似度 {best_match.similarity_score:.0%})",
        risk_level=max_risk,
        is_cold_start=False,  # the cold-start mechanism was removed on 2026-04-07
        similarity_score=best_match.similarity_score,
    )
async def execute_auto_repair(
    self,
    incident: Incident,
    playbook: Playbook,
    is_cold_start: bool = False,
    similarity_score: float | None = None,
) -> AutoRepairResult:
    """Run every repair step of ``playbook`` for ``incident`` and record the outcome.

    Flow:
      1. Count the action towards the global cooldown (ADR-039).
      2. Execute ``playbook.repair_steps`` in order. ``_execute_step``
         reports a failed command by returning a string starting with
         "FAILED" instead of raising; such a result now aborts the run and
         takes the failure path, so a failed repair is no longer recorded
         as a success and later steps are not run on top of it.
      3. Update the playbook statistics and write an audit row to
         ``auto_repair_executions`` (for success and failure alike).
      4. Record the Prometheus metric and, on success, the disposition type
         (Sprint 4 B1/B2).
      5. On success, start background tasks for runbook generation and for
         post-execution verification + learning (which escalates or rolls
         back when verification reports "failed"/"degraded"). On failure,
         start a background task that generates an anti-pattern.

    Args:
        incident: The incident being repaired.
        playbook: The playbook to execute.
        is_cold_start: Marks the run as "cold_start_trust" instead of
            "auto_repair" in the audit row and disposition record.
        similarity_score: Match score stored in the audit row.

    Returns:
        The execution result; ``success`` is False when a step failed or
        an exception was raised while executing.
    """
    import asyncio
    import time

    started = time.perf_counter()
    executed_steps: list[str] = []
    triggered_by = "cold_start_trust" if is_cold_start else "auto_repair"
    # Instances built without __init__ may lack the tracking set.
    pending_tasks = getattr(self, "_pending_tasks", None)

    def _track(task) -> None:
        """Keep a reference to a fire-and-forget task until it completes."""
        if pending_tasks is not None:
            pending_tasks.add(task)
            task.add_done_callback(pending_tasks.discard)

    async def _write_audit_row(*, success, execution_time_ms, error_message=None) -> None:
        """Persist the execution to the DB; failures are logged, not raised.

        2026-04-08 directive: every operation, successful or not, must be
        recorded in the database.
        """
        try:
            from src.repositories.audit_log_repository import get_auto_repair_execution_repository

            max_risk = self._get_max_risk_level(playbook)
            row = {
                "incident_id": incident.incident_id,
                "playbook_id": playbook.playbook_id,
                "playbook_name": playbook.name,
                "success": success,
                "executed_steps": executed_steps,
                "triggered_by": triggered_by,
                "similarity_score": similarity_score,
                "risk_level": max_risk.value if max_risk else None,
                "execution_time_ms": execution_time_ms,
            }
            # error_message is only passed for failures, as before.
            if error_message is not None:
                row["error_message"] = error_message
            await get_auto_repair_execution_repository().create(**row)
        except Exception as db_err:
            logger.error("auto_repair_db_write_failed", error=str(db_err))

    logger.info(
        "auto_repair_execute_start",
        incident_id=incident.incident_id,
        playbook_id=playbook.playbook_id,
        steps_count=len(playbook.repair_steps),
    )

    # ADR-039: count this repair for the global circuit breaker.
    await record_global_repair_action()

    try:
        # 2026-04-07: step-level risk gating was removed; every step runs.
        for step in playbook.repair_steps:
            step_result = await self._execute_step(incident, step)
            executed_steps.append(
                f"Step {step.step_number}: {step.command[:50]}... -> {step_result}"
            )
            if step_result.startswith("FAILED"):
                # Surface the failure so it is handled like a raised error.
                raise RuntimeError(f"Step {step.step_number} failed: {step_result}")

        # Update playbook statistics.
        await self._playbook_service.record_execution(
            playbook_id=playbook.playbook_id,
            success=True,
        )

        execution_time = int((time.perf_counter() - started) * 1000)
        logger.info(
            "auto_repair_execute_success",
            incident_id=incident.incident_id,
            playbook_id=playbook.playbook_id,
            executed_steps=len(executed_steps),
            execution_time_ms=execution_time,
        )
        repair_result = AutoRepairResult(
            success=True,
            playbook_id=playbook.playbook_id,
            incident_id=incident.incident_id,
            executed_steps=executed_steps,
            execution_time_ms=execution_time,
        )

        await _write_audit_row(success=True, execution_time_ms=execution_time)
        self._record_auto_repair_metric(playbook, success=True)

        # Sprint 4 B1/B2 (2026-04-07): record the disposition type.
        try:
            counter = get_anomaly_counter()
            anomaly_key = self._derive_anomaly_key(incident)
            if anomaly_key:
                await counter.record_disposition(anomaly_key, triggered_by)
        except Exception as disp_err:
            logger.warning("disposition_record_failed", error=str(disp_err))

        # Phase 25 P1 (2026-04-04): after a successful repair, generate an
        # AUTO_RUNBOOK in the background.
        try:
            from src.services.runbook_generator import get_runbook_generator

            symptoms_hash = self._extract_symptoms(incident).compute_hash()
            generator = get_runbook_generator()
            _track(
                asyncio.create_task(
                    generator.generate_runbook(incident, playbook, repair_result, symptoms_hash)
                )
            )
        except Exception as rg_err:
            logger.warning("runbook_generator_task_failed", error=str(rg_err))

        # Wave4 P1.3+P1.4 (2026-04-26): post-execution verification plus
        # learning feedback, run in the background. Per the original note the
        # verifier has a 10s warmup and 30s timeout, so it must not block here.
        try:
            from src.services.post_execution_verifier import get_post_execution_verifier
            from src.services.learning_service import get_learning_service

            action_taken = f"auto_repair:{playbook.playbook_id}"
            verifier = get_post_execution_verifier()
            learning = get_learning_service()

            async def _verify_and_learn() -> None:
                try:
                    verification_result = await verifier.verify(
                        incident=incident,
                        snapshot=None,
                        action_taken=action_taken,
                    )
                    await learning.record_verification_result(
                        incident_id=incident.incident_id,
                        action_taken=action_taken,
                        verification_result=verification_result,
                        matched_playbook_id=playbook.playbook_id,
                    )
                    logger.info(
                        "auto_repair_verify_and_learn_done",
                        incident_id=incident.incident_id,
                        playbook_id=playbook.playbook_id,
                        verification_result=verification_result,
                    )

                    # P3.1-T1 (2026-04-27): a failed/degraded verification
                    # triggers escalation or an automatic rollback.
                    if verification_result not in ("failed", "degraded"):
                        return
                    if self._should_escalate_failed_verification(incident, playbook):
                        await self._escalate_failed_verification(
                            incident=incident,
                            playbook=playbook,
                            verification_result=verification_result,
                        )
                        return

                    try:
                        from src.services.rollback_manager import get_rollback_manager
                        from src.services.declarative_remediation import DeclarativeRemediation
                        from src.core.metrics import ROLLBACK_EXECUTED_TOTAL

                        # Derive target / namespace / action from the incident.
                        rb_target = (incident.affected_services or ["unknown"])[0]
                        rb_namespace = "awoooi-prod"
                        rb_action = f"kubectl rollout restart deployment/{rb_target} -n {rb_namespace}"
                        spec = DeclarativeRemediation().evaluate(
                            action=rb_action,
                            target=rb_target,
                            namespace=rb_namespace,
                        )
                        rollback_result = await get_rollback_manager().trigger(
                            incident_id=incident.incident_id,
                            spec=spec,
                            verification_result=verification_result,
                        )
                        rb_status = "success" if rollback_result.success else "failed"
                        if rollback_result.convergence_confirmed:
                            rb_reason = "converged"
                        elif rollback_result.error and "revision" in (rollback_result.error or ""):
                            rb_reason = "no_previous_revision"
                        else:
                            rb_reason = "error"
                        ROLLBACK_EXECUTED_TOTAL.labels(
                            status=rb_status, reason=rb_reason
                        ).inc()
                        logger.info(
                            "auto_rollback_triggered",
                            incident_id=incident.incident_id,
                            rollback_success=rollback_result.success,
                            convergence_confirmed=rollback_result.convergence_confirmed,
                            rollback_error=rollback_result.error,
                        )
                    except Exception as rb_err:
                        logger.exception(
                            "auto_rollback_failed",
                            incident_id=incident.incident_id,
                            error=str(rb_err),
                        )
                except Exception as inner_err:
                    logger.warning(
                        "auto_repair_verify_and_learn_failed",
                        incident_id=incident.incident_id,
                        error=str(inner_err),
                    )

            _track(asyncio.create_task(_verify_and_learn()))
        except Exception as vl_err:
            logger.warning("auto_repair_verifier_setup_failed", error=str(vl_err))

        return repair_result

    except Exception as e:
        # Update the failure statistics. Guarded so that a statistics outage
        # cannot prevent the audit row below from being written.
        try:
            await self._playbook_service.record_execution(
                playbook_id=playbook.playbook_id,
                success=False,
            )
        except Exception as stats_err:
            logger.error(
                "auto_repair_stats_record_failed",
                playbook_id=playbook.playbook_id,
                error=str(stats_err),
            )

        execution_time = int((time.perf_counter() - started) * 1000)
        logger.error(
            "auto_repair_execute_failed",
            incident_id=incident.incident_id,
            playbook_id=playbook.playbook_id,
            error=str(e),
        )
        fail_result = AutoRepairResult(
            success=False,
            playbook_id=playbook.playbook_id,
            incident_id=incident.incident_id,
            executed_steps=executed_steps,
            error=str(e),
            execution_time_ms=execution_time,
        )

        await _write_audit_row(
            success=False,
            execution_time_ms=execution_time,
            error_message=str(e),
        )
        self._record_auto_repair_metric(playbook, success=False)

        # Phase 25 P1 (2026-04-04): after a failed repair, generate an
        # ANTI_PATTERN in the background (tracked like the success-path tasks).
        try:
            from src.services.runbook_generator import get_runbook_generator

            symptoms_hash = self._extract_symptoms(incident).compute_hash()
            generator = get_runbook_generator()
            _track(
                asyncio.create_task(
                    generator.generate_anti_pattern(incident, playbook, fail_result, symptoms_hash)
                )
            )
        except Exception as ap_err:
            logger.warning("anti_pattern_task_failed", error=str(ap_err))

        return fail_result
# === Private Helpers ===
@staticmethod
def _derive_anomaly_key(incident: Incident) -> str | None:
    """Return the anomaly key for ``incident``.

    Delegates to ``AnomalyCounter.derive_key_from_incident`` (I1+S1 fix,
    2026-04-07) so the key derivation lives in a single place.
    """
    from src.services.anomaly_counter import AnomalyCounter as _Counter

    anomaly_key = _Counter.derive_key_from_incident(incident)
    return anomaly_key
def _extract_symptoms(self, incident: Incident) -> SymptomPattern:
    """Build the symptom pattern of ``incident`` used for playbook matching.

    Collects one alert name per signal, short annotation strings as
    keywords (at most 10 are kept), the affected services and the severity
    (defaulting to "P2" when the incident has none).
    """
    names = []
    short_texts = []
    for sig in incident.signals or []:
        # Prefer the raw Prometheus alertname label; signal.alert_name may
        # only carry a category value such as "custom" (2026-04-09, L7 E2E fix).
        label_name = sig.labels.get("alertname") if sig.labels else None
        names.append(label_name or sig.alert_name)

        # Keywords come from annotation values that are short strings.
        if not sig.annotations:
            continue
        short_texts.extend(
            text
            for text in sig.annotations.values()
            if isinstance(text, str) and len(text) < 50
        )

    if incident.severity:
        severity_range = [incident.severity.value]
    else:
        severity_range = ["P2"]

    return SymptomPattern(
        alert_names=names,
        affected_services=incident.affected_services or [],
        severity_range=severity_range,
        keywords=short_texts[:10],
    )
def _get_max_risk_level(self, playbook: Playbook) -> RiskLevel:
    """Return the highest risk level among the playbook's repair steps.

    Returns ``RiskLevel.LOW`` for a playbook without steps; risk levels
    missing from the ranking table are ranked like LOW.
    """
    rank = {
        RiskLevel.LOW: 0,
        RiskLevel.MEDIUM: 1,
        RiskLevel.HIGH: 2,
        RiskLevel.CRITICAL: 3,
    }
    # LOW goes first so it wins ties, exactly like a strict ">" scan would.
    candidates = [RiskLevel.LOW]
    candidates.extend(step.risk_level for step in playbook.repair_steps)
    return max(candidates, key=lambda level: rank.get(level, 0))
def _record_auto_repair_metric(self, playbook: Playbook, success: bool) -> None:
    """Report an executed auto-repair to the Prometheus metrics.

    2026-05-06 ogt + Codex: the DB already had auto_repair_executions, but
    ``core.metrics.record_auto_repair()`` had no caller for a long time, so
    Prometheus-based governance/heartbeat views looked as if the flywheel
    did nothing. The label is the first step's action_type rather than the
    playbook_id, to avoid high cardinality.

    Any error is logged as a warning and swallowed.
    """
    try:
        from src.core.metrics import record_auto_repair

        steps = playbook.repair_steps
        if steps:
            action = steps[0].action_type.value
        else:
            action = "unknown"

        tier_by_risk = {
            RiskLevel.LOW: 1,
            RiskLevel.MEDIUM: 2,
            RiskLevel.HIGH: 3,
            RiskLevel.CRITICAL: 4,
        }
        tier = tier_by_risk.get(self._get_max_risk_level(playbook), 0)
        record_auto_repair(action=action, tier=tier, success=success)
    except Exception as e:
        logger.warning(
            "auto_repair_metric_record_failed",
            playbook_id=playbook.playbook_id,
            success=success,
            error=str(e),
        )
def _is_host_or_backup_incident(self, incident: Incident) -> bool:
    """Tell whether ``incident`` is a host or backup alert.

    Such incidents may only go through SSH / read-only diagnosis and must
    not get K8s rollout-style repairs. An incident qualifies when its
    ``alert_category`` is "host_resource" or "backup_failure"
    (case-insensitive), or when any signal's alert name starts with "Host".
    """
    category = (getattr(incident, "alert_category", None) or "").lower()
    if category in ("host_resource", "backup_failure"):
        return True

    def _alert_name(sig) -> str:
        # Label first, then the signal's own alert_name, then empty.
        return str((sig.labels or {}).get("alertname") or sig.alert_name or "")

    return any(
        _alert_name(sig).startswith(("HostBackup", "Host"))
        for sig in incident.signals or []
    )
def _playbook_has_k8s_steps(self, playbook: Playbook) -> bool:
    """Tell whether the playbook contains any K8s command.

    Used to keep host alerts from running deployment operations by mistake.
    A step counts when its action type is KUBECTL or its command starts
    with "kubectl " (after trimming, case-insensitive).
    """

    def _is_k8s(step) -> bool:
        normalized = (step.command or "").strip().lower()
        return step.action_type == ActionType.KUBECTL or normalized.startswith("kubectl ")

    return any(_is_k8s(step) for step in playbook.repair_steps)
def _should_escalate_failed_verification(self, incident: Incident, playbook: Playbook) -> bool:
    """Decide whether a failed verification must be escalated instead of rolled back.

    A synthesized K8s rollback is forbidden for host/backup incidents and
    for playbooks without K8s steps; both cases return True.
    """
    if self._is_host_or_backup_incident(incident):
        return True
    return not self._playbook_has_k8s_steps(playbook)
async def _escalate_failed_verification(
    self,
    *,
    incident: Incident,
    playbook: Playbook,
    verification_result: str,
) -> None:
    """Notify the emergency channel when verification failed but rollback is unsafe.

    Logs a warning and calls ``escalate_auto_repair_unavailable`` with the
    first affected service (or "unknown") as target. Errors raised while
    escalating are logged and swallowed.
    """
    services = incident.affected_services or ["unknown"]
    target = services[0]
    namespace = "awoooi-prod"
    alert_type = self._incident_alert_type(incident)
    failure_reason = (
        f"auto repair playbook {playbook.playbook_id} verification={verification_result}; "
        "rollback is unsafe for host/backup or non-K8s remediation"
    )
    attempted = (
        f"auto_repair:{playbook.playbook_id} -> verifier:{verification_result} "
        "-> emergency_intervention"
    )

    logger.warning(
        "auto_repair_verification_failed_emergency",
        incident_id=incident.incident_id,
        playbook_id=playbook.playbook_id,
        verification_result=verification_result,
        target=target,
    )

    try:
        from src.services.emergency_escalation_service import (
            escalate_auto_repair_unavailable,
        )

        await escalate_auto_repair_unavailable(
            incident_id=incident.incident_id,
            approval_id=None,
            alert_type=alert_type,
            target_resource=target,
            namespace=namespace,
            failure_reason=failure_reason,
            attempted_actions=attempted,
        )
    except Exception as exc:
        logger.warning(
            "auto_repair_verification_emergency_failed",
            incident_id=incident.incident_id,
            playbook_id=playbook.playbook_id,
            error=str(exc),
        )
def _incident_alert_type(self, incident: Incident) -> str:
    """Best-effort alert name for emergency cards.

    Returns the first non-empty alert name found on the incident's signals
    (the ``alertname`` label wins over ``signal.alert_name``), or
    "AutoRepairVerificationFailed" when there is none.
    """
    candidates = (
        (sig.labels or {}).get("alertname") or sig.alert_name
        for sig in incident.signals or []
    )
    first_named = next((name for name in candidates if name), None)
    if first_named:
        return str(first_named)
    return "AutoRepairVerificationFailed"
def _risk_exceeds_threshold(self, risk: RiskLevel) -> bool:
    """Tell whether ``risk`` is above the auto-repair threshold (HIGH or CRITICAL)."""
    return risk in {RiskLevel.HIGH, RiskLevel.CRITICAL}
async def _check_cold_start_daily_limit(self) -> bool:
    """Check whether today's cold-start auto repairs are still within the daily limit.

    Increments a Redis counter whose key contains today's date (Taipei
    time) and compares it with ``COLD_START_TRUST_DAILY_LIMIT``
    (2026-04-07, option C: daily cap protecting the cold-start path).

    Returns:
        True when the incremented count is within the limit. False when the
        limit is exceeded, when Redis is unavailable, or when the check
        itself fails (conservative refusal).
    """
    try:
        from src.core.redis_client import get_redis

        client = await get_redis()
        if client is None:
            # Redis unavailable: refuse conservatively.
            return False

        from src.utils.timezone import now_taipei

        counter_key = f"cold_start_trust:{now_taipei().strftime('%Y-%m-%d')}"
        used_today = await client.incr(counter_key)
        if used_today == 1:
            # Key was just created: expire after 25 hours so it is cleaned
            # up once the day has rolled over.
            await client.expire(counter_key, 90000)

        if used_today <= self.COLD_START_TRUST_DAILY_LIMIT:
            return True

        logger.warning(
            "cold_start_daily_limit_reached",
            today_key=counter_key,
            count=used_today,
            limit=self.COLD_START_TRUST_DAILY_LIMIT,
        )
        return False
    except Exception as e:
        logger.warning("cold_start_daily_limit_check_failed", error=str(e))
        # Safe degradation: a failed check is a conservative refusal.
        return False
async def _execute_step(self, incident: Incident, step) -> str:
    """Execute a single repair step and return a status string.

    Handled action types:
    - MANUAL: not executed; returns "SKIPPED (manual step)".
    - KUBECTL: ``{target}`` in the command is replaced with the first
      affected service (when the incident has any) and the command is run
      through the executor's ``execute_kubectl_command``. Returns "SUCCESS"
      or "FAILED: <error>"; an ImportError yields
      "SKIPPED (executor not available)".
    - SSH_COMMAND: routed by URI scheme through
      ``HostRepairAgent.repair_by_uri`` (Sprint 3, 2026-04-06). The step is
      passed as approved unless it has a truthy ``requires_approval``.

    Any other action type returns "UNKNOWN_ACTION_TYPE".
    """
    kind = step.action_type

    if kind == ActionType.MANUAL:
        return "SKIPPED (manual step)"

    if kind == ActionType.KUBECTL:
        try:
            executor = get_executor()
            services = incident.affected_services
            # Substitute the real target for the {target} placeholder.
            command = step.command.replace("{target}", services[0]) if services else step.command
            outcome = await executor.execute_kubectl_command(command)
            if outcome.success:
                return "SUCCESS"
            return f"FAILED: {outcome.error}"
        except ImportError:
            logger.warning("action_executor_not_available")
            return "SKIPPED (executor not available)"

    if kind == ActionType.SSH_COMMAND:
        from src.services.host_repair_agent import HostRepairAgent

        agent = HostRepairAgent()
        needs_approval = getattr(step, "requires_approval", False)
        outcome = await agent.repair_by_uri(step.command, approved=not needs_approval)
        return f"SUCCESS: {outcome.output}" if outcome.success else f"FAILED: {outcome.error}"

    return "UNKNOWN_ACTION_TYPE"
# === ADR-037: Tier-based Repair (2026-03-29) ===
# Maps each repair tier to the action names available at that tier.
TIER_ACTIONS = {
    1: ["restart_pod", "restart_container"],  # temporary fix
    2: ["scale_up", "increase_memory", "adjust_limits"],  # mitigation
    3: ["apply_hotfix", "update_config", "patch_deployment"],  # root-cause fix
    4: ["create_issue", "notify_team", "schedule_fix"],  # architectural fix
}
async def determine_repair_tier(
    self,
    anomaly_key: str,
    frequency: AnomalyFrequency,
) -> int:
    """Choose the repair tier from anomaly frequency and repair history (ADR-037).

    Commander's guidance (2026-03-29): restarting only treats the symptom;
    anomalies that recur too often must be fixed for good. The tier is
    therefore raised according to frequency and past repairs.

    Decision order:
      - a permanent fix was already applied and it still fails -> 4
      - escalation level "PERMANENT_FIX" (>= 10 times in 24h)  -> 3
      - escalation level "ESCALATE" (>= 5 times in 24h)        -> 2
      - at least 2 recorded restarts                           -> 2
      - otherwise                                              -> 1

    Returns:
        1: temporary fix (restart)
        2: mitigation (scale up)
        3: root-cause fix (configuration change)
        4: architectural fix (needs development)
    """
    # Repair history is fetched first, regardless of which rule fires.
    counter = get_anomaly_counter()
    stats = await counter.get_all_repair_stats(anomaly_key)
    restart_count = sum(
        stats.get(action, {}).get("total", 0)
        for action in ("restart_pod", "restart_container")
    )

    if frequency.permanent_fix_applied:
        tier, reason = 4, "permanent_fix_still_failing"
    elif frequency.escalation_level == "PERMANENT_FIX":
        tier, reason = 3, "escalation_permanent_fix"
    elif frequency.escalation_level == "ESCALATE":
        tier, reason = 2, "escalation_escalate"
    elif restart_count >= 2:
        tier, reason = 2, f"restart_count_{restart_count}"
    else:
        # Default: temporary fix; this case is not logged.
        return 1

    logger.info(
        "tier_decision",
        anomaly_key=anomaly_key,
        tier=tier,
        reason=reason,
    )
    return tier
def get_tier_actions(self, tier: int) -> list[str]:
    """Return the repair actions available at ``tier`` (ADR-037).

    Args:
        tier: Repair tier; unknown tiers fall back to tier 1.

    Returns:
        A new list on every call, so callers may modify the result without
        altering the shared ``TIER_ACTIONS`` table.
    """
    actions = self.TIER_ACTIONS.get(tier, self.TIER_ACTIONS[1])
    return list(actions)
async def record_repair_result(
    self,
    anomaly_key: str,
    action: str,
    success: bool,
    tier: int = 1,
) -> None:
    """Record a repair outcome in the AnomalyCounter (ADR-037).

    Args:
        anomaly_key: Key of the anomaly.
        action: Repair action that was taken.
        success: Whether the repair succeeded.
        tier: Repair tier; a successful repair at tier 3 or above is also
            marked as an applied permanent fix.
    """
    counter = get_anomaly_counter()
    await counter.record_repair_attempt(anomaly_key, action, success)

    is_permanent_fix = success and tier >= 3
    if is_permanent_fix:
        await counter.mark_permanent_fix_applied(
            anomaly_key=anomaly_key,
            fix_description=f"Tier {tier} repair: {action}",
        )

    logger.info(
        "repair_result_recorded",
        anomaly_key=anomaly_key,
        action=action,
        success=success,
        tier=tier,
    )
# =============================================================================
# Singleton
# =============================================================================
_service: AutoRepairService | None = None


def get_auto_repair_service() -> IAutoRepairService:
    """Return the module-wide AutoRepairService, creating it on first use."""
    global _service
    if _service is not None:
        return _service
    _service = AutoRepairService()
    return _service
def set_auto_repair_service(service: AutoRepairService | None) -> None:
    """Install ``service`` as the module-wide instance (for DI or tests).

    Passing None clears the current instance.
    """
    global _service
    _service = service