"""
Failure Watcher Service - Phase 18 失敗自動修復閉環
====================================================
Phase 18: Failure Auto-Repair Loop (2026-03-31 統帥批准)
職責:
- 監聽 AuditLog 失敗事件
- AI 分析失敗原因
- 評估風險等級
- 執行自動修復 (LOW 風險) 或請求人工授權 (MEDIUM/CRITICAL)
設計原則:
- 實作 IFailureWatcher Protocol
- 使用 OpenClaw 進行 AI 分析
- 與 Telegram 整合推送修復請求
版本: v1.0
建立: 2026-03-31 (台北時區)
建立者: Claude Code (Phase 18 失敗自動修復)
"""
import json
from dataclasses import dataclass
from datetime import UTC, datetime
import structlog
from src.core.redis_client import get_redis
from src.db.base import get_db_context
from src.db.models import AuditLog
from src.repositories.interfaces import IFailureWatcher
logger = structlog.get_logger(__name__)
# =============================================================================
# Constants
# =============================================================================
# Failure classification table: label -> keywords.
# _classify_by_rules lower-cases the error message and returns the FIRST label
# (in dict order) that has any keyword occurring as a substring, so entry order
# is significant. Example: "forbidden" is listed under K8S_ERROR, which comes
# before PERMISSION_DENIED, so a "403 forbidden" message is labelled K8S_ERROR.
FAILURE_CLASSIFICATIONS = {
"TIMEOUT": ["timeout", "timed out", "deadline exceeded"],
"K8S_ERROR": ["kubernetes", "k8s", "pod", "deployment", "service", "forbidden"],
"NETWORK_ERROR": ["connection", "network", "unreachable", "dns", "resolve"],
"PERMISSION_DENIED": ["permission", "denied", "unauthorized", "403", "401"],
"RESOURCE_ERROR": ["oom", "memory", "cpu", "quota", "limit"],
}
# Risk level configuration.
# Within this module only RISK_LEVELS["LOW"]["auto_repair"] is read (by
# process_failure, as the switch that enables auto-repair). The "operations"
# lists are not consulted by any code visible here.
# NOTE(review): presumably they document which operations belong to each
# level - confirm against other callers.
RISK_LEVELS = {
"LOW": {
"auto_repair": True,
"operations": ["restart_pod", "restart_deployment", "clear_cache"],
},
"MEDIUM": {
"auto_repair": False,
"operations": ["scale_deployment", "rollback", "update_config"],
},
"CRITICAL": {
"auto_repair": False,
"operations": ["delete_pvc", "drop_database", "network_policy"],
},
}
# Redis Stream key and consumer group name for failure events.
# Not referenced elsewhere in this module.
# NOTE(review): presumably used by the stream consumer - confirm.
FAILURE_STREAM_KEY = "awoooi:failures"
FAILURE_CONSUMER_GROUP = "failure_watchers"
# Upper bound on auto-repairs per namespace/resource inside the cooldown
# window enforced by _check_repair_cooldown (which uses a 300-second expiry).
MAX_AUTO_REPAIR_RETRIES = 3
# =============================================================================
# Failure Analysis Result
# =============================================================================
@dataclass
class FailureAnalysis:
    """Structured outcome of analysing a single failure."""

    classification: str  # failure category label, e.g. TIMEOUT / K8S_ERROR
    root_cause: str  # human-readable explanation of the failure
    suggested_repair: str  # proposed repair strategy
    risk_level: str  # LOW / MEDIUM / CRITICAL
    confidence: float  # confidence score attached to the analysis

    def to_dict(self) -> dict:
        """Return the five analysis fields as a plain dict, in field order."""
        exported = (
            "classification",
            "root_cause",
            "suggested_repair",
            "risk_level",
            "confidence",
        )
        return {field_name: getattr(self, field_name) for field_name in exported}
# =============================================================================
# Failure Watcher Service
# =============================================================================
class FailureWatcherService(IFailureWatcher):
"""
失敗監聯服務 - Phase 18 核心元件
流程:
1. 收到失敗事件 (from Redis Stream or direct call)
2. AI 分析失敗原因 (OpenClaw)
3. 評估風險等級
4. LOW → 自動修復 → 揭露通知
5. MEDIUM/CRITICAL → Telegram + 前端等待授權
"""
def __init__(self) -> None:
pass # Stateless service
async def process_failure(
    self,
    audit_log_id: str,
    failure_data: dict,
) -> dict:
    """Analyze one failure event, then auto-repair it or ask a human.

    Flow:
      1. Analyze the failure via ``analyze_failure`` and store the
         classification on the AuditLog row.
      2. Consult the global repair cooldown; when it blocks, force the
         result to CRITICAL, request approval and return early.
      3. For LOW risk only, consult the per-resource cooldown; when the
         budget is exhausted the failure is escalated to MEDIUM.
      4. LOW risk: run ``execute_auto_repair``; on success record the
         global repair action and push a notification, on failure
         escalate to MEDIUM and request approval.
      5. Any other risk level: request approval.

    Args:
        audit_log_id: ID of the AuditLog row describing the failure.
        failure_data: Mapping read for ``error_message``,
            ``operation_type``, ``target_resource`` and ``namespace``;
            it is passed through unchanged to ``execute_auto_repair``.

    Returns:
        Dict with keys ``repair_attempted``, ``repair_result``,
        ``risk_level`` (effective level, after any escalation),
        ``next_action`` (``"auto_repaired"``, ``"await_approval"`` or
        ``"blocked_by_global_cooldown"``) and ``analysis`` (the unmodified
        output of ``analyze_failure``).
    """
    # `or` instead of a .get() default so an explicit None value is
    # normalised too; the analysis helpers call .lower() on these.
    error_message = failure_data.get("error_message") or "Unknown error"
    operation_type = failure_data.get("operation_type") or "UNKNOWN"
    target_resource = failure_data.get("target_resource") or "unknown"

    logger.info(
        "failure_watcher_processing",
        audit_log_id=audit_log_id,
        operation_type=operation_type,
        target_resource=target_resource,
    )

    # 1. Analyze the failure cause.
    analysis = await self.analyze_failure(
        error_message=error_message,
        operation_type=operation_type,
        target_resource=target_resource,
    )

    # 2. Persist the classification before any repair is attempted.
    await self._update_audit_log_classification(
        audit_log_id=audit_log_id,
        classification=analysis["classification"],
        auto_repair_attempted=False,
    )

    # 3. Decide what to do based on the risk level.
    risk_level = analysis["risk_level"]
    auto_repair_enabled = RISK_LEVELS["LOW"]["auto_repair"]
    result = {
        "repair_attempted": False,
        "repair_result": None,
        "risk_level": risk_level,
        "next_action": "unknown",
        "analysis": analysis,
    }

    # Global auto-repair circuit breaker (ADR-040). Imported lazily, as in
    # the rest of this module's service-to-service calls.
    from src.services.global_repair_cooldown import check_global_repair_cooldown

    can_global_repair, global_reason = await check_global_repair_cooldown(
        incident_id=audit_log_id,
        affected_services=[target_resource],
    )
    if not can_global_repair:
        logger.warning(
            "global_repair_cooldown_blocked",
            audit_log_id=audit_log_id,
            target_resource=target_resource,
            reason=global_reason,
        )
        # Forced escalation: a human must authorise while the breaker is open.
        result["risk_level"] = "CRITICAL"
        result["next_action"] = "blocked_by_global_cooldown"
        await self._request_human_approval(
            audit_log_id=audit_log_id,
            # Show the effective (escalated) risk in the approval request.
            analysis={**analysis, "risk_level": "CRITICAL"},
            reason=global_reason,
        )
        return result

    # Per-resource cooldown (repair-storm guard). Only consulted when an
    # auto-repair would actually run: the check consumes one unit of the
    # resource's repair budget, and escalating to MEDIUM must never
    # overwrite (downgrade) a CRITICAL assessment.
    if risk_level == "LOW" and auto_repair_enabled:
        can_auto_repair = await self._check_repair_cooldown(
            target_resource=target_resource,
            namespace=failure_data.get("namespace", "awoooi"),
        )
        if not can_auto_repair:
            logger.info(
                "repair_cooldown_escalate",
                audit_log_id=audit_log_id,
                target_resource=target_resource,
                original_risk_level=risk_level,
            )
            risk_level = "MEDIUM"
            result["risk_level"] = "MEDIUM"

    if risk_level == "LOW" and auto_repair_enabled:
        # Auto-repair path; the full failure_data is handed to the executor.
        success, repair_result = await self.execute_auto_repair(
            audit_log_id=audit_log_id,
            repair_strategy=analysis["suggested_repair"],
            failure_data=failure_data,
        )
        result["repair_attempted"] = True
        result["repair_result"] = repair_result

        await self._update_audit_log_classification(
            audit_log_id=audit_log_id,
            classification=analysis["classification"],
            auto_repair_attempted=True,
            auto_repair_result=repair_result,
        )

        if success:
            result["next_action"] = "auto_repaired"
            # Feed the global circuit breaker (ADR-040).
            from src.services.global_repair_cooldown import record_global_repair_action

            await record_global_repair_action()
            # Disclosure notification for the successful automatic repair.
            await self._push_repair_notification(
                audit_log_id=audit_log_id,
                repair_result=repair_result,
                auto=True,
            )
        else:
            # Repair failed: escalate to MEDIUM and hand over to a human.
            # Keep the local in sync so the final log line agrees with
            # the returned result.
            risk_level = "MEDIUM"
            result["risk_level"] = "MEDIUM"
            await self._request_human_approval(
                audit_log_id=audit_log_id,
                analysis={**analysis, "risk_level": risk_level},
                reason="自動修復失敗,需人工介入",
            )
            result["next_action"] = "await_approval"
    else:
        # MEDIUM / CRITICAL (or LOW escalated by the cooldown): ask a human.
        await self._request_human_approval(
            audit_log_id=audit_log_id,
            analysis={**analysis, "risk_level": risk_level},
            reason=f"風險等級 {risk_level},需人工審核",
        )
        result["next_action"] = "await_approval"

    logger.info(
        "failure_watcher_processed",
        audit_log_id=audit_log_id,
        risk_level=risk_level,
        next_action=result["next_action"],
        repair_attempted=result["repair_attempted"],
    )
    return result
async def analyze_failure(
    self,
    error_message: str,
    operation_type: str,
    target_resource: str,
) -> dict:
    """Analyze why an operation failed.

    The rule engine classifies the message and rates the operation first;
    the LLM analysis is then attempted and, when it yields a result, that
    result is returned as-is. Otherwise a rule-based analysis is returned
    with a fixed confidence of 0.4.

    Args:
        error_message: Raw error text of the failed operation.
        operation_type: Name of the operation that failed.
        target_resource: Resource the operation was aimed at.

    Returns:
        Dict with at least ``classification``, ``root_cause``,
        ``suggested_repair``, ``risk_level`` and ``confidence``.
    """
    # Rule engine: quick classification and operation-based risk rating.
    rule_label = self._classify_by_rules(error_message)
    rule_risk = self._assess_risk_level(operation_type)

    # LLM analysis; a falsy result means "fall back to the rules".
    llm_verdict = await self._llm_analyze(
        error_message=error_message,
        operation_type=operation_type,
        target_resource=target_resource,
        initial_classification=rule_label,
    )
    if llm_verdict:
        return llm_verdict

    # Fallback: embed a preview of the real error text so the root cause
    # shown to the user is informative, not just the classification label.
    if error_message:
        preview = error_message[:200]
    else:
        preview = "未知錯誤"
    preview = preview.strip()

    fallback_root_cause = (
        f"[{rule_label}] {operation_type} 操作在 {target_resource} 失敗\n"
        f"錯誤:{preview}"
    )
    return {
        "classification": rule_label,
        "root_cause": fallback_root_cause,
        "suggested_repair": self._suggest_repair(rule_label),
        "risk_level": rule_risk,
        "confidence": 0.4,
    }
async def execute_auto_repair(
    self,
    audit_log_id: str,
    repair_strategy: str,
    failure_data: dict | None = None,
) -> tuple[bool, str]:
    """Execute an automatic repair (intended for LOW-risk failures only).

    Supported strategies (matched as substrings of ``repair_strategy``,
    case-insensitively):
      - ``restart`` with target ``deployment/<name>``: restart the deployment
      - ``restart`` with target ``pod/<name>``: delete the pod
      - ``clear_cache``: delete Redis keys matching ``awoooi:cache:*``

    Args:
        audit_log_id: AuditLog ID, used for log correlation.
        repair_strategy: Strategy text, e.g. ``"restart_deployment"``.
        failure_data: Optional mapping read for ``target_resource``
            (``"<type>/<name>"``) and ``namespace`` (default ``"awoooi"``).

    Returns:
        ``(success, result_message)``. This method does not raise: any
        exception is logged and reported as ``(False, message)``.
    """
    data = failure_data or {}
    logger.info(
        "auto_repair_executing",
        audit_log_id=audit_log_id,
        strategy=repair_strategy,
        target=data.get("target_resource"),
    )
    try:
        strategy = repair_strategy.lower()
        # `or` so an explicit None falls back like a missing key does.
        target_resource = data.get("target_resource") or ""
        namespace = data.get("namespace") or "awoooi"

        # "deployment/api" -> ("deployment", "api"); without a "/" nothing
        # is parsed and no restart can be performed.
        kind, separator, name = target_resource.partition("/")
        resource_type = kind.lower() if separator else ""
        resource_name = name if separator else ""

        if "restart" in strategy and resource_name:
            return await self._restart_k8s_resource(
                audit_log_id=audit_log_id,
                resource_type=resource_type,
                resource_name=resource_name,
                namespace=namespace,
            )
        if "clear_cache" in strategy:
            return await self._clear_cache_keys(audit_log_id)
        if "restart" in strategy:
            # The strategy is known; the target is what cannot be used.
            # Report that instead of a misleading "unknown strategy".
            logger.warning(
                "auto_repair_target_unparseable",
                audit_log_id=audit_log_id,
                strategy=repair_strategy,
                target_resource=target_resource,
            )
            return False, f"無法解析目標資源: {target_resource}"
        return False, f"未知修復策略: {repair_strategy}"
    except Exception as e:
        logger.exception(
            "auto_repair_error",
            audit_log_id=audit_log_id,
            strategy=repair_strategy,
            error=str(e),
        )
        return False, f"修復執行失敗: {e}"

async def _restart_k8s_resource(
    self,
    audit_log_id: str,
    resource_type: str,
    resource_name: str,
    namespace: str,
) -> tuple[bool, str]:
    """Validate (dry-run) and then restart a deployment or delete a pod."""
    from src.services.executor import OperationType, get_executor

    executor = get_executor()

    if resource_type == "deployment":
        operation = OperationType.RESTART_DEPLOYMENT
        perform = executor.restart_deployment
        done_event, done_field = "auto_repair_deployment_restarted", "deployment"
        done_message = f"✅ Deployment {resource_name} 已重啟"
        failed_prefix = "❌ 重啟失敗"
    elif resource_type == "pod":
        operation = OperationType.DELETE_POD
        perform = executor.delete_pod
        done_event, done_field = "auto_repair_pod_deleted", "pod"
        done_message = f"✅ Pod {resource_name} 已刪除,等待重建"
        failed_prefix = "❌ 刪除失敗"
    else:
        # Unknown resource type: record it, execute nothing.
        logger.warning(
            "auto_repair_unknown_resource_type",
            audit_log_id=audit_log_id,
            resource_type=resource_type,
            resource_name=resource_name,
        )
        return False, f"未知資源類型: {resource_type}"

    # Dry-run validation before touching the cluster.
    dry_run = await executor.validate_action(
        operation_type=operation,
        resource_name=resource_name,
        namespace=namespace,
    )
    if not dry_run.passed:
        logger.warning(
            "auto_repair_dry_run_failed",
            audit_log_id=audit_log_id,
            resource=f"{resource_type}/{resource_name}",
            reason=dry_run.message,
        )
        return False, f"Dry-run 失敗: {dry_run.message}"

    outcome = await perform(name=resource_name, namespace=namespace)
    if not outcome.success:
        return False, f"{failed_prefix}: {outcome.message}"

    logger.info(
        done_event,
        audit_log_id=audit_log_id,
        **{done_field: resource_name},
        namespace=namespace,
    )
    return True, done_message

async def _clear_cache_keys(self, audit_log_id: str) -> tuple[bool, str]:
    """Delete Redis keys under the ``awoooi:cache:`` prefix only."""
    redis = get_redis()
    # SCAN walks the keyspace incrementally; KEYS would block the Redis
    # server for the whole scan. SCAN may yield a key more than once, so
    # collect into a set before deleting/counting.
    keys = list({key async for key in redis.scan_iter(match="awoooi:cache:*")})
    if not keys:
        return True, "ℹ️ 無快取需清理"

    await redis.delete(*keys)
    logger.info(
        "auto_repair_cache_cleared",
        audit_log_id=audit_log_id,
        keys_deleted=len(keys),
    )
    return True, f"✅ 已清理 {len(keys)} 個快取 key"
# =========================================================================
# Private Methods
# =========================================================================
def _classify_by_rules(self, error_message: str) -> str:
"""規則引擎快速分類"""
error_lower = error_message.lower()
for classification, keywords in FAILURE_CLASSIFICATIONS.items():
if any(kw in error_lower for kw in keywords):
return classification
return "UNKNOWN"
def _assess_risk_level(self, operation_type: str) -> str:
"""評估風險等級"""
op_lower = operation_type.lower()
# CRITICAL 操作
if any(kw in op_lower for kw in ["delete", "drop", "force"]):
return "CRITICAL"
# MEDIUM 操作
if any(kw in op_lower for kw in ["scale", "rollback", "update", "patch"]):
return "MEDIUM"
# LOW 操作 (重啟類)
if any(kw in op_lower for kw in ["restart", "refresh", "clear"]):
return "LOW"
return "MEDIUM" # 預設 MEDIUM
def _suggest_repair(self, classification: str) -> str:
"""基於分類建議修復策略"""
suggestions = {
"TIMEOUT": "增加超時時間或重試操作",
"K8S_ERROR": "檢查 K8s 資源狀態,考慮重啟 Pod",
"NETWORK_ERROR": "檢查網路連線,驗證 DNS 解析",
"PERMISSION_DENIED": "檢查 RBAC 權限配置",
"RESOURCE_ERROR": "增加資源配額或清理資源",
"UNKNOWN": "需人工分析日誌",
}
return suggestions.get(classification, "需人工分析")
async def _check_repair_cooldown(
    self,
    target_resource: str,
    namespace: str,
) -> bool:
    """Check the per-resource repair cooldown (repair-storm guard).

    A Redis counter keyed by namespace and resource is incremented for
    every check. At most ``MAX_AUTO_REPAIR_RETRIES`` checks are allowed
    before the counter expires; each allowed check resets the expiry to
    300 seconds.

    The counter is incremented *before* it is compared. INCR is atomic, so
    concurrent failures on the same resource cannot all read the same
    pre-increment value and jointly exceed the limit.

    Args:
        target_resource: Resource identifier used in the counter key.
        namespace: Namespace used in the counter key.

    Returns:
        True when an auto-repair may proceed, False when the limit has
        been exceeded. Also True when the check itself fails (see below).
    """
    cooldown_key = f"awoooi:repair_cooldown:{namespace}:{target_resource}"
    try:
        redis = get_redis()

        # Atomically claim a slot, then decide.
        repair_count = int(await redis.incr(cooldown_key))
        if repair_count > MAX_AUTO_REPAIR_RETRIES:
            logger.warning(
                "repair_cooldown_exceeded",
                target_resource=target_resource,
                namespace=namespace,
                repair_count=repair_count,
                max_retries=MAX_AUTO_REPAIR_RETRIES,
            )
            # The expiry is intentionally not refreshed here, so rejected
            # attempts do not prolong the cooldown.
            return False

        await redis.expire(cooldown_key, 300)  # 5 minutes
        return True
    except Exception as e:
        logger.warning(
            "repair_cooldown_check_error",
            target_resource=target_resource,
            error=str(e),
        )
        # Deliberate fail-open: an error in the cooldown store must not
        # stop repairs from running.
        return True
async def _llm_analyze(
    self,
    error_message: str,
    operation_type: str,
    target_resource: str,
    initial_classification: str,
) -> dict | None:
    """Ask OpenClaw for a deeper analysis of the failure.

    Builds an alert context from the arguments, calls
    ``openclaw.analyze_alert`` and maps its result onto this service's
    analysis dict. The rule-engine classification is kept as-is.

    Args:
        error_message: Raw error text.
        operation_type: Name of the failed operation.
        target_resource: Resource the operation was aimed at.
        initial_classification: Label produced by the rule engine.

    Returns:
        The analysis dict, or ``None`` when OpenClaw returned no result or
        any exception occurred (the caller then falls back to the rules).
    """
    try:
        from src.services.openclaw import get_openclaw

        client = get_openclaw()

        # Alert context handed to OpenClaw.
        alert_context = {
            "alert_type": "execution_failure",
            "severity": "warning",
            "error_message": error_message,
            "operation_type": operation_type,
            "target_resource": target_resource,
            "initial_classification": initial_classification,
            "source": "failure_watcher",
        }

        # analyze_alert yields a 7-tuple; the metrics element is unused here.
        (
            verdict,
            provider,
            raw_text,
            _signoz_metrics,
            trace_url,
            token_count,
            cost_usd,
        ) = await client.analyze_alert(alert_context)

        if not verdict:
            logger.warning(
                "openclaw_failure_analysis_no_result",
                raw_response=raw_text[:200] if raw_text else None,
            )
            return None

        logger.info(
            "openclaw_failure_analysis_success",
            ai_provider=provider,
            severity=verdict.severity,
            confidence=verdict.confidence,
            tokens=token_count,
            cost_usd=cost_usd,
        )

        # Map the OpenClaw verdict onto the repair-analysis shape.
        mapped_risk = self._map_severity_to_risk(verdict.severity)
        return {
            "classification": initial_classification,  # keep the rule-engine label
            "root_cause": verdict.root_cause_analysis[:100],
            "suggested_repair": self._extract_repair_action(
                verdict.recommended_action
            ),
            "risk_level": mapped_risk,
            "confidence": verdict.confidence,
            "ai_provider": provider,
            "signoz_trace_url": trace_url,
        }
    except Exception as exc:
        logger.warning(
            "openclaw_failure_analysis_error",
            error=str(exc),
        )
        return None
def _map_severity_to_risk(self, severity: str) -> str:
"""
將 OpenClaw severity 映射到修復風險等級
Phase 18.4: 嚴重度映射
"""
severity_lower = severity.lower()
if severity_lower in ["critical", "高"]:
return "CRITICAL"
elif severity_lower in ["warning", "medium", "中"]:
return "MEDIUM"
else:
return "LOW"
def _extract_repair_action(self, recommended_action: str) -> str:
"""
從 OpenClaw 建議中提取可執行的修復動作
Phase 18.4: 動作提取
"""
action_lower = recommended_action.lower()
# 識別可自動執行的動作
if any(kw in action_lower for kw in ["restart", "重啟", "重新啟動"]):
if "deployment" in action_lower or "部署" in action_lower:
return "restart_deployment"
elif "pod" in action_lower:
return "restart_pod"
return "restart_pod" # 預設重啟 Pod
if any(kw in action_lower for kw in ["clear", "清理", "cache", "快取"]):
return "clear_cache"
if any(kw in action_lower for kw in ["scale", "擴展", "增加"]):
return "scale_up" # 需人工授權
# 無法自動執行,返回原始建議
return recommended_action[:50]
async def _update_audit_log_classification(
    self,
    audit_log_id: str,
    classification: str,
    auto_repair_attempted: bool,
    auto_repair_result: str | None = None,
) -> None:
    """Write the failure classification and repair status to an AuditLog row.

    Best effort: any exception is logged as a warning and swallowed.

    Args:
        audit_log_id: ID of the AuditLog row to update.
        classification: Value for ``failure_classification``.
        auto_repair_attempted: Value for ``auto_repair_attempted``.
        auto_repair_result: Value for ``auto_repair_result``; ``None``
            (the default) is written as-is.
    """
    new_values = {
        "failure_classification": classification,
        "auto_repair_attempted": auto_repair_attempted,
        "auto_repair_result": auto_repair_result,
    }
    try:
        async with get_db_context() as session:
            from sqlalchemy import update

            statement = (
                update(AuditLog)
                .where(AuditLog.id == audit_log_id)
                .values(**new_values)
            )
            await session.execute(statement)
            await session.commit()
            logger.debug(
                "audit_log_classification_updated",
                audit_log_id=audit_log_id,
                classification=classification,
            )
    except Exception as exc:
        logger.warning(
            "audit_log_classification_update_failed",
            audit_log_id=audit_log_id,
            error=str(exc),
        )
async def _request_human_approval(
    self,
    audit_log_id: str,
    analysis: dict,
    reason: str,
) -> None:
    """Ask a human to authorize a repair, via Redis pub/sub and Telegram.

    The two channels are attempted independently, so a failure on one
    (e.g. Redis unavailable) no longer prevents delivery on the other.
    Delivery errors are logged as warnings and swallowed.

    Args:
        audit_log_id: ID of the AuditLog row the request refers to.
        analysis: Analysis dict; read for ``classification``,
            ``risk_level``, ``root_cause`` and ``suggested_repair``.
        reason: Why approval is needed; also used as the displayed root
            cause when ``analysis`` has none.
    """
    delivered: list[str] = []

    # Channel 1: Redis pub/sub.
    # NOTE(review): presumably consumed by the frontend WebSocket bridge -
    # confirm against the subscriber.
    try:
        redis = get_redis()
        repair_request = {
            "type": "repair_request",
            "audit_log_id": audit_log_id,
            "analysis": analysis,
            "reason": reason,
            "created_at": datetime.now(UTC).isoformat(),
        }
        await redis.publish(
            "awoooi:repair_requests",
            json.dumps(repair_request),
        )
        delivered.append("redis")
    except Exception as e:
        logger.warning(
            "repair_request_send_failed",
            audit_log_id=audit_log_id,
            channel="redis",
            error=str(e),
        )

    # Channel 2: Telegram.
    try:
        from src.services.telegram_gateway import get_telegram_gateway

        tg = get_telegram_gateway()
        message = (
            f"🔧 修復請求\n\n"
            f"├ 📋 AuditLog: {audit_log_id[:8]}...\n"
            f"├ 📊 分類: {analysis.get('classification', 'UNKNOWN')}\n"
            f"├ ⚠️ 風險: {analysis.get('risk_level', 'MEDIUM')}\n"
            f"├ 🔍 原因: {analysis.get('root_cause', reason)}\n"
            f"└ 💡 建議: {analysis.get('suggested_repair', '需人工分析')}\n\n"
            f"請在 Dashboard 授權或使用 /repair {audit_log_id[:8]}"
        )
        await tg.send_alert_notification(message)
        delivered.append("telegram")
    except Exception as e:
        logger.warning(
            "repair_request_send_failed",
            audit_log_id=audit_log_id,
            channel="telegram",
            error=str(e),
        )

    if delivered:
        logger.info(
            "repair_request_sent",
            audit_log_id=audit_log_id,
            risk_level=analysis.get("risk_level"),
            channels=delivered,
        )
async def _push_repair_notification(
    self,
    audit_log_id: str,
    repair_result: str,
    auto: bool = True,
) -> None:
    """Send a "repair completed" notification through the Telegram gateway.

    Best effort: any exception is logged as a warning and swallowed.

    Args:
        audit_log_id: AuditLog ID; its first 8 characters are shown.
        repair_result: Result text included in the message.
        auto: True for an automatic repair heading, False for a manual one.
    """
    try:
        from src.services.telegram_gateway import get_telegram_gateway

        gateway = get_telegram_gateway()

        if auto:
            heading = "🤖 自動修復"
        else:
            heading = "✅ 手動修復"

        notification = (
            f"{heading} 完成\n\n"
            f"├ 📋 AuditLog: {audit_log_id[:8]}...\n"
            f"└ 📝 結果: {repair_result}"
        )
        await gateway.send_alert_notification(notification)
    except Exception as exc:
        logger.warning(
            "repair_notification_send_failed",
            audit_log_id=audit_log_id,
            error=str(exc),
        )
# =============================================================================
# Singleton
# =============================================================================
# Module-level slot holding the lazily created service instance.
_failure_watcher: FailureWatcherService | None = None


def get_failure_watcher() -> FailureWatcherService:
    """Return the shared FailureWatcherService, creating it on first use."""
    global _failure_watcher
    instance = _failure_watcher
    if instance is None:
        instance = FailureWatcherService()
        _failure_watcher = instance
    return instance