1726 lines
71 KiB
Python
1726 lines
71 KiB
Python
"""
|
||
Approval Execution Service - Phase 16 R4.2 瘦身 Router 抽取
|
||
============================================================
|
||
|
||
從 approvals.py 抽取執行編排邏輯,整合:
|
||
- OperationParser: 解析操作類型
|
||
- K8s Executor: 執行 K8s 操作
|
||
- ApprovalDBService: 更新狀態
|
||
- TimelineService: 記錄事件
|
||
- NotificationManager: 發送通知
|
||
- Phase 7.6: Playbook 自動萃取
|
||
|
||
版本: v1.2
|
||
建立: 2026-03-25 (台北時區)
|
||
更新: 2026-03-26 (Phase 7.6 自動萃取)
|
||
更新: 2026-04-14 (ADR-076 Task 3: 執行失敗重試機制 — Claude Haiku 4.5 Asia/Taipei)
|
||
建立者: Claude Code (Phase 16 R4.2)
|
||
|
||
重試設計 (ADR-076):
|
||
- MAX_RETRY = 2 次(共最多 3 次嘗試)
|
||
- RETRY_DELAY_SECONDS = 30 秒
|
||
- 只重試瞬態錯誤(connection refused, timeout, i/o error 等)
|
||
- 永久性錯誤(not found, permission denied, already exists)不重試
|
||
"""
|
||
|
||
import asyncio
|
||
import time
|
||
from typing import TYPE_CHECKING, Any
|
||
from uuid import UUID
|
||
|
||
import structlog
|
||
|
||
from src.core.config import settings
|
||
from src.core.redis_client import get_redis
|
||
from src.db.base import get_db_context
|
||
from src.models.approval import ApprovalRequest
|
||
from src.plugins.mcp.gateway import GatewayContext, McpGateway, McpGatewayError
|
||
from src.plugins.mcp.interfaces import MCPToolResult
|
||
from src.services.approval_db import get_approval_service, get_timeline_service
|
||
from src.services.executor import ExecutionResult, OperationType, get_executor
|
||
from src.services.operation_parser import parse_operation_from_action
|
||
|
||
if TYPE_CHECKING:
|
||
from src.services.notifications import ExecutionStatus
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
# ADR-090 § automated-action feedback (2026-04-19 ogt + Claude Opus 4.7, APAC):
# PostExecutionVerifier is awaited instead of being fired and forgotten, so that
# verification_result is always written to incident_evidence.
# The 60s ceiling covers verifier warmup (10s) + collect (30s) + 20s of slack.
_VERIFIER_AWAIT_TIMEOUT_SEC: float = 60.0

# T9: approved SSH execution must go through AwoooP MCP Gateway.
# ApprovalRequest itself is the human/multi-sig decision artifact; for write/admin
# tools we project it into the short-lived Gate 5 Redis key expected by Gateway.
_SSH_GATEWAY_AGENT_ID: str = "approval_executor"
_SSH_GATEWAY_PROJECT_ID: str = "awoooi"
_SSH_GATEWAY_APPROVAL_TTL_SECONDS: int = 600

# SSH MCP tool name -> scope passed to the gateway as ``required_scope``.
# Tools missing from this table are treated as "read" by the lookup in
# ``_execute_ssh_tool_via_gateway``.
_SSH_GATEWAY_TOOL_SCOPES: dict[str, str] = {
    # read-only diagnostics
    "ssh_diagnose": "read",
    # state-changing operations
    "ssh_docker_restart": "write",
    "ssh_docker_compose_restart": "write",
    "ssh_systemctl_restart": "write",
    "ssh_clear_docker_logs": "write",
    "ssh_renew_ssl": "write",
    "ssh_reload_nginx": "write",
    # destructive operations
    "ssh_docker_prune": "admin",
}
|
||
|
||
|
||
class ApprovalExecutionService:
|
||
"""
|
||
授權執行服務 - 編排整個執行流程
|
||
|
||
職責:
|
||
1. 解析操作類型
|
||
2. 呼叫 K8s Executor 執行(含重試)
|
||
3. 更新資料庫狀態
|
||
4. 記錄 Timeline 事件
|
||
5. 發送通知
|
||
"""
|
||
|
||
# ADR-076 Task 3: retry constants.
# MAX_RETRY counts retries after the first attempt (so up to MAX_RETRY + 1 attempts).
MAX_RETRY: int = 2
RETRY_DELAY_SECONDS: int = 30

# Transient-error keywords (compared against the lower-cased message).
# A message containing any of these is retryable, unless it also contains a
# permanent-error keyword.
_TRANSIENT_ERROR_KEYWORDS: tuple[str, ...] = (
    "connection refused",
    "connection reset",
    "timeout",
    "timed out",
    "i/o error",
    "io error",
    "temporary failure",
    "service unavailable",
    "too many requests",
    "dial tcp",
    "eof",
)

# Permanent-error keywords (compared against the lower-cased message).
# A message containing any of these is never retried.
_PERMANENT_ERROR_KEYWORDS: tuple[str, ...] = (
    "not found",
    "forbidden",
    "permission denied",
    "unauthorized",
    "already exists",
    "invalid",
    "immutable",
    "destructive",
    "blocked",
)
|
||
|
||
@classmethod
|
||
def _is_transient_error(cls, error_message: str | None) -> bool:
|
||
"""
|
||
判斷執行錯誤是否為瞬態(可重試)
|
||
|
||
優先檢查永久性錯誤(比瞬態錯誤有更高的優先順序),
|
||
避免 "connection refused (not found)" 這類混合訊息誤判。
|
||
|
||
Args:
|
||
error_message: 執行錯誤訊息
|
||
|
||
Returns:
|
||
True 表示可重試,False 表示永久失敗
|
||
"""
|
||
if not error_message:
|
||
return False
|
||
lower = error_message.lower()
|
||
# 永久性錯誤 → 不重試
|
||
if any(kw in lower for kw in cls._PERMANENT_ERROR_KEYWORDS):
|
||
return False
|
||
# 瞬態錯誤 → 可重試
|
||
return any(kw in lower for kw in cls._TRANSIENT_ERROR_KEYWORDS)
|
||
|
||
async def execute_approved_action(self, approval: ApprovalRequest) -> bool:
    """Run an approved action in the background and persist its outcome.

    Per the original notes this coroutine is scheduled by BackgroundTasks so the
    API response is not blocked.

    Flow, as implemented below:
      1. Open an AOL record (``_log_aol_started``) and parse ``approval.action``.
      2. Best-effort resolution of the parsed resource name; a failure here
         never blocks execution.
      3. If parsing yielded no operation type or resource name: NO_ACTION /
         observe / investigate style actions are recorded as success, anything
         else is recorded as a parse failure.
      4. Otherwise dispatch to the SSH host path, the read-only INVESTIGATE
         path, or the K8s executor (retrying transient errors, ADR-076).
      5. Persist the execution status, add a timeline event, send
         notifications, trigger learning and post-execution verification, and
         close the AOL record.

    History (from the original notes):
      - Phase 5: database status is updated after execution.
      - Phase 6: notifications are sent after execution (post-execution hook).
      - 2026-04-17 (ogt + Claude Sonnet 4.6): the method returns a bool because
        returning None left the auto-execute path in decision_manager.py unable
        to see the result, so it always broadcast success.

    Args:
        approval: The approved request to execute. ``approval.action`` may be
            None; it is treated as an empty string where text operations are
            applied to it.

    Returns:
        True when execution succeeded (NO_ACTION counts as success); False when
        execution failed or the action could not be parsed.
    """
    from src.services.notifications import ExecutionStatus

    logger.info(
        "background_execution_start",
        approval_id=str(approval.id),
        action=approval.action,
    )

    # ADR-090 § automated-action feedback (2026-04-19): leave an AOL trace as
    # soon as the main flow starts and update it when the flow ends, instead of
    # relying on fire-and-forget tasks, so every approval stays observable.
    _aol_op_id = await self._log_aol_started(approval)
    # NOTE: holds epoch *seconds* despite the name; converted to ms when reported.
    _aol_started_ms = time.time()

    service = get_approval_service()
    timeline = get_timeline_service()

    # Parse operation details
    parsed = parse_operation_from_action(approval.action)
    operation_type = parsed.operation_type
    resource_name = parsed.resource_name
    namespace = parsed.namespace

    # 2026-04-27 P3.1-T1: after parsing, check whether the resource is known to
    # the resolver and adopt its normalized name. Exceptions never interrupt the
    # main flow; a miss/suggestion only logs a warning and bumps a metric.
    if resource_name is not None and operation_type is not None:
        try:
            from src.services.resource_resolver import get_resource_resolver
            from src.core.metrics import RESOURCE_RESOLVE_TOTAL

            _resolver = get_resource_resolver()
            _resolve = await _resolver.resolve(
                raw_resource=resource_name,
                namespace=namespace,
                resource_kind="deployment",
            )
            if _resolve.success and _resolve.resource_name:
                if _resolve.resource_name != resource_name:
                    logger.info(
                        "resource_name_normalized",
                        original=resource_name,
                        normalized=_resolve.resource_name,
                        namespace=namespace,
                    )
                    resource_name = _resolve.resource_name
                RESOURCE_RESOLVE_TOTAL.labels(result="hit").inc()
            elif _resolve.candidates:
                logger.warning(
                    "resource_not_found_in_k8s",
                    resource=resource_name,
                    namespace=namespace,
                    suggestions=_resolve.candidates,
                )
                RESOURCE_RESOLVE_TOTAL.labels(result="suggestion").inc()
            else:
                logger.warning(
                    "resource_not_found_in_k8s",
                    resource=resource_name,
                    namespace=namespace,
                    suggestions=[],
                )
                RESOURCE_RESOLVE_TOTAL.labels(result="miss").inc()
        except Exception as _rr_e:
            logger.warning("resource_resolve_failed", error=str(_rr_e))
            try:
                from src.core.metrics import RESOURCE_RESOLVE_TOTAL

                RESOURCE_RESOLVE_TOTAL.labels(result="error").inc()
            except Exception:
                # Deliberately best-effort: a metrics failure must not affect execution.
                pass

    if operation_type is None or resource_name is None:
        # None-safe view of the action text. The original code guarded only the
        # ``.upper()`` call and then sliced / searched ``approval.action``
        # directly, which raises TypeError when the action is None.
        action_text = approval.action or ""

        # 2026-04-19: distinguish NO_ACTION from a genuine parse failure.
        # NO_ACTION is a deliberate "investigate only, change nothing" choice and
        # must not be recorded as EXECUTION_FAILED (that would pollute the
        # auto_execute success-rate KPI, MASTER §7.1 #11).
        _action_upper = action_text.upper()
        _is_no_action = (
            "NO_ACTION" in _action_upper
            or "NO-ACTION" in _action_upper
            or "NOACTION" in _action_upper
            or "(未設)" in action_text
            or _action_upper.startswith("OBSERVE")
            or _action_upper.startswith("INVESTIGATE")
        )

        if _is_no_action:
            logger.info(
                "background_execution_noop",
                approval_id=str(approval.id),
                action=approval.action,
                reason="NO_ACTION - 純調查/觀察類,不執行破壞動作",
                path="no_action",
            )
            # Recorded as success: observing/investigating is itself the completed work.
            await service.update_execution_status(approval.id, success=True)
            await timeline.add_event(
                event_type="exec",
                status="success",
                title="✅ 純觀察類動作完成 (NO_ACTION)",
                description=f"Action: {action_text[:120]}",
                actor="leWOOOgo",
                actor_role="executor",
                approval_id=str(approval.id),
                incident_id=approval.incident_id,
            )
            # Reply to the original alert card with the result (fire-and-forget).
            asyncio.create_task(
                self._push_execution_result_to_alert(
                    approval, success=True, error=None,
                )
            )
            # ADR-090 § AOL completed (NO_ACTION counts as success).
            await self._log_aol_completed(
                op_id=_aol_op_id,
                status="success",
                duration_ms=int((time.time() - _aol_started_ms) * 1000),
                output={"reason": "NO_ACTION", "action": action_text[:200]},
            )
            # F2 (2026-05-07): the NO_ACTION path must move the incident to
            # RESOLVED, otherwise it stays stuck in INVESTIGATING. Per the
            # original note, resolve_incident is idempotent for incidents that
            # are already RESOLVED, so repeating it does not re-trigger a postmortem.
            if approval.incident_id:
                try:
                    from src.services.incident_service import get_incident_service

                    await get_incident_service().resolve_incident(approval.incident_id)
                    logger.info(
                        "incident_resolved_after_no_action_execution",
                        incident_id=approval.incident_id,
                        approval_id=str(approval.id),
                        path="no_action",
                    )
                except Exception as _resolve_e:
                    logger.warning(
                        "incident_resolve_after_no_action_execution_failed",
                        incident_id=approval.incident_id,
                        approval_id=str(approval.id),
                        error=str(_resolve_e),
                    )
            return True  # NO_ACTION counts as successful completion

        # Genuine parse failure (not NO_ACTION).
        logger.warning(
            "background_execution_skip",
            approval_id=str(approval.id),
            reason="Could not parse operation type from action",
            action=approval.action,
        )
        # Phase 5: update the database status, including an error_message (P0.2).
        await service.update_execution_status(
            approval.id, success=False,
            error_message=f"Could not parse operation type from action: {action_text[:150]}",
        )
        await timeline.add_event(
            event_type="exec",
            status="error",
            title="執行失敗: 無法解析操作類型",
            description=f"Action: {approval.action}",
            actor="leWOOOgo",
            actor_role="executor",
            approval_id=str(approval.id),
            incident_id=approval.incident_id,
        )

        # Phase 6: send the failure notification (fire-and-forget).
        asyncio.create_task(
            self._send_execution_notification(
                approval=approval,
                execution_status=ExecutionStatus.FAILED,
                operation_type="unknown",
                namespace=namespace,
                error_message="Could not parse operation type",
            )
        )
        # ADR-090 § AOL completed (parse failure).
        await self._log_aol_completed(
            op_id=_aol_op_id,
            status="failed",
            duration_ms=int((time.time() - _aol_started_ms) * 1000),
            error=f"parse_fail: {action_text[:300]}",
        )
        return False  # parse failure: nothing was executed

    executor = get_executor()
    # Attempt counter; the SSH and INVESTIGATE paths never retry, so it stays 1 there.
    attempt = 1

    # 2026-05-02: host SSH branch. SSH_HOST operations go through the SSH MCP
    # path instead of the K8s executor.
    if operation_type == OperationType.SSH_HOST:
        result = await self._execute_ssh_host_action(
            approval=approval,
            host=resource_name or "",
        )
        logger.info(
            "background_execution_ssh_host",
            approval_id=str(approval.id),
            action=approval.action,
            host=resource_name,
            success=result.success,
            message=result.message,
        )
    elif operation_type == OperationType.INVESTIGATE:
        # 2026-04-24 Gate 11 fix: INVESTIGATE is a read-only query. Per the
        # original note it is not handled by executor.execute_with_audit, so the
        # raw command is run via execute_kubectl_command. Read-only commands are
        # not retried.
        result = await executor.execute_kubectl_command(
            command=approval.action,
            timeout_sec=30,
        )
        logger.info(
            "background_execution_investigate",
            approval_id=str(approval.id),
            action=approval.action,
            success=result.success,
            message=result.message,
        )
    else:
        # ADR-076 Task 3: retry on failure. Transient errors (connection
        # refused, timeout, ...) are retried up to MAX_RETRY times.
        result = await executor.execute_with_audit(
            approval=approval,
            operation_type=operation_type,
            resource_name=resource_name,
            namespace=namespace,
        )

        attempt = 1
        while not result.success and attempt <= self.MAX_RETRY:
            if not self._is_transient_error(result.error):
                logger.info(
                    "execution_retry_skipped_permanent_error",
                    approval_id=str(approval.id),
                    attempt=attempt,
                    error=result.error,
                )
                break

            logger.warning(
                "execution_retry_transient_error",
                approval_id=str(approval.id),
                attempt=attempt,
                max_retry=self.MAX_RETRY,
                error=result.error,
                delay_seconds=self.RETRY_DELAY_SECONDS,
            )
            await timeline.add_event(
                event_type="exec",
                status="warning",
                title=f"⚠️ 執行失敗,{self.RETRY_DELAY_SECONDS}s 後重試 ({attempt}/{self.MAX_RETRY})",
                description=f"Error: {result.error}",
                actor="leWOOOgo",
                actor_role="executor",
                approval_id=str(approval.id),
                incident_id=approval.incident_id,
            )
            await asyncio.sleep(self.RETRY_DELAY_SECONDS)
            result = await executor.execute_with_audit(
                approval=approval,
                operation_type=operation_type,
                resource_name=resource_name,
                namespace=namespace,
            )
            attempt += 1

    # Phase 5: update the database status.
    # 2026-04-18 ADR-090 L5 P0.2: on failure pass the error_message along
    # (per the original note it ends up in rejection_reason).
    await service.update_execution_status(
        approval.id,
        success=result.success,
        error_message=None if result.success else (result.error or "(executor 未回傳錯誤)"),
    )

    # After the retry loop ``attempt`` equals the number of attempts made.
    total_attempts = attempt
    if result.success:
        logger.info(
            "background_execution_success",
            approval_id=str(approval.id),
            operation=operation_type.value,
            target=resource_name,
            namespace=namespace,
            duration_ms=result.duration_ms,
            total_attempts=total_attempts,
        )
        retry_note = f" (重試 {total_attempts - 1} 次後成功)" if total_attempts > 1 else ""
        await timeline.add_event(
            event_type="exec",
            status="success",
            title=f"✅ K8s 執行成功: {operation_type.value}{retry_note}",
            description=f"Target: {resource_name} @ {namespace} ({result.duration_ms}ms)",
            actor="leWOOOgo",
            actor_role="executor",
            approval_id=str(approval.id),
            incident_id=approval.incident_id,
        )

        # Phase 6: send the success notification (fire-and-forget).
        asyncio.create_task(
            self._send_execution_notification(
                approval=approval,
                execution_status=ExecutionStatus.SUCCESS,
                operation_type=operation_type.value,
                namespace=namespace,
                duration_ms=result.duration_ms,
            )
        )

        # 2026-04-14: reply to the original alert card with the result. The
        # callee skips auto-approved requests.
        asyncio.create_task(
            self._push_execution_result_to_alert(approval, success=True, error=None)
        )

        # Phase 7.6: trigger automatic Playbook extraction (fire-and-forget).
        asyncio.create_task(
            self._trigger_playbook_extraction(approval)
        )

        # ADR-030 Phase 5 / ADR-083 Phase 3: trigger the learning service.
        # Awaited with a 30s cap instead of fire-and-forget; on timeout only a
        # warning is logged and the main flow continues.
        try:
            await asyncio.wait_for(
                self._trigger_learning(
                    approval=approval,
                    success=True,
                    duration_seconds=result.duration_ms / 1000 if result.duration_ms else 0,
                ),
                timeout=30.0,
            )
        except asyncio.TimeoutError:
            logger.warning(
                "learning_trigger_timeout",
                approval_id=str(approval.id),
                timeout_sec=30.0,
            )

        # ADR-081 Phase 1 + ADR-090 fix (2026-04-19): PostExecutionVerifier is
        # awaited with a timeout so verification_result gets written. Per the
        # original note, the earlier fire-and-forget task was killed on Pod
        # recycle and left evidence rows NULL.
        from src.core.feature_flags import aiops_flags

        if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"):
            try:
                await asyncio.wait_for(
                    self._run_post_execution_verify(
                        approval=approval,
                        action_taken=f"{operation_type.value}:{resource_name}",
                    ),
                    timeout=_VERIFIER_AWAIT_TIMEOUT_SEC,
                )
            except asyncio.TimeoutError:
                logger.warning(
                    "post_verify_timeout_exceeded",
                    approval_id=str(approval.id),
                    timeout_sec=_VERIFIER_AWAIT_TIMEOUT_SEC,
                )

        # 2026-04-07 Sprint 4 B3: record the disposition type (human approved).
        try:
            anomaly_key = await self._get_anomaly_key_from_approval(approval)
            if anomaly_key:
                from src.services.anomaly_counter import get_anomaly_counter

                counter = get_anomaly_counter()
                await counter.record_disposition(anomaly_key, "human_approved")
        except Exception as _disp_e:
            logger.warning("disposition_record_failed", error=str(_disp_e))

        # ADR-073 patch: on success resolve the incident. Per the original note
        # this is what triggers KM conversion, which never happened before
        # because RESOLVED was never reached.
        if approval.incident_id:
            try:
                from src.services.incident_service import get_incident_service

                _inc_svc = get_incident_service()
                await _inc_svc.resolve_incident(approval.incident_id)
                logger.info(
                    "incident_resolved_after_execution",
                    incident_id=approval.incident_id,
                    approval_id=str(approval.id),
                )
            except Exception as _resolve_e:
                logger.warning("incident_resolve_after_execution_failed", error=str(_resolve_e))

        # ADR-090 § AOL completed (execution succeeded).
        await self._log_aol_completed(
            op_id=_aol_op_id,
            status="success",
            duration_ms=int((time.time() - _aol_started_ms) * 1000),
            output={
                "operation_type": operation_type.value,
                "resource_name": resource_name,
                "namespace": namespace,
                "executor_duration_ms": result.duration_ms,
                "total_attempts": total_attempts,
            },
        )
        return True  # execution succeeded

    else:
        logger.error(
            "background_execution_failed",
            approval_id=str(approval.id),
            operation=operation_type.value,
            target=resource_name,
            namespace=namespace,
            error=result.error,
        )
        await timeline.add_event(
            event_type="exec",
            status="error",
            title=f"❌ K8s 執行失敗: {operation_type.value}",
            description=f"Error: {result.error}",
            actor="leWOOOgo",
            actor_role="executor",
            approval_id=str(approval.id),
            incident_id=approval.incident_id,
        )

        # Phase 6: send the failure notification (fire-and-forget). An error
        # containing "not found" is reported as a dry-run block.
        # NOTE(review): this check is case-sensitive, unlike _is_transient_error.
        exec_status = (
            ExecutionStatus.DRY_RUN_BLOCKED
            if "not found" in (result.error or "")
            else ExecutionStatus.FAILED
        )
        asyncio.create_task(
            self._send_execution_notification(
                approval=approval,
                execution_status=exec_status,
                operation_type=operation_type.value,
                namespace=namespace,
                error_message=result.error,
                duration_ms=result.duration_ms,
            )
        )

        # 2026-04-14: reply to the original alert card with the failure.
        asyncio.create_task(
            self._push_execution_result_to_alert(
                approval, success=False, error=result.error
            )
        )

        # ADR-030 Phase 5 / ADR-083 Phase 3: trigger the learning service for
        # the failure case, awaited with a 30s cap.
        try:
            await asyncio.wait_for(
                self._trigger_learning(
                    approval=approval,
                    success=False,
                    error_message=result.error,
                    duration_seconds=result.duration_ms / 1000 if result.duration_ms else 0,
                ),
                timeout=30.0,
            )
        except asyncio.TimeoutError:
            logger.warning(
                "learning_trigger_timeout",
                approval_id=str(approval.id),
                timeout_sec=30.0,
            )

        # ADR-090 fix (2026-04-19): run the verifier on failure as well, with
        # the action tagged ":FAILED", awaited with the same timeout.
        from src.core.feature_flags import aiops_flags

        if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"):
            try:
                await asyncio.wait_for(
                    self._run_post_execution_verify(
                        approval=approval,
                        action_taken=f"{operation_type.value}:{resource_name}:FAILED",
                    ),
                    timeout=_VERIFIER_AWAIT_TIMEOUT_SEC,
                )
            except asyncio.TimeoutError:
                logger.warning(
                    "post_verify_timeout_exceeded_failed_path",
                    approval_id=str(approval.id),
                    timeout_sec=_VERIFIER_AWAIT_TIMEOUT_SEC,
                )

        # ADR-090 § AOL completed (execution failed).
        await self._log_aol_completed(
            op_id=_aol_op_id,
            status="failed",
            duration_ms=int((time.time() - _aol_started_ms) * 1000),
            output={
                "operation_type": operation_type.value,
                "resource_name": resource_name,
                "namespace": namespace,
                "executor_duration_ms": result.duration_ms,
                "total_attempts": total_attempts,
            },
            error=result.error,
            stderr=result.error,  # E6: feed stderr back for retry / Playbook negative reinforcement
        )
        return False  # execution failed
|
||
|
||
async def _execute_ssh_host_action(
    self,
    approval: ApprovalRequest,
    host: str,
) -> ExecutionResult:
    """Execute an approved SSH host action through the SSH MCP gateway path.

    2026-05-02 (ogt + Claude Sonnet 4.6): added so that manually approved SSH
    actions are routed to an SSH MCP tool instead of the K8s executor. Per the
    original note the routing mirrors ``decision_manager._ssh_execute``.

    Routing of ``approval.action`` (keywords are matched case-insensitively):
      - contains "docker" and "prune"   -> ssh_docker_prune
      - "docker restart <name>"         -> ssh_docker_restart
      - "systemctl restart <service>"   -> ssh_systemctl_restart
      - "ps aux" / "df -h" / "free -h" / "top " / "uptime" / "echo " / "ls -"
                                        -> ssh_diagnose
      - anything else                   -> failure result ("SSH action unrouted")

    Two routing defects of the earlier implementation are fixed here:
      * The container / service name was taken from the lower-cased action,
        which corrupts case-sensitive names. It is now extracted from the
        original text.
      * Diagnostic keywords were matched as raw substrings, so e.g.
        "docker stop x" (which contains "top ") was routed to ssh_diagnose and
        reported as success. A keyword must now start at a token boundary.

    Args:
        approval: The approved request; ``approval.action`` holds the command text.
        host: Target host, passed to the tool as ``params["host"]``.

    Returns:
        An ExecutionResult with ``operation_type=SSH_HOST`` and
        ``namespace="host"``. Exceptions raised by the gateway call are caught
        and converted into a failed result.
    """
    import re as _re

    start = time.time()
    action = approval.action or ""
    action_lower = action.lower().strip()

    # Choose the SSH MCP tool and its parameters.
    params: dict = {"host": host}
    tool_name: str | None = None

    if "docker" in action_lower and "prune" in action_lower:
        tool_name = "ssh_docker_prune"
        params["trust_score"] = 0.85
    elif "docker restart" in action_lower:
        # Match against the original text so the container name keeps its case.
        m = _re.search(r"docker\s+restart\s+([a-z0-9._-]+)", action, _re.IGNORECASE)
        if m:
            tool_name = "ssh_docker_restart"
            params["container_name"] = m.group(1)
            params["trust_score"] = 0.85
        # No container name found: leave tool_name as None (unrouted).
    elif "systemctl restart" in action_lower:
        # Match against the original text so the unit name keeps its case.
        m = _re.search(r"systemctl\s+restart\s+([a-z0-9._-]+)", action, _re.IGNORECASE)
        if m:
            tool_name = "ssh_systemctl_restart"
            params["service"] = m.group(1)
            params["trust_score"] = 0.85
        # No service name found: leave tool_name as None (unrouted).
    elif _re.search(
        r"(?<![\w-])(?:ps aux|df -h|free -h|top |uptime|echo |ls -)",
        action_lower,
    ):
        # Host diagnostics: all of these map to the one-shot ssh_diagnose tool.
        # The look-behind stops keywords matching inside longer words
        # ("stop " -> "top ", "tools -x" -> "ls -").
        tool_name = "ssh_diagnose"

    if tool_name is None:
        duration_ms = int((time.time() - start) * 1000)
        err = f"SSH action 無法路由到 SSH MCP tool: {action[:120]}"
        logger.warning(
            "ssh_host_action_unrouted",
            approval_id=str(approval.id),
            action=action,
            host=host,
        )
        return ExecutionResult(
            success=False,
            message="SSH action unrouted",
            operation_type=OperationType.SSH_HOST,
            target_resource=host,
            namespace="host",
            duration_ms=duration_ms,
            error=err,
        )

    try:
        # Logged at warning level (as in the original) so the gateway execution
        # path is visible in the logs.
        logger.warning(
            "mcp_gateway_approved_ssh_execution_path",
            approval_id=str(approval.id),
            incident_id=approval.incident_id,
            tool=tool_name,
            host=host,
            agent_id=_SSH_GATEWAY_AGENT_ID,
        )
        mcp_result = await self._execute_ssh_tool_via_gateway(
            approval=approval,
            tool_name=tool_name,
            params=params,
        )
        duration_ms = int((time.time() - start) * 1000)
        success = bool(mcp_result.success)
        return ExecutionResult(
            success=success,
            message=f"ssh_mcp:{tool_name} {'ok' if success else 'failed'}",
            operation_type=OperationType.SSH_HOST,
            target_resource=host,
            namespace="host",
            duration_ms=duration_ms,
            k8s_response={"tool": tool_name, "output": mcp_result.output} if success else None,
            error=None if success else (mcp_result.error or "ssh_mcp execution failed"),
        )
    except Exception as e:
        # Any failure in the gateway path becomes a failed result, not a crash.
        duration_ms = int((time.time() - start) * 1000)
        logger.warning(
            "ssh_host_action_exception",
            approval_id=str(approval.id),
            tool=tool_name,
            error=str(e),
        )
        return ExecutionResult(
            success=False,
            message="ssh_mcp exception",
            operation_type=OperationType.SSH_HOST,
            target_resource=host,
            namespace="host",
            duration_ms=duration_ms,
            error=str(e),
        )
|
||
|
||
async def _execute_ssh_tool_via_gateway(
    self,
    approval: ApprovalRequest,
    tool_name: str,
    params: dict[str, Any],
) -> MCPToolResult:
    """Call an SSH MCP tool through McpGateway on behalf of an approved request.

    For tools whose scope is not "read", an "approved" marker is first written
    to Redis under ``mcp_approval:<project>:<agent>:<tool>:<run_id>`` with a TTL
    of ``_SSH_GATEWAY_APPROVAL_TTL_SECONDS``. A failure to write that key is only
    logged; the gateway call is still attempted.

    Args:
        approval: The approved request; its id is used as the gateway run id.
        tool_name: SSH MCP tool to invoke. Tools missing from
            ``_SSH_GATEWAY_TOOL_SCOPES`` are treated as "read".
        params: Tool parameters. An ``_mcp_audit`` entry is added to a copy; the
            caller's dict is not modified.

    Returns:
        The gateway's MCPToolResult. When the gateway raises McpGatewayError, a
        failed MCPToolResult carrying the error code is returned instead.
    """
    scope = _SSH_GATEWAY_TOOL_SCOPES.get(tool_name, "read")
    if isinstance(approval.id, UUID):
        run_id = approval.id
    else:
        run_id = UUID(str(approval.id))

    if scope != "read":
        # Project the human approval into the short-lived key the gateway checks.
        gate_key = ":".join(
            (
                "mcp_approval",
                _SSH_GATEWAY_PROJECT_ID,
                _SSH_GATEWAY_AGENT_ID,
                tool_name,
                str(run_id),
            )
        )
        try:
            redis_client = get_redis()
            await redis_client.set(
                gate_key,
                "approved",
                ex=_SSH_GATEWAY_APPROVAL_TTL_SECONDS,
            )
        except Exception as projection_err:
            logger.warning(
                "mcp_gateway_approval_projection_failed",
                approval_id=str(approval.id),
                tool=tool_name,
                approval_key=gate_key,
                error=str(projection_err),
            )

    # Audit metadata travels with the tool parameters.
    audit_meta = {
        "session_id": f"approval:{approval.id}",
        "incident_id": approval.incident_id,
        "agent_role": _SSH_GATEWAY_AGENT_ID,
        "flywheel_node": "execute",
        "approval_id": str(approval.id),
    }
    params_with_audit = {**params, "_mcp_audit": audit_meta}

    async with get_db_context(_SSH_GATEWAY_PROJECT_ID) as db:
        gateway_ctx = GatewayContext(
            project_id=_SSH_GATEWAY_PROJECT_ID,
            agent_id=_SSH_GATEWAY_AGENT_ID,
            tool_name=tool_name,
            run_id=run_id,
            trace_id=approval.incident_id or str(approval.id),
            is_shadow=False,
            environment={"env": "prod"},
            required_scope=scope,
        )
        try:
            return await McpGateway(db).call(gateway_ctx, params_with_audit)
        except McpGatewayError as gate_err:
            # A gate rejected the call: report it as a failed tool result.
            logger.warning(
                "mcp_gateway_approved_ssh_blocked",
                approval_id=str(approval.id),
                incident_id=approval.incident_id,
                tool=tool_name,
                gate=gate_err.gate,
                error_code=gate_err.error_code,
                error=str(gate_err),
            )
            return MCPToolResult(
                success=False,
                execution_id=f"blocked:{tool_name}:{run_id}",
                error=f"{gate_err.error_code}: {gate_err}",
            )
|
||
|
||
async def _push_execution_result_to_alert(
    self,
    approval: ApprovalRequest,
    success: bool,
    error: str | None,
) -> None:
    """Reply with the execution result under the original Telegram alert card.

    2026-04-14 (Claude Sonnet 4.6):
      - Manual path: after a human approves, the result is posted as a reply to
        the original alert message.
      - Auto-approved requests are skipped here; per the original note they are
        handled by ``_push_auto_repair_result``.

    The original message id is read from Redis key ``tg_msg:{incident_id}``;
    when it is missing or not an integer nothing is sent. Every failure is
    logged and swallowed: this method never raises.

    The message is sent with ``parse_mode="HTML"``, so the action and error
    text are HTML-escaped before interpolation. Unescaped ``<``, ``>`` or ``&``
    would make Telegram reject the message.

    Args:
        approval: The executed approval request.
        success: Whether execution succeeded.
        error: Error text for the failure message; a default is used when None.
    """
    try:
        # Skip the auto-execute path to avoid a duplicate message.
        if self._is_auto_approved_request(approval):
            return

        if not approval.incident_id:
            return

        from src.core.redis_client import get_redis

        redis = get_redis()
        msg_id_raw = await redis.get(f"tg_msg:{approval.incident_id}")
        if not msg_id_raw:
            logger.debug(
                "push_execution_result_no_msg_id",
                incident_id=approval.incident_id,
                approval_id=str(approval.id),
            )
            return

        try:
            orig_msg_id = int(msg_id_raw)
        except (TypeError, ValueError):
            return

        from html import escape as _html_escape

        from src.core.config import get_settings
        from src.services.telegram_gateway import get_telegram_gateway

        tg_settings = get_settings()
        gateway = get_telegram_gateway()
        target_chat_id = tg_settings.SRE_GROUP_CHAT_ID or tg_settings.OPENCLAW_TG_CHAT_ID

        # 2026-04-19 AP-2 fix: besides replying, clear the inline keyboard on
        # the original card so it does not stay in the "executing" state.
        try:
            await gateway._send_request("editMessageReplyMarkup", {
                "chat_id": target_chat_id,
                "message_id": orig_msg_id,
                "reply_markup": {"inline_keyboard": []},
            })
        except Exception as _edit_e:
            logger.debug(
                "push_execution_edit_buttons_failed",
                approval_id=str(approval.id),
                error=str(_edit_e),
            )

        # Append recent KM / Playbook activity.
        # NOTE(review): these counts cover every row touched in the last two
        # minutes and are not filtered by incident — confirm that is intended.
        km_info = ""
        try:
            from sqlalchemy import text as _sql
            from src.db.base import get_db_context

            async with get_db_context() as _db:
                _km_row = await _db.execute(
                    _sql("""SELECT COUNT(*) FROM knowledge_entries
                            WHERE created_at > NOW() - interval '2 minutes'"""),
                )
                _km_count = _km_row.scalar() or 0
                _pb_row = await _db.execute(
                    _sql("""SELECT COUNT(*) FROM playbooks
                            WHERE updated_at > NOW() - interval '2 minutes'"""),
                )
                _pb_count = _pb_row.scalar() or 0
                if _km_count or _pb_count:
                    km_info = f"\n📚 KM +{_km_count} 🎯 Playbook 更新×{_pb_count}"
        except Exception as _km_e:
            # Best-effort decoration only; record why it was skipped.
            logger.debug(
                "push_execution_km_info_failed",
                approval_id=str(approval.id),
                error=str(_km_e),
            )

        # Truncate first, then escape, so an entity is never cut in half.
        action_html = _html_escape((approval.action or "")[:180], quote=False)

        if success:
            text = (
                f"✅ <b>執行成功</b>\n"
                f"<code>{action_html}</code>"
                f"{km_info}"
            )
        else:
            err_short = _html_escape((error or "未知錯誤")[:150], quote=False)
            text = (
                f"❌ <b>執行失敗</b>\n"
                f"<code>{action_html}</code>\n"
                f"原因: {err_short}"
                f"{km_info}"
            )

        await gateway._send_request(
            "sendMessage",
            {
                "chat_id": target_chat_id,
                "text": text,
                "parse_mode": "HTML",
                "reply_to_message_id": orig_msg_id,
            },
        )
        logger.info(
            "push_execution_result_sent",
            incident_id=approval.incident_id,
            approval_id=str(approval.id),
            success=success,
            orig_msg_id=orig_msg_id,
        )
    except Exception as e:
        logger.warning(
            "push_execution_result_failed",
            approval_id=str(approval.id),
            error=str(e),
        )
|
||
|
||
async def _get_anomaly_key_from_approval(self, approval: ApprovalRequest) -> str | None:
    """Derive the anomaly key for an approval via its incident.

    Path: approval -> incident (looked up in working memory) -> anomaly key.
    2026-04-07 (Claude Code, I1+S1 fix): key derivation is delegated to
    ``AnomalyCounter.derive_key_from_incident()``.

    Args:
        approval: Approval whose ``incident_id`` identifies the incident.

    Returns:
        The derived key, or None when the approval has no incident id, the
        incident is not found in working memory, or any step raises (the
        exception is logged as a warning).
    """
    try:
        incident_id = approval.incident_id
        if not incident_id:
            return None

        from src.services.incident_service import get_incident_service

        found_incident = await get_incident_service().get_from_working_memory(incident_id)
        if not found_incident:
            return None

        from src.services.anomaly_counter import AnomalyCounter

        return AnomalyCounter.derive_key_from_incident(found_incident)
    except Exception as lookup_err:
        logger.warning("get_anomaly_key_from_approval_failed", error=str(lookup_err))
        return None
|
||
|
||
async def _trigger_learning(
    self,
    approval: ApprovalRequest,
    success: bool,
    duration_seconds: float = 0,
    error_message: str | None = None,
) -> None:
    """Feed an execution outcome to the learning service, then write it to KM.

    ADR-030 Phase 5. Per the original note, the learning service uses the
    outcome to adjust trust and Playbook statistics.

    Args:
        approval: The executed approval request.
        success: Whether execution succeeded.
        duration_seconds: Execution duration in seconds.
        error_message: Error text for failed executions, if any.

    A failure inside the learning service is logged and swallowed. The KM write
    that follows is not guarded here, so an exception from it propagates.
    """
    try:
        from src.services.learning_service import (
            ExecutionResult,
            get_learning_service,
        )

        learning_svc = get_learning_service()
        outcome = ExecutionResult(
            approval_id=str(approval.id),
            incident_id=approval.incident_id or "",
            action=approval.action,
            success=success,
            error_message=error_message,
            duration_seconds=duration_seconds,
        )
        await learning_svc.process_execution_result(approval=approval, result=outcome)
    except Exception as learn_err:
        # A learning failure must not affect the main flow.
        logger.warning(
            "learning_trigger_failed",
            approval_id=str(approval.id),
            error=str(learn_err),
        )

    # 2026-04-04 (ogt): the KM write sits outside the try/except on purpose, so
    # the result is persisted even when learning fails (house rule: every
    # anomaly and auto-repair record is written back to KM).
    # P1.5 fix 2026-04-24: awaited instead of fire-and-forget.
    # P1-1 2026-04-28: uses the public write_execution_result_to_km; per the
    # original note, timeout / retry / DLQ handling lives in km_writer.py.
    await self.write_execution_result_to_km(approval, success, error_message)
|
||
|
||
async def _run_post_execution_verify(
    self,
    approval: "ApprovalRequest",
    action_taken: str,
) -> None:
    """
    ADR-081 Phase 1: verify the environment after an action was executed.

    Steps:
        1. Load the Incident for ``approval.incident_id`` (working memory
           first, then episodic memory).
        2. Fetch the latest evidence snapshot for that incident.
        3. Call ``PostExecutionVerifier.verify()`` with incident, snapshot and
           the action label.
        4. Forward the verification result to the learning service
           (``record_verification_result``).

    Returns immediately when the approval has no incident_id. Every failure
    inside is logged as a warning and swallowed; a verification problem never
    changes the execution result.

    Args:
        approval: The executed approval.
        action_taken: Label describing the executed action.
    """
    if not approval.incident_id:
        return

    try:
        from src.services.incident_service import get_incident_service
        from src.services.post_execution_verifier import get_post_execution_verifier
        # get_latest_snapshot is a module-level async function, not a
        # classmethod of EvidenceSnapshot.
        from src.services.evidence_snapshot import get_latest_snapshot

        incidents = get_incident_service()
        # IncidentService has no get_incident(); look in working memory and
        # fall back to episodic memory.
        incident = await incidents.get_from_working_memory(approval.incident_id)
        if incident is None:
            incident = await incidents.get_from_episodic_memory(approval.incident_id)
        if incident is None:
            logger.warning(
                "post_verify_incident_not_found",
                approval_id=str(approval.id),
                incident_id=approval.incident_id,
            )
            return

        # Latest evidence snapshot for the incident; passed to the verifier
        # as-is.
        latest_snapshot = await get_latest_snapshot(approval.incident_id)

        post_verifier = get_post_execution_verifier()
        verification_result = await post_verifier.verify(
            incident=incident,
            snapshot=latest_snapshot,
            action_taken=action_taken,
        )

        logger.info(
            "post_verify_complete",
            approval_id=str(approval.id),
            incident_id=approval.incident_id,
            result=verification_result,
            action=action_taken,
        )

        # ADR-083 Phase 3: wire the verification result into the learning
        # service. Environment verification is a more precise learning signal
        # than the execution exit code.
        try:
            from src.services.learning_service import get_learning_service

            playbook_ref = getattr(approval, "matched_playbook_id", None)
            await get_learning_service().record_verification_result(
                incident_id=approval.incident_id,
                action_taken=action_taken,
                verification_result=verification_result,
                matched_playbook_id=playbook_ref,
            )
        except Exception as learning_err:
            logger.warning(
                "post_verify_learning_failed",
                approval_id=str(approval.id),
                error=str(learning_err),
            )

    except Exception as verify_err:
        # A verification failure does not affect the execution result.
        logger.warning(
            "post_verify_failed",
            approval_id=str(approval.id),
            error=str(verify_err),
        )
|
||
|
||
@staticmethod
|
||
def _is_auto_approved_request(approval: "ApprovalRequest") -> bool:
|
||
requested_by = (getattr(approval, "requested_by", "") or "").lower()
|
||
return requested_by.startswith("auto_approve")
|
||
|
||
@staticmethod
|
||
def _is_observation_only_action(action: str | None) -> bool:
|
||
action_upper = (action or "").strip().upper()
|
||
return (
|
||
not action_upper
|
||
or "NO_ACTION" in action_upper
|
||
or "NO-ACTION" in action_upper
|
||
or "NOACTION" in action_upper
|
||
or action_upper.startswith("OBSERVE")
|
||
or action_upper.startswith("INVESTIGATE")
|
||
)
|
||
|
||
@staticmethod
|
||
def _approval_risk_value(approval: "ApprovalRequest") -> str | None:
|
||
risk_level = getattr(approval, "risk_level", None)
|
||
if risk_level is None:
|
||
return None
|
||
return getattr(risk_level, "value", str(risk_level))
|
||
|
||
async def finalize_auto_approved_execution(
    self,
    approval: "ApprovalRequest",
    *,
    success: bool,
    error_message: str | None = None,
) -> None:
    """
    Backfill the incident-linked evidence chain for an auto-approved action
    that has already been executed.

    The CS2/CS3 webhook path calls execute_approved_action() first (for speed)
    and only creates the Incident afterwards. At execution time there is no
    incident_id, so verifier / KM / auto_repair_executions cannot be tied back
    to the same alert card. This method adds the durable trace once the
    incident exists; it never re-executes the action.

    Skipped (with a log line) when the approval is not auto-approved, has no
    incident_id, or its action is observation-only.

    Otherwise, each of the following steps is attempted independently and a
    failure in one is logged without stopping the others:
        1. Create an auto_repair_executions row unless an equivalent one
           (same playbook_id, triggered_by and action) already exists.
        2. Add an "exec" timeline event.
        3. Write the execution result to KM.
        4. If the AIOPS_P1_POST_EXECUTION_VERIFIER sub-flag is enabled, run
           the post-execution verifier bounded by _VERIFIER_AWAIT_TIMEOUT_SEC.
        5. On success, resolve the incident.

    Args:
        approval: The auto-approved approval, now carrying an incident_id.
        success: Whether the earlier execution succeeded.
        error_message: Failure detail; a generic message is substituted when
            the execution failed and none was given.
    """
    if not self._is_auto_approved_request(approval):
        return

    incident_id = getattr(approval, "incident_id", None)
    if not incident_id:
        logger.warning(
            "auto_approved_execution_finalize_skipped_no_incident",
            approval_id=str(getattr(approval, "id", "")),
            requested_by=getattr(approval, "requested_by", None),
        )
        return

    if self._is_observation_only_action(getattr(approval, "action", None)):
        logger.info(
            "auto_approved_execution_finalize_skipped_observation_only",
            approval_id=str(approval.id),
            incident_id=incident_id,
            action=(approval.action or "")[:120],
        )
        return

    # Parsing only supplies labels for the trace. Guard it (as
    # _log_aol_started does) so a parser error degrades to "unknown" labels
    # instead of aborting every backfill step below.
    operation_type = None
    resource_name = "unknown"
    namespace = "default"
    try:
        parsed = parse_operation_from_action(approval.action)
        operation_type = parsed.operation_type
        resource_name = parsed.resource_name or "unknown"
        namespace = parsed.namespace or "default"
    except Exception as exc:
        logger.warning(
            "auto_approved_execution_parse_failed",
            approval_id=str(approval.id),
            incident_id=incident_id,
            error=str(exc),
        )

    # Slices below cap the values at 36 / 200 / 50 characters before they are
    # handed to the repository.
    playbook_id = str(getattr(approval, "matched_playbook_id", None) or approval.id)[:36]
    operation_label = operation_type.value if operation_type else "unknown"
    playbook_name = f"approval_auto_execute:{operation_label}:{resource_name}"[:200]
    triggered_by = (getattr(approval, "requested_by", None) or "auto_approve")[:50]
    action_taken = f"auto_repair_playbook:{playbook_id}:{operation_label}:{resource_name}"
    if not success:
        action_taken = f"{action_taken}:FAILED"
        error_message = error_message or "auto-approved executor returned failure; see approval/aol logs"

    # Step 1: durable auto_repair_executions record (deduplicated).
    try:
        from src.repositories.audit_log_repository import get_auto_repair_execution_repository

        repo = get_auto_repair_execution_repository()
        existing = await repo.list_by_incident(incident_id)
        already_recorded = any(
            str(getattr(row, "playbook_id", "")) == playbook_id
            and getattr(row, "triggered_by", "") == triggered_by
            and (approval.action or "") in list(getattr(row, "executed_steps", []) or [])
            for row in existing
        )
        if not already_recorded:
            await repo.create(
                incident_id=incident_id,
                playbook_id=playbook_id,
                playbook_name=playbook_name,
                success=success,
                executed_steps=[approval.action],
                error_message=error_message,
                triggered_by=triggered_by,
                risk_level=self._approval_risk_value(approval),
            )
        else:
            logger.info(
                "auto_approved_execution_record_already_exists",
                approval_id=str(approval.id),
                incident_id=incident_id,
                playbook_id=playbook_id,
            )
    except Exception as exc:
        logger.warning(
            "auto_approved_execution_record_failed",
            approval_id=str(approval.id),
            incident_id=incident_id,
            error=str(exc),
        )

    # Step 2: timeline event.
    try:
        timeline = get_timeline_service()
        await timeline.add_event(
            event_type="exec",
            status="success" if success else "error",
            title=f"{'✅' if success else '❌'} 自動批准執行已補鏈: {operation_label}",
            description=(
                f"Target: {resource_name} @ {namespace}; "
                f"source={triggered_by}; action={approval.action[:160]}"
            ),
            actor="leWOOOgo",
            actor_role="executor",
            approval_id=str(approval.id),
            incident_id=incident_id,
        )
    except Exception as exc:
        logger.warning(
            "auto_approved_execution_timeline_failed",
            approval_id=str(approval.id),
            incident_id=incident_id,
            error=str(exc),
        )

    # Step 3: knowledge-base write.
    try:
        await self.write_execution_result_to_km(approval, success, error_message)
    except Exception as exc:
        logger.warning(
            "auto_approved_execution_km_failed",
            approval_id=str(approval.id),
            incident_id=incident_id,
            error=str(exc),
        )

    # Step 4: post-execution verification, awaited with an upper bound.
    from src.core.feature_flags import aiops_flags

    if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"):
        try:
            await asyncio.wait_for(
                self._run_post_execution_verify(
                    approval=approval,
                    action_taken=action_taken,
                ),
                timeout=_VERIFIER_AWAIT_TIMEOUT_SEC,
            )
        except asyncio.TimeoutError:
            logger.warning(
                "auto_approved_execution_post_verify_timeout",
                approval_id=str(approval.id),
                incident_id=incident_id,
                timeout_sec=_VERIFIER_AWAIT_TIMEOUT_SEC,
            )

    # Step 5: resolve the incident when the execution succeeded.
    if success:
        try:
            from src.services.incident_service import get_incident_service

            await get_incident_service().resolve_incident(incident_id)
            logger.info(
                "incident_resolved_after_auto_approved_execution_finalize",
                incident_id=incident_id,
                approval_id=str(approval.id),
            )
        except Exception as exc:
            logger.warning(
                "incident_resolve_after_auto_approved_execution_finalize_failed",
                incident_id=incident_id,
                approval_id=str(approval.id),
                error=str(exc),
            )
|
||
|
||
async def write_execution_result_to_km(
    self,
    approval: "ApprovalRequest",
    success: bool,
    error_message: str | None,
) -> None:
    """
    Persist an execution result to the knowledge base (KM).

    Both successful and failed executions are written. The entry
    distinguishes the auto-approve path from the human-approved path and, when
    the approval is linked to an incident, is enriched with alertname,
    alert_category and affected_services so it can be retrieved by RAG.
    The actual write is delegated to ``km_write_with_flag``.

    Incident enrichment is best-effort: a lookup failure is logged at debug
    level and the defaults ("unknown" / "general" / no services) are used.
    Exceptions from ``km_write_with_flag`` are not caught here.

    A missing ``approval.action`` is treated as an empty string, and the risk
    level is read through ``_approval_risk_value`` so both enum and plain
    string values are accepted.

    Args:
        approval: The executed approval.
        success: Whether the execution succeeded.
        error_message: Failure detail shown in the entry when not successful.
    """
    from src.models.knowledge import EntrySource, EntryType
    from src.services.km_writer import KMWritePayload, km_write_with_flag

    # Source identification: automatic vs. human-approved execution.
    _is_auto = self._is_auto_approved_request(approval)
    _mode_prefix = "[自動修復]" if _is_auto else "[人工修復]"
    _mode_tag = "auto_executed" if _is_auto else "human_approved"

    status_icon = "✅" if success else "❌"
    status_text = "成功" if success else f"失敗: {error_message or '未知原因'}"
    _status_tag = "success" if success else "failure"

    # Null-safe views of the approval fields rendered below.
    action_text = approval.action or ""
    risk_text = self._approval_risk_value(approval) or "未知"

    # Pull richer metadata from the linked Incident, if any.
    alertname = "unknown"
    alert_category = "general"
    affected_services: list[str] = []
    if approval.incident_id:
        try:
            from src.services.incident_service import get_incident_service

            _svc = get_incident_service()
            # Working memory first, then fall back to episodic memory.
            _inc = await _svc.get_from_working_memory(approval.incident_id)
            if _inc is None:
                _inc = await _svc.get_from_episodic_memory(approval.incident_id)
            if _inc:
                if _inc.signals:
                    alertname = _inc.signals[0].labels.get("alertname", "unknown") or "unknown"
                alert_category = getattr(_inc, "alert_category", "") or "general"
                affected_services = list(_inc.affected_services or [])
        except Exception as _ie:
            logger.debug(
                "km_incident_enrich_failed",
                incident_id=approval.incident_id,
                error=str(_ie),
            )

    _services_str = ", ".join(affected_services) if affected_services else "未關聯"

    content = (
        f"# {status_icon} {_mode_prefix} {alertname}\n\n"
        f"**告警名稱**: {alertname}\n"
        f"**告警類別**: {alert_category}\n"
        f"**受影響服務**: {_services_str}\n"
        f"**執行命令**: `{action_text[:200]}`\n"
        f"**執行結果**: {status_text}\n"
        f"**風險等級**: {risk_text}\n"
        f"**執行路徑**: {'自動執行 (confidence >= 0.65)' if _is_auto else '人工審核批准'}\n"
        f"**Incident ID**: {approval.incident_id or '未關聯'}\n"
        f"**Approval ID**: {approval.id}\n\n"
        f"## 操作描述\n{approval.description or '無描述'}\n"
    )

    # Tags: mode + status + category, for multi-dimensional retrieval.
    tags = [_mode_tag, _status_tag, alert_category, "execution"]
    if not success:
        tags.append("execution_failed")

    if not _is_auto:
        path_type = "approval_manual"
    elif success:
        path_type = "approval_auto_ok"
    else:
        path_type = "approval_auto_fail"

    payload = KMWritePayload(
        path_type=path_type,
        entry_create_kwargs={
            "title": f"{_mode_prefix} {alertname}: {action_text[:50]}",
            "content": content,
            "entry_type": EntryType.INCIDENT_CASE,
            "category": alert_category,
            "tags": tags,
            "source": EntrySource.AI_EXTRACTED,
            "related_incident_id": approval.incident_id or None,
            "created_by": "auto_execute" if _is_auto else "approval_execution",
        },
        incident_id=approval.incident_id or None,
        approval_id=str(approval.id),
    )
    await km_write_with_flag(payload)
|
||
|
||
async def _send_execution_notification(
    self,
    approval: ApprovalRequest,
    execution_status: "ExecutionStatus",
    operation_type: str,
    namespace: str,
    duration_ms: int | None = None,
    error_message: str | None = None,
) -> None:
    """
    Phase 6 post-execution hook: broadcast the execution result.

    Builds a ``NotificationMessage`` from the approval (signers, blast radius,
    risk level, requester) and sends it through the notification manager's
    ``send_all``; each per-provider result is logged. Does nothing except log
    ``notification_disabled`` when ``settings.NOTIFICATION_ENABLED`` is falsy.
    Errors while building or sending are logged with traceback and swallowed.

    Args:
        approval: The executed approval.
        execution_status: Status value forwarded to the message.
        operation_type: Operation label forwarded to the message.
        namespace: Namespace forwarded to the message.
        duration_ms: Execution duration in milliseconds, if known.
        error_message: Failure detail, if any.
    """
    from src.services.notifications import (
        NotificationMessage,
        get_notification_manager,
    )

    if not settings.NOTIFICATION_ENABLED:
        logger.info("notification_disabled", approval_id=str(approval.id))
        return

    try:
        # Signer list shown in the notification.
        signers = []
        for signature in approval.signatures:
            signers.append({"name": signature.signer_name, "comment": signature.comment or ""})

        # Blast-radius fields fall back to neutral defaults when absent.
        if approval.blast_radius:
            affected_pods = approval.blast_radius.affected_pods
            estimated_downtime = approval.blast_radius.estimated_downtime
            related_services = approval.blast_radius.related_services
            data_impact = approval.blast_radius.data_impact.value
        else:
            affected_pods = 0
            estimated_downtime = "N/A"
            related_services = []
            data_impact = "none"

        description = approval.description[:200] if approval.description else ""

        notification = NotificationMessage(
            execution_status=execution_status,
            action_title=approval.action[:100],
            action_description=description,
            approval_id=str(approval.id),
            signers=signers,
            required_signatures=approval.required_signatures,
            affected_pods=affected_pods,
            estimated_downtime=estimated_downtime,
            related_services=related_services,
            data_impact=data_impact,
            namespace=namespace,
            operation_type=operation_type,
            duration_ms=duration_ms,
            error_message=error_message,
            risk_level=approval.risk_level.value,
            ai_provider=approval.requested_by,
        )

        # Fan out to every configured provider and log each outcome.
        delivery_results = await get_notification_manager().send_all(notification)
        for delivery in delivery_results:
            logger.info(
                "notification_result",
                approval_id=str(approval.id),
                provider=delivery.provider,
                status=delivery.status.value,
                message=delivery.message,
            )

    except Exception as notify_err:
        logger.exception(
            "notification_failed",
            approval_id=str(approval.id),
            error=str(notify_err),
        )
|
||
|
||
async def _trigger_playbook_extraction(
    self,
    approval: ApprovalRequest,
) -> None:
    """
    Phase 7.6: trigger automatic Playbook extraction for a linked incident.

    The incident id comes from ``approval.incident_id``, falling back to text
    parsing for older data. The incident is loaded from working memory, then
    episodic memory. Before extraction this method forces the preconditions
    itself: it creates an outcome if missing, sets ``execution_success``,
    raises ``effectiveness_score`` to at least 4, marks the incident RESOLVED
    (unless already RESOLVED/CLOSED) and, if empty, stores the executed action
    in ``learning_notes``. The incident is then saved to working memory and
    passed to ``extract_from_incident``; auto-approval of the playbook is
    requested only when the effectiveness score is 5 or higher.

    Any failure is logged as ``playbook_extraction_error`` and swallowed; it
    never affects the main flow.
    """
    try:
        # 1. Resolve the incident id: direct attribute first, text parsing as
        #    a backward-compatible fallback for old records.
        incident_id = getattr(approval, "incident_id", None)
        if not incident_id:
            incident_id = self._extract_incident_id_from_approval(approval)
        if not incident_id:
            logger.info(
                "playbook_extraction_skipped",
                approval_id=str(approval.id),
                reason="No incident_id found in approval.incident_id or text",
            )
            return

        # 2. Load the incident (working memory, then episodic memory).
        from src.services.incident_service import get_incident_service

        incident_service = get_incident_service()
        incident = await incident_service.get_from_working_memory(incident_id)
        if incident is None:
            incident = await incident_service.get_from_episodic_memory(incident_id)

        if not incident:
            logger.info(
                "playbook_extraction_skipped",
                approval_id=str(approval.id),
                incident_id=incident_id,
                reason="Incident not found",
            )
            return

        # 3. Set the outcome automatically so the extraction preconditions
        #    hold without waiting for a human to score the incident.
        from src.models.incident import IncidentOutcome, IncidentStatus
        from src.utils.timezone import now_taipei

        if incident.outcome is None:
            incident.outcome = IncidentOutcome()
        if not incident.outcome.execution_success:
            incident.outcome.execution_success = True
        current_score = incident.outcome.effectiveness_score
        if current_score is None or current_score < 4:
            # System judgement: a successful execution counts as effective.
            incident.outcome.effectiveness_score = 4
        if incident.status not in (IncidentStatus.RESOLVED, IncidentStatus.CLOSED):
            incident.status = IncidentStatus.RESOLVED
            incident.resolved_at = now_taipei()
        # Record the executed command (kubectl or ssh ...) in learning_notes
        # so the playbook service can derive repair steps from it.
        if not incident.outcome.learning_notes and approval.action:
            incident.outcome.learning_notes = approval.action

        # Write the updated incident back to working memory.
        await incident_service.save_to_working_memory(incident)

        logger.info(
            "playbook_extraction_incident_updated",
            approval_id=str(approval.id),
            incident_id=incident_id,
            effectiveness_score=incident.outcome.effectiveness_score,
            status=incident.status.value,
        )

        # 4. Run the extraction (effectiveness is at least 4 at this point).
        from src.services.playbook_service import get_playbook_service

        playbook_service = get_playbook_service()
        effectiveness = incident.outcome.effectiveness_score or 4
        extracted = await playbook_service.extract_from_incident(
            incident=incident,
            auto_approve=effectiveness >= 5,  # full score => auto-approve
        )

        if not extracted:
            logger.debug(
                "playbook_extraction_no_result",
                approval_id=str(approval.id),
                incident_id=incident_id,
            )
        else:
            logger.info(
                "playbook_auto_extracted",
                approval_id=str(approval.id),
                incident_id=incident_id,
                playbook_id=extracted.playbook_id,
                playbook_name=extracted.name,
                auto_approved=extracted.status.value == "approved",
            )

    except Exception as extract_err:
        # An extraction failure does not affect the main flow.
        logger.warning(
            "playbook_extraction_error",
            approval_id=str(approval.id),
            error=str(extract_err),
        )
|
||
|
||
def _extract_incident_id_from_approval(
|
||
self,
|
||
approval: ApprovalRequest,
|
||
) -> str | None:
|
||
"""
|
||
從 approval 提取關聯的 incident_id
|
||
|
||
嘗試以下來源:
|
||
1. approval.metadata (如果有)
|
||
2. approval.description 中的 INC- 模式
|
||
3. approval.requested_by 中的 incident 資訊
|
||
"""
|
||
import re
|
||
|
||
# 從 description 或 action 中尋找 INC-XXXXXX 模式
|
||
text = f"{approval.description or ''} {approval.action or ''}"
|
||
match = re.search(r"INC-([A-Z0-9-]+)", text)
|
||
if match:
|
||
return match.group(0) # 返回完整的 INC-XXXXX
|
||
|
||
# 從 requested_by 尋找
|
||
if approval.requested_by and "INC-" in approval.requested_by:
|
||
match = re.search(r"INC-([A-Z0-9-]+)", approval.requested_by)
|
||
if match:
|
||
return match.group(0)
|
||
|
||
return None
|
||
|
||
|
||
# =========================================================================
|
||
# ADR-090 § AOL Writer (2026-04-19 ogt + Claude Opus 4.7 亞太)
|
||
# 把 approval execution 的生命週期回灌 automation_operation_log.
|
||
# 之前 33 件/7d approval 動作完全沒寫入 aol,只有 drift_narrator 的
|
||
# 22 筆 notification_formatted。修復後每次執行都留痕。
|
||
# =========================================================================
|
||
|
||
async def _log_aol_started(self, approval: ApprovalRequest) -> str | None:
    """
    Insert a 'pending' row into automation_operation_log for this execution.

    The row's ``input`` JSON carries the approval id, incident id, action
    (first 500 characters), risk level, requester and - for source tracing -
    the target / operation / namespace parsed from the action text.

    The risk level is normalised through ``_approval_risk_value`` before
    serialisation, so an enum-valued ``risk_level`` cannot make
    ``json.dumps`` fail and silently drop the row.

    Returns:
        The new row's op_id as a string (to be passed to
        ``_log_aol_completed``), or None when the write failed. Failures are
        logged and never block the main execution flow.
    """
    try:
        from sqlalchemy import text as _sql
        from src.db.base import get_db_context
        import json as _json

        # Best-effort parse of target / operation type; a failure here only
        # leaves the trace fields empty.
        _parsed_target: str | None = None
        _parsed_op: str | None = None
        _parsed_ns: str | None = None
        try:
            _parsed = parse_operation_from_action(approval.action or "")
            _parsed_target = _parsed.resource_name
            _parsed_op = _parsed.operation_type.value if _parsed.operation_type else None
            _parsed_ns = _parsed.namespace
        except Exception as _parse_err:
            logger.debug(
                "aol_started_parse_failed",
                approval_id=str(approval.id),
                error=str(_parse_err),
            )

        input_payload = {
            "approval_id": str(approval.id),
            "incident_id": approval.incident_id or "",
            "action": (approval.action or "")[:500],
            # Plain value rather than the raw attribute, so it is always
            # JSON-serialisable.
            "risk_level": self._approval_risk_value(approval) or "",
            "requested_by": getattr(approval, "requested_by", "") or "",
            # Target source trace.
            "parsed_target": _parsed_target or "",
            "parsed_operation": _parsed_op or "",
            "parsed_namespace": _parsed_ns or "",
        }

        async with get_db_context() as db:
            row = await db.execute(
                _sql("""
                    INSERT INTO automation_operation_log (
                        operation_type, actor, status,
                        input, output, tags
                    ) VALUES (
                        'playbook_executed',
                        'approval_execution',
                        'pending',
                        CAST(:input AS jsonb),
                        '{}'::jsonb,
                        :tags
                    )
                    RETURNING op_id
                """),
                {
                    "input": _json.dumps(input_payload, ensure_ascii=False),
                    "tags": ["approval", "execution", "playbook"],
                },
            )
            op_id = row.scalar()
            return str(op_id) if op_id else None
    except Exception as e:
        logger.warning("aol_started_write_failed", approval_id=str(approval.id), error=str(e))
        return None
|
||
|
||
async def _log_aol_completed(
|
||
self,
|
||
op_id: str | None,
|
||
status: str,
|
||
duration_ms: int,
|
||
output: dict | None = None,
|
||
error: str | None = None,
|
||
stderr: str | None = None,
|
||
) -> None:
|
||
"""
|
||
UPDATE automation_operation_log 為 success/failed 並寫入結果摘要 + stderr。
|
||
|
||
status 必須是 aol constraint 允許的值:
|
||
pending | success | failed | dry_run | rolled_back
|
||
|
||
op_id 為 None 時靜默跳過 (started 寫入失敗時不應觸發 update 例外)。
|
||
"""
|
||
if not op_id:
|
||
return
|
||
try:
|
||
from sqlalchemy import text as _sql
|
||
from src.db.base import get_db_context
|
||
import json as _json
|
||
|
||
async with get_db_context() as db:
|
||
await db.execute(
|
||
_sql("""
|
||
UPDATE automation_operation_log
|
||
SET status = :status,
|
||
duration_ms = :duration_ms,
|
||
output = CAST(:output AS jsonb),
|
||
error = :error,
|
||
stderr_feed_back = :stderr
|
||
WHERE op_id = CAST(:op_id AS uuid)
|
||
"""),
|
||
{
|
||
"status": status,
|
||
"duration_ms": duration_ms,
|
||
"output": _json.dumps(output or {}, ensure_ascii=False),
|
||
"error": (error or "")[:2000] if error else None,
|
||
"stderr": (stderr or "")[:8000] if stderr else None,
|
||
"op_id": op_id,
|
||
},
|
||
)
|
||
except Exception as e:
|
||
logger.warning("aol_completed_write_failed", op_id=op_id, error=str(e))
|
||
|
||
|
||
# =============================================================================
|
||
# Singleton Instance
|
||
# =============================================================================
|
||
|
||
# Lazily created, process-wide instance returned by get_execution_service().
_execution_service: ApprovalExecutionService | None = None


def get_execution_service() -> ApprovalExecutionService:
    """
    Return the shared ApprovalExecutionService instance.

    The instance is created on the first call and reused afterwards.

    Returns:
        ApprovalExecutionService: the execution service singleton.
    """
    global _execution_service
    if _execution_service is not None:
        return _execution_service
    _execution_service = ApprovalExecutionService()
    return _execution_service
|