Files
awoooi/apps/api/src/services/approval_execution.py
Your Name 596f2f6820
All checks were successful
Code Review / ai-code-review (push) Successful in 11s
CD Pipeline / tests (push) Successful in 1m17s
CD Pipeline / build-and-deploy (push) Successful in 3m42s
CD Pipeline / post-deploy-checks (push) Successful in 1m21s
fix(awooop): link auto approved execution evidence
2026-05-13 19:14:17 +08:00

1726 lines
71 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Approval Execution Service - Phase 16 R4.2 瘦身 Router 抽取
============================================================
從 approvals.py 抽取執行編排邏輯,整合:
- OperationParser: 解析操作類型
- K8s Executor: 執行 K8s 操作
- ApprovalDBService: 更新狀態
- TimelineService: 記錄事件
- NotificationManager: 發送通知
- Phase 7.6: Playbook 自動萃取
版本: v1.2
建立: 2026-03-25 (台北時區)
更新: 2026-03-26 (Phase 7.6 自動萃取)
更新: 2026-04-14 (ADR-076 Task 3: 執行失敗重試機制 — Claude Haiku 4.5 Asia/Taipei)
建立者: Claude Code (Phase 16 R4.2)
重試設計 (ADR-076):
- MAX_RETRY = 2 次(共最多 3 次嘗試)
- RETRY_DELAY_SECONDS = 30 秒
- 只重試瞬態錯誤(connection refused, timeout, i/o error 等)
- 永久性錯誤(not found, permission denied, already exists)不重試
"""
import asyncio
import time
from typing import TYPE_CHECKING, Any
from uuid import UUID
import structlog
from src.core.config import settings
from src.core.redis_client import get_redis
from src.db.base import get_db_context
from src.models.approval import ApprovalRequest
from src.plugins.mcp.gateway import GatewayContext, McpGateway, McpGatewayError
from src.plugins.mcp.interfaces import MCPToolResult
from src.services.approval_db import get_approval_service, get_timeline_service
from src.services.executor import ExecutionResult, OperationType, get_executor
from src.services.operation_parser import parse_operation_from_action
if TYPE_CHECKING:
from src.services.notifications import ExecutionStatus
logger = structlog.get_logger(__name__)
# ADR-090 § automated-action feedback (2026-04-19 ogt + Claude Opus 4.7, APAC):
# PostExecutionVerifier was changed from fire-and-forget to an awaited call so that
# verification_result is always written to incident_evidence.
# The 60s cap covers verifier warmup (10s) + collect (30s) + a 20s buffer.
_VERIFIER_AWAIT_TIMEOUT_SEC = 60.0
# T9: approved SSH execution must go through AwoooP MCP Gateway.
# ApprovalRequest itself is the human/multi-sig decision artifact; for write/admin
# tools we project it into the short-lived Gate 5 Redis key expected by Gateway.
# Agent id and project id used both in the Redis approval key and in the GatewayContext.
_SSH_GATEWAY_AGENT_ID = "approval_executor"
_SSH_GATEWAY_PROJECT_ID = "awoooi"
# Expiry (seconds) passed as `ex=` when the projected approval key is written to Redis.
_SSH_GATEWAY_APPROVAL_TTL_SECONDS = 600
# Required Gateway scope per SSH tool. Tools missing from this map are treated as
# "read" at the lookup site (`.get(tool_name, "read")`), which skips the Redis projection.
_SSH_GATEWAY_TOOL_SCOPES: dict[str, str] = {
"ssh_diagnose": "read",
"ssh_docker_restart": "write",
"ssh_docker_compose_restart": "write",
"ssh_systemctl_restart": "write",
"ssh_clear_docker_logs": "write",
"ssh_renew_ssl": "write",
"ssh_reload_nginx": "write",
"ssh_docker_prune": "admin",
}
class ApprovalExecutionService:
"""
授權執行服務 - 編排整個執行流程
職責:
1. 解析操作類型
2. 呼叫 K8s Executor 執行(含重試)
3. 更新資料庫狀態
4. 記錄 Timeline 事件
5. 發送通知
"""
# ADR-076 Task 3: retry constants.
# MAX_RETRY is the number of *re*-tries after the first attempt (so up to 3 attempts in total);
# RETRY_DELAY_SECONDS is the pause between attempts.
MAX_RETRY: int = 2
RETRY_DELAY_SECONDS: int = 30
# Transient-error keywords (compared against the lower-cased message); any match -> retryable.
_TRANSIENT_ERROR_KEYWORDS: tuple[str, ...] = (
"connection refused",
"connection reset",
"timeout",
"timed out",
"i/o error",
"io error",
"temporary failure",
"service unavailable",
"too many requests",
"dial tcp",
"eof",
)
# Permanent-error keywords (compared against the lower-cased message); any match -> never retried.
# These are checked before the transient list, so they win on mixed messages.
_PERMANENT_ERROR_KEYWORDS: tuple[str, ...] = (
"not found",
"forbidden",
"permission denied",
"unauthorized",
"already exists",
"invalid",
"immutable",
"destructive",
"blocked",
)
@classmethod
def _is_transient_error(cls, error_message: str | None) -> bool:
"""
判斷執行錯誤是否為瞬態(可重試)
優先檢查永久性錯誤(比瞬態錯誤有更高的優先順序),
避免 "connection refused (not found)" 這類混合訊息誤判。
Args:
error_message: 執行錯誤訊息
Returns:
True 表示可重試False 表示永久失敗
"""
if not error_message:
return False
lower = error_message.lower()
# 永久性錯誤 → 不重試
if any(kw in lower for kw in cls._PERMANENT_ERROR_KEYWORDS):
return False
# 瞬態錯誤 → 可重試
return any(kw in lower for kw in cls._TRANSIENT_ERROR_KEYWORDS)
async def execute_approved_action(self, approval: ApprovalRequest) -> bool:
"""
Execute an already-approved action in the background and record the outcome.
Called from BackgroundTasks so the API response is not blocked.
Phase 5: update the database status after execution.
Phase 6: send notifications after execution (post-execution hook).
2026-04-17 ogt + Claude Sonnet 4.6: returns a bool telling whether execution succeeded.
Root cause: the method used to return None, so the decision_manager.py auto-execute path
could not learn the outcome and always passed success=True to _push_auto_repair_result,
which broadcast false successes.
Fix: return result.success and let the caller decide the Telegram message.
Flow as implemented below:
1. Log the start, open an "aol" record, and parse the action.
2. Best-effort resource-name resolution (never blocks execution).
3. If parsing produced no operation type / resource: treat NO_ACTION-style actions as a
successful no-op, anything else as a parse failure.
4. Dispatch: SSH host action, read-only INVESTIGATE command, or the audited executor
with the transient-error retry loop.
5. Persist the status, then run the success or failure follow-ups (timeline, notifications,
learning, post-execution verification, incident resolution, aol completion).
Args:
approval: The approved approval request.
Returns:
bool: True when execution succeeded (a NO_ACTION no-op counts as success);
False when execution failed, including when the action could not be parsed.
"""
from src.services.notifications import ExecutionStatus
logger.info(
"background_execution_start",
approval_id=str(approval.id),
action=approval.action,
)
# ADR-090 § automated-action feedback (2026-04-19): leave a trace in aol as soon as the
# main flow starts and update it at the end. This does not rely on fire-and-forget, so
# every approval (33 per 7 days) stays observable.
_aol_op_id = await self._log_aol_started(approval)
# NOTE: despite the `_ms` suffix this holds time.time() in seconds; it is converted to
# milliseconds at each _log_aol_completed call below.
_aol_started_ms = time.time()
service = get_approval_service()
timeline = get_timeline_service()
# Parse operation details
parsed = parse_operation_from_action(approval.action)
operation_type = parsed.operation_type
resource_name = parsed.resource_name
namespace = parsed.namespace
# 2026-04-27 P3.1-T1 by Claude — integration of the three Tier-1 services.
# After the kubectl command is parsed, dynamically check that the resource exists in K8s
# and adopt the normalized name.
# Exceptions never block the main flow; a miss/suggestion only logs a warning and bumps a
# metric, it does not stop execution.
# NOTE(review): this runs for every parsed operation type and always resolves with
# resource_kind="deployment" — confirm that is intended for non-deployment targets.
if resource_name is not None and operation_type is not None:
try:
from src.services.resource_resolver import get_resource_resolver
from src.core.metrics import RESOURCE_RESOLVE_TOTAL
_resolver = get_resource_resolver()
_resolve = await _resolver.resolve(
raw_resource=resource_name,
namespace=namespace,
resource_kind="deployment",
)
if _resolve.success and _resolve.resource_name:
if _resolve.resource_name != resource_name:
logger.info(
"resource_name_normalized",
original=resource_name,
normalized=_resolve.resource_name,
namespace=namespace,
)
resource_name = _resolve.resource_name
RESOURCE_RESOLVE_TOTAL.labels(result="hit").inc()
elif _resolve.candidates:
logger.warning(
"resource_not_found_in_k8s",
resource=resource_name,
namespace=namespace,
suggestions=_resolve.candidates,
)
RESOURCE_RESOLVE_TOTAL.labels(result="suggestion").inc()
else:
logger.warning(
"resource_not_found_in_k8s",
resource=resource_name,
namespace=namespace,
suggestions=[],
)
RESOURCE_RESOLVE_TOTAL.labels(result="miss").inc()
except Exception as _rr_e:
logger.warning("resource_resolve_failed", error=str(_rr_e))
# The metric import itself may be what failed, so the error counter is best-effort too.
try:
from src.core.metrics import RESOURCE_RESOLVE_TOTAL
RESOURCE_RESOLVE_TOTAL.labels(result="error").inc()
except Exception:
pass
if operation_type is None or resource_name is None:
# 2026-04-19 ogt + Claude Opus 4.7: distinguish NO_ACTION from a real parse failure.
# NO_ACTION is the AI deliberately choosing "investigate only, nothing destructive";
# it must not be mislabelled EXECUTION_FAILED, which would pollute the auto_execute
# success-rate KPI (MASTER §7.1 #11).
# NOTE(review): the next line guards against approval.action being None, but the
# `"(未設)" in approval.action` test and the `approval.action[:N]` slices below do not —
# confirm action can never be None here.
_action_upper = (approval.action or "").upper()
_is_no_action = (
"NO_ACTION" in _action_upper
or "NO-ACTION" in _action_upper
or "NOACTION" in _action_upper
or "(未設)" in approval.action
or _action_upper.startswith("OBSERVE")
or _action_upper.startswith("INVESTIGATE")
)
if _is_no_action:
logger.info(
"background_execution_noop",
approval_id=str(approval.id),
action=approval.action,
reason="NO_ACTION - 純調查/觀察類,不執行破壞動作",
path="no_action",
)
# Mark as SUCCESS (observing/investigating is itself a successful completion).
await service.update_execution_status(approval.id, success=True)
await timeline.add_event(
event_type="exec",
status="success",
title="✅ 純觀察類動作完成 (NO_ACTION)",
description=f"Action: {approval.action[:120]}",
actor="leWOOOgo",
actor_role="executor",
approval_id=str(approval.id),
incident_id=approval.incident_id,
)
# Reply to the original alert card with the execution result.
# NOTE(review): the Task returned by create_task is not stored (same for every
# create_task call in this method); asyncio only keeps weak references to tasks.
asyncio.create_task(
self._push_execution_result_to_alert(
approval, success=True, error=None,
)
)
# ADR-090 § aol completed (NO_ACTION counts as success)
await self._log_aol_completed(
op_id=_aol_op_id,
status="success",
duration_ms=int((time.time() - _aol_started_ms) * 1000),
output={"reason": "NO_ACTION", "action": approval.action[:200]},
)
# F2 (2026-05-07 ogt + Claude Sonnet 4.6 + Codex):
# The NO_ACTION path must move the incident to RESOLVED, otherwise the incident stays
# stuck in INVESTIGATING forever (FlywheelExecutionRateMissing dead alert + root cause #1
# of the growth of the 566 stuck incidents).
# resolve_incident already has an idempotent RESOLVED guard: resolving again returns the
# existing incident and does not re-trigger the postmortem.
if approval.incident_id:
try:
from src.services.incident_service import get_incident_service
await get_incident_service().resolve_incident(approval.incident_id)
logger.info(
"incident_resolved_after_no_action_execution",
incident_id=approval.incident_id,
approval_id=str(approval.id),
path="no_action",
)
except Exception as _resolve_e:
logger.warning(
"incident_resolve_after_no_action_execution_failed",
incident_id=approval.incident_id,
approval_id=str(approval.id),
error=str(_resolve_e),
)
return True # NO_ACTION counts as a successful completion
# Genuine parse failure (not NO_ACTION)
logger.warning(
"background_execution_skip",
approval_id=str(approval.id),
reason="Could not parse operation type from action",
action=approval.action,
)
# Phase 5: update the database status and include error_message (P0.2)
await service.update_execution_status(
approval.id, success=False,
error_message=f"Could not parse operation type from action: {approval.action[:150]}",
)
await timeline.add_event(
event_type="exec",
status="error",
title="執行失敗: 無法解析操作類型",
description=f"Action: {approval.action}",
actor="leWOOOgo",
actor_role="executor",
approval_id=str(approval.id),
incident_id=approval.incident_id,
)
# Phase 6: send the failure notification (fire-and-forget)
asyncio.create_task(
self._send_execution_notification(
approval=approval,
execution_status=ExecutionStatus.FAILED,
operation_type="unknown",
namespace=namespace,
error_message="Could not parse operation type",
)
)
# ADR-090 § aol completed (parse failure)
await self._log_aol_completed(
op_id=_aol_op_id,
status="failed",
duration_ms=int((time.time() - _aol_started_ms) * 1000),
error=f"parse_fail: {approval.action[:300]}",
)
return False # parse failure -> nothing was executed
executor = get_executor()
attempt = 1 # attempt counter; the SSH and INVESTIGATE branches never enter the retry loop, so it stays 1
# 2026-05-02 ogt + Claude Sonnet 4.6: host SSH operation branch.
# Root cause: when an ssh action was approved manually the parser only understood kubectl
# and returned None -> a false "Could not parse" failure.
# Fix: when the SSH_HOST type is detected, use the SSH path instead of the K8s executor.
if operation_type == OperationType.SSH_HOST:
result = await self._execute_ssh_host_action(
approval=approval,
host=resource_name or "",
)
logger.info(
"background_execution_ssh_host",
approval_id=str(approval.id),
action=approval.action,
host=resource_name,
success=result.success,
message=result.message,
)
elif operation_type == OperationType.INVESTIGATE:
# 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 fix — INVESTIGATE is a read-only query.
# Root cause: INVESTIGATE is not handled by executor.execute_with_audit's switch, so it
# fell into the else branch -> success=False.
# Fix: when the INVESTIGATE type is detected, call execute_kubectl_command(approval.action)
# directly. A read-only command needs no retry loop (a failure is a failure; a transient
# error leaves no room for improvement).
result = await executor.execute_kubectl_command(
command=approval.action,
timeout_sec=30,
)
logger.info(
"background_execution_investigate",
approval_id=str(approval.id),
action=approval.action,
success=result.success,
message=result.message,
)
else:
# ADR-076 Task 3: retry on execution failure.
# Transient errors (connection refused, timeout, ...) are retried automatically,
# at most MAX_RETRY times.
result = await executor.execute_with_audit(
approval=approval,
operation_type=operation_type,
resource_name=resource_name,
namespace=namespace,
)
attempt = 1
while not result.success and attempt <= self.MAX_RETRY:
# A permanent (or unrecognised) error ends the loop without another attempt.
if not self._is_transient_error(result.error):
logger.info(
"execution_retry_skipped_permanent_error",
approval_id=str(approval.id),
attempt=attempt,
error=result.error,
)
break
logger.warning(
"execution_retry_transient_error",
approval_id=str(approval.id),
attempt=attempt,
max_retry=self.MAX_RETRY,
error=result.error,
delay_seconds=self.RETRY_DELAY_SECONDS,
)
await timeline.add_event(
event_type="exec",
status="warning",
title=f"⚠️ 執行失敗,{self.RETRY_DELAY_SECONDS}s 後重試 ({attempt}/{self.MAX_RETRY})",
description=f"Error: {result.error}",
actor="leWOOOgo",
actor_role="executor",
approval_id=str(approval.id),
incident_id=approval.incident_id,
)
await asyncio.sleep(self.RETRY_DELAY_SECONDS)
result = await executor.execute_with_audit(
approval=approval,
operation_type=operation_type,
resource_name=resource_name,
namespace=namespace,
)
attempt += 1
# Phase 5: update the database status.
# 2026-04-18 ADR-090 L5 P0.2: on failure pass error_message along; it is written to rejection_reason.
await service.update_execution_status(
approval.id,
success=result.success,
error_message=None if result.success else (result.error or "(executor 未回傳錯誤)"),
)
# Update approval status based on result
total_attempts = attempt # after the retry loop, attempt equals the number of attempts made
if result.success:
logger.info(
"background_execution_success",
approval_id=str(approval.id),
operation=operation_type.value,
target=resource_name,
namespace=namespace,
duration_ms=result.duration_ms,
total_attempts=total_attempts,
)
retry_note = f" (重試 {total_attempts - 1} 次後成功)" if total_attempts > 1 else ""
# NOTE(review): the title says "K8s" for every branch, including SSH host actions.
await timeline.add_event(
event_type="exec",
status="success",
title=f"✅ K8s 執行成功: {operation_type.value}{retry_note}",
description=f"Target: {resource_name} @ {namespace} ({result.duration_ms}ms)",
actor="leWOOOgo",
actor_role="executor",
approval_id=str(approval.id),
incident_id=approval.incident_id,
)
# Phase 6: send the success notification (fire-and-forget)
asyncio.create_task(
self._send_execution_notification(
approval=approval,
execution_status=ExecutionStatus.SUCCESS,
operation_type=operation_type.value,
namespace=namespace,
duration_ms=result.duration_ms,
)
)
# 2026-04-14 Claude Sonnet 4.6: show the execution result as a reply to the original alert card.
# The auto_approve path is handled by _push_auto_repair_result; this only covers manual approval.
asyncio.create_task(
self._push_execution_result_to_alert(approval, success=True, error=None)
)
# Phase 7.6: trigger automatic Playbook extraction (fire-and-forget)
asyncio.create_task(
self._trigger_playbook_extraction(approval)
)
# ADR-030 Phase 5 / ADR-083 Phase 3: trigger the learning service.
# Phase 3 fix: fire-and-forget removed in favour of await + a 30s circuit breaker.
# On timeout a warning is logged and the main flow continues without crashing.
# 2026-04-15 ogt + Claude Sonnet 4.6 (APAC): Phase 3 fire-and-forget fix
try:
await asyncio.wait_for(
self._trigger_learning(
approval=approval,
success=True,
duration_seconds=result.duration_ms / 1000 if result.duration_ms else 0,
),
timeout=30.0,
)
except asyncio.TimeoutError:
logger.warning(
"learning_trigger_timeout",
approval_id=str(approval.id),
timeout_sec=30.0,
)
# ADR-081 Phase 1 + ADR-090 fix (2026-04-19 ogt + Claude Opus 4.7):
# PostExecutionVerifier is awaited with a 60s timeout so verification_result is always written.
# Previously the fire-and-forget task was killed on Pod recycle, leaving 1212 evidence rows NULL.
from src.core.feature_flags import aiops_flags
if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"):
try:
await asyncio.wait_for(
self._run_post_execution_verify(
approval=approval,
action_taken=f"{operation_type.value}:{resource_name}",
),
timeout=_VERIFIER_AWAIT_TIMEOUT_SEC,
)
except asyncio.TimeoutError:
logger.warning(
"post_verify_timeout_exceeded",
approval_id=str(approval.id),
timeout_sec=_VERIFIER_AWAIT_TIMEOUT_SEC,
)
# 2026-04-07 Claude Code: Sprint 4 B3 — record the "human approved" disposition type
try:
anomaly_key = await self._get_anomaly_key_from_approval(approval)
if anomaly_key:
from src.services.anomaly_counter import get_anomaly_counter
counter = get_anomaly_counter()
await counter.record_disposition(anomaly_key, "human_approved")
except Exception as _disp_e:
logger.warning("disposition_record_failed", error=str(_disp_e))
# ADR-073 patch: execution succeeded -> resolve the Incident -> trigger KM conversion.
# Previously RESOLVED was never invoked, so KM was never generated and Playbooks stayed at 0.
if approval.incident_id:
try:
from src.services.incident_service import get_incident_service
_inc_svc = get_incident_service()
await _inc_svc.resolve_incident(approval.incident_id)
logger.info(
"incident_resolved_after_execution",
incident_id=approval.incident_id,
approval_id=str(approval.id),
)
except Exception as _resolve_e:
logger.warning("incident_resolve_after_execution_failed", error=str(_resolve_e))
# ADR-090 § aol completed (execution succeeded)
await self._log_aol_completed(
op_id=_aol_op_id,
status="success",
duration_ms=int((time.time() - _aol_started_ms) * 1000),
output={
"operation_type": operation_type.value,
"resource_name": resource_name,
"namespace": namespace,
"executor_duration_ms": result.duration_ms,
"total_attempts": total_attempts,
},
)
return True # execution succeeded
else:
logger.error(
"background_execution_failed",
approval_id=str(approval.id),
operation=operation_type.value,
target=resource_name,
namespace=namespace,
error=result.error,
)
await timeline.add_event(
event_type="exec",
status="error",
title=f"❌ K8s 執行失敗: {operation_type.value}",
description=f"Error: {result.error}",
actor="leWOOOgo",
actor_role="executor",
approval_id=str(approval.id),
incident_id=approval.incident_id,
)
# Phase 6: send the failure notification (fire-and-forget, includes the Dry-Run block case).
# An error containing the exact lower-case text "not found" is reported as DRY_RUN_BLOCKED
# (this test is case-sensitive, unlike _is_transient_error).
exec_status = (
ExecutionStatus.DRY_RUN_BLOCKED
if "not found" in (result.error or "")
else ExecutionStatus.FAILED
)
asyncio.create_task(
self._send_execution_notification(
approval=approval,
execution_status=exec_status,
operation_type=operation_type.value,
namespace=namespace,
error_message=result.error,
duration_ms=result.duration_ms,
)
)
# 2026-04-14 Claude Sonnet 4.6: show the failure as a reply to the original alert card
asyncio.create_task(
self._push_execution_result_to_alert(
approval, success=False, error=result.error
)
)
# ADR-030 Phase 5 / ADR-083 Phase 3: trigger the learning service (failure case).
# Phase 3 fix: fire-and-forget -> await + a 30s circuit breaker.
# 2026-04-15 ogt + Claude Sonnet 4.6 (APAC): Phase 3 fire-and-forget fix
try:
await asyncio.wait_for(
self._trigger_learning(
approval=approval,
success=False,
error_message=result.error,
duration_seconds=result.duration_ms / 1000 if result.duration_ms else 0,
),
timeout=30.0,
)
except asyncio.TimeoutError:
logger.warning(
"learning_trigger_timeout",
approval_id=str(approval.id),
timeout_sec=30.0,
)
# ADR-090 fix (2026-04-19 ogt + Claude Opus 4.7):
# Run the verifier on failure as well, so verification_result='failed' is written back to evidence.
# Changed to await + 60s timeout (it used to be fire-and-forget and the task was killed on Pod recycle).
from src.core.feature_flags import aiops_flags
if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"):
try:
await asyncio.wait_for(
self._run_post_execution_verify(
approval=approval,
action_taken=f"{operation_type.value}:{resource_name}:FAILED",
),
timeout=_VERIFIER_AWAIT_TIMEOUT_SEC,
)
except asyncio.TimeoutError:
logger.warning(
"post_verify_timeout_exceeded_failed_path",
approval_id=str(approval.id),
timeout_sec=_VERIFIER_AWAIT_TIMEOUT_SEC,
)
# ADR-090 § aol completed (execution failed)
await self._log_aol_completed(
op_id=_aol_op_id,
status="failed",
duration_ms=int((time.time() - _aol_started_ms) * 1000),
output={
"operation_type": operation_type.value,
"resource_name": resource_name,
"namespace": namespace,
"executor_duration_ms": result.duration_ms,
"total_attempts": total_attempts,
},
error=result.error,
stderr=result.error, # E6 stderr feedback — used for retry / negative Playbook reinforcement
)
return False # execution failed
async def _execute_ssh_host_action(
    self,
    approval: ApprovalRequest,
    host: str,
) -> ExecutionResult:
    """
    Execute an SSH host action (manual-approval path only).

    2026-05-02 ogt + Claude Sonnet 4.6: fixes manually approved SSH actions getting stuck.
    Root cause: parse_operation_from_action only understood kubectl, and the K8s
    executor rejected the action. Fix: once SSH_HOST is detected, the action is routed
    to an SSH MCP tool and executed through the MCP Gateway
    (see _execute_ssh_tool_via_gateway).

    Routing is done by _route_ssh_host_action; an action that matches no rule yields a
    failed ExecutionResult asking for the action to be rewritten.

    Args:
        approval: The approved request; approval.action is the command text
            (None is treated as an empty string).
        host: Target host, passed to the tool as the "host" parameter.

    Returns:
        ExecutionResult with operation_type=SSH_HOST and namespace="host".
        Exceptions raised while calling the gateway are caught and returned as
        a failed result rather than propagated.
    """
    start = time.time()
    action = approval.action or ""
    tool_name, params = self._route_ssh_host_action(action, host)

    if tool_name is None:
        duration_ms = int((time.time() - start) * 1000)
        err = f"SSH action 無法路由到 SSH MCP tool: {action[:120]}"
        logger.warning(
            "ssh_host_action_unrouted",
            approval_id=str(approval.id),
            action=action,
            host=host,
        )
        return ExecutionResult(
            success=False,
            message="SSH action unrouted",
            operation_type=OperationType.SSH_HOST,
            target_resource=host,
            namespace="host",
            duration_ms=duration_ms,
            error=err,
        )

    try:
        logger.warning(
            "mcp_gateway_approved_ssh_execution_path",
            approval_id=str(approval.id),
            incident_id=approval.incident_id,
            tool=tool_name,
            host=host,
            agent_id=_SSH_GATEWAY_AGENT_ID,
        )
        mcp_result = await self._execute_ssh_tool_via_gateway(
            approval=approval,
            tool_name=tool_name,
            params=params,
        )
        duration_ms = int((time.time() - start) * 1000)
        success = bool(mcp_result.success)
        return ExecutionResult(
            success=success,
            message=f"ssh_mcp:{tool_name} {'ok' if success else 'failed'}",
            operation_type=OperationType.SSH_HOST,
            target_resource=host,
            namespace="host",
            duration_ms=duration_ms,
            k8s_response={"tool": tool_name, "output": mcp_result.output} if success else None,
            error=None if success else (mcp_result.error or "ssh_mcp execution failed"),
        )
    except Exception as e:
        # Any gateway/transport error becomes a failed result so the caller's
        # status/timeline bookkeeping still runs.
        duration_ms = int((time.time() - start) * 1000)
        logger.warning(
            "ssh_host_action_exception",
            approval_id=str(approval.id),
            tool=tool_name,
            error=str(e),
        )
        return ExecutionResult(
            success=False,
            message="ssh_mcp exception",
            operation_type=OperationType.SSH_HOST,
            target_resource=host,
            namespace="host",
            duration_ms=duration_ms,
            error=str(e),
        )

@staticmethod
def _route_ssh_host_action(action: str, host: str) -> tuple[str | None, dict[str, Any]]:
    """
    Map a free-text SSH action onto an SSH MCP tool name and its parameters.

    Rules, checked in this order:
    - contains both "docker" and "prune"      -> ssh_docker_prune
    - "docker restart <name>"                 -> ssh_docker_restart
    - "systemctl restart <svc>"               -> ssh_systemctl_restart
    - ps aux / df -h / free -h / top / uptime / echo / ls -  -> ssh_diagnose
    - anything else, or a restart without a name -> (None, params)

    Keywords are matched case-insensitively, but the container / service name is
    captured from the original text: these names are case-sensitive, so taking them
    from a lower-cased copy would target the wrong object.

    Returns:
        (tool_name, params); tool_name is None when the action cannot be routed.
    """
    import re

    action_lower = action.lower().strip()
    params: dict[str, Any] = {"host": host}

    if "docker" in action_lower and "prune" in action_lower:
        params["trust_score"] = 0.85
        return "ssh_docker_prune", params

    if "docker restart" in action_lower:
        found = re.search(r"docker\s+restart\s+([A-Za-z0-9._-]+)", action, re.IGNORECASE)
        if found is None:
            return None, params  # no container name captured -> not routable
        params["container_name"] = found.group(1)
        params["trust_score"] = 0.85
        return "ssh_docker_restart", params

    if "systemctl restart" in action_lower:
        found = re.search(r"systemctl\s+restart\s+([A-Za-z0-9._-]+)", action, re.IGNORECASE)
        if found is None:
            return None, params
        params["service"] = found.group(1)
        params["trust_score"] = 0.85
        return "ssh_systemctl_restart", params

    # Host diagnostics (collected in one go by ssh_diagnose).
    # The look-behind keeps a keyword from matching inside a longer word: previously
    # "top " matched "stop ", sending e.g. "systemctl stop x" to the diagnose tool.
    # "top" may now also appear bare at the end of the action; the old "top " keyword
    # could never match that once the action had been stripped.
    diagnose_pattern = r"(?<![\w-])(?:ps aux|df -h|free -h|top(?![\w-])|uptime|echo |ls -)"
    if re.search(diagnose_pattern, action_lower):
        return "ssh_diagnose", params

    return None, params
async def _execute_ssh_tool_via_gateway(
    self,
    approval: ApprovalRequest,
    tool_name: str,
    params: dict[str, Any],
) -> MCPToolResult:
    """
    Call an SSH MCP tool through McpGateway on behalf of an approved request.

    For tools whose scope is not "read", an "approved" marker is first written to
    Redis under the approval key (with a TTL). A failure to write that marker is
    only logged; the gateway call is attempted regardless.

    Args:
        approval: The approved request; its id becomes the gateway run_id.
        tool_name: SSH MCP tool to invoke.
        params: Tool parameters; an "_mcp_audit" entry is added to a copy of them.

    Returns:
        The gateway's MCPToolResult, or a failed MCPToolResult when the gateway
        raises McpGatewayError.
    """
    scope = _SSH_GATEWAY_TOOL_SCOPES.get(tool_name, "read")
    raw_id = approval.id
    run_id = raw_id if isinstance(raw_id, UUID) else UUID(str(raw_id))

    if scope != "read":
        # Project the approval into the key the gateway checks for write/admin tools.
        gate_key = (
            f"mcp_approval:{_SSH_GATEWAY_PROJECT_ID}:{_SSH_GATEWAY_AGENT_ID}:"
            f"{tool_name}:{run_id}"
        )
        try:
            await get_redis().set(
                gate_key,
                "approved",
                ex=_SSH_GATEWAY_APPROVAL_TTL_SECONDS,
            )
        except Exception as projection_error:
            logger.warning(
                "mcp_gateway_approval_projection_failed",
                approval_id=str(approval.id),
                tool=tool_name,
                approval_key=gate_key,
                error=str(projection_error),
            )

    audit_block = {
        "session_id": f"approval:{approval.id}",
        "incident_id": approval.incident_id,
        "agent_role": _SSH_GATEWAY_AGENT_ID,
        "flywheel_node": "execute",
        "approval_id": str(approval.id),
    }
    call_params = dict(params)
    call_params["_mcp_audit"] = audit_block

    async with get_db_context(_SSH_GATEWAY_PROJECT_ID) as db:
        gateway_ctx = GatewayContext(
            project_id=_SSH_GATEWAY_PROJECT_ID,
            agent_id=_SSH_GATEWAY_AGENT_ID,
            tool_name=tool_name,
            run_id=run_id,
            trace_id=approval.incident_id or str(approval.id),
            is_shadow=False,
            environment={"env": "prod"},
            required_scope=scope,
        )
        try:
            outcome = await McpGateway(db).call(gateway_ctx, call_params)
        except McpGatewayError as blocked:
            # A gate rejection is reported as a failed tool result, not raised.
            logger.warning(
                "mcp_gateway_approved_ssh_blocked",
                approval_id=str(approval.id),
                incident_id=approval.incident_id,
                tool=tool_name,
                gate=blocked.gate,
                error_code=blocked.error_code,
                error=str(blocked),
            )
            return MCPToolResult(
                success=False,
                execution_id=f"blocked:{tool_name}:{run_id}",
                error=f"{blocked.error_code}: {blocked}",
            )
        return outcome
async def _push_execution_result_to_alert(
    self,
    approval: ApprovalRequest,
    success: bool,
    error: str | None,
) -> None:
    """
    Reply to the original Telegram alert card with the execution result.

    Implemented 2026-04-14 by Claude Sonnet 4.6:
    - Manual path: after a human approves in Telegram and execution finishes, the
      result is posted as a reply under the original alert.
    - Automatic path (requested_by starts with "auto_approve"): skipped here because
      _push_auto_repair_result handles it.

    The original message id is read from Redis key tg_msg:{incident_id}; when it is
    missing or not an integer nothing is sent. Every failure is caught and logged,
    so this method never raises.

    Args:
        approval: The executed approval request.
        success: Whether execution succeeded.
        error: Error text for the failure message; None falls back to a default.
    """
    try:
        # Skip the auto-approved path to avoid duplicating _push_auto_repair_result.
        if self._is_auto_approved_request(approval):
            return
        if not approval.incident_id:
            return

        import html

        from src.core.redis_client import get_redis

        redis = get_redis()
        msg_id_raw = await redis.get(f"tg_msg:{approval.incident_id}")
        if not msg_id_raw:
            logger.debug(
                "push_execution_result_no_msg_id",
                incident_id=approval.incident_id,
                approval_id=str(approval.id),
            )
            return
        try:
            orig_msg_id = int(msg_id_raw)
        except (TypeError, ValueError):
            return

        from src.core.config import get_settings
        from src.services.telegram_gateway import get_telegram_gateway

        tg_settings = get_settings()
        gateway = get_telegram_gateway()
        target_chat_id = tg_settings.SRE_GROUP_CHAT_ID or tg_settings.OPENCLAW_TG_CHAT_ID

        # 2026-04-19 ogt + Claude Opus 4.7, AP-2 fix: besides replying, also edit the
        # original card to remove its buttons, so it does not stay on "executing" forever.
        try:
            await gateway._send_request("editMessageReplyMarkup", {
                "chat_id": target_chat_id,
                "message_id": orig_msg_id,
                "reply_markup": {"inline_keyboard": []},
            })
        except Exception as _edit_e:
            logger.debug(
                "push_execution_edit_buttons_failed",
                approval_id=str(approval.id),
                error=str(_edit_e),
            )

        # Append the KM / Playbook delta.
        # NOTE(review): both counts cover every row touched in the last 2 minutes;
        # they are not filtered by this incident.
        km_info = ""
        try:
            from sqlalchemy import text as _sql

            from src.db.base import get_db_context

            async with get_db_context() as _db:
                _km_row = await _db.execute(
                    _sql(
                        "SELECT COUNT(*) FROM knowledge_entries "
                        "WHERE created_at > NOW() - interval '2 minutes'"
                    ),
                )
                _km_count = _km_row.scalar() or 0
                _pb_row = await _db.execute(
                    _sql(
                        "SELECT COUNT(*) FROM playbooks "
                        "WHERE updated_at > NOW() - interval '2 minutes'"
                    ),
                )
                _pb_count = _pb_row.scalar() or 0
            if _km_count or _pb_count:
                km_info = f"\n📚 KM +{_km_count} 🎯 Playbook 更新×{_pb_count}"
        except Exception as _km_e:
            # The delta is optional; log instead of silently dropping the failure.
            logger.debug(
                "push_execution_km_info_failed",
                approval_id=str(approval.id),
                error=str(_km_e),
            )

        # The message uses parse_mode=HTML, so free text must be escaped: a raw
        # "<", ">" or "&" (e.g. "2>&1") is otherwise read as markup and can make
        # Telegram reject the message. Truncate first so that an entity is never cut in half.
        action_html = html.escape((approval.action or "")[:180], quote=False)
        if success:
            text = (
                f"✅ <b>執行成功</b>\n"
                f"<code>{action_html}</code>"
                f"{km_info}"
            )
        else:
            err_html = html.escape((error or "未知錯誤")[:150], quote=False)
            text = (
                f"❌ <b>執行失敗</b>\n"
                f"<code>{action_html}</code>\n"
                f"原因: {err_html}"
                f"{km_info}"
            )

        await gateway._send_request(
            "sendMessage",
            {
                "chat_id": target_chat_id,
                "text": text,
                "parse_mode": "HTML",
                "reply_to_message_id": orig_msg_id,
            },
        )
        logger.info(
            "push_execution_result_sent",
            incident_id=approval.incident_id,
            approval_id=str(approval.id),
            success=success,
            orig_msg_id=orig_msg_id,
        )
    except Exception as e:
        logger.warning(
            "push_execution_result_failed",
            approval_id=str(approval.id),
            error=str(e),
        )
async def _get_anomaly_key_from_approval(self, approval: ApprovalRequest) -> str | None:
    """
    Resolve approval -> incident -> anomaly key.

    2026-04-07 Claude Code: I1+S1 fix — key derivation is delegated to
    AnomalyCounter.derive_key_from_incident().

    Returns:
        The derived key, or None when the approval has no incident id, the incident
        is not in working memory, or any step raises (the error is logged).
    """
    try:
        if approval.incident_id:
            from src.services.incident_service import get_incident_service

            incident = await get_incident_service().get_from_working_memory(
                approval.incident_id
            )
            if incident:
                from src.services.anomaly_counter import AnomalyCounter

                return AnomalyCounter.derive_key_from_incident(incident)
        return None
    except Exception as lookup_error:
        logger.warning("get_anomaly_key_from_approval_failed", error=str(lookup_error))
        return None
async def _trigger_learning(
    self,
    approval: ApprovalRequest,
    success: bool,
    duration_seconds: float = 0,
    error_message: str | None = None,
) -> None:
    """
    ADR-030 Phase 5: feed an execution outcome to the learning service, then to KM.

    The learning service receives the result via process_execution_result; any
    exception from that step is logged and swallowed. The KM write happens
    afterwards, outside that try block, so it runs even when learning failed.

    Args:
        approval: The executed approval request.
        success: Whether execution succeeded.
        duration_seconds: Execution duration in seconds.
        error_message: Error text for a failed execution, if any.
    """
    try:
        # The learning service has its own ExecutionResult type; alias it locally so
        # it cannot be confused with the executor's ExecutionResult.
        from src.services.learning_service import ExecutionResult as _LearningResult
        from src.services.learning_service import get_learning_service

        outcome = _LearningResult(
            approval_id=str(approval.id),
            incident_id=approval.incident_id or "",
            action=approval.action,
            success=success,
            error_message=error_message,
            duration_seconds=duration_seconds,
        )
        await get_learning_service().process_execution_result(
            approval=approval,
            result=outcome,
        )
    except Exception as learning_error:
        # A learning failure must not affect the main flow.
        logger.warning(
            "learning_trigger_failed",
            approval_id=str(approval.id),
            error=str(learning_error),
        )

    # 2026-04-04 ogt: persist the execution result to KM — kept outside the try/except
    # so it is written even when learning fails (every anomaly and auto-repair record
    # must be written back to KM).
    # P1.5 fix 2026-04-24 ogt + Claude Sonnet 4.6: fire-and-forget -> awaited.
    # P1-1 2026-04-28 ogt + Claude Sonnet 4.6: uses the public write_execution_result_to_km;
    # timeout / retry / DLQ are handled centrally by km_writer.py.
    await self.write_execution_result_to_km(approval, success, error_message)
async def _run_post_execution_verify(
    self,
    approval: "ApprovalRequest",
    action_taken: str,
) -> None:
    """
    ADR-081 Phase 1: post-execution verification wrapper.

    Steps:
    1. Load the incident for approval.incident_id (working memory first, then
       episodic memory); stop with a warning when neither has it.
    2. Fetch the latest evidence snapshot for the incident.
    3. Call PostExecutionVerifier.verify() with incident, snapshot and action.
    4. Pass the verification result on to the learning service (Phase 3).

    Does nothing when the approval has no incident id. All exceptions are logged
    and swallowed, so verification never changes the execution outcome.
    """
    if not approval.incident_id:
        return

    try:
        # 2026-04-26 critic-B2 hotfix by Claude Opus 4.7: get_latest_snapshot is a
        # module-level async function, not an EvidenceSnapshot classmethod.
        from src.services.evidence_snapshot import get_latest_snapshot
        from src.services.incident_service import get_incident_service
        from src.services.post_execution_verifier import get_post_execution_verifier

        incidents = get_incident_service()
        # 2026-04-25 L1 fix: IncidentService has no get_incident(); use
        # get_from_working_memory() / get_from_episodic_memory() instead.
        incident = await incidents.get_from_working_memory(approval.incident_id)
        if incident is None:
            incident = await incidents.get_from_episodic_memory(approval.incident_id)
            if incident is None:
                logger.warning(
                    "post_verify_incident_not_found",
                    approval_id=str(approval.id),
                    incident_id=approval.incident_id,
                )
                return

        # Latest EvidenceSnapshot (only present when the Phase 1 flag was active).
        snapshot = await get_latest_snapshot(approval.incident_id)
        verification_result = await get_post_execution_verifier().verify(
            incident=incident,
            snapshot=snapshot,
            action_taken=action_taken,
        )
        logger.info(
            "post_verify_complete",
            approval_id=str(approval.id),
            incident_id=approval.incident_id,
            result=verification_result,
            action=action_taken,
        )

        # ADR-083 Phase 3 root cause 3: wire the verification result into learning.
        # Environment verification (Pod Running / metrics recovered) is a more precise
        # learning signal than the execution exit code.
        # 2026-04-15 ogt + Claude Sonnet 4.6 (APAC)
        try:
            from src.services.learning_service import get_learning_service

            playbook_id = getattr(approval, "matched_playbook_id", None)
            await get_learning_service().record_verification_result(
                incident_id=approval.incident_id,
                action_taken=action_taken,
                verification_result=verification_result,
                matched_playbook_id=playbook_id,
            )
        except Exception as learning_error:
            logger.warning(
                "post_verify_learning_failed",
                approval_id=str(approval.id),
                error=str(learning_error),
            )
    except Exception as verify_error:
        # A verification failure must not affect the execution result.
        logger.warning(
            "post_verify_failed",
            approval_id=str(approval.id),
            error=str(verify_error),
        )
@staticmethod
def _is_auto_approved_request(approval: "ApprovalRequest") -> bool:
requested_by = (getattr(approval, "requested_by", "") or "").lower()
return requested_by.startswith("auto_approve")
@staticmethod
def _is_observation_only_action(action: str | None) -> bool:
action_upper = (action or "").strip().upper()
return (
not action_upper
or "NO_ACTION" in action_upper
or "NO-ACTION" in action_upper
or "NOACTION" in action_upper
or action_upper.startswith("OBSERVE")
or action_upper.startswith("INVESTIGATE")
)
@staticmethod
def _approval_risk_value(approval: "ApprovalRequest") -> str | None:
risk_level = getattr(approval, "risk_level", None)
if risk_level is None:
return None
return getattr(risk_level, "value", str(risk_level))
async def finalize_auto_approved_execution(
    self,
    approval: "ApprovalRequest",
    *,
    success: bool,
    error_message: str | None = None,
) -> None:
    """
    Backfill the incident-linked evidence chain for an auto-approved execution.

    The CS2/CS3 webhook path calls execute_approved_action() before the
    Incident is created, so the executor has no incident_id at that moment and
    the verifier / KM / auto_repair_executions records cannot be tied back to
    the same alert card. This method is meant to run after the incident exists
    and only adds durable trace records; it never re-executes the action.

    Steps, each best-effort (a failure is logged as a warning and the next
    step still runs):
      1. Record an auto_repair_executions row unless a matching one exists.
      2. Add an "exec" timeline event.
      3. Write the execution result to KM.
      4. Run the post-execution verifier when its feature flag is enabled.
      5. Resolve the incident when ``success`` is True.

    The method returns early (doing nothing) when the approval is not
    auto-approved, has no incident_id, or carries an observation-only action.

    Args:
        approval: The already-executed approval request.
        success: Whether the earlier execution succeeded.
        error_message: Failure detail; a generic message is substituted when
            ``success`` is False and no message is given.
    """
    if not self._is_auto_approved_request(approval):
        return

    incident_id = getattr(approval, "incident_id", None)
    if not incident_id:
        logger.warning(
            "auto_approved_execution_finalize_skipped_no_incident",
            approval_id=str(getattr(approval, "id", "")),
            requested_by=getattr(approval, "requested_by", None),
        )
        return

    if self._is_observation_only_action(getattr(approval, "action", None)):
        logger.info(
            "auto_approved_execution_finalize_skipped_observation_only",
            approval_id=str(approval.id),
            incident_id=incident_id,
            action=(approval.action or "")[:120],
        )
        return

    parsed = parse_operation_from_action(approval.action)
    operation_type = parsed.operation_type
    resource_name = parsed.resource_name or "unknown"
    namespace = parsed.namespace or "default"
    # Fall back to the approval id when no playbook was matched; values are
    # truncated to 36 / 200 / 50 characters respectively.
    playbook_id = str(getattr(approval, "matched_playbook_id", None) or approval.id)[:36]
    operation_label = operation_type.value if operation_type else "unknown"
    playbook_name = f"approval_auto_execute:{operation_label}:{resource_name}"[:200]
    triggered_by = (getattr(approval, "requested_by", None) or "auto_approve")[:50]
    action_taken = f"auto_repair_playbook:{playbook_id}:{operation_label}:{resource_name}"
    if not success:
        action_taken = f"{action_taken}:FAILED"
        error_message = error_message or "auto-approved executor returned failure; see approval/aol logs"

    # Step 1: durable execution record, de-duplicated on
    # (playbook_id, triggered_by, action) so repeated finalize calls are harmless.
    try:
        from src.repositories.audit_log_repository import get_auto_repair_execution_repository

        repo = get_auto_repair_execution_repository()
        existing = await repo.list_by_incident(incident_id)
        already_recorded = any(
            str(getattr(row, "playbook_id", "")) == playbook_id
            and getattr(row, "triggered_by", "") == triggered_by
            and (approval.action or "") in list(getattr(row, "executed_steps", []) or [])
            for row in existing
        )
        if not already_recorded:
            await repo.create(
                incident_id=incident_id,
                playbook_id=playbook_id,
                playbook_name=playbook_name,
                success=success,
                executed_steps=[approval.action],
                error_message=error_message,
                triggered_by=triggered_by,
                risk_level=self._approval_risk_value(approval),
            )
        else:
            logger.info(
                "auto_approved_execution_record_already_exists",
                approval_id=str(approval.id),
                incident_id=incident_id,
                playbook_id=playbook_id,
            )
    except Exception as exc:
        logger.warning(
            "auto_approved_execution_record_failed",
            approval_id=str(approval.id),
            incident_id=incident_id,
            error=str(exc),
        )

    # Step 2: timeline event linked to both the approval and the incident.
    try:
        timeline = get_timeline_service()
        await timeline.add_event(
            event_type="exec",
            status="success" if success else "error",
            title=f"{'' if success else ''} 自動批准執行已補鏈: {operation_label}",
            description=(
                f"Target: {resource_name} @ {namespace}; "
                f"source={triggered_by}; action={approval.action[:160]}"
            ),
            actor="leWOOOgo",
            actor_role="executor",
            approval_id=str(approval.id),
            incident_id=incident_id,
        )
    except Exception as exc:
        logger.warning(
            "auto_approved_execution_timeline_failed",
            approval_id=str(approval.id),
            incident_id=incident_id,
            error=str(exc),
        )

    # Step 3: knowledge-base record.
    try:
        await self.write_execution_result_to_km(approval, success, error_message)
    except Exception as exc:
        logger.warning(
            "auto_approved_execution_km_failed",
            approval_id=str(approval.id),
            incident_id=incident_id,
            error=str(exc),
        )

    # Step 4: post-execution verifier (feature-flagged, bounded by a timeout).
    # Like the other steps this is best-effort: any failure here, not only a
    # timeout, must not prevent the incident from being resolved below.
    try:
        from src.core.feature_flags import aiops_flags

        if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"):
            await asyncio.wait_for(
                self._run_post_execution_verify(
                    approval=approval,
                    action_taken=action_taken,
                ),
                timeout=_VERIFIER_AWAIT_TIMEOUT_SEC,
            )
    except asyncio.TimeoutError:
        logger.warning(
            "auto_approved_execution_post_verify_timeout",
            approval_id=str(approval.id),
            incident_id=incident_id,
            timeout_sec=_VERIFIER_AWAIT_TIMEOUT_SEC,
        )
    except Exception as exc:
        logger.warning(
            "auto_approved_execution_post_verify_failed",
            approval_id=str(approval.id),
            incident_id=incident_id,
            error=str(exc),
        )

    # Step 5: a successful execution closes out the incident.
    if success:
        try:
            from src.services.incident_service import get_incident_service

            await get_incident_service().resolve_incident(incident_id)
            logger.info(
                "incident_resolved_after_auto_approved_execution_finalize",
                incident_id=incident_id,
                approval_id=str(approval.id),
            )
        except Exception as exc:
            logger.warning(
                "incident_resolve_after_auto_approved_execution_finalize_failed",
                incident_id=incident_id,
                approval_id=str(approval.id),
                error=str(exc),
            )
async def write_execution_result_to_km(
    self,
    approval: "ApprovalRequest",
    success: bool,
    error_message: str | None,
) -> None:
    """
    Persist an execution result into the knowledge base (KM).

    Both successful and failed executions are written. The entry distinguishes
    the auto-approve path from the human-approval path and, when the approval
    is linked to an incident, is enriched with alertname / alert_category /
    affected_services so it can be retrieved by RAG. The write itself is
    delegated to ``km_write_with_flag`` with a ``KMWritePayload``.

    Args:
        approval: The executed approval request.
        success: Whether the execution succeeded.
        error_message: Failure detail shown in the entry when ``success`` is
            False; a placeholder is used when it is empty.
    """
    from src.models.knowledge import EntrySource, EntryType
    from src.services.km_writer import KMWritePayload, km_write_with_flag

    # Source identification: auto-approved vs. human-approved.
    _is_auto = self._is_auto_approved_request(approval)
    _mode_prefix = "[自動修復]" if _is_auto else "[人工修復]"
    _mode_tag = "auto_executed" if _is_auto else "human_approved"

    status_icon = "" if success else ""
    status_text = "成功" if success else f"失敗: {error_message or '未知原因'}"
    _status_tag = "success" if success else "failure"

    # Guard against a missing action, as the other methods of this service do.
    action_text = approval.action or ""
    # Reuse the shared helper so both enum and plain-string risk levels work.
    risk_text = self._approval_risk_value(approval) or "未知"

    # Enrich with metadata from the linked incident (best-effort).
    alertname = "unknown"
    alert_category = "general"
    affected_services: list[str] = []
    if approval.incident_id:
        try:
            from src.services.incident_service import get_incident_service

            _svc = get_incident_service()
            # Working memory first, then fall back to episodic memory.
            _inc = await _svc.get_from_working_memory(approval.incident_id)
            if _inc is None:
                _inc = await _svc.get_from_episodic_memory(approval.incident_id)
            if _inc:
                if _inc.signals:
                    alertname = _inc.signals[0].labels.get("alertname", "unknown") or "unknown"
                alert_category = getattr(_inc, "alert_category", "") or "general"
                affected_services = list(_inc.affected_services or [])
        except Exception as _ie:
            logger.debug("km_incident_enrich_failed",
                         incident_id=approval.incident_id, error=str(_ie))

    _services_str = ", ".join(affected_services) if affected_services else "未關聯"
    content = (
        f"# {status_icon} {_mode_prefix} {alertname}\n\n"
        f"**告警名稱**: {alertname}\n"
        f"**告警類別**: {alert_category}\n"
        f"**受影響服務**: {_services_str}\n"
        f"**執行命令**: `{action_text[:200]}`\n"
        f"**執行結果**: {status_text}\n"
        f"**風險等級**: {risk_text}\n"
        f"**執行路徑**: {'自動執行 (confidence >= 0.65)' if _is_auto else '人工審核批准'}\n"
        f"**Incident ID**: {approval.incident_id or '未關聯'}\n"
        f"**Approval ID**: {approval.id}\n\n"
        f"## 操作描述\n{approval.description or '無描述'}\n"
    )

    # Tags: mode + status + category, for multi-dimensional retrieval.
    tags = [_mode_tag, _status_tag, alert_category, "execution"]
    if not success:
        tags.append("execution_failed")

    if not _is_auto:
        path_type = "approval_manual"
    elif success:
        path_type = "approval_auto_ok"
    else:
        path_type = "approval_auto_fail"

    payload = KMWritePayload(
        path_type=path_type,
        entry_create_kwargs=dict(
            title=f"{_mode_prefix} {alertname}: {action_text[:50]}",
            content=content,
            entry_type=EntryType.INCIDENT_CASE,
            category=alert_category,
            tags=tags,
            source=EntrySource.AI_EXTRACTED,
            related_incident_id=approval.incident_id or None,
            created_by="auto_execute" if _is_auto else "approval_execution",
        ),
        incident_id=approval.incident_id or None,
        approval_id=str(approval.id),
    )
    await km_write_with_flag(payload)
async def _send_execution_notification(
    self,
    approval: ApprovalRequest,
    execution_status: "ExecutionStatus",
    operation_type: str,
    namespace: str,
    duration_ms: int | None = None,
    error_message: str | None = None,
) -> None:
    """
    Phase 6 post-execution hook: broadcast the execution result.

    Builds a ``NotificationMessage`` from the approval and sends it through
    the notification manager's ``send_all``, logging one line per result.
    Does nothing when ``settings.NOTIFICATION_ENABLED`` is falsy. Any error
    while building or sending is logged and swallowed.
    """
    from src.services.notifications import (
        NotificationMessage,
        get_notification_manager,
    )

    if not settings.NOTIFICATION_ENABLED:
        logger.info("notification_disabled", approval_id=str(approval.id))
        return

    try:
        # Signer list for the message.
        signers = []
        for signature in approval.signatures:
            signers.append(
                {"name": signature.signer_name, "comment": signature.comment or ""}
            )

        # Blast-radius fields fall back to neutral defaults when absent.
        blast = approval.blast_radius
        description = approval.description

        message = NotificationMessage(
            execution_status=execution_status,
            action_title=approval.action[:100],
            action_description=description[:200] if description else "",
            approval_id=str(approval.id),
            signers=signers,
            required_signatures=approval.required_signatures,
            affected_pods=blast.affected_pods if blast else 0,
            estimated_downtime=blast.estimated_downtime if blast else "N/A",
            related_services=blast.related_services if blast else [],
            data_impact=blast.data_impact.value if blast else "none",
            namespace=namespace,
            operation_type=operation_type,
            duration_ms=duration_ms,
            error_message=error_message,
            risk_level=approval.risk_level.value,
            ai_provider=approval.requested_by,
        )

        # Fan out to every configured channel and log each outcome.
        outcomes = await get_notification_manager().send_all(message)
        for outcome in outcomes:
            logger.info(
                "notification_result",
                approval_id=str(approval.id),
                provider=outcome.provider,
                status=outcome.status.value,
                message=outcome.message,
            )
    except Exception as e:
        logger.exception(
            "notification_failed",
            approval_id=str(approval.id),
            error=str(e),
        )
async def _trigger_playbook_extraction(
    self,
    approval: ApprovalRequest,
) -> None:
    """
    Phase 7.6: trigger automatic Playbook extraction after an execution.

    Looks up the incident linked to the approval, marks its outcome as a
    successful execution (effectiveness_score raised to at least 4, status
    set to RESOLVED unless already RESOLVED/CLOSED), saves it back to working
    memory, then asks the playbook service to extract a playbook from it.

    Fire-and-forget: every failure is logged as a warning and swallowed.
    """
    try:
        # 1. Prefer approval.incident_id; fall back to scanning the approval
        #    text for an INC- id (backward compatibility with old data).
        incident_id = (
            getattr(approval, "incident_id", None)
            or self._extract_incident_id_from_approval(approval)
        )
        if not incident_id:
            logger.info(
                "playbook_extraction_skipped",
                approval_id=str(approval.id),
                reason="No incident_id found in approval.incident_id or text",
            )
            return

        # 2. Load the incident: working memory first, then episodic memory.
        from src.services.incident_service import get_incident_service

        incident_service = get_incident_service()
        incident = await incident_service.get_from_working_memory(incident_id)
        if incident is None:
            incident = await incident_service.get_from_episodic_memory(incident_id)
        if not incident:
            logger.info(
                "playbook_extraction_skipped",
                approval_id=str(approval.id),
                incident_id=incident_id,
                reason="Incident not found",
            )
            return

        # 3. Fill in the outcome automatically so the extraction
        #    preconditions hold without waiting for a human score.
        from src.models.incident import IncidentOutcome, IncidentStatus
        from src.utils.timezone import now_taipei

        if incident.outcome is None:
            incident.outcome = IncidentOutcome()
        outcome = incident.outcome

        if not outcome.execution_success:
            outcome.execution_success = True

        score = outcome.effectiveness_score
        if score is None or score < 4:
            # System judgement: a successful execution counts as effective.
            outcome.effectiveness_score = 4

        if incident.status not in (IncidentStatus.RESOLVED, IncidentStatus.CLOSED):
            incident.status = IncidentStatus.RESOLVED
            incident.resolved_at = now_taipei()

        # Keep the executed command in learning_notes (only when empty) so
        # the playbook service can derive repair steps from it.
        if not outcome.learning_notes and approval.action:
            outcome.learning_notes = approval.action

        # Persist the updated incident.
        await incident_service.save_to_working_memory(incident)
        logger.info(
            "playbook_extraction_incident_updated",
            approval_id=str(approval.id),
            incident_id=incident_id,
            effectiveness_score=incident.outcome.effectiveness_score,
            status=incident.status.value,
        )

        # 4. Run the extraction; a score of 5 or more auto-approves the result.
        from src.services.playbook_service import get_playbook_service

        playbook_service = get_playbook_service()
        effectiveness = incident.outcome.effectiveness_score or 4
        playbook = await playbook_service.extract_from_incident(
            incident=incident,
            auto_approve=effectiveness >= 5,
        )

        if not playbook:
            logger.debug(
                "playbook_extraction_no_result",
                approval_id=str(approval.id),
                incident_id=incident_id,
            )
        else:
            logger.info(
                "playbook_auto_extracted",
                approval_id=str(approval.id),
                incident_id=incident_id,
                playbook_id=playbook.playbook_id,
                playbook_name=playbook.name,
                auto_approved=playbook.status.value == "approved",
            )
    except Exception as e:
        # An extraction failure must not affect the main flow.
        logger.warning(
            "playbook_extraction_error",
            approval_id=str(approval.id),
            error=str(e),
        )
def _extract_incident_id_from_approval(
    self,
    approval: ApprovalRequest,
) -> str | None:
    """
    Find an incident id of the form ``INC-...`` in the approval's text fields.

    Sources are searched in this order, and the first match wins:
      1. ``description`` and ``action`` (joined with a space)
      2. ``requested_by``

    Returns the full matched id (including the ``INC-`` prefix), or ``None``
    when no source contains one.
    """
    import re

    incident_pattern = re.compile(r"INC-([A-Z0-9-]+)")
    candidates = (
        f"{approval.description or ''} {approval.action or ''}",
        approval.requested_by or "",
    )
    for candidate in candidates:
        found = incident_pattern.search(candidate)
        if found:
            return found.group(0)
    return None
# =========================================================================
# ADR-090 § AOL Writer (2026-04-19 ogt + Claude Opus 4.7 亞太)
# 把 approval execution 的生命週期回灌 automation_operation_log.
# 之前 33 件/7d approval 動作完全沒寫入 aol,只有 drift_narrator 的
# 22 筆 notification_formatted。修復後每次執行都留痕。
# =========================================================================
async def _log_aol_started(self, approval: ApprovalRequest) -> str | None:
    """
    Insert a 'pending' row into automation_operation_log for this execution.

    The row's ``input`` JSON carries the approval id, incident id, action
    (first 500 characters), risk level, requester, and the target /
    operation / namespace parsed from the action, so a failed run can be
    traced back to its target directly from the log.

    Returns:
        The new row's ``op_id`` as a string (to be passed to
        ``_log_aol_completed``), or ``None`` when the write fails. Failures
        are logged and never block the execution flow.
    """
    try:
        from sqlalchemy import text as _sql
        from src.db.base import get_db_context
        import json as _json

        # Best-effort parse of target / operation type; a parse failure only
        # leaves these fields empty.
        _parsed_target: str | None = None
        _parsed_op: str | None = None
        _parsed_ns: str | None = None
        try:
            _parsed = parse_operation_from_action(approval.action or "")
            _parsed_target = _parsed.resource_name
            _parsed_op = _parsed.operation_type.value if _parsed.operation_type else None
            _parsed_ns = _parsed.namespace
        except Exception as parse_err:
            logger.debug(
                "aol_started_parse_failed",
                approval_id=str(approval.id),
                error=str(parse_err),
            )

        input_payload = {
            "approval_id": str(approval.id),
            "incident_id": approval.incident_id or "",
            "action": (approval.action or "")[:500],
            # Normalise through the shared helper so an enum risk level is
            # stored as its plain value and always JSON-serialisable.
            "risk_level": self._approval_risk_value(approval) or "",
            "requested_by": getattr(approval, "requested_by", "") or "",
            # Target source trace.
            "parsed_target": _parsed_target or "",
            "parsed_operation": _parsed_op or "",
            "parsed_namespace": _parsed_ns or "",
        }

        async with get_db_context() as db:
            row = await db.execute(
                _sql("""
INSERT INTO automation_operation_log (
operation_type, actor, status,
input, output, tags
) VALUES (
'playbook_executed',
'approval_execution',
'pending',
CAST(:input AS jsonb),
'{}'::jsonb,
:tags
)
RETURNING op_id
"""),
                {
                    "input": _json.dumps(input_payload, ensure_ascii=False),
                    "tags": ["approval", "execution", "playbook"],
                },
            )
            op_id = row.scalar()
            return str(op_id) if op_id else None
    except Exception as e:
        logger.warning("aol_started_write_failed", approval_id=str(approval.id), error=str(e))
        return None
async def _log_aol_completed(
    self,
    op_id: str | None,
    status: str,
    duration_ms: int,
    output: dict | None = None,
    error: str | None = None,
    stderr: str | None = None,
) -> None:
    """
    Update the automation_operation_log row with the final execution result.

    Writes status, duration, the output summary (as JSON), the error text
    (first 2000 characters) and stderr (first 8000 characters).

    ``status`` must be a value the aol constraint allows:
        pending | success | failed | dry_run | rolled_back

    A falsy ``op_id`` (the 'started' write failed) makes this a silent no-op.
    Write failures are logged and swallowed.
    """
    if not op_id:
        return

    try:
        from sqlalchemy import text as _sql
        from src.db.base import get_db_context
        import json as _json

        statement = _sql("""
UPDATE automation_operation_log
SET status = :status,
duration_ms = :duration_ms,
output = CAST(:output AS jsonb),
error = :error,
stderr_feed_back = :stderr
WHERE op_id = CAST(:op_id AS uuid)
""")
        params = {
            "status": status,
            "duration_ms": duration_ms,
            "output": _json.dumps(output or {}, ensure_ascii=False),
            # Empty strings are stored as NULL, long texts are truncated.
            "error": error[:2000] if error else None,
            "stderr": stderr[:8000] if stderr else None,
            "op_id": op_id,
        }
        async with get_db_context() as db:
            await db.execute(statement, params)
    except Exception as e:
        logger.warning("aol_completed_write_failed", op_id=op_id, error=str(e))
# =============================================================================
# Singleton Instance
# =============================================================================
_execution_service: ApprovalExecutionService | None = None


def get_execution_service() -> ApprovalExecutionService:
    """
    Return the module-wide ApprovalExecutionService singleton.

    The instance is created on the first call and reused afterwards.

    Returns:
        ApprovalExecutionService: the shared execution service instance.
    """
    global _execution_service
    service = _execution_service
    if service is None:
        service = _execution_service = ApprovalExecutionService()
    return service