diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py
index 10a4edc9..584aa822 100644
--- a/apps/api/src/services/decision_manager.py
+++ b/apps/api/src/services/decision_manager.py
@@ -259,6 +259,91 @@ async def _push_decision_to_telegram(
)
def _promql_string_escape(value: str) -> str:
    """Escape *value* so it can sit inside a double-quoted PromQL string literal."""
    return (
        str(value)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
    )


def _promql_regex_escape(value: str) -> str:
    """Escape *value* as a literal for a PromQL regex matcher (``=~"..."``).

    Regex metacharacters are neutralised first, then the result is escaped a
    second time for the surrounding double-quoted string literal.
    """
    import re

    return _promql_string_escape(re.escape(str(value)))


async def _query_latest_sample(prom, query: str):
    """Run an instant query and return the last finite sample value.

    Args:
        prom: Provider object exposing the ``_instant_query`` coroutine.
        query: PromQL expression to evaluate.

    Returns:
        The value of the last result entry as a ``float``, or ``None`` when
        the response status is not ``"success"``, the result list is empty,
        or every sample is non-finite.
    """
    import math

    response = await prom._instant_query(query)
    if response.get("status") != "success":
        return None

    latest = None
    for sample in response.get("data", {}).get("result", []):
        value = float(sample["value"][1])
        # "NaN" / "+Inf" parse as floats but are useless in a snapshot and
        # would make int() raise for the restart counter, so skip them.
        if math.isfinite(value):
            latest = value
    return latest


async def _fetch_metrics_snapshot(incident: Incident) -> dict:
    """
    ADR-071-I: Fetch a snapshot of the Prometheus metrics relevant to this incident.

    The metrics are chosen from the ``alertname`` label of the incident's
    first signal:

    - ``HostHighCpuLoad`` / ``HostOutOfMemory``: ``cpu_pct`` (only when an
      ``instance`` label is present) and ``mem_pct``.
    - ``HostOutOfDiskSpace``: ``disk_pct``.
    - ``PodRestartingTooMuch`` / ``PodCrashLoopBackOff``: ``restart_count``
      (only when a ``pod`` or ``component`` label is present).

    Label values are escaped before being placed into PromQL so that regex
    metacharacters (for example the dots of an IP address) and quotes in a
    label cannot change the meaning of the query.

    Best effort: any failure is logged at debug level and an empty dict is
    returned, so the caller's main flow is never blocked.

    Args:
        incident: Incident whose first signal's labels drive the queries.

    Returns:
        Dict with any of ``cpu_pct``, ``mem_pct``, ``disk_pct`` (floats
        rounded to one decimal) and ``restart_count`` (int); empty when the
        provider is disabled, nothing matched, or an error occurred.

    2026-04-11 Claude Sonnet 4.6 Asia/Taipei
    """
    try:
        from src.plugins.mcp.providers.prometheus_provider import PrometheusProvider

        prom = PrometheusProvider()
        if not prom.enabled:
            return {}

        # A missing signal list or a None labels mapping both mean "no labels".
        labels = (incident.signals[0].labels if incident.signals else None) or {}
        alertname = labels.get("alertname", "")
        instance = labels.get("instance", "")
        snapshots: dict = {}

        # Pick the most relevant metrics for the alert type.
        if alertname in ("HostHighCpuLoad", "HostOutOfMemory"):
            if instance:
                # Drop the port so the CPU query matches the host on any port.
                host_re = _promql_regex_escape(instance.split(":")[0])
                cpu = await _query_latest_sample(
                    prom,
                    f'100 - (avg by(instance) (irate(node_cpu_seconds_total{{mode="idle",instance=~"{host_re}.*"}}[5m])) * 100)',
                )
                if cpu is not None:
                    snapshots["cpu_pct"] = round(cpu, 1)

                # Exact match on the full instance label; no regex needed.
                inst = _promql_string_escape(instance)
                mem_query = (
                    f'(1 - (node_memory_MemAvailable_bytes{{instance="{inst}"}} / node_memory_MemTotal_bytes{{instance="{inst}"}})) * 100'
                )
            else:
                mem_query = "100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)"

            mem = await _query_latest_sample(prom, mem_query)
            if mem is not None:
                snapshots["mem_pct"] = round(mem, 1)

        elif alertname == "HostOutOfDiskSpace":
            disk = await _query_latest_sample(
                prom,
                'max(100 - ((node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100))',
            )
            if disk is not None:
                snapshots["disk_pct"] = round(disk, 1)

        elif alertname in ("PodRestartingTooMuch", "PodCrashLoopBackOff"):
            # Fall back to the component label when pod is missing or empty.
            pod = labels.get("pod") or labels.get("component", "")
            if pod:
                pod_re = _promql_regex_escape(pod)
                restarts = await _query_latest_sample(
                    prom,
                    f'sum(kube_pod_container_status_restarts_total{{namespace="awoooi-prod",pod=~"{pod_re}.*"}})',
                )
                if restarts is not None:
                    snapshots["restart_count"] = int(restarts)

        return snapshots
    except Exception as _e:
        # Deliberate best effort: metrics are decoration, never a blocker.
        logger.debug("metrics_snapshot_failed", incident_id=incident.incident_id, error=str(_e))
        return {}
+
+
+def _format_metrics_delta(before: dict, after: dict) -> str:
+ """
+ ADR-071-I: 格式化指標前後對比文字
+ 例:CPU 92%→23% | Mem 78%→45%
+ """
+ if not before and not after:
+ return ""
+ parts = []
+ for key, label in [("cpu_pct", "CPU"), ("mem_pct", "Mem"), ("disk_pct", "Disk")]:
+ b = before.get(key)
+ a = after.get(key)
+ if b is not None and a is not None:
+ parts.append(f"{label} {b}%→{a}%")
+ elif b is not None:
+ parts.append(f"{label} {b}% (before)")
+ elif a is not None:
+ parts.append(f"{label} {a}% (after)")
+ for key, label in [("restart_count", "Restarts")]:
+ b = before.get(key)
+ a = after.get(key)
+ if b is not None and a is not None:
+ parts.append(f"{label} {b}→{a}")
+ return " | ".join(parts)
+
+
async def _push_auto_repair_result(
incident: Incident,
action: str,
@@ -270,7 +355,10 @@ async def _push_auto_repair_result(
統帥要求: 所有狀態變更必須在原告警訊息延續,不發新訊息。
- append_incident_update() 取 Redis tg_msg:{id} → reply 原訊息 + 換按鈕
- 找不到 message_id 時 fallback 到 send_notification(降級)
+
+ ADR-071-I: 成功時抓 metrics_after 快照,寫入 incidents 表,並在通知中顯示前後對比
2026-04-09 Claude Sonnet 4.6 Asia/Taipei
+ 2026-04-11 Claude Sonnet 4.6: +ADR-071-I 指標快照
"""
try:
from src.services.telegram_gateway import get_telegram_gateway
@@ -279,10 +367,37 @@ async def _push_auto_repair_result(
target = incident.affected_services[0] if incident.affected_services else "unknown"
inc_id = incident.incident_id
+ # ADR-071-I: 抓 metrics_after(成功時)
+ metrics_delta_text = ""
if success:
+ metrics_after = await _fetch_metrics_snapshot(incident)
+ metrics_before = getattr(incident, "metrics_before", None) or {}
+
+ # 寫入 DB(不阻塞主流程)
+ if metrics_after:
+ try:
+ from src.db.base import get_db_context
+ from src.db.models import Incident as IncidentORM
+ from sqlalchemy import update as _update
+ async with get_db_context() as db:
+ await db.execute(
+ _update(IncidentORM)
+ .where(IncidentORM.incident_id == inc_id)
+ .values(metrics_after=metrics_after)
+ )
+ await db.commit()
+ logger.info("metrics_after_saved", incident_id=inc_id, metrics=metrics_after)
+ except Exception as _e:
+ logger.warning("metrics_after_save_failed", incident_id=inc_id, error=str(_e))
+
+ metrics_delta_text = _format_metrics_delta(metrics_before, metrics_after)
+
+ if success:
+ delta_line = f"\n├ 指標: {metrics_delta_text}" if metrics_delta_text else ""
status_line = (
f"✅ 自動修復完成\n"
- f"└ {action[:100] if action else '已執行'}"
+ f"├ {action[:100] if action else '已執行'}"
+ f"{delta_line}"
)
else:
status_line = (
@@ -716,6 +831,24 @@ class DecisionManager:
risk_level=_risk,
)
+ # ADR-071-I: 執行前抓 metrics_before 快照 (2026-04-11 Claude Sonnet 4.6)
+ _metrics_before = await _fetch_metrics_snapshot(incident)
+ if _metrics_before:
+ try:
+ from src.db.base import get_db_context
+ from src.db.models import Incident as IncidentORM
+ from sqlalchemy import update as _sa_update
+ async with get_db_context() as _db:
+ await _db.execute(
+ _sa_update(IncidentORM)
+ .where(IncidentORM.incident_id == incident.incident_id)
+ .values(metrics_before=_metrics_before)
+ )
+ await _db.commit()
+ except Exception as _mb_err:
+ logger.debug("metrics_before_save_failed",
+ incident_id=incident.incident_id, error=str(_mb_err))
+
# 執行
executor = ApprovalExecutionService()
await executor.execute_approved_action(approval)
diff --git a/apps/api/src/services/km_conversion_service.py b/apps/api/src/services/km_conversion_service.py
index 2bb32cd2..9d3d7531 100644
--- a/apps/api/src/services/km_conversion_service.py
+++ b/apps/api/src/services/km_conversion_service.py
@@ -209,16 +209,31 @@ class KMConversionService:
action_type = decision_chain.get("action_type", "")
action_command = decision_chain.get("action", "")
- # 指標快照(若有)
+ # 指標快照(若有)— ADR-071-J: 使用精簡 delta 格式 (2026-04-11 Claude Sonnet 4.6)
metrics_section = ""
if incident.metrics_before or incident.metrics_after:
mb = incident.metrics_before or {}
ma = incident.metrics_after or {}
+ delta_parts = []
+ for key, label in (("cpu_pct", "CPU"), ("mem_pct", "Mem"), ("disk_pct", "Disk")):
+ if key in mb and key in ma:
+ delta_parts.append(f"{label} {mb[key]:.0f}%→{ma[key]:.0f}%")
+ elif key in ma:
+ delta_parts.append(f"{label} 後={ma[key]:.0f}%")
+ if "restart_count" in mb and "restart_count" in ma:
+ delta_parts.append(f"重啟 {mb['restart_count']}→{ma['restart_count']}")
+ delta_str = " | ".join(delta_parts) if delta_parts else "(無量化數據)"
+
+ # K8s 狀態(若有)
+ k8s_state = getattr(incident, "k8s_state_after", None)
+ k8s_line = f"\n- K8s 狀態: {k8s_state}" if k8s_state else ""
+
metrics_section = (
f"\n## 效果驗證\n"
- f"- 執行前: {mb}\n"
- f"- 執行後: {ma}\n"
+ f"- 指標變化: {delta_str}\n"
f"- 恢復耗時: {resolution_time}\n"
+ + k8s_line
+ + "\n"
)
# 驗證結果(若有)