feat(adr071-ij): TYPE-2 指標快照卡片 + KM 三段資料整合
ADR-071-I: decision_manager 執行前後各抓一次 Prometheus metrics
- _fetch_metrics_snapshot(): 依 alertname 選擇 CPU/Mem/Disk/Restart 查詢
- _format_metrics_delta(): 輸出 "CPU 92%→23% | Mem 78%→45%" 格式
- _push_auto_repair_result(): metrics_after 寫 DB + TYPE-2 卡片顯示 delta
- _auto_execute(): metrics_before 在執行前寫 DB(完成閉環)

ADR-071-J: km_conversion_service._build_content() 使用精簡 delta 格式
- 從 metrics_before/after 產生人讀 delta(CPU/Mem/Disk/重啟次數)
- 附加 k8s_state_after(若有)
- 格式: 症狀 + 根因 + 動作 + 效果數字(症狀→情境→動作→效果)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -259,6 +259,91 @@ async def _push_decision_to_telegram(
|
||||
)
|
||||
|
||||
|
||||
def _escape_promql_regex(value: str) -> str:
    """Escape *value* so it matches literally inside a double-quoted PromQL regex matcher."""
    import re

    # Two layers of escaping are needed: re.escape() neutralises regex
    # metacharacters, then the backslashes it introduced (and any double quote
    # in the value) are escaped again for the PromQL string literal.
    return re.escape(value).replace("\\", "\\\\").replace('"', '\\"')


def _last_finite_sample(response: dict):
    """
    Return the value of the last finite sample of an instant-query response.

    Returns None when the query did not succeed, returned no samples, or
    returned only NaN/Inf samples.
    """
    import math

    if response.get("status") != "success":
        return None
    value = None
    for item in response.get("data", {}).get("result", []):
        candidate = float(item["value"][1])
        # Non-finite samples would break int() conversion and are useless in a
        # before/after comparison, so they are ignored.
        if math.isfinite(candidate):
            value = candidate
    return value


async def _fetch_metrics_snapshot(incident: Incident) -> dict:
    """
    ADR-071-I: fetch a snapshot of the Prometheus metrics relevant to this incident.

    The labels of the incident's first signal select the query:

    - ``HostHighCpuLoad`` / ``HostOutOfMemory``: ``cpu_pct`` (only when an
      ``instance`` label is present) and ``mem_pct``.
    - ``HostOutOfDiskSpace``: ``disk_pct``.
    - ``PodRestartingTooMuch`` / ``PodCrashLoopBackOff``: ``restart_count``
      (only when a ``pod`` or ``component`` label is present).

    Returns:
        A dict holding whichever of the keys above could be resolved; an empty
        dict when the provider is disabled, the alert is not recognised, or
        anything fails. Failures are logged at debug level and never raised, so
        the main flow is not blocked.

    2026-04-11 Claude Sonnet 4.6 Asia/Taipei
    """
    try:
        from src.plugins.mcp.providers.prometheus_provider import PrometheusProvider

        prom = PrometheusProvider()
        if not prom.enabled:
            return {}

        # Tolerate a signal whose labels mapping is missing/None.
        labels = (incident.signals[0].labels if incident.signals else None) or {}
        alertname = labels.get("alertname", "")
        instance = labels.get("instance", "")
        snapshots: dict = {}

        # Pick the metrics most relevant to the alert.
        if alertname in ("HostHighCpuLoad", "HostOutOfMemory"):
            if instance:
                # Match the host on any port. The "(:.*)?" suffix keeps the
                # match anchored to the host so "10.0.0.1" cannot also select
                # "10.0.0.10:9100".
                host_re = _escape_promql_regex(instance.split(":")[0])
                cpu_query = (
                    "100 - (avg by(instance) (irate(node_cpu_seconds_total"
                    f'{{mode="idle",instance=~"{host_re}(:.*)?"}}[5m])) * 100)'
                )
                cpu = _last_finite_sample(await prom._instant_query(cpu_query))
                if cpu is not None:
                    snapshots["cpu_pct"] = round(cpu, 1)

                instance_re = _escape_promql_regex(instance)
                mem_query = (
                    f'(1 - (node_memory_MemAvailable_bytes{{instance=~"{instance_re}"}}'
                    f' / node_memory_MemTotal_bytes{{instance=~"{instance_re}"}})) * 100'
                )
            else:
                # No instance to scope by: report the worst node instead of an
                # arbitrary one, mirroring the max() used by the disk query.
                mem_query = (
                    "max(100 * (1 - node_memory_MemAvailable_bytes"
                    " / node_memory_MemTotal_bytes))"
                )
            mem = _last_finite_sample(await prom._instant_query(mem_query))
            if mem is not None:
                snapshots["mem_pct"] = round(mem, 1)

        elif alertname == "HostOutOfDiskSpace":
            disk = _last_finite_sample(
                await prom._instant_query(
                    'max(100 - ((node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100))'
                )
            )
            if disk is not None:
                snapshots["disk_pct"] = round(disk, 1)

        elif alertname in ("PodRestartingTooMuch", "PodCrashLoopBackOff"):
            pod = labels.get("pod", labels.get("component", ""))
            if pod:
                # Prefix match on purpose: the label may be a workload name
                # shared by several pods, whose restarts are summed.
                pod_re = _escape_promql_regex(pod)
                restarts = _last_finite_sample(
                    await prom._instant_query(
                        f'sum(kube_pod_container_status_restarts_total{{namespace="awoooi-prod",pod=~"{pod_re}.*"}})'
                    )
                )
                if restarts is not None:
                    snapshots["restart_count"] = int(restarts)

        return snapshots
    except Exception as _e:
        # Deliberately best-effort: a metrics failure must never break the
        # repair flow, so it is only logged.
        logger.debug("metrics_snapshot_failed", incident_id=incident.incident_id, error=str(_e))
        return {}
|
||||
|
||||
|
||||
def _format_metrics_delta(before: dict, after: dict) -> str:
    """
    ADR-071-I: format a before/after metrics comparison as one line of text.

    Example: ``CPU 92%→23% | Mem 78%→45%``

    Args:
        before: Snapshot taken before the action; ``None`` or empty means no data.
        after: Snapshot taken after the action; ``None`` or empty means no data.

    Returns:
        The comparison segments joined by ``" | "``, or an empty string when
        there is nothing to show. Percentage metrics present on only one side
        are rendered with a ``(before)`` / ``(after)`` suffix; the restart
        count is rendered only when both sides are present.
    """
    # Treat None like an empty snapshot so a missing side cannot raise.
    before = before or {}
    after = after or {}
    if not before and not after:
        return ""

    def _num(value) -> str:
        # Snapshots store percentages rounded to one decimal; ":g" drops a
        # trailing ".0" so whole numbers render as in the documented example.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return f"{value:g}"
        return str(value)

    parts = []
    for key, label in (("cpu_pct", "CPU"), ("mem_pct", "Mem"), ("disk_pct", "Disk")):
        b = before.get(key)
        a = after.get(key)
        if b is not None and a is not None:
            parts.append(f"{label} {_num(b)}%→{_num(a)}%")
        elif b is not None:
            parts.append(f"{label} {_num(b)}% (before)")
        elif a is not None:
            parts.append(f"{label} {_num(a)}% (after)")

    restarts_before = before.get("restart_count")
    restarts_after = after.get("restart_count")
    if restarts_before is not None and restarts_after is not None:
        parts.append(f"Restarts {restarts_before}→{restarts_after}")

    return " | ".join(parts)
|
||||
|
||||
|
||||
async def _push_auto_repair_result(
|
||||
incident: Incident,
|
||||
action: str,
|
||||
@@ -270,7 +355,10 @@ async def _push_auto_repair_result(
|
||||
統帥要求: 所有狀態變更必須在原告警訊息延續,不發新訊息。
|
||||
- append_incident_update() 取 Redis tg_msg:{id} → reply 原訊息 + 換按鈕
|
||||
- 找不到 message_id 時 fallback 到 send_notification(降級)
|
||||
|
||||
ADR-071-I: 成功時抓 metrics_after 快照,寫入 incidents 表,並在通知中顯示前後對比
|
||||
2026-04-09 Claude Sonnet 4.6 Asia/Taipei
|
||||
2026-04-11 Claude Sonnet 4.6: +ADR-071-I 指標快照
|
||||
"""
|
||||
try:
|
||||
from src.services.telegram_gateway import get_telegram_gateway
|
||||
@@ -279,10 +367,37 @@ async def _push_auto_repair_result(
|
||||
target = incident.affected_services[0] if incident.affected_services else "unknown"
|
||||
inc_id = incident.incident_id
|
||||
|
||||
# ADR-071-I: 抓 metrics_after(成功時)
|
||||
metrics_delta_text = ""
|
||||
if success:
|
||||
metrics_after = await _fetch_metrics_snapshot(incident)
|
||||
metrics_before = getattr(incident, "metrics_before", None) or {}
|
||||
|
||||
# 寫入 DB(不阻塞主流程)
|
||||
if metrics_after:
|
||||
try:
|
||||
from src.db.base import get_db_context
|
||||
from src.db.models import Incident as IncidentORM
|
||||
from sqlalchemy import update as _update
|
||||
async with get_db_context() as db:
|
||||
await db.execute(
|
||||
_update(IncidentORM)
|
||||
.where(IncidentORM.incident_id == inc_id)
|
||||
.values(metrics_after=metrics_after)
|
||||
)
|
||||
await db.commit()
|
||||
logger.info("metrics_after_saved", incident_id=inc_id, metrics=metrics_after)
|
||||
except Exception as _e:
|
||||
logger.warning("metrics_after_save_failed", incident_id=inc_id, error=str(_e))
|
||||
|
||||
metrics_delta_text = _format_metrics_delta(metrics_before, metrics_after)
|
||||
|
||||
if success:
|
||||
delta_line = f"\n├ 指標: <code>{metrics_delta_text}</code>" if metrics_delta_text else ""
|
||||
status_line = (
|
||||
f"✅ <b>自動修復完成</b>\n"
|
||||
f"└ <code>{action[:100] if action else '已執行'}</code>"
|
||||
f"├ <code>{action[:100] if action else '已執行'}</code>"
|
||||
f"{delta_line}"
|
||||
)
|
||||
else:
|
||||
status_line = (
|
||||
@@ -716,6 +831,24 @@ class DecisionManager:
|
||||
risk_level=_risk,
|
||||
)
|
||||
|
||||
# ADR-071-I: 執行前抓 metrics_before 快照 (2026-04-11 Claude Sonnet 4.6)
|
||||
_metrics_before = await _fetch_metrics_snapshot(incident)
|
||||
if _metrics_before:
|
||||
try:
|
||||
from src.db.base import get_db_context
|
||||
from src.db.models import Incident as IncidentORM
|
||||
from sqlalchemy import update as _sa_update
|
||||
async with get_db_context() as _db:
|
||||
await _db.execute(
|
||||
_sa_update(IncidentORM)
|
||||
.where(IncidentORM.incident_id == incident.incident_id)
|
||||
.values(metrics_before=_metrics_before)
|
||||
)
|
||||
await _db.commit()
|
||||
except Exception as _mb_err:
|
||||
logger.debug("metrics_before_save_failed",
|
||||
incident_id=incident.incident_id, error=str(_mb_err))
|
||||
|
||||
# 執行
|
||||
executor = ApprovalExecutionService()
|
||||
await executor.execute_approved_action(approval)
|
||||
|
||||
@@ -209,16 +209,31 @@ class KMConversionService:
|
||||
action_type = decision_chain.get("action_type", "")
|
||||
action_command = decision_chain.get("action", "")
|
||||
|
||||
# 指標快照(若有)
|
||||
# 指標快照(若有)— ADR-071-J: 使用精簡 delta 格式 (2026-04-11 Claude Sonnet 4.6)
|
||||
metrics_section = ""
|
||||
if incident.metrics_before or incident.metrics_after:
|
||||
mb = incident.metrics_before or {}
|
||||
ma = incident.metrics_after or {}
|
||||
delta_parts = []
|
||||
for key, label in (("cpu_pct", "CPU"), ("mem_pct", "Mem"), ("disk_pct", "Disk")):
|
||||
if key in mb and key in ma:
|
||||
delta_parts.append(f"{label} {mb[key]:.0f}%→{ma[key]:.0f}%")
|
||||
elif key in ma:
|
||||
delta_parts.append(f"{label} 後={ma[key]:.0f}%")
|
||||
if "restart_count" in mb and "restart_count" in ma:
|
||||
delta_parts.append(f"重啟 {mb['restart_count']}→{ma['restart_count']}")
|
||||
delta_str = " | ".join(delta_parts) if delta_parts else "(無量化數據)"
|
||||
|
||||
# K8s 狀態(若有)
|
||||
k8s_state = getattr(incident, "k8s_state_after", None)
|
||||
k8s_line = f"\n- K8s 狀態: {k8s_state}" if k8s_state else ""
|
||||
|
||||
metrics_section = (
|
||||
f"\n## 效果驗證\n"
|
||||
f"- 執行前: {mb}\n"
|
||||
f"- 執行後: {ma}\n"
|
||||
f"- 指標變化: {delta_str}\n"
|
||||
f"- 恢復耗時: {resolution_time}\n"
|
||||
+ k8s_line
|
||||
+ "\n"
|
||||
)
|
||||
|
||||
# 驗證結果(若有)
|
||||
|
||||
Reference in New Issue
Block a user