feat(adr071-ij): TYPE-2 指標快照卡片 + KM 三段資料整合
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 8m17s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 36s
Ansible Lint / lint (push) Has been cancelled

ADR-071-I: decision_manager 執行前後各抓一次 Prometheus metrics
  - _fetch_metrics_snapshot(): 依 alertname 選擇 CPU/Mem/Disk/Restart 查詢
  - _format_metrics_delta(): 輸出 "CPU 92%→23% | Mem 78%→45%" 格式
  - _push_auto_repair_result(): metrics_after 寫 DB + TYPE-2 卡片顯示 delta
  - _auto_execute(): metrics_before 在執行前寫 DB(完成閉環)

ADR-071-J: km_conversion_service._build_content() 使用精簡 delta 格式
  - 從 metrics_before/after 產生人讀 delta(CPU/Mem/Disk/重啟次數)
  - 附加 k8s_state_after(若有)
  - 格式: 症狀 + 根因 + 動作 + 效果數字(症狀→情境→動作→效果)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-11 03:09:35 +08:00
parent 43edff184d
commit 1ec19656b5
2 changed files with 152 additions and 4 deletions

View File

@@ -259,6 +259,91 @@ async def _push_decision_to_telegram(
)
async def _fetch_metrics_snapshot(incident: Incident) -> dict:
    """
    ADR-071-I: take a snapshot of the Prometheus metrics relevant to this incident.

    The queries that run are chosen from the ``alertname`` label of the
    incident's first signal:

    - ``HostHighCpuLoad`` / ``HostOutOfMemory``: ``cpu_pct`` (only when an
      ``instance`` label is present) and ``mem_pct``.
    - ``HostOutOfDiskSpace``: ``disk_pct`` (max usage across non-tmpfs filesystems).
    - ``PodRestartingTooMuch`` / ``PodCrashLoopBackOff``: ``restart_count`` for the
      pod named by the ``pod`` label (falling back to ``component``), queried in
      the ``awoooi-prod`` namespace.

    Returns:
        A dict holding whichever of the keys above could be read. It is empty
        when the provider is disabled, the alert name is not recognised, or any
        exception is raised. Failures are logged at debug level and never
        propagate, so the caller's main flow is not blocked.

    2026-04-11 Claude Sonnet 4.6 Asia/Taipei
    """
    try:
        from src.plugins.mcp.providers.prometheus_provider import PrometheusProvider

        provider = PrometheusProvider()
        if not provider.enabled:
            return {}

        first_labels = incident.signals[0].labels if incident.signals else {}
        alert = first_labels.get("alertname", "")
        target = first_labels.get("instance", "")
        collected: dict = {}

        def _store(response, key, convert):
            # Record every sample of a successful response under `key`;
            # when several series come back, the last one wins.
            if response.get("status") != "success":
                return
            for sample in response.get("data", {}).get("result", []):
                collected[key] = convert(sample["value"][1])

        def _pct(raw):
            # Prometheus sample values arrive as strings; keep one decimal place.
            return round(float(raw), 1)

        # Pick the metrics most relevant to the alert that fired.
        if alert in ("HostHighCpuLoad", "HostOutOfMemory"):
            if target:
                # Drop the port so the regex matcher applies to the host part only.
                host = target.split(":")[0]
                cpu_response = await provider._instant_query(
                    f'100 - (avg by(instance) (irate(node_cpu_seconds_total{{mode="idle",instance=~"{host}.*"}}[5m])) * 100)'
                )
                _store(cpu_response, "cpu_pct", _pct)

            # Memory is queried even without an instance label (unfiltered form).
            if target:
                mem_query = f'(1 - (node_memory_MemAvailable_bytes{{instance=~"{target}"}} / node_memory_MemTotal_bytes{{instance=~"{target}"}})) * 100'
            else:
                mem_query = "100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)"
            mem_response = await provider._instant_query(mem_query)
            _store(mem_response, "mem_pct", _pct)
        elif alert == "HostOutOfDiskSpace":
            disk_response = await provider._instant_query(
                'max(100 - ((node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100))'
            )
            _store(disk_response, "disk_pct", _pct)
        elif alert in ("PodRestartingTooMuch", "PodCrashLoopBackOff"):
            pod_name = first_labels.get("pod", first_labels.get("component", ""))
            if pod_name:
                restart_response = await provider._instant_query(
                    f'sum(kube_pod_container_status_restarts_total{{namespace="awoooi-prod",pod=~"{pod_name}.*"}})'
                )
                _store(restart_response, "restart_count", lambda raw: int(float(raw)))

        return collected
    except Exception as exc:
        logger.debug("metrics_snapshot_failed", incident_id=incident.incident_id, error=str(exc))
        return {}
def _format_metrics_delta(before: dict, after: dict) -> str:
"""
ADR-071-I: 格式化指標前後對比文字
CPU 92%→23% | Mem 78%→45%
"""
if not before and not after:
return ""
parts = []
for key, label in [("cpu_pct", "CPU"), ("mem_pct", "Mem"), ("disk_pct", "Disk")]:
b = before.get(key)
a = after.get(key)
if b is not None and a is not None:
parts.append(f"{label} {b}%→{a}%")
elif b is not None:
parts.append(f"{label} {b}% (before)")
elif a is not None:
parts.append(f"{label} {a}% (after)")
for key, label in [("restart_count", "Restarts")]:
b = before.get(key)
a = after.get(key)
if b is not None and a is not None:
parts.append(f"{label} {b}{a}")
return " | ".join(parts)
async def _push_auto_repair_result(
incident: Incident,
action: str,
@@ -270,7 +355,10 @@ async def _push_auto_repair_result(
統帥要求: 所有狀態變更必須在原告警訊息延續,不發新訊息。
- append_incident_update() 取 Redis tg_msg:{id} → reply 原訊息 + 換按鈕
- 找不到 message_id 時 fallback 到 send_notification降級
ADR-071-I: 成功時抓 metrics_after 快照,寫入 incidents 表,並在通知中顯示前後對比
2026-04-09 Claude Sonnet 4.6 Asia/Taipei
2026-04-11 Claude Sonnet 4.6: +ADR-071-I 指標快照
"""
try:
from src.services.telegram_gateway import get_telegram_gateway
@@ -279,10 +367,37 @@ async def _push_auto_repair_result(
target = incident.affected_services[0] if incident.affected_services else "unknown"
inc_id = incident.incident_id
# ADR-071-I: 抓 metrics_after成功時
metrics_delta_text = ""
if success:
metrics_after = await _fetch_metrics_snapshot(incident)
metrics_before = getattr(incident, "metrics_before", None) or {}
# 寫入 DB不阻塞主流程
if metrics_after:
try:
from src.db.base import get_db_context
from src.db.models import Incident as IncidentORM
from sqlalchemy import update as _update
async with get_db_context() as db:
await db.execute(
_update(IncidentORM)
.where(IncidentORM.incident_id == inc_id)
.values(metrics_after=metrics_after)
)
await db.commit()
logger.info("metrics_after_saved", incident_id=inc_id, metrics=metrics_after)
except Exception as _e:
logger.warning("metrics_after_save_failed", incident_id=inc_id, error=str(_e))
metrics_delta_text = _format_metrics_delta(metrics_before, metrics_after)
if success:
delta_line = f"\n├ 指標: <code>{metrics_delta_text}</code>" if metrics_delta_text else ""
status_line = (
f"✅ <b>自動修復完成</b>\n"
f" <code>{action[:100] if action else '已執行'}</code>"
f" <code>{action[:100] if action else '已執行'}</code>"
f"{delta_line}"
)
else:
status_line = (
@@ -716,6 +831,24 @@ class DecisionManager:
risk_level=_risk,
)
# ADR-071-I: 執行前抓 metrics_before 快照 (2026-04-11 Claude Sonnet 4.6)
_metrics_before = await _fetch_metrics_snapshot(incident)
if _metrics_before:
try:
from src.db.base import get_db_context
from src.db.models import Incident as IncidentORM
from sqlalchemy import update as _sa_update
async with get_db_context() as _db:
await _db.execute(
_sa_update(IncidentORM)
.where(IncidentORM.incident_id == incident.incident_id)
.values(metrics_before=_metrics_before)
)
await _db.commit()
except Exception as _mb_err:
logger.debug("metrics_before_save_failed",
incident_id=incident.incident_id, error=str(_mb_err))
# 執行
executor = ApprovalExecutionService()
await executor.execute_approved_action(approval)

View File

@@ -209,16 +209,31 @@ class KMConversionService:
action_type = decision_chain.get("action_type", "")
action_command = decision_chain.get("action", "")
# 指標快照(若有)
# 指標快照(若有)— ADR-071-J: 使用精簡 delta 格式 (2026-04-11 Claude Sonnet 4.6)
metrics_section = ""
if incident.metrics_before or incident.metrics_after:
mb = incident.metrics_before or {}
ma = incident.metrics_after or {}
delta_parts = []
for key, label in (("cpu_pct", "CPU"), ("mem_pct", "Mem"), ("disk_pct", "Disk")):
if key in mb and key in ma:
delta_parts.append(f"{label} {mb[key]:.0f}%→{ma[key]:.0f}%")
elif key in ma:
delta_parts.append(f"{label} 後={ma[key]:.0f}%")
if "restart_count" in mb and "restart_count" in ma:
delta_parts.append(f"重啟 {mb['restart_count']}{ma['restart_count']}")
delta_str = " | ".join(delta_parts) if delta_parts else "(無量化數據)"
# K8s 狀態(若有)
k8s_state = getattr(incident, "k8s_state_after", None)
k8s_line = f"\n- K8s 狀態: {k8s_state}" if k8s_state else ""
metrics_section = (
f"\n## 效果驗證\n"
f"- 執行前: {mb}\n"
f"- 執行後: {ma}\n"
f"- 指標變化: {delta_str}\n"
f"- 恢復耗時: {resolution_time}\n"
+ k8s_line
+ "\n"
)
# 驗證結果(若有)