def _promql_regex_literal(value: str) -> str:
    """Escape *value* so a PromQL regex matcher treats it as literal text.

    Two layers of escaping are required:

    1. Regex layer: metacharacters (for example the dots of an IP address)
       must not act as wildcards.
    2. String-literal layer: PromQL double-quoted strings follow Go escaping
       rules, so every backslash produced by layer 1 has to be doubled and a
       double quote has to be escaped, otherwise the query fails to parse or
       the label value can terminate the string early.
    """
    import re

    return re.escape(value).replace("\\", "\\\\").replace('"', '\\"')


def _last_finite_sample(response: dict) -> "float | None":
    """Extract one usable number from a Prometheus instant-query response.

    Args:
        response: Decoded response dict with ``status`` and ``data.result``.

    Returns:
        The value of the last series whose sample is a finite number, or
        ``None`` when the query did not succeed or produced no finite sample.
        Non-finite samples ("NaN", "+Inf") are skipped because they cannot be
        stored as valid JSON and are meaningless in a before/after comparison.
    """
    import math

    if response.get("status") != "success":
        return None

    chosen = None
    for series in response.get("data", {}).get("result", []):
        # An instant-vector sample is [timestamp, "<value as string>"].
        sample = float(series["value"][1])
        if math.isfinite(sample):
            chosen = sample
    return chosen


async def _fetch_metrics_snapshot(incident: "Incident") -> dict:
    """ADR-071-I: fetch a Prometheus metrics snapshot relevant to *incident*.

    The metric queried depends on the ``alertname`` label of the incident's
    first signal:

    * ``HostHighCpuLoad`` / ``HostOutOfMemory`` -> ``cpu_pct`` (only when an
      ``instance`` label exists) and ``mem_pct``
    * ``HostOutOfDiskSpace`` -> ``disk_pct``
    * ``PodRestartingTooMuch`` / ``PodCrashLoopBackOff`` -> ``restart_count``

    Args:
        incident: Incident whose first signal carries the alert labels.

    Returns:
        A dict containing zero or more of the keys above. Any failure is
        logged at debug level and yields an empty dict so the main flow is
        never blocked.

    2026-04-11 Claude Sonnet 4.6 Asia/Taipei
    """
    try:
        from src.plugins.mcp.providers.prometheus_provider import PrometheusProvider

        prom = PrometheusProvider()
        if not prom.enabled:
            return {}

        labels = incident.signals[0].labels if incident.signals else {}
        alertname = labels.get("alertname", "")
        instance = labels.get("instance", "")
        snapshots: dict = {}

        async def _sample(query: str) -> "float | None":
            # Run one instant query and reduce it to a single finite number.
            return _last_finite_sample(await prom._instant_query(query))

        # Pick the metric most relevant to the alert that fired.
        if alertname in ("HostHighCpuLoad", "HostOutOfMemory"):
            if instance:
                # Label values are escaped before interpolation so that they
                # match literally and cannot alter the query.
                host = _promql_regex_literal(instance.split(":")[0])
                cpu = await _sample(
                    f'100 - (avg by(instance) (irate(node_cpu_seconds_total{{mode="idle",instance=~"{host}.*"}}[5m])) * 100)'
                )
                if cpu is not None:
                    snapshots["cpu_pct"] = round(cpu, 1)

                inst = _promql_regex_literal(instance)
                mem_query = (
                    f'(1 - (node_memory_MemAvailable_bytes{{instance=~"{inst}"}} / node_memory_MemTotal_bytes{{instance=~"{inst}"}})) * 100'
                )
            else:
                mem_query = "100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)"

            mem = await _sample(mem_query)
            if mem is not None:
                snapshots["mem_pct"] = round(mem, 1)

        elif alertname == "HostOutOfDiskSpace":
            disk = await _sample(
                'max(100 - ((node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100))'
            )
            if disk is not None:
                snapshots["disk_pct"] = round(disk, 1)

        elif alertname in ("PodRestartingTooMuch", "PodCrashLoopBackOff"):
            pod = labels.get("pod", labels.get("component", ""))
            if pod:
                pod_pattern = _promql_regex_literal(pod)
                restarts = await _sample(
                    f'sum(kube_pod_container_status_restarts_total{{namespace="awoooi-prod",pod=~"{pod_pattern}.*"}})'
                )
                if restarts is not None:
                    snapshots["restart_count"] = int(restarts)

        return snapshots
    except Exception as _e:
        # Deliberate best-effort: a metrics failure must not break the caller.
        logger.debug("metrics_snapshot_failed", incident_id=incident.incident_id, error=str(_e))
        return {}


def _format_metrics_delta(before: dict, after: dict) -> str:
    """ADR-071-I: format a before/after metrics comparison as one line of text.

    Example: ``CPU 92%→23% | Mem 78.5%→45%``

    Args:
        before: Snapshot taken before the action; ``None`` is treated as empty.
        after: Snapshot taken after the action; ``None`` is treated as empty.

    Returns:
        Segments joined by ``" | "``. Percentage metrics present on only one
        side are shown with a ``(before)`` / ``(after)`` marker; the restart
        count is shown only when both sides have it. Returns ``""`` when there
        is nothing to show.
    """
    # Callers read these from optional attributes, so tolerate None.
    before = before or {}
    after = after or {}
    if not before and not after:
        return ""

    def _num(value) -> str:
        # Snapshots store round(x, 1) floats; drop a redundant ".0" so the
        # output matches the documented "CPU 92%→23%" form.
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    parts = []
    for key, label in (("cpu_pct", "CPU"), ("mem_pct", "Mem"), ("disk_pct", "Disk")):
        b = before.get(key)
        a = after.get(key)
        if b is not None and a is not None:
            parts.append(f"{label} {_num(b)}%→{_num(a)}%")
        elif b is not None:
            parts.append(f"{label} {_num(b)}% (before)")
        elif a is not None:
            parts.append(f"{label} {_num(a)}% (after)")

    restarts_before = before.get("restart_count")
    restarts_after = after.get("restart_count")
    if restarts_before is not None and restarts_after is not None:
        parts.append(f"Restarts {restarts_before}→{restarts_after}")

    return " | ".join(parts)
success: + metrics_after = await _fetch_metrics_snapshot(incident) + metrics_before = getattr(incident, "metrics_before", None) or {} + + # 寫入 DB(不阻塞主流程) + if metrics_after: + try: + from src.db.base import get_db_context + from src.db.models import Incident as IncidentORM + from sqlalchemy import update as _update + async with get_db_context() as db: + await db.execute( + _update(IncidentORM) + .where(IncidentORM.incident_id == inc_id) + .values(metrics_after=metrics_after) + ) + await db.commit() + logger.info("metrics_after_saved", incident_id=inc_id, metrics=metrics_after) + except Exception as _e: + logger.warning("metrics_after_save_failed", incident_id=inc_id, error=str(_e)) + + metrics_delta_text = _format_metrics_delta(metrics_before, metrics_after) + + if success: + delta_line = f"\n├ 指標: {metrics_delta_text}" if metrics_delta_text else "" status_line = ( f"✅ 自動修復完成\n" - f"└ {action[:100] if action else '已執行'}" + f"├ {action[:100] if action else '已執行'}" + f"{delta_line}" ) else: status_line = ( @@ -716,6 +831,24 @@ class DecisionManager: risk_level=_risk, ) + # ADR-071-I: 執行前抓 metrics_before 快照 (2026-04-11 Claude Sonnet 4.6) + _metrics_before = await _fetch_metrics_snapshot(incident) + if _metrics_before: + try: + from src.db.base import get_db_context + from src.db.models import Incident as IncidentORM + from sqlalchemy import update as _sa_update + async with get_db_context() as _db: + await _db.execute( + _sa_update(IncidentORM) + .where(IncidentORM.incident_id == incident.incident_id) + .values(metrics_before=_metrics_before) + ) + await _db.commit() + except Exception as _mb_err: + logger.debug("metrics_before_save_failed", + incident_id=incident.incident_id, error=str(_mb_err)) + # 執行 executor = ApprovalExecutionService() await executor.execute_approved_action(approval) diff --git a/apps/api/src/services/km_conversion_service.py b/apps/api/src/services/km_conversion_service.py index 2bb32cd2..9d3d7531 100644 --- 
a/apps/api/src/services/km_conversion_service.py +++ b/apps/api/src/services/km_conversion_service.py @@ -209,16 +209,31 @@ class KMConversionService: action_type = decision_chain.get("action_type", "") action_command = decision_chain.get("action", "") - # 指標快照(若有) + # 指標快照(若有)— ADR-071-J: 使用精簡 delta 格式 (2026-04-11 Claude Sonnet 4.6) metrics_section = "" if incident.metrics_before or incident.metrics_after: mb = incident.metrics_before or {} ma = incident.metrics_after or {} + delta_parts = [] + for key, label in (("cpu_pct", "CPU"), ("mem_pct", "Mem"), ("disk_pct", "Disk")): + if key in mb and key in ma: + delta_parts.append(f"{label} {mb[key]:.0f}%→{ma[key]:.0f}%") + elif key in ma: + delta_parts.append(f"{label} 後={ma[key]:.0f}%") + if "restart_count" in mb and "restart_count" in ma: + delta_parts.append(f"重啟 {mb['restart_count']}→{ma['restart_count']}") + delta_str = " | ".join(delta_parts) if delta_parts else "(無量化數據)" + + # K8s 狀態(若有) + k8s_state = getattr(incident, "k8s_state_after", None) + k8s_line = f"\n- K8s 狀態: {k8s_state}" if k8s_state else "" + metrics_section = ( f"\n## 效果驗證\n" - f"- 執行前: {mb}\n" - f"- 執行後: {ma}\n" + f"- 指標變化: {delta_str}\n" f"- 恢復耗時: {resolution_time}\n" + + k8s_line + + "\n" ) # 驗證結果(若有)