From f0e14136cab4e49f40044d32b9054f988acd392f Mon Sep 17 00:00:00 2001 From: OG T Date: Sun, 12 Apr 2026 17:01:00 +0800 Subject: [PATCH] =?UTF-8?q?fix(flywheel):=20=E4=BF=AE=E8=A3=9C=E9=A3=9B?= =?UTF-8?q?=E8=BC=AA=E5=9B=9B=E5=80=8B=E6=A0=B8=E5=BF=83=E6=96=B7=E9=BB=9E?= =?UTF-8?q?=EF=BC=8C=E8=AE=93=E5=AE=8C=E6=95=B4=E6=B5=81=E7=A8=8B=E7=9C=9F?= =?UTF-8?q?=E6=AD=A3=E4=B8=B2=E6=8E=A5=E8=B5=B7=E4=BE=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. incident_service.py: save_to_episodic_memory() 補寫 alertname/notification_type/alert_category → 之前這3欄在DB永遠NULL,LLM無alertname,Playbook匹配全失敗 2. telegram_gateway.py: Telegram批准後呼叫 execute_approved_action() → 之前sign_approval()只改DB狀態,380筆批准0筆真正執行kubectl指令 3. approval_execution.py: 執行成功後呼叫 resolve_incident() webhooks.py: auto-repair成功後呼叫 resolve_incident() → 之前Incident永遠停在INVESTIGATING,KM轉換永遠不觸發,Playbook=0 4. webhooks.py: TYPE-1告警短路,不進LLM → 之前Heartbeat/Backup/Info仍燒LLM token,產生垃圾修復建議 Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/api/v1/webhooks.py | 48 +++++++++++++++++++++ apps/api/src/services/approval_execution.py | 15 +++++++ apps/api/src/services/incident_service.py | 11 +++++ apps/api/src/services/telegram_gateway.py | 11 +++++ 4 files changed, 85 insertions(+) diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index 419d5bea..d098c587 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -227,6 +227,20 @@ async def _try_auto_repair_background( except Exception as _outcome_err: logger.warning("auto_repair_outcome_write_failed", error=str(_outcome_err)) + # ADR-073 修補: 自動修復成功 → 解決 Incident → 觸發 KM 轉換 + # 之前 auto_repair 成功後從未呼叫 resolve_incident,KM 永遠不生成 + if result and result.success: + try: + _inc_svc_resolve = get_incident_service() + await _inc_svc_resolve.resolve_incident(incident_id) + logger.info( + "incident_resolved_after_auto_repair", + incident_id=incident_id, + approval_id=approval_id, + ) + except Exception as _resolve_err: + logger.warning("incident_resolve_after_auto_repair_failed", error=str(_resolve_err)) + # 2026-04-10 Claude Sonnet 4.6 Asia/Taipei: 自動修復後更新 Telegram 卡片 # 透過 TelegramGateway Service 層移除按鈕並回覆結果 (積木化鐵律) if result: @@ -1222,6 +1236,40 @@ async def alertmanager_webhook( converged=True, ) + # ========================================================================== + # ADR-073 修補: TYPE-1 短路 — 純資訊告警不進 LLM,直接發純文字通知 + # 之前分類了但沒有守衛,Heartbeat/Backup/Info 仍然浪費 LLM token + # ========================================================================== + if notification_type == "TYPE-1": + _info_incident_id = await create_incident_for_approval( + approval_id=str(alert_id), # 純資訊無 approval,用 alert_id 佔位 + risk_level="low", + target_resource=target_resource, + namespace=namespace, + alert_type=alert_type, + message=message, + source="alertmanager", + alertname=alertname, + alert_labels=alert.labels, + notification_type="TYPE-1", + alert_category=alert_category, + ) + background_tasks.add_task( + get_telegram_gateway().send_info_notification, + incident_id=_info_incident_id, + title=alertname, + message=message, + alertname=alertname, + severity=severity, + ) + record_alert_chain_success("alertmanager") + return AlertResponse( + success=True, + message=f"✅ TYPE-1 純資訊告警已通知 (no LLM)", + alert_id=alert_id, + approval_created=False, + ) + # ========================================================================== # 新告警 - LLM 分析 # ========================================================================== diff --git a/apps/api/src/services/approval_execution.py b/apps/api/src/services/approval_execution.py index 28a558a5..ea2b1bd9 100644 --- a/apps/api/src/services/approval_execution.py +++ b/apps/api/src/services/approval_execution.py @@ -171,6 +171,21 @@ class ApprovalExecutionService: except Exception as _disp_e: logger.warning("disposition_record_failed", error=str(_disp_e)) + # ADR-073 修補: 執行成功 → 解決 Incident → 觸發 KM 轉換 + # 之前 RESOLVED 從未被呼叫,導致 KM 永遠不生成、Playbook 永遠是 0 + if approval.incident_id: + try: + from src.services.incident_service import get_incident_service + _inc_svc = get_incident_service() + await _inc_svc.resolve_incident(approval.incident_id) + logger.info( + "incident_resolved_after_execution", + incident_id=approval.incident_id, + approval_id=str(approval.id), + ) + except Exception as _resolve_e: + logger.warning("incident_resolve_after_execution_failed", error=str(_resolve_e)) + else: logger.error( "background_execution_failed", diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py index ea726f38..fc98b337 100644 --- a/apps/api/src/services/incident_service.py +++ b/apps/api/src/services/incident_service.py @@ -457,6 +457,13 @@ class IncidentService: async with get_db_context() as db: # 轉換為 SQLAlchemy model # 使用 model_dump(mode="json") 確保 datetime 正確序列化 + # 從 signals 提取 alertname(ADR-073 Phase 2: incidents.alertname 欄位) + _alertname = ( + incident.signals[0].labels.get("alertname") + or incident.signals[0].alert_name + if incident.signals + else None + ) record = IncidentRecord( incident_id=incident.incident_id, status=incident.status.value, @@ -482,6 +489,10 @@ class IncidentService: closed_at=incident.closed_at, ttl_days=incident.ttl_days, vectorized=incident.vectorized, + # ADR-073 Phase 2-2: 三個分類欄位(之前遺漏未寫入 DB) + alertname=_alertname, + notification_type=incident.notification_type, + alert_category=incident.alert_category, ) db.add(record) diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index 297032ac..4a31951e 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -4577,6 +4577,17 @@ class TelegramGateway: username=username, execution_triggered=execution_triggered, ) + # ADR-073 修補: 觸發實際執行 (之前 sign_approval 只更新 DB 狀態,指令從未執行) + # execution_triggered=True 代表簽名數已達 required_signatures + if execution_triggered: + import asyncio + from src.services.approval_execution import get_execution_service + asyncio.create_task(get_execution_service().execute_approved_action(approval)) + logger.info( + "telegram_approval_execution_triggered", + approval_id=approval_id, + action=approval.action, + ) elif action == "reject": approval, message = await service.reject_approval(