fix(flywheel): 修補飛輪四個核心斷點,讓完整流程真正串接起來
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

1. incident_service.py: save_to_episodic_memory() 補寫 alertname/notification_type/alert_category
   → 之前這3欄在DB永遠NULL,LLM無alertname,Playbook匹配全失敗

2. telegram_gateway.py: Telegram批准後呼叫 execute_approved_action()
   → 之前sign_approval()只改DB狀態,380筆批准0筆真正執行kubectl指令

3. approval_execution.py: 執行成功後呼叫 resolve_incident()
   webhooks.py: auto-repair成功後呼叫 resolve_incident()
   → 之前Incident永遠停在INVESTIGATING,KM轉換永遠不觸發,Playbook=0

4. webhooks.py: TYPE-1告警短路,不進LLM
   → 之前Heartbeat/Backup/Info仍燒LLM token,產生垃圾修復建議

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-12 17:01:00 +08:00
parent d2286ca827
commit f0e14136ca
4 changed files with 85 additions and 0 deletions

View File

@@ -227,6 +227,20 @@ async def _try_auto_repair_background(
except Exception as _outcome_err:
logger.warning("auto_repair_outcome_write_failed", error=str(_outcome_err))
# ADR-073 修補: 自動修復成功 → 解決 Incident → 觸發 KM 轉換
# 之前 auto_repair 成功後從未呼叫 resolve_incident,KM 永遠不生成
if result and result.success:
try:
_inc_svc_resolve = get_incident_service()
await _inc_svc_resolve.resolve_incident(incident_id)
logger.info(
"incident_resolved_after_auto_repair",
incident_id=incident_id,
approval_id=approval_id,
)
except Exception as _resolve_err:
logger.warning("incident_resolve_after_auto_repair_failed", error=str(_resolve_err))
# 2026-04-10 Claude Sonnet 4.6 Asia/Taipei: 自動修復後更新 Telegram 卡片
# 透過 TelegramGateway Service 層移除按鈕並回覆結果 (積木化鐵律)
if result:
@@ -1222,6 +1236,40 @@ async def alertmanager_webhook(
converged=True,
)
# ==========================================================================
# ADR-073 修補: TYPE-1 短路 — 純資訊告警不進 LLM,直接發純文字通知
# 之前分類了但沒有守衛,Heartbeat/Backup/Info 仍然浪費 LLM token
# ==========================================================================
if notification_type == "TYPE-1":
_info_incident_id = await create_incident_for_approval(
approval_id=str(alert_id), # 純資訊無 approval用 alert_id 佔位
risk_level="low",
target_resource=target_resource,
namespace=namespace,
alert_type=alert_type,
message=message,
source="alertmanager",
alertname=alertname,
alert_labels=alert.labels,
notification_type="TYPE-1",
alert_category=alert_category,
)
background_tasks.add_task(
get_telegram_gateway().send_info_notification,
incident_id=_info_incident_id,
title=alertname,
message=message,
alertname=alertname,
severity=severity,
)
record_alert_chain_success("alertmanager")
return AlertResponse(
success=True,
message=f"✅ TYPE-1 純資訊告警已通知 (no LLM)",
alert_id=alert_id,
approval_created=False,
)
# ==========================================================================
# 新告警 - LLM 分析
# ==========================================================================

View File

@@ -171,6 +171,21 @@ class ApprovalExecutionService:
except Exception as _disp_e:
logger.warning("disposition_record_failed", error=str(_disp_e))
# ADR-073 修補: 執行成功 → 解決 Incident → 觸發 KM 轉換
# 之前 RESOLVED 從未被呼叫,導致 KM 永遠不生成、Playbook 永遠是 0
if approval.incident_id:
try:
from src.services.incident_service import get_incident_service
_inc_svc = get_incident_service()
await _inc_svc.resolve_incident(approval.incident_id)
logger.info(
"incident_resolved_after_execution",
incident_id=approval.incident_id,
approval_id=str(approval.id),
)
except Exception as _resolve_e:
logger.warning("incident_resolve_after_execution_failed", error=str(_resolve_e))
else:
logger.error(
"background_execution_failed",

View File

@@ -457,6 +457,13 @@ class IncidentService:
async with get_db_context() as db:
# 轉換為 SQLAlchemy model
# 使用 model_dump(mode="json") 確保 datetime 正確序列化
# 從 signals 提取 alertname (ADR-073 Phase 2: incidents.alertname 欄位)
_alertname = (
incident.signals[0].labels.get("alertname")
or incident.signals[0].alert_name
if incident.signals
else None
)
record = IncidentRecord(
incident_id=incident.incident_id,
status=incident.status.value,
@@ -482,6 +489,10 @@ class IncidentService:
closed_at=incident.closed_at,
ttl_days=incident.ttl_days,
vectorized=incident.vectorized,
# ADR-073 Phase 2-2: 三個分類欄位(之前遺漏未寫入 DB)
alertname=_alertname,
notification_type=incident.notification_type,
alert_category=incident.alert_category,
)
db.add(record)

View File

@@ -4577,6 +4577,17 @@ class TelegramGateway:
username=username,
execution_triggered=execution_triggered,
)
# ADR-073 修補: 觸發實際執行 (之前 sign_approval 只更新 DB 狀態,指令從未執行)
# execution_triggered=True 代表簽名數已達 required_signatures
if execution_triggered:
import asyncio
from src.services.approval_execution import get_execution_service
asyncio.create_task(get_execution_service().execute_approved_action(approval))
logger.info(
"telegram_approval_execution_triggered",
approval_id=approval_id,
action=approval.action,
)
elif action == "reject":
approval, message = await service.reject_approval(