fix(flywheel): 修補飛輪四個核心斷點,讓完整流程真正串接起來
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
1. incident_service.py: save_to_episodic_memory() 補寫 alertname/notification_type/alert_category
   → 之前這3欄在DB永遠NULL,LLM無alertname,Playbook匹配全失敗
2. telegram_gateway.py: Telegram批准後呼叫 execute_approved_action()
   → 之前sign_approval()只改DB狀態,380筆批准0筆真正執行kubectl指令
3. approval_execution.py: 執行成功後呼叫 resolve_incident()
   webhooks.py: auto-repair成功後呼叫 resolve_incident()
   → 之前Incident永遠停在INVESTIGATING,KM轉換永遠不觸發,Playbook=0
4. webhooks.py: TYPE-1告警短路,不進LLM
   → 之前Heartbeat/Backup/Info仍燒LLM token,產生垃圾修復建議

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -227,6 +227,20 @@ async def _try_auto_repair_background(
|
||||
except Exception as _outcome_err:
|
||||
logger.warning("auto_repair_outcome_write_failed", error=str(_outcome_err))
|
||||
|
||||
# ADR-073 修補: 自動修復成功 → 解決 Incident → 觸發 KM 轉換
|
||||
# 之前 auto_repair 成功後從未呼叫 resolve_incident,KM 永遠不生成
|
||||
if result and result.success:
|
||||
try:
|
||||
_inc_svc_resolve = get_incident_service()
|
||||
await _inc_svc_resolve.resolve_incident(incident_id)
|
||||
logger.info(
|
||||
"incident_resolved_after_auto_repair",
|
||||
incident_id=incident_id,
|
||||
approval_id=approval_id,
|
||||
)
|
||||
except Exception as _resolve_err:
|
||||
logger.warning("incident_resolve_after_auto_repair_failed", error=str(_resolve_err))
|
||||
|
||||
# 2026-04-10 Claude Sonnet 4.6 Asia/Taipei: 自動修復後更新 Telegram 卡片
|
||||
# 透過 TelegramGateway Service 層移除按鈕並回覆結果 (積木化鐵律)
|
||||
if result:
|
||||
@@ -1222,6 +1236,40 @@ async def alertmanager_webhook(
|
||||
converged=True,
|
||||
)
|
||||
|
||||
# ==========================================================================
|
||||
# ADR-073 修補: TYPE-1 短路 — 純資訊告警不進 LLM,直接發純文字通知
|
||||
# 之前分類了但沒有守衛,Heartbeat/Backup/Info 仍然浪費 LLM token
|
||||
# ==========================================================================
|
||||
if notification_type == "TYPE-1":
|
||||
_info_incident_id = await create_incident_for_approval(
|
||||
approval_id=str(alert_id), # 純資訊無 approval,用 alert_id 佔位
|
||||
risk_level="low",
|
||||
target_resource=target_resource,
|
||||
namespace=namespace,
|
||||
alert_type=alert_type,
|
||||
message=message,
|
||||
source="alertmanager",
|
||||
alertname=alertname,
|
||||
alert_labels=alert.labels,
|
||||
notification_type="TYPE-1",
|
||||
alert_category=alert_category,
|
||||
)
|
||||
background_tasks.add_task(
|
||||
get_telegram_gateway().send_info_notification,
|
||||
incident_id=_info_incident_id,
|
||||
title=alertname,
|
||||
message=message,
|
||||
alertname=alertname,
|
||||
severity=severity,
|
||||
)
|
||||
record_alert_chain_success("alertmanager")
|
||||
return AlertResponse(
|
||||
success=True,
|
||||
message=f"✅ TYPE-1 純資訊告警已通知 (no LLM)",
|
||||
alert_id=alert_id,
|
||||
approval_created=False,
|
||||
)
|
||||
|
||||
# ==========================================================================
|
||||
# 新告警 - LLM 分析
|
||||
# ==========================================================================
|
||||
|
||||
@@ -171,6 +171,21 @@ class ApprovalExecutionService:
|
||||
except Exception as _disp_e:
|
||||
logger.warning("disposition_record_failed", error=str(_disp_e))
|
||||
|
||||
# ADR-073 修補: 執行成功 → 解決 Incident → 觸發 KM 轉換
|
||||
# 之前 RESOLVED 從未被呼叫,導致 KM 永遠不生成、Playbook 永遠是 0
|
||||
if approval.incident_id:
|
||||
try:
|
||||
from src.services.incident_service import get_incident_service
|
||||
_inc_svc = get_incident_service()
|
||||
await _inc_svc.resolve_incident(approval.incident_id)
|
||||
logger.info(
|
||||
"incident_resolved_after_execution",
|
||||
incident_id=approval.incident_id,
|
||||
approval_id=str(approval.id),
|
||||
)
|
||||
except Exception as _resolve_e:
|
||||
logger.warning("incident_resolve_after_execution_failed", error=str(_resolve_e))
|
||||
|
||||
else:
|
||||
logger.error(
|
||||
"background_execution_failed",
|
||||
|
||||
@@ -457,6 +457,13 @@ class IncidentService:
|
||||
async with get_db_context() as db:
|
||||
# 轉換為 SQLAlchemy model
|
||||
# 使用 model_dump(mode="json") 確保 datetime 正確序列化
|
||||
# 從 signals 提取 alertname(ADR-073 Phase 2: incidents.alertname 欄位)
|
||||
_alertname = (
|
||||
incident.signals[0].labels.get("alertname")
|
||||
or incident.signals[0].alert_name
|
||||
if incident.signals
|
||||
else None
|
||||
)
|
||||
record = IncidentRecord(
|
||||
incident_id=incident.incident_id,
|
||||
status=incident.status.value,
|
||||
@@ -482,6 +489,10 @@ class IncidentService:
|
||||
closed_at=incident.closed_at,
|
||||
ttl_days=incident.ttl_days,
|
||||
vectorized=incident.vectorized,
|
||||
# ADR-073 Phase 2-2: 三個分類欄位(之前遺漏未寫入 DB)
|
||||
alertname=_alertname,
|
||||
notification_type=incident.notification_type,
|
||||
alert_category=incident.alert_category,
|
||||
)
|
||||
|
||||
db.add(record)
|
||||
|
||||
@@ -4577,6 +4577,17 @@ class TelegramGateway:
|
||||
username=username,
|
||||
execution_triggered=execution_triggered,
|
||||
)
|
||||
# ADR-073 修補: 觸發實際執行 (之前 sign_approval 只更新 DB 狀態,指令從未執行)
|
||||
# execution_triggered=True 代表簽名數已達 required_signatures
|
||||
if execution_triggered:
|
||||
import asyncio
|
||||
from src.services.approval_execution import get_execution_service
|
||||
asyncio.create_task(get_execution_service().execute_approved_action(approval))
|
||||
logger.info(
|
||||
"telegram_approval_execution_triggered",
|
||||
approval_id=approval_id,
|
||||
action=approval.action,
|
||||
)
|
||||
|
||||
elif action == "reject":
|
||||
approval, message = await service.reject_approval(
|
||||
|
||||
Reference in New Issue
Block a user