Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
問題: 首次部署 sweeper 時,找到 117 個無 sweeper_done: 標記的舊 incident (最舊 2026-04-09,7 天前) → 觸發全部 LLM 分析 舊 incident 資料格式 → OPENCLAW_NEMO timeout → Expert System 降級 confidence=0.2 "降級" → Telegram 連發相同格式告警洗版 修正: 加入 _MAX_INCIDENT_AGE_HOURS=48 過濾 只處理 48h 內的 INVESTIGATING incident 確保 created_at 時區安全(naive → UTC) 2026-04-16 Claude Sonnet 4.6 Asia/Taipei Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
143 lines
5.2 KiB
Python
143 lines
5.2 KiB
Python
"""
|
||
Incident Analysis Sweeper — 自動觸發 INVESTIGATING 事件 AI 分析
|
||
================================================================
|
||
問題背景:
|
||
Signal Worker 創建 Incident 後,AI 分析 (decision_manager) 原本只在
|
||
GET /api/v1/incidents 被呼叫時才觸發 (背景 fire-and-forget)。
|
||
若前端沒人看或 Telegram Bot 未呼叫該端點,新 Incident 永遠沒有 AI 分析。
|
||
|
||
解法:
|
||
每 90 秒掃描 INVESTIGATING 狀態且無 decision token 的 Incident,
|
||
自動在背景觸發 get_or_create_decision()。
|
||
|
||
限流:
|
||
Semaphore(3) — 避免並發壓垮 OPENCLAW_NEMO/Ollama
|
||
每批最多 5 個 incident,避免啟動雪崩
|
||
|
||
Key 格式說明:
|
||
decision token 儲存為 decision:DEC-{HEX},內部 incident_id 欄位對應 INC-*。
|
||
使用 sweeper_done:{incident_id} 輕量標記避免重複掃描。
|
||
get_or_create_decision() 本身已有 COMPLETED/READY 去重,雙重保護。
|
||
|
||
2026-04-16 Claude Sonnet 4.6 Asia/Taipei — 修正 key 格式 BUG
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
|
||
import structlog
|
||
|
||
from src.models.incident import Incident, IncidentStatus, Severity
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
_SWEEP_INTERVAL_SEC = 90 # 每 90 秒掃一次
|
||
_MAX_BATCH = 5 # 每批最多 5 個
|
||
_SEMAPHORE_LIMIT = 3 # 最多 3 個並發 AI 分析
|
||
_DONE_MARKER_PREFIX = "sweeper_done:" # 輕量標記:已觸發過分析
|
||
_DONE_MARKER_TTL = 3600 # 1 小時 TTL,後續由 get_or_create 去重
|
||
# 2026-04-16 ogt: 只處理 48h 內的 incident,避免首次啟動把所有歷史舊案洗版到 Telegram
|
||
_MAX_INCIDENT_AGE_HOURS = 48
|
||
|
||
|
||
async def run_incident_analysis_sweeper() -> None:
|
||
"""
|
||
永久迴圈:每 90 秒自動為未分析的 INVESTIGATING Incident 觸發 AI 分析。
|
||
由 main.py lifespan 透過 asyncio.create_task() 啟動。
|
||
"""
|
||
logger.info("incident_analysis_sweeper_started", interval_sec=_SWEEP_INTERVAL_SEC)
|
||
sem = asyncio.Semaphore(_SEMAPHORE_LIMIT)
|
||
|
||
while True:
|
||
try:
|
||
await _sweep_once(sem)
|
||
except Exception as e:
|
||
logger.warning("incident_analysis_sweeper_error", error=str(e))
|
||
|
||
await asyncio.sleep(_SWEEP_INTERVAL_SEC)
|
||
|
||
|
||
async def _sweep_once(sem: asyncio.Semaphore) -> None:
|
||
"""
|
||
執行一次掃描:找出沒有 decision token 的 INVESTIGATING incidents,
|
||
在背景觸發 AI 分析。
|
||
|
||
Decision token key 格式: decision:DEC-{HEX12} (非 decision:INC-*)
|
||
使用 sweeper_done:{incident_id} 輕量標記避免重複觸發。
|
||
"""
|
||
from src.services.decision_manager import get_decision_manager
|
||
from src.services.incident_service import get_incident_service
|
||
from src.core.redis_client import get_redis
|
||
|
||
redis = get_redis()
|
||
incident_service = get_incident_service()
|
||
dm = get_decision_manager()
|
||
|
||
# 取得所有 INVESTIGATING incidents
|
||
try:
|
||
incidents: list[Incident] = await incident_service.get_active_incidents()
|
||
except Exception as e:
|
||
logger.warning("sweeper_get_incidents_failed", error=str(e))
|
||
return
|
||
|
||
if not incidents:
|
||
return
|
||
|
||
# 過濾:只處理 48h 內的 incident(避免首次啟動把全部歷史舊案洗版 Telegram)
|
||
from datetime import datetime, timezone, timedelta
|
||
now_utc = datetime.now(timezone.utc)
|
||
cutoff = now_utc - timedelta(hours=_MAX_INCIDENT_AGE_HOURS)
|
||
|
||
recent_incidents = []
|
||
for incident in incidents:
|
||
created = getattr(incident, "created_at", None)
|
||
if created:
|
||
# 確保 created_at 有時區資訊
|
||
if created.tzinfo is None:
|
||
created = created.replace(tzinfo=timezone.utc)
|
||
if created >= cutoff:
|
||
recent_incidents.append(incident)
|
||
else:
|
||
# 沒有 created_at 的舊資料:跳過
|
||
pass
|
||
|
||
if not recent_incidents:
|
||
return
|
||
|
||
# 找出尚未觸發過分析的 (用輕量標記,不掃描 decision:DEC-* 全集)
|
||
unanalyzed = []
|
||
for incident in recent_incidents:
|
||
done_key = f"{_DONE_MARKER_PREFIX}{incident.incident_id}"
|
||
if not await redis.exists(done_key):
|
||
unanalyzed.append(incident)
|
||
|
||
if not unanalyzed:
|
||
return
|
||
|
||
# 限制每批
|
||
batch = unanalyzed[:_MAX_BATCH]
|
||
logger.info(
|
||
"sweeper_triggering_analysis",
|
||
total_unanalyzed=len(unanalyzed),
|
||
batch_size=len(batch),
|
||
)
|
||
|
||
async def _analyze(incident: Incident) -> None:
|
||
async with sem:
|
||
try:
|
||
timeout = 120.0 if incident.severity in (Severity.P0, Severity.P1) else 180.0
|
||
await dm.get_or_create_decision(incident=incident, timeout_sec=timeout)
|
||
# 設 done 標記,避免下次掃描重複觸發
|
||
done_key = f"{_DONE_MARKER_PREFIX}{incident.incident_id}"
|
||
await redis.set(done_key, "1", ex=_DONE_MARKER_TTL)
|
||
logger.info("sweeper_analysis_done", incident_id=incident.incident_id)
|
||
except Exception as e:
|
||
logger.warning(
|
||
"sweeper_analysis_failed",
|
||
incident_id=incident.incident_id,
|
||
error=str(e),
|
||
)
|
||
|
||
tasks = [asyncio.create_task(_analyze(inc)) for inc in batch]
|
||
await asyncio.gather(*tasks, return_exceptions=True)
|