fix(aiops): bound phase2 timeout and repair incident links
This commit is contained in:
@@ -20,6 +20,7 @@ ADR-082: Phase 2 多 Agent 協作
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import time
|
||||
from typing import Any
|
||||
@@ -42,6 +43,9 @@ logger = structlog.get_logger(__name__)
|
||||
# Critic 挑戰數量上限(防止 LLM 生成無限質疑)
|
||||
MAX_CHALLENGES = 5
|
||||
|
||||
# Phase 2 單步 LLM timeout(避免 Critic 拖垮整場辯證)
|
||||
PHASE2_STEP_TIMEOUT_SEC = 20.0
|
||||
|
||||
|
||||
class CriticAgent(BaseAgent):
|
||||
"""
|
||||
@@ -109,9 +113,32 @@ class CriticAgent(BaseAgent):
|
||||
"confidence": top_hypothesis.confidence if top_hypothesis else 0.0,
|
||||
})
|
||||
|
||||
_critic_signal = (
|
||||
f"hypothesis={top_hypothesis.description[:300] if top_hypothesis else 'none'}; "
|
||||
f"action={top_candidate.action[:300] if top_candidate else 'none'}"
|
||||
)
|
||||
alert_context = {
|
||||
"incident_id": diagnosis.evidence_snapshot_id or "UNKNOWN",
|
||||
"severity": "P3",
|
||||
"signals": [{"alert_name": "critic_review", "description": _critic_signal}],
|
||||
"affected_services": [],
|
||||
"intent_hint": "diagnose",
|
||||
}
|
||||
|
||||
from src.services.openclaw import get_openclaw
|
||||
openclaw = get_openclaw()
|
||||
response_text, _provider, success = await openclaw.call(prompt)
|
||||
try:
|
||||
response_text, _provider, success = await asyncio.wait_for(
|
||||
openclaw.call(prompt, alert_context=alert_context),
|
||||
timeout=PHASE2_STEP_TIMEOUT_SEC,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning(
|
||||
"critic_step_timeout",
|
||||
snapshot_id=diagnosis.evidence_snapshot_id,
|
||||
timeout_sec=PHASE2_STEP_TIMEOUT_SEC,
|
||||
)
|
||||
return self._degraded_report(0, "step_timeout")
|
||||
|
||||
if not success or not response_text:
|
||||
return self._degraded_report(0, "llm_failed")
|
||||
|
||||
@@ -18,6 +18,7 @@ ADR-082: Phase 2 多 Agent 協作
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import json
|
||||
import time
|
||||
@@ -45,6 +46,9 @@ MAX_EVIDENCE_CHAIN = 5
|
||||
# Confidence 閾值 — 低於此值 vote = ABSTAIN
|
||||
ABSTAIN_CONFIDENCE_THRESHOLD = 0.4
|
||||
|
||||
# Phase 2 單步 LLM timeout(防單一 Agent 吃光 90s 全局預算)
|
||||
PHASE2_STEP_TIMEOUT_SEC = 20.0
|
||||
|
||||
|
||||
class DiagnosticianAgent(BaseAgent):
|
||||
"""
|
||||
@@ -112,11 +116,23 @@ class DiagnosticianAgent(BaseAgent):
|
||||
"severity": "P3",
|
||||
"signals": [{"alert_name": "evidence_snapshot", "description": _evidence}],
|
||||
"affected_services": [],
|
||||
"intent_hint": "diagnose",
|
||||
}
|
||||
|
||||
from src.services.openclaw import get_openclaw
|
||||
openclaw = get_openclaw()
|
||||
response_text, _provider, success = await openclaw.call(prompt, alert_context=alert_context)
|
||||
try:
|
||||
response_text, _provider, success = await asyncio.wait_for(
|
||||
openclaw.call(prompt, alert_context=alert_context),
|
||||
timeout=PHASE2_STEP_TIMEOUT_SEC,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning(
|
||||
"diagnostician_step_timeout",
|
||||
snapshot_id=snapshot.snapshot_id,
|
||||
timeout_sec=PHASE2_STEP_TIMEOUT_SEC,
|
||||
)
|
||||
return self._degraded_report(snapshot, 0, reason="step_timeout")
|
||||
|
||||
if not success or not response_text:
|
||||
return self._degraded_report(snapshot, 0, reason="llm_failed")
|
||||
|
||||
@@ -19,6 +19,7 @@ ADR-082: Phase 2 多 Agent 協作
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import time
|
||||
from typing import Any
|
||||
@@ -37,6 +38,9 @@ from src.services.sanitization_service import sanitize
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
# Phase 2 單步 LLM timeout(保留 Critic/Coordinator 的全局預算)
|
||||
PHASE2_STEP_TIMEOUT_SEC = 20.0
|
||||
|
||||
|
||||
class SolverAgent(BaseAgent):
|
||||
"""
|
||||
@@ -128,11 +132,23 @@ class SolverAgent(BaseAgent):
|
||||
"severity": "P3",
|
||||
"signals": [{"alert_name": "diagnosis_hypothesis", "description": _hypothesis_text}],
|
||||
"affected_services": [],
|
||||
"intent_hint": "diagnose",
|
||||
}
|
||||
|
||||
from src.services.openclaw import get_openclaw
|
||||
openclaw = get_openclaw()
|
||||
response_text, _provider, success = await openclaw.call(prompt, alert_context=alert_context)
|
||||
try:
|
||||
response_text, _provider, success = await asyncio.wait_for(
|
||||
openclaw.call(prompt, alert_context=alert_context),
|
||||
timeout=PHASE2_STEP_TIMEOUT_SEC,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning(
|
||||
"solver_step_timeout",
|
||||
snapshot_id=diagnosis.evidence_snapshot_id,
|
||||
timeout_sec=PHASE2_STEP_TIMEOUT_SEC,
|
||||
)
|
||||
return self._degraded_plan(diagnosis, 0, "step_timeout")
|
||||
|
||||
if not success or not response_text:
|
||||
return self._degraded_plan(diagnosis, 0, "llm_failed")
|
||||
|
||||
@@ -235,6 +235,7 @@ async def process_signoz_alert(
|
||||
# =================================================================
|
||||
await send_signoz_telegram(
|
||||
approval_id=approval_id,
|
||||
incident_id=incident.incident_id,
|
||||
alert_name=alert_name,
|
||||
labels=labels,
|
||||
annotations=annotations,
|
||||
@@ -349,6 +350,7 @@ async def create_signoz_approval(
|
||||
kubectl_command=command,
|
||||
dry_run_checks=[],
|
||||
requested_by="signoz-webhook",
|
||||
incident_id=incident_id,
|
||||
metadata={
|
||||
"source": "signoz",
|
||||
"alert_name": alert_name,
|
||||
@@ -371,6 +373,7 @@ async def create_signoz_approval(
|
||||
|
||||
async def send_signoz_telegram(
|
||||
approval_id: str,
|
||||
incident_id: str,
|
||||
alert_name: str,
|
||||
labels: dict,
|
||||
annotations: dict,
|
||||
@@ -392,7 +395,6 @@ async def send_signoz_telegram(
|
||||
summary = annotations.get("summary", f"SignOz Alert: {alert_name}")
|
||||
description = annotations.get("description", "")
|
||||
|
||||
# TODO(2026-04-05): SignOz 路徑無 incident_id,待 SignOz→Incident 關聯後補傳
|
||||
await telegram.send_approval_card(
|
||||
approval_id=approval_id,
|
||||
risk_level=analysis_result.risk_level if analysis_result else (
|
||||
@@ -411,6 +413,7 @@ async def send_signoz_telegram(
|
||||
anomaly_frequency=anomaly_frequency,
|
||||
# 2026-04-02 ogt: 修復 ai_provider 未傳遞 → Telegram 顯示「AI 仲裁判定」而非具體模型名稱
|
||||
ai_provider=ai_provider if ai_provider != "none" else "",
|
||||
incident_id=incident_id,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
|
||||
@@ -1300,6 +1300,17 @@ async def _process_new_alert_background(
|
||||
alert_category=alert_category,
|
||||
)
|
||||
|
||||
try:
|
||||
await service.update_incident_id(approval.id, fallback_incident_id)
|
||||
approval.incident_id = fallback_incident_id
|
||||
except Exception as _meta_err:
|
||||
logger.warning(
|
||||
"fallback_approval_incident_id_update_failed",
|
||||
approval_id=str(approval.id),
|
||||
incident_id=fallback_incident_id,
|
||||
error=str(_meta_err),
|
||||
)
|
||||
|
||||
await _push_to_telegram_background(
|
||||
approval_id=str(approval.id),
|
||||
risk_level="medium",
|
||||
|
||||
@@ -292,6 +292,52 @@ class AIRouter:
|
||||
"claude",
|
||||
]
|
||||
|
||||
def _resolve_intent_from_context(
    self,
    context: dict | None,
) -> IntentResult | None:
    """
    Resolve a centrally governed intent hint from ``context``.

    This is an internal fast path of the AI Router: callers pass
    ``context["intent_hint"]`` instead of bypassing the Router rules
    themselves. For example, a Phase 2 agent that already knows its work is
    diagnostic analysis does not need another intent-LLM classification.

    Args:
        context: Optional routing context. Only the ``"intent_hint"`` key is
            read here.

    Returns:
        An ``IntentResult`` built from the hint (confidence 1.0, method
        ``"context_hint"``), or ``None`` when there is no context, the hint
        is missing / ``None`` / blank, or the hint is not a known alias.
    """
    if not context:
        return None

    # An explicit ``None`` value means "no hint". Stringifying it would
    # produce "none", miss the alias map and log a spurious warning.
    hint_value = context.get("intent_hint")
    if hint_value is None:
        return None

    raw_hint = str(hint_value).strip().lower()
    if not raw_hint:
        return None

    alias_map = {
        "restart": IntentType.RESTART,
        "scale": IntentType.SCALE,
        "config": IntentType.CONFIG,
        "diagnose": IntentType.DIAGNOSE,
        "delete": IntentType.DELETE,
        "rollback": IntentType.ROLLBACK,
        "unknown": IntentType.UNKNOWN,
        # legacy aliases
        "alert_triage": IntentType.ALERT_TRIAGE,
        "deployment": IntentType.DEPLOYMENT,
        "query": IntentType.QUERY,
        "maintenance": IntentType.MAINTENANCE,
        "code_review": IntentType.CODE_REVIEW,
    }
    intent = alias_map.get(raw_hint)
    if intent is None:
        # Unknown hint: warn and return None so the caller falls back to
        # the regular classifier.
        logger.warning("ai_router_invalid_intent_hint", intent_hint=raw_hint)
        return None

    return IntentResult(
        intent=intent,
        confidence=1.0,
        method="context_hint",
        matched_keywords=[f"context:{raw_hint}"],
        detected_resources=[],
        reasoning=f"context intent_hint={raw_hint}",
    )
|
||||
|
||||
async def route(
|
||||
self,
|
||||
text: str,
|
||||
@@ -313,7 +359,9 @@ class AIRouter:
|
||||
context = context or {}
|
||||
|
||||
# Step 1: 意圖分類 (返回 IntentResult, 規則引擎 < 10ms)
|
||||
intent_result = await self._intent_classifier.classify(text)
|
||||
intent_result = self._resolve_intent_from_context(context)
|
||||
if intent_result is None:
|
||||
intent_result = await self._intent_classifier.classify(text)
|
||||
intent = normalize_intent(intent_result.intent)
|
||||
|
||||
# Step 2: 複雜度評分 (< 10ms)
|
||||
@@ -529,7 +577,9 @@ class AIRouter:
|
||||
context = context or {}
|
||||
|
||||
# 同步分類 (僅規則引擎, < 10ms)
|
||||
intent_result = self._intent_classifier.classify_sync(text)
|
||||
intent_result = self._resolve_intent_from_context(context)
|
||||
if intent_result is None:
|
||||
intent_result = self._intent_classifier.classify_sync(text)
|
||||
intent = normalize_intent(intent_result.intent)
|
||||
|
||||
# 複雜度評分 (< 10ms)
|
||||
|
||||
@@ -161,6 +161,7 @@ class IncidentApprovalService:
|
||||
requested_by=approval_data.requested_by,
|
||||
expires_at=approval_data.expires_at,
|
||||
extra_metadata=approval_metadata,
|
||||
incident_id=incident_id,
|
||||
fingerprint=incident_data.get("fingerprint"),
|
||||
)
|
||||
uow.session.add(approval_record)
|
||||
@@ -276,6 +277,7 @@ class IncidentApprovalService:
|
||||
"blast_radius": record.blast_radius,
|
||||
"requested_by": record.requested_by,
|
||||
"created_at": record.created_at.isoformat() if record.created_at else None,
|
||||
"incident_id": getattr(record, "incident_id", None),
|
||||
"metadata": record.extra_metadata,
|
||||
})
|
||||
|
||||
@@ -323,7 +325,7 @@ class IncidentApprovalService:
|
||||
approval.resolved_at = datetime.now(UTC)
|
||||
|
||||
# 3. 取得關聯 Incident ID
|
||||
incident_id = (approval.extra_metadata or {}).get("incident_id")
|
||||
incident_id = approval.incident_id or (approval.extra_metadata or {}).get("incident_id")
|
||||
if not incident_id:
|
||||
logger.debug(
|
||||
"no_linked_incident",
|
||||
|
||||
@@ -253,6 +253,7 @@ class ProposalService:
|
||||
blast_radius=blast_radius,
|
||||
dry_run_checks=dry_run_checks,
|
||||
requested_by="OpenClaw AI",
|
||||
incident_id=incident_id,
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
|
||||
@@ -6,6 +6,46 @@
|
||||
|
||||
---
|
||||
|
||||
## 📍 2026-04-24 — Telegram「AI 分析超時」止血 + incident_id 單一真相補強
|
||||
|
||||
### 本次修復
|
||||
- **Phase 2 Agent Timeout**:`Diagnostician / Solver / Critic` 各自新增 `20s` step-level timeout,超時直接走既有 degraded fallback,避免 3 段 LLM 串行一路拖到 `AgentOrchestrator` 全局 `90s`
|
||||
- **AI Router 中央治理**:新增 `intent_hint` 快路徑,讓 Phase 2 internal-agent routing 可在 Router 內集中指定 `diagnose`,不再為同一場辯證重複跑慢速 intent LLM 分類
|
||||
- **Alertmanager fallback 鏈路**:`webhooks.py` 的 LLM fallback 路徑補上 `update_incident_id()`,修正 incident 建立後 approval 不回填的 DB 斷鏈
|
||||
- **incident_id 單一真相補強**:`IncidentApprovalService` 改為 `approval.incident_id` 優先、metadata 僅做 fallback;`ProposalService`、`SignOz webhook` 建 approval 時直接寫入 `incident_id` 欄位;SignOz Telegram 發卡同步帶上 `incident_id`
|
||||
|
||||
### 本地驗證
|
||||
- `python3 -m py_compile` 通過:
|
||||
- `apps/api/src/services/ai_router.py`
|
||||
- `apps/api/src/agents/{diagnostician_agent,solver_agent,critic_agent}.py`
|
||||
- `apps/api/src/api/v1/webhooks.py`
|
||||
- `apps/api/src/services/{incident_approval_service,proposal_service}.py`
|
||||
- `apps/api/src/api/v1/signoz_webhook.py`
|
||||
- `cd apps/api && pytest tests/test_p0_diagnose_routing.py -q` → `4 passed`
|
||||
- `cd apps/api && pytest tests/test_intent_classifier.py -q` → `16 passed, 7 skipped`
|
||||
|
||||
### 殘餘風險
|
||||
- 尚未對 production live DB / logs 做二次驗證,無法在本 session 直接證明 Telegram 超時卡片數已下降
|
||||
- `/api/v1/webhooks/alerts` 舊 approval-only 路徑、Sentry 路徑仍可能產生 `approval_records.incident_id = NULL`,後續需決定是否全面收斂到 Incident-first 流程
|
||||
|
||||
## 📍 2026-04-24 — 12-Agent 新遊戲規則 v1 定版 + 文件治理同步
|
||||
|
||||
### 本次補強
|
||||
- 新增 `docs/12-agent-game-rules.md`:把 12-agent 從審計/設計概念落成日常派工規則
|
||||
- 定義 `12 agents vs 9 skills` 對照、模組責任區、自動派工規則、強制加簽規則、常用組隊模板
|
||||
- 補記 `ADR-095`:新增「日常工作模式(Game Rules v1)」章節,明確 12-agent 不等於 repo 內 9 skills
|
||||
- 更新 `Skill 06`:加入 12-agent 協作治理,規範任務判型 → 主責 agent → 對應 skills 的工作流
|
||||
|
||||
### 治理決策
|
||||
- `12 agents` 定位為任務角色與分工編排
|
||||
- `.agents/skills/*.md` 定位為工程規範與實作守則
|
||||
- 後續工作模式:先用 12-agent 判型與派工,再落到 skills / HARD_RULES / MASTER 執行
|
||||
|
||||
### 相關文件
|
||||
- `docs/12-agent-game-rules.md`
|
||||
- `docs/adr/ADR-095-12agent-sdk-integration.md`
|
||||
- `.agents/skills/06-awoooi-monorepo-master.md`
|
||||
|
||||
## 📍 2026-04-24 — ADR-092 P0+P1+P2.1 全修(commit 7f4088b / 04ff225 / bb5f16f)
|
||||
|
||||
### P2.1 修復(commit bb5f16f)
|
||||
|
||||
Reference in New Issue
Block a user