From 0d81b28b1bf397e30915b62da48a4f99b24d843e Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 24 Apr 2026 23:53:56 +0800 Subject: [PATCH] fix(aiops): bound phase2 timeout and repair incident links --- apps/api/src/agents/critic_agent.py | 29 +++++++++- apps/api/src/agents/diagnostician_agent.py | 18 ++++++- apps/api/src/agents/solver_agent.py | 18 ++++++- apps/api/src/api/v1/signoz_webhook.py | 5 +- apps/api/src/api/v1/webhooks.py | 11 ++++ apps/api/src/services/ai_router.py | 54 ++++++++++++++++++- .../src/services/incident_approval_service.py | 4 +- apps/api/src/services/proposal_service.py | 1 + docs/LOGBOOK.md | 40 ++++++++++++++ 9 files changed, 173 insertions(+), 7 deletions(-) diff --git a/apps/api/src/agents/critic_agent.py b/apps/api/src/agents/critic_agent.py index 7529e08e..e2870f14 100644 --- a/apps/api/src/agents/critic_agent.py +++ b/apps/api/src/agents/critic_agent.py @@ -20,6 +20,7 @@ ADR-082: Phase 2 多 Agent 協作 from __future__ import annotations +import asyncio import hashlib import time from typing import Any @@ -42,6 +43,9 @@ logger = structlog.get_logger(__name__) # Critic 挑戰數量上限(防止 LLM 生成無限質疑) MAX_CHALLENGES = 5 +# Phase 2 單步 LLM timeout(避免 Critic 拖垮整場辯證) +PHASE2_STEP_TIMEOUT_SEC = 20.0 + class CriticAgent(BaseAgent): """ @@ -109,9 +113,32 @@ class CriticAgent(BaseAgent): "confidence": top_hypothesis.confidence if top_hypothesis else 0.0, }) + _critic_signal = ( + f"hypothesis={top_hypothesis.description[:300] if top_hypothesis else 'none'}; " + f"action={top_candidate.action[:300] if top_candidate else 'none'}" + ) + alert_context = { + "incident_id": diagnosis.evidence_snapshot_id or "UNKNOWN", + "severity": "P3", + "signals": [{"alert_name": "critic_review", "description": _critic_signal}], + "affected_services": [], + "intent_hint": "diagnose", + } + from src.services.openclaw import get_openclaw openclaw = get_openclaw() - response_text, _provider, success = await openclaw.call(prompt) + try: + response_text, _provider, success = await 
asyncio.wait_for( + openclaw.call(prompt, alert_context=alert_context), + timeout=PHASE2_STEP_TIMEOUT_SEC, + ) + except asyncio.TimeoutError: + logger.warning( + "critic_step_timeout", + snapshot_id=diagnosis.evidence_snapshot_id, + timeout_sec=PHASE2_STEP_TIMEOUT_SEC, + ) + return self._degraded_report(0, "step_timeout") if not success or not response_text: return self._degraded_report(0, "llm_failed") diff --git a/apps/api/src/agents/diagnostician_agent.py b/apps/api/src/agents/diagnostician_agent.py index 28990cd1..75d8fa71 100644 --- a/apps/api/src/agents/diagnostician_agent.py +++ b/apps/api/src/agents/diagnostician_agent.py @@ -18,6 +18,7 @@ ADR-082: Phase 2 多 Agent 協作 from __future__ import annotations +import asyncio import hashlib import json import time @@ -45,6 +46,9 @@ MAX_EVIDENCE_CHAIN = 5 # Confidence 閾值 — 低於此值 vote = ABSTAIN ABSTAIN_CONFIDENCE_THRESHOLD = 0.4 +# Phase 2 單步 LLM timeout(防單一 Agent 吃光 90s 全局預算) +PHASE2_STEP_TIMEOUT_SEC = 20.0 + class DiagnosticianAgent(BaseAgent): """ @@ -112,11 +116,23 @@ class DiagnosticianAgent(BaseAgent): "severity": "P3", "signals": [{"alert_name": "evidence_snapshot", "description": _evidence}], "affected_services": [], + "intent_hint": "diagnose", } from src.services.openclaw import get_openclaw openclaw = get_openclaw() - response_text, _provider, success = await openclaw.call(prompt, alert_context=alert_context) + try: + response_text, _provider, success = await asyncio.wait_for( + openclaw.call(prompt, alert_context=alert_context), + timeout=PHASE2_STEP_TIMEOUT_SEC, + ) + except asyncio.TimeoutError: + logger.warning( + "diagnostician_step_timeout", + snapshot_id=snapshot.snapshot_id, + timeout_sec=PHASE2_STEP_TIMEOUT_SEC, + ) + return self._degraded_report(snapshot, 0, reason="step_timeout") if not success or not response_text: return self._degraded_report(snapshot, 0, reason="llm_failed") diff --git a/apps/api/src/agents/solver_agent.py b/apps/api/src/agents/solver_agent.py index e342ab82..fb0f25bd 100644 
--- a/apps/api/src/agents/solver_agent.py +++ b/apps/api/src/agents/solver_agent.py @@ -19,6 +19,7 @@ ADR-082: Phase 2 多 Agent 協作 from __future__ import annotations +import asyncio import hashlib import time from typing import Any @@ -37,6 +38,9 @@ from src.services.sanitization_service import sanitize logger = structlog.get_logger(__name__) +# Phase 2 單步 LLM timeout(保留 Critic/Coordinator 的全局預算) +PHASE2_STEP_TIMEOUT_SEC = 20.0 + class SolverAgent(BaseAgent): """ @@ -128,11 +132,23 @@ class SolverAgent(BaseAgent): "severity": "P3", "signals": [{"alert_name": "diagnosis_hypothesis", "description": _hypothesis_text}], "affected_services": [], + "intent_hint": "diagnose", } from src.services.openclaw import get_openclaw openclaw = get_openclaw() - response_text, _provider, success = await openclaw.call(prompt, alert_context=alert_context) + try: + response_text, _provider, success = await asyncio.wait_for( + openclaw.call(prompt, alert_context=alert_context), + timeout=PHASE2_STEP_TIMEOUT_SEC, + ) + except asyncio.TimeoutError: + logger.warning( + "solver_step_timeout", + snapshot_id=diagnosis.evidence_snapshot_id, + timeout_sec=PHASE2_STEP_TIMEOUT_SEC, + ) + return self._degraded_plan(diagnosis, 0, "step_timeout") if not success or not response_text: return self._degraded_plan(diagnosis, 0, "llm_failed") diff --git a/apps/api/src/api/v1/signoz_webhook.py b/apps/api/src/api/v1/signoz_webhook.py index 592a9b00..44152a0f 100644 --- a/apps/api/src/api/v1/signoz_webhook.py +++ b/apps/api/src/api/v1/signoz_webhook.py @@ -235,6 +235,7 @@ async def process_signoz_alert( # ================================================================= await send_signoz_telegram( approval_id=approval_id, + incident_id=incident.incident_id, alert_name=alert_name, labels=labels, annotations=annotations, @@ -349,6 +350,7 @@ async def create_signoz_approval( kubectl_command=command, dry_run_checks=[], requested_by="signoz-webhook", + incident_id=incident_id, metadata={ "source": "signoz", 
"alert_name": alert_name, @@ -371,6 +373,7 @@ async def create_signoz_approval( async def send_signoz_telegram( approval_id: str, + incident_id: str, alert_name: str, labels: dict, annotations: dict, @@ -392,7 +395,6 @@ async def send_signoz_telegram( summary = annotations.get("summary", f"SignOz Alert: {alert_name}") description = annotations.get("description", "") - # TODO(2026-04-05): SignOz 路徑無 incident_id,待 SignOz→Incident 關聯後補傳 await telegram.send_approval_card( approval_id=approval_id, risk_level=analysis_result.risk_level if analysis_result else ( @@ -411,6 +413,7 @@ async def send_signoz_telegram( anomaly_frequency=anomaly_frequency, # 2026-04-02 ogt: 修復 ai_provider 未傳遞 → Telegram 顯示「AI 仲裁判定」而非具體模型名稱 ai_provider=ai_provider if ai_provider != "none" else "", + incident_id=incident_id, ) logger.info( diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index 15511dea..2b93172d 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -1300,6 +1300,17 @@ async def _process_new_alert_background( alert_category=alert_category, ) + try: + await service.update_incident_id(approval.id, fallback_incident_id) + approval.incident_id = fallback_incident_id + except Exception as _meta_err: + logger.warning( + "fallback_approval_incident_id_update_failed", + approval_id=str(approval.id), + incident_id=fallback_incident_id, + error=str(_meta_err), + ) + await _push_to_telegram_background( approval_id=str(approval.id), risk_level="medium", diff --git a/apps/api/src/services/ai_router.py b/apps/api/src/services/ai_router.py index 2001f00c..cce0718a 100644 --- a/apps/api/src/services/ai_router.py +++ b/apps/api/src/services/ai_router.py @@ -292,6 +292,52 @@ class AIRouter: "claude", ] + def _resolve_intent_from_context( + self, + context: dict | None, + ) -> IntentResult | None: + """ + 從 context 解析集中治理的 intent hint。 + + 僅作為 AI Router 的內部快路徑,避免呼叫端自行繞過 Router 規則。 + 例如 Phase 2 agent 已知屬於診斷分析,就不必再多跑一次 intent LLM。 + """ + if 
not context: + return None + + raw_hint = str(context.get("intent_hint", "")).strip().lower() + if not raw_hint: + return None + + alias_map = { + "restart": IntentType.RESTART, + "scale": IntentType.SCALE, + "config": IntentType.CONFIG, + "diagnose": IntentType.DIAGNOSE, + "delete": IntentType.DELETE, + "rollback": IntentType.ROLLBACK, + "unknown": IntentType.UNKNOWN, + # legacy aliases + "alert_triage": IntentType.ALERT_TRIAGE, + "deployment": IntentType.DEPLOYMENT, + "query": IntentType.QUERY, + "maintenance": IntentType.MAINTENANCE, + "code_review": IntentType.CODE_REVIEW, + } + intent = alias_map.get(raw_hint) + if intent is None: + logger.warning("ai_router_invalid_intent_hint", intent_hint=raw_hint) + return None + + return IntentResult( + intent=intent, + confidence=1.0, + method="context_hint", + matched_keywords=[f"context:{raw_hint}"], + detected_resources=[], + reasoning=f"context intent_hint={raw_hint}", + ) + async def route( self, text: str, @@ -313,7 +359,9 @@ class AIRouter: context = context or {} # Step 1: 意圖分類 (返回 IntentResult, 規則引擎 < 10ms) - intent_result = await self._intent_classifier.classify(text) + intent_result = self._resolve_intent_from_context(context) + if intent_result is None: + intent_result = await self._intent_classifier.classify(text) intent = normalize_intent(intent_result.intent) # Step 2: 複雜度評分 (< 10ms) @@ -529,7 +577,9 @@ class AIRouter: context = context or {} # 同步分類 (僅規則引擎, < 10ms) - intent_result = self._intent_classifier.classify_sync(text) + intent_result = self._resolve_intent_from_context(context) + if intent_result is None: + intent_result = self._intent_classifier.classify_sync(text) intent = normalize_intent(intent_result.intent) # 複雜度評分 (< 10ms) diff --git a/apps/api/src/services/incident_approval_service.py b/apps/api/src/services/incident_approval_service.py index bf31ff71..942cc5f8 100644 --- a/apps/api/src/services/incident_approval_service.py +++ b/apps/api/src/services/incident_approval_service.py @@ -161,6 
+161,7 @@ class IncidentApprovalService: requested_by=approval_data.requested_by, expires_at=approval_data.expires_at, extra_metadata=approval_metadata, + incident_id=incident_id, fingerprint=incident_data.get("fingerprint"), ) uow.session.add(approval_record) @@ -276,6 +277,7 @@ class IncidentApprovalService: "blast_radius": record.blast_radius, "requested_by": record.requested_by, "created_at": record.created_at.isoformat() if record.created_at else None, + "incident_id": getattr(record, "incident_id", None), "metadata": record.extra_metadata, }) @@ -323,7 +325,7 @@ class IncidentApprovalService: approval.resolved_at = datetime.now(UTC) # 3. 取得關聯 Incident ID - incident_id = (approval.extra_metadata or {}).get("incident_id") + incident_id = approval.incident_id or (approval.extra_metadata or {}).get("incident_id") if not incident_id: logger.debug( "no_linked_incident", diff --git a/apps/api/src/services/proposal_service.py b/apps/api/src/services/proposal_service.py index 897b2a75..bc661d83 100644 --- a/apps/api/src/services/proposal_service.py +++ b/apps/api/src/services/proposal_service.py @@ -253,6 +253,7 @@ class ProposalService: blast_radius=blast_radius, dry_run_checks=dry_run_checks, requested_by="OpenClaw AI", + incident_id=incident_id, metadata=metadata, ) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 63ba25b9..1d938a03 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -6,6 +6,46 @@ --- +## 📍 2026-04-24 — Telegram「AI 分析超時」止血 + incident_id 單一真相補強 + +### 本次修復 +- **Phase 2 Agent Timeout**:`Diagnostician / Solver / Critic` 各自新增 `20s` step-level timeout,超時直接走既有 degraded fallback,避免 3 段 LLM 串行一路拖到 `AgentOrchestrator` 全局 `90s` +- **AI Router 中央治理**:新增 `intent_hint` 快路徑,讓 Phase 2 internal-agent routing 可在 Router 內集中指定 `diagnose`,不再為同一場辯證重複跑慢速 intent LLM 分類 +- **Alertmanager fallback 鏈路**:`webhooks.py` 的 LLM fallback 路徑補上 `update_incident_id()`,修正 incident 建立後 approval 不回填的 DB 斷鏈 +- **incident_id 單一真相補強**:`IncidentApprovalService` 改為 
`approval.incident_id` 優先、metadata 僅做 fallback;`ProposalService`、`SignOz webhook` 建 approval 時直接寫入 `incident_id` 欄位;SignOz Telegram 發卡同步帶上 `incident_id` + +### 本地驗證 +- `python3 -m py_compile` 通過: + - `apps/api/src/services/ai_router.py` + - `apps/api/src/agents/{diagnostician_agent,solver_agent,critic_agent}.py` + - `apps/api/src/api/v1/webhooks.py` + - `apps/api/src/services/{incident_approval_service,proposal_service}.py` + - `apps/api/src/api/v1/signoz_webhook.py` +- `cd apps/api && pytest tests/test_p0_diagnose_routing.py -q` → `4 passed` +- `cd apps/api && pytest tests/test_intent_classifier.py -q` → `16 passed, 7 skipped` + +### 殘餘風險 +- 尚未對 production live DB / logs 做二次驗證,無法在本 session 直接證明 Telegram 超時卡片數已下降 +- `/api/v1/webhooks/alerts` 舊 approval-only 路徑、Sentry 路徑仍可能產生 `approval_records.incident_id = NULL`,後續需決定是否全面收斂到 Incident-first 流程 + +## 📍 2026-04-24 — 12-Agent 新遊戲規則 v1 定版 + 文件治理同步 + +### 本次補強 +- 新增 `docs/12-agent-game-rules.md`:把 12-agent 從審計/設計概念落成日常派工規則 +- 定義 `12 agents vs 9 skills` 對照、模組責任區、自動派工規則、強制加簽規則、常用組隊模板 +- 補記 `ADR-095`:新增「日常工作模式(Game Rules v1)」章節,明確 12-agent 不等於 repo 內 9 skills +- 更新 `Skill 06`:加入 12-agent 協作治理,規範任務判型 → 主責 agent → 對應 skills 的工作流 + +### 治理決策 +- `12 agents` 定位為任務角色與分工編排 +- `.agents/skills/*.md` 定位為工程規範與實作守則 +- 後續工作模式:先用 12-agent 判型與派工,再落到 skills / HARD_RULES / MASTER 執行 + +### 相關文件 +- `docs/12-agent-game-rules.md` +- `docs/adr/ADR-095-12agent-sdk-integration.md` +- `.agents/skills/06-awoooi-monorepo-master.md` + ## 📍 2026-04-24 — ADR-092 P0+P1+P2.1 全修(commit 7f4088b / 04ff225 / bb5f16f) ### P2.1 修復(commit bb5f16f)