fix(aiops): bound phase2 timeout and repair incident links

2026-04-24 23:53:56 +08:00
parent ad494288cb
commit 0d81b28b1b
9 changed files with 173 additions and 7 deletions
--- a/apps/api/src/agents/critic_agent.py
+++ b/apps/api/src/agents/critic_agent.py
@@ -20,6 +20,7 @@ ADR-082: Phase 2 多 Agent 協作

 from __future__ import annotations

+import asyncio
 import hashlib
 import time
 from typing import Any
@@ -42,6 +43,9 @@ logger = structlog.get_logger(__name__)
 # Critic 挑戰數量上限（防止 LLM 生成無限質疑）
 MAX_CHALLENGES = 5

+# Phase 2 單步 LLM timeout（避免 Critic 拖垮整場辯證）
+PHASE2_STEP_TIMEOUT_SEC = 20.0
+

 class CriticAgent(BaseAgent):
    """
@@ -109,9 +113,32 @@ class CriticAgent(BaseAgent):
            "confidence": top_hypothesis.confidence if top_hypothesis else 0.0,
        })

+        _critic_signal = (
+            f"hypothesis={top_hypothesis.description[:300] if top_hypothesis else 'none'}; "
+            f"action={top_candidate.action[:300] if top_candidate else 'none'}"
+        )
+        alert_context = {
+            "incident_id": diagnosis.evidence_snapshot_id or "UNKNOWN",
+            "severity": "P3",
+            "signals": [{"alert_name": "critic_review", "description": _critic_signal}],
+            "affected_services": [],
+            "intent_hint": "diagnose",
+        }
+
        from src.services.openclaw import get_openclaw
        openclaw = get_openclaw()
-        response_text, _provider, success = await openclaw.call(prompt)
+        try:
+            response_text, _provider, success = await asyncio.wait_for(
+                openclaw.call(prompt, alert_context=alert_context),
+                timeout=PHASE2_STEP_TIMEOUT_SEC,
+            )
+        except asyncio.TimeoutError:
+            logger.warning(
+                "critic_step_timeout",
+                snapshot_id=diagnosis.evidence_snapshot_id,
+                timeout_sec=PHASE2_STEP_TIMEOUT_SEC,
+            )
+            return self._degraded_report(0, "step_timeout")

        if not success or not response_text:
            return self._degraded_report(0, "llm_failed")
--- a/apps/api/src/agents/diagnostician_agent.py
+++ b/apps/api/src/agents/diagnostician_agent.py
@@ -18,6 +18,7 @@ ADR-082: Phase 2 多 Agent 協作

 from __future__ import annotations

+import asyncio
 import hashlib
 import json
 import time
@@ -45,6 +46,9 @@ MAX_EVIDENCE_CHAIN = 5
 # Confidence 閾值 — 低於此值 vote = ABSTAIN
 ABSTAIN_CONFIDENCE_THRESHOLD = 0.4

+# Phase 2 單步 LLM timeout（防單一 Agent 吃光 90s 全局預算）
+PHASE2_STEP_TIMEOUT_SEC = 20.0
+

 class DiagnosticianAgent(BaseAgent):
    """
@@ -112,11 +116,23 @@ class DiagnosticianAgent(BaseAgent):
            "severity": "P3",
            "signals": [{"alert_name": "evidence_snapshot", "description": _evidence}],
            "affected_services": [],
+            "intent_hint": "diagnose",
        }

        from src.services.openclaw import get_openclaw
        openclaw = get_openclaw()
-        response_text, _provider, success = await openclaw.call(prompt, alert_context=alert_context)
+        try:
+            response_text, _provider, success = await asyncio.wait_for(
+                openclaw.call(prompt, alert_context=alert_context),
+                timeout=PHASE2_STEP_TIMEOUT_SEC,
+            )
+        except asyncio.TimeoutError:
+            logger.warning(
+                "diagnostician_step_timeout",
+                snapshot_id=snapshot.snapshot_id,
+                timeout_sec=PHASE2_STEP_TIMEOUT_SEC,
+            )
+            return self._degraded_report(snapshot, 0, reason="step_timeout")

        if not success or not response_text:
            return self._degraded_report(snapshot, 0, reason="llm_failed")
--- a/apps/api/src/agents/solver_agent.py
+++ b/apps/api/src/agents/solver_agent.py
@@ -19,6 +19,7 @@ ADR-082: Phase 2 多 Agent 協作

 from __future__ import annotations

+import asyncio
 import hashlib
 import time
 from typing import Any
@@ -37,6 +38,9 @@ from src.services.sanitization_service import sanitize

 logger = structlog.get_logger(__name__)

+# Phase 2 單步 LLM timeout（保留 Critic/Coordinator 的全局預算）
+PHASE2_STEP_TIMEOUT_SEC = 20.0
+

 class SolverAgent(BaseAgent):
    """
@@ -128,11 +132,23 @@ class SolverAgent(BaseAgent):
            "severity": "P3",
            "signals": [{"alert_name": "diagnosis_hypothesis", "description": _hypothesis_text}],
            "affected_services": [],
+            "intent_hint": "diagnose",
        }

        from src.services.openclaw import get_openclaw
        openclaw = get_openclaw()
-        response_text, _provider, success = await openclaw.call(prompt, alert_context=alert_context)
+        try:
+            response_text, _provider, success = await asyncio.wait_for(
+                openclaw.call(prompt, alert_context=alert_context),
+                timeout=PHASE2_STEP_TIMEOUT_SEC,
+            )
+        except asyncio.TimeoutError:
+            logger.warning(
+                "solver_step_timeout",
+                snapshot_id=diagnosis.evidence_snapshot_id,
+                timeout_sec=PHASE2_STEP_TIMEOUT_SEC,
+            )
+            return self._degraded_plan(diagnosis, 0, "step_timeout")

        if not success or not response_text:
            return self._degraded_plan(diagnosis, 0, "llm_failed")
--- a/apps/api/src/api/v1/signoz_webhook.py
+++ b/apps/api/src/api/v1/signoz_webhook.py
@@ -235,6 +235,7 @@ async def process_signoz_alert(
        # =================================================================
        await send_signoz_telegram(
            approval_id=approval_id,
+            incident_id=incident.incident_id,
            alert_name=alert_name,
            labels=labels,
            annotations=annotations,
@@ -349,6 +350,7 @@ async def create_signoz_approval(
            kubectl_command=command,
            dry_run_checks=[],
            requested_by="signoz-webhook",
+            incident_id=incident_id,
            metadata={
                "source": "signoz",
                "alert_name": alert_name,
@@ -371,6 +373,7 @@ async def create_signoz_approval(

 async def send_signoz_telegram(
    approval_id: str,
+    incident_id: str,
    alert_name: str,
    labels: dict,
    annotations: dict,
@@ -392,7 +395,6 @@ async def send_signoz_telegram(
        summary = annotations.get("summary", f"SignOz Alert: {alert_name}")
        description = annotations.get("description", "")

-        # TODO(2026-04-05): SignOz 路徑無 incident_id，待 SignOz→Incident 關聯後補傳
        await telegram.send_approval_card(
            approval_id=approval_id,
            risk_level=analysis_result.risk_level if analysis_result else (
@@ -411,6 +413,7 @@ async def send_signoz_telegram(
            anomaly_frequency=anomaly_frequency,
            # 2026-04-02 ogt: 修復 ai_provider 未傳遞 → Telegram 顯示「AI 仲裁判定」而非具體模型名稱
            ai_provider=ai_provider if ai_provider != "none" else "",
+            incident_id=incident_id,
        )

        logger.info(
--- a/apps/api/src/api/v1/webhooks.py
+++ b/apps/api/src/api/v1/webhooks.py
@@ -1300,6 +1300,17 @@ async def _process_new_alert_background(
                alert_category=alert_category,
            )

+            try:
+                await service.update_incident_id(approval.id, fallback_incident_id)
+                approval.incident_id = fallback_incident_id
+            except Exception as _meta_err:
+                logger.warning(
+                    "fallback_approval_incident_id_update_failed",
+                    approval_id=str(approval.id),
+                    incident_id=fallback_incident_id,
+                    error=str(_meta_err),
+                )
+
            await _push_to_telegram_background(
                approval_id=str(approval.id),
                risk_level="medium",
--- a/apps/api/src/services/ai_router.py
+++ b/apps/api/src/services/ai_router.py
@@ -292,6 +292,52 @@ class AIRouter:
            "claude",
        ]

+    def _resolve_intent_from_context(
+        self,
+        context: dict | None,
+    ) -> IntentResult | None:
+        """
+        Resolve the centrally governed ``intent_hint`` in context to an IntentResult.
+
+        Router-internal fast path (e.g. Phase 2 agents already known to be diagnosing).
+        Returns None if context/hint is empty or the hint is unknown (warning logged).
+        """
+        if not context:
+            return None
+        # `or ""`: an explicit None hint must not stringify to "none" and warn.
+        raw_hint = str(context.get("intent_hint") or "").strip().lower()
+        if not raw_hint:
+            return None
+
+        alias_map = {
+            "restart": IntentType.RESTART,
+            "scale": IntentType.SCALE,
+            "config": IntentType.CONFIG,
+            "diagnose": IntentType.DIAGNOSE,
+            "delete": IntentType.DELETE,
+            "rollback": IntentType.ROLLBACK,
+            "unknown": IntentType.UNKNOWN,
+            # legacy aliases
+            "alert_triage": IntentType.ALERT_TRIAGE,
+            "deployment": IntentType.DEPLOYMENT,
+            "query": IntentType.QUERY,
+            "maintenance": IntentType.MAINTENANCE,
+            "code_review": IntentType.CODE_REVIEW,
+        }
+        intent = alias_map.get(raw_hint)
+        if intent is None:
+            logger.warning("ai_router_invalid_intent_hint", intent_hint=raw_hint)
+            return None
+
+        return IntentResult(
+            intent=intent,
+            confidence=1.0,
+            method="context_hint",
+            matched_keywords=[f"context:{raw_hint}"],
+            detected_resources=[],
+            reasoning=f"context intent_hint={raw_hint}",
+        )
+
    async def route(
        self,
        text: str,
@@ -313,7 +359,9 @@ class AIRouter:
        context = context or {}

        # Step 1: 意圖分類 (返回 IntentResult, 規則引擎 < 10ms)
-        intent_result = await self._intent_classifier.classify(text)
+        intent_result = self._resolve_intent_from_context(context)
+        if intent_result is None:
+            intent_result = await self._intent_classifier.classify(text)
        intent = normalize_intent(intent_result.intent)

        # Step 2: 複雜度評分 (< 10ms)
@@ -529,7 +577,9 @@ class AIRouter:
        context = context or {}

        # 同步分類 (僅規則引擎, < 10ms)
-        intent_result = self._intent_classifier.classify_sync(text)
+        intent_result = self._resolve_intent_from_context(context)
+        if intent_result is None:
+            intent_result = self._intent_classifier.classify_sync(text)
        intent = normalize_intent(intent_result.intent)

        # 複雜度評分 (< 10ms)
--- a/apps/api/src/services/incident_approval_service.py
+++ b/apps/api/src/services/incident_approval_service.py
@@ -161,6 +161,7 @@ class IncidentApprovalService:
                    requested_by=approval_data.requested_by,
                    expires_at=approval_data.expires_at,
                    extra_metadata=approval_metadata,
+                    incident_id=incident_id,
                    fingerprint=incident_data.get("fingerprint"),
                )
                uow.session.add(approval_record)
@@ -276,6 +277,7 @@ class IncidentApprovalService:
            "blast_radius": record.blast_radius,
            "requested_by": record.requested_by,
            "created_at": record.created_at.isoformat() if record.created_at else None,
+            "incident_id": getattr(record, "incident_id", None),
            "metadata": record.extra_metadata,
        })

@@ -323,7 +325,7 @@ class IncidentApprovalService:
                approval.resolved_at = datetime.now(UTC)

            # 3. 取得關聯 Incident ID
-            incident_id = (approval.extra_metadata or {}).get("incident_id")
+            incident_id = approval.incident_id or (approval.extra_metadata or {}).get("incident_id")
            if not incident_id:
                logger.debug(
                    "no_linked_incident",
--- a/apps/api/src/services/proposal_service.py
+++ b/apps/api/src/services/proposal_service.py
@@ -253,6 +253,7 @@ class ProposalService:
                blast_radius=blast_radius,
                dry_run_checks=dry_run_checks,
                requested_by="OpenClaw AI",
+                incident_id=incident_id,
                metadata=metadata,
            )

--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -6,6 +6,46 @@

 ---

+## 📍 2026-04-24 — Telegram「AI 分析超時」止血 + incident_id 單一真相補強
+
+### 本次修復
+- **Phase 2 Agent Timeout**：`Diagnostician / Solver / Critic` 各自新增 `20s` step-level timeout，超時直接走既有 degraded fallback，避免 3 段 LLM 串行一路拖到 `AgentOrchestrator` 全局 `90s`
+- **AI Router 中央治理**：新增 `intent_hint` 快路徑，讓 Phase 2 internal-agent routing 可在 Router 內集中指定 `diagnose`，不再為同一場辯證重複跑慢速 intent LLM 分類
+- **Alertmanager fallback 鏈路**：`webhooks.py` 的 LLM fallback 路徑補上 `update_incident_id()`，修正 incident 建立後 approval 不回填的 DB 斷鏈
+- **incident_id 單一真相補強**：`IncidentApprovalService` 改為 `approval.incident_id` 優先、metadata 僅做 fallback；`ProposalService`、`SignOz webhook` 建 approval 時直接寫入 `incident_id` 欄位；SignOz Telegram 發卡同步帶上 `incident_id`
+
+### 本地驗證
+- `python3 -m py_compile` 通過：
+  - `apps/api/src/services/ai_router.py`
+  - `apps/api/src/agents/{diagnostician_agent,solver_agent,critic_agent}.py`
+  - `apps/api/src/api/v1/webhooks.py`
+  - `apps/api/src/services/{incident_approval_service,proposal_service}.py`
+  - `apps/api/src/api/v1/signoz_webhook.py`
+- `cd apps/api && pytest tests/test_p0_diagnose_routing.py -q` → `4 passed`
+- `cd apps/api && pytest tests/test_intent_classifier.py -q` → `16 passed, 7 skipped`
+
+### 殘餘風險
+- 尚未對 production live DB / logs 做二次驗證，無法在本 session 直接證明 Telegram 超時卡片數已下降
+- `/api/v1/webhooks/alerts` 舊 approval-only 路徑、Sentry 路徑仍可能產生 `approval_records.incident_id = NULL`，後續需決定是否全面收斂到 Incident-first 流程
+
+## 📍 2026-04-24 — 12-Agent 新遊戲規則 v1 定版 + 文件治理同步
+
+### 本次補強
+- 新增 [`docs/12-agent-game-rules.md`](12-agent-game-rules.md)：把 12-agent 從審計/設計概念落成日常派工規則
+- 定義 `12 agents vs 9 skills` 對照、模組責任區、自動派工規則、強制加簽規則、常用組隊模板
+- 補記 `ADR-095`：新增「日常工作模式（Game Rules v1）」章節，明確 12-agent 不等於 repo 內 9 skills
+- 更新 `Skill 06`：加入 12-agent 協作治理，規範任務判型 → 主責 agent → 對應 skills 的工作流
+
+### 治理決策
+- `12 agents` 定位為任務角色與分工編排
+- `.agents/skills/*.md` 定位為工程規範與實作守則
+- 後續工作模式：先用 12-agent 判型與派工，再落到 skills / HARD_RULES / MASTER 執行
+
+### 相關文件
+- `docs/12-agent-game-rules.md`
+- `docs/adr/ADR-095-12agent-sdk-integration.md`
+- `.agents/skills/06-awoooi-monorepo-master.md`
+
 ## 📍 2026-04-24 — ADR-092 P0+P1+P2.1 全修（commit 7f4088b / 04ff225 / bb5f16f）

 ### P2.1 修復（commit bb5f16f）