fix(aiops): bound phase2 timeout and repair incident links
All checks were successful
E2E Health Check / e2e-health (push) Successful in 52s
CD Pipeline / build-and-deploy (push) Successful in 9m24s

This commit is contained in:
Your Name
2026-04-24 23:53:56 +08:00
parent ad494288cb
commit 0d81b28b1b
9 changed files with 173 additions and 7 deletions

View File

@@ -20,6 +20,7 @@ ADR-082: Phase 2 多 Agent 協作
from __future__ import annotations
import asyncio
import hashlib
import time
from typing import Any
@@ -42,6 +43,9 @@ logger = structlog.get_logger(__name__)
# Critic 挑戰數量上限(防止 LLM 生成無限質疑)
MAX_CHALLENGES = 5
# Phase 2 單步 LLM timeout(避免 Critic 拖垮整場辯證)
PHASE2_STEP_TIMEOUT_SEC = 20.0
class CriticAgent(BaseAgent):
"""
@@ -109,9 +113,32 @@ class CriticAgent(BaseAgent):
"confidence": top_hypothesis.confidence if top_hypothesis else 0.0,
})
_critic_signal = (
f"hypothesis={top_hypothesis.description[:300] if top_hypothesis else 'none'}; "
f"action={top_candidate.action[:300] if top_candidate else 'none'}"
)
alert_context = {
"incident_id": diagnosis.evidence_snapshot_id or "UNKNOWN",
"severity": "P3",
"signals": [{"alert_name": "critic_review", "description": _critic_signal}],
"affected_services": [],
"intent_hint": "diagnose",
}
from src.services.openclaw import get_openclaw
openclaw = get_openclaw()
response_text, _provider, success = await openclaw.call(prompt)
try:
response_text, _provider, success = await asyncio.wait_for(
openclaw.call(prompt, alert_context=alert_context),
timeout=PHASE2_STEP_TIMEOUT_SEC,
)
except asyncio.TimeoutError:
logger.warning(
"critic_step_timeout",
snapshot_id=diagnosis.evidence_snapshot_id,
timeout_sec=PHASE2_STEP_TIMEOUT_SEC,
)
return self._degraded_report(0, "step_timeout")
if not success or not response_text:
return self._degraded_report(0, "llm_failed")

View File

@@ -18,6 +18,7 @@ ADR-082: Phase 2 多 Agent 協作
from __future__ import annotations
import asyncio
import hashlib
import json
import time
@@ -45,6 +46,9 @@ MAX_EVIDENCE_CHAIN = 5
# Confidence 閾值 — 低於此值 vote = ABSTAIN
ABSTAIN_CONFIDENCE_THRESHOLD = 0.4
# Phase 2 單步 LLM timeout(防單一 Agent 吃光 90s 全局預算)
PHASE2_STEP_TIMEOUT_SEC = 20.0
class DiagnosticianAgent(BaseAgent):
"""
@@ -112,11 +116,23 @@ class DiagnosticianAgent(BaseAgent):
"severity": "P3",
"signals": [{"alert_name": "evidence_snapshot", "description": _evidence}],
"affected_services": [],
"intent_hint": "diagnose",
}
from src.services.openclaw import get_openclaw
openclaw = get_openclaw()
response_text, _provider, success = await openclaw.call(prompt, alert_context=alert_context)
try:
response_text, _provider, success = await asyncio.wait_for(
openclaw.call(prompt, alert_context=alert_context),
timeout=PHASE2_STEP_TIMEOUT_SEC,
)
except asyncio.TimeoutError:
logger.warning(
"diagnostician_step_timeout",
snapshot_id=snapshot.snapshot_id,
timeout_sec=PHASE2_STEP_TIMEOUT_SEC,
)
return self._degraded_report(snapshot, 0, reason="step_timeout")
if not success or not response_text:
return self._degraded_report(snapshot, 0, reason="llm_failed")

View File

@@ -19,6 +19,7 @@ ADR-082: Phase 2 多 Agent 協作
from __future__ import annotations
import asyncio
import hashlib
import time
from typing import Any
@@ -37,6 +38,9 @@ from src.services.sanitization_service import sanitize
logger = structlog.get_logger(__name__)
# Phase 2 單步 LLM timeout(保留 Critic/Coordinator 的全局預算)
PHASE2_STEP_TIMEOUT_SEC = 20.0
class SolverAgent(BaseAgent):
"""
@@ -128,11 +132,23 @@ class SolverAgent(BaseAgent):
"severity": "P3",
"signals": [{"alert_name": "diagnosis_hypothesis", "description": _hypothesis_text}],
"affected_services": [],
"intent_hint": "diagnose",
}
from src.services.openclaw import get_openclaw
openclaw = get_openclaw()
response_text, _provider, success = await openclaw.call(prompt, alert_context=alert_context)
try:
response_text, _provider, success = await asyncio.wait_for(
openclaw.call(prompt, alert_context=alert_context),
timeout=PHASE2_STEP_TIMEOUT_SEC,
)
except asyncio.TimeoutError:
logger.warning(
"solver_step_timeout",
snapshot_id=diagnosis.evidence_snapshot_id,
timeout_sec=PHASE2_STEP_TIMEOUT_SEC,
)
return self._degraded_plan(diagnosis, 0, "step_timeout")
if not success or not response_text:
return self._degraded_plan(diagnosis, 0, "llm_failed")

View File

@@ -235,6 +235,7 @@ async def process_signoz_alert(
# =================================================================
await send_signoz_telegram(
approval_id=approval_id,
incident_id=incident.incident_id,
alert_name=alert_name,
labels=labels,
annotations=annotations,
@@ -349,6 +350,7 @@ async def create_signoz_approval(
kubectl_command=command,
dry_run_checks=[],
requested_by="signoz-webhook",
incident_id=incident_id,
metadata={
"source": "signoz",
"alert_name": alert_name,
@@ -371,6 +373,7 @@ async def create_signoz_approval(
async def send_signoz_telegram(
approval_id: str,
incident_id: str,
alert_name: str,
labels: dict,
annotations: dict,
@@ -392,7 +395,6 @@ async def send_signoz_telegram(
summary = annotations.get("summary", f"SignOz Alert: {alert_name}")
description = annotations.get("description", "")
# TODO(2026-04-05): SignOz 路徑無 incident_id待 SignOz→Incident 關聯後補傳
await telegram.send_approval_card(
approval_id=approval_id,
risk_level=analysis_result.risk_level if analysis_result else (
@@ -411,6 +413,7 @@ async def send_signoz_telegram(
anomaly_frequency=anomaly_frequency,
# 2026-04-02 ogt: 修復 ai_provider 未傳遞 → Telegram 顯示「AI 仲裁判定」而非具體模型名稱
ai_provider=ai_provider if ai_provider != "none" else "",
incident_id=incident_id,
)
logger.info(

View File

@@ -1300,6 +1300,17 @@ async def _process_new_alert_background(
alert_category=alert_category,
)
try:
await service.update_incident_id(approval.id, fallback_incident_id)
approval.incident_id = fallback_incident_id
except Exception as _meta_err:
logger.warning(
"fallback_approval_incident_id_update_failed",
approval_id=str(approval.id),
incident_id=fallback_incident_id,
error=str(_meta_err),
)
await _push_to_telegram_background(
approval_id=str(approval.id),
risk_level="medium",

View File

@@ -292,6 +292,52 @@ class AIRouter:
"claude",
]
def _resolve_intent_from_context(
self,
context: dict | None,
) -> IntentResult | None:
"""
從 context 解析集中治理的 intent hint。
僅作為 AI Router 的內部快路徑,避免呼叫端自行繞過 Router 規則。
例如 Phase 2 agent 已知屬於診斷分析,就不必再多跑一次 intent LLM。
"""
if not context:
return None
raw_hint = str(context.get("intent_hint", "")).strip().lower()
if not raw_hint:
return None
alias_map = {
"restart": IntentType.RESTART,
"scale": IntentType.SCALE,
"config": IntentType.CONFIG,
"diagnose": IntentType.DIAGNOSE,
"delete": IntentType.DELETE,
"rollback": IntentType.ROLLBACK,
"unknown": IntentType.UNKNOWN,
# legacy aliases
"alert_triage": IntentType.ALERT_TRIAGE,
"deployment": IntentType.DEPLOYMENT,
"query": IntentType.QUERY,
"maintenance": IntentType.MAINTENANCE,
"code_review": IntentType.CODE_REVIEW,
}
intent = alias_map.get(raw_hint)
if intent is None:
logger.warning("ai_router_invalid_intent_hint", intent_hint=raw_hint)
return None
return IntentResult(
intent=intent,
confidence=1.0,
method="context_hint",
matched_keywords=[f"context:{raw_hint}"],
detected_resources=[],
reasoning=f"context intent_hint={raw_hint}",
)
async def route(
self,
text: str,
@@ -313,7 +359,9 @@ class AIRouter:
context = context or {}
# Step 1: 意圖分類 (返回 IntentResult, 規則引擎 < 10ms)
intent_result = await self._intent_classifier.classify(text)
intent_result = self._resolve_intent_from_context(context)
if intent_result is None:
intent_result = await self._intent_classifier.classify(text)
intent = normalize_intent(intent_result.intent)
# Step 2: 複雜度評分 (< 10ms)
@@ -529,7 +577,9 @@ class AIRouter:
context = context or {}
# 同步分類 (僅規則引擎, < 10ms)
intent_result = self._intent_classifier.classify_sync(text)
intent_result = self._resolve_intent_from_context(context)
if intent_result is None:
intent_result = self._intent_classifier.classify_sync(text)
intent = normalize_intent(intent_result.intent)
# 複雜度評分 (< 10ms)

View File

@@ -161,6 +161,7 @@ class IncidentApprovalService:
requested_by=approval_data.requested_by,
expires_at=approval_data.expires_at,
extra_metadata=approval_metadata,
incident_id=incident_id,
fingerprint=incident_data.get("fingerprint"),
)
uow.session.add(approval_record)
@@ -276,6 +277,7 @@ class IncidentApprovalService:
"blast_radius": record.blast_radius,
"requested_by": record.requested_by,
"created_at": record.created_at.isoformat() if record.created_at else None,
"incident_id": getattr(record, "incident_id", None),
"metadata": record.extra_metadata,
})
@@ -323,7 +325,7 @@ class IncidentApprovalService:
approval.resolved_at = datetime.now(UTC)
# 3. 取得關聯 Incident ID
incident_id = (approval.extra_metadata or {}).get("incident_id")
incident_id = approval.incident_id or (approval.extra_metadata or {}).get("incident_id")
if not incident_id:
logger.debug(
"no_linked_incident",

View File

@@ -253,6 +253,7 @@ class ProposalService:
blast_radius=blast_radius,
dry_run_checks=dry_run_checks,
requested_by="OpenClaw AI",
incident_id=incident_id,
metadata=metadata,
)

View File

@@ -6,6 +6,46 @@
---
## 📍 2026-04-24 — Telegram「AI 分析超時」止血 + incident_id 單一真相補強
### 本次修復
- **Phase 2 Agent Timeout**:`Diagnostician / Solver / Critic` 各自新增 `20s` step-level timeout,超時直接走既有 degraded fallback,避免 3 段 LLM 串行一路拖到 `AgentOrchestrator` 全局 `90s`
- **AI Router 中央治理**:新增 `intent_hint` 快路徑,讓 Phase 2 internal-agent routing 可在 Router 內集中指定 `diagnose`,不再為同一場辯證重複跑慢速 intent LLM 分類
- **Alertmanager fallback 鏈路**:`webhooks.py` 的 LLM fallback 路徑補上 `update_incident_id()`,修正 incident 建立後 approval 不回填的 DB 斷鏈
- **incident_id 單一真相補強**:`IncidentApprovalService` 改為 `approval.incident_id` 優先、metadata 僅做 fallback;`ProposalService`、`SignOz webhook` 建 approval 時直接寫入 `incident_id` 欄位;SignOz Telegram 發卡同步帶上 `incident_id`
### 本地驗證
- `python3 -m py_compile` 通過:
- `apps/api/src/services/ai_router.py`
- `apps/api/src/agents/{diagnostician_agent,solver_agent,critic_agent}.py`
- `apps/api/src/api/v1/webhooks.py`
- `apps/api/src/services/{incident_approval_service,proposal_service}.py`
- `apps/api/src/api/v1/signoz_webhook.py`
- `cd apps/api && pytest tests/test_p0_diagnose_routing.py -q`:`4 passed`
- `cd apps/api && pytest tests/test_intent_classifier.py -q`:`16 passed, 7 skipped`
### 殘餘風險
- 尚未對 production live DB / logs 做二次驗證,無法在本 session 直接證明 Telegram 超時卡片數已下降
- `/api/v1/webhooks/alerts` 舊 approval-only 路徑、Sentry 路徑仍可能產生 `approval_records.incident_id = NULL`,後續需決定是否全面收斂到 Incident-first 流程
## 📍 2026-04-24 — 12-Agent 新遊戲規則 v1 定版 + 文件治理同步
### 本次補強
- 新增 `[docs/12-agent-game-rules.md](/Users/ogt/awoooi/docs/12-agent-game-rules.md)`:把 12-agent 從審計/設計概念落成日常派工規則
- 定義 `12 agents vs 9 skills` 對照、模組責任區、自動派工規則、強制加簽規則、常用組隊模板
- 補記 `ADR-095`:新增「日常工作模式(Game Rules v1)」章節,明確 12-agent 不等於 repo 內 9 skills
- 更新 `Skill 06`:加入 12-agent 協作治理,規範任務判型 → 主責 agent → 對應 skills 的工作流
### 治理決策
- `12 agents` 定位為任務角色與分工編排
- `.agents/skills/*.md` 定位為工程規範與實作守則
- 後續工作模式:先用 12-agent 判型與派工,再落到 skills / HARD_RULES / MASTER 執行
### 相關文件
- `docs/12-agent-game-rules.md`
- `docs/adr/ADR-095-12agent-sdk-integration.md`
- `.agents/skills/06-awoooi-monorepo-master.md`
## 📍 2026-04-24 — ADR-092 P0+P1+P2.1 全修(commit 7f4088b / 04ff225 / bb5f16f)
### P2.1 修復(commit bb5f16f)