"""
Knowledge Extractor Service — KB Phase 2-A
==========================================
Automatically extracts a KB draft after an incident is resolved.

Design principles:
- Uses `settings.OLLAMA_TOOL_MODEL`, trying endpoints in the global order GCP-A → GCP-B → 111
- Fire-and-forget: a failure does not affect the main resolve flow
- logger.exception keeps the full stack trace for prompt tuning

2026-04-03 ogt: KB Phase 2-A initial implementation
"""
|
||
|
||
import structlog
|
||
|
||
# Module-level structlog logger, named after this module; used for every log event below.
logger = structlog.get_logger(__name__)
|
||
|
||
# 2026-05-19 Codex: corrected per the commander's instruction — every Ollama
# workload uses the fixed order GCP-A → GCP-B → 111.
def _get_ollama_endpoints():
    """Return the Ollama endpoints to try, in the order chosen by the circuit breaker.

    Delegates to ``resolve_ollama_order_with_cooldown("hermes")``. The import is
    done at call time rather than at module import time.
    NOTE(review): the meaning of the "hermes" key is defined by the circuit-breaker
    module, not here — confirm there.
    """
    from src.services.ollama_endpoint_circuit_breaker import (
        resolve_ollama_order_with_cooldown as _resolve_order,
    )

    ordered_endpoints = _resolve_order("hermes")
    return ordered_endpoints
|
||
|
||
|
||
def _get_extract_model() -> str:
    """Return the name of the Ollama model used for KB extraction.

    Reads ``settings.OLLAMA_TOOL_MODEL`` at call time. A missing, empty, or
    whitespace-only value falls back to ``"hermes3:latest"``; surrounding
    whitespace on a configured value is removed.

    Returns:
        The model name as a non-empty string.
    """
    from src.core.config import settings

    default_model = "hermes3:latest"
    configured = getattr(settings, "OLLAMA_TOOL_MODEL", None)
    # str() keeps a non-string setting usable; strip() stops a padded or blank
    # value from being sent to Ollama as the model name.
    return str(configured or default_model).strip() or default_model
|
||
|
||
|
||
_EXTRACT_TIMEOUT = 30.0 # 秒,容忍慢速
|
||
|
||
# Linear / Nothing.tech 風格的 SRE KB Prompt
|
||
_PROMPT_TEMPLATE = """你是一位資深 SRE 工程師,請用**繁體中文**撰寫一份知識庫條目(Markdown 格式)。
|
||
|
||
## 事件資訊
|
||
- 事件 ID:{incident_id}
|
||
- 嚴重度:{severity}
|
||
- 發生時間:{created_at}
|
||
- 解決時間:{resolved_at}
|
||
|
||
## 觸發信號
|
||
{signals}
|
||
|
||
## 請輸出以下結構的 Markdown(只輸出 Markdown,不要其他說明文字):
|
||
|
||
# [一句話摘要標題]
|
||
|
||
## 問題描述
|
||
(簡述發生了什麼問題,2-3 句)
|
||
|
||
## 根本原因
|
||
(分析可能的根本原因,條列式)
|
||
|
||
## 解決方法
|
||
(列出實際採取的解決步驟,條列式)
|
||
|
||
## 預防措施
|
||
(如何避免未來再發生,條列式)
|
||
|
||
## 相關標籤
|
||
`{severity}` `ai_extracted`
|
||
"""
|
||
|
||
# 信號關鍵字 → KB 分類映射
|
||
_CATEGORY_KEYWORDS: dict[str, list[str]] = {
|
||
"infrastructure": ["k8s", "pod", "node", "deploy", "container", "namespace", "kubectl",
|
||
"memory", "cpu", "disk", "oom", "evict", "crashloop"],
|
||
"application": ["api", "http", "latency", "5xx", "4xx", "error rate", "timeout",
|
||
"connection", "database", "redis", "postgres", "slow"],
|
||
"ai_system": ["ai", "llm", "openclaw", "nemo", "ollama", "gemini", "claude",
|
||
"router", "provider", "inference", "token"],
|
||
"security": ["ssl", "cert", "auth", "permission", "scan", "vuln", "exploit",
|
||
"unauthorized", "403", "401"],
|
||
}
|
||
|
||
|
||
class KnowledgeExtractorService:
|
||
"""
|
||
Incident → KB 草稿自動萃取器
|
||
|
||
使用目前配置的 Ollama tool model 產生 Markdown 格式的 SRE 知識條目。
|
||
"""
|
||
|
||
async def extract_from_incident(self, incident) -> bool:
|
||
"""
|
||
從已解決的 Incident 萃取 KB 草稿。
|
||
|
||
Args:
|
||
incident: Incident 物件(需有 incident_id, severity, signals, created_at)
|
||
|
||
Returns:
|
||
True = 萃取成功,False = 失敗(已記錄 Stack Trace)
|
||
"""
|
||
try:
|
||
# 1. 組 Prompt
|
||
# 2026-04-16 ogt: Signal 無 description 欄位,用 alert_name + annotations.summary
|
||
signals_text = "\n".join(
|
||
f"- {s.alert_name}: {s.annotations.get('summary', s.annotations.get('description', ''))}"
|
||
for s in (incident.signals or [])
|
||
) or "(無信號記錄)"
|
||
|
||
prompt = _PROMPT_TEMPLATE.format(
|
||
incident_id=incident.incident_id,
|
||
severity=incident.severity.value,
|
||
created_at=str(getattr(incident, "created_at", "未知"))[:19],
|
||
resolved_at=str(getattr(incident, "resolved_at", "未知"))[:19],
|
||
signals=signals_text,
|
||
)
|
||
|
||
# 2. 呼叫 Ollama(直接 HTTP,不走 AIRouter 避免路由邏輯開銷)
|
||
markdown_content = await self._call_ollama(prompt)
|
||
model = _get_extract_model()
|
||
if not markdown_content:
|
||
logger.warning(
|
||
"kb_extract_empty_response",
|
||
incident_id=incident.incident_id,
|
||
model=model,
|
||
)
|
||
return False
|
||
|
||
# 3. 萃取標題(第一行 `# 標題`)
|
||
title = self._extract_title(markdown_content, incident)
|
||
|
||
# 4. 推斷分類
|
||
category = self._infer_category(incident)
|
||
|
||
# 5. 建立 KB 條目
|
||
from src.models.knowledge import (
|
||
EntrySource,
|
||
EntryType,
|
||
KnowledgeEntryCreate,
|
||
)
|
||
from src.services.knowledge_service import get_knowledge_service
|
||
|
||
entry_data = KnowledgeEntryCreate(
|
||
title=title,
|
||
content=markdown_content,
|
||
entry_type=EntryType.INCIDENT_CASE,
|
||
category=category,
|
||
tags=[incident.severity.value, "ai_extracted", category],
|
||
source=EntrySource.AI_EXTRACTED,
|
||
related_incident_id=incident.incident_id,
|
||
created_by="openclaw_ai",
|
||
)
|
||
await get_knowledge_service().create_entry(entry_data)
|
||
|
||
logger.info(
|
||
"kb_extract_success",
|
||
incident_id=incident.incident_id,
|
||
title=title,
|
||
category=category,
|
||
model=model,
|
||
)
|
||
return True
|
||
|
||
except Exception:
|
||
# 統帥指示:保留完整 Stack Trace 供初期 Prompt 調優
|
||
logger.exception(
|
||
"kb_extract_failed",
|
||
incident_id=getattr(incident, "incident_id", "unknown"),
|
||
)
|
||
return False
|
||
|
||
async def _call_ollama(self, prompt: str) -> str | None:
|
||
"""
|
||
直接呼叫 Ollama REST API。
|
||
|
||
不走 AIRouter 是刻意設計:
|
||
- KB 萃取是背景工作,不需要完整的路由/閘門/Cache 邏輯
|
||
- Ollama endpoint 固定依 GCP-A → GCP-B → 111 嘗試
|
||
"""
|
||
import httpx
|
||
|
||
endpoints = _get_ollama_endpoints()
|
||
model = _get_extract_model()
|
||
async with httpx.AsyncClient(timeout=_EXTRACT_TIMEOUT) as client:
|
||
for endpoint in endpoints:
|
||
if not endpoint.url:
|
||
continue
|
||
try:
|
||
r = await client.post(
|
||
f"{endpoint.url}/api/generate",
|
||
json={
|
||
"model": model,
|
||
"prompt": prompt,
|
||
"stream": False,
|
||
"options": {
|
||
"temperature": 0.3, # 低溫:減少幻覺
|
||
"num_predict": 800, # 控制長度
|
||
"stop": ["\n\n\n"], # 防止無限生成
|
||
},
|
||
},
|
||
)
|
||
r.raise_for_status()
|
||
text = r.json().get("response", "").strip()
|
||
if text:
|
||
logger.info(
|
||
"kb_ollama_call_success",
|
||
model=model,
|
||
provider=endpoint.provider_name,
|
||
base=endpoint.url,
|
||
)
|
||
from src.services.ollama_endpoint_circuit_breaker import (
|
||
record_ollama_endpoint_success,
|
||
)
|
||
|
||
record_ollama_endpoint_success(endpoint.url)
|
||
return text
|
||
except Exception as e:
|
||
from src.services.ollama_endpoint_circuit_breaker import (
|
||
record_ollama_endpoint_failure,
|
||
)
|
||
|
||
record_ollama_endpoint_failure(endpoint.url)
|
||
logger.warning(
|
||
"kb_ollama_call_failed",
|
||
model=model,
|
||
provider=endpoint.provider_name,
|
||
base=endpoint.url,
|
||
error=str(e),
|
||
)
|
||
|
||
logger.error(
|
||
"kb_ollama_all_endpoints_failed",
|
||
model=model,
|
||
attempted=[endpoint.provider_name for endpoint in endpoints],
|
||
)
|
||
return None
|
||
|
||
def _extract_title(self, markdown: str, incident) -> str:
|
||
"""
|
||
從 Markdown 第一行 `# 標題` 萃取標題。
|
||
Fallback:使用 incident_id + 第一個 signal 描述。
|
||
"""
|
||
for line in markdown.splitlines():
|
||
stripped = line.strip()
|
||
if stripped.startswith("# "):
|
||
title = stripped[2:].strip()
|
||
if title:
|
||
return title[:200] # DB column max 255
|
||
|
||
# Fallback
|
||
# 2026-04-16 ogt: Signal 無 description 欄位,改用 alert_name
|
||
signals = incident.signals or []
|
||
desc = signals[0].alert_name[:60] if signals else "未知事件"
|
||
return f"[AI 萃取] {incident.incident_id}: {desc}"
|
||
|
||
def _infer_category(self, incident) -> str:
|
||
"""
|
||
依 signals 關鍵字推斷 KB 分類。
|
||
依序比對,第一個匹配的分類獲勝。
|
||
"""
|
||
# 2026-04-24 ogt: Signal 無 description 欄位,改用 alert_name + annotations.summary
|
||
text = " ".join(
|
||
(
|
||
(s.alert_name or "") + " " +
|
||
(s.annotations.get("summary", "") if s.annotations else "")
|
||
).lower()
|
||
for s in (incident.signals or [])
|
||
)
|
||
for category, keywords in _CATEGORY_KEYWORDS.items():
|
||
if any(k in text for k in keywords):
|
||
return category
|
||
|
||
# 保守 fallback
|
||
return "infrastructure"
|
||
|
||
|
||
# =============================================================================
|
||
# Singleton
|
||
# =============================================================================
|
||
|
||
# Shared instance; stays None until the first get_knowledge_extractor() call.
_extractor: KnowledgeExtractorService | None = None


def get_knowledge_extractor() -> KnowledgeExtractorService:
    """Return the module-wide KnowledgeExtractorService, creating it on first use."""
    global _extractor
    if _extractor is not None:
        return _extractor
    _extractor = KnowledgeExtractorService()
    return _extractor
|