Files
awoooi/apps/api/src/services/log_summary_service.py
OG T f2c18c4e63
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 12m56s
feat(D1): models.json 集中化 — ADR-067 五大 Ollama 應用 hardcode 消除
- models.json v1.3.0: providers.ollama.models 新增 9 個 purpose keys
  (drift_summary/drift_intent/log_anomaly/nemoclaw/playbook_draft/
   code_review/embedding/rag_generate/image_analysis)
- drift_narrator_service: NARRATOR_MODEL → get_model("ollama","drift_summary")
- drift_interpreter: MODEL → get_model("ollama","drift_intent")
- log_summary_service: SUMMARY_MODEL → get_model("ollama","log_anomaly")
- local_code_review_service: _MODEL_OLLAMA → get_model("ollama","code_review")
- image_analysis_service: _MODEL → get_model("ollama","image_analysis")
- decision_manager: nemoclaw + playbook_draft 兩處 → get_model()
- embedding_service: get_embedding_service() factory → get_model("ollama","embedding")
- knowledge_service: OllamaEmbeddingService(model=...) → get_model()

所有模型名稱現在統一由 models.json 管理,修改模型只需改一個檔案。
LOGBOOK 更新:D1 完成 + B2 已完成確認

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-11 20:45:53 +08:00

247 lines
8.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
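"""
Log Summary Service - Phase 31
================================
Responsibility: fetch K8s Pod logs and use deepseek-r1:14b to generate an anomaly summary in Traditional Chinese.
Design boundaries:
- Outputs summary text only; no RCA and no remediation commands.
- 5s soft timeout: returns None when exceeded so the main flow is not blocked.
- LLM hard timeout 180s (deepseek-r1:14b is slow but accurate).
- Redis cache: log_summary:{pod}:{date}, TTL 24h; the same pod is not summarized twice on the same day.
- Sensitive-data masking: Bearer tokens and password regexes -> [REDACTED].
- Reuses K8sDiagnosticsService to obtain logs.
Version: v1.0
Created: 2026-04-10 (Taipei time zone)
Author: Claude Code (Phase 31 ADR-067)
"""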
from __future__ import annotations
import asyncio
import re
from datetime import UTC, datetime
import httpx
import structlog
from src.core.redis_client import get_redis
from src.services.model_registry import get_model
logger = structlog.get_logger(__name__)
# ============================================================
# Configuration
# ============================================================
# Base URL of the Ollama server; "/api/generate" is appended when calling the LLM.
OLLAMA_URL = "http://192.168.0.111:11434"
# D1 centralization 2026-04-11: read from models.json providers.ollama.models.log_anomaly
# NOTE: resolved once, at import time.
SUMMARY_MODEL = get_model("ollama", "log_anomaly")
LLM_TIMEOUT = 180.0 # hard timeout (seconds) for the LLM HTTP call; deepseek-r1 is slow
SOFT_TIMEOUT = 5.0 # soft timeout (seconds) for the main flow; None is returned when exceeded
# Number of log lines requested from K8sDiagnosticsService.
LOG_TAIL_LINES = 100
ANOMALY_TAIL_LINES = 50 # only the last 50 anomalous lines are sent to the LLM
CACHE_TTL = 86400 # 24 hours, in seconds
CACHE_PREFIX = "log_summary:"
# Anomaly keyword pattern: a log line is kept when any keyword occurs in it
# (case-insensitive search, so substrings such as "warn" inside a word match too).
_ANOMALY_PATTERN = re.compile(
    r"(ERROR|FATAL|Exception|Traceback|OOMKilled|panic|CRITICAL|WARN|WARNING)",
    re.IGNORECASE,
)
# Sensitive-data masking: (compiled pattern, replacement) pairs, applied in
# list order to every anomalous line before it is sent to the LLM.
_SENSITIVE_PATTERNS = [
    (re.compile(r"Bearer\s+[A-Za-z0-9+/=._-]{10,}", re.IGNORECASE), "Bearer [REDACTED]"),
    (re.compile(r"password[=:\s]+\S+", re.IGNORECASE), "password=[REDACTED]"),
    (re.compile(r"token[=:\s]+[A-Za-z0-9+/=._-]{10,}", re.IGNORECASE), "token=[REDACTED]"),
    (re.compile(r"secret[=:\s]+\S+", re.IGNORECASE), "secret=[REDACTED]"),
]
# ============================================================
# Prompt
# ============================================================
# Template for the LLM request. It is filled with str.format() using the
# fields pod_name, namespace, line_count and log_snippet, and instructs the
# model to answer in Traditional Chinese, in at most 3 lines, as plain text
# (no headings or markdown). The text below is sent to the model verbatim.
_SUMMARY_PROMPT = """你是 AWOOOI SRE 維運助理,請分析以下 K8s Pod log 片段。
## Pod: {pod_name} (namespace: {namespace})
## Log 異常片段(最近 {line_count} 行):
{log_snippet}
## 任務
用繁體中文3 行以內,說明:
1. 異常的主要原因是什麼
2. 影響程度(例如 OOM Killed、連線失敗、程式錯誤
3. 建議立即查看哪個方向
只輸出摘要文字,不要標題或 markdown 格式。
"""
class LogSummaryService:
"""
Pod Log 異常摘要服務
職責邊界:
✅ 擷取 Pod log複用 K8sDiagnosticsService
✅ 過濾異常行 + 敏感資料遮蔽
✅ deepseek-r1:14b 生成繁中摘要180s timeout
✅ Redis 快取24h TTL
❌ 不做 RCA
❌ 不生成修復指令
❌ 不阻塞告警主流程(軟超時 5s
"""
async def summarize(
self,
pod_name: str,
namespace: str = "awoooi-prod",
) -> str | None:
"""
取得 Pod log 異常摘要
Returns:
str: 繁中摘要文字3 行內)
None: 快取命中、無異常、或超時
"""
cache_key = self._cache_key(pod_name)
redis = await get_redis()
cached = await redis.get(cache_key)
if cached:
logger.debug("log_summary_cache_hit", pod=pod_name)
return cached.decode() if isinstance(cached, bytes) else cached
raw_logs = await self._fetch_logs(pod_name, namespace)
if not raw_logs:
return None
anomaly_lines = self._extract_anomaly_lines(raw_logs)
if not anomaly_lines:
logger.debug("log_summary_no_anomaly", pod=pod_name)
return None
summary = await self._call_llm(pod_name, namespace, anomaly_lines)
if summary:
await redis.set(cache_key, summary, ex=CACHE_TTL)
logger.info("log_summary_generated", pod=pod_name, lines=len(anomaly_lines))
return summary
async def summarize_with_soft_timeout(
self,
pod_name: str,
namespace: str = "awoooi-prod",
) -> str | None:
"""
帶 5s 軟超時的摘要取得
用於告警主流程:超過軟超時回傳 None不阻塞 Telegram 推送
LLM 繼續在背景跑,結果寫入 Redis 快取供下次使用
"""
try:
return await asyncio.wait_for(
self.summarize(pod_name, namespace),
timeout=SOFT_TIMEOUT,
)
except asyncio.TimeoutError:
logger.info(
"log_summary_soft_timeout",
pod=pod_name,
soft_timeout=SOFT_TIMEOUT,
)
# 繼續在背景跑,結果存 Redis 備用
asyncio.create_task(self.summarize(pod_name, namespace))
return None
# --------------------------------------------------------
# Private helpers
# --------------------------------------------------------
def _cache_key(self, pod_name: str) -> str:
date_str = datetime.now(UTC).strftime("%Y-%m-%d")
return f"{CACHE_PREFIX}{pod_name}:{date_str}"
async def _fetch_logs(self, pod_name: str, namespace: str) -> str | None:
"""複用 K8sDiagnosticsService 取得 logs"""
try:
from src.services.k8s_diagnostics import K8sDiagnosticsService
svc = K8sDiagnosticsService(default_namespace=namespace)
diag = await svc.collect_diagnostics(
pod_name=pod_name,
namespace=namespace,
log_tail_lines=LOG_TAIL_LINES,
include_previous_logs=False,
)
return diag.logs or None
except Exception as e:
logger.warning("log_summary_fetch_failed", pod=pod_name, error=str(e))
return None
def _extract_anomaly_lines(self, raw_logs: str) -> list[str]:
"""過濾異常行 + 敏感資料遮蔽,取最後 ANOMALY_TAIL_LINES 行"""
lines = raw_logs.splitlines()
anomaly = [l for l in lines if _ANOMALY_PATTERN.search(l)]
# 取最後 N 行
anomaly = anomaly[-ANOMALY_TAIL_LINES:]
# 遮蔽敏感資料
result = []
for line in anomaly:
for pattern, replacement in _SENSITIVE_PATTERNS:
line = pattern.sub(replacement, line)
result.append(line)
return result
async def _call_llm(
self,
pod_name: str,
namespace: str,
anomaly_lines: list[str],
) -> str | None:
"""呼叫 deepseek-r1:14b 生成摘要"""
log_snippet = "\n".join(anomaly_lines)
prompt = _SUMMARY_PROMPT.format(
pod_name=pod_name,
namespace=namespace,
line_count=len(anomaly_lines),
log_snippet=log_snippet[:3000], # 避免超出 context
)
try:
async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
resp = await client.post(
f"{OLLAMA_URL}/api/generate",
json={
"model": SUMMARY_MODEL,
"prompt": prompt,
"stream": False,
"options": {"temperature": 0.1, "num_predict": 200},
},
)
resp.raise_for_status()
data = resp.json()
raw = data.get("response", "").strip()
# 過濾 deepseek-r1 的 <think>...</think> 推理區塊
text = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
return text or raw or None
except httpx.TimeoutException:
logger.warning("log_summary_llm_timeout", model=SUMMARY_MODEL)
return None
except Exception as e:
logger.warning("log_summary_llm_error", error=str(e))
return None
# ============================================================
# Singleton
# ============================================================
# Lazily created, process-wide service instance (None until first requested).
_log_summary_service: LogSummaryService | None = None


def get_log_summary_service() -> LogSummaryService:
    """Return the shared LogSummaryService, creating it on the first call."""
    global _log_summary_service
    if _log_summary_service is not None:
        return _log_summary_service
    _log_summary_service = LogSummaryService()
    return _log_summary_service