All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 12m56s
- models.json v1.3.0: providers.ollama.models 新增 9 個 purpose keys
(drift_summary/drift_intent/log_anomaly/nemoclaw/playbook_draft/
code_review/embedding/rag_generate/image_analysis)
- drift_narrator_service: NARRATOR_MODEL → get_model("ollama","drift_summary")
- drift_interpreter: MODEL → get_model("ollama","drift_intent")
- log_summary_service: SUMMARY_MODEL → get_model("ollama","log_anomaly")
- local_code_review_service: _MODEL_OLLAMA → get_model("ollama","code_review")
- image_analysis_service: _MODEL → get_model("ollama","image_analysis")
- decision_manager: nemoclaw + playbook_draft 兩處 → get_model()
- embedding_service: get_embedding_service() factory → get_model("ollama","embedding")
- knowledge_service: OllamaEmbeddingService(model=...) → get_model()
所有模型名稱現在統一由 models.json 管理,修改模型只需改一個檔案。
LOGBOOK 更新:D1 完成 + B2 已完成確認
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
247 lines
8.3 KiB
Python
247 lines
8.3 KiB
Python
"""
|
||
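Log Summary Service - Phase 31
================================
Purpose: fetch Kubernetes pod logs and generate a Traditional Chinese anomaly
summary with a local Ollama model (originally deepseek-r1:14b; the model name
is now read from the model registry under the "log_anomaly" purpose key).

Design boundaries:
- Outputs summary text only: no RCA, no remediation commands.
- 5 s soft timeout: past it the caller receives None so the main flow is not blocked.
- 180 s hard LLM timeout (deepseek-r1:14b is slow but accurate).
- Redis cache: log_summary:{pod}:{date}, TTL 24 h, so a pod that already has a
  cached summary is not sent to the LLM again on the same day.
- Sensitive-data masking: Bearer tokens and password-like values matched by
  regex are replaced with [REDACTED].
- Reuses K8sDiagnosticsService to obtain the logs.

Version: v1.0
Created: 2026-04-10 (Taipei time zone)
Created by: Claude Code (Phase 31 ADR-067)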
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import re
|
||
from datetime import UTC, datetime
|
||
|
||
import httpx
|
||
import structlog
|
||
|
||
from src.core.redis_client import get_redis
|
||
from src.services.model_registry import get_model
|
||
|
||
logger = structlog.get_logger(__name__)

# ============================================================
# Settings
# ============================================================

# Base URL of the Ollama server; "/api/generate" is appended when calling it.
OLLAMA_URL = "http://192.168.0.111:11434"

# Centralized on 2026-04-11 (D1): the model name is read from models.json
# (providers.ollama.models.log_anomaly) through the model registry.
SUMMARY_MODEL = get_model("ollama", "log_anomaly")

# Timeouts, in seconds.
LLM_TIMEOUT = 180.0  # hard timeout for the LLM HTTP request
SOFT_TIMEOUT = 5.0  # soft timeout on the alert path (None is returned past it)

# Log volume limits.
LOG_TAIL_LINES = 100  # tail size requested from the diagnostics service
ANOMALY_TAIL_LINES = 50  # only the last 50 anomaly lines are sent to the LLM

# Redis cache settings.
CACHE_TTL = 24 * 60 * 60  # 24 hours, in seconds
CACHE_PREFIX = "log_summary:"

# Keywords that mark a log line as anomalous (matched case-insensitively).
_ANOMALY_PATTERN = re.compile(
    r"(ERROR|FATAL|Exception|Traceback|OOMKilled|panic|CRITICAL|WARN|WARNING)",
    flags=re.IGNORECASE,
)
|
||
|
||
# Secret-masking rules, applied in order to every anomaly line before it is
# sent to the LLM. Each entry is (compiled pattern, replacement text).
#
# A key may be followed by a closing quote so that JSON-style pairs such as
# "password": "hunter2" are masked as well; without that, the separator class
# never matches after the quote and the value leaks into the prompt.

# A secret value: a quoted string consumed in full (it may contain spaces),
# plus any trailing non-space characters, or else a bare run of non-space
# characters.
_SECRET_VALUE = r"""(?:"[^"]*"\S*|'[^']*'\S*|\S+)"""

_SENSITIVE_PATTERNS = [
    (
        re.compile(r"Bearer\s+[A-Za-z0-9+/=._-]{10,}", re.IGNORECASE),
        "Bearer [REDACTED]",
    ),
    (
        re.compile(r"""password["']?[=:\s]+""" + _SECRET_VALUE, re.IGNORECASE),
        "password=[REDACTED]",
    ),
    (
        # Tokens keep the original "at least 10 token-like characters" rule so
        # that ordinary prose containing the word "token" is left alone.
        re.compile(r"""token["']?[=:\s]+["']?[A-Za-z0-9+/=._-]{10,}""", re.IGNORECASE),
        "token=[REDACTED]",
    ),
    (
        re.compile(r"""secret["']?[=:\s]+""" + _SECRET_VALUE, re.IGNORECASE),
        "secret=[REDACTED]",
    ),
]
|
||
|
||
# ============================================================
|
||
# Prompt
|
||
# ============================================================
|
||
# Prompt template sent to the LLM. Filled with str.format using the
# placeholders pod_name, namespace, line_count and log_snippet.
_SUMMARY_PROMPT = """你是 AWOOOI SRE 維運助理,請分析以下 K8s Pod log 片段。

## Pod: {pod_name} (namespace: {namespace})
## Log 異常片段(最近 {line_count} 行):
{log_snippet}

## 任務
用繁體中文,3 行以內,說明:
1. 異常的主要原因是什麼
2. 影響程度(例如 OOM Killed、連線失敗、程式錯誤)
3. 建議立即查看哪個方向

只輸出摘要文字,不要標題或 markdown 格式。
"""
|
||
|
||
|
||
class LogSummaryService:
    """Summarize anomalous Kubernetes pod log lines with a local LLM.

    Responsibilities:
      - Fetch pod logs through ``K8sDiagnosticsService``.
      - Keep only lines matching ``_ANOMALY_PATTERN`` and mask secrets.
      - Ask the Ollama model named by ``SUMMARY_MODEL`` for a short
        Traditional Chinese summary (hard timeout ``LLM_TIMEOUT``).
      - Cache the summary in Redis for ``CACHE_TTL`` seconds.

    Out of scope:
      - Root-cause analysis.
      - Generating remediation commands.
      - Blocking the alert path (see ``summarize_with_soft_timeout``).
    """

    # Strong references to summaries still running after a soft timeout.
    # The event loop keeps only weak references to tasks, so a task nobody
    # references may be garbage-collected before it finishes.
    _background_tasks: set[asyncio.Task] = set()

    async def summarize(
        self,
        pod_name: str,
        namespace: str = "awoooi-prod",
    ) -> str | None:
        """Return an anomaly summary for a pod's recent logs.

        Today's cached summary is returned when present. On a cache miss the
        logs are fetched, reduced to masked anomaly lines, sent to the LLM,
        and a non-empty result is written back to the cache.

        Args:
            pod_name: Name of the pod whose logs are summarized.
            namespace: Kubernetes namespace of the pod.

        Returns:
            The summary text (cached or freshly generated), or None when the
            logs are empty or could not be fetched, no line matches the
            anomaly pattern, or the LLM call fails, times out, or returns
            nothing.
        """
        cache_key = self._cache_key(pod_name)
        redis = await get_redis()

        cached = await redis.get(cache_key)
        if cached:
            logger.debug("log_summary_cache_hit", pod=pod_name)
            # The client may hand back bytes or str depending on its settings.
            return cached.decode() if isinstance(cached, bytes) else cached

        raw_logs = await self._fetch_logs(pod_name, namespace)
        if not raw_logs:
            return None

        anomaly_lines = self._extract_anomaly_lines(raw_logs)
        if not anomaly_lines:
            logger.debug("log_summary_no_anomaly", pod=pod_name)
            return None

        summary = await self._call_llm(pod_name, namespace, anomaly_lines)
        if summary:
            await redis.set(cache_key, summary, ex=CACHE_TTL)
            logger.info("log_summary_generated", pod=pod_name, lines=len(anomaly_lines))

        return summary

    async def summarize_with_soft_timeout(
        self,
        pod_name: str,
        namespace: str = "awoooi-prod",
    ) -> str | None:
        """Return a summary if it is ready within ``SOFT_TIMEOUT`` seconds.

        Intended for the alert path: when the soft timeout expires the caller
        gets None immediately, while the same summarize run keeps going in the
        background so its result lands in the Redis cache for the next call.

        The work is started once as a task and awaited through
        ``asyncio.shield``. Awaiting the coroutine directly with
        ``asyncio.wait_for`` would cancel it on timeout and force the log
        fetch and LLM call to start over.

        Args:
            pod_name: Name of the pod whose logs are summarized.
            namespace: Kubernetes namespace of the pod.

        Returns:
            The summary text, or None on soft timeout or when ``summarize``
            itself returns None.
        """
        task = asyncio.create_task(self.summarize(pod_name, namespace))
        try:
            return await asyncio.wait_for(asyncio.shield(task), timeout=SOFT_TIMEOUT)
        except asyncio.TimeoutError:
            logger.info(
                "log_summary_soft_timeout",
                pod=pod_name,
                soft_timeout=SOFT_TIMEOUT,
            )
            # Keep the in-flight run alive until it finishes and caches its result.
            self._background_tasks.add(task)
            task.add_done_callback(self._on_background_done)
            return None
        except asyncio.CancelledError:
            # The caller itself was cancelled: do not leave the shielded run orphaned.
            task.cancel()
            raise

    # --------------------------------------------------------
    # Private helpers
    # --------------------------------------------------------

    def _on_background_done(self, task: asyncio.Task) -> None:
        """Release a finished background summary and log its failure, if any."""
        self._background_tasks.discard(task)
        if task.cancelled():
            return
        # Retrieving the exception also prevents the "Task exception was never
        # retrieved" warning from the event loop.
        error = task.exception()
        if error is not None:
            logger.warning("log_summary_background_failed", error=str(error))

    def _cache_key(self, pod_name: str) -> str:
        """Build the per-pod, per-UTC-day Redis key for a summary."""
        # NOTE(review): the namespace is not part of the key, so pods with the
        # same name in different namespaces share one entry - confirm intended.
        date_str = datetime.now(UTC).strftime("%Y-%m-%d")
        return f"{CACHE_PREFIX}{pod_name}:{date_str}"

    async def _fetch_logs(self, pod_name: str, namespace: str) -> str | None:
        """Fetch the pod's recent logs via ``K8sDiagnosticsService``.

        Best-effort: any failure is logged as a warning and reported as None.

        Returns:
            The log text, or None when it is empty or the fetch failed.
        """
        try:
            # Imported inside the method rather than at module top level.
            from src.services.k8s_diagnostics import K8sDiagnosticsService

            svc = K8sDiagnosticsService(default_namespace=namespace)
            diag = await svc.collect_diagnostics(
                pod_name=pod_name,
                namespace=namespace,
                log_tail_lines=LOG_TAIL_LINES,
                include_previous_logs=False,
            )
            return diag.logs or None
        except Exception as e:
            logger.warning("log_summary_fetch_failed", pod=pod_name, error=str(e))
            return None

    def _extract_anomaly_lines(self, raw_logs: str) -> list[str]:
        """Return the last ``ANOMALY_TAIL_LINES`` anomaly lines with secrets masked.

        Args:
            raw_logs: Raw log text, one entry per line.

        Returns:
            Lines matching ``_ANOMALY_PATTERN``, newest last, after every
            ``_SENSITIVE_PATTERNS`` rule has been applied in order.
        """
        anomaly = [line for line in raw_logs.splitlines() if _ANOMALY_PATTERN.search(line)]

        masked_lines = []
        for line in anomaly[-ANOMALY_TAIL_LINES:]:
            for pattern, replacement in _SENSITIVE_PATTERNS:
                line = pattern.sub(replacement, line)
            masked_lines.append(line)
        return masked_lines

    async def _call_llm(
        self,
        pod_name: str,
        namespace: str,
        anomaly_lines: list[str],
    ) -> str | None:
        """Ask the Ollama model for a summary of the anomaly lines.

        Args:
            pod_name: Pod name inserted into the prompt.
            namespace: Namespace inserted into the prompt.
            anomaly_lines: Masked anomaly lines to summarize.

        Returns:
            The model's answer with any ``<think>...</think>`` block removed.
            Falls back to the raw answer when stripping leaves nothing, and to
            None when the answer is empty or the request fails or times out.
        """
        log_snippet = "\n".join(anomaly_lines)
        prompt = _SUMMARY_PROMPT.format(
            pod_name=pod_name,
            namespace=namespace,
            line_count=len(anomaly_lines),
            # Cap the prompt size so it stays within the model's context.
            # NOTE(review): this keeps the first 3000 characters, i.e. the
            # oldest of the selected lines - confirm the head is wanted here.
            log_snippet=log_snippet[:3000],
        )

        try:
            async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
                resp = await client.post(
                    f"{OLLAMA_URL}/api/generate",
                    json={
                        "model": SUMMARY_MODEL,
                        "prompt": prompt,
                        "stream": False,
                        "options": {"temperature": 0.1, "num_predict": 200},
                    },
                )
                resp.raise_for_status()
                data = resp.json()
                raw = data.get("response", "").strip()

                # Drop the <think>...</think> reasoning block emitted by deepseek-r1.
                text = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
                return text or raw or None

        except httpx.TimeoutException:
            logger.warning("log_summary_llm_timeout", model=SUMMARY_MODEL)
            return None
        except Exception as e:
            logger.warning("log_summary_llm_error", error=str(e))
            return None
|
||
|
||
|
||
# ============================================================
|
||
# Singleton
|
||
# ============================================================
|
||
|
||
# Process-wide service instance; stays None until the first request.
_log_summary_service: LogSummaryService | None = None


def get_log_summary_service() -> LogSummaryService:
    """Return the shared ``LogSummaryService``, creating it on first use."""
    global _log_summary_service
    if _log_summary_service is not None:
        return _log_summary_service
    _log_summary_service = LogSummaryService()
    return _log_summary_service
|