385 lines
15 KiB
Python
385 lines
15 KiB
Python
"""
|
||
AWOOOI AIOps Phase 4 — Log Anomaly Detector(日誌異常偵測)
|
||
==========================================================
|
||
職責:Drain3 log clustering,即時偵測新 pattern
|
||
|
||
核心 API:
|
||
process_log_line(line) -> LogAnomalyEvent | None
|
||
get_new_patterns(since_ts) -> list[LogCluster]
|
||
|
||
設計原則:
|
||
- Shadow Mode:新 pattern 只記錄 logger.info,不觸發 Alert
|
||
- 狀態持久化到 Redis:cluster tree 序列化(JSON)
|
||
- 熔斷:Drain3 失敗 → 僅記錄,不 raise
|
||
- 非同步:所有 Redis I/O 非同步;Drain3 計算在同步 helper 中執行
|
||
|
||
ADR-084: Phase 4 動態異常偵測源頭升級
|
||
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 初始建立
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import hashlib
|
||
import json
|
||
from dataclasses import dataclass
|
||
from typing import Any
|
||
|
||
import structlog
|
||
|
||
from src.utils.timezone import now_taipei
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
# ── 常數 ────────────────────────────────────────────────────────────────────
|
||
REDIS_KEY_CLUSTERS = "log_anomaly:clusters" # hash: cluster_id → cluster data
|
||
REDIS_KEY_NEW_PATTERNS = "log_anomaly:new" # list: 新 pattern 事件(最新在前)
|
||
REDIS_TTL_CLUSTERS = 86400 * 7 # 7 天
|
||
MAX_NEW_PATTERNS = 200 # 保留最近 200 個新 pattern 事件
|
||
DRAIN_DEPTH = 4 # Drain3 tree depth
|
||
DRAIN_SIM_THRESHOLD = 0.4 # 相似度 < 此值 → 新 cluster
|
||
DRAIN_MAX_CHILDREN = 100 # max children per node
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Data Types
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
@dataclass
|
||
class LogCluster:
|
||
"""Drain3 log cluster"""
|
||
cluster_id: str
|
||
template: str # 模板(e.g. "ERROR <*> connection failed to <*>")
|
||
size: int = 1 # 命中次數
|
||
first_seen_at: str = ""
|
||
last_seen_at: str = ""
|
||
is_new: bool = False # 首次出現 → True
|
||
|
||
def to_dict(self) -> dict[str, Any]:
|
||
return {
|
||
"cluster_id": self.cluster_id,
|
||
"template": self.template,
|
||
"size": self.size,
|
||
"first_seen_at": self.first_seen_at,
|
||
"last_seen_at": self.last_seen_at,
|
||
}
|
||
|
||
@classmethod
|
||
def from_dict(cls, d: dict[str, Any]) -> "LogCluster":
|
||
return cls(
|
||
cluster_id=d["cluster_id"],
|
||
template=d["template"],
|
||
size=d.get("size", 1),
|
||
first_seen_at=d.get("first_seen_at", ""),
|
||
last_seen_at=d.get("last_seen_at", ""),
|
||
)
|
||
|
||
|
||
@dataclass
|
||
class LogAnomalyEvent:
|
||
"""新 pattern 偵測事件"""
|
||
cluster_id: str
|
||
template: str
|
||
sample_log: str
|
||
detected_at: str
|
||
shadow_mode: bool = True
|
||
source: str = "k8s_pod" # k8s_pod | host_syslog | app_log
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Main Service
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
class LogAnomalyDetector:
|
||
"""
|
||
Drain3 日誌異常偵測服務
|
||
|
||
工作流程:
|
||
1. process_log_line() — 即時 clustering
|
||
2. 新 cluster → 記錄到 Redis list
|
||
3. ProactiveInspector 定期呼叫 get_new_patterns() 聚合
|
||
"""
|
||
|
||
def __init__(self) -> None:
|
||
self._drain: Any = None # lazy-init Drain3 instance
|
||
self._initialized = False
|
||
|
||
def _get_drain(self) -> Any:
|
||
"""Lazy-init Drain3(避免 import 在啟動時失敗)。"""
|
||
if self._drain is not None:
|
||
return self._drain
|
||
|
||
try:
|
||
from drain3 import TemplateMiner
|
||
from drain3.template_miner_config import TemplateMinerConfig
|
||
|
||
config = TemplateMinerConfig()
|
||
config.drain_depth = DRAIN_DEPTH
|
||
config.drain_sim_th = DRAIN_SIM_THRESHOLD
|
||
config.drain_max_children = DRAIN_MAX_CHILDREN
|
||
config.parametrize_numeric_tokens = True
|
||
|
||
self._drain = TemplateMiner(config=config)
|
||
self._initialized = True
|
||
logger.info("drain3_initialized")
|
||
return self._drain
|
||
|
||
except ImportError:
|
||
logger.warning("drain3_not_available", reason="package not installed")
|
||
return None
|
||
except Exception as e:
|
||
logger.warning("drain3_init_failed", error=str(e))
|
||
return None
|
||
|
||
async def process_log_line(
|
||
self,
|
||
log_line: str,
|
||
source: str = "k8s_pod",
|
||
) -> LogAnomalyEvent | None:
|
||
"""
|
||
處理單行日誌,若為新 pattern 回傳 LogAnomalyEvent。
|
||
|
||
Args:
|
||
log_line: 原始日誌行
|
||
source: 來源標籤
|
||
|
||
Returns:
|
||
LogAnomalyEvent(新 pattern)或 None(已知 pattern)
|
||
"""
|
||
from src.core.feature_flags import aiops_flags
|
||
|
||
if not aiops_flags.AIOPS_P4_LOG_ANOMALY:
|
||
return None
|
||
|
||
shadow_mode = aiops_flags.AIOPS_P4_SHADOW_MODE
|
||
|
||
try:
|
||
drain = self._get_drain()
|
||
if drain is None:
|
||
return None
|
||
|
||
# Drain3 clustering(同步計算,輕量)
|
||
result = drain.add_log_message(log_line)
|
||
if result is None:
|
||
return None
|
||
|
||
cluster = result.get("cluster", None)
|
||
if cluster is None:
|
||
return None
|
||
|
||
change_type = result.get("change_type", "none")
|
||
is_new = change_type in ("cluster_created", "template_created")
|
||
|
||
if not is_new:
|
||
# 已知 pattern:更新 last_seen
|
||
await self._update_cluster_hit(str(cluster.cluster_id))
|
||
return None
|
||
|
||
# 新 pattern!
|
||
template = cluster.get_template()
|
||
cluster_id = self._make_cluster_id(template)
|
||
|
||
now_str = now_taipei().isoformat()
|
||
log_cluster = LogCluster(
|
||
cluster_id=cluster_id,
|
||
template=template,
|
||
size=1,
|
||
first_seen_at=now_str,
|
||
last_seen_at=now_str,
|
||
is_new=True,
|
||
)
|
||
await self._save_new_cluster(log_cluster, sample_log=log_line)
|
||
|
||
event = LogAnomalyEvent(
|
||
cluster_id=cluster_id,
|
||
template=template,
|
||
sample_log=log_line[:500], # 限制長度
|
||
detected_at=now_str,
|
||
shadow_mode=shadow_mode,
|
||
source=source,
|
||
)
|
||
|
||
logger.info(
|
||
"log_new_pattern_detected",
|
||
cluster_id=cluster_id,
|
||
template=template[:200],
|
||
shadow_mode=shadow_mode,
|
||
source=source,
|
||
)
|
||
return event
|
||
|
||
except Exception as e:
|
||
logger.warning("log_anomaly_process_failed", error=str(e))
|
||
return None
|
||
|
||
async def process_pod_logs(
|
||
self,
|
||
namespace: str = "awoooi-prod",
|
||
tail_lines: int = 100,
|
||
) -> list[LogAnomalyEvent]:
|
||
"""
|
||
批次掃描 K8s Pod 日誌(供 ProactiveInspector 呼叫)。
|
||
|
||
Returns:
|
||
新 pattern 事件列表(Shadow Mode 時只記錄不觸發)
|
||
"""
|
||
from src.core.feature_flags import aiops_flags
|
||
if not aiops_flags.AIOPS_P4_LOG_ANOMALY:
|
||
return []
|
||
|
||
events: list[LogAnomalyEvent] = []
|
||
try:
|
||
logs = await self._fetch_pod_logs(namespace, tail_lines)
|
||
for line in logs:
|
||
if len(line.strip()) < 10:
|
||
continue
|
||
event = await self.process_log_line(line)
|
||
if event:
|
||
events.append(event)
|
||
except Exception as e:
|
||
logger.warning("pod_log_scan_failed", error=str(e))
|
||
|
||
return events
|
||
|
||
async def get_recent_new_patterns(
|
||
self,
|
||
limit: int = 10,
|
||
) -> list[dict[str, Any]]:
|
||
"""取得最近偵測到的新 pattern(供 ProactiveInspector 聚合報告)。"""
|
||
try:
|
||
from src.core.redis_client import get_redis
|
||
r = get_redis()
|
||
raw = await r.lrange(REDIS_KEY_NEW_PATTERNS, 0, limit - 1)
|
||
return [json.loads(item) for item in raw]
|
||
except Exception:
|
||
return []
|
||
|
||
# ──────────────────────────────────────────────────────────────────────────
|
||
# Private Helpers
|
||
# ──────────────────────────────────────────────────────────────────────────
|
||
|
||
def _make_cluster_id(self, template: str) -> str:
|
||
"""根據模板產生穩定 ID。"""
|
||
return hashlib.md5(template.encode()).hexdigest()[:8].upper()
|
||
|
||
async def _save_new_cluster(self, cluster: LogCluster, sample_log: str = "") -> None:
|
||
"""
|
||
儲存新 cluster:
|
||
1. 先寫 PostgreSQL(永久保存,AI 的 log 語意理解庫)
|
||
2. 推送到 Redis list(短期工作記憶,供 ProactiveInspector 聚合)
|
||
|
||
Phase 4 ADR-084 架構鐵律:Drain3 學到的模板不能只存 Redis。
|
||
Redis TTL 到期 = AI 把已知 pattern 再次當成新 pattern = 永遠不會學習。
|
||
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 改為 PG source of truth
|
||
"""
|
||
# 1. 寫入 PostgreSQL(主要持久化,UPSERT 防重複)
|
||
await self._pg_upsert_cluster(cluster, sample_log)
|
||
|
||
# 2. 推送到 Redis list(短期工作記憶)
|
||
try:
|
||
from src.core.redis_client import get_redis
|
||
r = get_redis()
|
||
payload = json.dumps({
|
||
**cluster.to_dict(),
|
||
"detected_at": cluster.first_seen_at,
|
||
})
|
||
await r.lpush(REDIS_KEY_NEW_PATTERNS, payload)
|
||
await r.ltrim(REDIS_KEY_NEW_PATTERNS, 0, MAX_NEW_PATTERNS - 1)
|
||
await r.expire(REDIS_KEY_NEW_PATTERNS, REDIS_TTL_CLUSTERS)
|
||
except Exception as e:
|
||
logger.warning("log_cluster_redis_push_failed", error=str(e))
|
||
|
||
async def _pg_upsert_cluster(self, cluster: LogCluster, sample_log: str) -> None:
|
||
"""
|
||
寫入或更新 LogClusterRecord(UPSERT on cluster_id)。
|
||
|
||
同一 cluster_id 再次出現時只更新 last_seen_at 和 size,不重複 INSERT。
|
||
"""
|
||
try:
|
||
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
||
from src.db.base import get_db_context
|
||
from src.db.models import LogClusterRecord
|
||
from src.utils.timezone import now_taipei
|
||
|
||
async with get_db_context() as session:
|
||
stmt = pg_insert(LogClusterRecord).values(
|
||
cluster_id=cluster.cluster_id,
|
||
template=cluster.template,
|
||
size=cluster.size,
|
||
source="k8s_pod",
|
||
sample_log=sample_log[:500] if sample_log else None,
|
||
).on_conflict_do_update(
|
||
index_elements=["cluster_id"],
|
||
set_={
|
||
"size": LogClusterRecord.size + 1,
|
||
"last_seen_at": now_taipei(),
|
||
},
|
||
)
|
||
await session.execute(stmt)
|
||
await session.commit()
|
||
except Exception as e:
|
||
logger.warning("log_cluster_pg_upsert_failed", cluster_id=cluster.cluster_id, error=str(e))
|
||
|
||
async def _update_cluster_hit(self, cluster_id: str) -> None:
|
||
"""更新已知 cluster 的命中次數(best-effort)。"""
|
||
try:
|
||
from src.core.redis_client import get_redis
|
||
r = get_redis()
|
||
key = f"{REDIS_KEY_CLUSTERS}:{cluster_id}"
|
||
await r.hincrby(key, "size", 1)
|
||
await r.hset(key, "last_seen_at", now_taipei().isoformat())
|
||
except Exception:
|
||
pass # best-effort
|
||
|
||
async def _fetch_pod_logs(
|
||
self,
|
||
namespace: str,
|
||
tail_lines: int,
|
||
) -> list[str]:
|
||
"""
|
||
透過 kubectl API server 抓取 Pod 日誌。
|
||
|
||
使用 K8s in-cluster config(API server: https://kubernetes.default.svc)
|
||
或本地 kubeconfig。
|
||
"""
|
||
import asyncio
|
||
import subprocess
|
||
|
||
try:
|
||
# 抓取 awoooi-api deploy 的日誌(最新 Pod)
|
||
result = await asyncio.get_event_loop().run_in_executor(
|
||
None,
|
||
lambda: subprocess.run(
|
||
[
|
||
"kubectl", "logs",
|
||
f"deploy/awoooi-api",
|
||
"-n", namespace,
|
||
f"--tail={tail_lines}",
|
||
"--timestamps=false",
|
||
],
|
||
capture_output=True,
|
||
text=True,
|
||
timeout=15,
|
||
),
|
||
)
|
||
if result.returncode == 0:
|
||
return result.stdout.splitlines()
|
||
logger.warning("kubectl_logs_failed", stderr=result.stderr[:200])
|
||
return []
|
||
except Exception as e:
|
||
logger.warning("pod_log_fetch_failed", error=str(e))
|
||
return []
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Singleton
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
_detector: LogAnomalyDetector | None = None
|
||
|
||
|
||
def get_log_anomaly_detector() -> LogAnomalyDetector:
|
||
global _detector
|
||
if _detector is None:
|
||
_detector = LogAnomalyDetector()
|
||
return _detector
|