Files
awoooi/apps/api/src/services/log_anomaly_detector.py
Your Name ff30c61c4c
All checks were successful
Code Review / ai-code-review (push) Successful in 21s
CD Pipeline / tests (push) Successful in 1m20s
CD Pipeline / build-and-deploy (push) Successful in 4m15s
CD Pipeline / post-deploy-checks (push) Successful in 1m58s
fix(rls): 收斂 API DB access context
2026-05-12 19:55:13 +08:00

385 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
AWOOOI AIOps Phase 4 — Log Anomaly Detector日誌異常偵測
==========================================================
職責Drain3 log clustering即時偵測新 pattern
核心 API
process_log_line(line) -> LogAnomalyEvent | None
get_new_patterns(since_ts) -> list[LogCluster]
設計原則:
- Shadow Mode新 pattern 只記錄 logger.info不觸發 Alert
- 狀態持久化到 Rediscluster tree 序列化JSON
- 熔斷Drain3 失敗 → 僅記錄,不 raise
- 非同步:所有 Redis I/O 非同步Drain3 計算在同步 helper 中執行
ADR-084: Phase 4 動態異常偵測源頭升級
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 初始建立
"""
from __future__ import annotations
import hashlib
import json
from dataclasses import dataclass
from typing import Any
import structlog
from src.utils.timezone import now_taipei
logger = structlog.get_logger(__name__)
# ── 常數 ────────────────────────────────────────────────────────────────────
REDIS_KEY_CLUSTERS = "log_anomaly:clusters" # hash: cluster_id → cluster data
REDIS_KEY_NEW_PATTERNS = "log_anomaly:new" # list: 新 pattern 事件(最新在前)
REDIS_TTL_CLUSTERS = 86400 * 7 # 7 天
MAX_NEW_PATTERNS = 200 # 保留最近 200 個新 pattern 事件
DRAIN_DEPTH = 4 # Drain3 tree depth
DRAIN_SIM_THRESHOLD = 0.4 # 相似度 < 此值 → 新 cluster
DRAIN_MAX_CHILDREN = 100 # max children per node
# ─────────────────────────────────────────────────────────────────────────────
# Data Types
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class LogCluster:
"""Drain3 log cluster"""
cluster_id: str
template: str # 模板e.g. "ERROR <*> connection failed to <*>"
size: int = 1 # 命中次數
first_seen_at: str = ""
last_seen_at: str = ""
is_new: bool = False # 首次出現 → True
def to_dict(self) -> dict[str, Any]:
return {
"cluster_id": self.cluster_id,
"template": self.template,
"size": self.size,
"first_seen_at": self.first_seen_at,
"last_seen_at": self.last_seen_at,
}
@classmethod
def from_dict(cls, d: dict[str, Any]) -> "LogCluster":
return cls(
cluster_id=d["cluster_id"],
template=d["template"],
size=d.get("size", 1),
first_seen_at=d.get("first_seen_at", ""),
last_seen_at=d.get("last_seen_at", ""),
)
@dataclass
class LogAnomalyEvent:
"""新 pattern 偵測事件"""
cluster_id: str
template: str
sample_log: str
detected_at: str
shadow_mode: bool = True
source: str = "k8s_pod" # k8s_pod | host_syslog | app_log
# ─────────────────────────────────────────────────────────────────────────────
# Main Service
# ─────────────────────────────────────────────────────────────────────────────
class LogAnomalyDetector:
"""
Drain3 日誌異常偵測服務
工作流程:
1. process_log_line() — 即時 clustering
2. 新 cluster → 記錄到 Redis list
3. ProactiveInspector 定期呼叫 get_new_patterns() 聚合
"""
def __init__(self) -> None:
self._drain: Any = None # lazy-init Drain3 instance
self._initialized = False
def _get_drain(self) -> Any:
"""Lazy-init Drain3避免 import 在啟動時失敗)。"""
if self._drain is not None:
return self._drain
try:
from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig
config = TemplateMinerConfig()
config.drain_depth = DRAIN_DEPTH
config.drain_sim_th = DRAIN_SIM_THRESHOLD
config.drain_max_children = DRAIN_MAX_CHILDREN
config.parametrize_numeric_tokens = True
self._drain = TemplateMiner(config=config)
self._initialized = True
logger.info("drain3_initialized")
return self._drain
except ImportError:
logger.warning("drain3_not_available", reason="package not installed")
return None
except Exception as e:
logger.warning("drain3_init_failed", error=str(e))
return None
async def process_log_line(
self,
log_line: str,
source: str = "k8s_pod",
) -> LogAnomalyEvent | None:
"""
處理單行日誌,若為新 pattern 回傳 LogAnomalyEvent。
Args:
log_line: 原始日誌行
source: 來源標籤
Returns:
LogAnomalyEvent新 pattern或 None已知 pattern
"""
from src.core.feature_flags import aiops_flags
if not aiops_flags.AIOPS_P4_LOG_ANOMALY:
return None
shadow_mode = aiops_flags.AIOPS_P4_SHADOW_MODE
try:
drain = self._get_drain()
if drain is None:
return None
# Drain3 clustering同步計算輕量
result = drain.add_log_message(log_line)
if result is None:
return None
cluster = result.get("cluster", None)
if cluster is None:
return None
change_type = result.get("change_type", "none")
is_new = change_type in ("cluster_created", "template_created")
if not is_new:
# 已知 pattern更新 last_seen
await self._update_cluster_hit(str(cluster.cluster_id))
return None
# 新 pattern
template = cluster.get_template()
cluster_id = self._make_cluster_id(template)
now_str = now_taipei().isoformat()
log_cluster = LogCluster(
cluster_id=cluster_id,
template=template,
size=1,
first_seen_at=now_str,
last_seen_at=now_str,
is_new=True,
)
await self._save_new_cluster(log_cluster, sample_log=log_line)
event = LogAnomalyEvent(
cluster_id=cluster_id,
template=template,
sample_log=log_line[:500], # 限制長度
detected_at=now_str,
shadow_mode=shadow_mode,
source=source,
)
logger.info(
"log_new_pattern_detected",
cluster_id=cluster_id,
template=template[:200],
shadow_mode=shadow_mode,
source=source,
)
return event
except Exception as e:
logger.warning("log_anomaly_process_failed", error=str(e))
return None
async def process_pod_logs(
self,
namespace: str = "awoooi-prod",
tail_lines: int = 100,
) -> list[LogAnomalyEvent]:
"""
批次掃描 K8s Pod 日誌(供 ProactiveInspector 呼叫)。
Returns:
新 pattern 事件列表Shadow Mode 時只記錄不觸發)
"""
from src.core.feature_flags import aiops_flags
if not aiops_flags.AIOPS_P4_LOG_ANOMALY:
return []
events: list[LogAnomalyEvent] = []
try:
logs = await self._fetch_pod_logs(namespace, tail_lines)
for line in logs:
if len(line.strip()) < 10:
continue
event = await self.process_log_line(line)
if event:
events.append(event)
except Exception as e:
logger.warning("pod_log_scan_failed", error=str(e))
return events
async def get_recent_new_patterns(
self,
limit: int = 10,
) -> list[dict[str, Any]]:
"""取得最近偵測到的新 pattern供 ProactiveInspector 聚合報告)。"""
try:
from src.core.redis_client import get_redis
r = get_redis()
raw = await r.lrange(REDIS_KEY_NEW_PATTERNS, 0, limit - 1)
return [json.loads(item) for item in raw]
except Exception:
return []
# ──────────────────────────────────────────────────────────────────────────
# Private Helpers
# ──────────────────────────────────────────────────────────────────────────
def _make_cluster_id(self, template: str) -> str:
"""根據模板產生穩定 ID。"""
return hashlib.md5(template.encode()).hexdigest()[:8].upper()
async def _save_new_cluster(self, cluster: LogCluster, sample_log: str = "") -> None:
"""
儲存新 cluster
1. 先寫 PostgreSQL永久保存AI 的 log 語意理解庫)
2. 推送到 Redis list短期工作記憶供 ProactiveInspector 聚合)
Phase 4 ADR-084 架構鐵律Drain3 學到的模板不能只存 Redis。
Redis TTL 到期 = AI 把已知 pattern 再次當成新 pattern = 永遠不會學習。
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 改為 PG source of truth
"""
# 1. 寫入 PostgreSQL主要持久化UPSERT 防重複)
await self._pg_upsert_cluster(cluster, sample_log)
# 2. 推送到 Redis list短期工作記憶
try:
from src.core.redis_client import get_redis
r = get_redis()
payload = json.dumps({
**cluster.to_dict(),
"detected_at": cluster.first_seen_at,
})
await r.lpush(REDIS_KEY_NEW_PATTERNS, payload)
await r.ltrim(REDIS_KEY_NEW_PATTERNS, 0, MAX_NEW_PATTERNS - 1)
await r.expire(REDIS_KEY_NEW_PATTERNS, REDIS_TTL_CLUSTERS)
except Exception as e:
logger.warning("log_cluster_redis_push_failed", error=str(e))
async def _pg_upsert_cluster(self, cluster: LogCluster, sample_log: str) -> None:
"""
寫入或更新 LogClusterRecordUPSERT on cluster_id
同一 cluster_id 再次出現時只更新 last_seen_at 和 size不重複 INSERT。
"""
try:
from sqlalchemy.dialects.postgresql import insert as pg_insert
from src.db.base import get_db_context
from src.db.models import LogClusterRecord
from src.utils.timezone import now_taipei
async with get_db_context() as session:
stmt = pg_insert(LogClusterRecord).values(
cluster_id=cluster.cluster_id,
template=cluster.template,
size=cluster.size,
source="k8s_pod",
sample_log=sample_log[:500] if sample_log else None,
).on_conflict_do_update(
index_elements=["cluster_id"],
set_={
"size": LogClusterRecord.size + 1,
"last_seen_at": now_taipei(),
},
)
await session.execute(stmt)
await session.commit()
except Exception as e:
logger.warning("log_cluster_pg_upsert_failed", cluster_id=cluster.cluster_id, error=str(e))
async def _update_cluster_hit(self, cluster_id: str) -> None:
"""更新已知 cluster 的命中次數best-effort"""
try:
from src.core.redis_client import get_redis
r = get_redis()
key = f"{REDIS_KEY_CLUSTERS}:{cluster_id}"
await r.hincrby(key, "size", 1)
await r.hset(key, "last_seen_at", now_taipei().isoformat())
except Exception:
pass # best-effort
async def _fetch_pod_logs(
self,
namespace: str,
tail_lines: int,
) -> list[str]:
"""
透過 kubectl API server 抓取 Pod 日誌。
使用 K8s in-cluster configAPI server: https://kubernetes.default.svc
或本地 kubeconfig。
"""
import asyncio
import subprocess
try:
# 抓取 awoooi-api deploy 的日誌(最新 Pod
result = await asyncio.get_event_loop().run_in_executor(
None,
lambda: subprocess.run(
[
"kubectl", "logs",
f"deploy/awoooi-api",
"-n", namespace,
f"--tail={tail_lines}",
"--timestamps=false",
],
capture_output=True,
text=True,
timeout=15,
),
)
if result.returncode == 0:
return result.stdout.splitlines()
logger.warning("kubectl_logs_failed", stderr=result.stderr[:200])
return []
except Exception as e:
logger.warning("pod_log_fetch_failed", error=str(e))
return []
# ─────────────────────────────────────────────────────────────────────────────
# Singleton
# ─────────────────────────────────────────────────────────────────────────────
_detector: LogAnomalyDetector | None = None
def get_log_anomaly_detector() -> LogAnomalyDetector:
global _detector
if _detector is None:
_detector = LogAnomalyDetector()
return _detector