diff --git a/apps/api/src/jobs/rule_stats_updater_job.py b/apps/api/src/jobs/rule_stats_updater_job.py
new file mode 100644
index 00000000..da0bc5de
--- /dev/null
+++ b/apps/api/src/jobs/rule_stats_updater_job.py
@@ -0,0 +1,222 @@
+"""
+Rule Stats Updater Job — ADR-090 § Rule Quality Analysis
+=========================================================
+Recomputes the operational statistics of alert_rule_catalog every hour,
+unlocking the E3 Hermes rule-quality analysis.
+
+Columns updated:
+  - last_fired_at = max(incidents.created_at WHERE alertname=rule_name)
+  - true_positive_count = count incidents.status='RESOLVED' past 30d
+  - false_positive_count = count approval_records.status='EXPIRED' past 30d
+    (EXPIRED = unhandled for 48h, treated as a false-alarm proxy)
+  - noise_rate = fp / (tp + fp) when there is data, otherwise NULL
+
+Scope (MVP):
+  ✅ Computed from incidents.alertname JOINed to approval_records.incident_id
+  ✅ UPDATE the statistics columns of alert_rule_catalog
+  ⏳ TODO: Hermes AI agent proposes deprecating rules with noise_rate > 0.5 (next phase)
+
+Design rules:
+  - UPDATE only, never INSERT (relies on rule_catalog_sync creating the rules first)
+  - Fixed 30d computation window (may become configurable later)
+  - On failure: log it, then retry after the shorter back-off delay
+
+Schedule:
+  - First run delayed by 240s (lets rule_catalog_sync finish first)
+  - Every 1h afterwards
+
+2026-04-19 ogt + Claude Opus 4.7 (1M context) Asia/Taipei
+ADR-090 § Rule Quality
+"""
+from __future__ import annotations
+
+import asyncio
+import json as _json
+import time as _time
+
+import structlog
+
+logger = structlog.get_logger(__name__)
+
+_UPDATE_INTERVAL_SEC = 3600
+_FIRST_DELAY_SEC = 240
+_LOOP_BACKOFF_SEC = 600
+_WINDOW_DAYS = 30
+
+
+async def run_rule_stats_updater_loop() -> None:
+    """Refresh the alert_rule_catalog statistics in an endless loop.
+
+    Sleeps ``_FIRST_DELAY_SEC`` before the first run, then waits
+    ``_UPDATE_INTERVAL_SEC`` after a successful run and the shorter
+    ``_LOOP_BACKOFF_SEC`` after a failed one.
+    """
+    logger.info("rule_stats_updater_loop_started", interval_sec=_UPDATE_INTERVAL_SEC)
+    await asyncio.sleep(_FIRST_DELAY_SEC)
+
+    while True:
+        try:
+            _stats, error_msg = await _run_once()
+            failed = error_msg is not None
+        except Exception as e:
+            # Safety net only: _run_once() traps update errors itself.
+            logger.exception("rule_stats_updater_loop_error", error=str(e))
+            failed = True
+        # _run_once() reports a failure instead of raising it, so the
+        # back-off is chosen from its result, not from an exception.
+        await asyncio.sleep(_LOOP_BACKOFF_SEC if failed else _UPDATE_INTERVAL_SEC)
+
+
+async def update_once() -> dict[str, int]:
+    """Run one statistics refresh over the whole alert_rule_catalog table.
+
+    A failed refresh is logged and recorded in automation_operation_log
+    but not raised.
+
+    Returns:
+        The counters ``rules_updated``, ``rules_with_fires`` and
+        ``rules_noisy``; all zero when the refresh failed.
+    """
+    stats, _error_msg = await _run_once()
+    return stats
+
+
+async def _run_once() -> tuple[dict[str, int], str | None]:
+    """Run one refresh; return ``(stats, error)`` with ``error`` None on success."""
+    # Monotonic clock: a wall-clock adjustment must not distort the duration.
+    started_at = _time.monotonic()
+    stats = {"rules_updated": 0, "rules_with_fires": 0, "rules_noisy": 0}
+    error_msg: str | None = None
+
+    try:
+        stats = await _do_update()
+    except Exception as e:
+        error_msg = f"{type(e).__name__}: {e}"[:1000]
+        logger.exception("rule_stats_update_once_failed", error=error_msg)
+
+    duration_ms = int((_time.monotonic() - started_at) * 1000)
+    await _log_aol(stats, duration_ms, error_msg)
+
+    logger.info(
+        "rule_stats_update_once_done",
+        status="failed" if error_msg else "success",
+        rules_updated=stats.get("rules_updated", 0),
+        rules_with_fires=stats.get("rules_with_fires", 0),
+        rules_noisy=stats.get("rules_noisy", 0),
+        duration_ms=duration_ms,
+    )
+    return stats, error_msg
+
+
+async def _do_update() -> dict[str, int]:
+    """Recompute the statistics of every rule with a single UPDATE.
+
+    Returns:
+        ``rules_updated`` (row count reported by the UPDATE) plus
+        ``rules_with_fires`` and ``rules_noisy`` read back from the table.
+    """
+    from sqlalchemy import text as _sql
+    from src.db.base import get_db_context
+
+    # NOTE(review): nothing commits explicitly here; this assumes
+    # get_db_context() commits on a clean exit. Confirm in src.db.base.
+    async with get_db_context() as db:
+        # One set-based UPDATE for the whole table: correlated subqueries
+        # compute each rule's counters, avoiding one query per rule.
+        # _WINDOW_DAYS is a module-level int constant, not external input.
+        result = await db.execute(
+            _sql(f"""
+                UPDATE alert_rule_catalog arc
+                SET true_positive_count = COALESCE(s.tp_count, 0),
+                    false_positive_count = COALESCE(s.fp_count, 0),
+                    last_fired_at = s.last_fired,
+                    noise_rate = CASE
+                        WHEN COALESCE(s.tp_count, 0) + COALESCE(s.fp_count, 0) > 0
+                        THEN COALESCE(s.fp_count, 0)::numeric
+                             / (COALESCE(s.tp_count, 0) + COALESCE(s.fp_count, 0))::numeric
+                        ELSE NULL
+                    END,
+                    updated_at = NOW()
+                FROM (
+                    SELECT
+                        arc_inner.rule_name,
+                        (SELECT count(*) FROM incidents i
+                          WHERE i.alertname = arc_inner.rule_name
+                            AND i.status = 'RESOLVED'
+                            AND i.created_at > NOW() - INTERVAL '{_WINDOW_DAYS} days'
+                        ) AS tp_count,
+                        (SELECT count(*) FROM approval_records ar
+                           JOIN incidents i2 ON ar.incident_id = i2.incident_id
+                          WHERE i2.alertname = arc_inner.rule_name
+                            AND ar.status = 'EXPIRED'
+                            AND ar.created_at > NOW() - INTERVAL '{_WINDOW_DAYS} days'
+                        ) AS fp_count,
+                        (SELECT max(created_at) FROM incidents i3
+                          WHERE i3.alertname = arc_inner.rule_name
+                        ) AS last_fired
+                    FROM alert_rule_catalog arc_inner
+                ) s
+                WHERE arc.rule_name = s.rule_name
+            """),
+        )
+        rules_updated = result.rowcount or 0
+
+        # Read the summary counters back for logging.
+        row = await db.execute(
+            _sql(f"""
+                SELECT
+                    count(*) FILTER (WHERE last_fired_at > NOW() - INTERVAL '{_WINDOW_DAYS} days') AS with_fires,
+                    count(*) FILTER (WHERE noise_rate IS NOT NULL AND noise_rate > 0.5) AS noisy
+                FROM alert_rule_catalog
+            """),
+        )
+        r = row.one()
+
+    return {
+        "rules_updated": rules_updated,
+        "rules_with_fires": int(r.with_fires or 0),
+        "rules_noisy": int(r.noisy or 0),
+    }
+
+
+async def _log_aol(stats: dict[str, int], duration_ms: int, error: str | None) -> None:
+    """Write one automation_operation_log row for this run, best effort.
+
+    A failure to write is logged as a warning and otherwise ignored.
+
+    Args:
+        stats: Counters of the run; stored as the JSON ``output``.
+        duration_ms: Duration of the run in milliseconds.
+        error: Error summary of a failed run, or None on success.
+    """
+    try:
+        from sqlalchemy import text as _sql
+        from src.db.base import get_db_context
+
+        aol_status = "failed" if error else "success"
+        async with get_db_context() as db:
+            await db.execute(
+                _sql("""
+                    INSERT INTO automation_operation_log (
+                        operation_type, actor, status,
+                        input, output, duration_ms, error, tags
+                    ) VALUES (
+                        'rule_updated',
+                        'rule_stats_updater',
+                        :st,
+                        CAST(:input AS jsonb),
+                        CAST(:output AS jsonb),
+                        :dur, :err, :tags
+                    )
+                """),
+                {
+                    "st": aol_status,
+                    "input": _json.dumps({"window_days": _WINDOW_DAYS}, ensure_ascii=False),
+                    "output": _json.dumps(stats, ensure_ascii=False),
+                    "dur": duration_ms,
+                    "err": error[:2000] if error else None,
+                    "tags": ["rule_catalog", "stats", "quality"],
+                },
+            )
+    except Exception as e:
+        logger.warning("rule_stats_aol_write_failed", error=str(e))
diff --git a/apps/api/src/main.py b/apps/api/src/main.py
index b42422a1..5d27985b 100644
--- a/apps/api/src/main.py
+++ b/apps/api/src/main.py
@@ -420,6 +420,18 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
     except Exception as e:
         logger.warning("coverage_evaluator_loop_schedule_failed", error=str(e))
 
+    # ADR-090 § Rule Stats Updater (2026-04-19 ogt + Claude Opus 4.7 Asia/Taipei)
+    # Every 1h: compute rule statistics from incidents + approval_records.
+    # Unlocks E3 Hermes: rules with noise_rate > 0.5 can be proposed for deprecation by AI.
+    try:
+        from src.jobs.rule_stats_updater_job import run_rule_stats_updater_loop
+        # Keep a reference: the event loop holds tasks only weakly, so an
+        # unreferenced task may be garbage-collected before it finishes.
+        _rule_stats_updater_task = asyncio.create_task(run_rule_stats_updater_loop())
+        logger.info("rule_stats_updater_loop_scheduled", interval_sec=3600)
+    except Exception as e:
+        logger.warning("rule_stats_updater_loop_schedule_failed", error=str(e))
+
     # ADR-076 Task 4: 每日 08:00 台北時間自動日度巡檢報告
     # 2026-04-14 Claude Haiku 4.5 Asia/Taipei
     try: