From ba18ad2ef8f4885b1e45874736452a0fa47df895 Mon Sep 17 00:00:00 2001 From: OG T Date: Sun, 19 Apr 2026 19:39:05 +0800 Subject: [PATCH] =?UTF-8?q?feat(hermes+rules):=20LLM=20=E5=8D=87=E7=B4=9A?= =?UTF-8?q?=20Hermes=20+=20=E7=B5=B1=E5=B8=A5=E6=B1=BA=E7=AD=96=20deprecat?= =?UTF-8?q?e=20PostgreSQLDiskGrowthRate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 統帥 2026-04-19 決策: - Rule 1 PostgreSQLDiskGrowthRate → 選項 C: deprecate + 替代新規則 - Rule 2 NoAlertsReceived2Hours → 保留 (真實告警鏈路守護) - noise_rate 算法先修正 (NO_ACTION 不算 fp),觀察後動態調整 1. rule_stats_updater v2 noise 算法: 原: 任何 EXPIRED approval 都算 fp 問題: NO_ACTION/OBSERVE/INVESTIGATE 是 AI 純觀察,不該算假報 修: WHERE ar.action NOT ILIKE '%NO_ACTION%' AND NOT ILIKE '%OBSERVE%' AND ... 2. hermes_rule_quality v2 LLM 升級: 新增 _llm_analyze_noisy_rule: - 用 OpenClaw (Ollama/NemoTron/Gemini) 分析每條噪音 rule - JSON 輸出: probable_root_causes/recommended_actions/confidence/should_deprecate - 3 路 parse fallback (直接 / NemoTron wrapper / description nested) _write_advisory_aol 加 llm_analysis 到 output_payload _send_telegram_summary 加 AI 判定 + top 2 建議 (8 條上限避免太長) 符合統帥鐵律: AI 分析但不自動動作,仍人工決策 3. ops/monitoring/alerts-unified.yml 替換 Rule 1: 刪 PostgreSQLDiskGrowthRate (500MB/h 增長 → 觸發 WAL 正常行為誤報) 加 HostDiskUsageHigh (>80% for 10m, warning) 加 HostDiskUsageCritical (>90% for 5m, critical) 兩者 labels.supersedes='PostgreSQLDiskGrowthRate' 供追溯 (待 deploy-alerts workflow 下次 apply 到 Prometheus) 4. DB 即時 mark deprecated (避免等 alerts yaml 部署前 Hermes 又推): UPDATE alert_rule_catalog SET review_status='deprecated' WHERE rule_name='PostgreSQLDiskGrowthRate' Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/api/src/jobs/hermes_rule_quality_job.py | 145 ++++++++++++++++--- apps/api/src/jobs/rule_stats_updater_job.py | 18 ++- ops/monitoring/alerts-unified.yml | 50 +++++-- 3 files changed, 179 insertions(+), 34 deletions(-) diff --git a/apps/api/src/jobs/hermes_rule_quality_job.py b/apps/api/src/jobs/hermes_rule_quality_job.py index cb35c2f9..a486e2b4 100644 --- a/apps/api/src/jobs/hermes_rule_quality_job.py +++ b/apps/api/src/jobs/hermes_rule_quality_job.py @@ -64,22 +64,30 @@ async def run_hermes_rule_quality_loop() -> None: async def analyze_once() -> dict[str, int]: - """一次分析: 找噪音 rule + 推建議.""" + """一次分析: 找噪音 rule + LLM 分析真因 + 推建議 + aol 留痕.""" started_ms = _time.time() - stats = {"noisy_rules": 0, "advisories_written": 0, "telegram_sent": 0} + stats = {"noisy_rules": 0, "llm_analyzed": 0, "advisories_written": 0, "telegram_sent": 0} error_msg: str | None = None + llm_analyses: dict[str, dict[str, Any]] = {} try: noisy = await _fetch_noisy_rules() stats["noisy_rules"] = len(noisy) + # v2 升級: 對每條 noisy rule 跑 LLM 分析真因 + 具體建議 for r in noisy: - ok = await _write_advisory_aol(r) + analysis = await _llm_analyze_noisy_rule(r) + if analysis: + llm_analyses[r["rule_name"]] = analysis + stats["llm_analyzed"] += 1 + + for r in noisy: + ok = await _write_advisory_aol(r, llm_analyses.get(r["rule_name"])) if ok: stats["advisories_written"] += 1 if noisy: - sent = await _send_telegram_summary(noisy) + sent = await _send_telegram_summary(noisy, llm_analyses) stats["telegram_sent"] = 1 if sent else 0 except Exception as e: @@ -90,6 +98,7 @@ async def analyze_once() -> dict[str, int]: logger.info( "hermes_rule_quality_once_done", noisy=stats["noisy_rules"], + llm_analyzed=stats["llm_analyzed"], advisories=stats["advisories_written"], telegram_sent=stats["telegram_sent"], duration_ms=duration_ms, @@ -97,6 +106,89 @@ async def analyze_once() -> dict[str, int]: return stats +# ============================================================================ +# v2 LLM 分析 — 統帥鐵律「朝 AI 自主化方向」 +# ============================================================================ + +_LLM_ANALYZE_PROMPT = """你是 AWOOOI SRE 告警規則品質分析專家。以下是一條 Prometheus alerting rule 過去 30 天的統計,請分析假報真因並提出具體改進建議。 + +## 告警規則 +- rule_name: {rule_name} +- severity: {severity} +- expr: {expr} +- for: {duration_seconds}s +- labels: {labels} +- annotations: {annotations} + +## 過去 30 天統計 +- true_positive (確實解決的): {tp} +- false_positive (有破壞性動作但 EXPIRED 沒人理): {fp} +- noise_rate: {noise_rate} + +## 輸出規格 (必須是合法 JSON,純 JSON 無前後文字) +{{ + "probable_root_causes": ["3-4 個候選真因,繁中"], + "recommended_actions": [ + {{"action": "adjust_threshold|add_for_duration|refine_labels|deprecate|split_rule|keep_as_is", "detail": "具體怎麼做,繁中一句話"}} + ], + "confidence": 0.0-1.0, + "should_deprecate": true/false +}} + +## 分析思路 +1. 看 expr 是否過於敏感 (閾值太低 / 沒有 for: window) +2. 看 annotations 是否暗示「這是真實需要處理的問題」但被 AI 判 NO_ACTION → 可能是 action 流程問題而非規則問題 +3. 考慮 severity warning/critical 是否合理 +""" + + +async def _llm_analyze_noisy_rule(rule: dict[str, Any]) -> dict[str, Any] | None: + """用 OpenClaw (多 provider) 分析噪音真因. 失敗回 None 不阻塞.""" + try: + import json as _j + from src.services.openclaw import get_openclaw + + prompt = _LLM_ANALYZE_PROMPT.format( + rule_name=rule["rule_name"], + severity=rule["severity"] or "-", + expr=(rule.get("expr") or "")[:500], + duration_seconds=rule.get("duration_seconds") or 0, + labels=_j.dumps(rule.get("labels", {}), ensure_ascii=False)[:300], + annotations=_j.dumps(rule.get("annotations", {}), ensure_ascii=False)[:300], + tp=rule["tp"], + fp=rule["fp"], + noise_rate=f"{rule['noise_rate']:.1%}", + ) + openclaw = get_openclaw() + text, provider, success = await openclaw.call(prompt) + if not success or not text: + return None + + _raw = text.strip() + if _raw.startswith("```"): + _raw = _raw.strip("`").lstrip("json").strip() + + try: + parsed = _j.loads(_raw) + if isinstance(parsed, dict) and "recommended_actions" in parsed: + parsed["_llm_provider"] = provider + return parsed + # NemoTron wrapper: description 內嵌 JSON + if isinstance(parsed, dict) and "description" in parsed: + desc = str(parsed["description"]).strip() + if desc.startswith("{"): + inner = _j.loads(desc) + if isinstance(inner, dict) and "recommended_actions" in inner: + inner["_llm_provider"] = provider + return inner + except (_j.JSONDecodeError, ValueError) as e: + logger.warning("hermes_llm_parse_failed", rule=rule["rule_name"], error=str(e), raw=_raw[:200]) + return None + except Exception as e: + logger.warning("hermes_llm_analyze_error", rule=rule["rule_name"], error=str(e)) + return None + + # ============================================================================ # 資料查詢 # ============================================================================ @@ -144,8 +236,8 @@ async def _fetch_noisy_rules() -> list[dict[str, Any]]: # 建議寫入 (aol only,不改 rule 本身) # ============================================================================ -async def _write_advisory_aol(rule: dict[str, Any]) -> bool: - """寫 aol(rule_rejected) — 紀錄 AI 建議人工審查.""" +async def _write_advisory_aol(rule: dict[str, Any], llm_analysis: dict[str, Any] | None = None) -> bool: + """寫 aol(rule_rejected) — 紀錄 AI 建議人工審查 + LLM 分析結果.""" try: from sqlalchemy import text as _sql from src.db.base import get_db_context @@ -157,7 +249,7 @@ async def _write_advisory_aol(rule: dict[str, Any]) -> bool: "true_positive_count": rule["tp"], "false_positive_count": rule["fp"], } - output_payload = { + output_payload: dict[str, Any] = { "proposed_action": "review_or_deprecate", "reason": ( f"過去 30d noise_rate {rule['noise_rate']:.1%} " @@ -166,6 +258,8 @@ async def _write_advisory_aol(rule: dict[str, Any]) -> bool: ), "requires_human_decision": True, } + if llm_analysis: + output_payload["llm_analysis"] = llm_analysis async with get_db_context() as db: await db.execute( @@ -198,9 +292,13 @@ async def _write_advisory_aol(rule: dict[str, Any]) -> bool: # Telegram 推送 # ============================================================================ -async def _send_telegram_summary(noisy: list[dict[str, Any]]) -> bool: - """推 Telegram 摘要訊息給 SRE group.""" +async def _send_telegram_summary( + noisy: list[dict[str, Any]], + llm_analyses: dict[str, dict[str, Any]] | None = None, +) -> bool: + """推 Telegram 摘要訊息給 SRE group,含 LLM 分析結果.""" try: + import html from src.core.config import settings from src.services.telegram_gateway import get_telegram_gateway @@ -208,21 +306,32 @@ async def _send_telegram_summary(noisy: list[dict[str, Any]]) -> bool: logger.info("hermes_telegram_skip_no_chat_id") return False + llm_analyses = llm_analyses or {} lines = [ - f"🔍 Hermes 規則品質檢測", - f"檢測到 {len(noisy)} 條規則噪音率 ≥ {_NOISE_THRESHOLD:.0%},建議人工審查:", + "🔍 Hermes 規則品質檢測 (AI 分析)", + f"檢測到 {len(noisy)} 條規則噪音率 ≥ {_NOISE_THRESHOLD:.0%},請統帥審查:", "", ] - for r in noisy[:10]: # 最多秀 10 條避免太長 - import html + for r in noisy[:8]: # LLM 分析含建議,單條訊息較長,只秀 8 條 safe_name = html.escape(r["rule_name"]) lines.append( - f"🟡 {safe_name}\n" - f" 噪音率 {r['noise_rate']:.1%} (tp={r['tp']} fp={r['fp']} sev={r['severity'] or '-'})" + f"🟡 {safe_name} — noise {r['noise_rate']:.1%} (tp={r['tp']} fp={r['fp']})" ) - if len(noisy) > 10: - lines.append(f"\n…還有 {len(noisy) - 10} 條") - lines.append("\n人工決策: 確認 deprecate 或改 expr → 手動 UPDATE review_status") + ai = llm_analyses.get(r["rule_name"]) + if ai: + deprecate = ai.get("should_deprecate") + conf = ai.get("confidence", 0.0) + lines.append(f" AI 判定: should_deprecate={deprecate} confidence={conf:.0%}") + actions = ai.get("recommended_actions", []) or [] + for act in actions[:2]: # 最多秀前 2 個建議 + safe_detail = html.escape(str(act.get("detail", ""))[:120]) + lines.append(f" ▸ {html.escape(str(act.get('action', '')))}: {safe_detail}") + else: + lines.append(" (LLM 分析不可用,僅依噪音率判斷)") + lines.append("") + if len(noisy) > 8: + lines.append(f"…還有 {len(noisy) - 8} 條,見 automation_operation_log") + lines.append("決策: 人工 UPDATE alert_rule_catalog SET review_status='deprecated' WHERE rule_name='...'") msg = "\n".join(lines) diff --git a/apps/api/src/jobs/rule_stats_updater_job.py b/apps/api/src/jobs/rule_stats_updater_job.py index da0bc5de..4c8e08e1 100644 --- a/apps/api/src/jobs/rule_stats_updater_job.py +++ b/apps/api/src/jobs/rule_stats_updater_job.py @@ -84,13 +84,18 @@ async def update_once() -> dict[str, int]: async def _do_update() -> dict[str, int]: - """一次 UPDATE 更新所有 rule 的統計.""" + """一次 UPDATE 更新所有 rule 的統計. + + 2026-04-19 ogt + Claude Opus 4.7 v2 noise 算法修正: + 原: 任何 EXPIRED approval 都算 false positive + 問題: NO_ACTION/OBSERVE/INVESTIGATE 是 AI 刻意選「純觀察不破壞」, + 被 48h EXPIRED 只代表沒人看,不代表規則假報 + 修: 只把「非 NO_ACTION 且 EXPIRED」算 fp (未來可擴: 加人工 rejected 旗標) + """ from sqlalchemy import text as _sql from src.db.base import get_db_context async with get_db_context() as db: - # 一次性 UPDATE 全表 — 對每 rule 計算 stats - # 使用 LATERAL 避免 N+1 queries result = await db.execute( _sql(f""" UPDATE alert_rule_catalog arc @@ -112,10 +117,17 @@ async def _do_update() -> dict[str, int]: AND i.status = 'RESOLVED' AND i.created_at > NOW() - INTERVAL '{_WINDOW_DAYS} days' ) AS tp_count, + -- v2 修正: 只算「非 NO_ACTION 類動作 且 EXPIRED」為假報 + -- NO_ACTION/OBSERVE/INVESTIGATE 是 AI 純觀察,不該算假報 (SELECT count(*) FROM approval_records ar JOIN incidents i2 ON ar.incident_id = i2.incident_id WHERE i2.alertname = arc_inner.rule_name AND ar.status = 'EXPIRED' + AND ar.action NOT ILIKE '%NO_ACTION%' + AND ar.action NOT ILIKE '%NO-ACTION%' + AND ar.action NOT ILIKE '%NOACTION%' + AND ar.action NOT ILIKE '%OBSERVE%' + AND ar.action NOT ILIKE '%INVESTIGATE%' AND ar.created_at > NOW() - INTERVAL '{_WINDOW_DAYS} days' ) AS fp_count, (SELECT max(created_at) FROM incidents i3 diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index eeca4100..da659b91 100644 --- a/ops/monitoring/alerts-unified.yml +++ b/ops/monitoring/alerts-unified.yml @@ -887,28 +887,52 @@ groups: description: "Redis Stream 積壓超過 500 筆,飛輪消費者可能阻塞。" runbook: "檢查 consumer group lag:XINFO GROUPS " - - alert: PostgreSQLDiskGrowthRate + # 2026-04-19 Hermes E3 決策: PostgreSQLDiskGrowthRate deprecated + # 真因: 500MB/h 增長是 PG WAL 正常行為 (commits/checkpoints),不該告警 + # 過去 30d 觸發 7 次,全部 AI 判 NO_ACTION 或誤判 kubectl rollout restart 失敗 + # 統帥決策 (2026-04-19 18:xx Taipei): 選項 C 刪除舊規則 + 改用絕對磁碟使用率 + # ----------------------------------------------------------------- + - alert: HostDiskUsageHigh expr: | ( - node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"} - - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"} + node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"} + - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay", mountpoint="/"} ) - - ( - node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h - - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h - ) - > 524288000 - for: 5m + / node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"} + * 100 > 80 + for: 10m labels: severity: warning - layer: docker-188 + layer: systemd-188 alert_category: infrastructure notification_type: TYPE-3 auto_repair: "false" + supersedes: PostgreSQLDiskGrowthRate annotations: - summary: "188 主機磁碟 1 小時增長超過 500MB" - description: "磁碟在過去 1 小時增長 {{ $value | humanize1024 }}B,可能是 PostgreSQL WAL 或日誌暴增。" - runbook: "SSH 188:df -h / && du -sh /var/lib/postgresql/*/pg_wal" + summary: "主機 {{ $labels.instance }} 根目錄磁碟使用率 {{ $value | humanize }}% (>80%)" + description: "磁碟使用率持續 10 分鐘超過 80%, 需清理或擴容. 常見原因: PG WAL, 日誌, container images, 舊 build cache." + runbook: "SSH 該主機: df -h / && du -sh /var/lib/postgresql/*/pg_wal /var/log /var/lib/docker" + + - alert: HostDiskUsageCritical + expr: | + ( + node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"} + - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay", mountpoint="/"} + ) + / node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"} + * 100 > 90 + for: 5m + labels: + severity: critical + layer: systemd-188 + alert_category: infrastructure + notification_type: TYPE-3 + auto_repair: "false" + supersedes: PostgreSQLDiskGrowthRate + annotations: + summary: "🔴 主機 {{ $labels.instance }} 根目錄磁碟使用率 {{ $value | humanize }}% (>90%, critical)" + description: "磁碟即將滿, 需立即清理. 超過 95% 可能導致服務中斷." + runbook: "立即 SSH 該主機: df -h / && du -sh /* 2>/dev/null | sort -h | tail -10" # ========================================================================= # 主機連通性告警 (awoooi_host_connectivity) — 從主機補回 2026-04-12