diff --git a/apps/api/src/jobs/hermes_rule_quality_job.py b/apps/api/src/jobs/hermes_rule_quality_job.py
index cb35c2f9..a486e2b4 100644
--- a/apps/api/src/jobs/hermes_rule_quality_job.py
+++ b/apps/api/src/jobs/hermes_rule_quality_job.py
@@ -64,22 +64,30 @@ async def run_hermes_rule_quality_loop() -> None:
async def analyze_once() -> dict[str, int]:
- """一次分析: 找噪音 rule + 推建議."""
+ """一次分析: 找噪音 rule + LLM 分析真因 + 推建議 + aol 留痕."""
started_ms = _time.time()
- stats = {"noisy_rules": 0, "advisories_written": 0, "telegram_sent": 0}
+ stats = {"noisy_rules": 0, "llm_analyzed": 0, "advisories_written": 0, "telegram_sent": 0}
error_msg: str | None = None
+ llm_analyses: dict[str, dict[str, Any]] = {}
try:
noisy = await _fetch_noisy_rules()
stats["noisy_rules"] = len(noisy)
+ # v2 升級: 對每條 noisy rule 跑 LLM 分析真因 + 具體建議
for r in noisy:
- ok = await _write_advisory_aol(r)
+ analysis = await _llm_analyze_noisy_rule(r)
+ if analysis:
+ llm_analyses[r["rule_name"]] = analysis
+ stats["llm_analyzed"] += 1
+
+ for r in noisy:
+ ok = await _write_advisory_aol(r, llm_analyses.get(r["rule_name"]))
if ok:
stats["advisories_written"] += 1
if noisy:
- sent = await _send_telegram_summary(noisy)
+ sent = await _send_telegram_summary(noisy, llm_analyses)
stats["telegram_sent"] = 1 if sent else 0
except Exception as e:
@@ -90,6 +98,7 @@ async def analyze_once() -> dict[str, int]:
logger.info(
"hermes_rule_quality_once_done",
noisy=stats["noisy_rules"],
+ llm_analyzed=stats["llm_analyzed"],
advisories=stats["advisories_written"],
telegram_sent=stats["telegram_sent"],
duration_ms=duration_ms,
@@ -97,6 +106,89 @@ async def analyze_once() -> dict[str, int]:
return stats
+# ============================================================================
+# v2 LLM analysis — per the commander's standing rule: "move toward AI autonomy"
+# ============================================================================
+
+# Prompt template; _llm_analyze_noisy_rule fills it with str.format().
+# Placeholders: rule_name, severity, expr, duration_seconds, labels,
+# annotations, tp, fp, noise_rate.
+# The literal JSON braces in the output spec are doubled ({{ }}) so that
+# str.format() emits them verbatim instead of parsing them as fields.
+_LLM_ANALYZE_PROMPT = """你是 AWOOOI SRE 告警規則品質分析專家。以下是一條 Prometheus alerting rule 過去 30 天的統計,請分析假報真因並提出具體改進建議。
+
+## 告警規則
+- rule_name: {rule_name}
+- severity: {severity}
+- expr: {expr}
+- for: {duration_seconds}s
+- labels: {labels}
+- annotations: {annotations}
+
+## 過去 30 天統計
+- true_positive (確實解決的): {tp}
+- false_positive (有破壞性動作但 EXPIRED 沒人理): {fp}
+- noise_rate: {noise_rate}
+
+## 輸出規格 (必須是合法 JSON,純 JSON 無前後文字)
+{{
+ "probable_root_causes": ["3-4 個候選真因,繁中"],
+ "recommended_actions": [
+ {{"action": "adjust_threshold|add_for_duration|refine_labels|deprecate|split_rule|keep_as_is", "detail": "具體怎麼做,繁中一句話"}}
+ ],
+ "confidence": 0.0-1.0,
+ "should_deprecate": true/false
+}}
+
+## 分析思路
+1. 看 expr 是否過於敏感 (閾值太低 / 沒有 for: window)
+2. 看 annotations 是否暗示「這是真實需要處理的問題」但被 AI 判 NO_ACTION → 可能是 action 流程問題而非規則問題
+3. 考慮 severity warning/critical 是否合理
+"""
+
+
+async def _llm_analyze_noisy_rule(rule: dict[str, Any]) -> dict[str, Any] | None:
+    """Ask OpenClaw (multi-provider LLM) why a noisy rule misfires; best-effort.
+
+    Args:
+        rule: One noisy-rule row. Reads ``rule_name``, ``severity``, ``tp``,
+            ``fp`` and ``noise_rate``, plus the optional ``expr``,
+            ``duration_seconds``, ``labels`` and ``annotations``.
+
+    Returns:
+        The parsed analysis dict, which always contains
+        ``recommended_actions`` and gets ``_llm_provider`` set to the provider
+        that answered. ``None`` when the call fails, the reply is not valid
+        JSON, or the JSON does not match the expected schema. Failures in the
+        call/parse path are logged and swallowed so one rule cannot abort the
+        caller's loop.
+    """
+    # Resolved once with .get so the logging in the handlers below can never
+    # raise a KeyError of its own.
+    rule_name = rule.get("rule_name")
+    try:
+        import json as _j
+        from src.services.openclaw import get_openclaw
+
+        prompt = _LLM_ANALYZE_PROMPT.format(
+            rule_name=rule["rule_name"],
+            severity=rule["severity"] or "-",
+            # Long fields are truncated to keep the prompt small.
+            expr=(rule.get("expr") or "")[:500],
+            duration_seconds=rule.get("duration_seconds") or 0,
+            labels=_j.dumps(rule.get("labels", {}), ensure_ascii=False)[:300],
+            annotations=_j.dumps(rule.get("annotations", {}), ensure_ascii=False)[:300],
+            tp=rule["tp"],
+            fp=rule["fp"],
+            noise_rate=f"{rule['noise_rate']:.1%}",
+        )
+        openclaw = get_openclaw()
+        text, provider, success = await openclaw.call(prompt)
+        if not success or not text:
+            return None
+
+        _raw = text.strip()
+        # Keep only the outermost {...} span. This drops Markdown fences with
+        # any language tag and any prose around the JSON. The previous
+        # lstrip("json") removed a *set of characters*, not the "json" prefix,
+        # and only ran when the reply began with a fence.
+        start, end = _raw.find("{"), _raw.rfind("}")
+        if start != -1 and end > start:
+            _raw = _raw[start : end + 1]
+
+        try:
+            parsed = _j.loads(_raw)
+            if isinstance(parsed, dict) and "recommended_actions" in parsed:
+                parsed["_llm_provider"] = provider
+                return parsed
+            # NemoTron wrapper: the real answer is a JSON string under "description".
+            if isinstance(parsed, dict) and "description" in parsed:
+                desc = str(parsed["description"]).strip()
+                if desc.startswith("{"):
+                    inner = _j.loads(desc)
+                    if isinstance(inner, dict) and "recommended_actions" in inner:
+                        inner["_llm_provider"] = provider
+                        return inner
+            # Valid JSON but not the requested schema: log it instead of
+            # dropping the reply silently.
+            logger.warning("hermes_llm_schema_mismatch", rule=rule_name, raw=_raw[:200])
+        except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
+            logger.warning("hermes_llm_parse_failed", rule=rule_name, error=str(e), raw=_raw[:200])
+        return None
+    except Exception as e:
+        # Top-level best-effort boundary: provider/network/format errors end here.
+        logger.warning("hermes_llm_analyze_error", rule=rule_name, error=str(e))
+        return None
+
+
# ============================================================================
# 資料查詢
# ============================================================================
@@ -144,8 +236,8 @@ async def _fetch_noisy_rules() -> list[dict[str, Any]]:
# 建議寫入 (aol only,不改 rule 本身)
# ============================================================================
-async def _write_advisory_aol(rule: dict[str, Any]) -> bool:
- """寫 aol(rule_rejected) — 紀錄 AI 建議人工審查."""
+async def _write_advisory_aol(rule: dict[str, Any], llm_analysis: dict[str, Any] | None = None) -> bool:
+ """寫 aol(rule_rejected) — 紀錄 AI 建議人工審查 + LLM 分析結果."""
try:
from sqlalchemy import text as _sql
from src.db.base import get_db_context
@@ -157,7 +249,7 @@ async def _write_advisory_aol(rule: dict[str, Any]) -> bool:
"true_positive_count": rule["tp"],
"false_positive_count": rule["fp"],
}
- output_payload = {
+ output_payload: dict[str, Any] = {
"proposed_action": "review_or_deprecate",
"reason": (
f"過去 30d noise_rate {rule['noise_rate']:.1%} "
@@ -166,6 +258,8 @@ async def _write_advisory_aol(rule: dict[str, Any]) -> bool:
),
"requires_human_decision": True,
}
+ if llm_analysis:
+ output_payload["llm_analysis"] = llm_analysis
async with get_db_context() as db:
await db.execute(
@@ -198,9 +292,13 @@ async def _write_advisory_aol(rule: dict[str, Any]) -> bool:
# Telegram 推送
# ============================================================================
-async def _send_telegram_summary(noisy: list[dict[str, Any]]) -> bool:
- """推 Telegram 摘要訊息給 SRE group."""
+async def _send_telegram_summary(
+ noisy: list[dict[str, Any]],
+ llm_analyses: dict[str, dict[str, Any]] | None = None,
+) -> bool:
+ """推 Telegram 摘要訊息給 SRE group,含 LLM 分析結果."""
try:
+ import html
from src.core.config import settings
from src.services.telegram_gateway import get_telegram_gateway
@@ -208,21 +306,32 @@ async def _send_telegram_summary(noisy: list[dict[str, Any]]) -> bool:
logger.info("hermes_telegram_skip_no_chat_id")
return False
+ llm_analyses = llm_analyses or {}
lines = [
- f"🔍 Hermes 規則品質檢測",
- f"檢測到 {len(noisy)} 條規則噪音率 ≥ {_NOISE_THRESHOLD:.0%},建議人工審查:",
+ "🔍 Hermes 規則品質檢測 (AI 分析)",
+ f"檢測到 {len(noisy)} 條規則噪音率 ≥ {_NOISE_THRESHOLD:.0%},請統帥審查:",
"",
]
- for r in noisy[:10]: # 最多秀 10 條避免太長
- import html
+ for r in noisy[:8]: # LLM 分析含建議,單條訊息較長,只秀 8 條
safe_name = html.escape(r["rule_name"])
lines.append(
- f"🟡 {safe_name}\n"
- f" 噪音率 {r['noise_rate']:.1%} (tp={r['tp']} fp={r['fp']} sev={r['severity'] or '-'})"
+ f"🟡 {safe_name} — noise {r['noise_rate']:.1%} (tp={r['tp']} fp={r['fp']})"
)
- if len(noisy) > 10:
- lines.append(f"\n…還有 {len(noisy) - 10} 條")
- lines.append("\n人工決策: 確認 deprecate 或改 expr → 手動 UPDATE review_status")
+ ai = llm_analyses.get(r["rule_name"])
+ if ai:
+ deprecate = ai.get("should_deprecate")
+ conf = ai.get("confidence", 0.0)
+ lines.append(f" AI 判定: should_deprecate={deprecate} confidence={conf:.0%}")
+ actions = ai.get("recommended_actions", []) or []
+ for act in actions[:2]: # 最多秀前 2 個建議
+ safe_detail = html.escape(str(act.get("detail", ""))[:120])
+ lines.append(f" ▸ {html.escape(str(act.get('action', '')))}: {safe_detail}")
+ else:
+ lines.append(" (LLM 分析不可用,僅依噪音率判斷)")
+ lines.append("")
+ if len(noisy) > 8:
+ lines.append(f"…還有 {len(noisy) - 8} 條,見 automation_operation_log")
+ lines.append("決策: 人工 UPDATE alert_rule_catalog SET review_status='deprecated' WHERE rule_name='...'")
msg = "\n".join(lines)
diff --git a/apps/api/src/jobs/rule_stats_updater_job.py b/apps/api/src/jobs/rule_stats_updater_job.py
index da0bc5de..4c8e08e1 100644
--- a/apps/api/src/jobs/rule_stats_updater_job.py
+++ b/apps/api/src/jobs/rule_stats_updater_job.py
@@ -84,13 +84,18 @@ async def update_once() -> dict[str, int]:
async def _do_update() -> dict[str, int]:
- """一次 UPDATE 更新所有 rule 的統計."""
+ """一次 UPDATE 更新所有 rule 的統計.
+
+ 2026-04-19 ogt + Claude Opus 4.7 v2 noise 算法修正:
+ 原: 任何 EXPIRED approval 都算 false positive
+ 問題: NO_ACTION/OBSERVE/INVESTIGATE 是 AI 刻意選「純觀察不破壞」,
+ 被 48h EXPIRED 只代表沒人看,不代表規則假報
+ 修: 只把「非 NO_ACTION 且 EXPIRED」算 fp (未來可擴: 加人工 rejected 旗標)
+ """
from sqlalchemy import text as _sql
from src.db.base import get_db_context
async with get_db_context() as db:
- # 一次性 UPDATE 全表 — 對每 rule 計算 stats
- # 使用 LATERAL 避免 N+1 queries
result = await db.execute(
_sql(f"""
UPDATE alert_rule_catalog arc
@@ -112,10 +117,17 @@ async def _do_update() -> dict[str, int]:
AND i.status = 'RESOLVED'
AND i.created_at > NOW() - INTERVAL '{_WINDOW_DAYS} days'
) AS tp_count,
+ -- v2 修正: 只算「非 NO_ACTION 類動作 且 EXPIRED」為假報
+ -- NO_ACTION/OBSERVE/INVESTIGATE 是 AI 純觀察,不該算假報
(SELECT count(*) FROM approval_records ar
JOIN incidents i2 ON ar.incident_id = i2.incident_id
WHERE i2.alertname = arc_inner.rule_name
AND ar.status = 'EXPIRED'
+ AND ar.action NOT ILIKE '%NO_ACTION%'
+ AND ar.action NOT ILIKE '%NO-ACTION%'
+ AND ar.action NOT ILIKE '%NOACTION%'
+ AND ar.action NOT ILIKE '%OBSERVE%'
+ AND ar.action NOT ILIKE '%INVESTIGATE%'
AND ar.created_at > NOW() - INTERVAL '{_WINDOW_DAYS} days'
) AS fp_count,
(SELECT max(created_at) FROM incidents i3
diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml
index eeca4100..da659b91 100644
--- a/ops/monitoring/alerts-unified.yml
+++ b/ops/monitoring/alerts-unified.yml
@@ -887,28 +887,52 @@ groups:
description: "Redis Stream 積壓超過 500 筆,飛輪消費者可能阻塞。"
runbook: "檢查 consumer group lag:XINFO GROUPS "
- - alert: PostgreSQLDiskGrowthRate
+ # 2026-04-19 Hermes E3 decision: PostgreSQLDiskGrowthRate deprecated
+ # Root cause: 500MB/h growth is normal PG WAL behaviour (commits/checkpoints) and should not alert
+ # Fired 7 times in the past 30d; in every case the AI either judged NO_ACTION or misjudged it as a kubectl rollout restart, which failed
+ # Commander's decision (2026-04-19 18:xx Taipei): option C — delete the old rule and alert on absolute disk usage instead
+ # Warning tier: root filesystem used% = (size - avail) / size * 100, above 80 for 10m.
+ # NOTE(review): the new selectors drop instance="192.168.0.188:9100", so the expr matches every
+ # instance exporting these series, while `layer` is still the fixed value systemd-188 — confirm
+ # that label is correct for hosts other than 188.
+ # -----------------------------------------------------------------
+ - alert: HostDiskUsageHigh
expr: |
(
- node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"}
- - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"}
+ node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
+ - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
)
- - (
- node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
- - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
- )
- > 524288000
- for: 5m
+ / node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
+ * 100 > 80
+ for: 10m
labels:
severity: warning
- layer: docker-188
+ layer: systemd-188
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
+ supersedes: PostgreSQLDiskGrowthRate
annotations:
- summary: "188 主機磁碟 1 小時增長超過 500MB"
- description: "磁碟在過去 1 小時增長 {{ $value | humanize1024 }}B,可能是 PostgreSQL WAL 或日誌暴增。"
- runbook: "SSH 188:df -h / && du -sh /var/lib/postgresql/*/pg_wal"
+ summary: "主機 {{ $labels.instance }} 根目錄磁碟使用率 {{ $value | humanize }}% (>80%)"
+ description: "磁碟使用率持續 10 分鐘超過 80%, 需清理或擴容. 常見原因: PG WAL, 日誌, container images, 舊 build cache."
+ runbook: "SSH 該主機: df -h / && du -sh /var/lib/postgresql/*/pg_wal /var/log /var/lib/docker"
+
+ # Critical tier: same used% expression as HostDiskUsageHigh, but above 90 for 5m.
+ # NOTE(review): any value above 90 also satisfies HostDiskUsageHigh's "> 80" condition, so both
+ # alerts can be active for the same instance — confirm an inhibit rule exists if duplicate
+ # notifications are unwanted.
+ # NOTE(review): the selectors carry no instance filter, yet `layer` is the fixed value
+ # systemd-188 — confirm that label is correct for hosts other than 188.
+ - alert: HostDiskUsageCritical
+ expr: |
+ (
+ node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
+ - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
+ )
+ / node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
+ * 100 > 90
+ for: 5m
+ labels:
+ severity: critical
+ layer: systemd-188
+ alert_category: infrastructure
+ notification_type: TYPE-3
+ auto_repair: "false"
+ supersedes: PostgreSQLDiskGrowthRate
+ annotations:
+ summary: "🔴 主機 {{ $labels.instance }} 根目錄磁碟使用率 {{ $value | humanize }}% (>90%, critical)"
+ description: "磁碟即將滿, 需立即清理. 超過 95% 可能導致服務中斷."
+ runbook: "立即 SSH 該主機: df -h / && du -sh /* 2>/dev/null | sort -h | tail -10"
# =========================================================================
# 主機連通性告警 (awoooi_host_connectivity) — 從主機補回 2026-04-12