feat(hermes+rules): LLM 升級 Hermes + 統帥決策 deprecate PostgreSQLDiskGrowthRate
統帥 2026-04-19 決策:
- Rule 1 PostgreSQLDiskGrowthRate → 選項 C: deprecate + 替代新規則
- Rule 2 NoAlertsReceived2Hours → 保留 (真實告警鏈路守護)
- noise_rate 算法先修正 (NO_ACTION 不算 fp),觀察後動態調整
1. rule_stats_updater v2 noise 算法:
原: 任何 EXPIRED approval 都算 fp
問題: NO_ACTION/OBSERVE/INVESTIGATE 是 AI 純觀察,不該算假報
修: WHERE ar.action NOT ILIKE '%NO_ACTION%' AND ar.action NOT ILIKE '%OBSERVE%' AND ...
2. hermes_rule_quality v2 LLM 升級:
新增 _llm_analyze_noisy_rule:
- 用 OpenClaw (Ollama/NemoTron/Gemini) 分析每條噪音 rule
- JSON 輸出: probable_root_causes/recommended_actions/confidence/should_deprecate
- 3 路 parse fallback (直接 / NemoTron wrapper / description nested)
_write_advisory_aol 加 llm_analysis 到 output_payload
_send_telegram_summary 加 AI 判定 + top 2 建議 (8 條上限避免太長)
符合統帥鐵律: AI 分析但不自動動作,仍人工決策
3. ops/monitoring/alerts-unified.yml 替換 Rule 1:
刪 PostgreSQLDiskGrowthRate (500MB/h 增長 → 觸發 WAL 正常行為誤報)
加 HostDiskUsageHigh (>80% for 10m, warning)
加 HostDiskUsageCritical (>90% for 5m, critical)
兩者 labels.supersedes='PostgreSQLDiskGrowthRate' 供追溯
(待 deploy-alerts workflow 下次 apply 到 Prometheus)
4. DB 即時 mark deprecated (避免等 alerts yaml 部署前 Hermes 又推):
UPDATE alert_rule_catalog SET review_status='deprecated' WHERE rule_name='PostgreSQLDiskGrowthRate'
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -64,22 +64,30 @@ async def run_hermes_rule_quality_loop() -> None:
|
||||
|
||||
|
||||
async def analyze_once() -> dict[str, int]:
|
||||
"""一次分析: 找噪音 rule + 推建議."""
|
||||
"""一次分析: 找噪音 rule + LLM 分析真因 + 推建議 + aol 留痕."""
|
||||
started_ms = _time.time()
|
||||
stats = {"noisy_rules": 0, "advisories_written": 0, "telegram_sent": 0}
|
||||
stats = {"noisy_rules": 0, "llm_analyzed": 0, "advisories_written": 0, "telegram_sent": 0}
|
||||
error_msg: str | None = None
|
||||
llm_analyses: dict[str, dict[str, Any]] = {}
|
||||
|
||||
try:
|
||||
noisy = await _fetch_noisy_rules()
|
||||
stats["noisy_rules"] = len(noisy)
|
||||
|
||||
# v2 升級: 對每條 noisy rule 跑 LLM 分析真因 + 具體建議
|
||||
for r in noisy:
|
||||
ok = await _write_advisory_aol(r)
|
||||
analysis = await _llm_analyze_noisy_rule(r)
|
||||
if analysis:
|
||||
llm_analyses[r["rule_name"]] = analysis
|
||||
stats["llm_analyzed"] += 1
|
||||
|
||||
for r in noisy:
|
||||
ok = await _write_advisory_aol(r, llm_analyses.get(r["rule_name"]))
|
||||
if ok:
|
||||
stats["advisories_written"] += 1
|
||||
|
||||
if noisy:
|
||||
sent = await _send_telegram_summary(noisy)
|
||||
sent = await _send_telegram_summary(noisy, llm_analyses)
|
||||
stats["telegram_sent"] = 1 if sent else 0
|
||||
|
||||
except Exception as e:
|
||||
@@ -90,6 +98,7 @@ async def analyze_once() -> dict[str, int]:
|
||||
logger.info(
|
||||
"hermes_rule_quality_once_done",
|
||||
noisy=stats["noisy_rules"],
|
||||
llm_analyzed=stats["llm_analyzed"],
|
||||
advisories=stats["advisories_written"],
|
||||
telegram_sent=stats["telegram_sent"],
|
||||
duration_ms=duration_ms,
|
||||
@@ -97,6 +106,89 @@ async def analyze_once() -> dict[str, int]:
|
||||
return stats
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# v2 LLM 分析 — 統帥鐵律「朝 AI 自主化方向」
|
||||
# ============================================================================
|
||||
|
||||
_LLM_ANALYZE_PROMPT = """你是 AWOOOI SRE 告警規則品質分析專家。以下是一條 Prometheus alerting rule 過去 30 天的統計,請分析假報真因並提出具體改進建議。
|
||||
|
||||
## 告警規則
|
||||
- rule_name: {rule_name}
|
||||
- severity: {severity}
|
||||
- expr: {expr}
|
||||
- for: {duration_seconds}s
|
||||
- labels: {labels}
|
||||
- annotations: {annotations}
|
||||
|
||||
## 過去 30 天統計
|
||||
- true_positive (確實解決的): {tp}
|
||||
- false_positive (有破壞性動作但 EXPIRED 沒人理): {fp}
|
||||
- noise_rate: {noise_rate}
|
||||
|
||||
## 輸出規格 (必須是合法 JSON,純 JSON 無前後文字)
|
||||
{{
|
||||
"probable_root_causes": ["3-4 個候選真因,繁中"],
|
||||
"recommended_actions": [
|
||||
{{"action": "adjust_threshold|add_for_duration|refine_labels|deprecate|split_rule|keep_as_is", "detail": "具體怎麼做,繁中一句話"}}
|
||||
],
|
||||
"confidence": 0.0-1.0,
|
||||
"should_deprecate": true/false
|
||||
}}
|
||||
|
||||
## 分析思路
|
||||
1. 看 expr 是否過於敏感 (閾值太低 / 沒有 for: window)
|
||||
2. 看 annotations 是否暗示「這是真實需要處理的問題」但被 AI 判 NO_ACTION → 可能是 action 流程問題而非規則問題
|
||||
3. 考慮 severity warning/critical 是否合理
|
||||
"""
|
||||
|
||||
|
||||
def _strip_code_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if any."""
    raw = text.strip()
    if not raw.startswith("```"):
        return raw
    raw = raw.strip("`").strip()
    # Drop an optional language tag as a whole, case-insensitive prefix.
    # (str.lstrip("json") strips a *character set*, not the prefix.)
    if raw[:4].lower() == "json":
        raw = raw[4:]
    return raw.strip()


def _parse_llm_analysis(raw: str) -> dict[str, Any] | None:
    """Decode the LLM reply and return the dict holding ``recommended_actions``.

    Accepted shapes, tried in order:
      1. the reply is the analysis object itself;
      2. the reply has prose around a single JSON object (first ``{`` to
         last ``}`` is decoded);
      3. the analysis is nested under a ``description`` key, either as a
         JSON object or as a JSON-encoded string (provider wrapper).

    Returns None when JSON decodes but no analysis object is found.
    Raises ValueError (json.JSONDecodeError) when nothing can be decoded.
    """
    import json as _j

    try:
        parsed = _j.loads(raw)
    except _j.JSONDecodeError:
        start, end = raw.find("{"), raw.rfind("}")
        if start == -1 or end <= start:
            raise
        parsed = _j.loads(raw[start:end + 1])

    if not isinstance(parsed, dict):
        return None
    if "recommended_actions" in parsed:
        return parsed

    # Wrapper shape: the real analysis lives inside "description".
    inner = parsed.get("description")
    if isinstance(inner, str):
        desc = inner.strip()
        inner = _j.loads(desc) if desc.startswith("{") else None
    if isinstance(inner, dict) and "recommended_actions" in inner:
        return inner
    return None


async def _llm_analyze_noisy_rule(rule: dict[str, Any]) -> dict[str, Any] | None:
    """Ask OpenClaw to analyse why a noisy rule misfires.

    Args:
        rule: one noisy-rule row; ``rule_name``, ``severity``, ``tp``, ``fp``
            and ``noise_rate`` are read directly, ``expr``,
            ``duration_seconds``, ``labels`` and ``annotations`` are optional.

    Returns:
        The parsed analysis dict with ``_llm_provider`` added, or None on any
        failure (call failed, empty reply, unparsable or unexpected JSON).
        This function never raises, so a failed analysis cannot block the
        caller.
    """
    # .get() here: the error handlers below must not raise on a malformed row.
    rule_name = rule.get("rule_name")
    try:
        import json as _j

        from src.services.openclaw import get_openclaw

        prompt = _LLM_ANALYZE_PROMPT.format(
            rule_name=rule["rule_name"],
            severity=rule["severity"] or "-",
            # Truncate free-form fields to keep the prompt bounded.
            expr=(rule.get("expr") or "")[:500],
            duration_seconds=rule.get("duration_seconds") or 0,
            labels=_j.dumps(rule.get("labels", {}), ensure_ascii=False)[:300],
            annotations=_j.dumps(rule.get("annotations", {}), ensure_ascii=False)[:300],
            tp=rule["tp"],
            fp=rule["fp"],
            noise_rate=f"{rule['noise_rate']:.1%}",
        )
        text, provider, success = await get_openclaw().call(prompt)
        if not success or not text:
            return None

        _raw = _strip_code_fence(text)
        try:
            analysis = _parse_llm_analysis(_raw)
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            logger.warning("hermes_llm_parse_failed", rule=rule_name, error=str(e), raw=_raw[:200])
            return None

        if analysis is None:
            # Valid JSON but not the requested schema; log it so the drop is visible.
            logger.warning(
                "hermes_llm_parse_failed",
                rule=rule_name,
                error="missing recommended_actions",
                raw=_raw[:200],
            )
            return None

        analysis["_llm_provider"] = provider
        return analysis
    except Exception as e:
        logger.warning("hermes_llm_analyze_error", rule=rule_name, error=str(e))
        return None
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# 資料查詢
|
||||
# ============================================================================
|
||||
@@ -144,8 +236,8 @@ async def _fetch_noisy_rules() -> list[dict[str, Any]]:
|
||||
# 建議寫入 (aol only,不改 rule 本身)
|
||||
# ============================================================================
|
||||
|
||||
async def _write_advisory_aol(rule: dict[str, Any]) -> bool:
|
||||
"""寫 aol(rule_rejected) — 紀錄 AI 建議人工審查."""
|
||||
async def _write_advisory_aol(rule: dict[str, Any], llm_analysis: dict[str, Any] | None = None) -> bool:
|
||||
"""寫 aol(rule_rejected) — 紀錄 AI 建議人工審查 + LLM 分析結果."""
|
||||
try:
|
||||
from sqlalchemy import text as _sql
|
||||
from src.db.base import get_db_context
|
||||
@@ -157,7 +249,7 @@ async def _write_advisory_aol(rule: dict[str, Any]) -> bool:
|
||||
"true_positive_count": rule["tp"],
|
||||
"false_positive_count": rule["fp"],
|
||||
}
|
||||
output_payload = {
|
||||
output_payload: dict[str, Any] = {
|
||||
"proposed_action": "review_or_deprecate",
|
||||
"reason": (
|
||||
f"過去 30d noise_rate {rule['noise_rate']:.1%} "
|
||||
@@ -166,6 +258,8 @@ async def _write_advisory_aol(rule: dict[str, Any]) -> bool:
|
||||
),
|
||||
"requires_human_decision": True,
|
||||
}
|
||||
if llm_analysis:
|
||||
output_payload["llm_analysis"] = llm_analysis
|
||||
|
||||
async with get_db_context() as db:
|
||||
await db.execute(
|
||||
@@ -198,9 +292,13 @@ async def _write_advisory_aol(rule: dict[str, Any]) -> bool:
|
||||
# Telegram 推送
|
||||
# ============================================================================
|
||||
|
||||
async def _send_telegram_summary(noisy: list[dict[str, Any]]) -> bool:
|
||||
"""推 Telegram 摘要訊息給 SRE group."""
|
||||
async def _send_telegram_summary(
|
||||
noisy: list[dict[str, Any]],
|
||||
llm_analyses: dict[str, dict[str, Any]] | None = None,
|
||||
) -> bool:
|
||||
"""推 Telegram 摘要訊息給 SRE group,含 LLM 分析結果."""
|
||||
try:
|
||||
import html
|
||||
from src.core.config import settings
|
||||
from src.services.telegram_gateway import get_telegram_gateway
|
||||
|
||||
@@ -208,21 +306,32 @@ async def _send_telegram_summary(noisy: list[dict[str, Any]]) -> bool:
|
||||
logger.info("hermes_telegram_skip_no_chat_id")
|
||||
return False
|
||||
|
||||
llm_analyses = llm_analyses or {}
|
||||
lines = [
|
||||
f"🔍 <b>Hermes 規則品質檢測</b>",
|
||||
f"檢測到 {len(noisy)} 條規則噪音率 ≥ {_NOISE_THRESHOLD:.0%},建議人工審查:",
|
||||
"🔍 <b>Hermes 規則品質檢測 (AI 分析)</b>",
|
||||
f"檢測到 {len(noisy)} 條規則噪音率 ≥ {_NOISE_THRESHOLD:.0%},請統帥審查:",
|
||||
"",
|
||||
]
|
||||
for r in noisy[:10]: # 最多秀 10 條避免太長
|
||||
import html
|
||||
for r in noisy[:8]: # LLM 分析含建議,單條訊息較長,只秀 8 條
|
||||
safe_name = html.escape(r["rule_name"])
|
||||
lines.append(
|
||||
f"🟡 <code>{safe_name}</code>\n"
|
||||
f" 噪音率 <b>{r['noise_rate']:.1%}</b> (tp={r['tp']} fp={r['fp']} sev={r['severity'] or '-'})"
|
||||
f"🟡 <code>{safe_name}</code> — noise {r['noise_rate']:.1%} (tp={r['tp']} fp={r['fp']})"
|
||||
)
|
||||
if len(noisy) > 10:
|
||||
lines.append(f"\n…還有 {len(noisy) - 10} 條")
|
||||
lines.append("\n人工決策: 確認 deprecate 或改 expr → 手動 UPDATE review_status")
|
||||
ai = llm_analyses.get(r["rule_name"])
|
||||
if ai:
|
||||
deprecate = ai.get("should_deprecate")
|
||||
conf = ai.get("confidence", 0.0)
|
||||
lines.append(f" AI 判定: should_deprecate={deprecate} confidence={conf:.0%}")
|
||||
actions = ai.get("recommended_actions", []) or []
|
||||
for act in actions[:2]: # 最多秀前 2 個建議
|
||||
safe_detail = html.escape(str(act.get("detail", ""))[:120])
|
||||
lines.append(f" ▸ <i>{html.escape(str(act.get('action', '')))}</i>: {safe_detail}")
|
||||
else:
|
||||
lines.append(" (LLM 分析不可用,僅依噪音率判斷)")
|
||||
lines.append("")
|
||||
if len(noisy) > 8:
|
||||
lines.append(f"…還有 {len(noisy) - 8} 條,見 automation_operation_log")
|
||||
lines.append("決策: 人工 UPDATE alert_rule_catalog SET review_status='deprecated' WHERE rule_name='...'")
|
||||
|
||||
msg = "\n".join(lines)
|
||||
|
||||
|
||||
@@ -84,13 +84,18 @@ async def update_once() -> dict[str, int]:
|
||||
|
||||
|
||||
async def _do_update() -> dict[str, int]:
|
||||
"""一次 UPDATE 更新所有 rule 的統計."""
|
||||
"""一次 UPDATE 更新所有 rule 的統計.
|
||||
|
||||
2026-04-19 ogt + Claude Opus 4.7 v2 noise 算法修正:
|
||||
原: 任何 EXPIRED approval 都算 false positive
|
||||
問題: NO_ACTION/OBSERVE/INVESTIGATE 是 AI 刻意選「純觀察不破壞」,
|
||||
被 48h EXPIRED 只代表沒人看,不代表規則假報
|
||||
修: 只把「非 NO_ACTION 且 EXPIRED」算 fp (未來可擴: 加人工 rejected 旗標)
|
||||
"""
|
||||
from sqlalchemy import text as _sql
|
||||
from src.db.base import get_db_context
|
||||
|
||||
async with get_db_context() as db:
|
||||
# 一次性 UPDATE 全表 — 對每 rule 計算 stats
|
||||
# 使用 LATERAL 避免 N+1 queries
|
||||
result = await db.execute(
|
||||
_sql(f"""
|
||||
UPDATE alert_rule_catalog arc
|
||||
@@ -112,10 +117,17 @@ async def _do_update() -> dict[str, int]:
|
||||
AND i.status = 'RESOLVED'
|
||||
AND i.created_at > NOW() - INTERVAL '{_WINDOW_DAYS} days'
|
||||
) AS tp_count,
|
||||
-- v2 修正: 只算「非 NO_ACTION 類動作 且 EXPIRED」為假報
|
||||
-- NO_ACTION/OBSERVE/INVESTIGATE 是 AI 純觀察,不該算假報
|
||||
(SELECT count(*) FROM approval_records ar
|
||||
JOIN incidents i2 ON ar.incident_id = i2.incident_id
|
||||
WHERE i2.alertname = arc_inner.rule_name
|
||||
AND ar.status = 'EXPIRED'
|
||||
AND ar.action NOT ILIKE '%NO_ACTION%'
|
||||
AND ar.action NOT ILIKE '%NO-ACTION%'
|
||||
AND ar.action NOT ILIKE '%NOACTION%'
|
||||
AND ar.action NOT ILIKE '%OBSERVE%'
|
||||
AND ar.action NOT ILIKE '%INVESTIGATE%'
|
||||
AND ar.created_at > NOW() - INTERVAL '{_WINDOW_DAYS} days'
|
||||
) AS fp_count,
|
||||
(SELECT max(created_at) FROM incidents i3
|
||||
|
||||
@@ -887,28 +887,52 @@ groups:
|
||||
description: "Redis Stream 積壓超過 500 筆,飛輪消費者可能阻塞。"
|
||||
runbook: "檢查 consumer group lag:XINFO GROUPS <stream-key>"
|
||||
|
||||
- alert: PostgreSQLDiskGrowthRate
|
||||
# 2026-04-19 Hermes E3 決策: PostgreSQLDiskGrowthRate deprecated
|
||||
# 真因: 500MB/h 增長是 PG WAL 正常行為 (commits/checkpoints),不該告警
|
||||
# 過去 30d 觸發 7 次,全部 AI 判 NO_ACTION 或誤判 kubectl rollout restart 失敗
|
||||
# 統帥決策 (2026-04-19 18:xx Taipei): 選項 C 刪除舊規則 + 改用絕對磁碟使用率
|
||||
# -----------------------------------------------------------------
|
||||
- alert: HostDiskUsageHigh
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"}
|
||||
- node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"}
|
||||
node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
|
||||
- node_filesystem_avail_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
|
||||
)
|
||||
- (
|
||||
node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
|
||||
- node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
|
||||
)
|
||||
> 524288000
|
||||
for: 5m
|
||||
/ node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
|
||||
* 100 > 80
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: docker-188
|
||||
layer: systemd-188
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
auto_repair: "false"
|
||||
supersedes: PostgreSQLDiskGrowthRate
|
||||
annotations:
|
||||
summary: "188 主機磁碟 1 小時增長超過 500MB"
|
||||
description: "磁碟在過去 1 小時增長 {{ $value | humanize1024 }}B,可能是 PostgreSQL WAL 或日誌暴增。"
|
||||
runbook: "SSH 188:df -h / && du -sh /var/lib/postgresql/*/pg_wal"
|
||||
summary: "主機 {{ $labels.instance }} 根目錄磁碟使用率 {{ $value | humanize }}% (>80%)"
|
||||
description: "磁碟使用率持續 10 分鐘超過 80%, 需清理或擴容. 常見原因: PG WAL, 日誌, container images, 舊 build cache."
|
||||
runbook: "SSH 該主機: df -h / && du -sh /var/lib/postgresql/*/pg_wal /var/log /var/lib/docker"
|
||||
|
||||
- alert: HostDiskUsageCritical
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
|
||||
- node_filesystem_avail_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
|
||||
)
|
||||
/ node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
|
||||
* 100 > 90
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
layer: systemd-188
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
auto_repair: "false"
|
||||
supersedes: PostgreSQLDiskGrowthRate
|
||||
annotations:
|
||||
summary: "🔴 主機 {{ $labels.instance }} 根目錄磁碟使用率 {{ $value | humanize }}% (>90%, critical)"
|
||||
description: "磁碟即將滿, 需立即清理. 超過 95% 可能導致服務中斷."
|
||||
runbook: "立即 SSH 該主機: df -h / && du -sh /* 2>/dev/null | sort -h | tail -10"
|
||||
|
||||
# =========================================================================
|
||||
# 主機連通性告警 (awoooi_host_connectivity) — 從主機補回 2026-04-12
|
||||
|
||||
Reference in New Issue
Block a user