From 71437db0e9ae3a356c9ecc81506d975c417053d8 Mon Sep 17 00:00:00 2001
From: OG T
Date: Thu, 9 Apr 2026 09:20:33 +0800
Subject: [PATCH] =?UTF-8?q?feat(rule-engine):=20=E8=87=AA=E5=8B=95?=
 =?UTF-8?q?=E8=A6=8F=E5=89=87=E7=94=9F=E6=88=90=20=E2=80=94=20generic=5Ffa?=
 =?UTF-8?q?llback=20=E8=A7=B8=E7=99=BC=20AI=20=E5=AD=B8=E7=BF=92?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

流程:
1. 告警命中 generic_fallback 規則
2. 背景觸發 auto_generate_rule()
3. Ollama (deepseek-r1:14b) 生成 YAML 規則片段
4. Ollama 失敗 → Gemini 備援
5. 驗證格式 → append alert_rules.yaml → 清除 lru_cache
6. 下次同類告警直接命中專屬規則,不再走兜底

去重: 同一 alertname 進程內只生成一次
手寫規則 priority 1-499,AI 生成 500-899,兜底 999

2026-04-09 ogt: AI 自學規則引擎

Co-Authored-By: Claude Sonnet 4.6
---
 apps/api/src/services/alert_rule_engine.py | 288 ++++++++++++++++++++-
 apps/api/src/services/openclaw.py          |  12 +-
 2 files changed, 298 insertions(+), 2 deletions(-)

diff --git a/apps/api/src/services/alert_rule_engine.py b/apps/api/src/services/alert_rule_engine.py
index 20d4c057..2d6bb4b2 100644
--- a/apps/api/src/services/alert_rule_engine.py
+++ b/apps/api/src/services/alert_rule_engine.py
@@ -9,16 +9,27 @@ OpenClaw 告警規則匹配引擎
 - priority 越小越優先,999 = 通用兜底
 - 變數替換: {target} {host} {container} {instance} {job} {namespace}
 
-2026-04-09 ogt: 初版,從 openclaw.py _generate_mock_response if/elif 抽出
+Automatic rule generation:
+- Triggered only when the generic_fallback rule matches (specific rules never trigger it)
+- Calls Ollama (deepseek-r1:14b), falling back to Gemini, to produce a YAML rule snippet
+- The validated rule is appended to alert_rules.yaml and the lru_cache is cleared so it applies at once
+- Skipped when the alertname already has a rule (de-duplication)
+
+2026-04-09 ogt: initial version + automatic rule generation
 """
 
 from __future__ import annotations
 
+import asyncio
+import json
 import re
+import time
+from datetime import datetime
 from functools import lru_cache
 from pathlib import Path
 from typing import Any
 
+import httpx
 import structlog
 import yaml
 
@@ -26,6 +37,9 @@ logger = structlog.get_logger(__name__)
 
 RULES_FILE = Path(__file__).parent.parent.parent / "alert_rules.yaml"
 
+# Alertnames with a generation in flight (in-process de-duplication).
+_generating: set[str] = set()
+
 
 # ── 變數提取 ────────────────────────────────────────────────
 
@@ -193,3 +207,275 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None:
         "affected_services": [vars["target"]],
         "signoz_correlation": "",
     }
+
+
+# ── Automatic rule generation ────────────────────────────────
+
+# Priority band reserved for AI-generated rules (hand-written: 1-499, fallback: 999).
+_AI_PRIORITY_MIN = 500
+_AI_PRIORITY_MAX = 899
+
+# Strong references to in-flight background tasks; the event loop itself
+# only keeps weak references, so an unreferenced task may be collected mid-run.
+_background_tasks: set[asyncio.Task] = set()
+
+_AUTO_RULE_PROMPT = """\
+你是 SRE 專家。根據以下 Prometheus 告警資訊,生成一條 YAML 規則供 OpenClaw 規則引擎使用。
+
+告警資訊:
+- alertname: {alertname}
+- alert_type: {alert_type}
+- message: {message}
+- labels: {labels}
+
+請嚴格輸出以下 YAML 格式,不要加任何說明文字,只輸出 YAML 區塊:
+
+- id: {rule_id}
+  priority: {priority}
+  description: <一行中文描述>
+  match:
+    alertname:
+      - {alertname}
+  response:
+    action_title: <含 {{target}} 變數的動作標題>
+    description: "⚙️ 規則匹配: <含 {{target}} 的描述,說明根因>"
+    suggested_action: <建議處置動作>
+    kubectl_command: "<含 {{target}} {{namespace}} 的具體指令>"
+    estimated_downtime: "<停機估計>"
+    risk: <風險等級>
+    responsibility: <主要責任歸屬>
+    responsibility_reasoning: "<責任歸屬說明>"
+    secondary_teams: []
+    optimization:
+      - type: <優化類型>
+        description: "<優化說明>"
+        command: "<含 {{target}} {{namespace}} 的指令>"
+    reasoning: "[規則匹配] <處置邏輯說明>"
+
+規則 id 使用 snake_case,不含特殊字符。\
+"""
+
+
+def _rule_id_exists(alertname: str) -> bool:
+    """Return True when a non-generic rule already matches ``alertname``.
+
+    A failure to load the rules is logged and treated as "no rule exists".
+    """
+    try:
+        return any(
+            alertname in rule.get("match", {}).get("alertname", [])
+            for rule in _load_rules()
+            if not _is_generic(rule)
+        )
+    except Exception as e:
+        logger.warning("auto_rule_exists_check_failed", alertname=alertname, error=str(e))
+        return False
+
+
+def _append_rule_to_yaml(rule_yaml: str, alertname: str) -> bool:
+    """Validate an AI-generated rule and append it to alert_rules.yaml.
+
+    Only the first list item of ``rule_yaml`` is used. It must be a mapping
+    with ``id``, ``match`` and ``response``, list ``alertname`` under
+    ``match.alertname``, carry a priority inside the AI band, and provide
+    ``kubectl_command`` and ``suggested_action`` in its response.
+
+    The validated rule is re-serialised with ``yaml.safe_dump`` rather than
+    echoing the raw LLM text, so extra list items or stray prose never reach
+    the rules file. On success the ``_load_rules`` cache is cleared.
+
+    Returns:
+        True if the rule was written, False if validation or the write failed.
+    """
+    try:
+        parsed = yaml.safe_load(rule_yaml)
+        if not isinstance(parsed, list) or not parsed or not isinstance(parsed[0], dict):
+            logger.warning("auto_rule_invalid_yaml", alertname=alertname, raw=rule_yaml[:200])
+            return False
+
+        rule = parsed[0]
+        required = {"id", "match", "response"}
+        if not required.issubset(rule.keys()):
+            logger.warning("auto_rule_missing_fields", alertname=alertname, keys=list(rule.keys()))
+            return False
+
+        # Same membership test as _rule_id_exists(): a rule that does not match
+        # this alertname would never be detected and would be regenerated forever.
+        if alertname not in rule["match"].get("alertname", []):
+            logger.warning("auto_rule_alertname_mismatch", alertname=alertname)
+            return False
+
+        # Keep AI rules inside their band so they cannot outrank hand-written ones.
+        priority = rule.get("priority")
+        if not isinstance(priority, int) or not _AI_PRIORITY_MIN <= priority <= _AI_PRIORITY_MAX:
+            logger.warning("auto_rule_priority_out_of_band", alertname=alertname, priority=priority)
+            return False
+
+        resp = rule.get("response", {})
+        if not resp.get("kubectl_command") or not resp.get("suggested_action"):
+            logger.warning("auto_rule_empty_response", alertname=alertname)
+            return False
+
+        dumped = yaml.safe_dump([rule], allow_unicode=True, sort_keys=False)
+        # Collapse whitespace so a newline in the label cannot break out of the comment.
+        comment_name = " ".join(str(alertname).split())
+        with RULES_FILE.open("a", encoding="utf-8") as f:
+            now = datetime.now().strftime("%Y-%m-%d %H:%M")
+            f.write(f"\n  # AUTO-GENERATED {now} — alertname={comment_name}\n")
+            # Indent the list item so it sits under `rules:` (two spaces assumed — confirm).
+            for line in dumped.splitlines():
+                f.write(f"  {line}\n")
+
+        # Drop the cached rules so the new rule takes effect immediately.
+        _load_rules.cache_clear()
+        logger.info("auto_rule_written", rule_id=rule.get("id"), alertname=alertname)
+        return True
+
+    except Exception as e:
+        logger.error("auto_rule_append_failed", alertname=alertname, error=str(e))
+        return False
+
+
+async def _call_ollama(prompt: str, ollama_url: str, model: str) -> str | None:
+    """Ask Ollama for a rule; return the reply text, or None on any failure."""
+    try:
+        async with httpx.AsyncClient(timeout=60) as client:
+            resp = await client.post(
+                f"{ollama_url.rstrip('/')}/api/generate",
+                json={"model": model, "prompt": prompt, "stream": False, "options": {"temperature": 0.1}},
+            )
+            resp.raise_for_status()
+            return resp.json().get("response", "")
+    except Exception as e:
+        logger.warning("auto_rule_ollama_failed", error=str(e))
+        return None
+
+
+async def _call_gemini(prompt: str, api_key: str) -> str | None:
+    """Ask Gemini for a rule (fallback when Ollama fails); None on any failure.
+
+    The key travels in the ``x-goog-api-key`` header, not the query string:
+    httpx error messages include the request URL, so a ``?key=...`` parameter
+    would be written to the log by the ``except`` branch below.
+    """
+    try:
+        url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"
+        async with httpx.AsyncClient(timeout=30) as client:
+            resp = await client.post(
+                url,
+                headers={"x-goog-api-key": api_key},
+                json={"contents": [{"parts": [{"text": prompt}]}],
+                      "generationConfig": {"temperature": 0.1}},
+            )
+            resp.raise_for_status()
+            data = resp.json()
+            return data["candidates"][0]["content"]["parts"][0]["text"]
+    except Exception as e:
+        logger.warning("auto_rule_gemini_failed", error=str(e))
+        return None
+
+
+def _extract_yaml_block(text: str) -> str:
+    """Pull the YAML rule snippet out of a raw LLM reply.
+
+    Reasoning wrapped in ``<think>`` tags is dropped, the body of the first
+    closed code fence is preferred (which also discards trailing prose), and
+    the result starts at the first ``- id:`` list item. When no ``- id:`` is
+    found the cleaned text is returned unchanged.
+    """
+    # Reasoning models such as deepseek-r1 may emit their thinking first, and it can mention "- id:".
+    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
+    fenced = re.search(r"```(?:ya?ml)?[ \t]*\n(.*?)```", text, re.DOTALL)
+    if fenced:
+        text = fenced.group(1)
+    # Remove any fence markers left over from an unclosed block.
+    text = re.sub(r"```(?:yaml)?\n?", "", text).strip()
+    match = re.search(r"(- id:.+)", text, re.DOTALL)
+    return match.group(1).strip() if match else text
+
+
+async def auto_generate_rule(alert_context: dict) -> None:
+    """Generate a dedicated rule for an unknown alert and append it to alert_rules.yaml.
+
+    Intended to run as a background task after match_rule() fell through to
+    generic_fallback. Flow: Ollama, then Gemini if Ollama returned nothing,
+    then validate, append and clear the rule cache. Failures during
+    generation are logged rather than raised.
+
+    Args:
+        alert_context: Alert payload; ``labels``, ``alert_type`` and ``message`` are read.
+    """
+    from src.core.config import settings
+
+    labels = alert_context.get("labels") or {}
+    alertname = str(labels.get("alertname") or alert_context.get("alert_type") or "custom")
+
+    # De-duplicate: at most one generation per alertname at a time.
+    if alertname in _generating:
+        return
+    if _rule_id_exists(alertname):
+        logger.debug("auto_rule_skip_exists", alertname=alertname)
+        return
+
+    _generating.add(alertname)
+    try:
+        rule_id = re.sub(r"[^a-z0-9_]", "_", alertname.lower()).strip("_") or "auto_rule"
+        # Next free slot in the AI band, clamped so it can never reach the fallback (999).
+        used = [
+            p for p in (r.get("priority", 0) for r in _load_rules() if not _is_generic(r))
+            if isinstance(p, int) and _AI_PRIORITY_MIN <= p <= _AI_PRIORITY_MAX
+        ]
+        priority = min(max(used, default=_AI_PRIORITY_MIN - 1) + 10, _AI_PRIORITY_MAX)
+
+        prompt = _AUTO_RULE_PROMPT.format(
+            alertname=alertname,
+            alert_type=alert_context.get("alert_type", "custom"),
+            message=str(alert_context.get("message") or "")[:200],
+            labels=json.dumps({k: v for k, v in labels.items() if k in
+                               ("job", "instance", "severity", "namespace", "container", "name")},
+                              ensure_ascii=False),
+            rule_id=rule_id,
+            priority=priority,
+        )
+
+        logger.info("auto_rule_generating", alertname=alertname, rule_id=rule_id)
+
+        # 1. Try Ollama first.
+        raw = await _call_ollama(prompt, settings.OLLAMA_URL, settings.OPENCLAW_DEFAULT_MODEL)
+
+        # 2. Ollama returned nothing -> fall back to Gemini.
+        if not raw and settings.GEMINI_API_KEY:
+            raw = await _call_gemini(prompt, settings.GEMINI_API_KEY)
+
+        if not raw:
+            logger.warning("auto_rule_no_response", alertname=alertname)
+            return
+
+        yaml_block = _extract_yaml_block(raw)
+        success = _append_rule_to_yaml(yaml_block, alertname)
+        if success:
+            logger.info("auto_rule_success", alertname=alertname, rule_id=rule_id)
+        else:
+            logger.warning("auto_rule_failed_validation", alertname=alertname)
+
+    except Exception as e:
+        logger.error("auto_rule_exception", alertname=alertname, error=str(e))
+    finally:
+        _generating.discard(alertname)
+
+
+def schedule_auto_generate_rule(alert_context: dict) -> None:
+    """Start auto_generate_rule() from synchronous code.
+
+    With a running event loop the coroutine is scheduled as a task and a
+    reference is held until it finishes. Without one (plain sync caller) it
+    is run to completion with asyncio.run(), which blocks the caller.
+    """
+    try:
+        loop = asyncio.get_running_loop()
+    except RuntimeError:
+        asyncio.run(auto_generate_rule(alert_context))
+        return
+    task = loop.create_task(auto_generate_rule(alert_context))
+    _background_tasks.add(task)
+    task.add_done_callback(_background_tasks.discard)
diff --git a/apps/api/src/services/openclaw.py b/apps/api/src/services/openclaw.py
index 771debbe..50ab78fa 100644
--- a/apps/api/src/services/openclaw.py
+++ b/apps/api/src/services/openclaw.py
@@ -628,9 +628,10 @@ class OpenClawService:
         if signoz_metrics:
             mock_response["description"] += f" {signoz_metrics.to_summary()}"
 
+        rule_id = mock_response.get("rule_id", "unknown")
         logger.info(
             "mock_llm_response_generated",
-            rule_id=mock_response.get("rule_id", "unknown"),
+            rule_id=rule_id,
             action_title=mock_response["action_title"],
             risk_level=mock_response["risk_level"],
             primary_responsibility=mock_response["primary_responsibility"],
@@ -639,6 +640,15 @@ class OpenClawService:
             is_mock=True,
         )
 
+        # 2026-04-09 ogt: on a generic-fallback hit, generate a dedicated rule in the background.
+        if rule_id == "generic_fallback":
+            from src.services.alert_rule_engine import schedule_auto_generate_rule
+
+            try:
+                schedule_auto_generate_rule(alert_context)
+            except Exception as _e:
+                logger.warning("auto_rule_trigger_failed", error=str(_e))
+
         return json.dumps(mock_response)
 
     # =========================================================================