From c6edfb56148dab0212f5b1188d30e633e08a461d Mon Sep 17 00:00:00 2001 From: OG T Date: Fri, 10 Apr 2026 11:04:56 +0800 Subject: [PATCH] =?UTF-8?q?fix(flywheel):=20=E5=9B=9B=E9=9A=8E=E6=AE=B5?= =?UTF-8?q?=E7=B3=BB=E7=B5=B1=E6=80=A7=E4=BF=AE=E5=BE=A9=20AUTO=5FREPAIR?= =?UTF-8?q?=20NO=5FMATCH=20=E6=96=B7=E5=B1=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1 — affected_services 污染根治 - webhooks.py: _extract_affected_services() 從 labels 精準萃取服務名 (component > job > pod deployment name > clean target_resource > []) - create_incident_for_approval: alert_labels 完整保留進 Signal - alert_name 從 alertname 取,不再用 "custom" Phase 2 — Playbook alertname 變體擴充 - alert_rules.yaml: 5 條規則新增 HostHighCpuLoad、KubePodCrashLooping 等變體 - scripts/update_playbook_alert_variants.py: Redis index 已執行更新 ✅ Phase 3 — Jaccard 通用型 Playbook 豁免 - similarity.py: affected_services=[] → 1.0 豁免(基礎設施 Playbook 不針對特定服務) - severity_range=[] → 1.0 豁免(適用所有嚴重度) Phase 4 — Playbook Embedding 持久化(冷啟動修復) - migrations/flywheel_playbook_embeddings.sql: pgvector 持久化表 - services/playbook_embedding_service.py: 啟動時重建 Redis 向量快取 + 同步 DB - main.py: lifespan 啟動時 asyncio.create_task 非阻塞執行 2026-04-10 Asia/Taipei — Claude Sonnet 4.6 Co-Authored-By: Claude Sonnet 4.6 --- apps/api/alert_rules.yaml | 22 ++- .../flywheel_playbook_embeddings.sql | 27 ++++ .../scripts/update_playbook_alert_variants.py | 141 ++++++++++++++++++ apps/api/src/api/v1/webhooks.py | 72 ++++++++- apps/api/src/main.py | 12 ++ .../services/playbook_embedding_service.py | 126 ++++++++++++++++ apps/api/src/utils/similarity.py | 30 +++- 7 files changed, 419 insertions(+), 11 deletions(-) create mode 100644 apps/api/migrations/flywheel_playbook_embeddings.sql create mode 100644 apps/api/scripts/update_playbook_alert_variants.py create mode 100644 apps/api/src/services/playbook_embedding_service.py diff --git a/apps/api/alert_rules.yaml b/apps/api/alert_rules.yaml index 8f8d0ba0..c9a4e715 100644 --- 
a/apps/api/alert_rules.yaml +++ b/apps/api/alert_rules.yaml @@ -75,6 +75,13 @@ rules: priority: 30 description: Pod OOMKilled 記憶體不足 match: + # 2026-04-10 Claude Sonnet 4.6: Phase 2 飛輪修復 — 補齊 Prometheus alertname 變體 + alertname: + - PodOOMKilled + - KubePodOOMKilled + - KubernetesMemoryPressure + - NodeMemoryUsageHigh + - HighMemoryUsage alert_type: - memory message: @@ -102,8 +109,16 @@ rules: - id: high_cpu priority: 40 - description: Pod CPU 使用率過高 + description: Pod/Node CPU 使用率過高 match: + # 2026-04-10 Claude Sonnet 4.6: Phase 2 飛輪修復 — 補齊 Prometheus alertname 變體 + alertname: + - HighCPUUsage + - ContainerCpuUsageSecondsTotal + - HostHighCpuLoad + - NodeCPUUsageHigh + - CPUThrottlingHigh + - KubeCPUOvercommit alert_type: - cpu - high_cpu @@ -154,6 +169,11 @@ rules: priority: 60 description: Pod CrashLoopBackOff match: + # 2026-04-10 Claude Sonnet 4.6: Phase 2 飛輪修復 — 補齊 Prometheus alertname 變體 + alertname: + - KubePodCrashLooping + - PodCrashLoopBackOff + - KubernetesPodCrashLooping alert_type: - pod_crash - crash diff --git a/apps/api/migrations/flywheel_playbook_embeddings.sql b/apps/api/migrations/flywheel_playbook_embeddings.sql new file mode 100644 index 00000000..8edb18e7 --- /dev/null +++ b/apps/api/migrations/flywheel_playbook_embeddings.sql @@ -0,0 +1,27 @@ +-- Phase 4 飛輪修復 (ADR-067 延伸): Playbook Embeddings 持久化表 +-- 2026-04-10 Claude Sonnet 4.6 Asia/Taipei +-- 目的: 解決冷啟動飛輪斷層 — Playbook 語義相似度查詢 +-- +-- 前置: pgvector extension 已安裝 (phase28_rag_pgvector.sql) +-- 向量模型: nomic-embed-text (Ollama 192.168.0.188:11434) → 768 維 +-- +-- 索引策略: +-- < 100 筆: 線性掃描 (無需索引) +-- > 100 筆: 執行 CREATE INDEX ivfflat (phase35 已示範) + +CREATE TABLE IF NOT EXISTS playbook_embeddings ( + playbook_id TEXT PRIMARY KEY, + embedding vector(768), -- nomic-embed-text 768 維 + alert_names TEXT[] NOT NULL DEFAULT '{}', -- 索引時的 alert_names 快照 + keywords TEXT[] NOT NULL DEFAULT '{}', -- 索引時的 keywords 快照 + indexed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT 
NOW() +); + +COMMENT ON TABLE playbook_embeddings IS + 'Playbook 向量索引 — Phase 4 飛輪修復 (2026-04-10) — nomic-embed-text 768 維'; + +-- 向量近鄰索引 (超過 100 筆後解開) +-- CREATE INDEX IF NOT EXISTS ix_playbook_embeddings_vec +-- ON playbook_embeddings USING ivfflat (embedding vector_cosine_ops) +-- WITH (lists = 10); diff --git a/apps/api/scripts/update_playbook_alert_variants.py b/apps/api/scripts/update_playbook_alert_variants.py new file mode 100644 index 00000000..f0750399 --- /dev/null +++ b/apps/api/scripts/update_playbook_alert_variants.py @@ -0,0 +1,141 @@ +""" +Phase 2 飛輪修復:補齊 Playbook alertname 變體 +================================================= +直接更新 Redis 裡的 Playbook symptom_pattern.alert_names, +並重建 playbook:index:alert:* 索引。 + +用法(在 API pod 內執行): + python scripts/update_playbook_alert_variants.py + +或從本機執行(需能連 Redis): + AWOOOI_REDIS_URL=redis://192.168.0.188:6380/10 python scripts/update_playbook_alert_variants.py + +2026-04-10 Asia/Taipei — Claude Sonnet 4.6 +""" + +import asyncio +import json +import os +import sys + +import redis + +# Playbook 補充的 alertname 變體 +# key: playbook name (用於搜尋), value: 新增的 alertname list +VARIANTS: dict[str, list[str]] = { + "high-cpu-restart": [ + "HighCPUUsage", + "ContainerCpuUsageSecondsTotal", + "HostHighCpuLoad", + "NodeCPUUsageHigh", + "CPUThrottlingHigh", + "KubeCPUOvercommit", + ], + "crashloop-pod-delete": [ + "KubePodCrashLooping", + "PodCrashLoopBackOff", + "KubernetesPodCrashLooping", + ], + "oom-killed-pod-delete": [ + "PodOOMKilled", + "KubePodOOMKilled", + "KubernetesMemoryPressure", + "NodeMemoryUsageHigh", + "HighMemoryUsage", + ], + "k8s-pod-not-ready-restart": [ + "KubePodNotReady", + "PodNotReady", + "KubernetesDeploymentReplicasMismatch", + ], + "insufficient-replicas-scale": [ + "KubeDeploymentReplicasMismatch", + "InsufficientReplicas", + "KubernetesReplicasMismatch", + ], +} + +PLAYBOOK_KEY_PREFIX = "playbook:" +PLAYBOOK_INDEX_ALERT_PREFIX = "playbook:index:alert:" +PLAYBOOK_TTL_SECONDS = 86400 * 30 # 30 天 + 
def get_redis_client() -> redis.Redis:
    """Build a Redis client from ``AWOOOI_REDIS_URL``.

    Falls back to the script's hard-coded default URL when the environment
    variable is not set.
    """
    url = os.environ.get("AWOOOI_REDIS_URL", "redis://192.168.0.188:6380/10")
    return redis.Redis.from_url(url)


def _as_text(value) -> str:
    """Return *value* as ``str`` whether the client yields bytes or text."""
    return value.decode() if isinstance(value, bytes) else value


def update_playbooks(r: redis.Redis) -> None:
    """Merge the alertname variants into stored playbooks and index them.

    For every ``playbook:PB-*`` key whose JSON ``name`` appears in
    ``VARIANTS``, the variant alert names are appended (order-preserving,
    de-duplicated) to ``symptom_pattern.alert_names``, the playbook is written
    back with the standard TTL, and the playbook id is added to each
    ``playbook:index:alert:<alertname>`` set. A short verification report is
    printed at the end.

    Args:
        r: Connected Redis client.
    """
    # SCAN walks the keyspace incrementally; KEYS would block the server for
    # the duration of a full keyspace scan.
    all_keys = [
        _as_text(k) for k in r.scan_iter(match=f"{PLAYBOOK_KEY_PREFIX}PB-*")
    ]
    print(f"Found {len(all_keys)} playbook keys in Redis")

    updated = 0
    skipped = 0

    for key in all_keys:
        raw = r.get(key)
        if not raw:
            # Key expired between the scan and the read.
            continue

        try:
            pb = json.loads(raw)
        except (TypeError, ValueError) as exc:
            # One corrupt entry must not abort the whole maintenance run.
            print(f"  {key}: invalid JSON, skip ({exc})")
            skipped += 1
            continue
        if not isinstance(pb, dict):
            print(f"  {key}: unexpected payload type, skip")
            skipped += 1
            continue

        pb_name = pb.get("name", "")
        target_alerts = VARIANTS.get(pb_name)
        if target_alerts is None:
            skipped += 1
            continue

        # ``or`` guards against explicit nulls stored in the JSON document.
        sp = pb.get("symptom_pattern") or {}
        current_alerts: list[str] = list(sp.get("alert_names") or [])

        # Keep existing names first, append new ones, drop duplicates.
        merged = list(dict.fromkeys(current_alerts + target_alerts))

        if merged == current_alerts:
            print(f"  {pb_name}: already up to date, skip")
            skipped += 1
            continue

        sp["alert_names"] = merged
        pb["symptom_pattern"] = sp

        # Write the playbook back with a refreshed TTL.
        r.set(key, json.dumps(pb, ensure_ascii=False), ex=PLAYBOOK_TTL_SECONDS)

        # Register the playbook under every alert name it now answers to.
        pb_id = pb.get("playbook_id", key.replace(PLAYBOOK_KEY_PREFIX, ""))
        for alert_name in merged:
            idx_key = f"{PLAYBOOK_INDEX_ALERT_PREFIX}{alert_name}"
            r.sadd(idx_key, pb_id)
            r.expire(idx_key, PLAYBOOK_TTL_SECONDS)

        known = set(current_alerts)
        added = [a for a in merged if a not in known]
        print(f"  {pb_name}: added {added}")
        updated += 1

    print(f"\nDone: {updated} updated, {skipped} skipped")

    # Spot-check a handful of the new variants in the alert index.
    print("\nVerification:")
    for check_alert in [
        "HostHighCpuLoad", "KubernetesPodCrashLooping",
        "NodeMemoryUsageHigh", "HighMemoryUsage",
        "KubernetesReplicasMismatch",
    ]:
        idx_key = f"{PLAYBOOK_INDEX_ALERT_PREFIX}{check_alert}"
        members = [_as_text(m) for m in r.smembers(idx_key)]
        status = "✅" if members else "❌"
        print(f"  {status} {check_alert} → {members}")
# Prometheus ``job`` values that name scraping infrastructure rather than a
# business service. Stored pre-normalised (lower-case, "-" and "_" removed) so
# the lookup does not rebuild the set on every alert.
_INFRA_JOBS = frozenset({
    "node",
    "nodeexporter",
    "pushgateway",
    "blackbox",
    "prometheus",
    "alertmanager",
    "cadvisor",
})

# Character class Kubernetes draws generated name suffixes from (no vowels and
# no 0/1/3). Matching on it keeps ordinary words such as "app" or "exporter"
# from being mistaken for a generated hash.
_K8S_RAND = "[bcdfghjklmnpqrstvwxz2456789]"

# Tried in order; the first full match wins.
_POD_NAME_PATTERNS = (
    # Deployment / CronJob pod: <name>-<template-hash or schedule>-<5 chars>
    rf"(?P<name>.+)-(?:{_K8S_RAND}{{6,10}}|\d{{6,10}})-{_K8S_RAND}{{5}}",
    # DaemonSet / ReplicaSet / Job pod: <name>-<5 chars>
    rf"(?P<name>.+)-{_K8S_RAND}{{5}}",
    # StatefulSet pod: <name>-<ordinal>
    r"(?P<name>.+)-\d+",
)


def _workload_name_from_pod(pod: str) -> str:
    """Strip Kubernetes-generated suffixes from a pod name.

    Returns the pod name unchanged when it carries no recognisable generated
    suffix (for example a bare pod called ``my-app`` or ``redis``).
    """
    import re  # function-scope import keeps this block self-contained

    for pattern in _POD_NAME_PATTERNS:
        match = re.fullmatch(pattern, pod)
        if match:
            return match.group("name")
    return pod


def _extract_affected_services(labels: dict, target_resource: str) -> list[str]:
    """Derive the real service name for an alert from its labels.

    Keeps instance addresses (``host:port``, IPs) and the alert name itself out
    of ``affected_services``. The first rule that yields a value wins:

    1. ``component`` label.
    2. ``job`` label, unless it names an infrastructure exporter
       (see ``_INFRA_JOBS``; compared case-insensitively, ignoring "-"/"_").
    3. ``pod`` label, reduced to its workload name.
    4. ``target_resource``, when it contains no ":", differs from the
       ``alertname`` label, and does not start with a digit.
    5. Otherwise an empty list.

    Args:
        labels: Alert labels; ``None`` is treated as empty.
        target_resource: Resource string reported with the alert.

    Returns:
        A list holding at most one service name.
    """
    labels = labels or {}
    alertname = labels.get("alertname", "")

    # 1. component label
    if comp := labels.get("component"):
        return [comp]

    # 2. job label, skipping infrastructure exporters
    if job := labels.get("job"):
        if job.lower().replace("-", "").replace("_", "") not in _INFRA_JOBS:
            return [job]

    # 3. pod label -> owning workload name
    if pod := labels.get("pod"):
        return [_workload_name_from_pod(pod)]

    # 4. target_resource, when it looks like a service name. The leading-digit
    #    test rejects IP addresses (and anything else that starts with a digit).
    if (
        target_resource
        and ":" not in target_resource
        and target_resource != alertname
        and not target_resource[0].isdigit()
    ):
        return [target_resource]

    # 5. nothing usable
    return []
def _to_pgvector_literal(embedding) -> str:
    """Serialise a numeric sequence into pgvector text form, e.g. ``[0.1,0.2]``.

    Built explicitly instead of via ``str(embedding)`` so the result does not
    depend on the container type's own ``__str__``.
    """
    return "[" + ",".join(repr(float(x)) for x in embedding) + "]"


async def ensure_playbook_embeddings_indexed() -> None:
    """Make sure every APPROVED playbook has a vector index entry.

    Steps:
        1. Load APPROVED playbooks from the playbook service (at most 500).
        2. Rebuild the vector cache via ``reindex_all_playbooks``.
        3. Persist the vectors to the ``playbook_embeddings`` table.

    Best-effort: any failure is logged as a warning and never raised, because
    the caller schedules this as a background task.
    """
    try:
        from src.models.playbook import PlaybookStatus
        from src.services.playbook_service import get_playbook_service
        from src.services.playbook_rag import get_playbook_rag_service

        playbook_service = get_playbook_service()
        playbooks, total = await playbook_service.list_playbooks(
            status=PlaybookStatus.APPROVED, limit=500
        )

        if not playbooks:
            logger.info("playbook_embedding_indexing_skipped", reason="no approved playbooks")
            return

        # NOTE(review): assumes ``total`` is the unpaginated match count — confirm
        # against PlaybookService.list_playbooks.
        if isinstance(total, int) and total > len(playbooks):
            logger.warning(
                "playbook_embedding_indexing_truncated",
                total=total,
                indexed=len(playbooks),
            )

        logger.info("playbook_embedding_indexing_start", count=len(playbooks))

        # Step 1: rebuild the vector cache.
        rag_service = await get_playbook_rag_service()
        success, failed = await rag_service.reindex_all_playbooks(playbooks)

        logger.info(
            "playbook_embedding_redis_done",
            success=success,
            failed=failed,
        )

        # Step 2: persist to the PostgreSQL playbook_embeddings table.
        await _persist_embeddings_to_db(rag_service, playbooks)

    except Exception as e:
        logger.warning("playbook_embedding_indexing_error", error=str(e))


async def _persist_embeddings_to_db(rag_service, playbooks) -> None:
    """Upsert each playbook's cached embedding into ``playbook_embeddings``.

    Each row is written inside its own SAVEPOINT so a single failing statement
    cannot abort the surrounding transaction and discard the other rows.
    Playbooks without an embedding, and rows that fail, are counted as skipped.

    Args:
        rag_service: Object exposing ``get_playbook_embedding(playbook_id)``.
        playbooks: Iterable of playbooks with ``playbook_id`` and
            ``symptom_pattern`` attributes.
    """
    try:
        from sqlalchemy import text
        from src.db.base import get_db_context

        # Built once; the statement is identical for every row.
        upsert = text("""
            INSERT INTO playbook_embeddings
                (playbook_id, embedding, alert_names, keywords, indexed_at, updated_at)
            VALUES
                (:playbook_id, :embedding, :alert_names, :keywords,
                 NOW(), NOW())
            ON CONFLICT (playbook_id) DO UPDATE SET
                embedding = EXCLUDED.embedding,
                alert_names = EXCLUDED.alert_names,
                keywords = EXCLUDED.keywords,
                updated_at = NOW()
        """)

        persisted = 0
        skipped = 0

        # NOTE(review): assumes ``db`` is a SQLAlchemy AsyncSession (or
        # AsyncConnection), both of which provide ``begin_nested()`` — confirm
        # against get_db_context.
        async with get_db_context() as db:
            for playbook in playbooks:
                try:
                    embedding = await rag_service.get_playbook_embedding(playbook.playbook_id)
                    # len() rather than truthiness: array-like embeddings do
                    # not support ``not embedding``.
                    if embedding is None or len(embedding) == 0:
                        skipped += 1
                        continue

                    sp = playbook.symptom_pattern
                    alert_names = list(sp.alert_names or []) if sp else []
                    keywords = list(sp.keywords or []) if sp else []

                    # SAVEPOINT: a failure rolls back this row only.
                    async with db.begin_nested():
                        await db.execute(
                            upsert,
                            {
                                "playbook_id": playbook.playbook_id,
                                "embedding": _to_pgvector_literal(embedding),
                                "alert_names": alert_names,
                                "keywords": keywords,
                            },
                        )
                    persisted += 1

                except Exception as e:
                    logger.warning(
                        "playbook_embedding_persist_error",
                        playbook_id=playbook.playbook_id,
                        error=str(e),
                    )
                    skipped += 1

            await db.commit()

        logger.info(
            "playbook_embedding_db_done",
            persisted=persisted,
            skipped=skipped,
        )

    except Exception as e:
        logger.warning("playbook_embedding_db_error", error=str(e))
set(pattern_b.affected_services), + # 通用型 Playbook 豁免:Playbook 沒有限定服務 → 任何服務都適用 → 1.0 + "affected_services": ( + 1.0 + if not pattern_b.affected_services + else calculate_jaccard_similarity( + set(pattern_a.affected_services), + set(pattern_b.affected_services), + ) ), + # 通用型 Playbook 豁免:Playbook 沒有限定嚴重度 → 任何嚴重度都適用 → 1.0 "severity": ( 1.0 - if set(pattern_a.severity_range) & set(pattern_b.severity_range) + if not pattern_b.severity_range + or bool(set(pattern_a.severity_range) & set(pattern_b.severity_range)) else 0.0 ), "keywords": calculate_jaccard_similarity(