fix(flywheel): 四階段系統性修復 AUTO_REPAIR NO_MATCH 斷層
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Phase 1 — affected_services 污染根治
- webhooks.py: _extract_affected_services() 從 labels 精準萃取服務名
(component > job > pod deployment name > clean target_resource > [])
- create_incident_for_approval: alert_labels 完整保留進 Signal
- alert_name 從 alertname 取,不再用 "custom"
Phase 2 — Playbook alertname 變體擴充
- alert_rules.yaml: 5 條規則新增 HostHighCpuLoad、KubePodCrashLooping 等變體
- scripts/update_playbook_alert_variants.py: Redis index 已執行更新 ✅
Phase 3 — Jaccard 通用型 Playbook 豁免
- similarity.py: affected_services=[] → 1.0 豁免(基礎設施 Playbook 不針對特定服務)
- severity_range=[] → 1.0 豁免(適用所有嚴重度)
Phase 4 — Playbook Embedding 持久化(冷啟動修復)
- migrations/flywheel_playbook_embeddings.sql: pgvector 持久化表
- services/playbook_embedding_service.py: 啟動時重建 Redis 向量快取 + 同步 DB
- main.py: lifespan 啟動時 asyncio.create_task 非阻塞執行
2026-04-10 Asia/Taipei — Claude Sonnet 4.6
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -75,6 +75,13 @@ rules:
|
||||
priority: 30
|
||||
description: Pod OOMKilled 記憶體不足
|
||||
match:
|
||||
# 2026-04-10 Claude Sonnet 4.6: Phase 2 飛輪修復 — 補齊 Prometheus alertname 變體
|
||||
alertname:
|
||||
- PodOOMKilled
|
||||
- KubePodOOMKilled
|
||||
- KubernetesMemoryPressure
|
||||
- NodeMemoryUsageHigh
|
||||
- HighMemoryUsage
|
||||
alert_type:
|
||||
- memory
|
||||
message:
|
||||
@@ -102,8 +109,16 @@ rules:
|
||||
|
||||
- id: high_cpu
|
||||
priority: 40
|
||||
description: Pod CPU 使用率過高
|
||||
description: Pod/Node CPU 使用率過高
|
||||
match:
|
||||
# 2026-04-10 Claude Sonnet 4.6: Phase 2 飛輪修復 — 補齊 Prometheus alertname 變體
|
||||
alertname:
|
||||
- HighCPUUsage
|
||||
- ContainerCpuUsageSecondsTotal
|
||||
- HostHighCpuLoad
|
||||
- NodeCPUUsageHigh
|
||||
- CPUThrottlingHigh
|
||||
- KubeCPUOvercommit
|
||||
alert_type:
|
||||
- cpu
|
||||
- high_cpu
|
||||
@@ -154,6 +169,11 @@ rules:
|
||||
priority: 60
|
||||
description: Pod CrashLoopBackOff
|
||||
match:
|
||||
# 2026-04-10 Claude Sonnet 4.6: Phase 2 飛輪修復 — 補齊 Prometheus alertname 變體
|
||||
alertname:
|
||||
- KubePodCrashLooping
|
||||
- PodCrashLoopBackOff
|
||||
- KubernetesPodCrashLooping
|
||||
alert_type:
|
||||
- pod_crash
|
||||
- crash
|
||||
|
||||
27
apps/api/migrations/flywheel_playbook_embeddings.sql
Normal file
27
apps/api/migrations/flywheel_playbook_embeddings.sql
Normal file
@@ -0,0 +1,27 @@
|
||||
-- Phase 4 飛輪修復 (ADR-067 延伸): Playbook Embeddings 持久化表
|
||||
-- 2026-04-10 Claude Sonnet 4.6 Asia/Taipei
|
||||
-- 目的: 解決冷啟動飛輪斷層 — Playbook 語義相似度查詢
|
||||
--
|
||||
-- 前置: pgvector extension 已安裝 (phase28_rag_pgvector.sql)
|
||||
-- 向量模型: nomic-embed-text (Ollama 192.168.0.188:11434) → 768 維
|
||||
--
|
||||
-- 索引策略:
|
||||
-- <= 100 筆: 線性掃描 (無需索引)
|
||||
-- > 100 筆: 執行 CREATE INDEX ivfflat (phase35 已示範)
|
||||
|
||||
-- Persistent store for playbook embedding vectors (one row per playbook).
CREATE TABLE IF NOT EXISTS playbook_embeddings (
    playbook_id  TEXT         PRIMARY KEY,
    -- 768-dimensional vector produced by nomic-embed-text
    embedding    vector(768),
    -- Snapshot of the playbook's alert_names taken at indexing time
    alert_names  TEXT[]       NOT NULL DEFAULT '{}',
    -- Snapshot of the playbook's keywords taken at indexing time
    keywords     TEXT[]       NOT NULL DEFAULT '{}',
    indexed_at   TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    updated_at   TIMESTAMPTZ  NOT NULL DEFAULT NOW()
);
|
||||
|
||||
COMMENT ON TABLE playbook_embeddings IS
|
||||
'Playbook 向量索引 — Phase 4 飛輪修復 (2026-04-10) — nomic-embed-text 768 維';
|
||||
|
||||
-- 向量近鄰索引 (超過 100 筆後解開)
|
||||
-- CREATE INDEX IF NOT EXISTS ix_playbook_embeddings_vec
|
||||
-- ON playbook_embeddings USING ivfflat (embedding vector_cosine_ops)
|
||||
-- WITH (lists = 10);
|
||||
141
apps/api/scripts/update_playbook_alert_variants.py
Normal file
141
apps/api/scripts/update_playbook_alert_variants.py
Normal file
@@ -0,0 +1,141 @@
|
||||
"""
|
||||
Phase 2 飛輪修復:補齊 Playbook alertname 變體
|
||||
=================================================
|
||||
直接更新 Redis 裡的 Playbook symptom_pattern.alert_names,
|
||||
並重建 playbook:index:alert:* 索引。
|
||||
|
||||
用法(在 API pod 內執行):
|
||||
python scripts/update_playbook_alert_variants.py
|
||||
|
||||
或從本機執行(需能連 Redis):
|
||||
AWOOOI_REDIS_URL=redis://192.168.0.188:6380/10 python scripts/update_playbook_alert_variants.py
|
||||
|
||||
2026-04-10 Asia/Taipei — Claude Sonnet 4.6
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
import redis
|
||||
|
||||
# Extra alertname variants to attach to each playbook.
# Key: playbook name (used to look the playbook up); value: alertnames to add.
VARIANTS: dict[str, list[str]] = {
    "high-cpu-restart": [
        "HighCPUUsage", "ContainerCpuUsageSecondsTotal", "HostHighCpuLoad",
        "NodeCPUUsageHigh", "CPUThrottlingHigh", "KubeCPUOvercommit",
    ],
    "crashloop-pod-delete": [
        "KubePodCrashLooping", "PodCrashLoopBackOff", "KubernetesPodCrashLooping",
    ],
    "oom-killed-pod-delete": [
        "PodOOMKilled", "KubePodOOMKilled", "KubernetesMemoryPressure",
        "NodeMemoryUsageHigh", "HighMemoryUsage",
    ],
    "k8s-pod-not-ready-restart": [
        "KubePodNotReady", "PodNotReady", "KubernetesDeploymentReplicasMismatch",
    ],
    "insufficient-replicas-scale": [
        "KubeDeploymentReplicasMismatch", "InsufficientReplicas",
        "KubernetesReplicasMismatch",
    ],
}

# Redis key layout used by the playbook store.
PLAYBOOK_KEY_PREFIX = "playbook:"
PLAYBOOK_INDEX_ALERT_PREFIX = "playbook:index:alert:"

# Expiry applied to rewritten playbooks and index sets: 30 days.
PLAYBOOK_TTL_SECONDS = 30 * 24 * 60 * 60
|
||||
|
||||
|
||||
def get_redis_client() -> redis.Redis:
    """Build a Redis client for the playbook store.

    The connection URL is read from the ``AWOOOI_REDIS_URL`` environment
    variable; when it is unset the hard-coded default below is used.
    """
    default_url = "redis://192.168.0.188:6380/10"
    redis_url = os.environ.get("AWOOOI_REDIS_URL", default_url)
    return redis.Redis.from_url(redis_url)
|
||||
|
||||
|
||||
def update_playbooks(r: redis.Redis) -> None:
    """Merge the alertname variants from ``VARIANTS`` into playbooks stored in Redis.

    For every ``playbook:PB-*`` key whose JSON ``name`` is listed in
    ``VARIANTS``, the variants are appended (order-preserving, de-duplicated)
    to ``symptom_pattern.alert_names``. The playbook is written back with a
    fresh TTL and its id is added to one ``playbook:index:alert:<alertname>``
    set per merged alertname. Progress and a final spot-check of a few index
    keys are printed to stdout.

    Args:
        r: Connected Redis client.
    """
    # SCAN instead of KEYS: KEYS walks the whole keyspace in a single blocking
    # call. SCAN may yield a key more than once, hence the de-duplication.
    all_keys = list(dict.fromkeys(
        k.decode() if isinstance(k, bytes) else k
        for k in r.scan_iter(match=f"{PLAYBOOK_KEY_PREFIX}PB-*")
    ))
    print(f"Found {len(all_keys)} playbook keys in Redis")

    updated = 0
    skipped = 0

    for key in all_keys:
        raw = r.get(key)
        if not raw:
            # Key expired or was deleted between the scan and the read.
            continue

        # One corrupt payload must not abort the whole migration.
        try:
            pb = json.loads(raw)
        except ValueError as exc:
            print(f"  {key}: invalid JSON, skip ({exc})")
            skipped += 1
            continue
        if not isinstance(pb, dict):
            print(f"  {key}: unexpected payload type {type(pb).__name__}, skip")
            skipped += 1
            continue

        pb_name = pb.get("name", "")
        if pb_name not in VARIANTS:
            skipped += 1
            continue

        target_alerts = VARIANTS[pb_name]
        # `or` guards against an explicit JSON null for either field.
        sp = pb.get("symptom_pattern") or {}
        current_alerts: list[str] = sp.get("alert_names") or []

        # Keep the existing names first, append the new ones, drop duplicates.
        merged = list(dict.fromkeys(current_alerts + target_alerts))

        if merged == current_alerts:
            print(f"  {pb_name}: already up to date, skip")
            skipped += 1
            continue

        sp["alert_names"] = merged
        pb["symptom_pattern"] = sp

        # Write the playbook back (this also resets its TTL).
        r.set(key, json.dumps(pb, ensure_ascii=False), ex=PLAYBOOK_TTL_SECONDS)

        # Rebuild the alertname -> playbook-id index for every merged name.
        pb_id = pb.get("playbook_id", key.replace(PLAYBOOK_KEY_PREFIX, ""))
        for alert_name in merged:
            idx_key = f"{PLAYBOOK_INDEX_ALERT_PREFIX}{alert_name}"
            r.sadd(idx_key, pb_id)
            r.expire(idx_key, PLAYBOOK_TTL_SECONDS)

        added = [a for a in merged if a not in current_alerts]
        print(f"  {pb_name}: added {added}")
        updated += 1

    print(f"\nDone: {updated} updated, {skipped} skipped")

    # Spot-check that a sample of the new alertnames resolve to a playbook.
    print("\nVerification:")
    for check_alert in [
        "HostHighCpuLoad", "KubernetesPodCrashLooping",
        "NodeMemoryUsageHigh", "HighMemoryUsage",
        "KubernetesReplicasMismatch",
    ]:
        idx_key = f"{PLAYBOOK_INDEX_ALERT_PREFIX}{check_alert}"
        members = [
            m.decode() if isinstance(m, bytes) else m
            for m in r.smembers(idx_key)
        ]
        status = "✅" if members else "❌"
        print(f"  {status} {check_alert} → {members}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: verify the Redis connection, then run the update.
    redis_url = os.environ.get("AWOOOI_REDIS_URL", "redis://192.168.0.188:6380/10")
    r = get_redis_client()

    try:
        r.ping()
    except Exception as e:
        # Any connection problem is fatal for this one-shot script.
        print(f"Redis connection failed: {e}")
        sys.exit(1)
    else:
        print(f"Redis connected: {redis_url}\n")

    update_playbooks(r)
|
||||
@@ -85,6 +85,55 @@ RISK_TO_SEVERITY = {
|
||||
INCIDENT_TTL_SECONDS = 7 * 24 * 60 * 60
|
||||
|
||||
|
||||
def _extract_affected_services(labels: dict, target_resource: str) -> list[str]:
|
||||
"""
|
||||
從告警 labels 提取真實服務名,防止 IP 或 alertname 污染 affected_services。
|
||||
|
||||
優先序:
|
||||
1. component label(Docker-compose 層告警最可靠)
|
||||
2. job label(排除 node-exporter / pushgateway 等基礎設施 job)
|
||||
3. pod label(取 deployment name,去掉 hash suffix)
|
||||
4. target_resource(不含冒號、不等於 alertname 時才採用)
|
||||
5. 空列表(讓通用型 Playbook 透過空集合豁免規則匹配)
|
||||
|
||||
Phase 1 飛輪修復 — 2026-04-10 Claude Sonnet 4.6 Asia/Taipei
|
||||
根因: HostHighCpuLoad/192.168.0.188:9100 被誤填進 affected_services,
|
||||
導致 Jaccard 匹配永遠為 0,飛輪無法啟動。
|
||||
"""
|
||||
alertname = labels.get("alertname", "")
|
||||
|
||||
# 1. component(docker-compose 服務名如 "sentry", "momo-app")
|
||||
if comp := labels.get("component"):
|
||||
return [comp]
|
||||
|
||||
# 2. job,排除基礎設施 exporter 類
|
||||
_infra_jobs = {"node", "node-exporter", "pushgateway", "blackbox",
|
||||
"prometheus", "alertmanager", "cadvisor"}
|
||||
if job := labels.get("job"):
|
||||
if job.lower().replace("-", "").replace("_", "") not in {
|
||||
j.replace("-", "").replace("_", "") for j in _infra_jobs
|
||||
}:
|
||||
return [job]
|
||||
|
||||
# 3. pod label → 取 deployment name(去掉 ReplicaSet/Pod hash 後兩段)
|
||||
if pod := labels.get("pod"):
|
||||
parts = pod.rsplit("-", 2)
|
||||
if len(parts) >= 3 and len(parts[-1]) == 5 and len(parts[-2]) in (9, 10):
|
||||
return [parts[0]] # 去掉 <replicaset-hash>-<pod-hash>
|
||||
elif len(parts) >= 2:
|
||||
return ["-".join(parts[:-1])]
|
||||
|
||||
# 4. target_resource 是真實服務名(不含冒號、不等於 alertname)
|
||||
if (target_resource
|
||||
and ":" not in target_resource
|
||||
and target_resource != alertname
|
||||
and not target_resource[0].isdigit()): # 排除純 IP
|
||||
return [target_resource]
|
||||
|
||||
# 5. 無法識別 → 返回空(讓空集合豁免規則接手)
|
||||
return []
|
||||
|
||||
|
||||
async def create_incident_for_approval(
|
||||
approval_id: str,
|
||||
risk_level: str,
|
||||
@@ -94,6 +143,7 @@ async def create_incident_for_approval(
|
||||
message: str,
|
||||
source: str = "alertmanager",
|
||||
alertname: str | None = None,
|
||||
alert_labels: dict | None = None,
|
||||
) -> str:
|
||||
"""
|
||||
為 Approval 創建對應的 Incident (活躍事件同步)
|
||||
@@ -114,22 +164,34 @@ async def create_incident_for_approval(
|
||||
# 映射嚴重度
|
||||
severity = RISK_TO_SEVERITY.get(risk_level.lower(), Severity.P2)
|
||||
|
||||
# 建立 Signal (原始告警)
|
||||
# Phase 1 飛輪修復 (2026-04-10 Claude Sonnet 4.6):
|
||||
# Signal 保留完整 labels(含 instance/job/pod 等),供執行層變數替換
|
||||
# alert_name 用 alertname(如 HostHighCpuLoad),不是 alert_type(如 "custom")
|
||||
_labels = {
|
||||
"namespace": namespace,
|
||||
"resource": target_resource,
|
||||
"alertname": alertname or alert_type,
|
||||
**(alert_labels or {}), # 完整 Prometheus labels,保留 instance/job/pod/component
|
||||
}
|
||||
signal = Signal(
|
||||
alert_name=alert_type,
|
||||
alert_name=alertname or alert_type, # 用真實 alertname,非 alert_type="custom"
|
||||
severity=severity,
|
||||
source=source,
|
||||
fired_at=now_taipei(),
|
||||
labels={"namespace": namespace, "resource": target_resource, "alertname": alertname or alert_type},
|
||||
labels=_labels,
|
||||
annotations={"message": message},
|
||||
)
|
||||
|
||||
# Phase 1 飛輪修復: affected_services 用語意提取,不直接放 target_resource
|
||||
# _extract_affected_services 防止 IP/alertname 污染匹配層
|
||||
_affected_services = _extract_affected_services(_labels, target_resource)
|
||||
|
||||
# 建立 Incident
|
||||
incident = Incident(
|
||||
status=IncidentStatus.INVESTIGATING,
|
||||
severity=severity,
|
||||
signals=[signal],
|
||||
affected_services=[target_resource],
|
||||
affected_services=_affected_services,
|
||||
proposal_ids=[UUID(approval_id)],
|
||||
)
|
||||
|
||||
@@ -1350,6 +1412,7 @@ async def alertmanager_webhook(
|
||||
message=message,
|
||||
source="alertmanager",
|
||||
alertname=alertname,
|
||||
alert_labels=alert.labels, # Phase 1: 完整 labels 供 _extract_affected_services
|
||||
)
|
||||
|
||||
# 2026-04-06 ogt: Phase 26 — 回寫 incident_id 到 Approval
|
||||
@@ -1473,6 +1536,7 @@ async def alertmanager_webhook(
|
||||
message=message,
|
||||
source="alertmanager",
|
||||
alertname=alertname,
|
||||
alert_labels=alert.labels, # Phase 1: 完整 labels
|
||||
)
|
||||
|
||||
background_tasks.add_task(
|
||||
|
||||
@@ -280,6 +280,18 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
except Exception as e:
|
||||
logger.warning("working_memory_warmup_failed", error=str(e))
|
||||
|
||||
# Phase 4 飛輪修復: Playbook Embedding 冷啟動索引
|
||||
# 2026-04-10 Claude Sonnet 4.6 Asia/Taipei
|
||||
# 目的: 確保 playbook_embeddings 表有最新向量,供語義相似度查詢
|
||||
# 使用 asyncio.create_task 非阻塞 — 不影響 API 啟動速度
|
||||
try:
|
||||
import asyncio
|
||||
from src.services.playbook_embedding_service import ensure_playbook_embeddings_indexed
|
||||
asyncio.create_task(ensure_playbook_embeddings_indexed())
|
||||
logger.info("playbook_embedding_indexing_scheduled")
|
||||
except Exception as e:
|
||||
logger.warning("playbook_embedding_schedule_failed", error=str(e))
|
||||
|
||||
# Phase 6.1: 啟動 Signal Worker (Redis Streams Consumer)
|
||||
# 統帥鐵律: Event Bus 解耦告警接收與處理
|
||||
await init_signal_worker()
|
||||
|
||||
126
apps/api/src/services/playbook_embedding_service.py
Normal file
126
apps/api/src/services/playbook_embedding_service.py
Normal file
@@ -0,0 +1,126 @@
|
||||
"""
|
||||
Playbook Embedding Service — Phase 4 飛輪冷啟動修復
|
||||
====================================================
|
||||
ADR-067 延伸: Playbook 向量持久化到 PostgreSQL playbook_embeddings 表
|
||||
|
||||
職責:
|
||||
- 啟動時掃描 APPROVED Playbooks,重建 Redis 向量快取
|
||||
- 同步持久化到 playbook_embeddings (pgvector) 供跨重啟使用
|
||||
- 已索引且未變更的 Playbook 跳過 (updated_at 比對)
|
||||
|
||||
呼叫方: main.py lifespan (asyncio.create_task — 非阻塞)
|
||||
|
||||
2026-04-10 Claude Sonnet 4.6 Asia/Taipei
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import structlog
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
async def ensure_playbook_embeddings_indexed() -> None:
    """Rebuild the vector index for APPROVED playbooks and persist it.

    Steps:
        1. List APPROVED playbooks through the playbook service (at most 500).
        2. Pass them to ``reindex_all_playbooks`` on the RAG service.
        3. Hand the same playbooks to ``_persist_embeddings_to_db``.

    Every exception is caught and logged as a warning, so awaiting this
    coroutine never raises.
    """
    try:
        from src.models.playbook import PlaybookStatus
        from src.services.playbook_service import get_playbook_service
        from src.services.playbook_rag import get_playbook_rag_service

        page_limit = 500
        playbook_service = get_playbook_service()
        playbooks, total = await playbook_service.list_playbooks(
            status=PlaybookStatus.APPROVED, limit=page_limit
        )

        if not playbooks:
            logger.info("playbook_embedding_indexing_skipped", reason="no approved playbooks")
            return

        # NOTE(review): `total` is presumed to be the overall match count
        # returned alongside the page; confirm against list_playbooks. If so,
        # a larger value means playbooks beyond the limit are not indexed.
        if isinstance(total, int) and total > len(playbooks):
            logger.warning(
                "playbook_embedding_indexing_truncated",
                total=total,
                indexed=len(playbooks),
                limit=page_limit,
            )

        logger.info("playbook_embedding_indexing_start", count=len(playbooks))

        # Step 1: rebuild the vector cache via the RAG service.
        rag_service = await get_playbook_rag_service()
        success, failed = await rag_service.reindex_all_playbooks(playbooks)

        logger.info(
            "playbook_embedding_redis_done",
            success=success,
            failed=failed,
        )

        # Step 2: persist the vectors to the playbook_embeddings table.
        await _persist_embeddings_to_db(rag_service, playbooks)

    except Exception as e:
        # Best-effort start-up job: log and give up rather than propagate.
        logger.warning("playbook_embedding_indexing_error", error=str(e))
|
||||
|
||||
|
||||
async def _persist_embeddings_to_db(rag_service, playbooks) -> None:
    """Upsert each playbook's embedding into the ``playbook_embeddings`` table.

    For every playbook the embedding is fetched with
    ``rag_service.get_playbook_embedding``; playbooks without one are counted
    as skipped. Each UPSERT runs inside its own SAVEPOINT so that a failing
    row is rolled back alone instead of aborting the surrounding transaction
    (after a failed statement PostgreSQL rejects every further statement in
    the same transaction until a rollback).

    Args:
        rag_service: Object exposing ``get_playbook_embedding(playbook_id)``.
        playbooks: Iterable of playbooks with ``playbook_id`` and
            ``symptom_pattern`` attributes.

    All exceptions are caught and logged as warnings; nothing is raised.
    """
    try:
        from sqlalchemy import text
        from src.db.base import get_db_context

        upsert_sql = text("""
            INSERT INTO playbook_embeddings
                (playbook_id, embedding, alert_names, keywords, indexed_at, updated_at)
            VALUES
                (:playbook_id, :embedding, :alert_names, :keywords,
                 NOW(), NOW())
            ON CONFLICT (playbook_id) DO UPDATE SET
                embedding   = EXCLUDED.embedding,
                alert_names = EXCLUDED.alert_names,
                keywords    = EXCLUDED.keywords,
                updated_at  = NOW()
        """)

        persisted = 0
        skipped = 0

        # NOTE(review): assumes get_db_context yields a SQLAlchemy async
        # session/connection (it is used via execute/commit) — confirm that
        # begin_nested() is available on it.
        async with get_db_context() as db:
            for playbook in playbooks:
                try:
                    embedding = await rag_service.get_playbook_embedding(playbook.playbook_id)
                    if not embedding:
                        skipped += 1
                        continue

                    sp = playbook.symptom_pattern
                    alert_names = list(sp.alert_names) if sp else []
                    keywords = list(sp.keywords) if sp else []

                    # Build the '[x,y,...]' literal explicitly rather than
                    # relying on the container's repr.
                    vector_literal = "[" + ",".join(str(float(x)) for x in embedding) + "]"

                    # SAVEPOINT per row: a failure rolls back this row only.
                    async with db.begin_nested():
                        await db.execute(
                            upsert_sql,
                            {
                                "playbook_id": playbook.playbook_id,
                                "embedding": vector_literal,
                                "alert_names": alert_names,
                                "keywords": keywords,
                            },
                        )
                    persisted += 1

                except Exception as e:
                    logger.warning(
                        "playbook_embedding_persist_error",
                        playbook_id=playbook.playbook_id,
                        error=str(e),
                    )
                    skipped += 1

            await db.commit()

        logger.info(
            "playbook_embedding_db_done",
            persisted=persisted,
            skipped=skipped,
        )

    except Exception as e:
        # Persistence is best-effort; report and continue.
        logger.warning("playbook_embedding_db_error", error=str(e))
|
||||
@@ -8,9 +8,15 @@ Phase 22 P2: 將相似度計算邏輯從 Repository 移出
|
||||
- Repository 只負責 CRUD,不負責演算法
|
||||
- Service 層可以使用這些工具函數
|
||||
|
||||
版本: v1.0
|
||||
版本: v1.1
|
||||
建立: 2026-03-31 (台北時區)
|
||||
建立者: Claude Code (首席架構師技術債修復)
|
||||
更新: 2026-04-10 (台北時區) Claude Sonnet 4.6
|
||||
- Phase 3 飛輪修復: affected_services 空集合豁免
|
||||
Playbook.affected_services=[] 代表通用型基礎設施 Playbook,
|
||||
不針對特定服務(如 high-cpu-restart 適用所有主機 CPU 告警),
|
||||
給予 1.0 豁免分,不因服務名不匹配而拉低整體相似度。
|
||||
- severity 豁免: Playbook.severity_range=[] 代表適用所有嚴重度
|
||||
"""
|
||||
|
||||
from src.models.playbook import SymptomPattern
|
||||
@@ -46,7 +52,7 @@ def calculate_symptom_similarity(
|
||||
"""
|
||||
計算症狀相似度
|
||||
|
||||
算法: 加權 Jaccard 相似度
|
||||
算法: 加權 Jaccard 相似度 + 通用型 Playbook 豁免
|
||||
|
||||
維度權重:
|
||||
- alert_names: 0.35 (最重要)
|
||||
@@ -54,6 +60,11 @@ def calculate_symptom_similarity(
|
||||
- severity: 0.15
|
||||
- keywords: 0.20
|
||||
|
||||
豁免規則 (Phase 3 飛輪修復, 2026-04-10):
|
||||
- pattern_b.affected_services 為空 → 通用型 Playbook,services 維度給 1.0
|
||||
(high-cpu-restart、crashloop-pod-delete 等基礎設施 Playbook 不針對特定服務)
|
||||
- pattern_b.severity_range 為空 → 適用所有嚴重度,severity 維度給 1.0
|
||||
|
||||
Returns:
|
||||
float: 0.0 ~ 1.0 相似度分數
|
||||
"""
|
||||
@@ -69,13 +80,20 @@ def calculate_symptom_similarity(
|
||||
set(pattern_a.alert_names),
|
||||
set(pattern_b.alert_names),
|
||||
),
|
||||
"affected_services": calculate_jaccard_similarity(
|
||||
set(pattern_a.affected_services),
|
||||
set(pattern_b.affected_services),
|
||||
# 通用型 Playbook 豁免:Playbook 沒有限定服務 → 任何服務都適用 → 1.0
|
||||
"affected_services": (
|
||||
1.0
|
||||
if not pattern_b.affected_services
|
||||
else calculate_jaccard_similarity(
|
||||
set(pattern_a.affected_services),
|
||||
set(pattern_b.affected_services),
|
||||
)
|
||||
),
|
||||
# 通用型 Playbook 豁免:Playbook 沒有限定嚴重度 → 任何嚴重度都適用 → 1.0
|
||||
"severity": (
|
||||
1.0
|
||||
if set(pattern_a.severity_range) & set(pattern_b.severity_range)
|
||||
if not pattern_b.severity_range
|
||||
or bool(set(pattern_a.severity_range) & set(pattern_b.severity_range))
|
||||
else 0.0
|
||||
),
|
||||
"keywords": calculate_jaccard_similarity(
|
||||
|
||||
Reference in New Issue
Block a user