Files
awoooi/apps/api/scripts/update_playbook_alert_variants.py
OG T c6edfb5614
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
fix(flywheel): 四階段系統性修復 AUTO_REPAIR NO_MATCH 斷層
Phase 1 — affected_services 污染根治
  - webhooks.py: _extract_affected_services() 從 labels 精準萃取服務名
    (component > job > pod deployment name > clean target_resource > [])
  - create_incident_for_approval: alert_labels 完整保留進 Signal
  - alert_name 從 alertname 取,不再用 "custom"

Phase 2 — Playbook alertname 變體擴充
  - alert_rules.yaml: 5 條規則新增 HostHighCpuLoad、KubePodCrashLooping 等變體
  - scripts/update_playbook_alert_variants.py: Redis index 已執行更新 

Phase 3 — Jaccard 通用型 Playbook 豁免
  - similarity.py: affected_services=[] → 1.0 豁免(基礎設施 Playbook 不針對特定服務)
  - severity_range=[] → 1.0 豁免(適用所有嚴重度)

Phase 4 — Playbook Embedding 持久化(冷啟動修復)
  - migrations/flywheel_playbook_embeddings.sql: pgvector 持久化表
  - services/playbook_embedding_service.py: 啟動時重建 Redis 向量快取 + 同步 DB
  - main.py: lifespan 啟動時 asyncio.create_task 非阻塞執行

2026-04-10 Asia/Taipei — Claude Sonnet 4.6
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 11:04:56 +08:00

142 lines
4.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Phase 2 飛輪修復:補齊 Playbook alertname 變體
=================================================
直接更新 Redis 裡的 Playbook symptom_pattern.alert_names
並重建 playbook:index:alert:* 索引。
用法(在 API pod 內執行):
python scripts/update_playbook_alert_variants.py
或從本機執行(需能連 Redis):
AWOOOI_REDIS_URL=redis://192.168.0.188:6380/10 python scripts/update_playbook_alert_variants.py
2026-04-10 Asia/Taipei — Claude Sonnet 4.6
"""
import asyncio
import json
import os
import sys
import redis
# Extra alertname variants to merge into the stored playbooks.
# Key: playbook name (compared against each stored playbook's "name" field).
# Value: alertnames to append to that playbook's symptom_pattern.alert_names.
VARIANTS: dict[str, list[str]] = {
    "high-cpu-restart": [
        "HighCPUUsage",
        "ContainerCpuUsageSecondsTotal",
        "HostHighCpuLoad",
        "NodeCPUUsageHigh",
        "CPUThrottlingHigh",
        "KubeCPUOvercommit",
    ],
    "crashloop-pod-delete": [
        "KubePodCrashLooping",
        "PodCrashLoopBackOff",
        "KubernetesPodCrashLooping",
    ],
    "oom-killed-pod-delete": [
        "PodOOMKilled",
        "KubePodOOMKilled",
        "KubernetesMemoryPressure",
        "NodeMemoryUsageHigh",
        "HighMemoryUsage",
    ],
    "k8s-pod-not-ready-restart": [
        "KubePodNotReady",
        "PodNotReady",
        "KubernetesDeploymentReplicasMismatch",
    ],
    "insufficient-replicas-scale": [
        "KubeDeploymentReplicasMismatch",
        "InsufficientReplicas",
        "KubernetesReplicasMismatch",
    ],
}

# Redis key prefixes: playbook documents and the per-alertname index sets.
PLAYBOOK_KEY_PREFIX = "playbook:"
PLAYBOOK_INDEX_ALERT_PREFIX = "playbook:index:alert:"

# Expiry (seconds) applied to every key this script writes: 30 days.
PLAYBOOK_TTL_SECONDS = 30 * 24 * 60 * 60
def get_redis_client() -> redis.Redis:
    """Build a Redis client from the ``AWOOOI_REDIS_URL`` environment variable.

    Returns:
        A ``redis.Redis`` instance created with ``Redis.from_url``. When the
        environment variable is unset, ``redis://192.168.0.188:6380/10`` is
        used instead.
    """
    redis_url = os.getenv("AWOOOI_REDIS_URL", "redis://192.168.0.188:6380/10")
    client = redis.Redis.from_url(redis_url)
    return client
def _as_text(value) -> str:
    """Return *value* as ``str``, decoding UTF-8 bytes when necessary."""
    if isinstance(value, (bytes, bytearray)):
        return value.decode()
    return str(value)


def update_playbooks(r: redis.Redis) -> None:
    """Merge the ``VARIANTS`` alertnames into the playbooks stored in Redis.

    For every ``playbook:PB-*`` key whose JSON ``name`` appears in
    ``VARIANTS``, the extra alertnames are appended (order-preserving,
    de-duplicated) to ``symptom_pattern.alert_names``. The playbook is written
    back with a fresh ``PLAYBOOK_TTL_SECONDS`` expiry and its id is added to
    the ``playbook:index:alert:<alertname>`` set of every merged alertname.
    Playbooks that already contain all variants are left untouched, so the
    script can be re-run safely.

    Progress and a final verification of a few index keys are printed to
    stdout.

    Args:
        r: Connected Redis client.
    """
    # SCAN instead of KEYS: the keyspace is walked incrementally rather than
    # blocking the server. SCAN may return a key more than once, hence the set.
    pattern = f"{PLAYBOOK_KEY_PREFIX}PB-*"
    all_keys = sorted({_as_text(k) for k in r.scan_iter(match=pattern)})
    print(f"Found {len(all_keys)} playbook keys in Redis")

    updated = 0
    skipped = 0
    for key in all_keys:
        raw = r.get(key)
        if not raw:
            # Key vanished (expired/deleted) between the scan and the read.
            continue

        # One corrupt entry must not abort the run after other playbooks have
        # already been rewritten; report it and move on.
        try:
            pb = json.loads(raw)
        except ValueError as exc:  # JSONDecodeError or UnicodeDecodeError
            print(f"  {key}: invalid JSON ({exc}), skip")
            skipped += 1
            continue
        if not isinstance(pb, dict):
            print(f"  {key}: not a JSON object, skip")
            skipped += 1
            continue

        pb_name = pb.get("name", "")
        if pb_name not in VARIANTS:
            skipped += 1
            continue

        # `or` guards against explicit JSON nulls as well as missing fields.
        sp = pb.get("symptom_pattern") or {}
        current_alerts: list[str] = list(sp.get("alert_names") or [])

        # Keep existing entries first, append new ones, drop duplicates.
        merged = list(dict.fromkeys(current_alerts + VARIANTS[pb_name]))
        if merged == current_alerts:
            print(f"  {pb_name}: already up to date, skip")
            skipped += 1
            continue

        sp["alert_names"] = merged
        pb["symptom_pattern"] = sp

        # Fall back to the key suffix when the document carries no id; only
        # the leading prefix is stripped.
        pb_id = pb.get("playbook_id") or key.removeprefix(PLAYBOOK_KEY_PREFIX)

        # Write the playbook and its index entries in one MULTI/EXEC so the
        # document and the alert index cannot be left half-updated.
        pipe = r.pipeline(transaction=True)
        pipe.set(key, json.dumps(pb, ensure_ascii=False), ex=PLAYBOOK_TTL_SECONDS)
        for alert_name in merged:
            idx_key = f"{PLAYBOOK_INDEX_ALERT_PREFIX}{alert_name}"
            pipe.sadd(idx_key, pb_id)
            pipe.expire(idx_key, PLAYBOOK_TTL_SECONDS)
        pipe.execute()

        already_present = set(current_alerts)
        added = [a for a in merged if a not in already_present]
        print(f"  {pb_name}: added {added}")
        updated += 1

    print(f"\nDone: {updated} updated, {skipped} skipped")

    # Spot-check that representative new alertnames resolve to a playbook.
    print("\nVerification:")
    for check_alert in (
        "HostHighCpuLoad",
        "KubernetesPodCrashLooping",
        "NodeMemoryUsageHigh",
        "HighMemoryUsage",
        "KubernetesReplicasMismatch",
    ):
        idx_key = f"{PLAYBOOK_INDEX_ALERT_PREFIX}{check_alert}"
        members = sorted(_as_text(m) for m in r.smembers(idx_key))
        # Explicit markers: the two outcomes must be distinguishable in logs.
        status = "OK" if members else "MISSING"
        print(f"  {status} {check_alert} -> {members}")
if __name__ == "__main__":
    # Script entry point: connect, verify the connection, run the update.
    r = get_redis_client()
    try:
        r.ping()
    except Exception as e:  # top-level boundary: report any failure and exit
        # Diagnostics go to stderr so they are not mixed into the report.
        print(f"Redis connection failed: {e}", file=sys.stderr)
        sys.exit(1)

    # NOTE(review): the URL is printed verbatim, including any credentials
    # embedded in AWOOOI_REDIS_URL. The fallback literal must stay in sync
    # with the default used by get_redis_client().
    print(f"Redis connected: {os.environ.get('AWOOOI_REDIS_URL', 'redis://192.168.0.188:6380/10')}\n")
    try:
        update_playbooks(r)
    finally:
        # Release the connection pool even if the update raises.
        r.close()