""" Phase 2 飛輪修復:補齊 Playbook alertname 變體 ================================================= 直接更新 Redis 裡的 Playbook symptom_pattern.alert_names, 並重建 playbook:index:alert:* 索引。 用法(在 API pod 內執行): python scripts/update_playbook_alert_variants.py 或從本機執行(需能連 Redis): AWOOOI_REDIS_URL=redis://192.168.0.188:6380/10 python scripts/update_playbook_alert_variants.py 2026-04-10 Asia/Taipei — Claude Sonnet 4.6 """ import asyncio import json import os import sys import redis # Playbook 補充的 alertname 變體 # key: playbook name (用於搜尋), value: 新增的 alertname list VARIANTS: dict[str, list[str]] = { "high-cpu-restart": [ "HighCPUUsage", "ContainerCpuUsageSecondsTotal", "HostHighCpuLoad", "NodeCPUUsageHigh", "CPUThrottlingHigh", "KubeCPUOvercommit", ], "crashloop-pod-delete": [ "KubePodCrashLooping", "PodCrashLoopBackOff", "KubernetesPodCrashLooping", ], "oom-killed-pod-delete": [ "PodOOMKilled", "KubePodOOMKilled", "KubernetesMemoryPressure", "NodeMemoryUsageHigh", "HighMemoryUsage", ], "k8s-pod-not-ready-restart": [ "KubePodNotReady", "PodNotReady", "KubernetesDeploymentReplicasMismatch", ], "insufficient-replicas-scale": [ "KubeDeploymentReplicasMismatch", "InsufficientReplicas", "KubernetesReplicasMismatch", ], } PLAYBOOK_KEY_PREFIX = "playbook:" PLAYBOOK_INDEX_ALERT_PREFIX = "playbook:index:alert:" PLAYBOOK_TTL_SECONDS = 86400 * 30 # 30 天 def get_redis_client() -> redis.Redis: url = os.environ.get("AWOOOI_REDIS_URL", "redis://192.168.0.188:6380/10") return redis.Redis.from_url(url) def update_playbooks(r: redis.Redis) -> None: # 掃描所有 Playbook keys all_keys = [k.decode() for k in r.keys(f"{PLAYBOOK_KEY_PREFIX}PB-*")] print(f"Found {len(all_keys)} playbook keys in Redis") updated = 0 skipped = 0 for key in all_keys: raw = r.get(key) if not raw: continue pb = json.loads(raw) pb_name = pb.get("name", "") if pb_name not in VARIANTS: skipped += 1 continue target_alerts = VARIANTS[pb_name] sp = pb.get("symptom_pattern", {}) current_alerts: list[str] = sp.get("alert_names", []) # 合併(保留現有 + 加入新的,去重) merged = list(dict.fromkeys(current_alerts + target_alerts)) if merged == current_alerts: print(f" {pb_name}: already up to date, skip") skipped += 1 continue sp["alert_names"] = merged pb["symptom_pattern"] = sp # 寫回 Redis r.set(key, json.dumps(pb, ensure_ascii=False), ex=PLAYBOOK_TTL_SECONDS) # 重建 alert index pb_id = pb.get("playbook_id", key.replace(PLAYBOOK_KEY_PREFIX, "")) for alert_name in merged: idx_key = f"{PLAYBOOK_INDEX_ALERT_PREFIX}{alert_name}" r.sadd(idx_key, pb_id) r.expire(idx_key, PLAYBOOK_TTL_SECONDS) added = [a for a in merged if a not in current_alerts] print(f" {pb_name}: added {added}") updated += 1 print(f"\nDone: {updated} updated, {skipped} skipped") # 驗證 print("\nVerification:") for check_alert in [ "HostHighCpuLoad", "KubernetesPodCrashLooping", "NodeMemoryUsageHigh", "HighMemoryUsage", "KubernetesReplicasMismatch", ]: idx_key = f"{PLAYBOOK_INDEX_ALERT_PREFIX}{check_alert}" members = [m.decode() for m in r.smembers(idx_key)] status = "✅" if members else "❌" print(f" {status} {check_alert} → {members}") if __name__ == "__main__": r = get_redis_client() try: r.ping() print(f"Redis connected: {os.environ.get('AWOOOI_REDIS_URL', 'redis://192.168.0.188:6380/10')}\n") except Exception as e: print(f"Redis connection failed: {e}") sys.exit(1) update_playbooks(r)