Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Phase 1 — affected_services 污染根治
- webhooks.py: _extract_affected_services() 從 labels 精準萃取服務名
(component > job > pod deployment name > clean target_resource > [])
- create_incident_for_approval: alert_labels 完整保留進 Signal
- alert_name 從 alertname 取,不再用 "custom"
Phase 2 — Playbook alertname 變體擴充
- alert_rules.yaml: 5 條規則新增 HostHighCpuLoad、KubePodCrashLooping 等變體
- scripts/update_playbook_alert_variants.py: Redis index 已執行更新 ✅
Phase 3 — Jaccard 通用型 Playbook 豁免
- similarity.py: affected_services=[] → 1.0 豁免(基礎設施 Playbook 不針對特定服務)
- severity_range=[] → 1.0 豁免(適用所有嚴重度)
Phase 4 — Playbook Embedding 持久化(冷啟動修復)
- migrations/flywheel_playbook_embeddings.sql: pgvector 持久化表
- services/playbook_embedding_service.py: 啟動時重建 Redis 向量快取 + 同步 DB
- main.py: lifespan 啟動時 asyncio.create_task 非阻塞執行
2026-04-10 Asia/Taipei — Claude Sonnet 4.6
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
142 lines
4.0 KiB
Python
142 lines
4.0 KiB
Python
"""
Phase 2 flywheel fix: add the missing Playbook alertname variants
=================================================================
Updates symptom_pattern.alert_names of the Playbooks stored in Redis in place
and rebuilds the playbook:index:alert:* indexes.

Usage (run inside the API pod):
    python scripts/update_playbook_alert_variants.py

Or run from a local machine (Redis must be reachable):
    AWOOOI_REDIS_URL=redis://192.168.0.188:6380/10 python scripts/update_playbook_alert_variants.py

2026-04-10 Asia/Taipei — Claude Sonnet 4.6
"""
|
||
|
||
import asyncio
|
||
import json
|
||
import os
|
||
import sys
|
||
|
||
import redis
|
||
|
||
# Additional alertname variants to attach to each Playbook.
# key: Playbook name (compared with the "name" field of the stored Playbook),
# value: alertnames to add to that Playbook's symptom_pattern.alert_names.
VARIANTS: dict[str, list[str]] = {
    "high-cpu-restart": [
        "HighCPUUsage",
        "ContainerCpuUsageSecondsTotal",
        "HostHighCpuLoad",
        "NodeCPUUsageHigh",
        "CPUThrottlingHigh",
        "KubeCPUOvercommit",
    ],
    "crashloop-pod-delete": [
        "KubePodCrashLooping",
        "PodCrashLoopBackOff",
        "KubernetesPodCrashLooping",
    ],
    "oom-killed-pod-delete": [
        "PodOOMKilled",
        "KubePodOOMKilled",
        "KubernetesMemoryPressure",
        "NodeMemoryUsageHigh",
        "HighMemoryUsage",
    ],
    "k8s-pod-not-ready-restart": [
        "KubePodNotReady",
        "PodNotReady",
        "KubernetesDeploymentReplicasMismatch",
    ],
    "insufficient-replicas-scale": [
        "KubeDeploymentReplicasMismatch",
        "InsufficientReplicas",
        "KubernetesReplicasMismatch",
    ],
}

# Prefix of the Redis keys holding the Playbook JSON documents.
PLAYBOOK_KEY_PREFIX = "playbook:"
# Prefix of the Redis sets that map one alertname to Playbook ids.
PLAYBOOK_INDEX_ALERT_PREFIX = "playbook:index:alert:"
# TTL applied to every key this script writes.
PLAYBOOK_TTL_SECONDS = 86400 * 30  # 30 days
|
||
|
||
|
||
def get_redis_client() -> redis.Redis:
    """Build a Redis client for the Playbook store.

    The connection URL is read from the ``AWOOOI_REDIS_URL`` environment
    variable; when the variable is not set, the hard-coded default
    ``redis://192.168.0.188:6380/10`` is used.

    Returns:
        The client created by ``redis.Redis.from_url`` for that URL.
    """
    url = os.environ.get("AWOOOI_REDIS_URL", "redis://192.168.0.188:6380/10")
    return redis.Redis.from_url(url)
|
||
|
||
|
||
def update_playbooks(r: redis.Redis) -> None:
    """Merge the VARIANTS alertnames into the Playbooks stored in Redis.

    Every key matching ``playbook:PB-*`` is read as a JSON document. When the
    document's ``name`` is a key of ``VARIANTS``, the configured alertnames are
    appended to ``symptom_pattern.alert_names`` (existing entries keep their
    order, duplicates are dropped). A changed Playbook is written back and its
    id is added to the ``playbook:index:alert:<alertname>`` set of every merged
    alertname; the Playbook key and each touched index key get a TTL of
    ``PLAYBOOK_TTL_SECONDS``. Playbooks whose alert list is already complete
    are neither rewritten nor re-indexed.

    Progress, a summary and a short index verification are printed to stdout.

    Args:
        r: Redis client to operate on. Clients returning ``bytes`` and clients
            created with ``decode_responses=True`` are both accepted.

    Raises:
        Errors raised by the Redis client are not caught here.
    """
    # SCAN instead of KEYS so the server is not blocked while it walks the
    # keyspace. SCAN may report a key more than once, hence the set; sorting
    # keeps the output order stable between runs.
    pattern = f"{PLAYBOOK_KEY_PREFIX}PB-*"
    all_keys = sorted({_as_text(k) for k in r.scan_iter(match=pattern)})
    print(f"Found {len(all_keys)} playbook keys in Redis")

    updated = 0
    skipped = 0

    for key in all_keys:
        raw = r.get(key)
        if not raw:
            # Key disappeared (or is empty) between the scan and the read.
            continue

        try:
            pb = json.loads(raw)
        except json.JSONDecodeError as exc:
            # One corrupt document must not abort the whole run.
            print(f"  {key}: invalid JSON, skip ({exc})")
            skipped += 1
            continue
        if not isinstance(pb, dict):
            print(f"  {key}: not a JSON object, skip")
            skipped += 1
            continue

        pb_name = pb.get("name", "")
        if pb_name not in VARIANTS:
            skipped += 1
            continue

        target_alerts = VARIANTS[pb_name]
        # "or" also covers fields that are present but stored as JSON null.
        sp = pb.get("symptom_pattern") or {}
        current_alerts: list[str] = sp.get("alert_names") or []

        # Merge: keep the existing names first, append the new ones, and drop
        # duplicates while preserving order.
        merged = list(dict.fromkeys(current_alerts + target_alerts))

        if merged == current_alerts:
            print(f"  {pb_name}: already up to date, skip")
            skipped += 1
            continue

        sp["alert_names"] = merged
        pb["symptom_pattern"] = sp

        # Strip only the leading prefix; str.replace would also remove any
        # later occurrence of "playbook:" inside the id.
        pb_id = pb.get("playbook_id", key.removeprefix(PLAYBOOK_KEY_PREFIX))

        # Write the document and its index entries in one MULTI/EXEC
        # transaction. A run that wrote the document but failed before
        # indexing could not be repaired by re-running the script, because the
        # Playbook would then be reported as "already up to date".
        with r.pipeline() as pipe:
            pipe.set(key, json.dumps(pb, ensure_ascii=False), ex=PLAYBOOK_TTL_SECONDS)
            for alert_name in merged:
                idx_key = f"{PLAYBOOK_INDEX_ALERT_PREFIX}{alert_name}"
                pipe.sadd(idx_key, pb_id)
                pipe.expire(idx_key, PLAYBOOK_TTL_SECONDS)
            pipe.execute()

        already_present = set(current_alerts)
        added = [a for a in merged if a not in already_present]
        print(f"  {pb_name}: added {added}")
        updated += 1

    print(f"\nDone: {updated} updated, {skipped} skipped")

    # Verification: spot-check that a few of the new alertnames now resolve
    # to at least one Playbook id through the index.
    print("\nVerification:")
    for check_alert in [
        "HostHighCpuLoad", "KubernetesPodCrashLooping",
        "NodeMemoryUsageHigh", "HighMemoryUsage",
        "KubernetesReplicasMismatch",
    ]:
        idx_key = f"{PLAYBOOK_INDEX_ALERT_PREFIX}{check_alert}"
        members = sorted(_as_text(m) for m in r.smembers(idx_key))
        status = "✅" if members else "❌"
        print(f"  {status} {check_alert} → {members}")


def _as_text(value) -> str:
    """Return a Redis reply as str, decoding it when the client returned bytes."""
    return value.decode() if isinstance(value, bytes) else value
|
||
|
||
|
||
if __name__ == "__main__":
    # Script entry point: connect, check connectivity, then run the update.
    # The URL is resolved once, with the same variable and default that
    # get_redis_client() uses, so the banner names the actual target.
    redis_url = os.environ.get("AWOOOI_REDIS_URL", "redis://192.168.0.188:6380/10")
    r = get_redis_client()
    try:
        # ping() serves as the connectivity check; any failure aborts the
        # script with exit code 1 before a Playbook is touched.
        r.ping()
        # NOTE(review): the URL is echoed verbatim; if it ever carries
        # credentials they will appear in the output.
        print(f"Redis connected: {redis_url}\n")
    except Exception as e:
        print(f"Redis connection failed: {e}")
        sys.exit(1)

    update_playbooks(r)
|