Files
awoooi/scripts/cold_start_playbooks.py
Your Name 8c4dc7a5a8
Some checks failed
Code Review / ai-code-review (push) Successful in 10s
CD Pipeline / tests (push) Successful in 1m5s
CD Pipeline / build-and-deploy (push) Failing after 10m6s
CD Pipeline / post-deploy-checks (push) Has been skipped
chore(rls): 新增 manual script gate 與 canary wave1
2026-05-12 20:23:27 +08:00

313 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
cold_start_playbooks.py — ADR-073 Phase 1 Step 8
飛輪冷啟動:預填 15+ 個基礎 Playbook,涵蓋最常見告警類型。
資料來源:
1. EXECUTION_SUCCESS 記錄(2 筆)
2. 已知告警類型的標準修復模板
執行方式:
python3 scripts/cold_start_playbooks.py --dry-run # 預覽
python3 scripts/cold_start_playbooks.py # 實際寫入
2026-04-12 Claude Sonnet 4.6 (ADR-073 Phase 1)
"""
import asyncio
import asyncpg
import json
import os
import sys
import uuid
from datetime import datetime, timezone
# Preview mode: enabled when "--dry-run" appears anywhere on the command line.
DRY_RUN = "--dry-run" in sys.argv

# Connection string taken from the environment, with the "postgresql+asyncpg://"
# scheme rewritten to plain "postgresql://".
# NOTE(review): presumably the variable is shared with a SQLAlchemy-style config
# while asyncpg.connect() wants a plain DSN — confirm.
DATABASE_URL = os.getenv("DATABASE_URL", "").replace(
    "postgresql+asyncpg://", "postgresql://"
)

# Project identifier; main() publishes it to the DB session as app.project_id.
PROJECT_ID = os.getenv("AWOOOP_PROJECT_ID", "awoooi")

# Abort at import time when no database URL is configured.
if DATABASE_URL == "":
    print("ERROR: DATABASE_URL 未設定")
    sys.exit(1)
# Asia/Taipei time (UTC+8)
def now_taipei():
    """Return the current time as a timezone-aware datetime at a fixed UTC+8 offset."""
    from datetime import timedelta

    taipei_offset = timedelta(hours=8)
    return datetime.now(tz=timezone(taipei_offset))
# Seed playbooks written by main(). Each entry is a dict with:
#   name / description   - stored as-is in the playbooks row
#   alert_type, category - descriptive only; category is shown in the dry-run
#                          listing, neither is written by main()'s INSERT
#   repair_steps         - ordered list of {"command", "purpose"}, stored as JSONB
#   estimated_minutes    - stored as estimated_duration_minutes
#   symptom_alertnames,
#   severity_range       - combined into the symptom_pattern JSONB document
#   source_approval_id   - optional; marks templates taken from an
#                          EXECUTION_SUCCESS record (not written by main())
# Commands contain {target}/{namespace}/{host}/{deployment} placeholders.
# NOTE(review): presumably substituted by whatever executes the playbook —
# nothing in this file fills them in.
PLAYBOOK_TEMPLATES = [
    {
        "name": "Kubernetes Pod CrashLoopBackOff 修復",
        "alert_type": "KubePodCrashLooping",
        "description": "Pod 持續崩潰重啟,透過 rollout restart 觸發重新部署",
        "category": "kubernetes",
        "repair_steps": [
            {"command": "kubectl get pod {target} -n {namespace} -o yaml", "purpose": "查看 Pod 狀態和事件"},
            {"command": "kubectl logs {target} -n {namespace} --previous --tail=100", "purpose": "查看崩潰前日誌"},
            {"command": "kubectl rollout restart deployment/{deployment} -n {namespace}", "purpose": "重新部署修復"},
        ],
        "estimated_minutes": 5,
        "symptom_alertnames": ["KubePodCrashLooping", "KubePodNotReady"],
        "severity_range": ["P1", "P2"],
    },
    {
        "name": "Kubernetes Deployment 重新啟動",
        "alert_type": "KubeDeploymentReplicasMismatch",
        "description": "Deployment replicas 不符預期,執行 rollout restart 恢復",
        "category": "kubernetes",
        "repair_steps": [
            {"command": "kubectl get deployment {target} -n {namespace}", "purpose": "確認 Deployment 狀態"},
            {"command": "kubectl rollout restart deployment/{target} -n {namespace}", "purpose": "重新部署"},
            {"command": "kubectl rollout status deployment/{target} -n {namespace} --timeout=120s", "purpose": "等待部署完成"},
        ],
        "estimated_minutes": 5,
        "symptom_alertnames": ["KubeDeploymentReplicasMismatch", "KubeDeploymentGenerationMismatch"],
        "severity_range": ["P1", "P2"],
    },
    {
        "name": "AWOOOI API 服務重啟",
        "alert_type": "KubePodCrashLooping",
        "description": "awoooi-api 服務異常,重新啟動 deployment",
        "category": "kubernetes",
        "repair_steps": [
            {"command": "kubectl rollout restart deployment/awoooi-api -n awoooi-prod", "purpose": "重啟 API 服務"},
            {"command": "kubectl rollout status deployment/awoooi-api -n awoooi-prod --timeout=120s", "purpose": "等待就緒"},
        ],
        "estimated_minutes": 3,
        "symptom_alertnames": ["KubePodCrashLooping"],
        "severity_range": ["P1"],
        "source_approval_id": "003e3eb2-7e58-47fd-bd24-c9e1fdf6cb1a",  # from an EXECUTION_SUCCESS record
    },
    {
        "name": "AWOOOI Worker 服務重啟",
        "alert_type": "KubePodCrashLooping",
        "description": "awoooi-worker 服務異常,重新啟動 deployment",
        "category": "kubernetes",
        "repair_steps": [
            {"command": "kubectl rollout restart deployment/awoooi-worker -n awoooi-prod", "purpose": "重啟 Worker"},
            {"command": "kubectl rollout status deployment/awoooi-worker -n awoooi-prod --timeout=120s", "purpose": "等待就緒"},
        ],
        "estimated_minutes": 3,
        "symptom_alertnames": ["KubePodCrashLooping"],
        "severity_range": ["P1"],
        "source_approval_id": "6d2449ec-fd98-4c3a-a559-62247838efca",  # from an EXECUTION_SUCCESS record
    },
    {
        "name": "節點 CPU 使用率過高 — 告警確認",
        "alert_type": "HostHighCpuLoad",
        "description": "主機 CPU 長時間高使用率,確認原因並通知人工處理",
        "category": "infrastructure",
        "repair_steps": [
            {"command": "ssh {target} 'top -b -n 1 | head -20'", "purpose": "查看 CPU 消耗 top 進程"},
            {"command": "ssh {target} 'ps aux --sort=-%cpu | head -10'", "purpose": "確認高 CPU 進程"},
        ],
        "estimated_minutes": 10,
        "symptom_alertnames": ["HostHighCpuLoad", "NodeHighCpuUsage"],
        "severity_range": ["P2", "P3"],
    },
    {
        "name": "節點記憶體不足 — 清理 Cache",
        "alert_type": "HostOutOfMemory",
        "description": "主機記憶體不足,清理 Page Cache 釋放空間",
        "category": "infrastructure",
        "repair_steps": [
            {"command": "ssh {target} 'free -h'", "purpose": "確認記憶體狀況"},
            {"command": "ssh {target} 'sudo sync && sudo sysctl vm.drop_caches=3'", "purpose": "清理 Page Cache"},
            {"command": "ssh {target} 'free -h'", "purpose": "確認記憶體已釋放"},
        ],
        "estimated_minutes": 5,
        "symptom_alertnames": ["HostOutOfMemory", "NodeMemoryPressure"],
        "severity_range": ["P1", "P2"],
    },
    {
        "name": "磁碟使用率過高 — 清理舊日誌",
        "alert_type": "HostOutOfDiskSpace",
        "description": "磁碟空間不足,清理舊日誌和臨時文件",
        "category": "infrastructure",
        "repair_steps": [
            {"command": "ssh {target} 'df -h'", "purpose": "確認磁碟使用狀況"},
            {"command": "ssh {target} 'sudo journalctl --vacuum-time=7d'", "purpose": "清理 7 天前的系統日誌"},
            # Deliberately no --volumes: pruning volumes can delete the data of a
            # service whose container happens to be stopped (e.g. a crashed
            # database container on a full disk).
            # NOTE(review): `docker system prune` still removes stopped
            # containers themselves — confirm that is acceptable on these hosts.
            {"command": "ssh {target} 'docker system prune -f 2>/dev/null || true'", "purpose": "清理 Docker 未用資源"},
            {"command": "ssh {target} 'df -h'", "purpose": "確認空間已釋放"},
        ],
        "estimated_minutes": 10,
        "symptom_alertnames": ["HostOutOfDiskSpace", "NodeDiskPressure"],
        "severity_range": ["P1", "P2"],
    },
    {
        "name": "Docker 容器異常 — 重啟",
        "alert_type": "DockerContainerUnhealthy",
        "description": "Docker 容器健康檢查失敗,重啟恢復服務",
        "category": "infrastructure",
        "repair_steps": [
            {"command": "ssh {host} 'docker ps -a | grep {target}'", "purpose": "確認容器狀態"},
            {"command": "ssh {host} 'docker logs {target} --tail=50'", "purpose": "查看容器日誌"},
            {"command": "ssh {host} 'docker restart {target}'", "purpose": "重啟容器"},
        ],
        "estimated_minutes": 5,
        "symptom_alertnames": ["DockerContainerUnhealthy", "DockerContainerOOMKilled"],
        "severity_range": ["P1", "P2"],
    },
    {
        "name": "Docker 容器停止 — 啟動",
        "alert_type": "DockerContainerNotRunning",
        "description": "Docker 容器意外停止,重新啟動服務",
        "category": "infrastructure",
        "repair_steps": [
            {"command": "ssh {host} 'docker ps -a | grep {target}'", "purpose": "確認容器狀態"},
            {"command": "ssh {host} 'docker start {target}'", "purpose": "啟動容器"},
            {"command": "ssh {host} 'docker ps | grep {target}'", "purpose": "確認容器已運行"},
        ],
        "estimated_minutes": 3,
        "symptom_alertnames": ["DockerContainerNotRunning", "DockerContainerStopped"],
        "severity_range": ["P1", "P2"],
    },
    {
        "name": "PostgreSQL 服務恢復",
        "alert_type": "PostgreSQLDown",
        "description": "PostgreSQL 服務下線,嘗試重啟恢復",
        "category": "database",
        "repair_steps": [
            {"command": "ssh {host} 'docker ps | grep postgres'", "purpose": "確認 Postgres 容器狀態"},
            {"command": "ssh {host} 'docker restart postgres'", "purpose": "重啟 Postgres"},
            {"command": "ssh {host} 'docker exec postgres pg_isready'", "purpose": "確認 Postgres 已就緒"},
        ],
        "estimated_minutes": 10,
        "symptom_alertnames": ["PostgreSQLDown", "PostgresqlDown"],
        "severity_range": ["P0", "P1"],
    },
    {
        "name": "Redis 服務恢復",
        "alert_type": "RedisDown",
        "description": "Redis 服務下線,嘗試重啟恢復",
        "category": "database",
        "repair_steps": [
            {"command": "ssh {host} 'docker ps | grep redis'", "purpose": "確認 Redis 容器狀態"},
            {"command": "ssh {host} 'docker restart redis'", "purpose": "重啟 Redis"},
            {"command": "ssh {host} 'docker exec redis redis-cli ping'", "purpose": "確認 Redis 已就緒"},
        ],
        "estimated_minutes": 5,
        "symptom_alertnames": ["RedisDown", "RedisMissedSlaves"],
        "severity_range": ["P0", "P1"],
    },
    {
        "name": "K3s 節點 NotReady — 重啟 K3s",
        "alert_type": "KubeNodeNotReady",
        "description": "K3s 節點 NotReady,重啟 k3s service",
        "category": "kubernetes",
        "repair_steps": [
            {"command": "ssh {target} 'sudo systemctl status k3s'", "purpose": "確認 k3s 服務狀態"},
            {"command": "ssh {target} 'sudo systemctl restart k3s'", "purpose": "重啟 k3s"},
            {"command": "kubectl get nodes", "purpose": "確認節點狀態恢復"},
        ],
        "estimated_minutes": 15,
        "symptom_alertnames": ["KubeNodeNotReady", "KubeNodeUnreachable"],
        "severity_range": ["P0", "P1"],
    },
    {
        "name": "SSL 憑證即將到期 — 通知更新",
        "alert_type": "SSLCertExpiringSoon",
        "description": "SSL 憑證即將過期,通知人工更新",
        "category": "security",
        "repair_steps": [
            {"command": "echo 'SSL 憑證即將過期,需人工更新'", "purpose": "記錄問題"},
        ],
        "estimated_minutes": 1,
        "symptom_alertnames": ["SSLCertExpiringSoon"],
        "severity_range": ["P2", "P3"],
    },
    {
        "name": "ArgoCD 同步失敗 — 重試",
        "alert_type": "ArgoCDAppSyncFailed",
        "description": "ArgoCD 應用同步失敗,觸發重新同步",
        "category": "kubernetes",
        "repair_steps": [
            {"command": "argocd app sync {target} --force", "purpose": "強制重新同步"},
            {"command": "argocd app wait {target} --timeout 120", "purpose": "等待同步完成"},
        ],
        "estimated_minutes": 10,
        "symptom_alertnames": ["ArgoCDAppSyncFailed", "ArgoCDAppOutOfSync"],
        "severity_range": ["P1", "P2"],
    },
    {
        "name": "備份失敗 — 手動確認",
        "alert_type": "HostBackupFailed",
        "description": "定期備份失敗,需人工確認備份狀態",
        "category": "operations",
        "repair_steps": [
            {"command": "ssh {target} 'ls -lh /backup/ | tail -10'", "purpose": "確認最近備份文件"},
        ],
        "estimated_minutes": 5,
        "symptom_alertnames": ["HostBackupFailed", "BackupFailed", "VeleroBackupFailed"],
        "severity_range": ["P2"],
    },
]
def _cold_start_playbook_id(name):
    """Return a stable ``PB-COLD-XXXXXXXX`` id for the template called *name*.

    The id is derived with uuid5 from the project id and the template name, so
    running the script again produces the same id for the same template and
    the INSERT's ``ON CONFLICT DO NOTHING`` can recognise the existing row.
    PROJECT_ID is part of the seed so two projects never share an id.
    """
    seed = f"cold_start:{PROJECT_ID}:{name}"
    return f"PB-COLD-{uuid.uuid5(uuid.NAMESPACE_URL, seed).hex[:8].upper()}"


async def main():
    """Seed the ``playbooks`` table from PLAYBOOK_TEMPLATES.

    Connects with DATABASE_URL, sets the session's ``app.project_id`` to
    PROJECT_ID and prints the current row count. With DRY_RUN set it only
    lists the templates. Otherwise it inserts one row per template, reports
    each outcome (written / already present / failed) and prints a summary.

    A failure on one template is reported and does not stop the remaining
    inserts. The connection is closed on every exit path.
    """
    conn = await asyncpg.connect(DATABASE_URL)
    try:
        await conn.execute("SELECT set_config('app.project_id', $1, FALSE)", PROJECT_ID)

        # Current number of playbooks
        current = await conn.fetchval("SELECT count(*) FROM playbooks")
        print(f"當前 playbooks: {current}")

        if DRY_RUN:
            print(f"\n[DRY RUN] 將新增 {len(PLAYBOOK_TEMPLATES)} 個 Playbook:")
            for i, t in enumerate(PLAYBOOK_TEMPLATES, 1):
                print(f" {i:2d}. [{t['category']}] {t['name']}")
            print("\n使用 --dry-run 以外的方式執行以實際寫入")
            return

        ts = now_taipei()
        inserted = 0
        existing = 0
        failed = 0
        for tmpl in PLAYBOOK_TEMPLATES:
            # NOTE(review): assumes playbook_id carries a unique constraint so
            # that ON CONFLICT can dedupe reruns — confirm against the schema.
            playbook_id = _cold_start_playbook_id(tmpl["name"])
            repair_steps = json.dumps(tmpl["repair_steps"])
            symptom_pattern = json.dumps({
                "alertnames": tmpl["symptom_alertnames"],
                "severity_range": tmpl["severity_range"],
            })
            try:
                status = await conn.execute(
                    """
                    INSERT INTO playbooks (
                        playbook_id, name, description, status, source,
                        repair_steps, symptom_pattern,
                        estimated_duration_minutes, ai_confidence,
                        success_count, failure_count,
                        created_at, updated_at
                    ) VALUES (
                        $1, $2, $3, 'active', 'cold_start',
                        $4::jsonb, $5::jsonb,
                        $6, 0.6,
                        0, 0,
                        $7, $7
                    )
                    ON CONFLICT DO NOTHING
                    """,
                    playbook_id,
                    tmpl["name"],
                    tmpl["description"],
                    repair_steps,
                    symptom_pattern,
                    tmpl["estimated_minutes"],
                    ts,
                )
            except Exception as e:
                # Best effort per template: report and continue with the rest.
                failed += 1
                print(f" ❌ 失敗: {tmpl['name']}: {e}")
                continue

            # execute() returns the command status, e.g. "INSERT 0 1"; a row
            # count of 0 means the conflict clause skipped the insert.
            if status.split()[-1] == "0":
                existing += 1
                print(f" ⏭️ 已存在,略過: {tmpl['name']}")
            else:
                inserted += 1
                print(f" ✅ 寫入: {tmpl['name']}")

        final = await conn.fetchval("SELECT count(*) FROM playbooks")
        print(f"\n✅ 完成: 新增 {inserted} 個 Playbook(已存在 {existing} 個,失敗 {failed} 個)")
        print(f" Playbooks 總數: {final}")
    finally:
        await conn.close()


if __name__ == "__main__":
    asyncio.run(main())