Files
awoooi/ops/scripts/alert_chain_smoke_test.py
OG T a5a6bd3408 feat(monitoring): K8s alert rules + Grafana dashboards + ops 腳本
- k8s/monitoring/alert-chain-monitor.yaml
- k8s/monitoring/database-alerts.yaml
- ops/grafana/ Grafana dashboards
- ops/signoz/ SignOz 配置
- ops/scripts/ 維運腳本

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-29 16:04:14 +08:00

268 lines
8.2 KiB
Python
Executable File

#!/usr/bin/env python3
"""
AWOOOI alert chain E2E smoke test
=================================
ADR-037 Wave A.6: end-to-end verification script for the alert chain.

Usage:
    python ops/scripts/alert_chain_smoke_test.py

Checks performed (run sequentially by ``main``):
    1. Health endpoint is reachable
    2. Alertmanager webhook is reachable
    3. Sentry webhook is reachable
    4. SignOz webhook is reachable
    5. SignOz webhook health endpoint is reachable
    6. Telegram connectivity

Version: v1.0
Created: 2026-03-29 (Taipei time zone)
Author: Claude Code (Phase 21 ADR-037)
"""
import asyncio
import os
import sys
from datetime import datetime, timezone

import httpx
# Base URL of the API under test. The in-cluster service address is the
# default; set the AWOOOI_API_BASE environment variable to override it
# (for example AWOOOI_API_BASE=http://localhost:8000 for a local run).
API_BASE = os.environ.get(
    "AWOOOI_API_BASE",
    "http://awoooi-api.awoooi-prod.svc.cluster.local:8000",
)

# Timeout handed to every httpx.AsyncClient created by the checks below
# (httpx interprets a bare number as seconds).
TIMEOUT = 30
async def test_health_endpoint() -> bool:
    """Check that the API health endpoint answers with HTTP 200.

    Returns:
        True when ``GET {API_BASE}/api/v1/health`` responds with status 200;
        False for any other status code or when the request raises.
    """
    print("1. Testing Health Endpoint...")
    health_url = f"{API_BASE}/api/v1/health"
    async with httpx.AsyncClient(timeout=TIMEOUT) as http:
        try:
            reply = await http.get(health_url)
            # Guard clause: anything other than 200 is a failure.
            if reply.status_code != 200:
                print(f" ❌ Health: HTTP {reply.status_code}")
                return False
            print(" ✅ Health: OK")
            return True
        except Exception as exc:
            print(f" ❌ Health: {exc}")
            return False
async def test_alertmanager_webhook() -> bool:
    """Post a synthetic firing alert to the Alertmanager webhook endpoint.

    The payload contains a single alert named ``E2E_SMOKE_TEST`` with
    severity ``info`` so it can be recognised as test traffic.

    Returns:
        True when ``POST {API_BASE}/api/v1/webhooks/alertmanager`` answers
        HTTP 200; False for any other status code (the first 200 characters
        of the response body are printed) or when the request raises.
    """
    print("2. Testing Alertmanager Webhook...")
    # The trailing "Z" designates UTC, so the instant must be taken in UTC.
    # Appending "Z" to a naive local time would shift the alert start by the
    # local UTC offset.
    starts_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    test_payload = {
        "version": "4",
        "status": "firing",
        "alerts": [{
            "status": "firing",
            "labels": {
                "alertname": "E2E_SMOKE_TEST",
                "severity": "info",
                "service": "smoke-test",
                "namespace": "test",
            },
            "annotations": {
                "summary": "E2E Smoke Test - Please Ignore",
                "description": f"Auto test @ {datetime.now().isoformat()}",
            },
            "startsAt": starts_at,
        }]
    }
    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        try:
            response = await client.post(
                f"{API_BASE}/api/v1/webhooks/alertmanager",
                json=test_payload,
            )
            if response.status_code == 200:
                print(" ✅ Alertmanager Webhook: OK")
                return True
            print(f" ❌ Alertmanager Webhook: HTTP {response.status_code}")
            # A short excerpt of the body is enough to diagnose a rejection.
            print(f" Response: {response.text[:200]}")
            return False
        except Exception as e:
            print(f" ❌ Alertmanager Webhook: {e}")
            return False
async def test_sentry_webhook() -> bool:
    """Post a synthetic Sentry issue event to the Sentry webhook endpoint.

    Returns:
        True when ``POST {API_BASE}/api/v1/webhooks/sentry/error`` answers
        HTTP 200 with a JSON body whose ``status`` is ``accepted``,
        ``deduplicated`` or ``ignored``; False for any other status code,
        any other ``status`` value, or when the request or JSON decoding
        raises.
    """
    print("3. Testing Sentry Webhook...")
    # A unique issue id per run keeps the event from being deduplicated.
    test_id = f"smoke-test-{datetime.now().strftime('%Y%m%d%H%M%S%f')}"
    test_payload = {
        "action": "triggered",
        "data": {
            "issue": {
                "id": test_id,
                "title": "E2E Smoke Test Error",
                "level": "info",  # info level avoids triggering a real alert
                "culprit": "smoke_test.py:test",
                "project": {"slug": "awoooi-api"},
                "firstSeen": datetime.now().isoformat(),
                "count": 1,
            },
            "event": {
                "message": "E2E Smoke Test - Please Ignore",
                "platform": "python",
            },
        },
    }
    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        try:
            response = await client.post(
                f"{API_BASE}/api/v1/webhooks/sentry/error",
                json=test_payload,
            )
            if response.status_code != 200:
                print(f" ❌ Sentry Webhook: HTTP {response.status_code}")
                print(f" Response: {response.text[:200]}")
                return False
            result = response.json()
            status = result.get("status")
            if status in ("accepted", "deduplicated", "ignored"):
                print(f" ✅ Sentry Webhook: OK (status={status})")
                return True
            # HTTP 200 but an unrecognised application-level status: report
            # that status, not a misleading "HTTP 200" failure.
            print(f" ❌ Sentry Webhook: unexpected status={status!r}")
            print(f" Response: {response.text[:200]}")
            return False
        except Exception as e:
            print(f" ❌ Sentry Webhook: {e}")
            return False
async def test_signoz_webhook() -> bool:
    """Post a synthetic firing alert to the SignOz webhook endpoint.

    Returns:
        True when ``POST {API_BASE}/api/v1/webhooks/signoz/alert`` answers
        HTTP 200 with a JSON body whose ``status`` is ``ok``; False for any
        other status code, any other ``status`` value, or when the request
        or JSON decoding raises.
    """
    print("4. Testing SignOz Webhook...")
    # The trailing "Z" designates UTC, so the instant must be taken in UTC.
    # Appending "Z" to a naive local time would shift the alert start by the
    # local UTC offset.
    starts_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    test_payload = {
        "alertname": "E2E_SMOKE_TEST",
        "status": "firing",
        "labels": {
            "alertname": "E2E_SMOKE_TEST",
            "severity": "info",
            "service_name": "smoke-test",
            "source": "signoz",
        },
        "annotations": {
            "summary": "E2E Smoke Test - Please Ignore",
            "description": f"Auto test @ {datetime.now().isoformat()}",
        },
        "startsAt": starts_at,
    }
    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        try:
            response = await client.post(
                f"{API_BASE}/api/v1/webhooks/signoz/alert",
                json=test_payload,
            )
            if response.status_code != 200:
                print(f" ❌ SignOz Webhook: HTTP {response.status_code}")
                print(f" Response: {response.text[:200]}")
                return False
            result = response.json()
            status = result.get("status")
            if status == "ok":
                print(" ✅ SignOz Webhook: OK")
                return True
            # HTTP 200 but the handler did not acknowledge with "ok": report
            # that status, not a misleading "HTTP 200" failure.
            print(f" ❌ SignOz Webhook: unexpected status={status!r}")
            print(f" Response: {response.text[:200]}")
            return False
        except Exception as e:
            print(f" ❌ SignOz Webhook: {e}")
            return False
async def test_signoz_health() -> bool:
    """Check that the SignOz webhook health endpoint answers with HTTP 200.

    Returns:
        True when ``GET {API_BASE}/api/v1/webhooks/signoz/health`` responds
        with status 200; False for any other status code or when the request
        raises.
    """
    print("5. Testing SignOz Webhook Health...")
    health_url = f"{API_BASE}/api/v1/webhooks/signoz/health"
    async with httpx.AsyncClient(timeout=TIMEOUT) as http:
        try:
            reply = await http.get(health_url)
            # Guard clause: anything other than 200 is a failure.
            if reply.status_code != 200:
                print(f" ❌ SignOz Health: HTTP {reply.status_code}")
                return False
            print(" ✅ SignOz Health: OK")
            return True
        except Exception as exc:
            print(f" ❌ SignOz Health: {exc}")
            return False
async def test_telegram_connectivity() -> bool:
    """Probe the Telegram status endpoint without failing the run on its absence.

    This check is lenient: only an HTTP status other than 200 or 404 counts
    as a failure.

    Returns:
        True when ``GET {API_BASE}/api/v1/telegram/status`` answers 200
        (whether or not the body reports ``connected``), answers 404, or the
        request raises; False for any other status code.
    """
    print("6. Testing Telegram Connectivity...")
    status_url = f"{API_BASE}/api/v1/telegram/status"
    async with httpx.AsyncClient(timeout=TIMEOUT) as http:
        try:
            reply = await http.get(status_url)
            code = reply.status_code
            if code == 404:
                # A missing endpoint does not affect the overall result.
                print(" ⚠️ Telegram: Endpoint not found (skipped)")
                return True
            if code != 200:
                print(f" ❌ Telegram: HTTP {code}")
                return False
            body = reply.json()
            if body.get("connected"):
                print(" ✅ Telegram: Connected")
            else:
                # A reachable endpoint is sufficient for this check.
                print(" ⚠️ Telegram: Not Connected (endpoint reachable)")
            return True
        except Exception as exc:
            # Errors here are reported but do not affect the overall result.
            print(f" ⚠️ Telegram: {exc} (skipped)")
            return True
async def main():
    """Run every smoke test in order, print a summary and exit.

    Exits the process with status 0 when all checks return True and with
    status 1 otherwise.
    """
    rule = "=" * 60
    started = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(rule)
    print(" AWOOOI Alert Chain E2E Smoke Test")
    print(f" Time: {started}")
    print(f" Target: {API_BASE}")
    print(rule)
    print()

    # Checks run one after another, not concurrently, so their output stays
    # in order and is easy to debug.
    checks = (
        test_health_endpoint,
        test_alertmanager_webhook,
        test_sentry_webhook,
        test_signoz_webhook,
        test_signoz_health,
        test_telegram_connectivity,
    )
    outcomes = []
    for check in checks:
        outcomes.append(await check())

    print()
    print(rule)
    total = len(outcomes)
    passed = sum(outcomes)
    if passed != total:
        print(f" ❌ FAILED ({total - passed}/{total} tests failed)")
        print(rule)
        sys.exit(1)
    print(f" ✅ ALL PASSED ({passed}/{total})")
    print(rule)
    sys.exit(0)
# Run the smoke tests only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())