- k8s/monitoring/alert-chain-monitor.yaml - k8s/monitoring/database-alerts.yaml - ops/grafana/ Grafana dashboards - ops/signoz/ SignOz 配置 - ops/scripts/ 維運腳本 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
268 lines
8.2 KiB
Python
Executable File
268 lines
8.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
AWOOOI alert chain E2E smoke test
=================================

ADR-037 Wave A.6: end-to-end verification script for the alert chain.

Usage:
    python ops/scripts/alert_chain_smoke_test.py

Checks performed:
    1. Health endpoint is reachable
    2. Alertmanager webhook is reachable
    3. Sentry webhook is reachable
    4. SignOz webhook is reachable
    5. SignOz webhook health endpoint is reachable
    6. Telegram connectivity

Version: v1.0
Created: 2026-03-29 (Taipei time zone)
Author: Claude Code (Phase 21 ADR-037)
"""
|
|
|
|
import asyncio
|
|
import os
|
|
import sys
|
|
from datetime import datetime
|
|
|
|
import httpx
|
|
|
|
# Base URL of the API under test; override with the AWOOOI_API_BASE
# environment variable.
API_BASE = os.environ.get(
    "AWOOOI_API_BASE",
    "http://awoooi-api.awoooi-prod.svc.cluster.local:8000",
)

# For local testing:
# API_BASE = "http://localhost:8000"

# Timeout passed to every httpx.AsyncClient created by the checks.
TIMEOUT = 30
|
|
|
|
|
|
async def test_health_endpoint() -> bool:
    """Check that the API health endpoint answers with HTTP 200.

    Sends ``GET {API_BASE}/api/v1/health`` and prints a one-line verdict.

    Returns:
        True when the response status is 200; False for any other status
        or when the request raises.
    """
    print("1. Testing Health Endpoint...")

    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        try:
            response = await client.get(f"{API_BASE}/api/v1/health")
            # Guard clause: anything other than 200 is a failure.
            if response.status_code != 200:
                print(f" ❌ Health: HTTP {response.status_code}")
                return False
            print(" ✅ Health: OK")
            return True
        except Exception as e:
            # Connection errors, timeouts, etc. are reported as a failed check.
            print(f" ❌ Health: {e}")
            return False
|
|
|
|
|
|
async def test_alertmanager_webhook() -> bool:
    """Post a synthetic Alertmanager notification to the webhook endpoint.

    Sends a single ``firing`` alert named ``E2E_SMOKE_TEST`` with severity
    ``info`` to ``POST {API_BASE}/api/v1/webhooks/alertmanager`` and prints
    a one-line verdict (plus the first 200 characters of the response body
    on a non-200 status).

    Returns:
        True when the response status is 200; False for any other status
        or when the request raises.
    """
    # Local import: the file-level import only brings in ``datetime``.
    from datetime import timezone

    print("2. Testing Alertmanager Webhook...")

    # A trailing "Z" declares the timestamp to be UTC, so the clock must be
    # read in UTC. Appending "Z" to naive local time (as was done before)
    # shifts the alert start by the local UTC offset.
    starts_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    test_payload = {
        "version": "4",
        "status": "firing",
        "alerts": [{
            "status": "firing",
            "labels": {
                "alertname": "E2E_SMOKE_TEST",
                "severity": "info",
                "service": "smoke-test",
                "namespace": "test",
            },
            "annotations": {
                "summary": "E2E Smoke Test - Please Ignore",
                "description": f"Auto test @ {datetime.now().isoformat()}",
            },
            "startsAt": starts_at,
        }]
    }

    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        try:
            response = await client.post(
                f"{API_BASE}/api/v1/webhooks/alertmanager",
                json=test_payload,
            )
            if response.status_code == 200:
                print(" ✅ Alertmanager Webhook: OK")
                return True
            else:
                print(f" ❌ Alertmanager Webhook: HTTP {response.status_code}")
                print(f" Response: {response.text[:200]}")
                return False
        except Exception as e:
            # Connection errors, timeouts, etc. are reported as a failed check.
            print(f" ❌ Alertmanager Webhook: {e}")
            return False
|
|
|
|
|
|
async def test_sentry_webhook() -> bool:
    """Post a synthetic Sentry issue event to the Sentry webhook endpoint.

    Sends a ``triggered`` action to
    ``POST {API_BASE}/api/v1/webhooks/sentry/error`` and checks both the
    HTTP status and the ``status`` field of the JSON response.

    Returns:
        True when the response is HTTP 200 and its JSON ``status`` is one of
        ``accepted``, ``deduplicated`` or ``ignored``; False otherwise,
        including when the request or JSON decoding raises.
    """
    print("3. Testing Sentry Webhook...")

    # Use a unique ID (timestamp down to microseconds) to avoid deduplication.
    test_id = f"smoke-test-{datetime.now().strftime('%Y%m%d%H%M%S%f')}"

    test_payload = {
        "action": "triggered",
        "data": {
            "issue": {
                "id": test_id,
                "title": "E2E Smoke Test Error",
                "level": "info",  # use "info" to avoid triggering a real alert
                "culprit": "smoke_test.py:test",
                "project": {"slug": "awoooi-api"},
                "firstSeen": datetime.now().isoformat(),
                "count": 1,
            },
            "event": {
                "message": "E2E Smoke Test - Please Ignore",
                "platform": "python",
            },
        },
    }

    accepted_statuses = ("accepted", "deduplicated", "ignored")

    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        try:
            response = await client.post(
                f"{API_BASE}/api/v1/webhooks/sentry/error",
                json=test_payload,
            )
            if response.status_code != 200:
                print(f" ❌ Sentry Webhook: HTTP {response.status_code}")
                print(f" Response: {response.text[:200]}")
                return False

            status = response.json().get("status")
            if status in accepted_statuses:
                print(f" ✅ Sentry Webhook: OK (status={status})")
                return True

            # HTTP 200 with an unrecognised body status: report the actual
            # cause instead of the misleading "HTTP 200".
            print(f" ❌ Sentry Webhook: unexpected status={status!r}")
            print(f" Response: {response.text[:200]}")
            return False
        except Exception as e:
            # Connection errors, timeouts and invalid JSON all fail the check.
            print(f" ❌ Sentry Webhook: {e}")
            return False
|
|
|
|
|
|
async def test_signoz_webhook() -> bool:
    """Post a synthetic SignOz alert to the SignOz webhook endpoint.

    Sends a ``firing`` alert named ``E2E_SMOKE_TEST`` with severity ``info``
    to ``POST {API_BASE}/api/v1/webhooks/signoz/alert`` and checks both the
    HTTP status and the ``status`` field of the JSON response.

    Returns:
        True when the response is HTTP 200 and its JSON ``status`` equals
        ``"ok"``; False otherwise, including when the request or JSON
        decoding raises.
    """
    # Local import: the file-level import only brings in ``datetime``.
    from datetime import timezone

    print("4. Testing SignOz Webhook...")

    # A trailing "Z" declares the timestamp to be UTC, so the clock must be
    # read in UTC. Appending "Z" to naive local time (as was done before)
    # shifts the alert start by the local UTC offset.
    starts_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    test_payload = {
        "alertname": "E2E_SMOKE_TEST",
        "status": "firing",
        "labels": {
            "alertname": "E2E_SMOKE_TEST",
            "severity": "info",
            "service_name": "smoke-test",
            "source": "signoz",
        },
        "annotations": {
            "summary": "E2E Smoke Test - Please Ignore",
            "description": f"Auto test @ {datetime.now().isoformat()}",
        },
        "startsAt": starts_at,
    }

    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        try:
            response = await client.post(
                f"{API_BASE}/api/v1/webhooks/signoz/alert",
                json=test_payload,
            )
            if response.status_code != 200:
                print(f" ❌ SignOz Webhook: HTTP {response.status_code}")
                print(f" Response: {response.text[:200]}")
                return False

            status = response.json().get("status")
            if status == "ok":
                print(" ✅ SignOz Webhook: OK")
                return True

            # HTTP 200 with an unrecognised body status: report the actual
            # cause instead of the misleading "HTTP 200".
            print(f" ❌ SignOz Webhook: unexpected status={status!r}")
            print(f" Response: {response.text[:200]}")
            return False
        except Exception as e:
            # Connection errors, timeouts and invalid JSON all fail the check.
            print(f" ❌ SignOz Webhook: {e}")
            return False
|
|
|
|
|
|
async def test_signoz_health() -> bool:
    """Check that the SignOz webhook health endpoint answers with HTTP 200.

    Sends ``GET {API_BASE}/api/v1/webhooks/signoz/health`` and prints a
    one-line verdict.

    Returns:
        True when the response status is 200; False for any other status
        or when the request raises.
    """
    print("5. Testing SignOz Webhook Health...")

    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        try:
            response = await client.get(
                f"{API_BASE}/api/v1/webhooks/signoz/health"
            )
            # Guard clause: anything other than 200 is a failure.
            if response.status_code != 200:
                print(f" ❌ SignOz Health: HTTP {response.status_code}")
                return False
            print(" ✅ SignOz Health: OK")
            return True
        except Exception as e:
            # Connection errors, timeouts, etc. are reported as a failed check.
            print(f" ❌ SignOz Health: {e}")
            return False
|
|
|
|
|
|
async def test_telegram_connectivity() -> bool:
    """Probe the Telegram status endpoint; this check is deliberately lenient.

    Sends ``GET {API_BASE}/api/v1/telegram/status``. The check only fails on
    an HTTP status other than 200 or 404.

    Returns:
        True when the endpoint returns 200 (whether or not the JSON body
        reports ``connected``), when it returns 404, or when the request
        raises; False for any other HTTP status.
    """
    print("6. Testing Telegram Connectivity...")

    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        try:
            response = await client.get(f"{API_BASE}/api/v1/telegram/status")
            code = response.status_code

            if code == 404:
                # A missing endpoint does not affect the overall result.
                print(" ⚠️ Telegram: Endpoint not found (skipped)")
                return True

            if code != 200:
                print(f" ❌ Telegram: HTTP {response.status_code}")
                return False

            data = response.json()
            if data.get("connected"):
                print(" ✅ Telegram: Connected")
            else:
                # A reachable endpoint is enough for this check to pass.
                print(" ⚠️ Telegram: Not Connected (endpoint reachable)")
            return True
        except Exception as e:
            # Errors here do not affect the overall result.
            print(f" ⚠️ Telegram: {e} (skipped)")
            return True
|
|
|
|
|
|
async def main():
    """Run every smoke test in order and exit with the overall result.

    Prints a banner, runs the six checks one after another, prints a
    summary, then calls ``sys.exit(0)`` when all checks returned True and
    ``sys.exit(1)`` otherwise.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    rule = "=" * 60

    print(rule)
    print(" AWOOOI Alert Chain E2E Smoke Test")
    print(f" Time: {now}")
    print(f" Target: {API_BASE}")
    print(rule)
    print()

    # Run the checks sequentially (not concurrently) to ease debugging.
    checks = (
        test_health_endpoint,
        test_alertmanager_webhook,
        test_sentry_webhook,
        test_signoz_webhook,
        test_signoz_health,
        test_telegram_connectivity,
    )
    results = []
    for check in checks:
        results.append(await check())

    print()
    print(rule)

    total = len(results)
    passed = sum(results)  # each True counts as 1
    all_passed = passed == total

    if all_passed:
        print(f" ✅ ALL PASSED ({passed}/{total})")
    else:
        failed = total - passed
        print(f" ❌ FAILED ({failed}/{total} tests failed)")

    print(rule)
    sys.exit(0 if all_passed else 1)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: main() ends the process itself via sys.exit().
    asyncio.run(main())
|