- sentry_webhook: 加入 GET /health endpoint (smoke test 探測用) - smoke_test: alertmanager 路徑改為 /webhooks/health (已存在) - smoke_test: Prometheus URL 改為正確的 110:9090 - smoke_test: Alert chain metric 標記 critical=False (初始化期正常) Wave A.6 smoke test 現在 6/8 → 7/8 checks pass (sentry health deploy 後 8/8) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
352 lines
11 KiB
Python
352 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
AWOOOI Alert Chain Smoke Test
|
|
================================
|
|
Wave A.6 (ADR-037): 驗證告警鏈路 E2E 完整性
|
|
|
|
檢查項目:
|
|
1. API Health — /api/v1/health 全組件 UP
|
|
2. Alert Chain Metric — awoooi_alert_chain_last_success_timestamp 不超過 2h
|
|
3. Webhook reachability — GET /api/v1/webhooks/health (Alertmanager), /api/v1/webhooks/signoz/health, /api/v1/webhooks/sentry/health
|
|
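4. SigNoz reachability — 192.168.0.188:3301 (non-critical)
5. OTEL Collector — at least one Running pod in the "observability" namespace (via kubectl)
6. Event Exporter — at least one Running pod in the "observability" namespace (via kubectl)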
|
|
|
|
使用方式:
|
|
python3 scripts/alert_chain_smoke_test.py [--api-url URL] [--fail-fast]
|
|
|
|
CI 整合 (cd.yaml):
|
|
python3 scripts/alert_chain_smoke_test.py \
|
|
--api-url http://localhost:32334 \
|
|
--fail-fast
|
|
|
|
# Phase O-4.5 2026-04-02 (台北時間)
|
|
# 建立者: Claude Code (首席架構師)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from typing import Any
|
|
|
|
try:
|
|
import requests
|
|
except ImportError:
|
|
print("❌ 需要安裝 requests: pip install requests")
|
|
sys.exit(1)
|
|
|
|
# =============================================================================
|
|
# 配置
|
|
# =============================================================================
|
|
# Default value of the --api-url command-line option.
DEFAULT_API_URL = "http://192.168.0.125:32334"
# Base URL probed by check_signoz_reachable().
SIGNOZ_URL = "http://192.168.0.188:3301"
# NOTE(review): not referenced by any check in this file — confirm whether an
# Alertmanager reachability check is still planned.
ALERTMANAGER_URL = "http://192.168.0.188:9093"
# Prometheus instance queried by check_alert_chain_metric().
PROMETHEUS_URL = "http://192.168.0.110:9090"

# Maximum time the alert chain may stay silent (2 hours).
MAX_ALERT_CHAIN_SILENCE_SECONDS = 2 * 60 * 60

TIMEOUT = 10 # seconds; passed as the timeout of every requests.get() call
|
|
|
|
|
|
# =============================================================================
|
|
# 測試結果
|
|
# =============================================================================
|
|
@dataclass
class CheckResult:
    """Outcome of one smoke-test check.

    Positional order is (name, passed, message); ``critical`` is optional
    and defaults to True.
    """

    # Short label printed in square brackets on the report line.
    name: str
    # True when the check succeeded.
    passed: bool
    # Human-readable detail printed after the label.
    message: str
    # critical=False means a failure is shown as a warning and does not
    # make the overall report fail.
    critical: bool = True
|
|
|
|
|
|
@dataclass
class SmokeTestReport:
    """Accumulates check results and prints each one as it is recorded."""

    # Results in the order they were added.
    checks: list[CheckResult] = field(default_factory=list)
    # Epoch seconds captured when the report is created; summary() measures
    # the elapsed time from this point.
    start_time: float = field(default_factory=time.time)

    def add(self, result: CheckResult) -> None:
        """Store *result* and print a one-line status for it."""
        self.checks.append(result)
        if result.passed:
            icon = "✅"
        elif result.critical:
            icon = "❌"
        else:
            # Failed but non-critical: shown as a warning.
            icon = "⚠️"
        print(f" {icon} [{result.name}] {result.message}")

    @property
    def passed(self) -> bool:
        """True when no critical check failed (also true for an empty report)."""
        return not self.failed_critical

    @property
    def failed_critical(self) -> list[CheckResult]:
        """Critical checks that did not pass, in insertion order."""
        return [
            check
            for check in self.checks
            if not check.passed and check.critical
        ]

    def summary(self) -> str:
        """Return '<passed>/<total> checks passed in <seconds>s'."""
        total = len(self.checks)
        ok_count = len([check for check in self.checks if check.passed])
        elapsed = time.time() - self.start_time
        return f"{ok_count}/{total} checks passed in {elapsed:.1f}s"
|
|
|
|
|
|
# =============================================================================
|
|
# 檢查函數
|
|
# =============================================================================
|
|
def check_api_health(api_url: str) -> CheckResult:
    """Check 1: API health — every reported component must be up.

    Sends GET ``{api_url}/api/v1/health`` and expects a JSON object whose
    ``status`` is ``"healthy"`` and whose ``components`` mapping contains
    only entries with ``status == "up"``.

    Args:
        api_url: Base URL of the API, without a trailing slash.

    Returns:
        A critical CheckResult named "API Health". It fails when the request
        fails, the body is not a JSON object, the overall status is not
        "healthy", or any component is not "up".
    """
    name = "API Health"

    try:
        resp = requests.get(f"{api_url}/api/v1/health", timeout=TIMEOUT)
    except requests.RequestException as e:
        return CheckResult(name, False, f"無法連線: {e}")

    # Response.json() raises a ValueError subclass on a non-JSON body (for
    # example an HTML error page from a proxy). Report that as a failed
    # check instead of crashing or mislabelling it as a connection error.
    try:
        data = resp.json()
    except ValueError:
        return CheckResult(
            name, False, f"HTTP {resp.status_code} 回應不是有效的 JSON"
        )

    if not isinstance(data, dict):
        return CheckResult(
            name, False, f"回應格式異常 (預期 JSON 物件, 實際 {type(data).__name__})"
        )

    status = data.get("status")
    if status != "healthy":
        return CheckResult(
            name, False, f"API status={status} (expected healthy)"
        )

    # A missing or null "components" entry is treated as "no components".
    components = data.get("components") or {}
    if not isinstance(components, dict):
        return CheckResult(
            name, False, f"回應格式異常 (components 為 {type(components).__name__})"
        )

    # A component whose entry is not a mapping cannot report "up", so it is
    # counted as down rather than raising AttributeError.
    down_components = [
        comp_name
        for comp_name, info in components.items()
        if not isinstance(info, dict) or info.get("status") != "up"
    ]
    if down_components:
        return CheckResult(
            name, False, f"組件異常: {', '.join(down_components)}"
        )

    return CheckResult(
        name,
        True,
        f"所有 {len(components)} 個組件 UP ({data.get('environment', 'unknown')})",
    )
|
|
|
|
|
|
def check_alert_chain_metric(prometheus_url: str) -> CheckResult:
    """Check 2: the alert chain's last success must be at most 2 hours old.

    Runs an instant query for ``awoooi_alert_chain_last_success_timestamp``
    against ``{prometheus_url}/api/v1/query`` and compares the first
    returned sample with the current time.

    Args:
        prometheus_url: Base URL of the Prometheus server.

    Returns:
        A CheckResult named "Alert Chain Metric". Failing to reach
        Prometheus, an unparsable response, or a missing metric is a
        non-critical failure. A sample older than
        MAX_ALERT_CHAIN_SILENCE_SECONDS is a critical failure.
    """
    name = "Alert Chain Metric"

    try:
        resp = requests.get(
            f"{prometheus_url}/api/v1/query",
            params={"query": "awoooi_alert_chain_last_success_timestamp"},
            timeout=TIMEOUT,
        )
    except requests.RequestException as e:
        return CheckResult(
            name, False, f"無法查詢 Prometheus: {e}", critical=False
        )

    # A non-JSON body (proxy error page, etc.) makes Response.json() raise a
    # ValueError subclass; treat it like an unreachable Prometheus.
    try:
        payload = resp.json()
    except ValueError:
        return CheckResult(
            name,
            False,
            f"Prometheus 回應不是有效的 JSON (HTTP {resp.status_code})",
            critical=False,
        )

    body = payload.get("data") if isinstance(payload, dict) else None
    results = body.get("result") if isinstance(body, dict) else None

    if not results:
        return CheckResult(
            name,
            False,
            "awoooi_alert_chain_last_success_timestamp 指標不存在 (Prometheus 未抓到)",
            critical=False,  # the metric may not exist yet right after start-up
        )

    # The code reads index 1 of the first series' "value" entry as the
    # sample value. Guard against a malformed or non-numeric sample.
    try:
        last_success = float(results[0]["value"][1])
    except (KeyError, IndexError, TypeError, ValueError):
        return CheckResult(
            name, False, "無法解析 Prometheus 回傳的指標值", critical=False
        )

    age_seconds = time.time() - last_success
    age_minutes = age_seconds / 60

    if age_seconds > MAX_ALERT_CHAIN_SILENCE_SECONDS:
        return CheckResult(
            name,
            False,
            f"告警鏈路已靜默 {age_minutes:.0f} 分鐘 (超過 120 分鐘閾值)",
        )

    return CheckResult(
        name,
        True,
        f"最後告警成功: {age_minutes:.0f} 分鐘前",
    )
|
|
|
|
|
|
def check_webhook_health(api_url: str) -> list[CheckResult]:
    """Check 3: probe every webhook health endpoint.

    Each endpoint is fetched with GET; only HTTP 200 counts as a pass.
    All endpoints are probed, and one critical CheckResult per endpoint is
    returned in a fixed order (Alertmanager, SigNoz, Sentry).
    """
    base = f"{api_url}/api/v1/webhooks"
    endpoints = {
        "Alertmanager Webhook": f"{base}/health",
        "SignOz Webhook": f"{base}/signoz/health",
        "Sentry Webhook": f"{base}/sentry/health",
    }

    outcomes = []
    for label, endpoint in endpoints.items():
        try:
            status = requests.get(endpoint, timeout=TIMEOUT).status_code
        except requests.RequestException as err:
            outcomes.append(CheckResult(label, False, f"無法連線: {err}"))
            continue

        if status == 200:
            detail = "HTTP 200 OK"
        else:
            detail = f"HTTP {status}"
        outcomes.append(CheckResult(label, status == 200, detail))

    return outcomes
|
|
|
|
|
|
def check_signoz_reachable(signoz_url: str) -> CheckResult:
    """Check 4: the SigNoz UI answers over HTTP.

    Any final status code below 400 passes. Failures (status >= 400 or a
    request error) are reported as non-critical.
    """
    label = "SigNoz"

    try:
        response = requests.get(signoz_url, timeout=TIMEOUT)
    except requests.RequestException as err:
        return CheckResult(label, False, f"無法連線: {err}", critical=False)

    detail = f"HTTP {response.status_code}"
    if response.status_code >= 400:
        return CheckResult(label, False, detail, critical=False)
    return CheckResult(label, True, detail)
|
|
|
|
|
|
def _check_pods_running(check_name: str, app_name: str) -> CheckResult:
    """Report whether at least one pod labelled *app_name* is Running.

    Runs ``kubectl get pods`` in the ``observability`` namespace, selecting
    on ``app.kubernetes.io/name=<app_name>``, and counts the pods whose
    phase is exactly ``Running``.

    Args:
        check_name: Label used as the CheckResult name and in its messages.
        app_name: Value of the ``app.kubernetes.io/name`` label to select.

    Returns:
        A passing CheckResult when one or more pods are Running. "No Running
        pod" is a critical failure. Being unable to run or query kubectl is
        a non-critical failure.
    """
    # Function-scope import: only the kubectl-based checks need subprocess.
    import subprocess

    command = [
        "kubectl", "get", "pods", "-n", "observability",
        "-l", f"app.kubernetes.io/name={app_name}",
        "--no-headers", "-o", "custom-columns=STATUS:.status.phase",
    ]

    try:
        proc = subprocess.run(
            command, capture_output=True, text=True, timeout=15
        )
    except (OSError, subprocess.SubprocessError) as e:
        # OSError: kubectl missing or not executable.
        # SubprocessError: includes TimeoutExpired after 15 seconds.
        return CheckResult(check_name, False, f"無法檢查: {e}", critical=False)

    if proc.returncode != 0:
        return CheckResult(check_name, False, "kubectl 查詢失敗", critical=False)

    # One phase per output line; anything other than "Running" is ignored.
    running = sum(
        1 for line in proc.stdout.splitlines() if line.strip() == "Running"
    )
    if not running:
        return CheckResult(
            check_name, False, f"沒有 Running 的 {check_name} Pod"
        )
    return CheckResult(check_name, True, f"{running} Pod(s) Running")


def check_otel_collector() -> CheckResult:
    """Check 5: at least one OTEL Collector pod is Running in the cluster."""
    return _check_pods_running("OTEL Collector", "otel-collector")


def check_event_exporter() -> CheckResult:
    """Check 6: at least one Event Exporter pod is Running in the cluster."""
    return _check_pods_running("Event Exporter", "event-exporter")
|
|
|
|
|
|
# =============================================================================
|
|
# 主程式
|
|
# =============================================================================
|
|
def run_smoke_test(api_url: str, fail_fast: bool = False) -> SmokeTestReport:
    """Run every smoke-test check in order and collect the results.

    Args:
        api_url: Base URL of the API under test.
        fail_fast: When True, stop right after the first critical failure,
            whichever check produced it. Results recorded so far are kept.

    Returns:
        The SmokeTestReport holding every result that was recorded.
    """
    report = SmokeTestReport()

    print("\n🔍 AWOOOI Alert Chain Smoke Test")
    print(f" API: {api_url}")
    print(f" 時間: {time.strftime('%Y-%m-%d %H:%M:%S %Z')}")
    print("-" * 50)

    # Each stage is evaluated lazily so that fail_fast can skip the remaining
    # checks. Every stage yields a list of results so that all of them go
    # through the same fail-fast test. Previously only Check 1 and Check 3
    # honoured fail_fast.
    stages = (
        lambda: [check_api_health(api_url)],                 # Check 1
        lambda: [check_alert_chain_metric(PROMETHEUS_URL)],  # Check 2
        lambda: check_webhook_health(api_url),               # Check 3
        lambda: [check_signoz_reachable(SIGNOZ_URL)],        # Check 4
        lambda: [check_otel_collector()],                    # Check 5
        lambda: [check_event_exporter()],                    # Check 6
    )

    for stage in stages:
        for result in stage():
            report.add(result)
            if fail_fast and result.critical and not result.passed:
                return report

    return report
|
|
|
|
|
|
def main() -> int:
    """Parse the command line, run the smoke test and print the verdict.

    Returns:
        0 when no critical check failed, 1 otherwise (used as the process
        exit status).
    """
    cli = argparse.ArgumentParser(description="AWOOOI Alert Chain Smoke Test")
    cli.add_argument("--api-url", default=DEFAULT_API_URL, help="API base URL")
    cli.add_argument(
        "--fail-fast", action="store_true", help="第一個 critical 失敗即中止"
    )
    cli.add_argument("--json", action="store_true", help="輸出 JSON 格式結果")
    options = cli.parse_args()

    report = run_smoke_test(options.api_url, options.fail_fast)
    succeeded = report.passed

    print("-" * 50)
    verdict = "✅ PASSED" if succeeded else "❌ FAILED"
    print(f"{verdict} — {report.summary()}")

    if not succeeded:
        critical_failures = report.failed_critical
        if critical_failures:
            print("\n失敗的 Critical 檢查:")
            for failure in critical_failures:
                print(f" - [{failure.name}] {failure.message}")

    if options.json:
        # The JSON document is printed after the human-readable output.
        serialized_checks = []
        for check in report.checks:
            serialized_checks.append(
                {
                    "name": check.name,
                    "passed": check.passed,
                    "message": check.message,
                    "critical": check.critical,
                }
            )
        payload = {
            "passed": succeeded,
            "summary": report.summary(),
            "checks": serialized_checks,
        }
        print("\n" + json.dumps(payload, ensure_ascii=False, indent=2))

    if succeeded:
        return 0
    return 1
|
|
|
|
|
|
# Script entry point: the process exit status is main()'s return value
# (0 = no critical check failed, 1 = at least one critical failure).
if __name__ == "__main__":
    sys.exit(main())
|