diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml
index 504d9a95..98ef6c49 100644
--- a/.gitea/workflows/cd.yaml
+++ b/.gitea/workflows/cd.yaml
@@ -145,6 +145,8 @@ jobs:
           LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY }}
           # 2026-04-02 Claude Code: Telegram 白名單 (授權簽核用)
           TG_USER_WHITELIST: ${{ secrets.OPENCLAW_TG_USER_WHITELIST }}
+          # Phase O-4.1 2026-04-02: Sentry API Token (Wave A.1 ADR-037)
+          SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
         run: |
           mkdir -p ~/.ssh
           echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
@@ -196,6 +198,15 @@ jobs:
             ]' && echo "✅ TG_USER_WHITELIST 已注入" || echo "⚠️ TG_USER_WHITELIST patch 失敗"
           fi

+          # Phase O-4.1 2026-04-02: Sentry Auth Token (Wave A.1 ADR-037)
+          if [ -n "${SENTRY_AUTH_TOKEN}" ]; then
+            sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
+              {"op":"add","path":"/data/SENTRY_AUTH_TOKEN","value":"'$(echo -n "${SENTRY_AUTH_TOKEN}" | base64 -w 0)'"}
+            ]' && echo "✅ SENTRY_AUTH_TOKEN 已注入" || echo "⚠️ SENTRY_AUTH_TOKEN patch 失敗"
+          else
+            echo "⚠️ SENTRY_AUTH_TOKEN 未設定,Sentry Comment API 將跳過"
+          fi
+
           echo "✅ 所有 Secrets 注入完成"
           SECRETS

@@ -252,6 +263,21 @@ jobs:
           fi
           DEPLOY

+      # Phase O-4.5 2026-04-02: Alert Chain Smoke Test (Wave A.6 + B.2 ADR-037)
+      # Verifies the alert chain E2E: API Health + Webhook + OTEL + Event Exporter
+      - name: Alert Chain Smoke Test
+        id: alert_chain_smoke
+        continue-on-error: true
+        run: |
+          # pipefail: without it the pipeline's status is tee's, so a failing
+          # smoke test would still leave this step's outcome as "success".
+          set -eo pipefail
+          pip install requests --quiet
+          python3 scripts/alert_chain_smoke_test.py \
+            --api-url http://localhost:32334 \
+            --json | tee /tmp/alert_chain_result.json
+          echo "alert_chain_status=pass" >> "$GITHUB_OUTPUT"
+
       # [首席架構師] 新增 Playwright E2E Smoke Test 步驟 v1.0.0 2026-04-01 (台北時間)
       # continue-on-error: true — smoke 失敗不阻塞部署,但結果會反映在 TG 通知
       - name: E2E Smoke Test
@@ -271,7 +297,8 @@ jobs:
       - name: Notify Health Check Success
         env:
           SMOKE_RESULT: ${{ steps.smoke.outcome == 'success' && '✅' || '⚠️' }}
-          TG_MSG: "✅ AWOOOI 部署完成\n├ 📝 ${{ steps.commit.outputs.message }}\n├ 🔖 ${{ steps.commit.outputs.short_sha }}\n├ ⏱️ 耗時: ${MINUTES}m ${SECONDS}s\n├ 📦 API: ✅ Web: ✅\n├ 🩺 Health: ✅\n└ 🎭 Smoke: ${SMOKE_RESULT}"
+          ALERT_CHAIN_RESULT: ${{ steps.alert_chain_smoke.outcome == 'success' && '✅' || '⚠️' }}
+          TG_MSG: "✅ AWOOOI 部署完成\n├ 📝 ${{ steps.commit.outputs.message }}\n├ 🔖 ${{ steps.commit.outputs.short_sha }}\n├ ⏱️ 耗時: ${MINUTES}m ${SECONDS}s\n├ 📦 API: ✅ Web: ✅\n├ 🩺 Health: ✅\n├ 🔗 Alert Chain: ${ALERT_CHAIN_RESULT}\n└ 🎭 Smoke: ${SMOKE_RESULT}"
         run: |
           END_TIME=$(date +%s)
           DURATION=$((END_TIME - ${{ steps.commit.outputs.start_time }}))
diff --git a/scripts/alert_chain_smoke_test.py b/scripts/alert_chain_smoke_test.py
new file mode 100644
index 00000000..730c3479
--- /dev/null
+++ b/scripts/alert_chain_smoke_test.py
@@ -0,0 +1,395 @@
+#!/usr/bin/env python3
+"""
+AWOOOI Alert Chain Smoke Test
+================================
+Wave A.6 (ADR-037): verify the alert chain end to end.
+
+Checks:
+  1. API Health — every component reported by /api/v1/health is "up"
+  2. Alert Chain Metric — awoooi_alert_chain_last_success_timestamp is at
+     most 2 hours old (queried from Prometheus)
+  3. Webhook reachability — /api/v1/webhooks/{alertmanager,signoz,sentry}/health
+  4. SigNoz reachable — 192.168.0.188:3301
+  5. OTEL Collector — at least one Running pod (via kubectl)
+  6. Event Exporter — at least one Running pod (via kubectl)
+
+Usage:
+    python3 scripts/alert_chain_smoke_test.py [--api-url URL] [--fail-fast] [--json]
+
+With --json, stdout carries only the JSON document (the human-readable report
+goes to stderr), so the output can be piped or tee'd straight into a file.
+
+CI integration (cd.yaml):
+    python3 scripts/alert_chain_smoke_test.py \
+        --api-url http://localhost:32334 \
+        --json
+
+# Phase O-4.5 2026-04-02 (Taipei time)
+# Author: Claude Code (chief architect)
+"""
+from __future__ import annotations
+
+import argparse
+import contextlib
+import json
+import subprocess
+import sys
+import time
+from dataclasses import dataclass, field
+from typing import Any, Iterator
+
+try:
+    import requests
+except ImportError:
+    print("❌ 需要安裝 requests: pip install requests")
+    sys.exit(1)
+
+# =============================================================================
+# Configuration
+# =============================================================================
+DEFAULT_API_URL = "http://192.168.0.125:32334"
+SIGNOZ_URL = "http://192.168.0.188:3301"
+ALERTMANAGER_URL = "http://192.168.0.188:9093"
+PROMETHEUS_URL = "http://192.168.0.188:9090"
+
+# Longest the alert chain may stay silent before the check fails (2 hours)
+MAX_ALERT_CHAIN_SILENCE_SECONDS = 2 * 60 * 60
+
+TIMEOUT = 10  # seconds, used for every HTTP request
+KUBECTL_TIMEOUT = 15  # seconds, used for every kubectl invocation
+
+
+# =============================================================================
+# Results
+# =============================================================================
+@dataclass
+class CheckResult:
+    """Outcome of a single check."""
+
+    name: str
+    passed: bool
+    message: str
+    critical: bool = True  # False: a failure is reported but does not fail the run
+
+
+@dataclass
+class SmokeTestReport:
+    """Collects check results, printing each one as it is added."""
+
+    checks: list[CheckResult] = field(default_factory=list)
+    start_time: float = field(default_factory=time.time)
+
+    def add(self, result: CheckResult) -> None:
+        """Record *result* and print a one-line status for it."""
+        self.checks.append(result)
+        icon = "✅" if result.passed else ("❌" if result.critical else "⚠️")
+        print(f" {icon} [{result.name}] {result.message}")
+
+    @property
+    def passed(self) -> bool:
+        """True when no critical check failed; non-critical checks are ignored."""
+        return all(c.passed for c in self.checks if c.critical)
+
+    @property
+    def failed_critical(self) -> list[CheckResult]:
+        """Critical checks that did not pass."""
+        return [c for c in self.checks if not c.passed and c.critical]
+
+    def summary(self) -> str:
+        """Return ``"<passed>/<total> checks passed in <seconds>s"``."""
+        total = len(self.checks)
+        passed = sum(1 for c in self.checks if c.passed)
+        duration = time.time() - self.start_time
+        return f"{passed}/{total} checks passed in {duration:.1f}s"
+
+
+# =============================================================================
+# Check functions
+# =============================================================================
+def _get_json_object(url: str, params: dict[str, str] | None = None) -> dict[str, Any]:
+    """GET *url* and return its body decoded as a JSON object.
+
+    Raises:
+        requests.RequestException: the request itself failed.
+        ValueError: the body is not JSON, or is JSON but not an object.
+    """
+    resp = requests.get(url, params=params, timeout=TIMEOUT)
+    try:
+        data = resp.json()
+    except ValueError as e:
+        # Re-raise as a plain ValueError: newer requests versions raise a
+        # JSONDecodeError that is also a RequestException, which would make a
+        # bad body indistinguishable from a connection failure.
+        raise ValueError(f"HTTP {resp.status_code}, body is not valid JSON") from e
+    if not isinstance(data, dict):
+        raise ValueError(f"expected a JSON object, got {type(data).__name__}")
+    return data
+
+
+def check_api_health(api_url: str) -> CheckResult:
+    """Check 1: API health — every reported component must be "up"."""
+    try:
+        data = _get_json_object(f"{api_url}/api/v1/health")
+    except requests.RequestException as e:
+        return CheckResult("API Health", False, f"無法連線: {e}")
+    except ValueError as e:
+        return CheckResult("API Health", False, f"回應格式異常: {e}")
+
+    if data.get("status") != "healthy":
+        return CheckResult(
+            "API Health",
+            False,
+            f"API status={data.get('status')} (expected healthy)",
+        )
+
+    components = data.get("components") or {}
+    if not isinstance(components, dict):
+        return CheckResult(
+            "API Health", False, "回應格式異常: components is not an object"
+        )
+
+    # A component entry that is not an object counts as down rather than crashing.
+    down_components = [
+        name
+        for name, info in components.items()
+        if not isinstance(info, dict) or info.get("status") != "up"
+    ]
+    if down_components:
+        return CheckResult(
+            "API Health",
+            False,
+            f"組件異常: {', '.join(down_components)}",
+        )
+
+    return CheckResult(
+        "API Health",
+        True,
+        f"所有 {len(components)} 個組件 UP ({data.get('environment', 'unknown')})",
+    )
+
+
+def check_alert_chain_metric(prometheus_url: str) -> CheckResult:
+    """Check 2: the alert chain's last success is at most 2 hours old.
+
+    Failing to reach Prometheus or to find/parse the metric is non-critical; a
+    metric older than MAX_ALERT_CHAIN_SILENCE_SECONDS is critical.
+    """
+    try:
+        data = _get_json_object(
+            f"{prometheus_url}/api/v1/query",
+            params={"query": "awoooi_alert_chain_last_success_timestamp"},
+        )
+    except (requests.RequestException, ValueError) as e:
+        return CheckResult(
+            "Alert Chain Metric", False, f"無法查詢 Prometheus: {e}", critical=False
+        )
+
+    payload = data.get("data")
+    results = payload.get("result") if isinstance(payload, dict) else None
+    if not results:
+        return CheckResult(
+            "Alert Chain Metric",
+            False,
+            "awoooi_alert_chain_last_success_timestamp 指標不存在 (Prometheus 未抓到)",
+            critical=False,  # the metric may not be populated right after startup
+        )
+
+    try:
+        # Instant-vector samples are [<unix time>, "<value>"]; use the first series.
+        last_success = float(results[0]["value"][1])
+    except (KeyError, IndexError, TypeError, ValueError) as e:
+        return CheckResult(
+            "Alert Chain Metric",
+            False,
+            f"無法解析 Prometheus 回應: {e!r}",
+            critical=False,
+        )
+
+    age_seconds = time.time() - last_success
+    age_minutes = age_seconds / 60
+    if age_seconds > MAX_ALERT_CHAIN_SILENCE_SECONDS:
+        return CheckResult(
+            "Alert Chain Metric",
+            False,
+            f"告警鏈路已靜默 {age_minutes:.0f} 分鐘 (超過 120 分鐘閾值)",
+        )
+
+    return CheckResult(
+        "Alert Chain Metric",
+        True,
+        f"最後告警成功: {age_minutes:.0f} 分鐘前",
+    )
+
+
+def check_webhook_health(api_url: str) -> list[CheckResult]:
+    """Check 3: every webhook health endpoint answers HTTP 200."""
+    results = []
+    webhooks = [
+        ("Alertmanager Webhook", f"{api_url}/api/v1/webhooks/alertmanager/health"),
+        ("SignOz Webhook", f"{api_url}/api/v1/webhooks/signoz/health"),
+        ("Sentry Webhook", f"{api_url}/api/v1/webhooks/sentry/health"),
+    ]
+
+    for name, url in webhooks:
+        try:
+            resp = requests.get(url, timeout=TIMEOUT)
+        except requests.RequestException as e:
+            results.append(CheckResult(name, False, f"無法連線: {e}"))
+            continue
+        if resp.status_code == 200:
+            results.append(CheckResult(name, True, "HTTP 200 OK"))
+        else:
+            results.append(CheckResult(name, False, f"HTTP {resp.status_code}"))
+
+    return results
+
+
+def check_signoz_reachable(signoz_url: str) -> CheckResult:
+    """Check 4: the SigNoz UI is reachable (always non-critical)."""
+    try:
+        resp = requests.get(signoz_url, timeout=TIMEOUT)
+    except requests.RequestException as e:
+        return CheckResult("SigNoz", False, f"無法連線: {e}", critical=False)
+    # Any status below 400 counts as reachable.
+    if resp.status_code < 400:
+        return CheckResult("SigNoz", True, f"HTTP {resp.status_code}")
+    return CheckResult("SigNoz", False, f"HTTP {resp.status_code}", critical=False)
+
+
+def _check_pods_running(name: str, app_name: str) -> CheckResult:
+    """Report whether any pod labelled ``app.kubernetes.io/name=<app_name>`` runs.
+
+    Queries the ``observability`` namespace through ``kubectl``. A kubectl
+    problem (cannot be started, timeout, non-zero exit) yields a non-critical
+    failure; a successful query that finds no Running pod yields a critical one.
+    """
+    cmd = [
+        "kubectl", "get", "pods", "-n", "observability",
+        "-l", f"app.kubernetes.io/name={app_name}",
+        "--no-headers", "-o", "custom-columns=STATUS:.status.phase",
+    ]
+    try:
+        result = subprocess.run(
+            cmd, capture_output=True, text=True, timeout=KUBECTL_TIMEOUT
+        )
+    except (OSError, ValueError, subprocess.SubprocessError) as e:
+        return CheckResult(name, False, f"無法檢查: {e}", critical=False)
+
+    if result.returncode != 0:
+        return CheckResult(name, False, "kubectl 查詢失敗", critical=False)
+
+    running = sum(
+        1 for phase in result.stdout.splitlines() if phase.strip() == "Running"
+    )
+    if running == 0:
+        return CheckResult(name, False, f"沒有 Running 的 {name} Pod")
+
+    return CheckResult(name, True, f"{running} Pod(s) Running")
+
+
+def check_otel_collector() -> CheckResult:
+    """Check 5: an OTEL Collector pod is Running in the cluster."""
+    return _check_pods_running("OTEL Collector", "otel-collector")
+
+
+def check_event_exporter() -> CheckResult:
+    """Check 6: an Event Exporter pod is Running in the cluster."""
+    return _check_pods_running("Event Exporter", "event-exporter")
+
+
+# =============================================================================
+# Entry point
+# =============================================================================
+def _iter_check_results(api_url: str) -> Iterator[CheckResult]:
+    """Yield each check's result in order, running a check only when reached.
+
+    Being lazy lets the caller stop early without running the remaining checks.
+    The three webhook probes run together as one step.
+    """
+    yield check_api_health(api_url)
+    yield check_alert_chain_metric(PROMETHEUS_URL)
+    yield from check_webhook_health(api_url)
+    yield check_signoz_reachable(SIGNOZ_URL)
+    yield check_otel_collector()
+    yield check_event_exporter()
+
+
+def run_smoke_test(api_url: str, fail_fast: bool = False) -> SmokeTestReport:
+    """Run the checks against *api_url* and return the collected report.
+
+    With *fail_fast*, stop right after the first critical failure; later checks
+    are not executed.
+    """
+    report = SmokeTestReport()
+
+    print("\n🔍 AWOOOI Alert Chain Smoke Test")
+    print(f" API: {api_url}")
+    print(f" 時間: {time.strftime('%Y-%m-%d %H:%M:%S %Z')}")
+    print("-" * 50)
+
+    for result in _iter_check_results(api_url):
+        report.add(result)
+        if fail_fast and not result.passed and result.critical:
+            break
+
+    return report
+
+
+def _report_to_dict(report: SmokeTestReport) -> dict[str, Any]:
+    """Convert *report* into the JSON-serialisable structure printed by --json."""
+    return {
+        "passed": report.passed,
+        "summary": report.summary(),
+        "checks": [
+            {
+                "name": c.name,
+                "passed": c.passed,
+                "message": c.message,
+                "critical": c.critical,
+            }
+            for c in report.checks
+        ],
+    }
+
+
+def main() -> int:
+    """Parse arguments, run the smoke test and return the process exit code.
+
+    Returns 0 when every critical check passed, 1 otherwise.
+    """
+    parser = argparse.ArgumentParser(description="AWOOOI Alert Chain Smoke Test")
+    parser.add_argument(
+        "--api-url", default=DEFAULT_API_URL, help="API base URL"
+    )
+    parser.add_argument(
+        "--fail-fast", action="store_true", help="第一個 critical 失敗即中止"
+    )
+    parser.add_argument(
+        "--json", action="store_true", help="輸出 JSON 格式結果"
+    )
+    args = parser.parse_args()
+
+    # With --json, stdout must carry nothing but the JSON document, so the
+    # human-readable report is diverted to stderr.
+    human_out = sys.stderr if args.json else sys.stdout
+    with contextlib.redirect_stdout(human_out):
+        report = run_smoke_test(args.api_url, args.fail_fast)
+
+        print("-" * 50)
+        if report.passed:
+            print(f"✅ PASSED — {report.summary()}")
+        else:
+            print(f"❌ FAILED — {report.summary()}")
+            if report.failed_critical:
+                print("\n失敗的 Critical 檢查:")
+                for c in report.failed_critical:
+                    print(f" - [{c.name}] {c.message}")
+
+    if args.json:
+        print(json.dumps(_report_to_dict(report), ensure_ascii=False, indent=2))
+
+    return 0 if report.passed else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())