diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml
index 504d9a95..98ef6c49 100644
--- a/.gitea/workflows/cd.yaml
+++ b/.gitea/workflows/cd.yaml
@@ -145,6 +145,8 @@ jobs:
LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY }}
# 2026-04-02 Claude Code: Telegram 白名單 (授權簽核用)
TG_USER_WHITELIST: ${{ secrets.OPENCLAW_TG_USER_WHITELIST }}
+ # Phase O-4.1 2026-04-02: Sentry API Token (Wave A.1 ADR-037)
+ SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
run: |
mkdir -p ~/.ssh
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
@@ -196,6 +198,15 @@ jobs:
]' && echo "✅ TG_USER_WHITELIST 已注入" || echo "⚠️ TG_USER_WHITELIST patch 失敗"
fi
+ # Phase O-4.1 2026-04-02: Sentry Auth Token (Wave A.1 ADR-037)
+ if [ -n "${SENTRY_AUTH_TOKEN}" ]; then
+ sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
+ {"op":"add","path":"/data/SENTRY_AUTH_TOKEN","value":"'$(echo -n "${SENTRY_AUTH_TOKEN}" | base64 -w 0)'"}
+ ]' && echo "✅ SENTRY_AUTH_TOKEN 已注入" || echo "⚠️ SENTRY_AUTH_TOKEN patch 失敗"
+ else
+ echo "⚠️ SENTRY_AUTH_TOKEN 未設定,Sentry Comment API 將跳過"
+ fi
+
echo "✅ 所有 Secrets 注入完成"
SECRETS
@@ -252,6 +263,18 @@ jobs:
fi
DEPLOY
+      # Phase O-4.5 2026-04-02: Alert Chain Smoke Test (Wave A.6 + B.2 ADR-037)
+      # Verifies the alert chain end to end: API health, webhooks, OTEL collector
+      # and event exporter.
+      # continue-on-error: true, so a failing smoke test does not block the
+      # deployment; the step outcome is still shown in the Telegram notification.
+      - name: Alert Chain Smoke Test
+        id: alert_chain_smoke
+        continue-on-error: true
+        run: |
+          pip install requests --quiet
+          # Redirect into a file instead of piping into `tee`: in a pipeline the
+          # exit code of the Python script is masked unless pipefail is active,
+          # which would let this step succeed even when critical checks fail.
+          status=pass
+          python3 scripts/alert_chain_smoke_test.py \
+            --api-url http://localhost:32334 \
+            --json > /tmp/alert_chain_result.json || status=fail
+          cat /tmp/alert_chain_result.json
+          # Publish the real result rather than an unconditional "pass".
+          echo "alert_chain_status=${status}" >> "$GITHUB_OUTPUT"
+          # Fail the step on a failed smoke test so `outcome` reflects it.
+          [ "${status}" = "pass" ]
+
# [首席架構師] 新增 Playwright E2E Smoke Test 步驟 v1.0.0 2026-04-01 (台北時間)
# continue-on-error: true — smoke 失敗不阻塞部署,但結果會反映在 TG 通知
- name: E2E Smoke Test
@@ -271,7 +294,8 @@ jobs:
- name: Notify Health Check Success
env:
SMOKE_RESULT: ${{ steps.smoke.outcome == 'success' && '✅' || '⚠️' }}
- TG_MSG: "✅ AWOOOI 部署完成\n├ 📝 ${{ steps.commit.outputs.message }}\n├ 🔖 ${{ steps.commit.outputs.short_sha }}\n├ ⏱️ 耗時: ${MINUTES}m ${SECONDS}s\n├ 📦 API: ✅ Web: ✅\n├ 🩺 Health: ✅\n└ 🎭 Smoke: ${SMOKE_RESULT}"
+ ALERT_CHAIN_RESULT: ${{ steps.alert_chain_smoke.outcome == 'success' && '✅' || '⚠️' }}
+ TG_MSG: "✅ AWOOOI 部署完成\n├ 📝 ${{ steps.commit.outputs.message }}\n├ 🔖 ${{ steps.commit.outputs.short_sha }}\n├ ⏱️ 耗時: ${MINUTES}m ${SECONDS}s\n├ 📦 API: ✅ Web: ✅\n├ 🩺 Health: ✅\n├ 🔗 Alert Chain: ${ALERT_CHAIN_RESULT}\n└ 🎭 Smoke: ${SMOKE_RESULT}"
run: |
END_TIME=$(date +%s)
DURATION=$((END_TIME - ${{ steps.commit.outputs.start_time }}))
diff --git a/scripts/alert_chain_smoke_test.py b/scripts/alert_chain_smoke_test.py
new file mode 100644
index 00000000..730c3479
--- /dev/null
+++ b/scripts/alert_chain_smoke_test.py
@@ -0,0 +1,351 @@
+#!/usr/bin/env python3
+"""
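+AWOOOI Alert Chain Smoke Test
+================================
+Wave A.6 (ADR-037): verifies the alert chain end to end.
+
+Checks performed:
+  1. API Health — /api/v1/health reports healthy and every component is up
+  2. Alert Chain Metric — awoooi_alert_chain_last_success_timestamp is at most 2h old
+  3. Webhook reachability — /api/v1/webhooks/alertmanager, /signoz, /sentry health
+  4. SigNoz reachable — 192.168.0.188:3301
+  5. OTEL Collector — at least one pod Running in the observability namespace
+  6. Event Exporter — at least one pod Running in the observability namespace
+
+Listed in the original plan but not implemented in this script:
+  - Telegram Secret — K8s Secret exists and is non-empty
+  - Prometheus Alertmanager — 192.168.0.188:9093 (optional)
+
+Usage:
+  python3 scripts/alert_chain_smoke_test.py [--api-url URL] [--fail-fast] [--json]
+
+CI integration (cd.yaml):
+  python3 scripts/alert_chain_smoke_test.py \
+    --api-url http://localhost:32334 \
+    --json
+
+# Phase O-4.5 2026-04-02 (Taipei time)
+# Author: Claude Code (chief architect)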
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+import time
+from dataclasses import dataclass, field
+from typing import Any
+
+try:
+ import requests
+except ImportError:
+ print("❌ 需要安裝 requests: pip install requests")
+ sys.exit(1)
+
+# =============================================================================
+# 配置
+# =============================================================================
+# Default API base URL, used when --api-url is not supplied.
+DEFAULT_API_URL = "http://192.168.0.125:32334"
+# Base URL probed by the SigNoz reachability check.
+SIGNOZ_URL = "http://192.168.0.188:3301"
+# NOTE(review): not referenced anywhere in the visible part of this script.
+ALERTMANAGER_URL = "http://192.168.0.188:9093"
+# Prometheus instance queried for the alert-chain metric.
+PROMETHEUS_URL = "http://192.168.0.188:9090"
+
+# Maximum time the alert chain may go without a recorded success (2 hours).
+MAX_ALERT_CHAIN_SILENCE_SECONDS = 2 * 60 * 60
+
+TIMEOUT = 10  # seconds; used as the timeout of the HTTP requests in this script
+
+
+# =============================================================================
+# 測試結果
+# =============================================================================
+@dataclass
+class CheckResult:
+    """Outcome of a single smoke-test check."""
+
+    name: str  # label printed in the report, e.g. "API Health"
+    passed: bool  # True when the check succeeded
+    message: str  # human-readable detail shown next to the name
+    critical: bool = True  # critical=False: a failure does not fail the overall report
+
+
+@dataclass
+class SmokeTestReport:
+    """Accumulates check results for one run and derives the overall verdict."""
+
+    checks: list[CheckResult] = field(default_factory=list)
+    start_time: float = field(default_factory=time.time)
+
+    def add(self, result: CheckResult) -> None:
+        """Store ``result`` and print it as a one-line status entry."""
+        self.checks.append(result)
+        if result.passed:
+            icon = "✅"
+        elif result.critical:
+            icon = "❌"
+        else:
+            # A failed non-critical check is only a warning.
+            icon = "⚠️"
+        print(f" {icon} [{result.name}] {result.message}")
+
+    @property
+    def passed(self) -> bool:
+        """True when no critical check has failed; non-critical ones are ignored."""
+        return not any(c.critical and not c.passed for c in self.checks)
+
+    @property
+    def failed_critical(self) -> list[CheckResult]:
+        """Critical checks that did not pass, in the order they were added."""
+        return list(filter(lambda c: c.critical and not c.passed, self.checks))
+
+    def summary(self) -> str:
+        """Return "<passed>/<total> checks passed in <seconds>s" for this run."""
+        ok_count = len([c for c in self.checks if c.passed])
+        elapsed = time.time() - self.start_time
+        return f"{ok_count}/{len(self.checks)} checks passed in {elapsed:.1f}s"
+
+
+# =============================================================================
+# 檢查函數
+# =============================================================================
+def check_api_health(api_url: str) -> CheckResult:
+    """Check 1: API health — the API and every component must be up.
+
+    Sends GET ``{api_url}/api/v1/health`` and expects a JSON object whose
+    ``status`` is ``"healthy"`` and whose ``components`` entries all carry
+    ``status == "up"``.
+
+    Args:
+        api_url: Base URL of the API; ``/api/v1/health`` is appended to it.
+
+    Returns:
+        A critical CheckResult. It fails when the request errors, the body is
+        not a JSON object, the overall status is not healthy, or any
+        component is not up.
+    """
+    try:
+        resp = requests.get(f"{api_url}/api/v1/health", timeout=TIMEOUT)
+    except requests.RequestException as e:
+        return CheckResult("API Health", False, f"無法連線: {e}")
+
+    # Parse separately from the request: a non-JSON body (for example an HTML
+    # error page) raises ValueError, which is not a RequestException on older
+    # requests releases and would otherwise abort the whole smoke test.
+    try:
+        data = resp.json()
+    except ValueError:
+        return CheckResult(
+            "API Health", False, f"HTTP {resp.status_code} 回應不是合法 JSON"
+        )
+
+    if not isinstance(data, dict):
+        return CheckResult(
+            "API Health", False, f"回應格式異常: {type(data).__name__}"
+        )
+
+    if data.get("status") != "healthy":
+        return CheckResult(
+            "API Health",
+            False,
+            f"API status={data.get('status')} (expected healthy)",
+        )
+
+    components = data.get("components", {})
+    if not isinstance(components, dict):
+        return CheckResult("API Health", False, "components 欄位格式異常")
+
+    # A component entry that is not an object cannot report "up", so it is
+    # counted as down instead of raising AttributeError.
+    down_components = [
+        name
+        for name, info in components.items()
+        if not isinstance(info, dict) or info.get("status") != "up"
+    ]
+
+    if down_components:
+        return CheckResult(
+            "API Health",
+            False,
+            f"組件異常: {', '.join(down_components)}",
+        )
+
+    return CheckResult(
+        "API Health",
+        True,
+        f"所有 {len(components)} 個組件 UP ({data.get('environment', 'unknown')})",
+    )
+
+
+def check_alert_chain_metric(prometheus_url: str) -> CheckResult:
+    """Check 2: the last alert-chain success must be at most 2 hours old.
+
+    Queries ``{prometheus_url}/api/v1/query`` for
+    ``awoooi_alert_chain_last_success_timestamp`` and compares the newest
+    sample with the current time.
+
+    Args:
+        prometheus_url: Base URL of the Prometheus server.
+
+    Returns:
+        A CheckResult. An unreachable Prometheus, an unparseable response and
+        a missing metric are non-critical failures; silence longer than
+        MAX_ALERT_CHAIN_SILENCE_SECONDS is a critical failure.
+    """
+    try:
+        resp = requests.get(
+            f"{prometheus_url}/api/v1/query",
+            params={"query": "awoooi_alert_chain_last_success_timestamp"},
+            timeout=TIMEOUT,
+        )
+    except requests.RequestException as e:
+        return CheckResult(
+            "Alert Chain Metric", False, f"無法查詢 Prometheus: {e}", critical=False
+        )
+
+    # A malformed payload is reported as a non-critical failure, like any other
+    # Prometheus problem, instead of crashing the smoke test.
+    try:
+        payload = resp.json()
+        series = payload.get("data", {}).get("result", [])
+        timestamps = [float(sample["value"][1]) for sample in series]
+    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
+        return CheckResult(
+            "Alert Chain Metric",
+            False,
+            f"Prometheus 回應格式異常: {e!r}",
+            critical=False,
+        )
+
+    if not timestamps:
+        return CheckResult(
+            "Alert Chain Metric",
+            False,
+            "awoooi_alert_chain_last_success_timestamp 指標不存在 (Prometheus 未抓到)",
+            critical=False,  # the metric may not exist yet right after start-up
+        )
+
+    # With several series, the newest timestamp is the most recent success;
+    # reading only the first series could report a stale one.
+    last_success = max(timestamps)
+    age_seconds = time.time() - last_success
+    age_minutes = age_seconds / 60
+
+    if age_seconds > MAX_ALERT_CHAIN_SILENCE_SECONDS:
+        return CheckResult(
+            "Alert Chain Metric",
+            False,
+            f"告警鏈路已靜默 {age_minutes:.0f} 分鐘 (超過 120 分鐘閾值)",
+        )
+
+    return CheckResult(
+        "Alert Chain Metric",
+        True,
+        f"最後告警成功: {age_minutes:.0f} 分鐘前",
+    )
+
+
+def check_webhook_health(api_url: str) -> list[CheckResult]:
+    """Check 3: probe the health endpoint of every webhook receiver.
+
+    Args:
+        api_url: Base URL of the API.
+
+    Returns:
+        One critical CheckResult per webhook, in the order Alertmanager,
+        SignOz, Sentry. A check passes only on HTTP 200.
+    """
+    # Report label -> path segment under /api/v1/webhooks/.
+    endpoints = {
+        "Alertmanager Webhook": "alertmanager",
+        "SignOz Webhook": "signoz",
+        "Sentry Webhook": "sentry",
+    }
+
+    outcomes = []
+    for label, source in endpoints.items():
+        health_url = f"{api_url}/api/v1/webhooks/{source}/health"
+        try:
+            status = requests.get(health_url, timeout=TIMEOUT).status_code
+        except requests.RequestException as err:
+            outcomes.append(CheckResult(label, False, f"無法連線: {err}"))
+            continue
+
+        is_ok = status == 200
+        detail = "HTTP 200 OK" if is_ok else f"HTTP {status}"
+        outcomes.append(CheckResult(label, is_ok, detail))
+
+    return outcomes
+
+
+def check_signoz_reachable(signoz_url: str) -> CheckResult:
+    """Check 4: the SigNoz UI answers with a non-error HTTP status.
+
+    Args:
+        signoz_url: URL of the SigNoz UI.
+
+    Returns:
+        A passing CheckResult for any status below 400; otherwise a
+        non-critical failure (also when the request itself fails).
+    """
+    try:
+        status = requests.get(signoz_url, timeout=TIMEOUT).status_code
+    except requests.RequestException as err:
+        return CheckResult("SigNoz", False, f"無法連線: {err}", critical=False)
+
+    if status >= 400:
+        return CheckResult("SigNoz", False, f"HTTP {status}", critical=False)
+    # Anything below 400 (typically 200) counts as reachable.
+    return CheckResult("SigNoz", True, f"HTTP {status}")
+
+
+def _check_pods_running(check_name: str, app_name: str) -> CheckResult:
+    """Report whether at least one pod of ``app_name`` is in phase Running.
+
+    Runs ``kubectl get pods`` in the ``observability`` namespace, selecting on
+    the label ``app.kubernetes.io/name=<app_name>``.
+
+    Args:
+        check_name: Label used in the CheckResult and in its messages.
+        app_name: Value of the ``app.kubernetes.io/name`` label to select.
+
+    Returns:
+        A passing CheckResult when one or more pods are Running; a critical
+        failure when none are; a non-critical failure when kubectl cannot be
+        run or exits non-zero.
+    """
+    try:
+        import subprocess
+
+        result = subprocess.run(
+            ["kubectl", "get", "pods", "-n", "observability",
+             "-l", f"app.kubernetes.io/name={app_name}",
+             "--no-headers", "-o", "custom-columns=STATUS:.status.phase"],
+            capture_output=True, text=True, timeout=15
+        )
+        if result.returncode != 0:
+            return CheckResult(check_name, False, "kubectl 查詢失敗", critical=False)
+
+        # One phase per output line; only phase "Running" counts.
+        # NOTE(review): phase Running does not imply the pod is Ready.
+        running = [
+            line for line in result.stdout.splitlines()
+            if line.strip() == "Running"
+        ]
+        if not running:
+            return CheckResult(
+                check_name, False, f"沒有 Running 的 {check_name} Pod"
+            )
+
+        return CheckResult(check_name, True, f"{len(running)} Pod(s) Running")
+    except Exception as e:
+        # Deliberately broad: a missing kubectl binary or a timeout must
+        # degrade to a non-critical failure, not abort the smoke test.
+        return CheckResult(check_name, False, f"無法檢查: {e}", critical=False)
+
+
+def check_otel_collector() -> CheckResult:
+    """Check 5: at least one OTEL Collector pod is Running."""
+    return _check_pods_running("OTEL Collector", "otel-collector")
+
+
+def check_event_exporter() -> CheckResult:
+    """Check 6: at least one Event Exporter pod is Running."""
+    return _check_pods_running("Event Exporter", "event-exporter")
+
+
+# =============================================================================
+# 主程式
+# =============================================================================
+def run_smoke_test(api_url: str, fail_fast: bool = False) -> SmokeTestReport:
+    """Run all checks in order and collect their results.
+
+    Args:
+        api_url: Base URL of the API under test.
+        fail_fast: When True, stop after the first failed critical check,
+            whichever check it is. Non-critical failures never stop the run.
+
+    Returns:
+        The SmokeTestReport holding every CheckResult produced before the run
+        finished or was aborted.
+    """
+    report = SmokeTestReport()
+
+    print("\n🔍 AWOOOI Alert Chain Smoke Test")
+    print(f" API: {api_url}")
+    print(f" 時間: {time.strftime('%Y-%m-%d %H:%M:%S %Z')}")
+    print("-" * 50)
+
+    # Each stage is a thunk returning a list of results, so a later stage is
+    # not executed at all once fail-fast has aborted the run.
+    stages = (
+        lambda: [check_api_health(api_url)],                  # Check 1
+        lambda: [check_alert_chain_metric(PROMETHEUS_URL)],   # Check 2
+        lambda: check_webhook_health(api_url),                # Check 3
+        lambda: [check_signoz_reachable(SIGNOZ_URL)],         # Check 4
+        lambda: [check_otel_collector()],                     # Check 5
+        lambda: [check_event_exporter()],                     # Check 6
+    )
+
+    for stage in stages:
+        for result in stage():
+            report.add(result)
+            # Apply fail-fast uniformly: previously only checks 1 and 3
+            # honoured it, so a critical failure elsewhere did not stop the run.
+            if fail_fast and not result.passed and result.critical:
+                return report
+
+    return report
+
+
+def main() -> int:
+    """Command-line entry point.
+
+    Parses ``--api-url``, ``--fail-fast`` and ``--json``, runs the smoke test
+    and prints the report. With ``--json`` the human-readable report goes to
+    stderr and stdout carries only the JSON document, so stdout can be
+    redirected into a file and parsed.
+
+    Returns:
+        0 when no critical check failed, 1 otherwise.
+    """
+    import contextlib  # local import: only main() needs it
+
+    parser = argparse.ArgumentParser(description="AWOOOI Alert Chain Smoke Test")
+    parser.add_argument(
+        "--api-url", default=DEFAULT_API_URL, help="API base URL"
+    )
+    parser.add_argument(
+        "--fail-fast", action="store_true", help="第一個 critical 失敗即中止"
+    )
+    parser.add_argument(
+        "--json", action="store_true", help="輸出 JSON 格式結果"
+    )
+    args = parser.parse_args()
+
+    # Mixing the progress lines with the JSON on stdout makes the captured
+    # output unparseable; without --json the redirect is a no-op.
+    human_stream = sys.stderr if args.json else sys.stdout
+    with contextlib.redirect_stdout(human_stream):
+        report = run_smoke_test(args.api_url, args.fail_fast)
+
+        print("-" * 50)
+        if report.passed:
+            print(f"✅ PASSED — {report.summary()}")
+        else:
+            print(f"❌ FAILED — {report.summary()}")
+            if report.failed_critical:
+                print("\n失敗的 Critical 檢查:")
+                for c in report.failed_critical:
+                    print(f" - [{c.name}] {c.message}")
+
+    if args.json:
+        output = {
+            "passed": report.passed,
+            "summary": report.summary(),
+            "checks": [
+                {
+                    "name": c.name,
+                    "passed": c.passed,
+                    "message": c.message,
+                    "critical": c.critical,
+                }
+                for c in report.checks
+            ],
+        }
+        print(json.dumps(output, ensure_ascii=False, indent=2))
+
+    return 0 if report.passed else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())