#!/usr/bin/env python3
"""Sprint 5.1 Data Safety Guardrails: end-to-end acceptance script.

Usage:
    python3 scripts/sprint51_e2e_validation.py --api-url http://192.168.0.121:32334

Scenarios:
    T1: alert for a BLOCK service -> expects a GUARDRAIL_BLOCKED entry in
        the operation log.
    T2: alert carrying auto_repair=false -> expects both ALERT_RECEIVED and
        GUARDRAIL_BLOCKED entries.
    T3: alert for an AUTO service -> expects ALERT_RECEIVED.
    T4: alert labelled like the docker-health-monitor.sh webhook -> expects
        ALERT_RECEIVED.
    T5: GET /api/v1/health must answer 200; /api/v1/stats/auto-repair is
        probed but optional.

Only the operation log is inspected; the script does not verify that no
background repair task was started.  The process exits 0 when every
scenario passes and 1 otherwise.
"""

import argparse
import http.client
import json
import sys
import time
import urllib.error
import urllib.request
from datetime import datetime, timezone
from typing import Optional

# 2026-04-08 Claude Sonnet 4.6 Asia/Taipei

PASS = "✅"
FAIL = "❌"
SKIP = "⏭️"

# Per-request timeout for every HTTP call made by this script.
_HTTP_TIMEOUT_SECS = 15

# Allowance for the API host's clock running slightly behind this machine
# when matching operation-log entries against a scenario's start time.
_CLOCK_SKEW_TOLERANCE_SECS = 1.0


def _decode_json_object(raw: bytes) -> dict:
    """Parse *raw* as JSON and return it only if it is a JSON object.

    Empty, malformed, or non-object bodies yield ``{}`` so callers can
    always use ``.get`` on the result.
    """
    try:
        parsed = json.loads(raw)
    except ValueError:  # covers JSONDecodeError and UnicodeDecodeError
        return {}
    return parsed if isinstance(parsed, dict) else {}


def _request(req: urllib.request.Request) -> tuple[int, dict]:
    """Send *req* and return ``(status_code, json_body)``.

    HTTP error responses return their status code and (best effort) body.
    Transport failures (connection refused, timeout, truncated response)
    are reported on stderr and returned as ``(0, {})`` so that a scenario
    fails cleanly instead of aborting the whole run with a traceback.
    """
    try:
        with urllib.request.urlopen(req, timeout=_HTTP_TIMEOUT_SECS) as resp:
            return resp.status, _decode_json_object(resp.read())
    except urllib.error.HTTPError as exc:
        try:
            raw = exc.read()
        except (OSError, http.client.HTTPException):
            raw = b""  # the error body is only diagnostic
        return exc.code, _decode_json_object(raw)
    except (OSError, http.client.HTTPException) as exc:
        # URLError and socket timeouts are OSError subclasses.
        print(f" ⚠️ 請求失敗 {req.full_url}: {exc}", file=sys.stderr)
        return 0, {}


def _post(url: str, payload: dict) -> tuple[int, dict]:
    """POST *payload* as JSON to *url*; return ``(status_code, json_body)``."""
    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    return _request(req)


def _get(url: str) -> tuple[int, dict]:
    """GET *url*; return ``(status_code, json_body)``."""
    return _request(urllib.request.Request(url, method="GET"))


def _alertmanager_payload(alertname: str, labels: dict, instance: str = "test") -> dict:
    """Build an Alertmanager-style webhook payload with one firing alert.

    Entries in *labels* override the default ``alertname``, ``instance``
    and ``severity`` labels.
    """
    merged_labels = {
        "alertname": alertname,
        "instance": instance,
        "severity": "warning",
        **labels,
    }
    summary = f"[E2E Test] {alertname}"
    return {
        "version": "4",
        "groupKey": f"test-{alertname}",
        "status": "firing",
        "receiver": "awoooi-api",
        "groupLabels": {"alertname": alertname},
        "commonLabels": merged_labels,
        "commonAnnotations": {"summary": summary},
        "externalURL": "http://192.168.0.110:9093",
        "alerts": [
            {
                "status": "firing",
                "labels": merged_labels,
                "annotations": {"summary": summary},
                "startsAt": datetime.now(timezone.utc).isoformat(),
                "endsAt": "0001-01-01T00:00:00Z",
                "generatorURL": "http://prometheus:9090",
                "fingerprint": f"e2e-{alertname}-{int(time.time())}",
            }
        ],
    }


def _has_recent_event(items: list, event_type: str, cutoff: float) -> bool:
    """Return True if *items* holds an *event_type* entry newer than *cutoff*.

    *cutoff* is a POSIX timestamp.  Entries that are not dicts, or whose
    ``created_at`` is missing or not ISO-8601, are skipped.
    """
    for item in items:
        if not isinstance(item, dict) or item.get("event_type") != event_type:
            continue
        stamp = item.get("created_at")
        if not isinstance(stamp, str):
            continue
        try:
            # fromisoformat() rejects a trailing "Z" before Python 3.11.
            created = datetime.fromisoformat(stamp.replace("Z", "+00:00"))
        except ValueError:
            continue
        # NOTE(review): a timestamp without a UTC offset is interpreted in
        # this machine's local timezone -- confirm the API always sends one.
        if created.timestamp() > cutoff:
            return True
    return False


def _check_op_log(
    api_url: str,
    event_type: str,
    lookback_secs: int = 30,
    since: Optional[float] = None,
) -> bool:
    """Check the operation log for a recent entry of *event_type*.

    Args:
        api_url: Base URL of the API.
        event_type: Value to match against each entry's ``event_type``.
        lookback_secs: Window, in seconds before now, used when *since* is
            not given.
        since: POSIX timestamp taken just before the alert was sent.  When
            given, only entries created after it (minus a small clock-skew
            allowance) count, so entries left by an earlier scenario are
            not mistaken for this one's.

    Returns:
        True if a matching entry exists; False otherwise, including when
        the operation-log endpoint does not answer 200.
    """
    code, data = _get(f"{api_url}/api/v1/operation-log?limit=20")
    if code != 200:
        return False
    if since is None:
        cutoff = time.time() - lookback_secs
    else:
        cutoff = since - _CLOCK_SKEW_TOLERANCE_SECS
    items = data.get("items", [])
    if not isinstance(items, list):
        return False
    return _has_recent_event(items, event_type, cutoff)


def _fire_alert(api_url: str, payload: dict, wait_secs: float) -> tuple[int, float]:
    """POST *payload* to the Alertmanager webhook, then wait *wait_secs*.

    The wait gives the API time to process the alert asynchronously.

    Returns:
        ``(status_code, started)`` where *started* is the POSIX time taken
        just before the request, for use as ``since`` in `_check_op_log`.
    """
    started = time.time()
    code, _ = _post(f"{api_url}/api/v1/webhooks/alertmanager", payload)
    print(f" POST /webhooks/alertmanager → HTTP {code}")
    time.sleep(wait_secs)
    return code, started


def run_t1_block_service(api_url: str) -> bool:
    """T1: a PostgreSQL (BLOCK) alert must produce GUARDRAIL_BLOCKED.

    Passes when the webhook answers 200 and a GUARDRAIL_BLOCKED entry
    created after the alert was sent is present in the operation log.
    """
    print("\n── T1: BLOCK 服務告警 (PostgreSQL) ──")
    payload = _alertmanager_payload(
        "PostgreSQLDown",
        {
            "job": "postgres-exporter",
            # The rule says true; the Service Registry is expected to override it.
            "auto_repair": "true",
            "layer": "systemd-188",
            "component": "postgres",
        },
    )
    code, started = _fire_alert(api_url, payload, wait_secs=3)

    found = _check_op_log(api_url, "GUARDRAIL_BLOCKED", since=started)
    if found:
        print(f" {PASS} alert_operation_log 有 GUARDRAIL_BLOCKED 記錄")
    else:
        print(f" {FAIL} 未找到 GUARDRAIL_BLOCKED 記錄(可能未部署 Sprint 5.1)")
    return code == 200 and found


def run_t2_auto_repair_false_flag(api_url: str) -> bool:
    """T2: an alert labelled auto_repair=false must be received and blocked.

    Passes only when the webhook answers 200 and both ALERT_RECEIVED and
    GUARDRAIL_BLOCKED entries created after the alert was sent are present.
    """
    print("\n── T2: auto_repair=false flag (KaliScannerDown) ──")
    payload = _alertmanager_payload(
        "KaliScannerDown",
        {
            "auto_repair": "false",  # set to false by the Prometheus rule
            "layer": "docker-188",
            "component": "kali",
            "severity": "info",
        },
    )
    code, started = _fire_alert(api_url, payload, wait_secs=3)

    received = _check_op_log(api_url, "ALERT_RECEIVED", since=started)
    blocked = _check_op_log(api_url, "GUARDRAIL_BLOCKED", since=started)

    if received:
        print(f" {PASS} ALERT_RECEIVED 已記錄")
    else:
        print(f" {FAIL} 未找到 ALERT_RECEIVED 記錄")

    if blocked:
        print(f" {PASS} GUARDRAIL_BLOCKED 已記錄(auto_repair=false flag 生效)")
    else:
        print(f" {FAIL} 未找到 GUARDRAIL_BLOCKED(flag 未生效)")

    # A missing GUARDRAIL_BLOCKED is reported as a failure above, so it
    # must fail the scenario as well.
    return code == 200 and received and blocked


def run_t3_auto_service(api_url: str) -> bool:
    """T3: an AUTO-service alert must be received.

    Passes when the webhook answers 200 and an ALERT_RECEIVED entry created
    after the alert was sent is present.  The absence of GUARDRAIL_BLOCKED
    is deliberately not asserted: the original author notes it may still
    appear when no matching playbook exists.
    """
    print("\n── T3: AUTO 服務告警 (KubePodNotReady) ──")
    payload = _alertmanager_payload(
        "KubePodNotReady",
        {
            "auto_repair": "true",
            "layer": "k8s",
            "namespace": "awoooi-prod",
            "pod": "test-pod-e2e",
        },
    )
    code, started = _fire_alert(api_url, payload, wait_secs=3)

    received = _check_op_log(api_url, "ALERT_RECEIVED", since=started)
    if received:
        print(f" {PASS} ALERT_RECEIVED 已記錄,AUTO 服務進入正常流程")
    else:
        print(f" {FAIL} 未找到 ALERT_RECEIVED")
    return code == 200 and received


def run_t4_docker_health_monitor(api_url: str) -> bool:
    """T4: a docker-health-monitor style alert must be received.

    The payload carries the labels that docker-health-monitor.sh is said to
    send.  Passes when the webhook answers 200 and an ALERT_RECEIVED entry
    created after the alert was sent is present.
    """
    print("\n── T4: docker-health-monitor webhook 格式 ──")
    payload = _alertmanager_payload(
        "DockerContainerExited",
        {
            "auto_repair": "true",
            "layer": "docker",
            "host": "188",
            "container": "test-container-e2e",
            "source": "docker-health-monitor",
        },
    )
    code, started = _fire_alert(api_url, payload, wait_secs=2)

    received = _check_op_log(api_url, "ALERT_RECEIVED", since=started)
    if received:
        print(f" {PASS} ALERT_RECEIVED 已記錄,docker-health-monitor 格式相容")
    else:
        print(f" {FAIL} 未找到 ALERT_RECEIVED")
    # A missing ALERT_RECEIVED is reported as a failure above, so it must
    # fail the scenario as well.
    return code == 200 and received


def run_t5_service_registry_api(api_url: str) -> bool:
    """T5: the API health endpoint must answer 200.

    The auto-repair stats endpoint is probed as well, but any status from
    it is accepted and only reported.
    """
    print("\n── T5: Health Check + 系統狀態 ──")

    code, data = _get(f"{api_url}/api/v1/health")
    print(f" GET /api/v1/health → HTTP {code}")
    if code != 200:
        print(f" {FAIL} API 健康檢查失敗")
        return False

    print(f" {PASS} API 健康")
    version = data.get("version", data.get("git_sha", "unknown"))
    print(f" 版本: {version}")

    stats_code, _ = _get(f"{api_url}/api/v1/stats/auto-repair")
    if stats_code == 200:
        print(f" {PASS} auto_repair stats 端點正常")
    else:
        print(f" {SKIP} auto_repair stats 端點: HTTP {stats_code}(可接受)")
    return True


def main():
    """Run every scenario, print a summary, and exit 0 only if all pass."""
    parser = argparse.ArgumentParser(description="Sprint 5.1 E2E 驗收")
    parser.add_argument("--api-url", default="http://192.168.0.121:32334")
    parser.add_argument("--json", action="store_true")
    args = parser.parse_args()

    print("🧪 Sprint 5.1 Data Safety Guardrails E2E 驗收")
    print(f" API: {args.api_url}")
    print(f" 時間: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    results = {
        "T1_block_service": run_t1_block_service(args.api_url),
        "T2_auto_repair_false": run_t2_auto_repair_false_flag(args.api_url),
        "T3_auto_service": run_t3_auto_service(args.api_url),
        "T4_docker_health_monitor": run_t4_docker_health_monitor(args.api_url),
        "T5_health_check": run_t5_service_registry_api(args.api_url),
    }

    passed = sum(1 for ok in results.values() if ok)
    total = len(results)

    print(f"\n{'═'*50}")
    print(f" 結果: {passed}/{total} 通過")
    for name, ok in results.items():
        print(f" {PASS if ok else FAIL} {name}")
    print(f"{'═'*50}")

    if args.json:
        print(json.dumps({"passed": passed, "total": total, "results": results}))

    sys.exit(0 if passed == total else 1)


if __name__ == "__main__":
    main()