Files
awoooi/scripts/sprint51_e2e_validation.py
OG T 170ce2f11d
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 1m38s
fix(ci): 修正測試與 Sprint 5.2 部署腳本
tests/test_auto_repair_service.py:
  - 更新 3個測試符合 2026-04-07 統帥指令移除門檻
  - APPROVED Playbook 直接通過 (低相似度/低品質/高風險均通過)

tests/test_phase22_nemotron_collab.py:
  - 更新 log key: nemotron_collaboration_failed → exhausted

ops/monitoring/docker-compose.exporters.yaml:
  - 修正 postgres DSN: awoooi:awoooi_prod_2026@localhost:5432/awoooi_prod

Sprint 5.2 新增腳本:
  - scripts/sprint51_e2e_validation.py: L7 E2E 驗收腳本 (T1-T5)
  - scripts/ops/deploy-docker-health-monitor.sh: Plan A 一鍵部署腳本

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 18:17:48 +08:00

281 lines
9.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Sprint 5.1 Data Safety Guardrails — E2E 驗收腳本
用法: python3 scripts/sprint51_e2e_validation.py --api-url http://192.168.0.121:32334
測試情境:
T1: BLOCK 服務告警 → GUARDRAIL_BLOCKED + 無背景修復任務
T2: auto_repair=false Prometheus flag → GUARDRAIL_BLOCKED log + 無背景修復
T3: AUTO 服務告警 → 正常流程(不被阻擋)
T4: docker-health-monitor.sh webhook 格式 → ALERT_RECEIVED 記錄
T5: /api/v1/guardrail/status 端點(Service Registry 查詢)
"""
import argparse
import json
import time
import sys
import urllib.request
import urllib.error
from datetime import datetime, timezone
# 2026-04-08 Claude Sonnet 4.6 Asia/Taipei
def _post(url: str, payload: dict) -> tuple[int, dict]:
data = json.dumps(payload).encode()
req = urllib.request.Request(
url,
data=data,
headers={"Content-Type": "application/json"},
method="POST",
)
try:
with urllib.request.urlopen(req, timeout=15) as resp:
return resp.status, json.loads(resp.read())
except urllib.error.HTTPError as e:
body = {}
try:
body = json.loads(e.read())
except Exception:
pass
return e.code, body
def _get(url: str) -> tuple[int, dict]:
req = urllib.request.Request(url, method="GET")
try:
with urllib.request.urlopen(req, timeout=15) as resp:
return resp.status, json.loads(resp.read())
except urllib.error.HTTPError as e:
return e.code, {}
def _alertmanager_payload(alertname: str, labels: dict, instance: str = "test") -> dict:
"""建立 Alertmanager 格式 payload"""
merged_labels = {
"alertname": alertname,
"instance": instance,
"severity": "warning",
**labels,
}
return {
"version": "4",
"groupKey": f"test-{alertname}",
"status": "firing",
"receiver": "awoooi-api",
"groupLabels": {"alertname": alertname},
"commonLabels": merged_labels,
"commonAnnotations": {"summary": f"[E2E Test] {alertname}"},
"externalURL": "http://192.168.0.110:9093",
"alerts": [
{
"status": "firing",
"labels": merged_labels,
"annotations": {"summary": f"[E2E Test] {alertname}"},
"startsAt": datetime.now(timezone.utc).isoformat(),
"endsAt": "0001-01-01T00:00:00Z",
"generatorURL": "http://prometheus:9090",
"fingerprint": f"e2e-{alertname}-{int(time.time())}",
}
],
}
def _check_op_log(api_url: str, event_type: str, lookback_secs: int = 30) -> bool:
    """Report whether the operation log holds a recent entry of *event_type*.

    Fetches the 20 most recent entries from ``/api/v1/operation-log`` and
    looks for one whose ``event_type`` equals *event_type* and whose
    ``created_at`` is newer than ``now - lookback_secs``.

    Args:
        api_url: Base URL of the API (no trailing slash).
        event_type: Event type to look for.
        lookback_secs: Size of the "recent" window, in seconds.

    Returns:
        ``True`` on the first matching entry; ``False`` when the request does
        not return HTTP 200 or no entry matches.
    """
    status, body = _get(f"{api_url}/api/v1/operation-log?limit=20")
    if status != 200:
        return False

    threshold = time.time() - lookback_secs
    for entry in body.get("items", []):
        raw_ts = entry.get("created_at", "")
        try:
            # A trailing "Z" is rewritten to an explicit UTC offset before parsing.
            # NOTE(review): a timestamp without any offset is treated as local
            # time by datetime.timestamp() — confirm the API always sends one.
            created = datetime.fromisoformat(raw_ts.replace("Z", "+00:00")).timestamp()
            is_match = created > threshold and entry.get("event_type") == event_type
        except Exception:
            # Entries with a missing or unparsable timestamp are skipped.
            continue
        if is_match:
            return True
    return False
# Status markers prefixed to each check's console output.
# PASS and FAIL were both empty strings, which made passing and failing
# checks print identically; they are given distinct glyphs here.
PASS = "✅"
FAIL = "❌"
SKIP = "⏭️"
def run_t1_block_service(api_url: str) -> bool:
    """T1: send a ``PostgreSQLDown`` alert and expect a GUARDRAIL_BLOCKED log.

    Args:
        api_url: Base URL of the API (no trailing slash).

    Returns:
        ``True`` when the webhook answered HTTP 200 and a recent
        ``GUARDRAIL_BLOCKED`` entry is present in the operation log.
    """
    print("\n── T1: BLOCK 服務告警 (PostgreSQL) ──")

    labels = {
        "job": "postgres-exporter",
        # The rule marks this as repairable; the test expects the server to
        # block it regardless.
        "auto_repair": "true",
        "layer": "systemd-188",
        "component": "postgres",
    }
    code, _ = _post(
        f"{api_url}/api/v1/webhooks/alertmanager",
        _alertmanager_payload("PostgreSQLDown", labels),
    )
    print(f" POST /webhooks/alertmanager → HTTP {code}")

    # Give the server time to process the alert asynchronously.
    time.sleep(3)

    blocked_logged = _check_op_log(api_url, "GUARDRAIL_BLOCKED")
    message = (
        f" {PASS} alert_operation_log 有 GUARDRAIL_BLOCKED 記錄"
        if blocked_logged
        else f" {FAIL} 未找到 GUARDRAIL_BLOCKED 記錄(可能未部署 Sprint 5.1"
    )
    print(message)
    return code == 200 and blocked_logged
def run_t2_auto_repair_false_flag(api_url: str) -> bool:
    """T2: an alert labelled ``auto_repair=false`` must be received and blocked.

    Sends a ``KaliScannerDown`` alert whose labels carry
    ``auto_repair="false"``, then checks the operation log for both an
    ``ALERT_RECEIVED`` and a ``GUARDRAIL_BLOCKED`` entry.

    Args:
        api_url: Base URL of the API (no trailing slash).

    Returns:
        ``True`` only when the webhook answered HTTP 200 and both log entries
        were found. Previously a missing ``GUARDRAIL_BLOCKED`` entry was
        printed as a failure but did not affect the result.
    """
    print("\n── T2: auto_repair=false flag (KaliScannerDown) ──")
    payload = _alertmanager_payload(
        "KaliScannerDown",
        {
            "auto_repair": "false",  # the Prometheus rule sets this to false
            "layer": "docker-188",
            "component": "kali",
            "severity": "info",
        },
    )
    code, _ = _post(f"{api_url}/api/v1/webhooks/alertmanager", payload)
    print(f" POST /webhooks/alertmanager → HTTP {code}")

    # Give the server time to process the alert asynchronously.
    time.sleep(3)

    # NOTE(review): _check_op_log matches on event type and recency only, so
    # an entry written for an earlier test inside the lookback window can
    # satisfy these checks.
    received = _check_op_log(api_url, "ALERT_RECEIVED")
    blocked = _check_op_log(api_url, "GUARDRAIL_BLOCKED")

    if received:
        print(f" {PASS} ALERT_RECEIVED 已記錄")
    else:
        print(f" {FAIL} 未找到 ALERT_RECEIVED 記錄")
    if blocked:
        print(f" {PASS} GUARDRAIL_BLOCKED 已記錄auto_repair=false flag 生效)")
    else:
        print(f" {FAIL} 未找到 GUARDRAIL_BLOCKEDflag 未生效)")

    # The blocked check is part of the verdict: a printed failure must fail T2.
    return code == 200 and received and blocked
def run_t3_auto_service(api_url: str) -> bool:
    """T3: send a ``KubePodNotReady`` alert and expect an ALERT_RECEIVED log.

    Args:
        api_url: Base URL of the API (no trailing slash).

    Returns:
        ``True`` when the webhook answered HTTP 200 and a recent
        ``ALERT_RECEIVED`` entry is present in the operation log.
    """
    print("\n── T3: AUTO 服務告警 (KubePodNotReady) ──")

    labels = {
        "auto_repair": "true",
        "layer": "k8s",
        "namespace": "awoooi-prod",
        "pod": "test-pod-e2e",
    }
    code, _ = _post(
        f"{api_url}/api/v1/webhooks/alertmanager",
        _alertmanager_payload("KubePodNotReady", labels),
    )
    print(f" POST /webhooks/alertmanager → HTTP {code}")

    # Give the server time to process the alert asynchronously.
    time.sleep(3)

    # Only ALERT_RECEIVED is checked here; the absence of GUARDRAIL_BLOCKED
    # is not asserted.
    alert_logged = _check_op_log(api_url, "ALERT_RECEIVED")
    message = (
        f" {PASS} ALERT_RECEIVED 已記錄AUTO 服務進入正常流程"
        if alert_logged
        else f" {FAIL} 未找到 ALERT_RECEIVED"
    )
    print(message)
    return code == 200 and alert_logged
def run_t4_docker_health_monitor(api_url: str) -> bool:
    """T4: a docker-health-monitor style alert must be accepted and logged.

    Sends a ``DockerContainerExited`` alert labelled
    ``source="docker-health-monitor"`` and checks the operation log for an
    ``ALERT_RECEIVED`` entry.

    Args:
        api_url: Base URL of the API (no trailing slash).

    Returns:
        ``True`` only when the webhook answered HTTP 200 and the log entry was
        found. Previously a missing entry was printed as a failure but did not
        affect the result.
    """
    print("\n── T4: docker-health-monitor webhook 格式 ──")
    # Labels intended to mimic what docker-health-monitor.sh sends.
    payload = _alertmanager_payload(
        "DockerContainerExited",
        {
            "auto_repair": "true",
            "layer": "docker",
            "host": "188",
            "container": "test-container-e2e",
            "source": "docker-health-monitor",
        },
    )
    code, _ = _post(f"{api_url}/api/v1/webhooks/alertmanager", payload)
    print(f" POST /webhooks/alertmanager → HTTP {code}")

    # Give the server time to process the alert asynchronously.
    time.sleep(2)

    received = _check_op_log(api_url, "ALERT_RECEIVED")
    if received:
        print(f" {PASS} ALERT_RECEIVED 已記錄docker-health-monitor 格式相容")
    else:
        print(f" {FAIL} 未找到 ALERT_RECEIVED")

    # The log check is part of the verdict: a printed failure must fail T4.
    return code == 200 and received
def run_t5_service_registry_api(api_url: str) -> bool:
    """T5: check the API health endpoint and probe the auto-repair stats endpoint.

    Despite its name, this check calls ``/api/v1/health`` (required) and
    ``/api/v1/stats/auto-repair`` (informational only).

    Args:
        api_url: Base URL of the API (no trailing slash).

    Returns:
        ``True`` when the health endpoint answered HTTP 200, ``False``
        otherwise. The stats endpoint result never affects the verdict.
    """
    print("\n── T5: Health Check + 系統狀態 ──")

    health_code, health = _get(f"{api_url}/api/v1/health")
    print(f" GET /api/v1/health → HTTP {health_code}")
    if health_code != 200:
        print(f" {FAIL} API 健康檢查失敗")
        return False

    print(f" {PASS} API 健康")
    # Prefer "version", fall back to "git_sha", then to a placeholder.
    fallback = health.get("git_sha", "unknown")
    version = health.get("version", fallback)
    print(f" 版本: {version}")

    # Optional probe: a non-200 answer is reported as skipped, not failed.
    stats_code, _ = _get(f"{api_url}/api/v1/stats/auto-repair")
    if stats_code == 200:
        print(f" {PASS} auto_repair stats 端點正常")
    else:
        print(f" {SKIP} auto_repair stats 端點: HTTP {stats_code}(可接受)")
    return True
def main() -> None:
    """Run checks T1-T5 against the API, print a summary and exit.

    Command-line options:
        --api-url: Base URL of the API under test.
        --json: Additionally print the results as one JSON line.

    Exits with status 0 when every check passed and 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Sprint 5.1 E2E 驗收")
    parser.add_argument("--api-url", default="http://192.168.0.121:32334")
    parser.add_argument("--json", action="store_true")
    args = parser.parse_args()

    print("🧪 Sprint 5.1 Data Safety Guardrails E2E 驗收")
    print(f" API: {args.api_url}")
    print(f" 時間: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # The checks run sequentially, in this order, while the dict is built.
    results = {
        "T1_block_service": run_t1_block_service(args.api_url),
        "T2_auto_repair_false": run_t2_auto_repair_false_flag(args.api_url),
        "T3_auto_service": run_t3_auto_service(args.api_url),
        "T4_docker_health_monitor": run_t4_docker_health_monitor(args.api_url),
        "T5_health_check": run_t5_service_registry_api(args.api_url),
    }
    passed = sum(1 for ok in results.values() if ok)
    total = len(results)

    # The separator and the per-check markers were empty strings, so the
    # summary showed neither a rule nor which checks passed; literal glyphs
    # are used here so the summary is readable on its own.
    separator = "═" * 50
    print(f"\n{separator}")
    print(f" 結果: {passed}/{total} 通過")
    for name, ok in results.items():
        print(f" {'✅' if ok else '❌'} {name}")
    print(separator)

    if args.json:
        print(json.dumps({"passed": passed, "total": total, "results": results}))
    sys.exit(0 if passed == total else 1)


if __name__ == "__main__":
    main()