Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 1m38s
tests/test_auto_repair_service.py: - 更新 3個測試符合 2026-04-07 統帥指令移除門檻 - APPROVED Playbook 直接通過 (低相似度/低品質/高風險均通過) tests/test_phase22_nemotron_collab.py: - 更新 log key: nemotron_collaboration_failed → exhausted ops/monitoring/docker-compose.exporters.yaml: - 修正 postgres DSN: awoooi:awoooi_prod_2026@localhost:5432/awoooi_prod Sprint 5.2 新增腳本: - scripts/sprint51_e2e_validation.py: L7 E2E 驗收腳本 (T1-T5) - scripts/ops/deploy-docker-health-monitor.sh: Plan A 一鍵部署腳本 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
281 lines
9.3 KiB
Python
281 lines
9.3 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Sprint 5.1 Data Safety Guardrails — E2E 驗收腳本
|
||
用法: python3 scripts/sprint51_e2e_validation.py --api-url http://192.168.0.121:32334
|
||
|
||
測試情境:
|
||
T1: BLOCK 服務告警 → GUARDRAIL_BLOCKED + 無背景修復任務
|
||
T2: auto_repair=false Prometheus flag → GUARDRAIL_BLOCKED log + 無背景修復
|
||
T3: AUTO 服務告警 → 正常流程(不被阻擋)
|
||
T4: docker-health-monitor.sh webhook 格式 → ALERT_RECEIVED 記錄
|
||
T5: /api/v1/health 健康檢查(必須回 200)+ /api/v1/stats/auto-repair 統計端點(可選)
|
||
"""
|
||
|
||
import argparse
|
||
import json
|
||
import time
|
||
import sys
|
||
import urllib.request
|
||
import urllib.error
|
||
from datetime import datetime, timezone
|
||
|
||
# 2026-04-08 Claude Sonnet 4.6 Asia/Taipei
|
||
|
||
|
||
def _post(url: str, payload: dict) -> tuple[int, dict]:
|
||
data = json.dumps(payload).encode()
|
||
req = urllib.request.Request(
|
||
url,
|
||
data=data,
|
||
headers={"Content-Type": "application/json"},
|
||
method="POST",
|
||
)
|
||
try:
|
||
with urllib.request.urlopen(req, timeout=15) as resp:
|
||
return resp.status, json.loads(resp.read())
|
||
except urllib.error.HTTPError as e:
|
||
body = {}
|
||
try:
|
||
body = json.loads(e.read())
|
||
except Exception:
|
||
pass
|
||
return e.code, body
|
||
|
||
|
||
def _get(url: str) -> tuple[int, dict]:
|
||
req = urllib.request.Request(url, method="GET")
|
||
try:
|
||
with urllib.request.urlopen(req, timeout=15) as resp:
|
||
return resp.status, json.loads(resp.read())
|
||
except urllib.error.HTTPError as e:
|
||
return e.code, {}
|
||
|
||
|
||
def _alertmanager_payload(alertname: str, labels: dict, instance: str = "test") -> dict:
|
||
"""建立 Alertmanager 格式 payload"""
|
||
merged_labels = {
|
||
"alertname": alertname,
|
||
"instance": instance,
|
||
"severity": "warning",
|
||
**labels,
|
||
}
|
||
return {
|
||
"version": "4",
|
||
"groupKey": f"test-{alertname}",
|
||
"status": "firing",
|
||
"receiver": "awoooi-api",
|
||
"groupLabels": {"alertname": alertname},
|
||
"commonLabels": merged_labels,
|
||
"commonAnnotations": {"summary": f"[E2E Test] {alertname}"},
|
||
"externalURL": "http://192.168.0.110:9093",
|
||
"alerts": [
|
||
{
|
||
"status": "firing",
|
||
"labels": merged_labels,
|
||
"annotations": {"summary": f"[E2E Test] {alertname}"},
|
||
"startsAt": datetime.now(timezone.utc).isoformat(),
|
||
"endsAt": "0001-01-01T00:00:00Z",
|
||
"generatorURL": "http://prometheus:9090",
|
||
"fingerprint": f"e2e-{alertname}-{int(time.time())}",
|
||
}
|
||
],
|
||
}
|
||
|
||
|
||
def _check_op_log(api_url: str, event_type: str, lookback_secs: int = 30) -> bool:
    """Report whether the operation log holds a recent entry of *event_type*.

    Fetches the latest 20 entries from ``/api/v1/operation-log`` and looks
    for one whose ``event_type`` matches and whose ``created_at`` is newer
    than ``now - lookback_secs``.

    Args:
        api_url: Base URL of the API (no trailing slash).
        event_type: Event type to look for, e.g. ``"GUARDRAIL_BLOCKED"``.
        lookback_secs: Maximum age, in seconds, of an entry that counts.

    Returns:
        True if a matching recent entry exists; False otherwise, including
        when the endpoint does not answer HTTP 200 or the response does not
        have the expected shape.
    """
    code, data = _get(f"{api_url}/api/v1/operation-log?limit=20")
    if code != 200 or not isinstance(data, dict):
        return False

    cutoff = time.time() - lookback_secs
    # `or []` also covers an explicit `"items": null` in the response.
    for item in data.get("items") or []:
        if not isinstance(item, dict) or item.get("event_type") != event_type:
            continue
        created_at = item.get("created_at")
        if not isinstance(created_at, str):
            continue
        try:
            # fromisoformat() on older Pythons rejects a trailing "Z".
            # NOTE(review): a timestamp without a UTC offset is interpreted
            # by .timestamp() as local time -- confirm the API always emits
            # an offset or "Z".
            created_ts = datetime.fromisoformat(
                created_at.replace("Z", "+00:00")
            ).timestamp()
        except (ValueError, OverflowError, OSError):
            # Unparseable or out-of-range timestamp: skip this entry only.
            continue
        if created_ts > cutoff:
            return True
    return False
|
||
|
||
|
||
# Status markers prefixed to each per-check console line.
PASS, FAIL, SKIP = "✅", "❌", "⏭️"
|
||
|
||
|
||
def run_t1_block_service(api_url: str) -> bool:
    """T1: a PostgreSQL (BLOCK) alert must produce a GUARDRAIL_BLOCKED entry.

    Posts a synthetic ``PostgreSQLDown`` alert to the Alertmanager webhook,
    waits three seconds, then looks for a recent ``GUARDRAIL_BLOCKED`` entry
    in the operation log.

    Args:
        api_url: Base URL of the API (no trailing slash).

    Returns:
        True only if the webhook answered HTTP 200 and the entry was found.
    """
    print("\n── T1: BLOCK 服務告警 (PostgreSQL) ──")
    labels = {
        "job": "postgres-exporter",
        # The rule marks the alert as repairable; the Service Registry is
        # expected to override that.
        "auto_repair": "true",
        "layer": "systemd-188",
        "component": "postgres",
    }
    alert = _alertmanager_payload("PostgreSQLDown", labels)
    status, _ = _post(f"{api_url}/api/v1/webhooks/alertmanager", alert)
    print(f" POST /webhooks/alertmanager → HTTP {status}")

    # Give the asynchronous alert processing time to finish.
    time.sleep(3)

    # The operation log must now contain GUARDRAIL_BLOCKED.
    blocked = _check_op_log(api_url, "GUARDRAIL_BLOCKED")
    print(
        f" {PASS} alert_operation_log 有 GUARDRAIL_BLOCKED 記錄"
        if blocked
        else f" {FAIL} 未找到 GUARDRAIL_BLOCKED 記錄(可能未部署 Sprint 5.1)"
    )

    if status != 200:
        return False
    return blocked
|
||
|
||
|
||
def run_t2_auto_repair_false_flag(api_url: str) -> bool:
    """T2: an alert labelled ``auto_repair=false`` must be logged and blocked.

    Posts a synthetic ``KaliScannerDown`` alert whose labels carry
    ``auto_repair: "false"``, waits three seconds, then looks for recent
    ``ALERT_RECEIVED`` and ``GUARDRAIL_BLOCKED`` entries in the operation log.

    Args:
        api_url: Base URL of the API (no trailing slash).

    Returns:
        True only if the webhook answered HTTP 200 and both entries were
        found. A missing ``GUARDRAIL_BLOCKED`` entry is reported as a failure
        on the console, so it also fails the scenario.
    """
    print("\n── T2: auto_repair=false flag (KaliScannerDown) ──")
    payload = _alertmanager_payload(
        "KaliScannerDown",
        {
            "auto_repair": "false",  # set to false by the Prometheus rule
            "layer": "docker-188",
            "component": "kali",
            "severity": "info",
        },
    )
    code, _ = _post(f"{api_url}/api/v1/webhooks/alertmanager", payload)
    print(f" POST /webhooks/alertmanager → HTTP {code}")
    time.sleep(3)

    # NOTE(review): _check_op_log matches any recent entry of the given type,
    # so an entry written by an earlier scenario of the same run could satisfy
    # these checks -- confirm whether per-alert correlation is needed.
    received = _check_op_log(api_url, "ALERT_RECEIVED")
    # GUARDRAIL_BLOCKED is expected to be triggered by the auto_repair=false flag.
    blocked = _check_op_log(api_url, "GUARDRAIL_BLOCKED")

    if received:
        print(f" {PASS} ALERT_RECEIVED 已記錄")
    else:
        print(f" {FAIL} 未找到 ALERT_RECEIVED 記錄")

    if blocked:
        print(f" {PASS} GUARDRAIL_BLOCKED 已記錄(auto_repair=false flag 生效)")
    else:
        print(f" {FAIL} 未找到 GUARDRAIL_BLOCKED(flag 未生效)")

    # Every check reported above has to hold for the scenario to pass.
    return code == 200 and received and blocked
|
||
|
||
|
||
def run_t3_auto_service(api_url: str) -> bool:
    """T3: an AUTO service alert (awoooi-api) goes through the normal flow.

    Posts a synthetic ``KubePodNotReady`` alert, waits three seconds, then
    looks for a recent ``ALERT_RECEIVED`` entry in the operation log.

    Args:
        api_url: Base URL of the API (no trailing slash).

    Returns:
        True only if the webhook answered HTTP 200 and the entry was found.
    """
    print("\n── T3: AUTO 服務告警 (KubePodNotReady) ──")
    labels = {
        "auto_repair": "true",
        "layer": "k8s",
        "namespace": "awoooi-prod",
        "pod": "test-pod-e2e",
    }
    alert = _alertmanager_payload("KubePodNotReady", labels)
    status, _ = _post(f"{api_url}/api/v1/webhooks/alertmanager", alert)
    print(f" POST /webhooks/alertmanager → HTTP {status}")
    time.sleep(3)

    # ALERT_RECEIVED is expected; GUARDRAIL_BLOCKED is not (unless there is
    # no matching playbook). Only the presence of ALERT_RECEIVED is checked.
    seen = _check_op_log(api_url, "ALERT_RECEIVED")
    print(
        f" {PASS} ALERT_RECEIVED 已記錄,AUTO 服務進入正常流程"
        if seen
        else f" {FAIL} 未找到 ALERT_RECEIVED"
    )

    if status != 200:
        return False
    return seen
|
||
|
||
|
||
def run_t4_docker_health_monitor(api_url: str) -> bool:
    """T4: a docker-health-monitor.sh style webhook must be recorded.

    Posts a synthetic ``DockerContainerExited`` alert carrying
    ``source: docker-health-monitor``, waits two seconds, then looks for a
    recent ``ALERT_RECEIVED`` entry in the operation log.

    Args:
        api_url: Base URL of the API (no trailing slash).

    Returns:
        True only if the webhook answered HTTP 200 and the entry was found.
        A missing ``ALERT_RECEIVED`` entry is reported as a failure on the
        console, so it also fails the scenario.
    """
    print("\n── T4: docker-health-monitor webhook 格式 ──")
    # Mimics the format sent by send_to_awoooi() in docker-health-monitor.sh.
    payload = _alertmanager_payload(
        "DockerContainerExited",
        {
            "auto_repair": "true",
            "layer": "docker",
            "host": "188",
            "container": "test-container-e2e",
            "source": "docker-health-monitor",
        },
    )
    code, _ = _post(f"{api_url}/api/v1/webhooks/alertmanager", payload)
    print(f" POST /webhooks/alertmanager → HTTP {code}")
    time.sleep(2)

    received = _check_op_log(api_url, "ALERT_RECEIVED")
    if received:
        print(f" {PASS} ALERT_RECEIVED 已記錄,docker-health-monitor 格式相容")
    else:
        print(f" {FAIL} 未找到 ALERT_RECEIVED")

    # The log check reported above has to hold for the scenario to pass.
    return code == 200 and received
|
||
|
||
|
||
def run_t5_service_registry_api(api_url: str) -> bool:
    """T5: API health check plus an optional auto-repair statistics probe.

    Requests ``/api/v1/health`` and prints the reported ``version`` (or
    ``git_sha``). It then requests ``/api/v1/stats/auto-repair``, whose status
    is only reported and never affects the result.

    Args:
        api_url: Base URL of the API (no trailing slash).

    Returns:
        True if the health endpoint answered HTTP 200, otherwise False.
    """
    print("\n── T5: Health Check + 系統狀態 ──")
    health_status, health = _get(f"{api_url}/api/v1/health")
    print(f" GET /api/v1/health → HTTP {health_status}")

    if health_status != 200:
        print(f" {FAIL} API 健康檢查失敗")
        return False

    print(f" {PASS} API 健康")
    # Prefer "version"; fall back to "git_sha", then to "unknown".
    if "version" in health:
        version = health["version"]
    else:
        version = health.get("git_sha", "unknown")
    print(f" 版本: {version}")

    # Probe the statistics endpoint to see whether the auto_repair_executions
    # table is reachable; any status is tolerated.
    stats_status, _ = _get(f"{api_url}/api/v1/stats/auto-repair")
    if stats_status == 200:
        print(f" {PASS} auto_repair stats 端點正常")
    else:
        print(f" {SKIP} auto_repair stats 端點: HTTP {stats_status}(可接受)")

    return True
|
||
|
||
|
||
def main():
    """Run scenarios T1-T5 against the API and exit with the overall status.

    Command-line options:
        --api-url  Base URL of the API (default ``http://192.168.0.121:32334``).
        --json     Additionally print a one-line JSON summary after the
                   human-readable report.

    Exits with status 0 when every scenario passed, otherwise 1.
    """
    cli = argparse.ArgumentParser(description="Sprint 5.1 E2E 驗收")
    cli.add_argument("--api-url", default="http://192.168.0.121:32334")
    cli.add_argument("--json", action="store_true")
    args = cli.parse_args()

    started = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print("🧪 Sprint 5.1 Data Safety Guardrails E2E 驗收")
    print(f" API: {args.api_url}")
    print(f" 時間: {started}")

    # Scenarios run in this order; the names become the keys of the report.
    scenarios = (
        ("T1_block_service", run_t1_block_service),
        ("T2_auto_repair_false", run_t2_auto_repair_false_flag),
        ("T3_auto_service", run_t3_auto_service),
        ("T4_docker_health_monitor", run_t4_docker_health_monitor),
        ("T5_health_check", run_t5_service_registry_api),
    )
    results = {label: scenario(args.api_url) for label, scenario in scenarios}

    total = len(results)
    passed = sum(bool(outcome) for outcome in results.values())

    rule = "═" * 50
    print(f"\n{rule}")
    print(f" 結果: {passed}/{total} 通過")
    for label, outcome in results.items():
        mark = "✅" if outcome else "❌"
        print(f" {mark} {label}")
    print(rule)

    if args.json:
        summary = {"passed": passed, "total": total, "results": results}
        print(json.dumps(summary))

    sys.exit(int(passed != total))
|
||
|
||
|
||
# Run the validation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|