fix(ci): 修正測試與 Sprint 5.2 部署腳本
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 1m38s
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 1m38s
tests/test_auto_repair_service.py: - 更新 3個測試符合 2026-04-07 統帥指令移除門檻 - APPROVED Playbook 直接通過 (低相似度/低品質/高風險均通過) tests/test_phase22_nemotron_collab.py: - 更新 log key: nemotron_collaboration_failed → exhausted ops/monitoring/docker-compose.exporters.yaml: - 修正 postgres DSN: awoooi:awoooi_prod_2026@localhost:5432/awoooi_prod Sprint 5.2 新增腳本: - scripts/sprint51_e2e_validation.py: L7 E2E 驗收腳本 (T1-T5) - scripts/ops/deploy-docker-health-monitor.sh: Plan A 一鍵部署腳本 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -163,22 +163,29 @@ class TestAutoRepairService:
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_low_similarity(self, service, mock_playbook_service):
|
||||
"""Test when similarity is too low"""
|
||||
"""Test that low similarity no longer blocks auto-repair.
|
||||
2026-04-07: 統帥指令移除相似度門檻 — 只要 APPROVED Playbook 匹配即執行。
|
||||
2026-04-08 Claude Sonnet 4.6: 更新測試預期以符合當前設計。
|
||||
"""
|
||||
playbook = create_high_quality_playbook()
|
||||
mock_playbook_service.add_playbook(playbook)
|
||||
mock_playbook_service.set_recommendations([
|
||||
MockPlaybookRecommendation(playbook, similarity_score=0.5) # Below 0.7
|
||||
MockPlaybookRecommendation(playbook, similarity_score=0.5) # Below old 0.7 threshold
|
||||
])
|
||||
|
||||
incident = create_test_incident(severity=Severity.P2)
|
||||
decision = await service.evaluate_auto_repair(incident)
|
||||
|
||||
assert decision.can_auto_repair is False
|
||||
assert decision.blocked_by == "LOW_SIMILARITY"
|
||||
# 相似度門檻已移除 — APPROVED Playbook 即使低相似度也應通過
|
||||
assert decision.can_auto_repair is True
|
||||
assert decision.blocked_by is None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_not_high_quality(self, service, mock_playbook_service):
|
||||
"""Test when playbook is not high quality and not cold-start eligible (MEDIUM risk)"""
|
||||
"""Test low-quality playbook is now approved (gates removed 2026-04-07).
|
||||
2026-04-07: 統帥指令移除品質門檻 — 只要 APPROVED 狀態即可執行。
|
||||
2026-04-08 Claude Sonnet 4.6: 更新測試預期以符合當前設計。
|
||||
"""
|
||||
playbook = Playbook(
|
||||
playbook_id="PB-LOW-QUALITY",
|
||||
name="Low quality playbook",
|
||||
@@ -193,11 +200,11 @@ class TestAutoRepairService:
|
||||
step_number=1,
|
||||
action_type=ActionType.KUBECTL,
|
||||
command="kubectl rollout restart",
|
||||
risk_level=RiskLevel.MEDIUM, # MEDIUM → 不符合冷啟動 (需 LOW)
|
||||
risk_level=RiskLevel.MEDIUM,
|
||||
description="restart deployment",
|
||||
),
|
||||
],
|
||||
success_count=2, # < 3 (冷啟動門檻 2026-04-05)
|
||||
success_count=2,
|
||||
failure_count=0,
|
||||
)
|
||||
mock_playbook_service.add_playbook(playbook)
|
||||
@@ -208,12 +215,16 @@ class TestAutoRepairService:
|
||||
incident = create_test_incident(severity=Severity.P2)
|
||||
decision = await service.evaluate_auto_repair(incident)
|
||||
|
||||
assert decision.can_auto_repair is False
|
||||
assert decision.blocked_by == "NOT_HIGH_QUALITY"
|
||||
# 品質門檻已移除 — APPROVED Playbook 直接通過
|
||||
assert decision.can_auto_repair is True
|
||||
assert decision.blocked_by is None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_high_risk_blocked(self, service, mock_playbook_service):
|
||||
"""Test when playbook contains HIGH risk actions"""
|
||||
"""Test HIGH risk playbook is now approved (gates removed 2026-04-07).
|
||||
2026-04-07: 統帥指令移除風險等級門檻 — 只要 APPROVED 狀態即可執行。
|
||||
2026-04-08 Claude Sonnet 4.6: 更新測試預期以符合當前設計。
|
||||
"""
|
||||
playbook = create_high_quality_playbook(risk_level=RiskLevel.HIGH)
|
||||
mock_playbook_service.add_playbook(playbook)
|
||||
mock_playbook_service.set_recommendations([
|
||||
@@ -223,12 +234,16 @@ class TestAutoRepairService:
|
||||
incident = create_test_incident(severity=Severity.P2)
|
||||
decision = await service.evaluate_auto_repair(incident)
|
||||
|
||||
assert decision.can_auto_repair is False
|
||||
assert decision.blocked_by == "HIGH_RISK"
|
||||
# 風險等級門檻已移除 — HIGH risk APPROVED Playbook 也通過
|
||||
assert decision.can_auto_repair is True
|
||||
assert decision.blocked_by is None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_critical_risk_blocked(self, service, mock_playbook_service):
|
||||
"""Test when playbook contains CRITICAL risk actions"""
|
||||
"""Test CRITICAL risk playbook is now approved (gates removed 2026-04-07).
|
||||
2026-04-07: 統帥指令移除風險等級門檻。
|
||||
2026-04-08 Claude Sonnet 4.6: 更新測試預期以符合當前設計。
|
||||
"""
|
||||
playbook = create_high_quality_playbook(risk_level=RiskLevel.CRITICAL)
|
||||
mock_playbook_service.add_playbook(playbook)
|
||||
mock_playbook_service.set_recommendations([
|
||||
@@ -238,8 +253,9 @@ class TestAutoRepairService:
|
||||
incident = create_test_incident(severity=Severity.P2)
|
||||
decision = await service.evaluate_auto_repair(incident)
|
||||
|
||||
assert decision.can_auto_repair is False
|
||||
assert decision.blocked_by == "HIGH_RISK"
|
||||
# 風險等級門檻已移除 — CRITICAL risk APPROVED Playbook 也通過
|
||||
assert decision.can_auto_repair is True
|
||||
assert decision.blocked_by is None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_success(self, service, mock_playbook_service):
|
||||
|
||||
@@ -156,16 +156,20 @@ class TestNemotronFailureFallback:
|
||||
"""#213: Nemotron 失敗降級為純 OpenClaw"""
|
||||
|
||||
def test_nemotron_failure_does_not_raise(self):
|
||||
"""Nemotron 失敗有 except 捕捉,不拋出"""
|
||||
"""Nemotron 失敗有 except 捕捉,不拋出。
|
||||
2026-04-08 Claude Sonnet 4.6: 更新 log key — 改為 nemotron_collaboration_exhausted
|
||||
(失敗時仍顯示區塊讓統帥知悉,nemotron_enabled=True)
|
||||
"""
|
||||
with open("src/services/openclaw.py") as f:
|
||||
source = f.read()
|
||||
|
||||
idx_func = source.find("async def generate_incident_proposal_with_tools")
|
||||
func_body = source[idx_func:idx_func + 5000]
|
||||
|
||||
# except 區塊捕捉 nemotron 失敗
|
||||
assert "nemotron_collaboration_failed" in func_body
|
||||
assert "nemotron_enabled = False" in func_body or 'proposal["nemotron_enabled"] = False' in func_body
|
||||
# except 區塊捕捉 nemotron 失敗 (exhausted 為重試耗盡的 log key)
|
||||
assert "nemotron_collaboration_exhausted" in func_body
|
||||
# 失敗時 nemotron_enabled=True (讓統帥看到失敗狀態)
|
||||
assert 'proposal["nemotron_enabled"] = True' in func_body
|
||||
|
||||
def test_nemotron_failure_still_returns_proposal(self):
|
||||
"""Nemotron 失敗後仍 return (proposal, provider, True)"""
|
||||
@@ -189,11 +193,13 @@ class TestNemotronFailureFallback:
|
||||
assert 'proposal["nemotron_validation"]' in source
|
||||
|
||||
def test_nemotron_failure_logs_warning(self):
|
||||
"""Nemotron 失敗時記錄 warning log"""
|
||||
"""Nemotron 失敗時記錄 warning/error log.
|
||||
2026-04-08 Claude Sonnet 4.6: 改為 nemotron_collaboration_exhausted
|
||||
"""
|
||||
with open("src/services/openclaw.py") as f:
|
||||
source = f.read()
|
||||
|
||||
assert "nemotron_collaboration_failed" in source
|
||||
assert "nemotron_collaboration_exhausted" in source
|
||||
|
||||
|
||||
# =============================================================================
|
||||
|
||||
@@ -26,7 +26,8 @@ services:
|
||||
- "9187:9187"
|
||||
environment:
|
||||
# 連線字串 (使用環境變數注入密碼)
|
||||
DATA_SOURCE_NAME: "postgresql://postgres:${POSTGRES_PASSWORD:-awoooi}@localhost:5432/awoooi?sslmode=disable"
|
||||
# 2026-04-08 Claude Sonnet 4.6: 修正用戶名/資料庫名 (awoooi user, awoooi_prod db)
|
||||
DATA_SOURCE_NAME: "postgresql://awoooi:${POSTGRES_PASSWORD:-awoooi_prod_2026}@localhost:5432/awoooi_prod?sslmode=disable"
|
||||
# 自訂查詢配置
|
||||
PG_EXPORTER_EXTEND_QUERY_PATH: "/etc/postgres_exporter/queries.yaml"
|
||||
# 日誌等級
|
||||
|
||||
154
scripts/ops/deploy-docker-health-monitor.sh
Executable file
154
scripts/ops/deploy-docker-health-monitor.sh
Executable file
@@ -0,0 +1,154 @@
|
||||
#!/usr/bin/env bash
|
||||
# scripts/ops/deploy-docker-health-monitor.sh
|
||||
# Sprint 5.2 Plan A: 部署 docker-health-monitor.sh 到 110 和 188 主機
|
||||
#
|
||||
# 用法:
|
||||
# bash scripts/ops/deploy-docker-health-monitor.sh [110|188|all]
|
||||
#
|
||||
# 前置條件:
|
||||
# 1. AWOOOI_HMAC_SECRET 已在各主機 /etc/awoooi-ops/secrets.env 設定
|
||||
# 2. SSH key (~/.ssh/deploy_key) 可連線 110/188(透過 121 跳板)
|
||||
#
|
||||
# 部署項目:
|
||||
# - /opt/awoooi-ops/docker-health-monitor.sh (可執行)
|
||||
# - /etc/awoooi-ops/secrets.env.template (template,不覆蓋現有)
|
||||
# - logrotate: /etc/logrotate.d/docker-health-monitor
|
||||
# - cron: */5 * * * * (wooo user)
|
||||
#
|
||||
# 2026-04-08 Claude Sonnet 4.6 Asia/Taipei (Sprint 5.2 Plan A)
|
||||
|
||||
# Fail fast: abort on any command error, on use of an unset variable, and on
# a failure anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail

# Locate the repository relative to this script so it works from any cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

# Local copy of the monitor script that gets uploaded to each host.
MONITOR_SCRIPT="${REPO_ROOT}/scripts/ops/docker-health-monitor.sh"

# Deployment target selector: 110, 188 or all (default when no argument given).
TARGET="${1:-all}"

# SSH identity and the jump host every connection is proxied through.
SSH_KEY="${HOME}/.ssh/deploy_key"
JUMP_HOST="wooo@192.168.0.121"
|
||||
|
||||
# Run a command on a target host as user wooo, proxied through JUMP_HOST
# (described by the original author as the K3s master).
#   $1    target host / IP
#   $2..  remote command, handed to ssh unchanged
# NOTE(review): StrictHostKeyChecking=no disables host-key verification for
# the connection - confirm this is acceptable on this network.
ssh_cmd() {
    local target_host="$1"
    local -a ssh_opts=(-i "$SSH_KEY" -J "$JUMP_HOST" -o StrictHostKeyChecking=no)
    ssh "${ssh_opts[@]}" "wooo@${target_host}" "${@:2}"
}
|
||||
|
||||
# Copy one local file to a target host as user wooo.
#   $1  local source path
#   $2  target host / IP
#   $3  destination path on the target
# The transfer goes straight to the target using JUMP_HOST as an SSH
# ProxyJump (-J); no intermediate copy is left on the jump host.
scp_cmd() {
    local local_path="$1" target_host="$2" remote_path="$3"
    scp -i "$SSH_KEY" -o StrictHostKeyChecking=no -J "$JUMP_HOST" \
        "$local_path" "wooo@${target_host}:${remote_path}"
}
|
||||
|
||||
# Deploy docker-health-monitor.sh and its supporting configuration to one host.
#   $1  target host / IP (reached through ssh_cmd / scp_cmd)
#   $2  human-readable label, used only in progress output
# Steps: create directories, upload the script, write a secrets template when
# no secrets.env exists, install the logrotate rule, pre-create the log file,
# install the cron entry in wooo's crontab, then run the monitor once.
# Any failing remote command aborts the whole script (errexit), except the
# final trial run, which is best-effort.
deploy_to_host() {
    local host="$1"
    local host_label="$2"
    echo ""
    echo "══════════════════════════════════════════"
    echo " 部署到 ${host_label} (${host})"
    echo "══════════════════════════════════════════"

    # 1. Create the install and config directories; only the install
    #    directory is handed over to wooo.
    echo "→ 建立 /opt/awoooi-ops/ 和 /etc/awoooi-ops/"
    ssh_cmd "$host" "sudo mkdir -p /opt/awoooi-ops /etc/awoooi-ops && sudo chown wooo:wooo /opt/awoooi-ops"

    # 2. Upload to /tmp first (writable by wooo), then move into place with sudo.
    echo "→ 上傳 docker-health-monitor.sh"
    scp_cmd "$MONITOR_SCRIPT" "$host" "/tmp/docker-health-monitor.sh"
    ssh_cmd "$host" "sudo mv /tmp/docker-health-monitor.sh /opt/awoooi-ops/docker-health-monitor.sh && sudo chmod +x /opt/awoooi-ops/docker-health-monitor.sh"

    # 3. Write secrets.env.template only when no real secrets.env exists, so
    #    an already configured host is left untouched. The heredoc body and
    #    its terminator must stay at column 0 of the remote script.
    echo "→ 建立 secrets.env.template"
    ssh_cmd "$host" "
if [ ! -f /etc/awoooi-ops/secrets.env ]; then
sudo tee /etc/awoooi-ops/secrets.env.template > /dev/null << 'SECRETS_TEMPLATE'
# /etc/awoooi-ops/secrets.env
# Sprint 5.1 docker-health-monitor 設定
# 填寫後複製為 secrets.env: cp secrets.env.template secrets.env
# 權限: chmod 600 /etc/awoooi-ops/secrets.env

AWOOOI_API_URL=https://awoooi.wooo.work
TELEGRAM_BOT_TOKEN=CHANGE_ME
TELEGRAM_CHAT_ID=CHANGE_ME
SEND_COOLDOWN_SECONDS=300
SECRETS_TEMPLATE
echo ' ⚠️ 請填寫 /etc/awoooi-ops/secrets.env.template 後重命名為 secrets.env'
else
echo ' ✅ /etc/awoooi-ops/secrets.env 已存在,保留現有設定'
fi
"

    # 4. Install the logrotate rule for the monitor's log file.
    echo "→ 設定 logrotate"
    ssh_cmd "$host" "
sudo tee /etc/logrotate.d/docker-health-monitor > /dev/null << 'LOGROTATE'
/var/log/docker-health-monitor.log {
daily
rotate 7
compress
delaycompress
missingok
notifempty
create 644 wooo wooo
}
LOGROTATE
"

    # 4b. Pre-create the log file owned by wooo. The cron entry below runs in
    #     wooo's crontab and appends to this path; logrotate's "create" only
    #     recreates the file after a rotation, so without this step the first
    #     run depends on wooo being able to create files in /var/log (normally
    #     root-only), and a failed redirect prevents the monitor from running.
    #     touch/chown/chmod are idempotent and keep existing log content.
    echo "→ 建立 log 檔 /var/log/docker-health-monitor.log"
    ssh_cmd "$host" "sudo touch /var/log/docker-health-monitor.log && sudo chown wooo:wooo /var/log/docker-health-monitor.log && sudo chmod 644 /var/log/docker-health-monitor.log"

    # 5. Install the cron entry idempotently: drop any previous line that
    #    mentions the monitor, then append the current one. $cron_line is
    #    expanded locally before the command is sent.
    echo "→ 設定 cron (*/5 * * * *)"
    local cron_line="*/5 * * * * /opt/awoooi-ops/docker-health-monitor.sh >> /var/log/docker-health-monitor.log 2>&1"
    ssh_cmd "$host" "
(crontab -l 2>/dev/null | grep -v 'docker-health-monitor' ; echo '$cron_line') | crontab -
echo ' ✅ cron 已設定'
crontab -l | grep docker-health
"

    # 6. Trial run with the API URL and Telegram token blanked and the log
    #    redirected to /tmp; failures are ignored (|| true).
    #    NOTE(review): this assumes the monitor honours these environment
    #    overrides rather than re-reading secrets.env - confirm.
    echo "→ 測試執行(不傳送 webhook)"
    ssh_cmd "$host" "
LOG_FILE=/tmp/docker-health-monitor-test.log \
AWOOOI_API_URL='' \
TELEGRAM_BOT_TOKEN='' \
bash /opt/awoooi-ops/docker-health-monitor.sh 2>&1 | head -20 || true
"

    echo " ✅ ${host_label} 部署完成"
}
|
||||
|
||||
# Abort early when the monitor script to deploy is missing from the repo.
if [[ ! -f "$MONITOR_SCRIPT" ]]; then
    echo "❌ 找不到: $MONITOR_SCRIPT"
    exit 1
fi

echo "🚀 docker-health-monitor.sh 部署"
echo " 來源: $MONITOR_SCRIPT"
echo " 目標: $TARGET"

# Reject anything other than the three supported selectors.
case "$TARGET" in
    110 | 188 | all) ;;
    *)
        echo "用法: $0 [110|188|all]"
        exit 1
        ;;
esac

# "all" deploys to both hosts, 110 first and then 188.
if [[ "$TARGET" != "188" ]]; then
    deploy_to_host "192.168.0.110" "HOST-110 (Gitea/Harbor/Sentry)"
fi
if [[ "$TARGET" != "110" ]]; then
    deploy_to_host "192.168.0.188" "HOST-188 (OpenClaw/PostgreSQL/Redis)"
fi

# Closing summary with the manual follow-up checklist.
printf '%s\n' \
    "" \
    "🎉 部署完成!" \
    "" \
    "後續步驟:" \
    " 1. 確認 /etc/awoooi-ops/secrets.env 已填寫正確 Token" \
    " 2. 手動執行一次: /opt/awoooi-ops/docker-health-monitor.sh" \
    " 3. 確認 /var/log/docker-health-monitor.log 輸出正常" \
    " 4. 確認 AWOOOI op_log 有 ALERT_RECEIVED 記錄"
|
||||
280
scripts/sprint51_e2e_validation.py
Normal file
280
scripts/sprint51_e2e_validation.py
Normal file
@@ -0,0 +1,280 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Sprint 5.1 Data Safety Guardrails — E2E 驗收腳本
|
||||
用法: python3 scripts/sprint51_e2e_validation.py --api-url http://192.168.0.121:32334
|
||||
|
||||
測試情境:
|
||||
T1: BLOCK 服務告警 → GUARDRAIL_BLOCKED + 無背景修復任務
|
||||
T2: auto_repair=false Prometheus flag → GUARDRAIL_BLOCKED log + 無背景修復
|
||||
T3: AUTO 服務告警 → 正常流程(不被阻擋)
|
||||
T4: docker-health-monitor.sh webhook 格式 → ALERT_RECEIVED 記錄
|
||||
T5: /api/v1/health 健康檢查 + /api/v1/stats/auto-repair 統計端點查詢
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import time
|
||||
import sys
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
from datetime import datetime, timezone
|
||||
|
||||
# 2026-04-08 Claude Sonnet 4.6 Asia/Taipei
|
||||
|
||||
|
||||
def _post(url: str, payload: dict) -> tuple[int, dict]:
    """POST ``payload`` as JSON to ``url`` and return ``(status, body)``.

    Args:
        url: Absolute endpoint URL.
        payload: JSON-serialisable request body.

    Returns:
        ``(http_status, parsed_json_body)``. The body is ``{}`` when the
        response is empty or not valid JSON. When no HTTP response is
        obtained at all (connection refused, DNS failure, timeout) the
        status is ``0`` so callers comparing against ``200`` report a
        failed check instead of the whole run aborting with a traceback.
    """
    data = json.dumps(payload).encode()
    req = urllib.request.Request(
        url,
        data=data,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            status, raw = resp.status, resp.read()
    except urllib.error.HTTPError as e:
        # 4xx/5xx: still surface the error body when it happens to be JSON.
        try:
            return e.code, json.loads(e.read())
        except Exception:
            # Deliberate best-effort: the error body is optional diagnostics.
            return e.code, {}
    except (urllib.error.URLError, OSError) as e:
        # No HTTP response at all (HTTPError is handled above).
        print(f" ⚠️ POST {url} 連線失敗: {e}", file=sys.stderr)
        return 0, {}

    try:
        return status, json.loads(raw)
    except ValueError:
        # Empty or non-JSON success body (JSONDecodeError is a ValueError).
        return status, {}
|
||||
|
||||
|
||||
def _get(url: str) -> tuple[int, dict]:
    """GET ``url`` and return ``(status, parsed_json_body)``.

    Args:
        url: Absolute endpoint URL.

    Returns:
        ``(http_status, body)``. The body is ``{}`` for HTTP error
        responses and for success responses that are empty or not valid
        JSON. When no HTTP response is obtained at all (connection refused,
        DNS failure, timeout) the status is ``0`` so callers comparing
        against ``200`` report a failed check instead of crashing.
    """
    req = urllib.request.Request(url, method="GET")
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            status, raw = resp.status, resp.read()
    except urllib.error.HTTPError as e:
        return e.code, {}
    except (urllib.error.URLError, OSError) as e:
        # No HTTP response at all (HTTPError is handled above).
        print(f" ⚠️ GET {url} 連線失敗: {e}", file=sys.stderr)
        return 0, {}

    try:
        return status, json.loads(raw)
    except ValueError:
        # Empty or non-JSON success body (JSONDecodeError is a ValueError).
        return status, {}
|
||||
|
||||
|
||||
def _alertmanager_payload(alertname: str, labels: dict, instance: str = "test") -> dict:
    """Build a single-alert, firing Alertmanager-style webhook payload.

    Args:
        alertname: Value of the ``alertname`` label; also used in the group
            key, the summary annotation and the fingerprint.
        labels: Extra labels. They are merged last, so they may override the
            defaults ``alertname``, ``instance`` and ``severity="warning"``.
        instance: Value of the ``instance`` label.

    Returns:
        A dict with ``version`` "4" whose ``commonLabels`` and the labels of
        its only alert are the merged label set. ``startsAt`` is the current
        UTC time and the fingerprint embeds the current Unix second.
    """
    label_set = {"alertname": alertname, "instance": instance, "severity": "warning"}
    label_set.update(labels)

    summary_text = f"[E2E Test] {alertname}"
    firing_alert = {
        "status": "firing",
        "labels": label_set,
        "annotations": {"summary": summary_text},
        "startsAt": datetime.now(timezone.utc).isoformat(),
        # Sentinel end time sent for a still-firing alert.
        "endsAt": "0001-01-01T00:00:00Z",
        "generatorURL": "http://prometheus:9090",
        "fingerprint": f"e2e-{alertname}-{int(time.time())}",
    }

    envelope = {
        "version": "4",
        "groupKey": f"test-{alertname}",
        "status": "firing",
        "receiver": "awoooi-api",
        "groupLabels": {"alertname": alertname},
        "commonLabels": label_set,
        "commonAnnotations": {"summary": summary_text},
        "externalURL": "http://192.168.0.110:9093",
        "alerts": [firing_alert],
    }
    return envelope
|
||||
|
||||
|
||||
def _check_op_log(api_url: str, event_type: str, lookback_secs: int = 30) -> bool:
    """Report whether the operation log holds a recent entry of ``event_type``.

    Fetches the latest 20 entries from ``/api/v1/operation-log`` and looks
    for one whose ``event_type`` matches and whose ``created_at`` is newer
    than ``lookback_secs`` seconds ago.

    Args:
        api_url: Base URL of the API.
        event_type: Event type to look for.
        lookback_secs: Size of the "recent" window in seconds.

    Returns:
        True when a matching recent entry exists; False otherwise, including
        when the endpoint does not answer with HTTP 200.
    """
    status, body = _get(f"{api_url}/api/v1/operation-log?limit=20")
    if status != 200:
        return False

    oldest_allowed = time.time() - lookback_secs

    def _is_recent_match(entry) -> bool:
        raw_ts = entry.get("created_at", "")
        try:
            # A trailing "Z" is rewritten to an explicit UTC offset before parsing.
            created = datetime.fromisoformat(raw_ts.replace("Z", "+00:00")).timestamp()
            return created > oldest_allowed and entry.get("event_type") == event_type
        except Exception:
            # Entries with a missing or unparsable timestamp are skipped.
            return False

    return any(_is_recent_match(entry) for entry in body.get("items", []))
|
||||
|
||||
|
||||
# Status markers prefixed to each check line in the console report.
PASS, FAIL, SKIP = "✅", "❌", "⏭️"
|
||||
|
||||
|
||||
def run_t1_block_service(api_url: str) -> bool:
    """T1: a PostgreSQL alert must produce a GUARDRAIL_BLOCKED log entry.

    Posts a ``PostgreSQLDown`` alert labelled ``auto_repair=true``, waits
    three seconds, then looks for a recent ``GUARDRAIL_BLOCKED`` entry in the
    operation log.

    Args:
        api_url: Base URL of the API under test.

    Returns:
        True when the webhook answered HTTP 200 and the entry was found.
    """
    print("\n── T1: BLOCK 服務告警 (PostgreSQL) ──")
    alert_labels = {
        "job": "postgres-exporter",
        # The rule says "true"; the Service Registry is expected to override it.
        "auto_repair": "true",
        "layer": "systemd-188",
        "component": "postgres",
    }
    code, _ = _post(
        f"{api_url}/api/v1/webhooks/alertmanager",
        _alertmanager_payload("PostgreSQLDown", alert_labels),
    )
    print(f" POST /webhooks/alertmanager → HTTP {code}")

    # Leave time for the asynchronous processing to finish.
    time.sleep(3)

    found = _check_op_log(api_url, "GUARDRAIL_BLOCKED")
    print(
        f" {PASS} alert_operation_log 有 GUARDRAIL_BLOCKED 記錄"
        if found
        else f" {FAIL} 未找到 GUARDRAIL_BLOCKED 記錄(可能未部署 Sprint 5.1)"
    )

    return code == 200 and found
|
||||
|
||||
|
||||
def run_t2_auto_repair_false_flag(api_url: str) -> bool:
    """T2: an alert labelled ``auto_repair=false`` must be logged and blocked.

    Posts a ``KaliScannerDown`` alert carrying ``auto_repair=false``, waits
    three seconds, then looks for recent ``ALERT_RECEIVED`` and
    ``GUARDRAIL_BLOCKED`` entries in the operation log.

    Args:
        api_url: Base URL of the API under test.

    Returns:
        True only when the webhook answered HTTP 200 and BOTH entries were
        found. The GUARDRAIL_BLOCKED check used to be printed but ignored in
        the result, so the scenario passed even when the flag had no effect.
    """
    print("\n── T2: auto_repair=false flag (KaliScannerDown) ──")
    payload = _alertmanager_payload(
        "KaliScannerDown",
        {
            "auto_repair": "false",  # the Prometheus rule opts out of auto-repair
            "layer": "docker-188",
            "component": "kali",
            "severity": "info",
        },
    )
    code, _ = _post(f"{api_url}/api/v1/webhooks/alertmanager", payload)
    print(f" POST /webhooks/alertmanager → HTTP {code}")
    time.sleep(3)  # leave time for asynchronous processing

    received = _check_op_log(api_url, "ALERT_RECEIVED")
    # NOTE(review): _check_op_log matches any entry of that type inside its
    # lookback window, so a GUARDRAIL_BLOCKED written by a scenario that ran
    # a few seconds earlier can also satisfy this check.
    blocked = _check_op_log(api_url, "GUARDRAIL_BLOCKED")

    if received:
        print(f" {PASS} ALERT_RECEIVED 已記錄")
    else:
        print(f" {FAIL} 未找到 ALERT_RECEIVED 記錄")

    if blocked:
        print(f" {PASS} GUARDRAIL_BLOCKED 已記錄(auto_repair=false flag 生效)")
    else:
        print(f" {FAIL} 未找到 GUARDRAIL_BLOCKED(flag 未生效)")

    # Every check reported above must hold for the scenario to pass.
    return code == 200 and received and blocked
|
||||
|
||||
|
||||
def run_t3_auto_service(api_url: str) -> bool:
    """T3: an alert for an AUTO service must be accepted and logged.

    Posts a ``KubePodNotReady`` alert labelled ``auto_repair=true``, waits
    three seconds, then looks for a recent ``ALERT_RECEIVED`` entry.

    Args:
        api_url: Base URL of the API under test.

    Returns:
        True when the webhook answered HTTP 200 and the entry was found.
    """
    print("\n── T3: AUTO 服務告警 (KubePodNotReady) ──")
    alert_labels = {
        "auto_repair": "true",
        "layer": "k8s",
        "namespace": "awoooi-prod",
        "pod": "test-pod-e2e",
    }
    code, _ = _post(
        f"{api_url}/api/v1/webhooks/alertmanager",
        _alertmanager_payload("KubePodNotReady", alert_labels),
    )
    print(f" POST /webhooks/alertmanager → HTTP {code}")
    time.sleep(3)

    # Only ALERT_RECEIVED is verified here; the absence of GUARDRAIL_BLOCKED
    # is expected for this scenario but is not asserted.
    received = _check_op_log(api_url, "ALERT_RECEIVED")
    print(
        f" {PASS} ALERT_RECEIVED 已記錄,AUTO 服務進入正常流程"
        if received
        else f" {FAIL} 未找到 ALERT_RECEIVED"
    )

    return code == 200 and received
|
||||
|
||||
|
||||
def run_t4_docker_health_monitor(api_url: str) -> bool:
    """T4: a docker-health-monitor style alert must be accepted and logged.

    Posts a ``DockerContainerExited`` alert with the labels the monitor
    script is expected to send (``source=docker-health-monitor``), waits two
    seconds, then looks for a recent ``ALERT_RECEIVED`` entry.

    Args:
        api_url: Base URL of the API under test.

    Returns:
        True only when the webhook answered HTTP 200 AND the entry was
        found. The log check used to be printed but ignored in the result,
        so the scenario passed even after reporting a failure.
    """
    print("\n── T4: docker-health-monitor webhook 格式 ──")
    # Mirrors the payload format of send_to_awoooi() in docker-health-monitor.sh.
    payload = _alertmanager_payload(
        "DockerContainerExited",
        {
            "auto_repair": "true",
            "layer": "docker",
            "host": "188",
            "container": "test-container-e2e",
            "source": "docker-health-monitor",
        },
    )
    code, _ = _post(f"{api_url}/api/v1/webhooks/alertmanager", payload)
    print(f" POST /webhooks/alertmanager → HTTP {code}")
    time.sleep(2)  # leave time for asynchronous processing

    received = _check_op_log(api_url, "ALERT_RECEIVED")
    if received:
        print(f" {PASS} ALERT_RECEIVED 已記錄,docker-health-monitor 格式相容")
    else:
        print(f" {FAIL} 未找到 ALERT_RECEIVED")

    # The reported log check is part of the verdict, not just the HTTP status.
    return code == 200 and received
|
||||
|
||||
|
||||
def run_t5_service_registry_api(api_url: str) -> bool:
    """T5: API health check plus an optional auto-repair stats probe.

    Calls ``/api/v1/health`` and prints the reported ``version`` (falling
    back to ``git_sha``, then "unknown"). Afterwards probes
    ``/api/v1/stats/auto-repair``; a non-200 answer there is only reported
    as skipped and does not affect the result.

    Args:
        api_url: Base URL of the API under test.

    Returns:
        True when the health endpoint answered HTTP 200, otherwise False.
    """
    print("\n── T5: Health Check + 系統狀態 ──")
    code, data = _get(f"{api_url}/api/v1/health")
    print(f" GET /api/v1/health → HTTP {code}")

    if code != 200:
        print(f" {FAIL} API 健康檢查失敗")
        return False

    print(f" {PASS} API 健康")
    fallback_version = data.get("git_sha", "unknown")
    version = data.get("version", fallback_version)
    print(f" 版本: {version}")

    # Informational probe of the auto-repair statistics endpoint.
    code2, _ = _get(f"{api_url}/api/v1/stats/auto-repair")
    if code2 != 200:
        print(f" {SKIP} auto_repair stats 端點: HTTP {code2}(可接受)")
    else:
        print(f" {PASS} auto_repair stats 端點正常")

    # Reaching this point implies the health check returned 200.
    return True
|
||||
|
||||
|
||||
def main():
    """Run scenarios T1-T5 in order, print a summary and exit.

    Command-line options:
        --api-url  Base URL of the API under test.
        --json     Additionally print the results as one JSON line.

    The process exits with status 0 when every scenario passed, else 1.
    """
    parser = argparse.ArgumentParser(description="Sprint 5.1 E2E 驗收")
    parser.add_argument("--api-url", default="http://192.168.0.121:32334")
    parser.add_argument("--json", action="store_true")
    args = parser.parse_args()

    print("🧪 Sprint 5.1 Data Safety Guardrails E2E 驗收")
    print(f" API: {args.api_url}")
    started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f" 時間: {started_at}")

    # Scenarios run sequentially in this order; the key is the report label.
    scenarios = (
        ("T1_block_service", run_t1_block_service),
        ("T2_auto_repair_false", run_t2_auto_repair_false_flag),
        ("T3_auto_service", run_t3_auto_service),
        ("T4_docker_health_monitor", run_t4_docker_health_monitor),
        ("T5_health_check", run_t5_service_registry_api),
    )
    results = {label: runner(args.api_url) for label, runner in scenarios}

    total = len(results)
    passed = sum(1 for outcome in results.values() if outcome)

    rule = "═" * 50
    print(f"\n{rule}")
    print(f" 結果: {passed}/{total} 通過")
    for label, outcome in results.items():
        mark = "✅" if outcome else "❌"
        print(f" {mark} {label}")
    print(rule)

    if args.json:
        print(json.dumps({"passed": passed, "total": total, "results": results}))

    sys.exit(1 if passed != total else 0)


# Run the validation only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user