fix(ci): report provider degradation as warning
This commit is contained in:
@@ -1383,10 +1383,10 @@ jobs:
|
||||
|
||||
- name: Notify Health Check Success
|
||||
env:
|
||||
SMOKE_RESULT: ${{ steps.smoke.outcome == 'success' && '✅' || '⚠️' }}
|
||||
ALERT_CHAIN_RESULT: ${{ steps.alert_chain_smoke.outcome == 'success' && '✅' || '⚠️' }}
|
||||
MONITORING_RESULT: ${{ steps.monitoring_coverage.outcome == 'success' && '✅' || '⚠️' }}
|
||||
SOURCE_LINK_RESULT: ${{ steps.source_correlation_apply_smoke.outcome == 'success' && '✅' || '⚠️' }}
|
||||
SMOKE_RESULT: ${{ steps.smoke.outputs.smoke_status == 'pass' && '✅' || '⚠️' }}
|
||||
ALERT_CHAIN_RESULT: ${{ steps.alert_chain_smoke.outputs.alert_chain_status == 'pass' && '✅' || '⚠️' }}
|
||||
MONITORING_RESULT: ${{ steps.monitoring_coverage.outputs.coverage_status == 'pass' && '✅' || '⚠️' }}
|
||||
SOURCE_LINK_RESULT: ${{ steps.source_correlation_apply_smoke.outputs.source_correlation_apply_status == 'pass' && '✅' || '⚠️' }}
|
||||
run: |
|
||||
END_TIME=$(date +%s)
|
||||
DURATION=$((END_TIME - ${{ steps.commit.outputs.start_time }}))
|
||||
|
||||
@@ -17,6 +17,64 @@ SPEC.loader.exec_module(alert_chain_smoke_test)
|
||||
|
||||
|
||||
class AlertChainSmokeMetricTest(unittest.TestCase):
|
||||
def test_api_health_passes_when_only_provider_is_degraded(self):
    """A degraded AI provider alone must not fail the API health check.

    The stubbed health endpoint answers HTTP 200 with overall status
    ``degraded``: api, postgresql, redis and signoz are ``up`` and only
    ``ollama`` is ``down``. The check is expected to pass and to name
    ``ollama`` in its message.
    """
    health_payload = {
        "status": "degraded",
        "environment": "prod",
        "components": {
            "api": {"status": "up"},
            "postgresql": {"status": "up"},
            "redis": {"status": "up"},
            "ollama": {"status": "down", "error": "timeout"},
            "signoz": {"status": "up"},
        },
    }

    def stub_get(url, *, params=None, timeout=None):
        # Only the health endpoint is expected to be requested.
        self.assertTrue(url.endswith("/api/v1/health"))
        body = json.dumps(health_payload)
        return alert_chain_smoke_test.HttpGetResult(200, body)

    # Swap the module-level HTTP helper for the stub and always restore it,
    # so other tests see the real implementation again.
    saved_http_get = alert_chain_smoke_test.http_get
    alert_chain_smoke_test.http_get = stub_get
    try:
        outcome = alert_chain_smoke_test.check_api_health("http://api")
    finally:
        alert_chain_smoke_test.http_get = saved_http_get

    self.assertTrue(outcome.passed)
    self.assertIn("非阻塞降級: ollama", outcome.message)
|
||||
|
||||
def test_api_health_fails_when_core_component_is_down(self):
    """A core component reported as down must fail the API health check.

    The stubbed health endpoint answers HTTP 200 with overall status
    ``degraded`` and ``postgresql`` marked ``down`` while api, redis and
    ollama are ``up``. The check is expected to fail and to name
    ``postgresql`` in its message.
    """
    health_payload = {
        "status": "degraded",
        "components": {
            "api": {"status": "up"},
            "postgresql": {"status": "down"},
            "redis": {"status": "up"},
            "ollama": {"status": "up"},
        },
    }

    def stub_get(url, *, params=None, timeout=None):
        # Only the health endpoint is expected to be requested.
        self.assertTrue(url.endswith("/api/v1/health"))
        body = json.dumps(health_payload)
        return alert_chain_smoke_test.HttpGetResult(200, body)

    # Swap the module-level HTTP helper for the stub and always restore it,
    # so other tests see the real implementation again.
    saved_http_get = alert_chain_smoke_test.http_get
    alert_chain_smoke_test.http_get = stub_get
    try:
        outcome = alert_chain_smoke_test.check_api_health("http://api")
    finally:
        alert_chain_smoke_test.http_get = saved_http_get

    self.assertFalse(outcome.passed)
    self.assertIn("核心組件異常: postgresql", outcome.message)
|
||||
|
||||
def test_parse_app_alert_chain_metric_samples(self):
|
||||
samples = alert_chain_smoke_test.parse_app_alert_chain_metric_samples(
|
||||
"\n".join([
|
||||
|
||||
@@ -5,7 +5,7 @@ AWOOOI Alert Chain Smoke Test
|
||||
Wave A.6 (ADR-037): 驗證告警鏈路 E2E 完整性
|
||||
|
||||
檢查項目:
|
||||
1. API Health — /api/v1/health 全組件 UP
|
||||
1. API Health — /api/v1/health 核心組件 UP,AI provider 降級列為警告
|
||||
2. Alert Chain Metric — awoooi_alert_chain_last_success_timestamp 不超過 2h
|
||||
3. Webhook 可達性 — /api/v1/webhooks/alertmanager, /signoz, /sentry health
|
||||
4. Telegram Secret — K8s Secret 存在且非空
|
||||
@@ -215,7 +215,7 @@ class SmokeTestReport:
|
||||
# 檢查函數
|
||||
# =============================================================================
|
||||
def check_api_health(api_url: str) -> CheckResult:
|
||||
"""Check 1: API Health — 所有組件必須 UP"""
|
||||
"""Check 1: API Health — core runtime must be up; provider degradation is warning evidence."""
|
||||
try:
|
||||
resp = http_get(f"{api_url}/api/v1/health", timeout=TIMEOUT)
|
||||
data = resp.json()
|
||||
@@ -223,25 +223,28 @@ def check_api_health(api_url: str) -> CheckResult:
|
||||
if resp.status_code >= 400:
|
||||
return CheckResult("API Health", False, f"HTTP {resp.status_code}")
|
||||
|
||||
if data.get("status") != "healthy":
|
||||
components = data.get("components", {})
|
||||
core_components = ("api", "postgresql", "redis")
|
||||
down_core_components = [
|
||||
name for name in core_components
|
||||
if components.get(name, {}).get("status") != "up"
|
||||
]
|
||||
if down_core_components:
|
||||
return CheckResult(
|
||||
"API Health",
|
||||
False,
|
||||
f"API status={data.get('status')} (expected healthy)",
|
||||
f"核心組件異常: {', '.join(down_core_components)}",
|
||||
)
|
||||
|
||||
# 檢查每個組件
|
||||
components = data.get("components", {})
|
||||
down_components = [
|
||||
name for name, info in components.items()
|
||||
if info.get("status") != "up"
|
||||
]
|
||||
|
||||
if down_components:
|
||||
return CheckResult(
|
||||
"API Health",
|
||||
False,
|
||||
f"組件異常: {', '.join(down_components)}",
|
||||
True,
|
||||
f"核心組件 UP;非阻塞降級: {', '.join(down_components)}",
|
||||
)
|
||||
|
||||
return CheckResult(
|
||||
|
||||
Reference in New Issue
Block a user