fix(ci): report provider degradation as warning
All checks were successful
CD Pipeline / tests (push) Successful in 5m55s
Code Review / ai-code-review (push) Successful in 11s
CD Pipeline / build-and-deploy (push) Successful in 3m59s
CD Pipeline / post-deploy-checks (push) Successful in 1m48s

This commit is contained in:
Your Name
2026-05-24 10:59:21 +08:00
parent f3b85cda4f
commit 22a4b44aef
3 changed files with 74 additions and 13 deletions

View File

@@ -1383,10 +1383,10 @@ jobs:
- name: Notify Health Check Success
env:
SMOKE_RESULT: ${{ steps.smoke.outcome == 'success' && '✅' || '⚠️' }}
ALERT_CHAIN_RESULT: ${{ steps.alert_chain_smoke.outcome == 'success' && '✅' || '⚠️' }}
MONITORING_RESULT: ${{ steps.monitoring_coverage.outcome == 'success' && '✅' || '⚠️' }}
SOURCE_LINK_RESULT: ${{ steps.source_correlation_apply_smoke.outcome == 'success' && '✅' || '⚠️' }}
SMOKE_RESULT: ${{ steps.smoke.outputs.smoke_status == 'pass' && '✅' || '⚠️' }}
ALERT_CHAIN_RESULT: ${{ steps.alert_chain_smoke.outputs.alert_chain_status == 'pass' && '✅' || '⚠️' }}
MONITORING_RESULT: ${{ steps.monitoring_coverage.outputs.coverage_status == 'pass' && '✅' || '⚠️' }}
SOURCE_LINK_RESULT: ${{ steps.source_correlation_apply_smoke.outputs.source_correlation_apply_status == 'pass' && '✅' || '⚠️' }}
run: |
END_TIME=$(date +%s)
DURATION=$((END_TIME - ${{ steps.commit.outputs.start_time }}))

View File

@@ -17,6 +17,64 @@ SPEC.loader.exec_module(alert_chain_smoke_test)
class AlertChainSmokeMetricTest(unittest.TestCase):
def test_api_health_passes_when_only_provider_is_degraded(self):
    # Scenario: every core component reports "up" while the ollama
    # provider is down; the health check is expected to pass and to
    # name the degraded provider in its message.
    health_payload = {
        "status": "degraded",
        "environment": "prod",
        "components": {
            "api": {"status": "up"},
            "postgresql": {"status": "up"},
            "redis": {"status": "up"},
            "ollama": {"status": "down", "error": "timeout"},
            "signoz": {"status": "up"},
        },
    }

    def stub_http_get(url, *, params=None, timeout=None):
        # The check under test should only query the health endpoint.
        self.assertTrue(url.endswith("/api/v1/health"))
        return alert_chain_smoke_test.HttpGetResult(
            200,
            json.dumps(health_payload),
        )

    # Swap the module-level HTTP helper for the stub and always restore
    # it, so other tests see the real implementation.
    saved_http_get = alert_chain_smoke_test.http_get
    alert_chain_smoke_test.http_get = stub_http_get
    try:
        outcome = alert_chain_smoke_test.check_api_health("http://api")
    finally:
        alert_chain_smoke_test.http_get = saved_http_get

    self.assertTrue(outcome.passed)
    self.assertIn("非阻塞降級: ollama", outcome.message)
def test_api_health_fails_when_core_component_is_down(self):
    # Scenario: postgresql (a core component) is down while everything
    # else is up; the health check is expected to fail and to name the
    # broken core component in its message.
    health_payload = {
        "status": "degraded",
        "components": {
            "api": {"status": "up"},
            "postgresql": {"status": "down"},
            "redis": {"status": "up"},
            "ollama": {"status": "up"},
        },
    }

    def stub_http_get(url, *, params=None, timeout=None):
        # The check under test should only query the health endpoint.
        self.assertTrue(url.endswith("/api/v1/health"))
        return alert_chain_smoke_test.HttpGetResult(
            200,
            json.dumps(health_payload),
        )

    # Swap the module-level HTTP helper for the stub and always restore
    # it, so other tests see the real implementation.
    saved_http_get = alert_chain_smoke_test.http_get
    alert_chain_smoke_test.http_get = stub_http_get
    try:
        outcome = alert_chain_smoke_test.check_api_health("http://api")
    finally:
        alert_chain_smoke_test.http_get = saved_http_get

    self.assertFalse(outcome.passed)
    self.assertIn("核心組件異常: postgresql", outcome.message)
def test_parse_app_alert_chain_metric_samples(self):
samples = alert_chain_smoke_test.parse_app_alert_chain_metric_samples(
"\n".join([

View File

@@ -5,7 +5,7 @@ AWOOOI Alert Chain Smoke Test
Wave A.6 (ADR-037): 驗證告警鏈路 E2E 完整性
檢查項目:
1. API Health — /api/v1/health 組件 UP
1. API Health — /api/v1/health 核心組件 UP;AI provider 降級列為警告
2. Alert Chain Metric — awoooi_alert_chain_last_success_timestamp 不超過 2h
3. Webhook 可達性 — /api/v1/webhooks/alertmanager, /signoz, /sentry health
4. Telegram Secret — K8s Secret 存在且非空
@@ -215,7 +215,7 @@ class SmokeTestReport:
# 檢查函數
# =============================================================================
def check_api_health(api_url: str) -> CheckResult:
"""Check 1: API Health — 所有組件必須 UP"""
"""Check 1: API Health — core runtime must be up; provider degradation is warning evidence."""
try:
resp = http_get(f"{api_url}/api/v1/health", timeout=TIMEOUT)
data = resp.json()
@@ -223,25 +223,28 @@ def check_api_health(api_url: str) -> CheckResult:
if resp.status_code >= 400:
return CheckResult("API Health", False, f"HTTP {resp.status_code}")
if data.get("status") != "healthy":
components = data.get("components", {})
core_components = ("api", "postgresql", "redis")
down_core_components = [
name for name in core_components
if components.get(name, {}).get("status") != "up"
]
if down_core_components:
return CheckResult(
"API Health",
False,
f"API status={data.get('status')} (expected healthy)",
f"核心組件異常: {', '.join(down_core_components)}",
)
# 檢查每個組件
components = data.get("components", {})
down_components = [
name for name, info in components.items()
if info.get("status") != "up"
]
if down_components:
return CheckResult(
"API Health",
False,
f"組件異常: {', '.join(down_components)}",
True,
f"核心組件 UP非阻塞降級: {', '.join(down_components)}",
)
return CheckResult(