From 22a4b44aef5e5795f34611e6fe155fd862bb560f Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 24 May 2026 10:59:21 +0800 Subject: [PATCH] fix(ci): report provider degradation as warning --- .gitea/workflows/cd.yaml | 8 +-- .../tests/test_alert_chain_smoke_metric.py | 58 +++++++++++++++++++ scripts/alert_chain_smoke_test.py | 21 ++++--- 3 files changed, 74 insertions(+), 13 deletions(-) diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index 74899c3d..6391ec6c 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -1383,10 +1383,10 @@ jobs: - name: Notify Health Check Success env: - SMOKE_RESULT: ${{ steps.smoke.outcome == 'success' && '✅' || '⚠️' }} - ALERT_CHAIN_RESULT: ${{ steps.alert_chain_smoke.outcome == 'success' && '✅' || '⚠️' }} - MONITORING_RESULT: ${{ steps.monitoring_coverage.outcome == 'success' && '✅' || '⚠️' }} - SOURCE_LINK_RESULT: ${{ steps.source_correlation_apply_smoke.outcome == 'success' && '✅' || '⚠️' }} + SMOKE_RESULT: ${{ steps.smoke.outputs.smoke_status == 'pass' && '✅' || '⚠️' }} + ALERT_CHAIN_RESULT: ${{ steps.alert_chain_smoke.outputs.alert_chain_status == 'pass' && '✅' || '⚠️' }} + MONITORING_RESULT: ${{ steps.monitoring_coverage.outputs.coverage_status == 'pass' && '✅' || '⚠️' }} + SOURCE_LINK_RESULT: ${{ steps.source_correlation_apply_smoke.outputs.source_correlation_apply_status == 'pass' && '✅' || '⚠️' }} run: | END_TIME=$(date +%s) DURATION=$((END_TIME - ${{ steps.commit.outputs.start_time }})) diff --git a/apps/api/tests/test_alert_chain_smoke_metric.py b/apps/api/tests/test_alert_chain_smoke_metric.py index 39bf64c7..47ec0799 100644 --- a/apps/api/tests/test_alert_chain_smoke_metric.py +++ b/apps/api/tests/test_alert_chain_smoke_metric.py @@ -17,6 +17,64 @@ SPEC.loader.exec_module(alert_chain_smoke_test) class AlertChainSmokeMetricTest(unittest.TestCase): + def test_api_health_passes_when_only_provider_is_degraded(self): + def fake_get(url, *, params=None, timeout=None): + self.assertTrue(url.endswith("/api/v1/health")) + return alert_chain_smoke_test.HttpGetResult( + 200, + json.dumps( + { + "status": "degraded", + "environment": "prod", + "components": { + "api": {"status": "up"}, + "postgresql": {"status": "up"}, + "redis": {"status": "up"}, + "ollama": {"status": "down", "error": "timeout"}, + "signoz": {"status": "up"}, + }, + } + ), + ) + + original_get = alert_chain_smoke_test.http_get + try: + alert_chain_smoke_test.http_get = fake_get + result = alert_chain_smoke_test.check_api_health("http://api") + finally: + alert_chain_smoke_test.http_get = original_get + + self.assertTrue(result.passed) + self.assertIn("非阻塞降級: ollama", result.message) + + def test_api_health_fails_when_core_component_is_down(self): + def fake_get(url, *, params=None, timeout=None): + self.assertTrue(url.endswith("/api/v1/health")) + return alert_chain_smoke_test.HttpGetResult( + 200, + json.dumps( + { + "status": "degraded", + "components": { + "api": {"status": "up"}, + "postgresql": {"status": "down"}, + "redis": {"status": "up"}, + "ollama": {"status": "up"}, + }, + } + ), + ) + + original_get = alert_chain_smoke_test.http_get + try: + alert_chain_smoke_test.http_get = fake_get + result = alert_chain_smoke_test.check_api_health("http://api") + finally: + alert_chain_smoke_test.http_get = original_get + + self.assertFalse(result.passed) + self.assertIn("核心組件異常: postgresql", result.message) + def test_parse_app_alert_chain_metric_samples(self): samples = alert_chain_smoke_test.parse_app_alert_chain_metric_samples( "\n".join([ diff --git a/scripts/alert_chain_smoke_test.py b/scripts/alert_chain_smoke_test.py index 40a61e72..1f7a988f 100644 --- a/scripts/alert_chain_smoke_test.py +++ b/scripts/alert_chain_smoke_test.py @@ -5,7 +5,7 @@ AWOOOI Alert Chain Smoke Test Wave A.6 (ADR-037): 驗證告警鏈路 E2E 完整性 檢查項目: - 1. API Health — /api/v1/health 全組件 UP + 1. API Health — /api/v1/health 核心組件 UP,AI provider 降級列為警告 2. Alert Chain Metric — awoooi_alert_chain_last_success_timestamp 不超過 2h 3. Webhook 可達性 — /api/v1/webhooks/alertmanager, /signoz, /sentry health 4. Telegram Secret — K8s Secret 存在且非空 @@ -215,7 +215,7 @@ class SmokeTestReport: # 檢查函數 # ============================================================================= def check_api_health(api_url: str) -> CheckResult: - """Check 1: API Health — 所有組件必須 UP""" + """Check 1: API Health — core runtime must be up; provider degradation is warning evidence.""" try: resp = http_get(f"{api_url}/api/v1/health", timeout=TIMEOUT) data = resp.json() @@ -223,25 +223,28 @@ def check_api_health(api_url: str) -> CheckResult: if resp.status_code >= 400: return CheckResult("API Health", False, f"HTTP {resp.status_code}") - if data.get("status") != "healthy": + components = data.get("components", {}) + core_components = ("api", "postgresql", "redis") + down_core_components = [ + name for name in core_components + if components.get(name, {}).get("status") != "up" + ] + if down_core_components: return CheckResult( "API Health", False, - f"API status={data.get('status')} (expected healthy)", + f"核心組件異常: {', '.join(down_core_components)}", ) - # 檢查每個組件 - components = data.get("components", {}) down_components = [ name for name, info in components.items() if info.get("status") != "up" ] - if down_components: return CheckResult( "API Health", - False, - f"組件異常: {', '.join(down_components)}", + True, + f"核心組件 UP;非阻塞降級: {', '.join(down_components)}", ) return CheckResult(