diff --git a/.gitea/workflows/e2e-health.yaml b/.gitea/workflows/e2e-health.yaml index 865c94f4..90472aa8 100644 --- a/.gitea/workflows/e2e-health.yaml +++ b/.gitea/workflows/e2e-health.yaml @@ -61,6 +61,7 @@ jobs: AWOOOP_OPERATOR_ID=gitea-e2e-health \ python3 scripts/alert_chain_smoke_test.py \ --api-url https://awoooi.wooo.work \ + --metrics-api-url http://192.168.0.125:32334 \ --source-provider-heartbeat \ --json diff --git a/apps/api/tests/test_alert_chain_smoke_metric.py b/apps/api/tests/test_alert_chain_smoke_metric.py index f8bdc2ca..c840cf47 100644 --- a/apps/api/tests/test_alert_chain_smoke_metric.py +++ b/apps/api/tests/test_alert_chain_smoke_metric.py @@ -1,6 +1,7 @@ from __future__ import annotations import importlib.util +import json import sys import time import unittest @@ -78,6 +79,50 @@ class AlertChainSmokeMetricTest(unittest.TestCase): self.assertFalse(result.passed) self.assertTrue(result.critical) + def test_alert_chain_metric_checks_app_metric_when_prometheus_is_stale(self): + fresh_ts = time.time() - 30 + stale_ts = time.time() - alert_chain_smoke_test.MAX_ALERT_CHAIN_SILENCE_SECONDS - 60 + + def fake_get(url, *, params=None, timeout=None): + if url.endswith("/api/v1/query"): + return alert_chain_smoke_test.HttpGetResult( + 200, + json.dumps( + { + "data": { + "result": [ + { + "metric": {"source": "sentry"}, + "value": [time.time(), str(stale_ts)], + } + ] + } + } + ), + ) + if url.endswith("/metrics"): + return alert_chain_smoke_test.HttpGetResult( + 200, + 'awoooi_alert_chain_last_success_timestamp{source="sentry"} ' + f"{fresh_ts}", + ) + raise AssertionError(f"unexpected url {url}") + + original_get = alert_chain_smoke_test.http_get + try: + alert_chain_smoke_test.http_get = fake_get + result = alert_chain_smoke_test.check_alert_chain_metric( + "http://prometheus", + "http://api", + source="sentry", + ) + finally: + alert_chain_smoke_test.http_get = original_get + + self.assertTrue(result.passed) + self.assertIn("app_metrics", result.message) + self.assertIn("Prometheus scrape 尚未看到", result.message) + def test_source_provider_heartbeat_requires_operator_key(self): result = alert_chain_smoke_test.send_source_provider_heartbeat( "https://awoooi.example", diff --git a/scripts/alert_chain_smoke_test.py b/scripts/alert_chain_smoke_test.py index 782e4c6c..771aa6da 100644 --- a/scripts/alert_chain_smoke_test.py +++ b/scripts/alert_chain_smoke_test.py @@ -364,6 +364,7 @@ def check_alert_chain_metric( "awoooi_alert_chain_last_success_timestamp" f'{{source="{_escape_prometheus_label_value(source)}"}}' ) + prometheus_result: CheckResult | None = None try: resp = http_get( f"{prometheus_url}/api/v1/query", @@ -379,11 +380,17 @@ def check_alert_chain_metric( source, ) if sample: - return _alert_chain_metric_result(sample) + prometheus_result = _alert_chain_metric_result(sample) + if prometheus_result.passed: + return prometheus_result except (URLError, TimeoutError, OSError, json.JSONDecodeError) as e: prometheus_error = _http_error_message(e) else: - prometheus_error = "Prometheus 未抓到" + prometheus_error = ( + prometheus_result.message + if prometheus_result is not None + else "Prometheus 未抓到" + ) try: app_resp = http_get(f"{api_url}/metrics", timeout=TIMEOUT) @@ -394,8 +401,13 @@ def check_alert_chain_metric( source, ) if app_sample: - return _alert_chain_metric_result(app_sample, fallback=True) + app_result = _alert_chain_metric_result(app_sample, fallback=True) + if app_result.passed or prometheus_result is None: + return app_result + return prometheus_result except (URLError, TimeoutError, OSError) as e: + if prometheus_result is not None: + return prometheus_result return CheckResult( "Alert Chain Metric", False, @@ -406,6 +418,9 @@ def check_alert_chain_metric( critical=False, ) + if prometheus_result is not None: + return prometheus_result + return CheckResult( "Alert Chain Metric", False, @@ -610,6 +625,7 @@ def run_smoke_test( api_url: str, fail_fast: bool = False, *, + metrics_api_url: str | None = None, source_provider_heartbeat: bool = False, source_providers: list[str] | None = None, operator_key: str | None = None, @@ -617,9 +633,12 @@ def run_smoke_test( run_ref: str | None = None, ) -> SmokeTestReport: report = SmokeTestReport() + metrics_url = metrics_api_url or api_url print("\n🔍 AWOOOI Alert Chain Smoke Test") print(f" API: {api_url}") + if metrics_url != api_url: + print(f" Metrics API: {metrics_url}") print(f" 時間: {time.strftime('%Y-%m-%d %H:%M:%S %Z')}") print("-" * 50) @@ -629,7 +648,7 @@ def run_smoke_test( return report # Check 2: Alert Chain Metric - report.add(check_alert_chain_metric(PROMETHEUS_URL, api_url)) + report.add(check_alert_chain_metric(PROMETHEUS_URL, metrics_url)) # Check 3: Webhook Health for result in check_webhook_health(api_url): @@ -652,7 +671,13 @@ def run_smoke_test( if heartbeat_result.passed: for source in provider_list: - report.add(check_alert_chain_metric(PROMETHEUS_URL, api_url, source=source)) + report.add( + check_alert_chain_metric( + PROMETHEUS_URL, + metrics_url, + source=source, + ) + ) # Check 4: SigNoz report.add(check_signoz_reachable(SIGNOZ_URL)) @@ -671,6 +696,14 @@ def main() -> int: parser.add_argument( "--api-url", default=DEFAULT_API_URL, help="API base URL" ) + parser.add_argument( + "--metrics-api-url", + default=os.environ.get("ALERT_CHAIN_METRICS_API_URL"), + help=( + "API base URL used only for /metrics fallback; useful when public " + "API routes /metrics to the frontend" + ), + ) parser.add_argument( "--fail-fast", action="store_true", help="第一個 critical 失敗即中止" ) @@ -708,6 +741,7 @@ def main() -> int: report = run_smoke_test( args.api_url, args.fail_fast, + metrics_api_url=args.metrics_api_url, source_provider_heartbeat=args.source_provider_heartbeat, source_providers=args.source_provider, operator_key=os.environ.get(args.operator_key_env),