# File-viewer header (not part of the program): 483 lines, 19 KiB, Python.
from __future__ import annotations

import importlib.util
import json
import sys
import time
import unittest
from pathlib import Path
from unittest import mock


# Load the smoke-test script directly from its file path and expose it as the
# module object ``alert_chain_smoke_test`` used by every test below.
SCRIPT_PATH = Path(__file__).resolve().parents[3] / "scripts" / "alert_chain_smoke_test.py"
SPEC = importlib.util.spec_from_file_location("alert_chain_smoke_test", SCRIPT_PATH)

# Validate the spec BEFORE it is used: module_from_spec(None) would otherwise
# fail with an opaque AttributeError, and a bare ``assert`` disappears under
# ``python -O``.
if SPEC is None or SPEC.loader is None:
    raise ImportError(f"cannot build an import spec for {SCRIPT_PATH}")

alert_chain_smoke_test = importlib.util.module_from_spec(SPEC)

# Process-wide switch; it stays on after the load.
# NOTE(review): presumably set so executing the script leaves no __pycache__
# next to it - confirm that the global scope is intended.
sys.dont_write_bytecode = True

# The module is registered under its spec name before its body is executed.
sys.modules[SPEC.name] = alert_chain_smoke_test
SPEC.loader.exec_module(alert_chain_smoke_test)


class AlertChainSmokeMetricTest(unittest.TestCase):
    """Tests for the helpers of the loaded ``alert_chain_smoke_test`` script.

    No test touches the network: ``http_get`` / ``http_post_json`` (and, for
    the retry tests, the ``API_HEALTH_*`` settings) are swapped on the script
    module with ``unittest.mock`` patchers, which put the original attributes
    back when the ``with`` block exits, including when the call under test
    raises.
    """

    @staticmethod
    def _json_response(status_code, payload):
        """Return an ``HttpGetResult`` whose body is ``payload`` dumped as JSON."""
        return alert_chain_smoke_test.HttpGetResult(status_code, json.dumps(payload))

    @staticmethod
    def _recording_post(calls, respond):
        """Build a stand-in for ``http_post_json``.

        Each invocation is appended to ``calls`` as a dict with the keys
        ``url``, ``payload``, ``headers`` and ``timeout``; the return value is
        whatever ``respond(url)`` produces.
        """

        def fake_post(url, payload, *, headers=None, timeout=None):
            calls.append(
                {
                    "url": url,
                    "payload": payload,
                    "headers": headers,
                    "timeout": timeout,
                }
            )
            return respond(url)

        return fake_post

    def test_api_health_passes_when_only_provider_is_degraded(self):
        # Only "ollama" is down; the check is expected to pass and to name it
        # in the message.
        health = {
            "status": "degraded",
            "environment": "prod",
            "components": {
                "api": {"status": "up"},
                "postgresql": {"status": "up"},
                "redis": {"status": "up"},
                "ollama": {"status": "down", "error": "timeout"},
                "signoz": {"status": "up"},
            },
        }

        def fake_get(url, *, params=None, timeout=None):
            self.assertTrue(url.endswith("/api/v1/health"))
            return self._json_response(200, health)

        with mock.patch.object(alert_chain_smoke_test, "http_get", fake_get):
            result = alert_chain_smoke_test.check_api_health("http://api")

        self.assertTrue(result.passed)
        self.assertIn("非阻塞降級: ollama", result.message)

    def test_api_health_retries_transient_connection_failure(self):
        # First request raises URLError, second succeeds: two calls in total,
        # both made with the configured timeout.
        calls = []
        health = {
            "status": "healthy",
            "environment": "prod",
            "components": {
                "api": {"status": "up"},
                "postgresql": {"status": "up"},
                "redis": {"status": "up"},
            },
        }

        def fake_get(url, *, params=None, timeout=None):
            self.assertTrue(url.endswith("/api/v1/health"))
            calls.append({"url": url, "timeout": timeout})
            if len(calls) == 1:
                raise alert_chain_smoke_test.URLError("timed out")
            return self._json_response(200, health)

        with mock.patch.multiple(
            alert_chain_smoke_test,
            http_get=fake_get,
            API_HEALTH_ATTEMPTS=3,
            API_HEALTH_TIMEOUT=20,
            API_HEALTH_RETRY_DELAY=0,
        ):
            result = alert_chain_smoke_test.check_api_health("http://api")

        self.assertTrue(result.passed)
        self.assertEqual(len(calls), 2)
        self.assertEqual({call["timeout"] for call in calls}, {20})

    def test_api_health_reports_attempts_after_retry_exhaustion(self):
        # Every request times out: the result fails and its message carries
        # the attempt count and the timeout that were configured.
        calls = []

        def fake_get(url, *, params=None, timeout=None):
            self.assertTrue(url.endswith("/api/v1/health"))
            calls.append(timeout)
            raise TimeoutError("timed out")

        with mock.patch.multiple(
            alert_chain_smoke_test,
            http_get=fake_get,
            API_HEALTH_ATTEMPTS=2,
            API_HEALTH_TIMEOUT=7,
            API_HEALTH_RETRY_DELAY=0,
        ):
            result = alert_chain_smoke_test.check_api_health("http://api")

        self.assertFalse(result.passed)
        self.assertEqual(calls, [7, 7])
        self.assertIn("attempts=2", result.message)
        self.assertIn("timeout=7s", result.message)

    def test_api_health_fails_when_core_component_is_down(self):
        # "postgresql" is down; the check is expected to fail and name it.
        health = {
            "status": "degraded",
            "components": {
                "api": {"status": "up"},
                "postgresql": {"status": "down"},
                "redis": {"status": "up"},
                "ollama": {"status": "up"},
            },
        }

        def fake_get(url, *, params=None, timeout=None):
            self.assertTrue(url.endswith("/api/v1/health"))
            return self._json_response(200, health)

        with mock.patch.object(alert_chain_smoke_test, "http_get", fake_get):
            result = alert_chain_smoke_test.check_api_health("http://api")

        self.assertFalse(result.passed)
        self.assertIn("核心組件異常: postgresql", result.message)

    def test_parse_app_alert_chain_metric_samples(self):
        # The HELP comment and the unrelated metric must not produce samples.
        exposition = "\n".join(
            [
                "# HELP awoooi_alert_chain_last_success_timestamp Last successful alert chain",
                'awoooi_alert_chain_last_success_timestamp{source="alertmanager"} 123.5',
                'awoooi_alert_chain_last_success_timestamp{source="sentry"} 120',
                "unrelated_metric 1",
            ]
        )

        samples = alert_chain_smoke_test.parse_app_alert_chain_metric_samples(exposition)

        self.assertEqual(
            samples,
            [
                alert_chain_smoke_test.AlertChainMetricSample(
                    source="alertmanager",
                    timestamp=123.5,
                    evidence_path="app_metrics",
                ),
                alert_chain_smoke_test.AlertChainMetricSample(
                    source="sentry",
                    timestamp=120.0,
                    evidence_path="app_metrics",
                ),
            ],
        )

    def test_newest_sample_for_source_prefers_requested_source(self):
        # The newest sample overall belongs to "sentry"; asking for
        # "alertmanager" must return the newest sample of that source.
        samples = [
            alert_chain_smoke_test.AlertChainMetricSample("sentry", 999.0, "prometheus"),
            alert_chain_smoke_test.AlertChainMetricSample("alertmanager", 100.0, "prometheus"),
            alert_chain_smoke_test.AlertChainMetricSample("alertmanager", 200.0, "app_metrics"),
        ]

        sample = alert_chain_smoke_test._newest_sample_for_source(samples, "alertmanager")

        self.assertEqual(sample.timestamp, 200.0)
        self.assertEqual(sample.evidence_path, "app_metrics")

    def test_alert_chain_metric_result_marks_recent_app_metric_as_scrape_delay(self):
        # A 60-second-old app_metrics sample used as fallback passes, with a
        # message saying Prometheus has not scraped it yet.
        sample = alert_chain_smoke_test.AlertChainMetricSample(
            source="alertmanager",
            timestamp=time.time() - 60,
            evidence_path="app_metrics",
        )

        result = alert_chain_smoke_test._alert_chain_metric_result(sample, fallback=True)

        self.assertTrue(result.passed)
        self.assertIn("Prometheus scrape 尚未看到", result.message)

    def test_alert_chain_metric_result_fails_persistent_silence(self):
        # A sample older than the allowed silence window is a critical failure.
        sample = alert_chain_smoke_test.AlertChainMetricSample(
            source="alertmanager",
            timestamp=time.time() - alert_chain_smoke_test.MAX_ALERT_CHAIN_SILENCE_SECONDS - 60,
            evidence_path="prometheus",
        )

        result = alert_chain_smoke_test._alert_chain_metric_result(sample)

        self.assertFalse(result.passed)
        self.assertTrue(result.critical)

    def test_alert_chain_metric_checks_app_metric_when_prometheus_is_stale(self):
        # Prometheus returns a stale timestamp while the app's /metrics
        # endpoint exposes a fresh one; the check is expected to pass on the
        # app_metrics evidence.
        fresh_ts = time.time() - 30
        stale_ts = time.time() - alert_chain_smoke_test.MAX_ALERT_CHAIN_SILENCE_SECONDS - 60

        def fake_get(url, *, params=None, timeout=None):
            if url.endswith("/api/v1/query"):
                return self._json_response(
                    200,
                    {
                        "data": {
                            "result": [
                                {
                                    "metric": {"source": "sentry"},
                                    "value": [time.time(), str(stale_ts)],
                                }
                            ]
                        }
                    },
                )
            if url.endswith("/metrics"):
                return alert_chain_smoke_test.HttpGetResult(
                    200,
                    'awoooi_alert_chain_last_success_timestamp{source="sentry"} '
                    f"{fresh_ts}",
                )
            raise AssertionError(f"unexpected url {url}")

        with mock.patch.object(alert_chain_smoke_test, "http_get", fake_get):
            result = alert_chain_smoke_test.check_alert_chain_metric(
                "http://prometheus",
                "http://api",
                source="sentry",
            )

        self.assertTrue(result.passed)
        self.assertIn("app_metrics", result.message)
        self.assertIn("Prometheus scrape 尚未看到", result.message)

    def test_source_provider_heartbeat_requires_operator_key(self):
        # Without an operator key the heartbeat is a critical failure whose
        # message names the AWOOOP_OPERATOR_API_KEY setting.
        result = alert_chain_smoke_test.send_source_provider_heartbeat(
            "https://awoooi.example",
            providers=["sentry", "signoz"],
            operator_key=None,
            operator_id="gitea-e2e-health",
        )

        self.assertFalse(result.passed)
        self.assertTrue(result.critical)
        self.assertIn("AWOOOP_OPERATOR_API_KEY", result.message)

    def test_source_provider_heartbeat_posts_expected_payload(self):
        # Checks the endpoint URL, the payload fields and the operator headers
        # of the single heartbeat POST.
        calls = []

        def respond(url):
            return alert_chain_smoke_test.HttpGetResult(
                200,
                (
                    '{"status":"recorded","items":['
                    '{"provider":"sentry"},{"provider":"signoz"}]}'
                ),
            )

        fake_post = self._recording_post(calls, respond)
        with mock.patch.object(alert_chain_smoke_test, "http_post_json", fake_post):
            result = alert_chain_smoke_test.send_source_provider_heartbeat(
                "https://awoooi.example",
                providers=["sentry", "signoz"],
                operator_key="secret",
                operator_id="gitea-e2e-health",
                run_ref="run-123",
            )

        self.assertTrue(result.passed)
        self.assertEqual(
            calls[0]["url"],
            "https://awoooi.example/api/v1/platform/events/dossier/provider-heartbeat",
        )
        self.assertEqual(calls[0]["payload"]["providers"], ["sentry", "signoz"])
        self.assertEqual(calls[0]["payload"]["run_ref"], "run-123")
        self.assertEqual(calls[0]["headers"]["X-AwoooP-Operator-Id"], "gitea-e2e-health")
        self.assertEqual(calls[0]["headers"]["X-AwoooP-Operator-Key"], "secret")

    def test_source_provider_upstream_canary_requires_operator_key(self):
        # Same operator-key requirement as the heartbeat, for the canary.
        result = alert_chain_smoke_test.send_source_provider_upstream_canary(
            "https://awoooi.example",
            providers=["sentry", "signoz"],
            operator_key=None,
            operator_id="gitea-e2e-health",
        )

        self.assertFalse(result.passed)
        self.assertTrue(result.critical)
        self.assertIn("AWOOOP_OPERATOR_API_KEY", result.message)

    def test_source_provider_upstream_canary_posts_provider_payloads(self):
        # One POST per provider, in the order given: sentry first, then signoz.
        calls = []

        def respond(url):
            if url.endswith("/api/v1/webhooks/sentry/error"):
                return alert_chain_smoke_test.HttpGetResult(
                    200,
                    '{"status":"canary_recorded","provider":"sentry"}',
                )
            if url.endswith("/api/v1/webhooks/signoz/alert"):
                return alert_chain_smoke_test.HttpGetResult(
                    200,
                    (
                        '{"status":"ok","results":['
                        '{"status":"canary_recorded","provider":"signoz"}]}'
                    ),
                )
            raise AssertionError(f"unexpected url {url}")

        fake_post = self._recording_post(calls, respond)
        with mock.patch.object(alert_chain_smoke_test, "http_post_json", fake_post):
            result = alert_chain_smoke_test.send_source_provider_upstream_canary(
                "https://awoooi.example",
                providers=["sentry", "signoz"],
                operator_key="secret",
                operator_id="gitea-e2e-health",
                run_ref="run/123",
            )

        self.assertTrue(result.passed)
        self.assertEqual(
            calls[0]["url"],
            "https://awoooi.example/api/v1/webhooks/sentry/error",
        )
        self.assertEqual(
            calls[1]["url"],
            "https://awoooi.example/api/v1/webhooks/signoz/alert",
        )
        self.assertEqual(calls[0]["payload"]["data"]["issue"]["title"], "AwoooPSourceProviderCanary")
        self.assertEqual(calls[1]["payload"]["alerts"][0]["labels"]["awoooi_canary"], "true")
        self.assertEqual(calls[0]["headers"]["X-AwoooP-Operator-Id"], "gitea-e2e-health")
        self.assertEqual(calls[1]["headers"]["X-AwoooP-Operator-Key"], "secret")

    def test_source_link_canary_requires_operator_key(self):
        # Same operator-key requirement, for the source-link canary.
        result = alert_chain_smoke_test.send_source_link_canary(
            "https://awoooi.example",
            target_incident_id="INC-20260505-25E744",
            operator_key=None,
            operator_id="gitea-e2e-health",
            run_ref="run-123",
        )

        self.assertFalse(result.passed)
        self.assertTrue(result.critical)
        self.assertIn("AWOOOP_OPERATOR_API_KEY", result.message)

    def test_source_link_canary_posts_dedicated_sentry_payload(self):
        # run_ref "run/123" is expected to appear as "run-123" inside the
        # issue id, and the target incident travels as an event tag.
        calls = []

        def respond(url):
            return alert_chain_smoke_test.HttpGetResult(
                200,
                '{"status":"canary_recorded","provider":"sentry"}',
            )

        fake_post = self._recording_post(calls, respond)
        with mock.patch.object(alert_chain_smoke_test, "http_post_json", fake_post):
            result = alert_chain_smoke_test.send_source_link_canary(
                "https://awoooi.example",
                target_incident_id="INC-20260505-25E744",
                operator_key="secret",
                operator_id="gitea-e2e-health",
                run_ref="run/123",
            )

        self.assertTrue(result.passed)
        self.assertEqual(
            calls[0]["url"],
            "https://awoooi.example/api/v1/webhooks/sentry/error",
        )
        issue = calls[0]["payload"]["data"]["issue"]
        tags = calls[0]["payload"]["data"]["event"]["tags"]
        self.assertEqual(issue["id"], "awoooi-source-link-canary-run-123")
        self.assertEqual(issue["title"], "AwoooPSourceLinkCanary")
        self.assertIn(["source_link_canary", "true"], tags)
        self.assertIn(["target_incident_id", "INC-20260505-25E744"], tags)
        self.assertEqual(calls[0]["headers"]["X-AwoooP-Operator-Key"], "secret")

    def test_source_link_canary_accepts_empty_2xx_for_downstream_readback(self):
        # An empty 204 response still passes; the message must state that the
        # source-correlation smoke has to verify readback.
        def fake_post(url, payload, *, headers=None, timeout=None):
            self.assertTrue(url.endswith("/api/v1/webhooks/sentry/error"))
            self.assertEqual(payload["data"]["issue"]["title"], "AwoooPSourceLinkCanary")
            return alert_chain_smoke_test.HttpGetResult(204, "")

        with mock.patch.object(alert_chain_smoke_test, "http_post_json", fake_post):
            result = alert_chain_smoke_test.send_source_link_canary(
                "https://awoooi.example",
                target_incident_id="INC-20260505-25E744",
                operator_key="secret",
                operator_id="gitea-e2e-health",
                run_ref="run/123",
            )

        self.assertTrue(result.passed)
        self.assertIn("source-correlation smoke must verify readback", result.message)

    def test_source_link_canary_reports_http_error_before_json_parse(self):
        # A 502 with an HTML body must surface the status code and the body
        # text in the failure message.
        def fake_post(url, payload, *, headers=None, timeout=None):
            self.assertTrue(url.endswith("/api/v1/webhooks/sentry/error"))
            return alert_chain_smoke_test.HttpGetResult(
                502,
                "<html><body>bad gateway</body></html>",
            )

        with mock.patch.object(alert_chain_smoke_test, "http_post_json", fake_post):
            result = alert_chain_smoke_test.send_source_link_canary(
                "https://awoooi.example",
                target_incident_id="INC-20260505-25E744",
                operator_key="secret",
                operator_id="gitea-e2e-health",
                run_ref="run/123",
            )

        self.assertFalse(result.passed)
        self.assertIn("sentry HTTP 502", result.message)
        self.assertIn("bad gateway", result.message)


if __name__ == "__main__":
    # Running this file directly hands control to unittest's command-line runner.
    unittest.main()