fix(cd): retry alert chain api health smoke
All checks were successful
CD Pipeline / tests (push) Successful in 1m21s
Code Review / ai-code-review (push) Successful in 14s
CD Pipeline / build-and-deploy (push) Successful in 3m40s
CD Pipeline / post-deploy-checks (push) Successful in 1m40s

This commit is contained in:
Your Name
2026-06-01 18:38:48 +08:00
parent cc92eb0294
commit 0746543b0a
3 changed files with 192 additions and 7 deletions

View File

@@ -47,6 +47,78 @@ class AlertChainSmokeMetricTest(unittest.TestCase):
self.assertTrue(result.passed)
self.assertIn("非阻塞降級: ollama", result.message)
def test_api_health_retries_transient_connection_failure(self):
    """A transient connection error on the first probe is retried and then passes.

    The fake ``http_get`` raises ``URLError`` on its first call and returns a
    healthy payload afterwards, so ``check_api_health`` must call it exactly
    twice and forward the configured timeout on every call.
    """
    # Imported locally so this test carries its own patching dependency.
    from unittest import mock

    calls = []
    healthy_body = json.dumps(
        {
            "status": "healthy",
            "environment": "prod",
            "components": {
                "api": {"status": "up"},
                "postgresql": {"status": "up"},
                "redis": {"status": "up"},
            },
        }
    )

    def fake_get(url, *, params=None, timeout=None):
        self.assertTrue(url.endswith("/api/v1/health"))
        calls.append({"url": url, "timeout": timeout})
        if len(calls) == 1:
            # Only the first attempt fails; later attempts see a healthy API.
            raise alert_chain_smoke_test.URLError("timed out")
        return alert_chain_smoke_test.HttpGetResult(200, healthy_body)

    # patch.multiple restores every attribute on exit, even when the call
    # under test raises, replacing the manual save/try/finally bookkeeping.
    with mock.patch.multiple(
        alert_chain_smoke_test,
        http_get=fake_get,
        API_HEALTH_ATTEMPTS=3,
        API_HEALTH_TIMEOUT=20,
        API_HEALTH_RETRY_DELAY=0,  # no real sleeping between attempts
    ):
        result = alert_chain_smoke_test.check_api_health("http://api")

    self.assertTrue(result.passed)
    self.assertEqual(len(calls), 2)
    self.assertEqual({call["timeout"] for call in calls}, {20})
def test_api_health_reports_attempts_after_retry_exhaustion(self):
    """When every probe times out, the failure message reports attempts and timeout.

    The fake ``http_get`` always raises ``TimeoutError``; with two attempts
    configured, ``check_api_health`` must call it twice with the configured
    timeout and return a failed result describing both settings.
    """
    # Imported locally so this test carries its own patching dependency.
    from unittest import mock

    calls = []

    def fake_get(url, *, params=None, timeout=None):
        self.assertTrue(url.endswith("/api/v1/health"))
        calls.append(timeout)
        raise TimeoutError("timed out")

    # patch.multiple restores every attribute on exit, even when the call
    # under test raises, replacing the manual save/try/finally bookkeeping.
    with mock.patch.multiple(
        alert_chain_smoke_test,
        http_get=fake_get,
        API_HEALTH_ATTEMPTS=2,
        API_HEALTH_TIMEOUT=7,
        API_HEALTH_RETRY_DELAY=0,  # no real sleeping between attempts
    ):
        result = alert_chain_smoke_test.check_api_health("http://api")

    self.assertFalse(result.passed)
    self.assertEqual(calls, [7, 7])
    self.assertIn("attempts=2", result.message)
    self.assertIn("timeout=7s", result.message)
def test_api_health_fails_when_core_component_is_down(self):
def fake_get(url, *, params=None, timeout=None):
self.assertTrue(url.endswith("/api/v1/health"))

View File

@@ -1,3 +1,34 @@
## 2026-06-01:Alert Chain post-deploy API Health 重試化
**背景**
- W-1 自動修復 SLO 診斷與前端治理面板已部署,但 `post-deploy-checks` 曾在 `Alert Chain Smoke Test` 的 `API Health` 單次 public read timeout 被標紅。
- 人工即時查正式 `/api/v1/health` 為 `HTTP 200` 且核心組件健康,因此這不是 production outage,而是部署後 smoke probe 對短暫 timeout 太敏感,會反過來製造假失敗告警。
**本次調整**
- `scripts/alert_chain_smoke_test.py`:`API Health` probe 改為可設定 `ALERT_CHAIN_API_HEALTH_ATTEMPTS`、`ALERT_CHAIN_API_HEALTH_TIMEOUT`、`ALERT_CHAIN_API_HEALTH_RETRY_DELAY`,預設 `3` 次、單次 `20s`、間隔 `3s`。
- 僅重試連線 / timeout / JSON decode 與短暫 `5xx`;`api` / `postgresql` / `redis` 核心組件實際 down 仍維持 critical failure。
- smoke 訊息現在會標出 `attempts=目前/上限` 與 timeout,讓 Telegram / Gitea log 能看出是第幾次 probe 通過或失敗。
- `apps/api/tests/test_alert_chain_smoke_metric.py`:新增 transient connection retry 與 retry exhaustion 單元測試。
**驗證**
- `python3 -m py_compile scripts/alert_chain_smoke_test.py apps/api/tests/test_alert_chain_smoke_metric.py`
- `/Users/ogt/.pyenv/shims/python -m unittest apps/api/tests/test_alert_chain_smoke_metric.py`:`15 tests OK`
- `git diff --check`
- `python3 scripts/security/security-mirror-progress-guard.py --root .`:`SECURITY_MIRROR_PROGRESS_GUARD_OK`
- Production smoke:`ALERT_CHAIN_API_HEALTH_RETRY_DELAY=1 python3 scripts/alert_chain_smoke_test.py --api-url https://awoooi.wooo.work --json`:`PASSED — 8/8 checks passed in 6.3s`
- `API Health`:核心組件 UP,非阻塞降級 `ollama_local`;`attempts=1/3, timeout=20s`
- `Alert Chain Metric`:最後 `alertmanager` 告警成功約 3 分鐘前,evidence=`prometheus`
- Alertmanager / SigNoz / Sentry webhook、SigNoz、OTEL Collector、Event Exporter 全部通過。
**目前整體進度(本階段完成後)**
- CI/CD 與 post-deploy smoke 穩定性:約 `99.2%`;已補 build-deploy public curl 與 post-deploy Alert Chain API Health 兩處 timeout 誤報來源,下一步靠新一輪 Gitea CD 驗證。
- W-1 自動修復 SLO 可解釋化:約 `90%`;API 診斷、Telegram 話術、治理前端面板都已到位,但 7 天 rolling window 還需等舊失敗自然滑出。
- 完整 AI 自動化飛輪總進度:維持 `61%`;本階段清掉的是「告警可信度 / CI 誤報」技術債,尚未增加新的 verified auto-repair 成功樣本,所以不能把整體自動修復完成度上調。
## 2026-06-01:CD public health 單次 timeout 重試化
**背景**

View File

@@ -41,6 +41,20 @@ from urllib.request import Request, urlopen
# =============================================================================
# 配置
# =============================================================================
def _env_int(name: str, default: int) -> int:
    """Return the integer value of environment variable ``name``.

    Falls back to ``default`` when the variable is unset or its text cannot
    be parsed by ``int()``.
    """
    raw_value = os.environ.get(name, str(default))
    try:
        parsed = int(raw_value)
    except ValueError:
        # Unparseable override: keep the built-in default instead of failing.
        parsed = default
    return parsed
def _env_float(name: str, default: float) -> float:
    """Return the float value of environment variable ``name``.

    Falls back to ``default`` when the variable is unset, when its text
    cannot be parsed by ``float()``, or when it parses to a non-finite value
    (``nan``, ``inf``, ``-inf``). ``float()`` accepts those spellings, but
    they are never usable as a delay or duration setting.
    """
    # Imported locally so the helper does not depend on the module's import block.
    import math

    try:
        value = float(os.environ.get(name, str(default)))
    except ValueError:
        return default
    # Reject nan/inf overrides rather than handing them to callers.
    return value if math.isfinite(value) else default
DEFAULT_API_URL = "https://awoooi.wooo.work"
SIGNOZ_URL = "http://192.168.0.188:3301"
ALERTMANAGER_URL = "http://192.168.0.188:9093"
@@ -50,6 +64,12 @@ PROMETHEUS_URL = "http://192.168.0.110:9090"
MAX_ALERT_CHAIN_SILENCE_SECONDS = 2 * 60 * 60
TIMEOUT = 10 # 秒
# Per-request timeout (seconds) for the API Health probe. Clamped to at least
# 1 because a zero or negative override is not a meaningful wait.
API_HEALTH_TIMEOUT = max(
    1,
    _env_int("ALERT_CHAIN_API_HEALTH_TIMEOUT", 20),
)
# Total number of probe attempts, first try included; always at least one.
API_HEALTH_ATTEMPTS = max(
    1,
    _env_int("ALERT_CHAIN_API_HEALTH_ATTEMPTS", 3),
)
# Pause (seconds) between attempts. Clamped to >= 0 because time.sleep()
# raises ValueError for negative values. 0.0 is passed first so that a NaN
# override also resolves to 0.0 (comparisons against NaN are always False).
API_HEALTH_RETRY_DELAY = max(
    0.0,
    _env_float("ALERT_CHAIN_API_HEALTH_RETRY_DELAY", 3.0),
)
@dataclass(frozen=True)
@@ -117,6 +137,13 @@ def _http_error_message(error: Exception) -> str:
return str(error)
def _api_health_probe_summary(attempt: int) -> str:
    """Describe the probe state as ``attempts=<attempt>/<max>, timeout=<n>s``.

    ``attempt`` is the attempt number being reported; the maximum and the
    timeout are read from ``API_HEALTH_ATTEMPTS`` and ``API_HEALTH_TIMEOUT``
    at call time.
    """
    attempts_part = "attempts={}/{}".format(attempt, API_HEALTH_ATTEMPTS)
    timeout_part = "timeout={}s".format(API_HEALTH_TIMEOUT)
    return ", ".join((attempts_part, timeout_part))
def _statuses_from_env(env_name: str) -> list[str] | None:
"""Return preflight pod statuses supplied by CI, or None to use kubectl."""
if env_name not in os.environ:
@@ -216,12 +243,53 @@ class SmokeTestReport:
# =============================================================================
def check_api_health(api_url: str) -> CheckResult:
"""Check 1: API Health — core runtime must be up; provider degradation is warning evidence."""
try:
resp = http_get(f"{api_url}/api/v1/health", timeout=TIMEOUT)
data = resp.json()
last_error = "unknown"
resp: HttpGetResult | None = None
data: dict[str, Any] | None = None
used_attempt = 0
for attempt in range(1, API_HEALTH_ATTEMPTS + 1):
used_attempt = attempt
try:
resp = http_get(
f"{api_url}/api/v1/health",
timeout=API_HEALTH_TIMEOUT,
)
data = resp.json()
except (URLError, TimeoutError, OSError, json.JSONDecodeError) as e:
last_error = _http_error_message(e)
if attempt < API_HEALTH_ATTEMPTS:
time.sleep(API_HEALTH_RETRY_DELAY)
continue
return CheckResult(
"API Health",
False,
(
f"無法連線: {last_error} "
f"({_api_health_probe_summary(attempt)})"
),
)
if resp.status_code >= 500 and attempt < API_HEALTH_ATTEMPTS:
last_error = f"HTTP {resp.status_code}"
time.sleep(API_HEALTH_RETRY_DELAY)
continue
break
if resp is None or data is None:
return CheckResult("API Health", False, f"無法連線: {last_error}")
try:
if resp.status_code >= 400:
return CheckResult("API Health", False, f"HTTP {resp.status_code}")
return CheckResult(
"API Health",
False,
(
f"HTTP {resp.status_code} "
f"({_api_health_probe_summary(used_attempt)})"
),
)
components = data.get("components", {})
core_components = ("api", "postgresql", "redis")
@@ -244,16 +312,30 @@ def check_api_health(api_url: str) -> CheckResult:
return CheckResult(
"API Health",
True,
f"核心組件 UP非阻塞降級: {', '.join(down_components)}",
(
f"核心組件 UP非阻塞降級: {', '.join(down_components)} "
f"({_api_health_probe_summary(used_attempt)})"
),
)
return CheckResult(
"API Health",
True,
f"所有 {len(components)} 個組件 UP ({data.get('environment', 'unknown')})",
(
f"所有 {len(components)} 個組件 UP "
f"({data.get('environment', 'unknown')}; "
f"{_api_health_probe_summary(used_attempt)})"
),
)
except (URLError, TimeoutError, OSError, json.JSONDecodeError) as e:
return CheckResult("API Health", False, f"無法連線: {_http_error_message(e)}")
return CheckResult(
"API Health",
False,
(
f"無法連線: {_http_error_message(e)} "
f"({_api_health_probe_summary(used_attempt)})"
),
)
def _escape_prometheus_label_value(value: str) -> str: