fix(cd): retry alert chain api health smoke
This commit is contained in:
@@ -47,6 +47,78 @@ class AlertChainSmokeMetricTest(unittest.TestCase):
|
||||
self.assertTrue(result.passed)
|
||||
self.assertIn("非阻塞降級: ollama", result.message)
|
||||
|
||||
def test_api_health_retries_transient_connection_failure(self):
    """A first-attempt URLError is retried and the second, healthy reply passes.

    The fake transport fails once and then answers HTTP 200 with all core
    components up, so exactly two calls are expected, each using the
    patched per-attempt timeout.
    """
    module = alert_chain_smoke_test
    seen = []
    healthy_body = json.dumps(
        {
            "status": "healthy",
            "environment": "prod",
            "components": {
                "api": {"status": "up"},
                "postgresql": {"status": "up"},
                "redis": {"status": "up"},
            },
        }
    )

    def fake_get(url, *, params=None, timeout=None):
        self.assertTrue(url.endswith("/api/v1/health"))
        seen.append({"url": url, "timeout": timeout})
        if len(seen) == 1:
            # Only the very first probe fails; later probes succeed.
            raise module.URLError("timed out")
        return module.HttpGetResult(200, healthy_body)

    overrides = {
        "http_get": fake_get,
        "API_HEALTH_ATTEMPTS": 3,
        "API_HEALTH_TIMEOUT": 20,
        "API_HEALTH_RETRY_DELAY": 0,  # keep the test from sleeping
    }
    # Snapshot the real module attributes before patching any of them.
    saved = {name: getattr(module, name) for name in overrides}
    try:
        for name, value in overrides.items():
            setattr(module, name, value)
        result = module.check_api_health("http://api")
    finally:
        # Always restore, so a failure here cannot leak into other tests.
        for name, value in saved.items():
            setattr(module, name, value)

    self.assertTrue(result.passed)
    self.assertEqual(len(seen), 2)
    self.assertEqual({entry["timeout"] for entry in seen}, {20})
|
||||
|
||||
def test_api_health_reports_attempts_after_retry_exhaustion(self):
    """When every attempt times out, the failure message names attempts and timeout.

    The fake transport always raises TimeoutError; with two attempts
    configured, both calls must use the patched timeout and the result
    message must carry the probe summary.
    """
    module = alert_chain_smoke_test
    timeouts_seen = []

    def fake_get(url, *, params=None, timeout=None):
        self.assertTrue(url.endswith("/api/v1/health"))
        timeouts_seen.append(timeout)
        raise TimeoutError("timed out")

    overrides = {
        "http_get": fake_get,
        "API_HEALTH_ATTEMPTS": 2,
        "API_HEALTH_TIMEOUT": 7,
        "API_HEALTH_RETRY_DELAY": 0,  # keep the test from sleeping
    }
    # Snapshot the real module attributes before patching any of them.
    saved = {name: getattr(module, name) for name in overrides}
    try:
        for name, value in overrides.items():
            setattr(module, name, value)
        result = module.check_api_health("http://api")
    finally:
        # Always restore, so a failure here cannot leak into other tests.
        for name, value in saved.items():
            setattr(module, name, value)

    self.assertFalse(result.passed)
    self.assertEqual(timeouts_seen, [7, 7])
    self.assertIn("attempts=2", result.message)
    self.assertIn("timeout=7s", result.message)
|
||||
|
||||
def test_api_health_fails_when_core_component_is_down(self):
|
||||
def fake_get(url, *, params=None, timeout=None):
|
||||
self.assertTrue(url.endswith("/api/v1/health"))
|
||||
|
||||
@@ -1,3 +1,34 @@
|
||||
## 2026-06-01|Alert Chain post-deploy API Health 重試化
|
||||
|
||||
**背景**:
|
||||
|
||||
- W-1 自動修復 SLO 診斷與前端治理面板已部署,但 `post-deploy-checks` 曾在 `Alert Chain Smoke Test` 的 `API Health` 單次 public read timeout 被標紅。
|
||||
- 人工即時查正式 `/api/v1/health` 回 `HTTP 200` 且核心組件健康,因此這不是 production outage,而是部署後 smoke probe 對短暫 timeout 太敏感,會反過來製造假失敗告警。
|
||||
|
||||
**本次調整**:
|
||||
|
||||
- `scripts/alert_chain_smoke_test.py`:`API Health` probe 改為可設定 `ALERT_CHAIN_API_HEALTH_ATTEMPTS`、`ALERT_CHAIN_API_HEALTH_TIMEOUT`、`ALERT_CHAIN_API_HEALTH_RETRY_DELAY`,預設 `3` 次、單次 `20s`、間隔 `3s`。
|
||||
- 僅重試連線 / timeout / JSON decode 與短暫 `5xx`;`api` / `postgresql` / `redis` 核心組件實際 down 仍維持 critical failure。
|
||||
- smoke 訊息現在會標出 `attempts=目前/上限` 與 timeout,讓 Telegram / Gitea log 能看出是第幾次 probe 通過或失敗。
|
||||
- `apps/api/tests/test_alert_chain_smoke_metric.py`:新增 transient connection retry 與 retry exhaustion 單元測試。
|
||||
|
||||
**驗證**:
|
||||
|
||||
- `python3 -m py_compile scripts/alert_chain_smoke_test.py apps/api/tests/test_alert_chain_smoke_metric.py`
|
||||
- `/Users/ogt/.pyenv/shims/python -m unittest apps/api/tests/test_alert_chain_smoke_metric.py` → `15 tests OK`
|
||||
- `git diff --check`
|
||||
- `python3 scripts/security/security-mirror-progress-guard.py --root .` → `SECURITY_MIRROR_PROGRESS_GUARD_OK`
|
||||
- Production smoke:`ALERT_CHAIN_API_HEALTH_RETRY_DELAY=1 python3 scripts/alert_chain_smoke_test.py --api-url https://awoooi.wooo.work --json` → `PASSED — 8/8 checks passed in 6.3s`
|
||||
- `API Health`:核心組件 UP;非阻塞降級 `ollama_local`;`attempts=1/3, timeout=20s`
|
||||
- `Alert Chain Metric`:最後 `alertmanager` 告警成功約 3 分鐘前,evidence=`prometheus`
|
||||
- Alertmanager / SigNoz / Sentry webhook、SigNoz、OTEL Collector、Event Exporter 全部通過。
|
||||
|
||||
**目前整體進度(本階段完成後)**:
|
||||
|
||||
- CI/CD 與 post-deploy smoke 穩定性:約 `99.2%`;已補 build-deploy public curl 與 post-deploy Alert Chain API Health 兩處 timeout 誤報來源,下一步靠新一輪 Gitea CD 驗證。
|
||||
- W-1 自動修復 SLO 可解釋化:約 `90%`;API 診斷、Telegram 話術、治理前端面板都已到位,但 7 天 rolling window 還需等舊失敗自然滑出。
|
||||
- 完整 AI 自動化飛輪總進度:維持 `61%`;本階段清掉的是「告警可信度 / CI 誤報」技術債,尚未增加新的 verified auto-repair 成功樣本,所以不能把整體自動修復完成度上調。
|
||||
|
||||
## 2026-06-01|CD public health 單次 timeout 重試化
|
||||
|
||||
**背景**:
|
||||
|
||||
@@ -41,6 +41,20 @@ from urllib.request import Request, urlopen
|
||||
# =============================================================================
|
||||
# 配置
|
||||
# =============================================================================
|
||||
def _env_int(name: str, default: int) -> int:
    """Read an integer setting from the environment.

    Args:
        name: Environment variable to look up.
        default: Value used when the variable is unset.

    Returns:
        The parsed integer, or ``default`` when the text cannot be parsed
        by ``int`` (for example an empty or non-numeric value).
    """
    raw_value = os.environ.get(name, str(default))
    try:
        parsed = int(raw_value)
    except ValueError:
        parsed = default
    return parsed
|
||||
|
||||
|
||||
def _env_float(name: str, default: float) -> float:
    """Read a floating-point setting from the environment.

    Args:
        name: Environment variable to look up.
        default: Value used when the variable is unset.

    Returns:
        The parsed float, or ``default`` when the text cannot be parsed
        by ``float`` (for example an empty or non-numeric value).
    """
    raw_value = os.environ.get(name, str(default))
    try:
        parsed = float(raw_value)
    except ValueError:
        parsed = default
    return parsed
|
||||
|
||||
|
||||
DEFAULT_API_URL = "https://awoooi.wooo.work"
|
||||
SIGNOZ_URL = "http://192.168.0.188:3301"
|
||||
ALERTMANAGER_URL = "http://192.168.0.188:9093"
|
||||
@@ -50,6 +64,12 @@ PROMETHEUS_URL = "http://192.168.0.110:9090"
|
||||
MAX_ALERT_CHAIN_SILENCE_SECONDS = 2 * 60 * 60

TIMEOUT = 10  # seconds

# API Health probe tuning, overridable through environment variables.
# Every value is clamped to a usable floor so a misconfigured CI variable
# cannot crash the probe: time.sleep() raises ValueError on a negative
# delay, and a zero or negative timeout is never a meaningful probe budget.
API_HEALTH_TIMEOUT = max(
    1,
    _env_int("ALERT_CHAIN_API_HEALTH_TIMEOUT", 20),
)
API_HEALTH_ATTEMPTS = max(
    1,
    _env_int("ALERT_CHAIN_API_HEALTH_ATTEMPTS", 3),
)
API_HEALTH_RETRY_DELAY = max(
    0.0,
    _env_float("ALERT_CHAIN_API_HEALTH_RETRY_DELAY", 3.0),
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@@ -117,6 +137,13 @@ def _http_error_message(error: Exception) -> str:
|
||||
return str(error)
|
||||
|
||||
|
||||
def _api_health_probe_summary(attempt: int) -> str:
    """Format the probe attempt counter and per-attempt timeout for result messages.

    Args:
        attempt: 1-based number of the attempt being reported.

    Returns:
        Text of the form ``attempts=<attempt>/<max>, timeout=<seconds>s``,
        using the module-level API_HEALTH_ATTEMPTS and API_HEALTH_TIMEOUT.
    """
    attempt_part = f"attempts={attempt}/{API_HEALTH_ATTEMPTS}"
    timeout_part = f"timeout={API_HEALTH_TIMEOUT}s"
    return f"{attempt_part}, {timeout_part}"
|
||||
|
||||
|
||||
def _statuses_from_env(env_name: str) -> list[str] | None:
|
||||
"""Return preflight pod statuses supplied by CI, or None to use kubectl."""
|
||||
if env_name not in os.environ:
|
||||
@@ -216,12 +243,53 @@ class SmokeTestReport:
|
||||
# =============================================================================
|
||||
def check_api_health(api_url: str) -> CheckResult:
|
||||
"""Check 1: API Health — core runtime must be up; provider degradation is warning evidence."""
|
||||
try:
|
||||
resp = http_get(f"{api_url}/api/v1/health", timeout=TIMEOUT)
|
||||
data = resp.json()
|
||||
last_error = "unknown"
|
||||
resp: HttpGetResult | None = None
|
||||
data: dict[str, Any] | None = None
|
||||
used_attempt = 0
|
||||
|
||||
for attempt in range(1, API_HEALTH_ATTEMPTS + 1):
|
||||
used_attempt = attempt
|
||||
try:
|
||||
resp = http_get(
|
||||
f"{api_url}/api/v1/health",
|
||||
timeout=API_HEALTH_TIMEOUT,
|
||||
)
|
||||
data = resp.json()
|
||||
except (URLError, TimeoutError, OSError, json.JSONDecodeError) as e:
|
||||
last_error = _http_error_message(e)
|
||||
if attempt < API_HEALTH_ATTEMPTS:
|
||||
time.sleep(API_HEALTH_RETRY_DELAY)
|
||||
continue
|
||||
return CheckResult(
|
||||
"API Health",
|
||||
False,
|
||||
(
|
||||
f"無法連線: {last_error} "
|
||||
f"({_api_health_probe_summary(attempt)})"
|
||||
),
|
||||
)
|
||||
|
||||
if resp.status_code >= 500 and attempt < API_HEALTH_ATTEMPTS:
|
||||
last_error = f"HTTP {resp.status_code}"
|
||||
time.sleep(API_HEALTH_RETRY_DELAY)
|
||||
continue
|
||||
|
||||
break
|
||||
|
||||
if resp is None or data is None:
|
||||
return CheckResult("API Health", False, f"無法連線: {last_error}")
|
||||
|
||||
try:
|
||||
if resp.status_code >= 400:
|
||||
return CheckResult("API Health", False, f"HTTP {resp.status_code}")
|
||||
return CheckResult(
|
||||
"API Health",
|
||||
False,
|
||||
(
|
||||
f"HTTP {resp.status_code} "
|
||||
f"({_api_health_probe_summary(used_attempt)})"
|
||||
),
|
||||
)
|
||||
|
||||
components = data.get("components", {})
|
||||
core_components = ("api", "postgresql", "redis")
|
||||
@@ -244,16 +312,30 @@ def check_api_health(api_url: str) -> CheckResult:
|
||||
return CheckResult(
|
||||
"API Health",
|
||||
True,
|
||||
f"核心組件 UP;非阻塞降級: {', '.join(down_components)}",
|
||||
(
|
||||
f"核心組件 UP;非阻塞降級: {', '.join(down_components)} "
|
||||
f"({_api_health_probe_summary(used_attempt)})"
|
||||
),
|
||||
)
|
||||
|
||||
return CheckResult(
|
||||
"API Health",
|
||||
True,
|
||||
f"所有 {len(components)} 個組件 UP ({data.get('environment', 'unknown')})",
|
||||
(
|
||||
f"所有 {len(components)} 個組件 UP "
|
||||
f"({data.get('environment', 'unknown')}; "
|
||||
f"{_api_health_probe_summary(used_attempt)})"
|
||||
),
|
||||
)
|
||||
except (URLError, TimeoutError, OSError, json.JSONDecodeError) as e:
|
||||
return CheckResult("API Health", False, f"無法連線: {_http_error_message(e)}")
|
||||
return CheckResult(
|
||||
"API Health",
|
||||
False,
|
||||
(
|
||||
f"無法連線: {_http_error_message(e)} "
|
||||
f"({_api_health_probe_summary(used_attempt)})"
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def _escape_prometheus_label_value(value: str) -> str:
|
||||
|
||||
Reference in New Issue
Block a user