def test_api_health_retries_transient_connection_failure(self):
    """A first-probe URLError is retried and the second probe passes."""
    seen_requests = []
    healthy_payload = {
        "status": "healthy",
        "environment": "prod",
        "components": {
            "api": {"status": "up"},
            "postgresql": {"status": "up"},
            "redis": {"status": "up"},
        },
    }

    def fake_get(url, *, params=None, timeout=None):
        self.assertTrue(url.endswith("/api/v1/health"))
        seen_requests.append({"url": url, "timeout": timeout})
        # Only the very first probe fails; every later probe is healthy.
        if len(seen_requests) == 1:
            raise alert_chain_smoke_test.URLError("timed out")
        return alert_chain_smoke_test.HttpGetResult(
            200,
            json.dumps(healthy_payload),
        )

    # Module attributes swapped in for the duration of the probe.
    overrides = {
        "http_get": fake_get,
        "API_HEALTH_ATTEMPTS": 3,
        "API_HEALTH_TIMEOUT": 20,
        "API_HEALTH_RETRY_DELAY": 0,
    }
    saved = {
        name: getattr(alert_chain_smoke_test, name) for name in overrides
    }
    try:
        for name, value in overrides.items():
            setattr(alert_chain_smoke_test, name, value)
        result = alert_chain_smoke_test.check_api_health("http://api")
    finally:
        # Always restore the real values so other tests are unaffected.
        for name, value in saved.items():
            setattr(alert_chain_smoke_test, name, value)

    self.assertTrue(result.passed)
    self.assertEqual(len(seen_requests), 2)
    self.assertEqual(
        {request["timeout"] for request in seen_requests},
        {20},
    )
self.assertTrue(url.endswith("/api/v1/health")) + calls.append(timeout) + raise TimeoutError("timed out") + + original_get = alert_chain_smoke_test.http_get + original_attempts = alert_chain_smoke_test.API_HEALTH_ATTEMPTS + original_timeout = alert_chain_smoke_test.API_HEALTH_TIMEOUT + original_delay = alert_chain_smoke_test.API_HEALTH_RETRY_DELAY + try: + alert_chain_smoke_test.http_get = fake_get + alert_chain_smoke_test.API_HEALTH_ATTEMPTS = 2 + alert_chain_smoke_test.API_HEALTH_TIMEOUT = 7 + alert_chain_smoke_test.API_HEALTH_RETRY_DELAY = 0 + result = alert_chain_smoke_test.check_api_health("http://api") + finally: + alert_chain_smoke_test.http_get = original_get + alert_chain_smoke_test.API_HEALTH_ATTEMPTS = original_attempts + alert_chain_smoke_test.API_HEALTH_TIMEOUT = original_timeout + alert_chain_smoke_test.API_HEALTH_RETRY_DELAY = original_delay + + self.assertFalse(result.passed) + self.assertEqual(calls, [7, 7]) + self.assertIn("attempts=2", result.message) + self.assertIn("timeout=7s", result.message) + def test_api_health_fails_when_core_component_is_down(self): def fake_get(url, *, params=None, timeout=None): self.assertTrue(url.endswith("/api/v1/health")) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 90f07f03..0d21b60d 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,34 @@ +## 2026-06-01|Alert Chain post-deploy API Health 重試化 + +**背景**: + +- W-1 自動修復 SLO 診斷與前端治理面板已部署,但 `post-deploy-checks` 曾在 `Alert Chain Smoke Test` 的 `API Health` 單次 public read timeout 被標紅。 +- 人工即時查正式 `/api/v1/health` 回 `HTTP 200` 且核心組件健康,因此這不是 production outage,而是部署後 smoke probe 對短暫 timeout 太敏感,會反過來製造假失敗告警。 + +**本次調整**: + +- `scripts/alert_chain_smoke_test.py`:`API Health` probe 改為可設定 `ALERT_CHAIN_API_HEALTH_ATTEMPTS`、`ALERT_CHAIN_API_HEALTH_TIMEOUT`、`ALERT_CHAIN_API_HEALTH_RETRY_DELAY`,預設 `3` 次、單次 `20s`、間隔 `3s`。 +- 僅重試連線 / timeout / JSON decode 與短暫 `5xx`;`api` / `postgresql` / `redis` 核心組件實際 down 仍維持 critical failure。 +- smoke 訊息現在會標出 `attempts=目前/上限` 與 
def _env_float(name: str, default: float) -> float:
    """Read environment variable *name* as a finite float.

    Args:
        name: Environment variable to look up.
        default: Value returned when the variable cannot be used.

    Returns:
        The parsed value, or *default* when the variable is unset, is not
        accepted by ``float()``, or parses to NaN / +-infinity.
    """
    import math  # local import: only this helper needs it

    try:
        value = float(os.environ.get(name, str(default)))
    except ValueError:
        return default
    # float() happily parses "nan" / "inf" / "infinity". This helper feeds
    # the retry delay that is handed to time.sleep(), which raises on such
    # values, so treat them like any other invalid setting.
    if not math.isfinite(value):
        return default
    # NOTE(review): negative values are still returned unchanged; callers
    # that pass the result to time.sleep() should clamp it to >= 0.
    return value
+ ), + ) + + if resp.status_code >= 500 and attempt < API_HEALTH_ATTEMPTS: + last_error = f"HTTP {resp.status_code}" + time.sleep(API_HEALTH_RETRY_DELAY) + continue + + break + + if resp is None or data is None: + return CheckResult("API Health", False, f"無法連線: {last_error}") + + try: if resp.status_code >= 400: - return CheckResult("API Health", False, f"HTTP {resp.status_code}") + return CheckResult( + "API Health", + False, + ( + f"HTTP {resp.status_code} " + f"({_api_health_probe_summary(used_attempt)})" + ), + ) components = data.get("components", {}) core_components = ("api", "postgresql", "redis") @@ -244,16 +312,30 @@ def check_api_health(api_url: str) -> CheckResult: return CheckResult( "API Health", True, - f"核心組件 UP;非阻塞降級: {', '.join(down_components)}", + ( + f"核心組件 UP;非阻塞降級: {', '.join(down_components)} " + f"({_api_health_probe_summary(used_attempt)})" + ), ) return CheckResult( "API Health", True, - f"所有 {len(components)} 個組件 UP ({data.get('environment', 'unknown')})", + ( + f"所有 {len(components)} 個組件 UP " + f"({data.get('environment', 'unknown')}; " + f"{_api_health_probe_summary(used_attempt)})" + ), ) except (URLError, TimeoutError, OSError, json.JSONDecodeError) as e: - return CheckResult("API Health", False, f"無法連線: {_http_error_message(e)}") + return CheckResult( + "API Health", + False, + ( + f"無法連線: {_http_error_message(e)} " + f"({_api_health_probe_summary(used_attempt)})" + ), + ) def _escape_prometheus_label_value(value: str) -> str: