diff --git a/apps/api/src/jobs/ai_slo_watchdog_job.py b/apps/api/src/jobs/ai_slo_watchdog_job.py index 6d4b39d9..29da11ac 100644 --- a/apps/api/src/jobs/ai_slo_watchdog_job.py +++ b/apps/api/src/jobs/ai_slo_watchdog_job.py @@ -118,11 +118,18 @@ async def _check_once() -> None: report = await AiSloCalculator().calculate() if report.any_violated: violated = [m.name for m in report.metrics if m.violated] - w1_line, w1_cause = _format_slo_violation_for_alert(report, violated) - violations.append(w1_line) - if w1_cause: - probable_causes.append(w1_cause) - violation_codes.append(f"W1:slo_violated:{','.join(sorted(violated))}") + if _is_observation_only_slo_violation(report, violated): + logger.info( + "watchdog_w1_slo_observation_only", + violated=violated, + reason="sealed_waiting_rolling_window", + ) + else: + w1_line, w1_cause = _format_slo_violation_for_alert(report, violated) + violations.append(w1_line) + if w1_cause: + probable_causes.append(w1_cause) + violation_codes.append(f"W1:slo_violated:{','.join(sorted(violated))}") except Exception as e: logger.warning("watchdog_w1_slo_check_failed", error=str(e)) @@ -354,6 +361,23 @@ def _format_slo_violation_for_alert(report, violated: list[str]) -> tuple[str, s return line, "\n".join(cause_parts) +def _is_observation_only_slo_violation(report, violated: list[str]) -> bool: + """已封口且只等 rolling window 的 W-1,不再升成 Meta System 告警。""" + if set(violated) != {"auto_execute_success_rate"}: + return False + + diagnostics = getattr(report, "diagnostics", {}) or {} + diag = diagnostics.get("auto_execute_success_rate") or {} + try: + open_groups = int(diag.get("open_failure_group_count") or 0) + except (TypeError, ValueError): + open_groups = 0 + return ( + diag.get("status") == "sealed_waiting_window" + and open_groups == 0 + ) + + def _short_taipei_time(value: str | None) -> str | None: if not value: return None diff --git a/apps/api/tests/test_ai_slo_calculator.py b/apps/api/tests/test_ai_slo_calculator.py index 
f62cf83b..b6f4515d 100644 --- a/apps/api/tests/test_ai_slo_calculator.py +++ b/apps/api/tests/test_ai_slo_calculator.py @@ -3,7 +3,10 @@ from __future__ import annotations from datetime import datetime, timedelta from zoneinfo import ZoneInfo -from src.jobs.ai_slo_watchdog_job import _format_slo_violation_for_alert +from src.jobs.ai_slo_watchdog_job import ( + _format_slo_violation_for_alert, + _is_observation_only_slo_violation, +) from src.services.ai_slo_calculator import ( SLO_AUTO_SUCCESS_MIN, SloMetric, @@ -176,3 +179,92 @@ def test_watchdog_formats_auto_execute_diagnostics_for_meta_alert(): assert cause is not None assert "Top failure groups" in cause assert "不需要重啟服務或改寫歷史資料" in cause + + +def test_watchdog_treats_sealed_auto_execute_slo_as_observation_only(): + report = SloReport( + metrics=[ + SloMetric( + name="auto_execute_success_rate", + value=0.83, + threshold=SLO_AUTO_SUCCESS_MIN, + direction="above", + sample_count=53, + violated=True, + ) + ], + any_violated=True, + diagnostics={ + "auto_execute_success_rate": { + "status": "sealed_waiting_window", + "open_failure_group_count": 0, + } + }, + ) + + assert _is_observation_only_slo_violation( + report, + ["auto_execute_success_rate"], + ) + + +def test_watchdog_keeps_auto_execute_slo_alert_when_open_groups_remain(): + report = SloReport( + metrics=[ + SloMetric( + name="auto_execute_success_rate", + value=0.83, + threshold=SLO_AUTO_SUCCESS_MIN, + direction="above", + sample_count=53, + violated=True, + ) + ], + any_violated=True, + diagnostics={ + "auto_execute_success_rate": { + "status": "needs_investigation", + "open_failure_group_count": 1, + } + }, + ) + + assert not _is_observation_only_slo_violation( + report, + ["auto_execute_success_rate"], + ) + + +def test_watchdog_keeps_slo_alert_when_other_metric_violates(): + report = SloReport( + metrics=[ + SloMetric( + name="auto_execute_success_rate", + value=0.83, + threshold=SLO_AUTO_SUCCESS_MIN, + direction="above", + sample_count=53, + 
violated=True, + ), + SloMetric( + name="human_override_rate", + value=0.25, + threshold=0.20, + direction="below", + sample_count=10, + violated=True, + ), + ], + any_violated=True, + diagnostics={ + "auto_execute_success_rate": { + "status": "sealed_waiting_window", + "open_failure_group_count": 0, + } + }, + ) + + assert not _is_observation_only_slo_violation( + report, + ["auto_execute_success_rate", "human_override_rate"], + ) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index b1ca5a6c..b99894e2 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,38 @@ +## 2026-06-01|W-1 已封口 SLO 改為觀察中,停止 Meta 誤擾 + +**背景**: + +- 正式 `/api/v1/ai/slo?force_refresh=true` 仍顯示 `auto_execute_success_rate=44/53=83.0%`,低於 `85%` 門檻。 +- 但 diagnostics 已確認 9 筆失敗全屬兩個已封口群組:`DockerContainerUnhealthy` 已改走 `ssh_docker_restart/write` MCP grant,`StockWoooWorkDown` 已阻擋外部站台誤配 K3s node PlayBook。 +- `open_failure_group_count=0`,狀態是 `sealed_waiting_window`;這代表目前是 7 日 rolling window 歷史包袱,不是新故障。若仍送 `META SYSTEM | 飛輪核心異常`,會造成 operator 誤判與 Telegram 噪音。 + +**本次調整**: + +- `apps/api/src/jobs/ai_slo_watchdog_job.py`: + - 新增 `_is_observation_only_slo_violation()`。 + - 當且僅當違反項目只有 `auto_execute_success_rate`,且 diagnostics 為 `sealed_waiting_window`、`open_failure_group_count=0` 時,W-1 降成 observation-only,不再升成 Meta System 告警。 + - 若存在未封口群組、diagnostics 缺漏或 `status` 不是 `sealed_waiting_window`,或其他 SLO 一起違反,仍維持原本 Meta System 告警行為;但 `open_failure_group_count` 缺漏或無法解析時會被視為 `0`,此時只要 `status` 為 `sealed_waiting_window` 仍會降為 observation-only。 +- `apps/api/tests/test_ai_slo_calculator.py`: + - 新增已封口 observation-only 測試。 + - 新增 open group / other metric 仍告警的防回歸測試。 + +**驗證**: + +- Production SLO 查核:`any_violated=true`,`auto_execute_success_rate=83.0%`,`sealed_failure_group_count=2`,`open_failure_group_count=0`,`projected_green_at=2026-06-03T15:07:04.658109+00:00`。 +- Production health:Public `/api/v1/health` 回 `HTTP 200`,約 `0.19s`;`ollama_route_order=["ollama_gcp_a","ollama_gcp_b","ollama_local"]`,GCP-A / GCP-B up。 +- Local validation: + - `python3 -m py_compile apps/api/src/jobs/ai_slo_watchdog_job.py apps/api/src/services/ai_slo_calculator.py 
apps/api/tests/test_ai_slo_calculator.py` + - `DATABASE_URL=sqlite+aiosqlite:///:memory: PYTHONPATH=apps/api /Users/ogt/.pyenv/shims/pytest apps/api/tests/test_ai_slo_calculator.py apps/api/tests/test_trust_drift_watchdog.py -q` → `19 passed` + - `DATABASE_URL=sqlite+aiosqlite:///:memory: PYTHONPATH=apps/api /Users/ogt/.pyenv/shims/pytest apps/api/tests/test_ai_slo_calculator.py apps/api/tests/test_trust_drift_watchdog.py apps/api/tests/test_auto_repair_service.py apps/api/tests/test_alert_rule_engine_validation.py -q` → `84 passed` + - `python3 scripts/security/security-mirror-progress-guard.py --root .` → `SECURITY_MIRROR_PROGRESS_GUARD_OK` + - `git diff --check` + +**目前整體進度(本階段完成後)**: + +- W-1 Meta 告警降噪:約 `96%`;已封口 rolling-window 狀態不再升級為飛輪核心異常,剩餘需 production watchdog 下一輪實跑確認沒有再推同型 META。 +- W-1 自動修復 SLO 可解釋化:約 `92%`;可查、可視、可判斷人工/觀察邊界,但 SLO 數值仍需等歷史失敗滑出或新增真實成功樣本才回綠。 +- 完整 AI 自動化飛輪總進度:維持 `61%`;這次是降低錯誤告警與 operator 誤判,不等於新增 verified auto-repair 能力。 + ## 2026-06-01|Alert Chain post-deploy API Health 重試化 **背景**: