"""Tests for auto-execute success-rate SLO diagnostics and watchdog alert helpers."""

from __future__ import annotations

from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

from src.jobs.ai_slo_watchdog_job import (
    _format_slo_violation_for_alert,
    _is_observation_only_slo_violation,
)
from src.services.ai_slo_calculator import (
    SLO_AUTO_SUCCESS_MIN,
    SloMetric,
    SloReport,
    build_auto_execute_success_diagnostics,
)

TZ = ZoneInfo("Asia/Taipei")

# Metric key shared by every report and diagnostics payload in this module.
_AUTO_EXECUTE_METRIC = "auto_execute_success_rate"


def _row(
    *,
    created_at: datetime,
    success: bool,
    incident_id: str,
    playbook_id: str = "PB-OK",
    playbook_name: str = "成功修復 PlayBook",
    alertname: str = "SyntheticAutoRepair",
    error_message: str | None = None,
) -> dict:
    """Build one synthetic auto-execute result row as a plain dict.

    Only ``created_at``, ``success`` and ``incident_id`` are required; the
    remaining fields default to a successful synthetic playbook run.
    """
    return dict(
        created_at=created_at,
        success=success,
        incident_id=incident_id,
        playbook_id=playbook_id,
        playbook_name=playbook_name,
        alertname=alertname,
        error_message=error_message,
    )


def _violated_auto_execute_metric(*, value: float, sample_count: int) -> SloMetric:
    """Return a violated ``auto_execute_success_rate`` metric with the given value."""
    return SloMetric(
        name=_AUTO_EXECUTE_METRIC,
        value=value,
        threshold=SLO_AUTO_SUCCESS_MIN,
        direction="above",
        sample_count=sample_count,
        violated=True,
    )


def _violated_report(*metrics: SloMetric, auto_execute_diagnostics: dict) -> SloReport:
    """Wrap *metrics* in a violated report carrying auto-execute diagnostics."""
    return SloReport(
        metrics=list(metrics),
        any_violated=True,
        diagnostics={_AUTO_EXECUTE_METRIC: auto_execute_diagnostics},
    )


def test_auto_execute_diagnostics_marks_known_failures_as_sealed_and_projects_green():
    """Two recognised failure groups are sealed and a green projection is reported."""
    reference = datetime(2026, 6, 1, 18, 0, tzinfo=TZ)

    # 45 successes two days back, one per minute.
    successes = [
        _row(
            created_at=reference - timedelta(days=2, minutes=offset),
            success=True,
            incident_id=f"INC-SUCCESS-{offset:02d}",
        )
        for offset in range(45)
    ]
    # 5 Docker failures six days back.
    docker_failures = [
        _row(
            created_at=reference - timedelta(days=6, minutes=offset),
            success=False,
            incident_id=f"INC-DOCKER-{offset}",
            playbook_id="PB-20260420-3F9C4C",
            playbook_name="DockerContainerUnhealthy 修復",
            alertname="DockerContainerUnhealthy",
            error_message="Unsupported scheme: ssh {host} docker inspect minio && docker restart minio",
        )
        for offset in range(5)
    ]
    # 4 stock-platform failures five days back.
    stock_failures = [
        _row(
            created_at=reference - timedelta(days=5, minutes=offset),
            success=False,
            incident_id=f"INC-STOCK-{offset}",
            playbook_id="PB-20260416-79EB94",
            playbook_name="K3s 節點下線修復",
            alertname="StockWoooWorkDown",
            error_message='nodes "stock-platform" not found',
        )
        for offset in range(4)
    ]
    rows = [*successes, *docker_failures, *stock_failures]

    diagnostics = build_auto_execute_success_diagnostics(rows, reference)

    assert diagnostics["status"] == "sealed_waiting_window"

    summary = diagnostics["summary"]
    assert summary["total"] == 54
    assert summary["success"] == 45
    assert summary["rate"] == 45 / 54

    assert diagnostics["sealed_failure_group_count"] == 2
    assert diagnostics["open_failure_group_count"] == 0
    assert diagnostics["immediate_successes_needed"] == 6
    assert diagnostics["projection_reason"] == "rolling_window_if_no_new_failures"
    assert diagnostics["projected_green_at"].startswith("2026-06-02T17:57")

    closure_statuses = {
        failure_group["closure_status"]
        for failure_group in diagnostics["top_failure_groups"]
    }
    assert "sealed_by_mcp_grant" in closure_statuses
    assert "sealed_by_external_site_guard" in closure_statuses


def test_auto_execute_diagnostics_keeps_unknown_failures_open():
    """Failures from an unrecognised playbook stay open and need investigation."""
    reference = datetime(2026, 6, 1, 18, 0, tzinfo=TZ)
    unknown_failures = [
        _row(
            created_at=reference - timedelta(hours=offset),
            success=False,
            incident_id=f"INC-UNKNOWN-{offset}",
            playbook_id="PB-UNKNOWN",
            playbook_name="未知修復",
            alertname="UnknownRepair",
            error_message="timeout waiting for executor",
        )
        for offset in range(5)
    ]

    diagnostics = build_auto_execute_success_diagnostics(unknown_failures, reference)

    assert diagnostics["status"] == "needs_investigation"
    assert diagnostics["sealed_failure_group_count"] == 0
    assert diagnostics["open_failure_group_count"] == 1
    assert diagnostics["next_action"] == "investigate_open_failure_groups"


def test_slo_report_to_dict_includes_diagnostics():
    """``SloReport.to_dict`` exposes the diagnostics passed to the constructor."""
    report = _violated_report(
        _violated_auto_execute_metric(value=0.8, sample_count=10),
        auto_execute_diagnostics={"status": "sealed_waiting_window"},
    )

    serialized = report.to_dict()

    assert serialized["diagnostics"]["auto_execute_success_rate"]["status"] == "sealed_waiting_window"


def test_watchdog_formats_auto_execute_diagnostics_for_meta_alert():
    """The alert line and cause text include the diagnostics summary details."""
    projected = datetime(2026, 6, 3, 23, 7, tzinfo=TZ).isoformat()
    summary = {
        "total": 54,
        "success": 45,
        "failed": 9,
        "rate": 45 / 54,
        "threshold": SLO_AUTO_SUCCESS_MIN,
    }
    docker_group = {
        "alertname": "DockerContainerUnhealthy",
        "playbook_id": "PB-20260420-3F9C4C",
        "count": 5,
        "closure_status": "sealed_by_mcp_grant",
    }
    report = _violated_report(
        _violated_auto_execute_metric(value=45 / 54, sample_count=54),
        auto_execute_diagnostics={
            "summary": summary,
            "sealed_failure_group_count": 2,
            "open_failure_group_count": 0,
            "immediate_successes_needed": 6,
            "projected_green_at": projected,
            "top_failure_groups": [docker_group],
        },
    )

    line, cause = _format_slo_violation_for_alert(report, ["auto_execute_success_rate"])

    assert "45/54=83.3%" in line
    assert "已封口群組 2" in line
    assert "06/03 23:07" in line
    assert cause is not None
    assert "Top failure groups" in cause
    assert "不需要重啟服務或改寫歷史資料" in cause


def test_watchdog_treats_sealed_auto_execute_slo_as_observation_only():
    """A sealed violation with no open groups is classified as observation-only."""
    report = _violated_report(
        _violated_auto_execute_metric(value=0.83, sample_count=53),
        auto_execute_diagnostics={
            "status": "sealed_waiting_window",
            "open_failure_group_count": 0,
        },
    )

    violated_names = ["auto_execute_success_rate"]

    assert _is_observation_only_slo_violation(report, violated_names)


def test_watchdog_keeps_auto_execute_slo_alert_when_open_groups_remain():
    """A violation with an open failure group is not observation-only."""
    report = _violated_report(
        _violated_auto_execute_metric(value=0.83, sample_count=53),
        auto_execute_diagnostics={
            "status": "needs_investigation",
            "open_failure_group_count": 1,
        },
    )

    violated_names = ["auto_execute_success_rate"]

    assert not _is_observation_only_slo_violation(report, violated_names)


def test_watchdog_keeps_slo_alert_when_other_metric_violates():
    """A second violated metric prevents the observation-only classification."""
    human_override_metric = SloMetric(
        name="human_override_rate",
        value=0.25,
        threshold=0.20,
        direction="below",
        sample_count=10,
        violated=True,
    )
    report = _violated_report(
        _violated_auto_execute_metric(value=0.83, sample_count=53),
        human_override_metric,
        auto_execute_diagnostics={
            "status": "sealed_waiting_window",
            "open_failure_group_count": 0,
        },
    )

    violated_names = ["auto_execute_success_rate", "human_override_rate"]

    assert not _is_observation_only_slo_violation(report, violated_names)