Files
awoooi/apps/api/tests/test_ai_slo_calculator.py
Your Name 9886df8785
All checks were successful
CD Pipeline / tests (push) Successful in 1m21s
Code Review / ai-code-review (push) Successful in 17s
CD Pipeline / build-and-deploy (push) Successful in 3m48s
CD Pipeline / post-deploy-checks (push) Successful in 2m6s
fix(ai): suppress sealed slo watchdog meta noise
2026-06-01 18:52:27 +08:00

271 lines
8.2 KiB
Python

from __future__ import annotations
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo
from src.jobs.ai_slo_watchdog_job import (
_format_slo_violation_for_alert,
_is_observation_only_slo_violation,
)
from src.services.ai_slo_calculator import (
SLO_AUTO_SUCCESS_MIN,
SloMetric,
SloReport,
build_auto_execute_success_diagnostics,
)
# Time zone attached to every fixed timestamp the tests in this module build,
# so all datetimes used here are timezone-aware.
TZ = ZoneInfo("Asia/Taipei")
def _row(
*,
created_at: datetime,
success: bool,
incident_id: str,
playbook_id: str = "PB-OK",
playbook_name: str = "成功修復 PlayBook",
alertname: str = "SyntheticAutoRepair",
error_message: str | None = None,
) -> dict:
return {
"created_at": created_at,
"success": success,
"incident_id": incident_id,
"playbook_id": playbook_id,
"playbook_name": playbook_name,
"alertname": alertname,
"error_message": error_message,
}
def test_auto_execute_diagnostics_marks_known_failures_as_sealed_and_projects_green():
    """Known failure signatures are reported as sealed, with a green projection.

    The fixture holds 45 successes and 9 failures split across two failure
    signatures. The test asserts that both groups are counted as sealed, that
    no group is left open, and that the diagnostics report how many immediate
    successes are needed and when the metric is projected to recover.
    """
    now = datetime(2026, 6, 1, 18, 0, tzinfo=TZ)

    # 45 successful executions, two days old, one minute apart.
    rows = [
        _row(
            created_at=now - timedelta(days=2, minutes=i),
            success=True,
            incident_id=f"INC-SUCCESS-{i:02d}",
        )
        for i in range(45)
    ]
    # 5 failures, six days old, sharing the DockerContainerUnhealthy signature.
    rows.extend(
        _row(
            created_at=now - timedelta(days=6, minutes=i),
            success=False,
            incident_id=f"INC-DOCKER-{i}",
            playbook_id="PB-20260420-3F9C4C",
            playbook_name="DockerContainerUnhealthy 修復",
            alertname="DockerContainerUnhealthy",
            error_message="Unsupported scheme: ssh {host} docker inspect minio && docker restart minio",
        )
        for i in range(5)
    )
    # 4 failures, five days old, sharing the StockWoooWorkDown signature.
    rows.extend(
        _row(
            created_at=now - timedelta(days=5, minutes=i),
            success=False,
            incident_id=f"INC-STOCK-{i}",
            playbook_id="PB-20260416-79EB94",
            playbook_name="K3s 節點下線修復",
            alertname="StockWoooWorkDown",
            error_message='nodes "stock-platform" not found',
        )
        for i in range(4)
    )

    diagnostics = build_auto_execute_success_diagnostics(rows, now)

    assert diagnostics["status"] == "sealed_waiting_window"

    # 45 + 5 + 4 rows in total, of which 45 succeeded.
    assert diagnostics["summary"]["total"] == 54
    assert diagnostics["summary"]["success"] == 45
    assert diagnostics["summary"]["rate"] == 45 / 54

    assert diagnostics["sealed_failure_group_count"] == 2
    assert diagnostics["open_failure_group_count"] == 0

    # NOTE(review): (45 + 6) / (54 + 6) == 0.85, so this value is consistent
    # with SLO_AUTO_SUCCESS_MIN being 0.85 -- confirm against the calculator.
    assert diagnostics["immediate_successes_needed"] == 6
    assert diagnostics["projection_reason"] == "rolling_window_if_no_new_failures"
    # NOTE(review): 17:57 on 06-02 is exactly 7 days after the second-oldest
    # failure (now - 6d 3m); presumably a 7-day rolling window -- confirm.
    assert diagnostics["projected_green_at"].startswith("2026-06-02T17:57")

    statuses = {group["closure_status"] for group in diagnostics["top_failure_groups"]}
    assert "sealed_by_mcp_grant" in statuses
    assert "sealed_by_external_site_guard" in statuses
def test_auto_execute_diagnostics_keeps_unknown_failures_open():
    """Failures with an unrecognised signature stay open for investigation.

    Five failures sharing one playbook/alert/error combination are fed in
    with no successes. The test asserts they form a single open group, that
    nothing is counted as sealed, and that the next action is to investigate.
    """
    now = datetime(2026, 6, 1, 18, 0, tzinfo=TZ)

    # Five failures spaced one hour apart, all with the same signature.
    rows = [
        _row(
            created_at=now - timedelta(hours=i),
            success=False,
            incident_id=f"INC-UNKNOWN-{i}",
            playbook_id="PB-UNKNOWN",
            playbook_name="未知修復",
            alertname="UnknownRepair",
            error_message="timeout waiting for executor",
        )
        for i in range(5)
    ]

    diagnostics = build_auto_execute_success_diagnostics(rows, now)

    assert diagnostics["status"] == "needs_investigation"
    assert diagnostics["sealed_failure_group_count"] == 0
    assert diagnostics["open_failure_group_count"] == 1
    assert diagnostics["next_action"] == "investigate_open_failure_groups"
def test_slo_report_to_dict_includes_diagnostics():
    """``SloReport.to_dict()`` exposes the diagnostics passed to the report."""
    report = SloReport(
        metrics=[
            SloMetric(
                name="auto_execute_success_rate",
                value=0.8,
                threshold=SLO_AUTO_SUCCESS_MIN,
                direction="above",
                sample_count=10,
                violated=True,
            )
        ],
        any_violated=True,
        diagnostics={"auto_execute_success_rate": {"status": "sealed_waiting_window"}},
    )

    # The nested status must be reachable through the serialised form.
    assert report.to_dict()["diagnostics"]["auto_execute_success_rate"]["status"] == "sealed_waiting_window"
def test_watchdog_formats_auto_execute_diagnostics_for_meta_alert():
    """The watchdog alert text carries the auto-execute diagnostics details.

    Builds a violated report with full diagnostics and asserts that the
    formatted summary line shows the success ratio, the sealed group count
    and the projected recovery time, and that a cause text is returned.
    """
    # ISO string form, matching how the diagnostics dict carries the value here.
    projected = datetime(2026, 6, 3, 23, 7, tzinfo=TZ).isoformat()
    report = SloReport(
        metrics=[
            SloMetric(
                name="auto_execute_success_rate",
                value=45 / 54,
                threshold=SLO_AUTO_SUCCESS_MIN,
                direction="above",
                sample_count=54,
                violated=True,
            )
        ],
        any_violated=True,
        diagnostics={
            "auto_execute_success_rate": {
                "summary": {
                    "total": 54,
                    "success": 45,
                    "failed": 9,
                    "rate": 45 / 54,
                    "threshold": SLO_AUTO_SUCCESS_MIN,
                },
                "sealed_failure_group_count": 2,
                "open_failure_group_count": 0,
                "immediate_successes_needed": 6,
                "projected_green_at": projected,
                "top_failure_groups": [
                    {
                        "alertname": "DockerContainerUnhealthy",
                        "playbook_id": "PB-20260420-3F9C4C",
                        "count": 5,
                        "closure_status": "sealed_by_mcp_grant",
                    }
                ],
            }
        },
    )

    line, cause = _format_slo_violation_for_alert(report, ["auto_execute_success_rate"])

    # Summary line: ratio, sealed group count, projection as month/day hour:minute.
    assert "45/54=83.3%" in line
    assert "已封口群組 2" in line
    assert "06/03 23:07" in line

    # Cause text: lists the failure groups and the "no restart needed" guidance.
    assert cause is not None
    assert "Top failure groups" in cause
    assert "不需要重啟服務或改寫歷史資料" in cause
def test_watchdog_treats_sealed_auto_execute_slo_as_observation_only():
    """A sealed auto-execute violation with no open groups is observation-only.

    The only violated metric is ``auto_execute_success_rate`` and its
    diagnostics report ``sealed_waiting_window`` with zero open groups.
    """
    report = SloReport(
        metrics=[
            SloMetric(
                name="auto_execute_success_rate",
                value=0.83,
                threshold=SLO_AUTO_SUCCESS_MIN,
                direction="above",
                sample_count=53,
                violated=True,
            )
        ],
        any_violated=True,
        diagnostics={
            "auto_execute_success_rate": {
                "status": "sealed_waiting_window",
                "open_failure_group_count": 0,
            }
        },
    )

    assert _is_observation_only_slo_violation(
        report,
        ["auto_execute_success_rate"],
    )
def test_watchdog_keeps_auto_execute_slo_alert_when_open_groups_remain():
    """An auto-execute violation with an open failure group is not observation-only.

    Same metric as the sealed case, but the diagnostics report
    ``needs_investigation`` with one open group.
    """
    report = SloReport(
        metrics=[
            SloMetric(
                name="auto_execute_success_rate",
                value=0.83,
                threshold=SLO_AUTO_SUCCESS_MIN,
                direction="above",
                sample_count=53,
                violated=True,
            )
        ],
        any_violated=True,
        diagnostics={
            "auto_execute_success_rate": {
                "status": "needs_investigation",
                "open_failure_group_count": 1,
            }
        },
    )

    assert not _is_observation_only_slo_violation(
        report,
        ["auto_execute_success_rate"],
    )
def test_watchdog_keeps_slo_alert_when_other_metric_violates():
    """A second violated metric prevents the observation-only classification.

    The auto-execute diagnostics are sealed with zero open groups, but
    ``human_override_rate`` is violated as well and is included in the list
    of violated metric names passed to the check.
    """
    report = SloReport(
        metrics=[
            SloMetric(
                name="auto_execute_success_rate",
                value=0.83,
                threshold=SLO_AUTO_SUCCESS_MIN,
                direction="above",
                sample_count=53,
                violated=True,
            ),
            # Second violation: 0.25 is over its 0.20 "below" threshold.
            SloMetric(
                name="human_override_rate",
                value=0.25,
                threshold=0.20,
                direction="below",
                sample_count=10,
                violated=True,
            ),
        ],
        any_violated=True,
        diagnostics={
            "auto_execute_success_rate": {
                "status": "sealed_waiting_window",
                "open_failure_group_count": 0,
            }
        },
    )

    assert not _is_observation_only_slo_violation(
        report,
        ["auto_execute_success_rate", "human_override_rate"],
    )