# File viewer metadata: 271 lines, 8.2 KiB, Python
from __future__ import annotations

import math
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

from src.jobs.ai_slo_watchdog_job import (
    _format_slo_violation_for_alert,
    _is_observation_only_slo_violation,
)
from src.services.ai_slo_calculator import (
    SLO_AUTO_SUCCESS_MIN,
    SloMetric,
    SloReport,
    build_auto_execute_success_diagnostics,
)


# Time zone attached to every fixture timestamp built in this module, so the
# tests never mix naive and aware datetimes.
TZ = ZoneInfo("Asia/Taipei")


def _row(
|
|
*,
|
|
created_at: datetime,
|
|
success: bool,
|
|
incident_id: str,
|
|
playbook_id: str = "PB-OK",
|
|
playbook_name: str = "成功修復 PlayBook",
|
|
alertname: str = "SyntheticAutoRepair",
|
|
error_message: str | None = None,
|
|
) -> dict:
|
|
return {
|
|
"created_at": created_at,
|
|
"success": success,
|
|
"incident_id": incident_id,
|
|
"playbook_id": playbook_id,
|
|
"playbook_name": playbook_name,
|
|
"alertname": alertname,
|
|
"error_message": error_message,
|
|
}
|
|
|
|
|
|
def test_auto_execute_diagnostics_marks_known_failures_as_sealed_and_projects_green():
    """Recognised failure signatures are sealed and a green projection is given.

    The fixture holds 45 successes and 9 failures (5 + 4) split across two
    failure signatures, i.e. 54 samples in total.
    """
    now = datetime(2026, 6, 1, 18, 0, tzinfo=TZ)

    # 45 successful executions, two days before `now`.
    rows = [
        _row(
            created_at=now - timedelta(days=2, minutes=i),
            success=True,
            incident_id=f"INC-SUCCESS-{i:02d}",
        )
        for i in range(45)
    ]
    # 5 failures of the Docker playbook, six days before `now`.
    rows.extend(
        _row(
            created_at=now - timedelta(days=6, minutes=i),
            success=False,
            incident_id=f"INC-DOCKER-{i}",
            playbook_id="PB-20260420-3F9C4C",
            playbook_name="DockerContainerUnhealthy 修復",
            alertname="DockerContainerUnhealthy",
            error_message="Unsupported scheme: ssh {host} docker inspect minio && docker restart minio",
        )
        for i in range(5)
    )
    # 4 failures of the K3s playbook, five days before `now`.
    rows.extend(
        _row(
            created_at=now - timedelta(days=5, minutes=i),
            success=False,
            incident_id=f"INC-STOCK-{i}",
            playbook_id="PB-20260416-79EB94",
            playbook_name="K3s 節點下線修復",
            alertname="StockWoooWorkDown",
            error_message='nodes "stock-platform" not found',
        )
        for i in range(4)
    )

    diagnostics = build_auto_execute_success_diagnostics(rows, now)

    assert diagnostics["status"] == "sealed_waiting_window"
    assert diagnostics["summary"]["total"] == 54
    assert diagnostics["summary"]["success"] == 45
    # Compare the ratio with a tolerance instead of exact float equality.
    assert math.isclose(diagnostics["summary"]["rate"], 45 / 54)
    assert diagnostics["sealed_failure_group_count"] == 2
    assert diagnostics["open_failure_group_count"] == 0
    # NOTE(review): 6 is consistent with a 0.85 threshold,
    # (45 + 6) / (54 + 6) == 0.85 -- confirm against SLO_AUTO_SUCCESS_MIN.
    assert diagnostics["immediate_successes_needed"] == 6
    assert diagnostics["projection_reason"] == "rolling_window_if_no_new_failures"
    # NOTE(review): presumably the moment enough of the oldest failures leave
    # the rolling window -- confirm the window length in the calculator.
    assert diagnostics["projected_green_at"].startswith("2026-06-02T17:57")
    statuses = {group["closure_status"] for group in diagnostics["top_failure_groups"]}
    assert "sealed_by_mcp_grant" in statuses
    assert "sealed_by_external_site_guard" in statuses


def test_auto_execute_diagnostics_keeps_unknown_failures_open():
    """An unrecognised failure signature stays open and asks for investigation."""
    now = datetime(2026, 6, 1, 18, 0, tzinfo=TZ)

    # Five failures sharing one playbook/alert pair, one per hour back from `now`.
    rows = [
        _row(
            created_at=now - timedelta(hours=i),
            success=False,
            incident_id=f"INC-UNKNOWN-{i}",
            playbook_id="PB-UNKNOWN",
            playbook_name="未知修復",
            alertname="UnknownRepair",
            error_message="timeout waiting for executor",
        )
        for i in range(5)
    ]

    diagnostics = build_auto_execute_success_diagnostics(rows, now)

    assert diagnostics["status"] == "needs_investigation"
    assert diagnostics["sealed_failure_group_count"] == 0
    assert diagnostics["open_failure_group_count"] == 1
    assert diagnostics["next_action"] == "investigate_open_failure_groups"


def test_slo_report_to_dict_includes_diagnostics():
    """``SloReport.to_dict()`` carries the per-metric diagnostics through."""
    report = SloReport(
        metrics=[
            SloMetric(
                name="auto_execute_success_rate",
                value=0.8,
                threshold=SLO_AUTO_SUCCESS_MIN,
                direction="above",
                sample_count=10,
                violated=True,
            )
        ],
        any_violated=True,
        diagnostics={"auto_execute_success_rate": {"status": "sealed_waiting_window"}},
    )

    assert report.to_dict()["diagnostics"]["auto_execute_success_rate"]["status"] == "sealed_waiting_window"


def test_watchdog_formats_auto_execute_diagnostics_for_meta_alert():
    """The alert text exposes the ratio, sealed-group count and projection.

    The formatter returns a ``(line, cause)`` pair. The line must contain the
    success ratio, the number of sealed groups and the projected green time;
    the cause must list the top failure groups and the no-restart guidance.
    """
    projected = datetime(2026, 6, 3, 23, 7, tzinfo=TZ).isoformat()
    report = SloReport(
        metrics=[
            SloMetric(
                name="auto_execute_success_rate",
                value=45 / 54,
                threshold=SLO_AUTO_SUCCESS_MIN,
                direction="above",
                sample_count=54,
                violated=True,
            )
        ],
        any_violated=True,
        diagnostics={
            "auto_execute_success_rate": {
                "summary": {
                    "total": 54,
                    "success": 45,
                    "failed": 9,
                    "rate": 45 / 54,
                    "threshold": SLO_AUTO_SUCCESS_MIN,
                },
                "sealed_failure_group_count": 2,
                "open_failure_group_count": 0,
                "immediate_successes_needed": 6,
                "projected_green_at": projected,
                "top_failure_groups": [
                    {
                        "alertname": "DockerContainerUnhealthy",
                        "playbook_id": "PB-20260420-3F9C4C",
                        "count": 5,
                        "closure_status": "sealed_by_mcp_grant",
                    }
                ],
            }
        },
    )

    line, cause = _format_slo_violation_for_alert(report, ["auto_execute_success_rate"])

    # 45 / 54 rendered as a percentage with one decimal place.
    assert "45/54=83.3%" in line
    assert "已封口群組 2" in line
    # The ISO timestamp in `projected` is expected in month/day hour:minute form.
    assert "06/03 23:07" in line
    assert cause is not None
    assert "Top failure groups" in cause
    assert "不需要重啟服務或改寫歷史資料" in cause


def test_watchdog_treats_sealed_auto_execute_slo_as_observation_only():
    """A sealed violation with no open failure groups is observation-only."""
    report = SloReport(
        metrics=[
            SloMetric(
                name="auto_execute_success_rate",
                value=0.83,
                threshold=SLO_AUTO_SUCCESS_MIN,
                direction="above",
                sample_count=53,
                violated=True,
            )
        ],
        any_violated=True,
        diagnostics={
            "auto_execute_success_rate": {
                "status": "sealed_waiting_window",
                "open_failure_group_count": 0,
            }
        },
    )

    assert _is_observation_only_slo_violation(
        report,
        ["auto_execute_success_rate"],
    )


def test_watchdog_keeps_auto_execute_slo_alert_when_open_groups_remain():
    """A violation that still has an open failure group is not observation-only."""
    report = SloReport(
        metrics=[
            SloMetric(
                name="auto_execute_success_rate",
                value=0.83,
                threshold=SLO_AUTO_SUCCESS_MIN,
                direction="above",
                sample_count=53,
                violated=True,
            )
        ],
        any_violated=True,
        diagnostics={
            "auto_execute_success_rate": {
                "status": "needs_investigation",
                "open_failure_group_count": 1,
            }
        },
    )

    assert not _is_observation_only_slo_violation(
        report,
        ["auto_execute_success_rate"],
    )


def test_watchdog_keeps_slo_alert_when_other_metric_violates():
    """A second violated metric keeps the alert from being observation-only.

    The auto-execute metric is sealed with no open groups, but
    ``human_override_rate`` is violated as well and is passed in the list of
    violated metric names.
    """
    report = SloReport(
        metrics=[
            SloMetric(
                name="auto_execute_success_rate",
                value=0.83,
                threshold=SLO_AUTO_SUCCESS_MIN,
                direction="above",
                sample_count=53,
                violated=True,
            ),
            SloMetric(
                name="human_override_rate",
                value=0.25,
                threshold=0.20,
                direction="below",
                sample_count=10,
                violated=True,
            ),
        ],
        any_violated=True,
        diagnostics={
            "auto_execute_success_rate": {
                "status": "sealed_waiting_window",
                "open_failure_group_count": 0,
            }
        },
    )

    assert not _is_observation_only_slo_violation(
        report,
        ["auto_execute_success_rate", "human_override_rate"],
    )