fix(api): expose AI SLO truth in weekly reports
Some checks failed
Code Review / ai-code-review (push) Successful in 15s
CD Pipeline / tests (push) Successful in 1m42s
CD Pipeline / build-and-deploy (push) Successful in 5m16s
CD Pipeline / post-deploy-checks (push) Has been cancelled

This commit is contained in:
Your Name
2026-06-27 15:42:20 +08:00
parent f3d218af9b
commit ef049b4b88
3 changed files with 172 additions and 17 deletions

View File

@@ -4212,6 +4212,14 @@ class WeeklyReportMessage:
k3s_source_ok: bool = True
git_source_ok: bool = True
cost_source_ok: bool = False
ai_slo_source_ok: bool = True
ai_slo_auto_execute_success_rate: float | None = None
ai_slo_auto_execute_sample_count: int = 0
ai_slo_auto_execute_threshold: float = 0.85
ai_slo_auto_execute_violated: bool = False
ai_slo_top_failure: str = ""
ai_slo_verifier_coverage_rate: float | None = None
ai_slo_unverified_auto_count: int = 0
all_zero_actionable_anomaly: bool = False
report_source_confidence_percent: int = 0
report_source_ok_count: int = 0
@@ -4230,7 +4238,9 @@ class WeeklyReportMessage:
self.k3s_source_ok,
self.git_source_ok,
self.cost_source_ok,
self.ai_slo_source_ok,
])
source_total_count = 5
all_zero = (
self.alert_total == 0
and self.ai_proposal_count == 0
@@ -4241,9 +4251,10 @@ class WeeklyReportMessage:
and self.disposition_total == 0
)
actionable_all_zero = self.all_zero_actionable_anomaly or all_zero
report_trust = "低可信" if actionable_all_zero or source_ok_count < 4 else "可參考"
report_trust = "低可信" if actionable_all_zero or source_ok_count < source_total_count else "可參考"
source_status = (
f"統計={'正常' if self.stats_source_ok else '失效'} / "
f"AI SLO={'正常' if self.ai_slo_source_ok else '失效'} / "
f"K3s={'正常' if self.k3s_source_ok else '失效'} / "
f"Git={'正常' if self.git_source_ok else '失效'} / "
f"成本={'正常' if self.cost_source_ok else '缺資料'}"
@@ -4251,6 +4262,8 @@ class WeeklyReportMessage:
source_gaps: list[str] = []
if not self.stats_source_ok:
source_gaps.append("告警 / AI 統計資料源失效:建立 report-source-gap:stats_api")
if not self.ai_slo_source_ok:
source_gaps.append("AI SLO 真相資料源失效:建立 report-source-gap:ai_slo")
if not self.k3s_source_ok:
source_gaps.append("K3s 指標資料源失效:建立 report-source-gap:k3s_metrics")
if not self.git_source_ok:
@@ -4282,6 +4295,54 @@ class WeeklyReportMessage:
f"{formatted_assets}\n"
)
def _code_metric(
value: int | float | str,
*,
source_ok: bool,
suffix: str = "",
precision: int | None = None,
) -> str:
if not source_ok:
return "<code>缺資料</code>"
if isinstance(value, float) and precision is not None:
rendered = f"{value:.{precision}f}"
elif isinstance(value, int):
rendered = f"{value:,}"
else:
rendered = str(value)
return f"<code>{html.escape(rendered)}</code>{suffix}"
ai_slo_block = ""
if (
not self.ai_slo_source_ok
or self.ai_slo_auto_execute_sample_count > 0
or self.ai_slo_top_failure
):
if self.ai_slo_source_ok and self.ai_slo_auto_execute_success_rate is not None:
slo_pct = self.ai_slo_auto_execute_success_rate * 100
threshold_pct = self.ai_slo_auto_execute_threshold * 100
slo_status = "違反" if self.ai_slo_auto_execute_violated else "合格"
verifier_text = (
f"{self.ai_slo_verifier_coverage_rate * 100:.1f}%"
if self.ai_slo_verifier_coverage_rate is not None
else "缺資料"
)
top_failure = self.ai_slo_top_failure or "目前無 top failure"
ai_slo_block = (
f"━━━━━━━━━━━━━━━━━━━\n"
f"🧠 <b>AI 自動化 SLO</b>\n"
f"├ 自動執行成功率: <code>{slo_pct:.1f}%</code> / 目標 <code>{threshold_pct:.0f}%</code>{slo_status}\n"
f"├ 樣本: <code>{self.ai_slo_auto_execute_sample_count}</code> | Verifier 覆蓋: <code>{verifier_text}</code>\n"
f"├ 未驗證自動執行: <code>{self.ai_slo_unverified_auto_count}</code>\n"
f"└ Top failure: <code>{html.escape(top_failure[:180])}</code>\n"
)
else:
ai_slo_block = (
f"━━━━━━━━━━━━━━━━━━━\n"
f"🧠 <b>AI 自動化 SLO</b>\n"
f"└ <code>資料源缺口</code>:無法判定 AI 自動化是否真的接管。\n"
)
message = (
f"═══════════════════════════\n"
f"📊 <b>AWOOOI 週報</b>\n"
@@ -4294,29 +4355,30 @@ class WeeklyReportMessage:
f"└ 全 0: <code>{'actionable_anomaly' if actionable_all_zero else 'no'}</code>\n"
f"━━━━━━━━━━━━━━━━━━━\n"
f"{alert_health} <b>告警統計</b>\n"
f"├ 總數: <code>{self.alert_total}</code>\n"
f"├ Critical: <code>{self.alert_critical}</code>\n"
f"├ 已解決: <code>{self.alert_resolved}</code>\n"
f"└ 解決率: <code>{self.resolved_rate:.1f}%</code>\n"
f"├ 總數: {_code_metric(self.alert_total, source_ok=self.stats_source_ok)}\n"
f"├ Critical: {_code_metric(self.alert_critical, source_ok=self.stats_source_ok)}\n"
f"├ 已解決: {_code_metric(self.alert_resolved, source_ok=self.stats_source_ok)}\n"
f"└ 解決率: {_code_metric(self.resolved_rate, source_ok=self.stats_source_ok, suffix='%', precision=1)}\n"
f"━━━━━━━━━━━━━━━━━━━\n"
f"{ai_health} <b>AI 效能</b>\n"
f"├ 提案數: <code>{self.ai_proposal_count}</code>\n"
f"├ 執行數: <code>{self.ai_executed_count}</code>\n"
f"├ 成功率: <code>{self.ai_success_rate:.1f}%</code>\n"
f"└ 平均回應: <code>{self.avg_response_minutes:.1f}</code> 分鐘\n"
f"├ 提案數: {_code_metric(self.ai_proposal_count, source_ok=self.stats_source_ok)}\n"
f"├ 執行數: {_code_metric(self.ai_executed_count, source_ok=self.stats_source_ok)}\n"
f"├ 成功率: {_code_metric(self.ai_success_rate, source_ok=self.stats_source_ok, suffix='%', precision=1)}\n"
f"└ 平均回應: {_code_metric(self.avg_response_minutes, source_ok=self.stats_source_ok, precision=1)} 分鐘\n"
f"{ai_slo_block}"
f"━━━━━━━━━━━━━━━━━━━\n"
f"{k3s_health} <b>K3s 健康</b>\n"
f"├ Uptime: <code>{self.k3s_uptime_pct:.2f}%</code>\n"
f"├ Pod 重啟: <code>{self.pod_restart_total}</code>\n"
f"└ HPA 擴縮: <code>{self.hpa_scale_events}</code>\n"
f"├ Uptime: {_code_metric(self.k3s_uptime_pct, source_ok=self.k3s_source_ok, suffix='%', precision=2)}\n"
f"├ Pod 重啟: {_code_metric(self.pod_restart_total, source_ok=self.k3s_source_ok)}\n"
f"└ HPA 擴縮: {_code_metric(self.hpa_scale_events, source_ok=self.k3s_source_ok)}\n"
f"━━━━━━━━━━━━━━━━━━━\n"
f"📦 <b>開發活動</b>\n"
f"├ Commits: <code>{self.commits_count}</code>\n"
f"└ 部署: <code>{self.deploy_count}</code>\n"
f"├ Commits: {_code_metric(self.commits_count, source_ok=self.git_source_ok)}\n"
f"└ 部署: {_code_metric(self.deploy_count, source_ok=self.git_source_ok)}\n"
f"━━━━━━━━━━━━━━━━━━━\n"
f"💰 <b>AI 成本</b>\n"
f"├ 費用: $<code>{self.ai_cost_week:.2f}</code>\n"
f"└ Tokens: <code>{self.ai_tokens_week:,}</code>\n"
f"├ 費用: ${_code_metric(self.ai_cost_week, source_ok=self.cost_source_ok, precision=2)}\n"
f"└ Tokens: {_code_metric(self.ai_tokens_week, source_ok=self.cost_source_ok)}\n"
f"━━━━━━━━━━━━━━━━━━━\n"
f"🧩 <b>資料缺口 / 下一步</b>\n"
f"{gap_lines}\n"
@@ -4338,7 +4400,7 @@ class WeeklyReportMessage:
f"└ 自動化率: <b>{auto_rate}%</b>"
)
return message[:2400]
return message[:3600]
@dataclass

View File

@@ -210,6 +210,49 @@ class WeeklyReportService:
except Exception as _disp_e:
logger.warning("weekly_report_disposition_failed", error=str(_disp_e))
ai_slo_source_ok = True
ai_slo_auto_execute_success_rate: float | None = None
ai_slo_auto_execute_sample_count = 0
ai_slo_auto_execute_threshold = 0.85
ai_slo_auto_execute_violated = False
ai_slo_top_failure = ""
ai_slo_verifier_coverage_rate: float | None = None
ai_slo_unverified_auto_count = 0
try:
from src.services.adr100_slo_status_service import (
get_adr100_slo_status_service,
)
from src.services.ai_slo_calculator import AiSloCalculator
slo_report = await AiSloCalculator(project_id="awoooi").calculate()
auto_metric = next(
(metric for metric in slo_report.metrics if metric.name == "auto_execute_success_rate"),
None,
)
if auto_metric is not None:
ai_slo_auto_execute_success_rate = auto_metric.value
ai_slo_auto_execute_sample_count = auto_metric.sample_count
ai_slo_auto_execute_threshold = auto_metric.threshold
ai_slo_auto_execute_violated = auto_metric.violated
diagnostics = slo_report.diagnostics.get("auto_execute_success_rate") or {}
top_failure = (diagnostics.get("top_failure_groups") or [{}])[0]
if top_failure:
ai_slo_top_failure = (
f"{top_failure.get('alertname') or 'unknown'} / "
f"{top_failure.get('playbook_id') or 'unknown'} ×"
f"{int(top_failure.get('count') or 0)}: "
f"{str(top_failure.get('error_signature') or '')[:90]}"
)
adr100_report = await get_adr100_slo_status_service("awoooi").fetch_report()
verification = adr100_report.get("verification_coverage") or {}
ai_slo_verifier_coverage_rate = verification.get("coverage_rate")
ai_slo_unverified_auto_count = int(verification.get("unverified_auto") or 0)
except Exception as _slo_e:
ai_slo_source_ok = False
logger.warning("weekly_report_ai_slo_failed", error=str(_slo_e))
report_source_confidence = 0
report_source_ok = 0
report_source_total = 0
@@ -267,6 +310,14 @@ class WeeklyReportService:
k3s_source_ok=k3s_source_ok,
git_source_ok=git_source_ok,
cost_source_ok=False,
ai_slo_source_ok=ai_slo_source_ok,
ai_slo_auto_execute_success_rate=ai_slo_auto_execute_success_rate,
ai_slo_auto_execute_sample_count=ai_slo_auto_execute_sample_count,
ai_slo_auto_execute_threshold=ai_slo_auto_execute_threshold,
ai_slo_auto_execute_violated=ai_slo_auto_execute_violated,
ai_slo_top_failure=ai_slo_top_failure,
ai_slo_verifier_coverage_rate=ai_slo_verifier_coverage_rate,
ai_slo_unverified_auto_count=ai_slo_unverified_auto_count,
all_zero_actionable_anomaly=(
total_incidents == 0
and ai_proposals == 0

View File

@@ -469,6 +469,9 @@ def test_weekly_report_marks_all_zero_as_low_trust_anomaly() -> None:
assert "Git=失效" in body
assert "成本=缺資料" in body
assert "全 0: <code>actionable_anomaly</code>" in body
assert "總數: <code>缺資料</code>" in body
assert "Commits: <code>缺資料</code>" in body
assert "Tokens: <code>缺資料</code>" in body
assert "資料缺口 / 下一步" in body
assert "全 0 不是健康" in body
assert "report-source-gap:stats_api" in body
@@ -500,6 +503,45 @@ def test_weekly_report_keeps_nonzero_source_status_visible() -> None:
assert "Tokens: <code>1,200</code>" in body
def test_weekly_report_includes_ai_slo_truth_when_available() -> None:
    """The formatted weekly report shows the AI SLO figures it was given."""
    # Every data source is flagged healthy for this scenario.
    healthy_sources = {
        "stats_source_ok": True,
        "k3s_source_ok": True,
        "git_source_ok": True,
        "cost_source_ok": True,
        "ai_slo_source_ok": True,
    }
    top_failure_text = (
        "DockerContainerMissingResourceLimit / ansible:188-ai-web ×5: "
        "role host-textfile-exporters not found"
    )

    rendered = WeeklyReportMessage(
        week_range="2026-W26",
        report_date="2026-06-27 15:40",
        alert_total=3,
        ai_proposal_count=4,
        ai_executed_count=2,
        ai_success_rate=50.0,
        commits_count=6,
        deploy_count=2,
        ai_tokens_week=1200,
        ai_slo_auto_execute_success_rate=0.5,
        ai_slo_auto_execute_sample_count=14,
        ai_slo_auto_execute_threshold=0.85,
        ai_slo_auto_execute_violated=True,
        ai_slo_top_failure=top_failure_text,
        ai_slo_verifier_coverage_rate=0.857,
        ai_slo_unverified_auto_count=2,
        **healthy_sources,
    ).format()

    # Each fragment must appear verbatim in the rendered message.
    expected_fragments = (
        "AI 自動化 SLO",
        "自動執行成功率: <code>50.0%</code>",
        "目標 <code>85%</code>",
        "樣本: <code>14</code>",
        "Verifier 覆蓋: <code>85.7%</code>",
        "未驗證自動執行: <code>2</code>",
        "DockerContainerMissingResourceLimit / ansible:188-ai-web",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
def test_weekly_report_includes_report_source_health_assets() -> None:
report = WeeklyReportMessage(
week_range="2026-W25",