fix(telegram): clarify automation notification state
All checks were successful
Code Review / ai-code-review (push) Successful in 11s
CD Pipeline / tests (push) Successful in 1m16s
CD Pipeline / build-and-deploy (push) Successful in 3m39s
CD Pipeline / post-deploy-checks (push) Successful in 1m18s

This commit is contained in:
Your Name
2026-05-06 20:59:58 +08:00
parent b2f0db0717
commit ea5ad040da
7 changed files with 643 additions and 5 deletions

View File

@@ -23,6 +23,7 @@ SOUL.md 鐵律 (4.1 Telegram 訊息壓縮原則):
"""
import asyncio
import hashlib
import html
import os
import re
@@ -50,6 +51,8 @@ SNOOZE_KEY_PREFIX = "telegram_snooze:" # {approval_id} -> 稍後提醒
SILENCE_KEY_PREFIX = "telegram_silence:" # {resource_name} -> silence marker
SNOOZE_TTL_SECONDS = 30 * 60 # 30 minutes
SILENCE_TTL_SECONDS = 60 * 60 # 1 hour
INCIDENT_UPDATE_DEDUP_PREFIX = "awoooi:tg_update_dedup:" # key suffix: {incident_id}:{status_hash}
INCIDENT_UPDATE_DEDUP_TTL_SECONDS = 5 * 60 # 5 minutes: an identical status is not re-posted within this window
# 2026-04-01 Claude Code: Long Polling 分散式 Leader Election
# 防止多 Pod 同時 getUpdates → 409 Conflict 互搶問題
@@ -261,6 +264,28 @@ class TelegramMessage:
return "analysis_degraded"
return "safe_gate_pending"
def _automation_status_summary(self) -> str:
    """Return the human-readable handling status shown on the card's first screen.

    This line is the on-call triage entry point: it tells the reader up front
    whether the card is an AI suggestion waiting for approval, a case the AI
    cannot fix that needs a human, or observation only. The details are left
    to the automation-chain block rendered further down.

    Returns:
        One fixed status label (emoji + text). Checks run in priority order:
        LLM timeout, no safe action / invalid target, non-positive confidence,
        then the remaining automation modes.
    """
    mode = self._automation_mode()
    normalized_action = (self.suggested_action or "").upper()
    haystack = f"{self.root_cause} {self.suggested_action}".lower()

    # A timed-out analysis outranks every other signal.
    if mode == "llm_timeout_manual_gate":
        return "🔴 AI 分析超時,需人工排查"

    # Nothing executable was proposed, or the proposal names an invalid target.
    lacks_safe_action = normalized_action in {"NO_ACTION", "待分析", ""}
    if lacks_safe_action or "invalid_target" in haystack:
        return "🟠 AI 無可安全執行動作,需人工判斷"

    # No positive confidence score: labelled as a rule-based suggestion.
    if self.confidence <= 0:
        return "🟡 規則建議待審批"

    labels_by_mode = {
        "analysis_degraded": "🟠 AI 降級分析,需人工判斷",
        "ai_proposal_ready": "🟡 AI 已提出修復建議,等待人工批准",
    }
    # Any other mode falls back to the generic safe-gate label.
    return labels_by_mode.get(mode, "🟡 安全閘門待審批")
def _format_automation_block(self) -> str:
"""Visible AI automation chain for every ACTION REQUIRED card.
2026-05-04 ogt: 加入 Token 用量 + 具體 Ollama 伺服器顯示
@@ -344,6 +369,7 @@ class TelegramMessage:
safe_root_cause = html.escape(self.root_cause)
safe_action = html.escape(self.suggested_action)
safe_downtime = html.escape(self.estimated_downtime)
safe_automation_summary = html.escape(self._automation_status_summary())
# 2026-03-29 ogt: AI Token/Cost 顯示
ai_cost_display = ""
@@ -441,6 +467,7 @@ class TelegramMessage:
f"📋 <code>{html.escape(incident_id)}</code>\n"
f"🎯 資源:<code>{safe_resource}</code>\n"
f"{category_line}"
f"🧭 處置狀態:<b>{safe_automation_summary}</b>\n"
f"\n"
f"{automation_block}"
f"\n"
@@ -4462,8 +4489,6 @@ class TelegramGateway:
2026-04-09 Claude Sonnet 4.6 Asia/Taipei (統帥要求: 狀態變更在原訊息延續)
"""
from src.core.redis_client import get_redis
redis = get_redis()
redis_key = f"tg_msg:{incident_id}"
stored = await redis.get(redis_key)
@@ -4481,6 +4506,31 @@ class TelegramGateway:
logger.warning("append_incident_update_invalid_message_id", stored=stored)
return False
# Telegram 只適合放決策摘要;同一 incident 的相同狀態 5 分鐘內不重複回覆,
# 詳細執行紀錄應進 timeline / AwoooP Run Monitor避免群組被 auto-failure 洗版。
status_hash = hashlib.sha1(status_line.encode("utf-8")).hexdigest()[:16]
dedup_key = f"{INCIDENT_UPDATE_DEDUP_PREFIX}{incident_id}:{status_hash}"
try:
was_set = await redis.set(
dedup_key,
"1",
ex=INCIDENT_UPDATE_DEDUP_TTL_SECONDS,
nx=True,
)
if not was_set:
logger.info(
"append_incident_update_dedup_suppressed",
incident_id=incident_id,
dedup_key=dedup_key,
)
return True
except Exception as exc:
logger.warning(
"append_incident_update_dedup_failed",
incident_id=incident_id,
error=str(exc),
)
# Step 1: 取得原始訊息文字Telegram Bot API 不提供讀取原文,只能在 editMessageText 裡重建)
# 策略: 只追加 status_line不讀取原文Telegram edit 要傳完整新文字)
# 所以先用 editMessageReplyMarkup 換按鈕,再 sendMessage 同 chat 以 reply 方式追加狀態

View File

@@ -7,6 +7,7 @@ test_telegram_message_templates.py - Telegram 訊息模板測試
import pytest
import src.services.telegram_gateway as telegram_gateway_module
from src.services.telegram_gateway import (
DailySummaryMessage,
DeploySuccessMessage,
@@ -15,6 +16,7 @@ from src.services.telegram_gateway import (
ResourceWarnMessage,
SentryErrorMessage,
TelegramMessage,
TelegramGateway,
)
@@ -38,12 +40,50 @@ class TestTelegramMessageFormat:
assert "🚨" in result
assert "嚴重" in result
assert "test-pod-123" in result
assert "處置狀態" in result
assert "規則建議待審批" in result
assert "AI 自動化鏈路" in result
assert "OpenClaw" in result
assert "NemoTron" in result
assert "ElephantAlpha" in result
assert len(result) <= 4096 # Telegram HTML message limit
def test_telegram_message_ai_proposal_marks_approval_wait(self):
    """A repair suggestion that carries an AI confidence score must be labelled as awaiting approval."""
    card_fields = {
        "status_emoji": "⚠️",
        "risk_level": "MEDIUM",
        "resource_name": "awoooi-api",
        "root_cause": "CPU sustained high",
        "suggested_action": "kubectl rollout restart deployment/awoooi-api",
        "estimated_downtime": "~30s",
        "approval_id": "INC-20260506-0000",
        "confidence": 0.82,
        "ai_provider": "ollama_gcp_a",
    }
    rendered = TelegramMessage(**card_fields).format()

    # Both the status row label and the AI-proposal wording must be present.
    for expected in ("處置狀態", "AI 已提出修復建議,等待人工批准"):
        assert expected in rendered
def test_telegram_message_no_action_marks_manual_judgement(self):
    """A NO_ACTION card must make it obvious at a glance that a human has to decide."""
    card_fields = {
        "status_emoji": "",
        "risk_level": "LOW",
        "resource_name": "node-exporter-110",
        "root_cause": "規則命中但沒有安全可執行動作",
        "suggested_action": "NO_ACTION",
        "estimated_downtime": "unknown",
        "approval_id": "INC-20260506-0001",
    }
    rendered = TelegramMessage(**card_fields).format()

    # Both the status row label and the manual-judgement wording must be present.
    for expected in ("處置狀態", "AI 無可安全執行動作,需人工判斷"):
        assert expected in rendered
def test_telegram_message_with_token_cost(self):
"""測試含 Token/Cost 的訊息"""
msg = TelegramMessage(
@@ -63,6 +103,46 @@ class TestTelegramMessageFormat:
assert "💰 Tokens: 1,500 / $0.0015" in result
@pytest.mark.asyncio
async def test_append_incident_update_deduplicates_same_status(monkeypatch):
    """An identical status update for the same incident must not be re-posted within the dedup window."""

    class FakeRedis:
        """In-memory stand-in: a stored message id, and SET NX that succeeds only once."""

        def __init__(self):
            self.set_calls = 0

        async def get(self, key):
            assert key == "tg_msg:INC-DEDUP"
            return "12345"

        async def set(self, *args, **kwargs):
            self.set_calls += 1
            assert kwargs["nx"] is True
            assert kwargs["ex"] > 0
            # Only the first SET NX "wins"; later calls look like an existing key.
            return self.set_calls == 1

    fake_redis = FakeRedis()
    captured_calls = []
    gateway = TelegramGateway()

    async def fake_send_request(method, payload):
        captured_calls.append((method, payload))
        return {"ok": True}

    monkeypatch.setattr(telegram_gateway_module, "get_redis", lambda: fake_redis)
    monkeypatch.setattr(gateway, "_send_request", fake_send_request)

    status_line = "🤖❌ <b>[AUTO] AI 自動修復失敗,已升級人工介入</b>"

    # Both calls report success, but only the first may reach Telegram.
    for _ in range(2):
        assert await gateway.append_incident_update("INC-DEDUP", status_line) is True

    invoked_methods = [call[0] for call in captured_calls]
    assert invoked_methods == [
        "editMessageReplyMarkup",
        "sendMessage",
    ]
class TestSentryErrorMessage:
"""測試 Sentry 錯誤訊息"""

View File

@@ -1480,5 +1480,50 @@
"error": "Failed to load queue",
"retry": "Retry"
}
},
"awooop": {
"home": {
"eyebrow": "AI Automation Control Plane",
"title": "AwoooP Governance Overview",
"subtitle": "Unifies tenants, contracts, runs, approvals, and channel state into one operator surface so the AI flywheel and governance plane do not drift apart.",
"refresh": "Refresh",
"snapshotStatus": "Snapshot Status",
"lastUpdated": "Last Updated",
"migrationMode": "Migration Mode",
"migrationValue": "mirror / shadow",
"ready": "In Sync",
"loading": "Loading",
"degraded": "Degraded",
"metrics": {
"tenants": "Tenants",
"tenantsDetail": "{active} active, {shadow} in shadow",
"runs": "Operator Runs",
"runsDetail": "Run state is the single view into async work",
"approvals": "Pending Approvals",
"approvalsDetail": "Every high-risk action must stop at the human gate",
"contracts": "Contracts",
"contractsDetail": "Project / Agent / Policy contract publish state"
},
"lanes": {
"title": "Flywheel Lanes",
"live": "Live",
"mirror": "Mirror",
"providerName": "Provider Order",
"providerDetail": "GCP-A Ollama -> GCP-B Ollama -> 111 Ollama -> OpenClaw/Nemo -> Gemini",
"mcpName": "MCP Gateway",
"mcpDetail": "MCP Gateway stays in mirror / wrap mode before audit and redaction are proven as the only execution gate",
"channelName": "Channel Hub",
"channelDetail": "Telegram / LINE / Slack enter Channel Event first, then message ownership moves gradually",
"approvalName": "Approval Plane",
"approvalDetail": "Run state and Approval plane share one approval meaning"
},
"next": {
"title": "Next Actions",
"item1": "Review run monitor and provider fallback",
"item2": "Handle pending high-risk approvals",
"item3": "Review contract lifecycle",
"item4": "Open the AwoooP work map"
}
}
}
}

View File

@@ -1481,5 +1481,50 @@
"error": "無法載入待辦佇列",
"retry": "重試"
}
},
"awooop": {
"home": {
"eyebrow": "AI 自動化飛輪控制面",
"title": "AwoooP 治理總覽",
"subtitle": "把租戶、合約、Run、審批與通道狀態收斂到同一個操作面,避免 AI 自動化飛輪和治理面各自長出一套邏輯。",
"refresh": "重新整理",
"snapshotStatus": "快照狀態",
"lastUpdated": "最後更新",
"migrationMode": "遷移模式",
"migrationValue": "mirror / shadow",
"ready": "已同步",
"loading": "讀取中",
"degraded": "降級",
"metrics": {
"tenants": "租戶",
"tenantsDetail": "{active} 個啟用,{shadow} 個 shadow",
"runs": "Operator Runs",
"runsDetail": "Run state 是非同步任務的唯一觀測入口",
"approvals": "待審批",
"approvalsDetail": "所有高風險動作都必須停在人工閘門",
"contracts": "合約",
"contractsDetail": "Project / Agent / Policy contract 發布狀態"
},
"lanes": {
"title": "飛輪鏈路",
"live": "已接線",
"mirror": "Mirror",
"providerName": "Provider 順序",
"providerDetail": "GCP-A Ollama -> GCP-B Ollama -> 111 Ollama -> OpenClaw/Nemo -> Gemini",
"mcpName": "MCP Gateway",
"mcpDetail": "MCP Gateway 先 mirror / wrap,確認 audit 與 redaction 後才切成唯一閘門",
"channelName": "Channel Hub",
"channelDetail": "Telegram / LINE / Slack 先進 Channel Event,再逐步切換發送責任",
"approvalName": "Approval Plane",
"approvalDetail": "Run state 與 Approval plane 共享同一條審批語義"
},
"next": {
"title": "下一步操作",
"item1": "查看 Run 監控與 provider fallback",
"item2": "處理等待審批的高風險操作",
"item3": "審查 Contract lifecycle",
"item4": "查看 AwoooP 工作鏈路地圖"
}
}
}
}

View File

@@ -1,9 +1,360 @@
// =============================================================================
// WOOO AIOps - AwoooP Console 入口
// WOOO AIOps - AwoooP Operator Console
// =============================================================================
// 將 AwoooP 定位為 AI 自動化飛輪的治理面、稽核面與人工操作面。
import AwoooPWorkItemsPage from "./work-items/page";
"use client";
import { useCallback, useEffect, useMemo, useState } from "react";
import { useLocale, useTranslations } from "next-intl";
import {
Activity,
ArrowRight,
BrainCircuit,
CheckCircle2,
FileText,
GitBranch,
RefreshCw,
ShieldCheck,
Waypoints,
} from "lucide-react";
import { Link } from "@/i18n/routing";
import { cn } from "@/lib/utils";
// Tenant row as returned in the `tenants` array of the tenants endpoint.
// This page only reads `is_active` (anything other than `false` counts as
// active) and `migration_mode` (compared against "shadow").
type Tenant = {
project_id: string;
display_name?: string;
migration_mode?: string;
is_active?: boolean;
};
// Loose shape shared by all four platform endpoint payloads. Every field is
// optional because each endpoint is only expected to fill some of them;
// countRows() prefers `total` and otherwise falls back to an array length.
type PlatformResponse = {
tenants?: Tenant[];
total?: number;
runs?: unknown[];
contracts?: unknown[];
items?: unknown[];
};
// Aggregated counters rendered by the metric cells on the overview page.
type Snapshot = {
tenants: number;
activeTenants: number;
shadowTenants: number;
runs: number;
approvals: number;
contracts: number;
};
// "loading" while a fetch is in flight, "ready" after a successful fetch,
// "degraded" when any request fails or returns a non-OK response.
type SnapshotStatus = "loading" | "ready" | "degraded";
// Falls back to "" so request URLs become relative to the current origin.
const API_BASE = process.env.NEXT_PUBLIC_API_URL ?? "";
// Initial state shown before the first snapshot has been fetched.
const emptySnapshot: Snapshot = {
tenants: 0,
activeTenants: 0,
shadowTenants: 0,
runs: 0,
approvals: 0,
contracts: 0,
};
/**
 * Coerce an arbitrary value to a displayable number.
 *
 * Finite numbers pass through unchanged; everything else (non-numbers, NaN,
 * +/-Infinity) collapses to 0.
 */
function numberValue(value: unknown): number {
  if (typeof value !== "number") {
    return 0;
  }
  return Number.isFinite(value) ? value : 0;
}
/**
 * Derive a row count from a platform response.
 *
 * A numeric `total` wins outright. Otherwise the length of the first key in
 * `keys` (in the given order) whose value is an array is used. Returns 0 when
 * neither is available.
 */
function countRows(data: PlatformResponse, keys: Array<keyof PlatformResponse>): number {
  const { total } = data;
  if (typeof total === "number") {
    return total;
  }

  const firstArrayKey = keys.find((key) => Array.isArray(data[key]));
  if (firstArrayKey === undefined) {
    return 0;
  }

  const rows = data[firstArrayKey];
  return Array.isArray(rows) ? rows.length : 0;
}
/**
 * Single metric tile: label, large value, tone-coloured icon badge and a
 * one-line detail caption underneath.
 */
function MetricCell({
  label,
  value,
  detail,
  icon: Icon,
  tone = "neutral",
}: {
  label: string;
  value: string | number;
  detail: string;
  icon: typeof Activity;
  tone?: "neutral" | "good" | "warn";
}) {
  // Badge colours per tone; exactly one entry applies to a given tile.
  const badgeToneClasses: Record<"neutral" | "good" | "warn", string> = {
    good: "border-[#9bc7a4] bg-[#f0faf2] text-[#17602a]",
    warn: "border-[#d9b36f] bg-[#fff7e8] text-[#8a5a08]",
    neutral: "border-[#d8d3c7] bg-[#faf9f3] text-[#5f5b52]",
  };
  const badgeClassName = cn(
    "flex h-8 w-8 items-center justify-center border",
    badgeToneClasses[tone]
  );

  return (
    <div className="min-h-[116px] border border-[#e0ddd4] bg-white px-4 py-3">
      <div className="flex items-start justify-between gap-3">
        <div>
          <p className="text-xs font-semibold text-[#77736a]">{label}</p>
          <p className="mt-2 font-mono text-3xl font-semibold leading-none text-[#141413]">
            {value}
          </p>
        </div>
        <span className={badgeClassName}>
          <Icon className="h-4 w-4" aria-hidden="true" />
        </span>
      </div>
      <p className="mt-3 text-xs leading-5 text-[#5f5b52]">{detail}</p>
    </div>
  );
}
/**
 * One row of the flywheel-lane table: lane name, tone-coloured status pill
 * and a free-text detail column.
 */
function LaneRow({
  name,
  status,
  detail,
  tone,
}: {
  name: string;
  status: string;
  detail: string;
  tone: "good" | "warn" | "neutral";
}) {
  // Status pill colours per tone; exactly one entry applies to a given row.
  const pillToneClasses: Record<"good" | "warn" | "neutral", string> = {
    good: "border-[#9bc7a4] bg-[#f0faf2] text-[#17602a]",
    warn: "border-[#d9b36f] bg-[#fff7e8] text-[#8a5a08]",
    neutral: "border-[#d8d3c7] bg-white text-[#5f5b52]",
  };
  const pillClassName = cn(
    "inline-flex border px-2 py-0.5 text-xs font-semibold",
    pillToneClasses[tone]
  );

  return (
    <div className="grid gap-3 border-b border-[#eee9dd] px-4 py-3 last:border-b-0 md:grid-cols-[180px_140px_1fr]">
      <div className="font-mono text-xs font-semibold text-[#141413]">{name}</div>
      <div>
        <span className={pillClassName}>{status}</span>
      </div>
      <div className="text-sm leading-5 text-[#5f5b52]">{detail}</div>
    </div>
  );
}
/**
 * AwoooP governance overview page.
 *
 * Fetches tenant, run, approval and contract counts from the platform API and
 * renders them as metric tiles, next to a static description of the flywheel
 * lanes and a list of shortcut links into the AwoooP sub-pages.
 *
 * The previous body started with `return <AwoooPWorkItemsPage />;`, which made
 * every hook and the whole overview below it unreachable; that stale early
 * return has been removed so the overview actually renders.
 */
export default function AwoooPPage() {
  const t = useTranslations("awooop.home");
  const locale = useLocale();
  const [snapshot, setSnapshot] = useState<Snapshot>(emptySnapshot);
  const [status, setStatus] = useState<SnapshotStatus>("loading");
  const [lastUpdated, setLastUpdated] = useState<Date | null>(null);

  // Loads all four platform endpoints in parallel. Any network error or
  // non-OK response marks the whole snapshot as degraded and leaves the
  // previously displayed counters in place.
  const fetchSnapshot = useCallback(async () => {
    setStatus("loading");
    try {
      const [tenantRes, runRes, approvalRes, contractRes] = await Promise.all([
        fetch(`${API_BASE}/api/v1/platform/tenants`),
        // per_page=1 keeps the payload small; the count then relies on the
        // response carrying `total` (see countRows) — TODO confirm with the API.
        fetch(`${API_BASE}/api/v1/platform/runs/list?per_page=1`),
        fetch(`${API_BASE}/api/v1/platform/approvals`),
        fetch(`${API_BASE}/api/v1/platform/contracts?per_page=1`),
      ]);

      if (![tenantRes, runRes, approvalRes, contractRes].every((res) => res.ok)) {
        throw new Error("platform snapshot fetch failed");
      }

      const [tenantData, runData, approvalData, contractData] = await Promise.all([
        tenantRes.json() as Promise<PlatformResponse>,
        runRes.json() as Promise<PlatformResponse>,
        approvalRes.json() as Promise<PlatformResponse>,
        contractRes.json() as Promise<PlatformResponse>,
      ]);

      const tenants = Array.isArray(tenantData.tenants) ? tenantData.tenants : [];
      setSnapshot({
        tenants: countRows(tenantData, ["tenants", "items"]),
        // A tenant without an explicit `is_active: false` is treated as active.
        activeTenants: tenants.filter((tenant) => tenant.is_active !== false).length,
        shadowTenants: tenants.filter((tenant) => tenant.migration_mode === "shadow").length,
        runs: countRows(runData, ["runs", "items"]),
        approvals: countRows(approvalData, ["items"]),
        contracts: countRows(contractData, ["contracts", "items"]),
      });
      setLastUpdated(new Date());
      setStatus("ready");
    } catch {
      // The timestamp records the last attempt, successful or not.
      setStatus("degraded");
      setLastUpdated(new Date());
    }
  }, []);

  // Initial load on mount.
  useEffect(() => {
    fetchSnapshot();
  }, [fetchSnapshot]);

  const formattedUpdated = useMemo(() => {
    if (!lastUpdated) return "--";
    return lastUpdated.toLocaleTimeString(locale === "zh-TW" ? "zh-TW" : "en-US", {
      hour: "2-digit",
      minute: "2-digit",
      second: "2-digit",
    });
  }, [lastUpdated, locale]);

  const isLoading = status === "loading";
  const healthTone = status === "ready" ? "good" : isLoading ? "neutral" : "warn";
  const statusLabel = status === "ready" ? t("ready") : isLoading ? t("loading") : t("degraded");

  return (
    <div className="space-y-5">
      <section className="border border-[#e0ddd4] bg-white">
        <div className="grid gap-px bg-[#e0ddd4] lg:grid-cols-[1.4fr_0.9fr]">
          <div className="bg-white p-5">
            <div className="flex flex-wrap items-start justify-between gap-4">
              <div className="max-w-3xl">
                <div className="flex items-center gap-2 text-xs font-semibold text-[#d97757]">
                  <BrainCircuit className="h-4 w-4" aria-hidden="true" />
                  <span>{t("eyebrow")}</span>
                </div>
                <h2 className="mt-3 text-2xl font-semibold tracking-normal text-[#141413]">
                  {t("title")}
                </h2>
                <p className="mt-3 max-w-2xl text-sm leading-6 text-[#5f5b52]">
                  {t("subtitle")}
                </p>
              </div>
              {/* Disabled while a fetch is in flight so overlapping requests
                  cannot resolve out of order and overwrite newer data. */}
              <button
                type="button"
                onClick={fetchSnapshot}
                disabled={isLoading}
                className="inline-flex items-center gap-2 border border-[#d8d3c7] bg-[#faf9f3] px-3 py-2 text-xs font-semibold text-[#141413] hover:border-[#d97757]"
                aria-label={t("refresh")}
              >
                <RefreshCw
                  className={cn("h-4 w-4", isLoading && "animate-spin")}
                  aria-hidden="true"
                />
                {t("refresh")}
              </button>
            </div>
          </div>
          <div className="bg-[#faf9f3] p-5">
            <div className="grid gap-3 text-sm">
              <div className="flex items-center justify-between gap-3">
                <span className="text-[#77736a]">{t("snapshotStatus")}</span>
                <span
                  className={cn(
                    "inline-flex border px-2 py-0.5 text-xs font-semibold",
                    healthTone === "good" && "border-[#9bc7a4] bg-[#f0faf2] text-[#17602a]",
                    healthTone === "warn" && "border-[#d9b36f] bg-[#fff7e8] text-[#8a5a08]",
                    healthTone === "neutral" && "border-[#d8d3c7] bg-white text-[#5f5b52]"
                  )}
                >
                  {statusLabel}
                </span>
              </div>
              <div className="flex items-center justify-between gap-3">
                <span className="text-[#77736a]">{t("lastUpdated")}</span>
                <span className="font-mono text-[#141413]">{formattedUpdated}</span>
              </div>
              <div className="flex items-center justify-between gap-3">
                <span className="text-[#77736a]">{t("migrationMode")}</span>
                <span className="font-mono text-[#141413]">{t("migrationValue")}</span>
              </div>
            </div>
          </div>
        </div>
      </section>

      <section className="grid gap-3 md:grid-cols-2 xl:grid-cols-4">
        <MetricCell
          label={t("metrics.tenants")}
          value={numberValue(snapshot.tenants)}
          detail={t("metrics.tenantsDetail", {
            active: numberValue(snapshot.activeTenants),
            shadow: numberValue(snapshot.shadowTenants),
          })}
          icon={Waypoints}
          tone="good"
        />
        <MetricCell
          label={t("metrics.runs")}
          value={numberValue(snapshot.runs)}
          detail={t("metrics.runsDetail")}
          icon={Activity}
          tone={snapshot.runs > 0 ? "good" : "neutral"}
        />
        <MetricCell
          label={t("metrics.approvals")}
          value={numberValue(snapshot.approvals)}
          detail={t("metrics.approvalsDetail")}
          icon={ShieldCheck}
          tone={snapshot.approvals > 0 ? "warn" : "good"}
        />
        <MetricCell
          label={t("metrics.contracts")}
          value={numberValue(snapshot.contracts)}
          detail={t("metrics.contractsDetail")}
          icon={FileText}
          tone={snapshot.contracts > 0 ? "good" : "warn"}
        />
      </section>

      <section className="grid gap-5 xl:grid-cols-[1.1fr_0.9fr]">
        <div className="border border-[#e0ddd4] bg-white">
          <div className="border-b border-[#e0ddd4] bg-[#faf9f3] px-4 py-3">
            <div className="flex items-center gap-2">
              <GitBranch className="h-4 w-4 text-[#d97757]" aria-hidden="true" />
              <h3 className="text-sm font-semibold text-[#141413]">{t("lanes.title")}</h3>
            </div>
          </div>
          <LaneRow
            name={t("lanes.providerName")}
            status={t("lanes.live")}
            detail={t("lanes.providerDetail")}
            tone="good"
          />
          <LaneRow
            name={t("lanes.mcpName")}
            status={t("lanes.mirror")}
            detail={t("lanes.mcpDetail")}
            tone="warn"
          />
          <LaneRow
            name={t("lanes.channelName")}
            status={t("lanes.mirror")}
            detail={t("lanes.channelDetail")}
            tone="warn"
          />
          <LaneRow
            name={t("lanes.approvalName")}
            status={t("lanes.live")}
            detail={t("lanes.approvalDetail")}
            tone="good"
          />
        </div>

        <div className="border border-[#e0ddd4] bg-white">
          <div className="border-b border-[#e0ddd4] bg-[#faf9f3] px-4 py-3">
            <div className="flex items-center gap-2">
              <CheckCircle2 className="h-4 w-4 text-[#d97757]" aria-hidden="true" />
              <h3 className="text-sm font-semibold text-[#141413]">{t("next.title")}</h3>
            </div>
          </div>
          <div className="divide-y divide-[#eee9dd]">
            {[
              [t("next.item1"), "/awooop/runs" as const],
              [t("next.item2"), "/awooop/approvals" as const],
              [t("next.item3"), "/awooop/contracts" as const],
              [t("next.item4"), "/awooop/work-items" as const],
            ].map(([label, href]) => (
              <Link
                key={String(label)}
                href={href}
                className="flex items-center justify-between gap-3 px-4 py-3 text-sm text-[#141413] hover:bg-[#faf9f3]"
              >
                <span>{label}</span>
                <ArrowRight className="h-4 w-4 text-[#77736a]" aria-hidden="true" />
              </Link>
            ))}
          </div>
        </div>
      </section>
    </div>
  );
}

View File

@@ -1,3 +1,23 @@
## 2026-05-06 | Telegram 事故通知語義收斂與 AwoooP 首頁總覽
**背景**:SRE 戰情室截圖顯示 ACTION REQUIRED、AI 自動修復失敗、Escalation、Code Review、Config Drift 等訊息混在同一條流中;值班者很難快速分辨哪些是 AI 已修復、哪些是 AI 無法修復需要人工、哪些只是報表或治理通知。
**本次修補**
- `TelegramMessage` 主卡新增「處置狀態」,在第一屏明確標示 `AI 已提出修復建議,等待人工批准`、`AI 無可安全執行動作,需人工判斷`、`AI 分析超時,需人工排查`、`規則建議待審批`。
- `append_incident_update()` 對同一 `incident_id` 的相同狀態回覆做 5 分鐘 Redis 去重,避免同樣的 `[AUTO] AI 自動修復失敗` 連續洗版。
- 新增 `docs/awooop/TELEGRAM-INCIDENT-NOTIFICATION-MODEL.md`,定義 Telegram / AwoooP Run Monitor / Approval Queue / Incident Timeline / MCP Audit 的分工。
- `/zh-TW/awooop` 首頁改為治理總覽,直接顯示租戶、Run、審批、合約與飛輪鏈路狀態,不再只是轉到 work-items 頁。
- 新增 AwoooP 首頁 `zh-TW` / `en` i18n 字串。
**驗證**
- `python -m py_compile apps/api/src/services/telegram_gateway.py apps/api/tests/test_telegram_message_templates.py`
- `pytest tests/test_telegram_message_templates.py tests/test_telegram_ai_automation_block.py -q` → 19 passed。
- `pnpm --dir apps/web typecheck` 通過。
- `NEXT_PUBLIC_API_URL=https://awoooi.wooo.work pnpm --dir apps/web build` 通過。
**注意**
- `ruff check src/services/telegram_gateway.py ...` 仍會掃到 `telegram_gateway.py` 既有 import/order、bare except、單行 if 等歷史債;本輪沒有在 6000+ 行 gateway 巨檔做無關機械清理,避免混入額外行為風險。
## 2026-05-06 | AwoooP Run 監控頁 422 修正
**背景**:Playwright 驗證 `/zh-TW/awooop` 時未再看到 client-side exception,但 `/zh-TW/awooop/runs` 會顯示「無法載入 Run 資料 HTTP 422」。後端 log 顯示 `GET /api/v1/platform/runs/list?page=1&per_page=50` 被回 422。

View File

@@ -0,0 +1,47 @@
# Telegram 事故通知模型
> 目的:讓 SRE 戰情室一眼分辨「AI 已修復」、「AI 可建議但需批准」、「AI 無法安全修復需人工」與「僅通知」,避免告警、執行 log、Code Review、Drift 與審批結果互相洗版。
## 核心判斷
Telegram 不應是完整執行日誌,也不應承載所有 AI 推理細節。Telegram 的職責是把需要人類注意力的決策摘要送到 SRE 戰情室;完整時間線、工具輸出、重試原因、provider fallback 與 audit 則交給 AwoooP Run Monitor / Incident Timeline。
## 四種通知狀態
| 狀態 | 意義 | Telegram 行為 | 操作者動作 |
| --- | --- | --- | --- |
| AI 已自動修復 | 自動化已完成且驗證通過 | 更新原 incident 卡或回覆一次結果 | 檢查即可,不需批准 |
| AI 建議待審批 | AI / 規則已提出可執行建議,但被 Trust / Risk gate 擋下 | 發一張 ACTION REQUIRED 主卡 | 批准、拒絕、靜默或看詳情 |
| AI 無法安全修復 | NO_ACTION、INVALID_TARGET、LLM timeout、MCP 失敗或缺少安全動作 | 發人工接手摘要,不重複刷同一狀態 | 人工排查,或要求重診 |
| 僅通知 | 心跳、報表、Code Review 完成、低風險治理資訊 | 彙總卡或摘要頻道 | 通常不需即時動作 |
## 專業化訊息規則
1. 同一個 `incident_id` 只應有一張主卡。後續狀態使用原卡回覆、編輯按鈕或 AwoooP timeline,不再每一步都新發卡。
2. 主卡第一屏必須顯示「處置狀態」,先回答:AI 是否能修、是否已修、是否需要人工。
3. 同一個 `incident_id` 的相同狀態更新,短時間內要去重。詳細重試與錯誤放到 timeline,不洗 Telegram。
4. P0 / P1 escalation 可以另發升級卡,但內容必須是「目前影響、已嘗試、卡住原因、需要誰做什麼」,不可重貼所有底層 log。
5. Code Review、Config Drift、報表、心跳不應和 incident 執行回覆混在同一種語義;它們可以在同一 SRE 群組,但必須以摘要卡與固定前綴區分。
## 與 AwoooP 的分工
| 介面 | 承載內容 |
| --- | --- |
| Telegram | 決策摘要、升級、人工批准入口 |
| AwoooP Run Monitor | 非同步 Run、provider fallback、tool call、retry、latency |
| Approval Queue | 所有等待批准的高風險動作 |
| Incident Timeline | 事件完整歷程、AI 嘗試、失敗原因、KM / Playbook 回寫 |
| MCP Audit | 工具執行、redaction、permission gate、credential 注入 |
## 本輪落地
- `TelegramMessage` 主卡新增「處置狀態」。
- `append_incident_update()` 對同一 incident 的相同狀態做 5 分鐘 Redis 去重。
- 既有 `詳情 / 重診 / 歷史` 按鈕保留,讓 Telegram 保持輕量,細節回到控制台。
## 後續建議
1. 將 Telegram 群組升級為 Forum topics 或固定 topic lane:`P0/P1 事故`、`人工審批`、`治理/報表`、`CI/Code Review`。
2. AwoooP Approval Queue 顯示與 Telegram 相同的「處置狀態」欄位,避免前後端語義分裂。
3. 將 auto-repair failure 的完整 stdout/stderr 改寫入 Run Timeline,只在 Telegram 顯示最短摘要與詳情連結。
4. 對 firing 告警做 fingerprint 聚合:同一 alertname + target + namespace 在窗口內只更新卡片,不新增卡片。