fix(telegram): clarify automation notification state
This commit is contained in:
@@ -23,6 +23,7 @@ SOUL.md 鐵律 (4.1 Telegram 訊息壓縮原則):
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import html
|
||||
import os
|
||||
import re
|
||||
@@ -50,6 +51,8 @@ SNOOZE_KEY_PREFIX = "telegram_snooze:" # {approval_id} -> 稍後提醒
|
||||
SILENCE_KEY_PREFIX = "telegram_silence:" # {resource_name} -> 靜默
|
||||
SNOOZE_TTL_SECONDS = 30 * 60 # 30 分鐘
|
||||
SILENCE_TTL_SECONDS = 60 * 60 # 1 小時
|
||||
INCIDENT_UPDATE_DEDUP_PREFIX = "awoooi:tg_update_dedup:" # {incident_id}:{status_hash}
|
||||
INCIDENT_UPDATE_DEDUP_TTL_SECONDS = 5 * 60 # 5 分鐘內相同狀態不重複洗版
|
||||
|
||||
# 2026-04-01 Claude Code: Long Polling 分散式 Leader Election
|
||||
# 防止多 Pod 同時 getUpdates → 409 Conflict 互搶問題
|
||||
@@ -261,6 +264,28 @@ class TelegramMessage:
|
||||
return "analysis_degraded"
|
||||
return "safe_gate_pending"
|
||||
|
||||
def _automation_status_summary(self) -> str:
    """Return the human-readable disposition line shown on the card's first screen.

    This line is the on-call triage entry point: it tells the reader up front
    whether the card is "AI has a proposal awaiting approval", "AI cannot
    repair, manual handling required", or "observation only"; the details
    belong in the automation-chain block that follows.

    Returns:
        One fixed status string (emoji prefix + Traditional Chinese text).
        The checks run in priority order and the first match wins.
    """
    mode = self._automation_mode()
    # Normalise before matching so padded values such as "NO_ACTION\n" or a
    # whitespace-only action are still recognised as "nothing to execute".
    action = (self.suggested_action or "").strip().upper()
    text = f"{self.root_cause} {self.suggested_action}".lower()
    # A missing confidence is treated as 0 rather than raising on ``None <= 0``.
    confidence = self.confidence if self.confidence is not None else 0

    if mode == "llm_timeout_manual_gate":
        return "🔴 AI 分析超時,需人工排查"
    if action in {"NO_ACTION", "待分析", ""} or "invalid_target" in text:
        return "🟠 AI 無可安全執行動作,需人工判斷"
    # No positive AI confidence: the suggestion is reported as rule-based.
    if confidence <= 0:
        return "🟡 規則建議待審批"
    if mode == "analysis_degraded":
        return "🟠 AI 降級分析,需人工判斷"
    if mode == "ai_proposal_ready":
        return "🟡 AI 已提出修復建議,等待人工批准"
    # Fallback for any other mode.
    return "🟡 安全閘門待審批"
|
||||
|
||||
def _format_automation_block(self) -> str:
|
||||
"""Visible AI automation chain for every ACTION REQUIRED card.
|
||||
2026-05-04 ogt: 加入 Token 用量 + 具體 Ollama 伺服器顯示
|
||||
@@ -344,6 +369,7 @@ class TelegramMessage:
|
||||
safe_root_cause = html.escape(self.root_cause)
|
||||
safe_action = html.escape(self.suggested_action)
|
||||
safe_downtime = html.escape(self.estimated_downtime)
|
||||
safe_automation_summary = html.escape(self._automation_status_summary())
|
||||
|
||||
# 2026-03-29 ogt: AI Token/Cost 顯示
|
||||
ai_cost_display = ""
|
||||
@@ -441,6 +467,7 @@ class TelegramMessage:
|
||||
f"📋 <code>{html.escape(incident_id)}</code>\n"
|
||||
f"🎯 資源:<code>{safe_resource}</code>\n"
|
||||
f"{category_line}"
|
||||
f"🧭 處置狀態:<b>{safe_automation_summary}</b>\n"
|
||||
f"\n"
|
||||
f"{automation_block}"
|
||||
f"\n"
|
||||
@@ -4462,8 +4489,6 @@ class TelegramGateway:
|
||||
|
||||
2026-04-09 Claude Sonnet 4.6 Asia/Taipei (統帥要求: 狀態變更在原訊息延續)
|
||||
"""
|
||||
from src.core.redis_client import get_redis
|
||||
|
||||
redis = get_redis()
|
||||
redis_key = f"tg_msg:{incident_id}"
|
||||
stored = await redis.get(redis_key)
|
||||
@@ -4481,6 +4506,31 @@ class TelegramGateway:
|
||||
logger.warning("append_incident_update_invalid_message_id", stored=stored)
|
||||
return False
|
||||
|
||||
# Telegram 只適合放決策摘要;同一 incident 的相同狀態 5 分鐘內不重複回覆,
|
||||
# 詳細執行紀錄應進 timeline / AwoooP Run Monitor,避免群組被 auto-failure 洗版。
|
||||
status_hash = hashlib.sha1(status_line.encode("utf-8")).hexdigest()[:16]
|
||||
dedup_key = f"{INCIDENT_UPDATE_DEDUP_PREFIX}{incident_id}:{status_hash}"
|
||||
try:
|
||||
was_set = await redis.set(
|
||||
dedup_key,
|
||||
"1",
|
||||
ex=INCIDENT_UPDATE_DEDUP_TTL_SECONDS,
|
||||
nx=True,
|
||||
)
|
||||
if not was_set:
|
||||
logger.info(
|
||||
"append_incident_update_dedup_suppressed",
|
||||
incident_id=incident_id,
|
||||
dedup_key=dedup_key,
|
||||
)
|
||||
return True
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"append_incident_update_dedup_failed",
|
||||
incident_id=incident_id,
|
||||
error=str(exc),
|
||||
)
|
||||
|
||||
# Step 1: 取得原始訊息文字(Telegram Bot API 不提供讀取原文,只能在 editMessageText 裡重建)
|
||||
# 策略: 只追加 status_line,不讀取原文(Telegram edit 要傳完整新文字)
|
||||
# 所以先用 editMessageReplyMarkup 換按鈕,再 sendMessage 同 chat 以 reply 方式追加狀態
|
||||
|
||||
@@ -7,6 +7,7 @@ test_telegram_message_templates.py - Telegram 訊息模板測試
|
||||
|
||||
import pytest
|
||||
|
||||
import src.services.telegram_gateway as telegram_gateway_module
|
||||
from src.services.telegram_gateway import (
|
||||
DailySummaryMessage,
|
||||
DeploySuccessMessage,
|
||||
@@ -15,6 +16,7 @@ from src.services.telegram_gateway import (
|
||||
ResourceWarnMessage,
|
||||
SentryErrorMessage,
|
||||
TelegramMessage,
|
||||
TelegramGateway,
|
||||
)
|
||||
|
||||
|
||||
@@ -38,12 +40,50 @@ class TestTelegramMessageFormat:
|
||||
assert "🚨" in result
|
||||
assert "嚴重" in result
|
||||
assert "test-pod-123" in result
|
||||
assert "處置狀態" in result
|
||||
assert "規則建議待審批" in result
|
||||
assert "AI 自動化鏈路" in result
|
||||
assert "OpenClaw" in result
|
||||
assert "NemoTron" in result
|
||||
assert "ElephantAlpha" in result
|
||||
assert len(result) <= 4096 # Telegram HTML message limit
|
||||
|
||||
def test_telegram_message_ai_proposal_marks_approval_wait(self):
    """A repair suggestion carrying an AI confidence score must be labelled as an AI proposal awaiting approval."""
    card_fields = {
        "status_emoji": "⚠️",
        "risk_level": "MEDIUM",
        "resource_name": "awoooi-api",
        "root_cause": "CPU sustained high",
        "suggested_action": "kubectl rollout restart deployment/awoooi-api",
        "estimated_downtime": "~30s",
        "approval_id": "INC-20260506-0000",
        "confidence": 0.82,
        "ai_provider": "ollama_gcp_a",
    }

    rendered = TelegramMessage(**card_fields).format()

    # Both the status label and the AI-proposal wording must be present.
    for expected in ("處置狀態", "AI 已提出修復建議,等待人工批准"):
        assert expected in rendered
|
||||
|
||||
def test_telegram_message_no_action_marks_manual_judgement(self):
    """A NO_ACTION card must make it obvious at a glance that manual judgement is needed."""
    card_fields = {
        "status_emoji": "ℹ️",
        "risk_level": "LOW",
        "resource_name": "node-exporter-110",
        "root_cause": "規則命中但沒有安全可執行動作",
        "suggested_action": "NO_ACTION",
        "estimated_downtime": "unknown",
        "approval_id": "INC-20260506-0001",
    }

    rendered = TelegramMessage(**card_fields).format()

    # Both the status label and the manual-judgement wording must be present.
    for expected in ("處置狀態", "AI 無可安全執行動作,需人工判斷"):
        assert expected in rendered
|
||||
|
||||
def test_telegram_message_with_token_cost(self):
|
||||
"""測試含 Token/Cost 的訊息"""
|
||||
msg = TelegramMessage(
|
||||
@@ -63,6 +103,46 @@ class TestTelegramMessageFormat:
|
||||
assert "💰 Tokens: 1,500 / $0.0015" in result
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_append_incident_update_deduplicates_same_status(monkeypatch):
    """An identical status update for the same incident must not be re-posted within the dedup window.

    The first call is expected to reach Telegram (button edit + reply); the
    second identical call must be suppressed by the Redis ``SET NX`` guard
    while still reporting success to the caller.
    """

    class FakeRedis:
        """Minimal async Redis double: ``SET NX`` succeeds only on the first call."""

        def __init__(self):
            self.set_calls = 0
            self.set_keys = []

        async def get(self, key):
            assert key == "tg_msg:INC-DEDUP"
            return "12345"

        async def set(self, *args, **kwargs):
            self.set_calls += 1
            # Record the key so the test can verify what the dedup is keyed on.
            self.set_keys.append(args[0] if args else kwargs.get("name"))
            assert kwargs["nx"] is True
            assert kwargs["ex"] > 0
            return self.set_calls == 1

    fake_redis = FakeRedis()
    sent_requests = []
    gateway = TelegramGateway()

    async def fake_send_request(method, payload):
        sent_requests.append((method, payload))
        return {"ok": True}

    monkeypatch.setattr(telegram_gateway_module, "get_redis", lambda: fake_redis)
    monkeypatch.setattr(gateway, "_send_request", fake_send_request)

    status_line = "🤖❌ <b>[AUTO] AI 自動修復失敗,已升級人工介入</b>"

    assert await gateway.append_incident_update("INC-DEDUP", status_line) is True
    assert await gateway.append_incident_update("INC-DEDUP", status_line) is True

    # Only the first call may hit Telegram; the duplicate is swallowed.
    assert [method for method, _ in sent_requests] == [
        "editMessageReplyMarkup",
        "sendMessage",
    ]

    # The guard must be consulted on both calls, with the same
    # incident-scoped key each time.
    assert fake_redis.set_calls == 2
    first_key, second_key = fake_redis.set_keys
    assert first_key == second_key
    assert first_key.startswith(
        f"{telegram_gateway_module.INCIDENT_UPDATE_DEDUP_PREFIX}INC-DEDUP:"
    )
|
||||
|
||||
|
||||
class TestSentryErrorMessage:
|
||||
"""測試 Sentry 錯誤訊息"""
|
||||
|
||||
|
||||
@@ -1480,5 +1480,50 @@
|
||||
"error": "Failed to load queue",
|
||||
"retry": "Retry"
|
||||
}
|
||||
},
|
||||
"awooop": {
|
||||
"home": {
|
||||
"eyebrow": "AI Automation Control Plane",
|
||||
"title": "AwoooP Governance Overview",
|
||||
"subtitle": "Unifies tenants, contracts, runs, approvals, and channel state into one operator surface so the AI flywheel and governance plane do not drift apart.",
|
||||
"refresh": "Refresh",
|
||||
"snapshotStatus": "Snapshot Status",
|
||||
"lastUpdated": "Last Updated",
|
||||
"migrationMode": "Migration Mode",
|
||||
"migrationValue": "mirror / shadow",
|
||||
"ready": "In Sync",
|
||||
"loading": "Loading",
|
||||
"degraded": "Degraded",
|
||||
"metrics": {
|
||||
"tenants": "Tenants",
|
||||
"tenantsDetail": "{active} active, {shadow} in shadow",
|
||||
"runs": "Operator Runs",
|
||||
"runsDetail": "Run state is the single view into async work",
|
||||
"approvals": "Pending Approvals",
|
||||
"approvalsDetail": "Every high-risk action must stop at the human gate",
|
||||
"contracts": "Contracts",
|
||||
"contractsDetail": "Project / Agent / Policy contract publish state"
|
||||
},
|
||||
"lanes": {
|
||||
"title": "Flywheel Lanes",
|
||||
"live": "Live",
|
||||
"mirror": "Mirror",
|
||||
"providerName": "Provider Order",
|
||||
"providerDetail": "GCP-A Ollama -> GCP-B Ollama -> 111 Ollama -> OpenClaw/Nemo -> Gemini",
|
||||
"mcpName": "MCP Gateway",
|
||||
"mcpDetail": "MCP Gateway stays in mirror / wrap mode before audit and redaction are proven as the only execution gate",
|
||||
"channelName": "Channel Hub",
|
||||
"channelDetail": "Telegram / LINE / Slack enter Channel Event first, then message ownership moves gradually",
|
||||
"approvalName": "Approval Plane",
|
||||
"approvalDetail": "Run state and Approval plane share one approval meaning"
|
||||
},
|
||||
"next": {
|
||||
"title": "Next Actions",
|
||||
"item1": "Review run monitor and provider fallback",
|
||||
"item2": "Handle pending high-risk approvals",
|
||||
"item3": "Review contract lifecycle",
|
||||
"item4": "Open the AwoooP work map"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1481,5 +1481,50 @@
|
||||
"error": "無法載入待辦佇列",
|
||||
"retry": "重試"
|
||||
}
|
||||
},
|
||||
"awooop": {
|
||||
"home": {
|
||||
"eyebrow": "AI 自動化飛輪控制面",
|
||||
"title": "AwoooP 治理總覽",
|
||||
"subtitle": "把租戶、合約、Run、審批與通道狀態收斂到同一個操作面,避免 AI 自動化飛輪和治理面各自長出一套邏輯。",
|
||||
"refresh": "重新整理",
|
||||
"snapshotStatus": "快照狀態",
|
||||
"lastUpdated": "最後更新",
|
||||
"migrationMode": "遷移模式",
|
||||
"migrationValue": "mirror / shadow",
|
||||
"ready": "已同步",
|
||||
"loading": "讀取中",
|
||||
"degraded": "降級",
|
||||
"metrics": {
|
||||
"tenants": "租戶",
|
||||
"tenantsDetail": "{active} 個啟用,{shadow} 個 shadow",
|
||||
"runs": "Operator Runs",
|
||||
"runsDetail": "Run state 是非同步任務的唯一觀測入口",
|
||||
"approvals": "待審批",
|
||||
"approvalsDetail": "所有高風險動作都必須停在人工閘門",
|
||||
"contracts": "合約",
|
||||
"contractsDetail": "Project / Agent / Policy contract 發布狀態"
|
||||
},
|
||||
"lanes": {
|
||||
"title": "飛輪鏈路",
|
||||
"live": "已接線",
|
||||
"mirror": "Mirror",
|
||||
"providerName": "Provider 順序",
|
||||
"providerDetail": "GCP-A Ollama -> GCP-B Ollama -> 111 Ollama -> OpenClaw/Nemo -> Gemini",
|
||||
"mcpName": "MCP Gateway",
|
||||
"mcpDetail": "MCP Gateway 先 mirror / wrap,確認 audit 與 redaction 後才切成唯一閘門",
|
||||
"channelName": "Channel Hub",
|
||||
"channelDetail": "Telegram / LINE / Slack 先進 Channel Event,再逐步切換發送責任",
|
||||
"approvalName": "Approval Plane",
|
||||
"approvalDetail": "Run state 與 Approval plane 共享同一條審批語義"
|
||||
},
|
||||
"next": {
|
||||
"title": "下一步操作",
|
||||
"item1": "查看 Run 監控與 provider fallback",
|
||||
"item2": "處理等待審批的高風險操作",
|
||||
"item3": "審查 Contract lifecycle",
|
||||
"item4": "查看 AwoooP 工作鏈路地圖"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,9 +1,360 @@
|
||||
// =============================================================================
|
||||
// WOOO AIOps - AwoooP Console 入口頁
|
||||
// WOOO AIOps - AwoooP Operator Console 首頁
|
||||
// =============================================================================
|
||||
// 將 AwoooP 定位為 AI 自動化飛輪的治理面、稽核面與人工操作面。
|
||||
|
||||
import AwoooPWorkItemsPage from "./work-items/page";
|
||||
"use client";
|
||||
|
||||
import { useCallback, useEffect, useMemo, useState } from "react";
|
||||
import { useLocale, useTranslations } from "next-intl";
|
||||
import {
|
||||
Activity,
|
||||
ArrowRight,
|
||||
BrainCircuit,
|
||||
CheckCircle2,
|
||||
FileText,
|
||||
GitBranch,
|
||||
RefreshCw,
|
||||
ShieldCheck,
|
||||
Waypoints,
|
||||
} from "lucide-react";
|
||||
import { Link } from "@/i18n/routing";
|
||||
import { cn } from "@/lib/utils";
|
||||
|
||||
type Tenant = {
|
||||
project_id: string;
|
||||
display_name?: string;
|
||||
migration_mode?: string;
|
||||
is_active?: boolean;
|
||||
};
|
||||
|
||||
type PlatformResponse = {
|
||||
tenants?: Tenant[];
|
||||
total?: number;
|
||||
runs?: unknown[];
|
||||
contracts?: unknown[];
|
||||
items?: unknown[];
|
||||
};
|
||||
|
||||
type Snapshot = {
|
||||
tenants: number;
|
||||
activeTenants: number;
|
||||
shadowTenants: number;
|
||||
runs: number;
|
||||
approvals: number;
|
||||
contracts: number;
|
||||
};
|
||||
|
||||
type SnapshotStatus = "loading" | "ready" | "degraded";
|
||||
|
||||
const API_BASE = process.env.NEXT_PUBLIC_API_URL ?? "";
|
||||
|
||||
const emptySnapshot: Snapshot = {
|
||||
tenants: 0,
|
||||
activeTenants: 0,
|
||||
shadowTenants: 0,
|
||||
runs: 0,
|
||||
approvals: 0,
|
||||
contracts: 0,
|
||||
};
|
||||
|
||||
/** Coerce an arbitrary value to a finite number; anything else (non-number, NaN, ±Infinity) becomes 0. */
function numberValue(value: unknown): number {
  if (typeof value !== "number") {
    return 0;
  }
  return Number.isFinite(value) ? value : 0;
}
|
||||
|
||||
/**
 * Derive a row count from a platform API response.
 *
 * A numeric `total` wins when present; otherwise the length of the first
 * field in `keys` that holds an array is used. Returns 0 when neither exists.
 */
function countRows(data: PlatformResponse, keys: Array<keyof PlatformResponse>): number {
  const { total } = data;
  if (typeof total === "number") {
    return total;
  }

  const firstList = keys
    .map((key) => data[key])
    .find((candidate) => Array.isArray(candidate));

  return Array.isArray(firstList) ? firstList.length : 0;
}
|
||||
|
||||
/** Single metric tile: label, large mono value, tone-coloured icon badge, and a detail line. */
function MetricCell({
  label,
  value,
  detail,
  icon: Icon,
  tone = "neutral",
}: {
  label: string;
  value: string | number;
  detail: string;
  icon: typeof Activity;
  tone?: "neutral" | "good" | "warn";
}) {
  // Tone → badge colour classes; exactly one entry applies per render.
  const badgeTone = {
    good: "border-[#9bc7a4] bg-[#f0faf2] text-[#17602a]",
    warn: "border-[#d9b36f] bg-[#fff7e8] text-[#8a5a08]",
    neutral: "border-[#d8d3c7] bg-[#faf9f3] text-[#5f5b52]",
  }[tone];

  return (
    <div className="min-h-[116px] border border-[#e0ddd4] bg-white px-4 py-3">
      <div className="flex items-start justify-between gap-3">
        <div>
          <p className="text-xs font-semibold text-[#77736a]">{label}</p>
          <p className="mt-2 font-mono text-3xl font-semibold leading-none text-[#141413]">
            {value}
          </p>
        </div>
        <span className={cn("flex h-8 w-8 items-center justify-center border", badgeTone)}>
          <Icon className="h-4 w-4" aria-hidden="true" />
        </span>
      </div>
      <p className="mt-3 text-xs leading-5 text-[#5f5b52]">{detail}</p>
    </div>
  );
}
|
||||
|
||||
/** One row of the flywheel-lanes table: lane name, tone-coloured status pill, and a description. */
function LaneRow({
  name,
  status,
  detail,
  tone,
}: {
  name: string;
  status: string;
  detail: string;
  tone: "good" | "warn" | "neutral";
}) {
  // Tone → pill colour classes; exactly one entry applies per render.
  const pillTone = {
    good: "border-[#9bc7a4] bg-[#f0faf2] text-[#17602a]",
    warn: "border-[#d9b36f] bg-[#fff7e8] text-[#8a5a08]",
    neutral: "border-[#d8d3c7] bg-white text-[#5f5b52]",
  }[tone];

  return (
    <div className="grid gap-3 border-b border-[#eee9dd] px-4 py-3 last:border-b-0 md:grid-cols-[180px_140px_1fr]">
      <div className="font-mono text-xs font-semibold text-[#141413]">{name}</div>
      <div>
        <span className={cn("inline-flex border px-2 py-0.5 text-xs font-semibold", pillTone)}>
          {status}
        </span>
      </div>
      <div className="text-sm leading-5 text-[#5f5b52]">{detail}</div>
    </div>
  );
}
|
||||
|
||||
/**
 * AwoooP governance overview page.
 *
 * Fetches tenant / run / approval / contract data from the platform API on
 * mount (and on demand via the refresh button), then renders a header with
 * snapshot status, four metric tiles, the flywheel lanes, and next-action links.
 */
export default function AwoooPPage() {
  // NOTE: the previous unconditional `return <AwoooPWorkItemsPage />;` was removed;
  // it made every hook and the overview markup below unreachable.
  const t = useTranslations("awooop.home");
  const locale = useLocale();
  const [snapshot, setSnapshot] = useState<Snapshot>(emptySnapshot);
  const [status, setStatus] = useState<SnapshotStatus>("loading");
  const [lastUpdated, setLastUpdated] = useState<Date | null>(null);

  // Loads all four platform endpoints in parallel; any non-OK response or
  // thrown error marks the snapshot as degraded and keeps the previous counts.
  const fetchSnapshot = useCallback(async () => {
    setStatus("loading");
    try {
      const [tenantRes, runRes, approvalRes, contractRes] = await Promise.all([
        fetch(`${API_BASE}/api/v1/platform/tenants`),
        fetch(`${API_BASE}/api/v1/platform/runs/list?per_page=1`),
        fetch(`${API_BASE}/api/v1/platform/approvals`),
        fetch(`${API_BASE}/api/v1/platform/contracts?per_page=1`),
      ]);

      if (![tenantRes, runRes, approvalRes, contractRes].every((res) => res.ok)) {
        throw new Error("platform snapshot fetch failed");
      }

      const [tenantData, runData, approvalData, contractData] = await Promise.all([
        tenantRes.json() as Promise<PlatformResponse>,
        runRes.json() as Promise<PlatformResponse>,
        approvalRes.json() as Promise<PlatformResponse>,
        contractRes.json() as Promise<PlatformResponse>,
      ]);
      const tenants = Array.isArray(tenantData.tenants) ? tenantData.tenants : [];

      setSnapshot({
        tenants: countRows(tenantData, ["tenants", "items"]),
        // A tenant counts as active unless it is explicitly flagged inactive.
        activeTenants: tenants.filter((tenant) => tenant.is_active !== false).length,
        shadowTenants: tenants.filter((tenant) => tenant.migration_mode === "shadow").length,
        runs: countRows(runData, ["runs", "items"]),
        approvals: countRows(approvalData, ["items"]),
        contracts: countRows(contractData, ["contracts", "items"]),
      });
      setLastUpdated(new Date());
      setStatus("ready");
    } catch {
      setStatus("degraded");
      // The timestamp records the last attempt, including failed ones.
      setLastUpdated(new Date());
    }
  }, []);

  useEffect(() => {
    fetchSnapshot();
  }, [fetchSnapshot]);

  const formattedUpdated = useMemo(() => {
    if (!lastUpdated) return "--";
    return lastUpdated.toLocaleTimeString(locale === "zh-TW" ? "zh-TW" : "en-US", {
      hour: "2-digit",
      minute: "2-digit",
      second: "2-digit",
    });
  }, [lastUpdated, locale]);

  const healthTone = status === "ready" ? "good" : status === "loading" ? "neutral" : "warn";

  return (
    <div className="space-y-5">
      <section className="border border-[#e0ddd4] bg-white">
        <div className="grid gap-px bg-[#e0ddd4] lg:grid-cols-[1.4fr_0.9fr]">
          <div className="bg-white p-5">
            <div className="flex flex-wrap items-start justify-between gap-4">
              <div className="max-w-3xl">
                <div className="flex items-center gap-2 text-xs font-semibold text-[#d97757]">
                  <BrainCircuit className="h-4 w-4" aria-hidden="true" />
                  <span>{t("eyebrow")}</span>
                </div>
                <h2 className="mt-3 text-2xl font-semibold tracking-normal text-[#141413]">
                  {t("title")}
                </h2>
                <p className="mt-3 max-w-2xl text-sm leading-6 text-[#5f5b52]">
                  {t("subtitle")}
                </p>
              </div>
              <button
                type="button"
                onClick={fetchSnapshot}
                className="inline-flex items-center gap-2 border border-[#d8d3c7] bg-[#faf9f3] px-3 py-2 text-xs font-semibold text-[#141413] hover:border-[#d97757]"
                aria-label={t("refresh")}
              >
                <RefreshCw
                  className={cn("h-4 w-4", status === "loading" && "animate-spin")}
                  aria-hidden="true"
                />
                {t("refresh")}
              </button>
            </div>
          </div>

          <div className="bg-[#faf9f3] p-5">
            <div className="grid gap-3 text-sm">
              <div className="flex items-center justify-between gap-3">
                <span className="text-[#77736a]">{t("snapshotStatus")}</span>
                <span
                  className={cn(
                    "inline-flex border px-2 py-0.5 text-xs font-semibold",
                    healthTone === "good" && "border-[#9bc7a4] bg-[#f0faf2] text-[#17602a]",
                    healthTone === "warn" && "border-[#d9b36f] bg-[#fff7e8] text-[#8a5a08]",
                    healthTone === "neutral" && "border-[#d8d3c7] bg-white text-[#5f5b52]"
                  )}
                >
                  {status === "ready" ? t("ready") : status === "loading" ? t("loading") : t("degraded")}
                </span>
              </div>
              <div className="flex items-center justify-between gap-3">
                <span className="text-[#77736a]">{t("lastUpdated")}</span>
                <span className="font-mono text-[#141413]">{formattedUpdated}</span>
              </div>
              <div className="flex items-center justify-between gap-3">
                <span className="text-[#77736a]">{t("migrationMode")}</span>
                <span className="font-mono text-[#141413]">{t("migrationValue")}</span>
              </div>
            </div>
          </div>
        </div>
      </section>

      <section className="grid gap-3 md:grid-cols-2 xl:grid-cols-4">
        <MetricCell
          label={t("metrics.tenants")}
          value={numberValue(snapshot.tenants)}
          detail={t("metrics.tenantsDetail", {
            active: numberValue(snapshot.activeTenants),
            shadow: numberValue(snapshot.shadowTenants),
          })}
          icon={Waypoints}
          tone="good"
        />
        <MetricCell
          label={t("metrics.runs")}
          value={numberValue(snapshot.runs)}
          detail={t("metrics.runsDetail")}
          icon={Activity}
          tone={snapshot.runs > 0 ? "good" : "neutral"}
        />
        <MetricCell
          label={t("metrics.approvals")}
          value={numberValue(snapshot.approvals)}
          detail={t("metrics.approvalsDetail")}
          icon={ShieldCheck}
          tone={snapshot.approvals > 0 ? "warn" : "good"}
        />
        <MetricCell
          label={t("metrics.contracts")}
          value={numberValue(snapshot.contracts)}
          detail={t("metrics.contractsDetail")}
          icon={FileText}
          tone={snapshot.contracts > 0 ? "good" : "warn"}
        />
      </section>

      <section className="grid gap-5 xl:grid-cols-[1.1fr_0.9fr]">
        <div className="border border-[#e0ddd4] bg-white">
          <div className="border-b border-[#e0ddd4] bg-[#faf9f3] px-4 py-3">
            <div className="flex items-center gap-2">
              <GitBranch className="h-4 w-4 text-[#d97757]" aria-hidden="true" />
              <h3 className="text-sm font-semibold text-[#141413]">{t("lanes.title")}</h3>
            </div>
          </div>
          <LaneRow
            name={t("lanes.providerName")}
            status={t("lanes.live")}
            detail={t("lanes.providerDetail")}
            tone="good"
          />
          <LaneRow
            name={t("lanes.mcpName")}
            status={t("lanes.mirror")}
            detail={t("lanes.mcpDetail")}
            tone="warn"
          />
          <LaneRow
            name={t("lanes.channelName")}
            status={t("lanes.mirror")}
            detail={t("lanes.channelDetail")}
            tone="warn"
          />
          <LaneRow
            name={t("lanes.approvalName")}
            status={t("lanes.live")}
            detail={t("lanes.approvalDetail")}
            tone="good"
          />
        </div>

        <div className="border border-[#e0ddd4] bg-white">
          <div className="border-b border-[#e0ddd4] bg-[#faf9f3] px-4 py-3">
            <div className="flex items-center gap-2">
              <CheckCircle2 className="h-4 w-4 text-[#d97757]" aria-hidden="true" />
              <h3 className="text-sm font-semibold text-[#141413]">{t("next.title")}</h3>
            </div>
          </div>
          <div className="divide-y divide-[#eee9dd]">
            {[
              [t("next.item1"), "/awooop/runs" as const],
              [t("next.item2"), "/awooop/approvals" as const],
              [t("next.item3"), "/awooop/contracts" as const],
              [t("next.item4"), "/awooop/work-items" as const],
            ].map(([label, href]) => (
              <Link
                key={String(label)}
                href={href}
                className="flex items-center justify-between gap-3 px-4 py-3 text-sm text-[#141413] hover:bg-[#faf9f3]"
              >
                <span>{label}</span>
                <ArrowRight className="h-4 w-4 text-[#77736a]" aria-hidden="true" />
              </Link>
            ))}
          </div>
        </div>
      </section>
    </div>
  );
}
|
||||
|
||||
@@ -1,3 +1,23 @@
|
||||
## 2026-05-06 | Telegram 事故通知語義收斂與 AwoooP 首頁總覽
|
||||
|
||||
**背景**:SRE 戰情室截圖顯示 ACTION REQUIRED、AI 自動修復失敗、Escalation、Code Review、Config Drift 等訊息混在同一條流中;值班者很難快速分辨哪些是 AI 已修復、哪些是 AI 無法修復需要人工、哪些只是報表或治理通知。
|
||||
|
||||
**本次修補**:
|
||||
- `TelegramMessage` 主卡新增「處置狀態」,在第一屏明確標示 `AI 已提出修復建議,等待人工批准`、`AI 無可安全執行動作,需人工判斷`、`AI 分析超時,需人工排查`、`AI 降級分析,需人工判斷`、`規則建議待審批` 或 `安全閘門待審批`。
|
||||
- `append_incident_update()` 對同一 `incident_id` 的相同狀態回覆做 5 分鐘 Redis 去重,避免同樣的 `[AUTO] AI 自動修復失敗` 連續洗版。
|
||||
- 新增 `docs/awooop/TELEGRAM-INCIDENT-NOTIFICATION-MODEL.md`,定義 Telegram / AwoooP Run Monitor / Approval Queue / Incident Timeline / MCP Audit 的分工。
|
||||
- `/zh-TW/awooop` 首頁改為治理總覽,直接顯示租戶、Run、審批、合約與飛輪鏈路狀態;不再只是轉到 work-items 頁。
|
||||
- 新增 AwoooP 首頁 `zh-TW` / `en` i18n 字串。
|
||||
|
||||
**驗證**:
|
||||
- `python -m py_compile apps/api/src/services/telegram_gateway.py apps/api/tests/test_telegram_message_templates.py`
|
||||
- `pytest tests/test_telegram_message_templates.py tests/test_telegram_ai_automation_block.py -q` → 19 passed。
|
||||
- `pnpm --dir apps/web typecheck` 通過。
|
||||
- `NEXT_PUBLIC_API_URL=https://awoooi.wooo.work pnpm --dir apps/web build` 通過。
|
||||
|
||||
**注意**:
|
||||
- `ruff check src/services/telegram_gateway.py ...` 仍會掃到 `telegram_gateway.py` 既有 import/order、bare except、單行 if 等歷史債;本輪沒有在 6000+ 行 gateway 巨檔做無關機械清理,避免混入額外行為風險。
|
||||
|
||||
## 2026-05-06 | AwoooP Run 監控頁 422 修正
|
||||
|
||||
**背景**:Playwright 驗證 `/zh-TW/awooop` 時未再看到 client-side exception,但 `/zh-TW/awooop/runs` 會顯示「無法載入 Run 資料 HTTP 422」。後端 log 顯示 `GET /api/v1/platform/runs/list?page=1&per_page=50` 被回 422。
|
||||
|
||||
47
docs/awooop/TELEGRAM-INCIDENT-NOTIFICATION-MODEL.md
Normal file
47
docs/awooop/TELEGRAM-INCIDENT-NOTIFICATION-MODEL.md
Normal file
@@ -0,0 +1,47 @@
|
||||
# Telegram 事故通知模型
|
||||
|
||||
> 目的:讓 SRE 戰情室一眼分辨「AI 已修復」、「AI 可建議但需批准」、「AI 無法安全修復需人工」與「僅通知」,避免告警、執行 log、Code Review、Drift 與審批結果互相洗版。
|
||||
|
||||
## 核心判斷
|
||||
|
||||
Telegram 不應是完整執行日誌,也不應承載所有 AI 推理細節。Telegram 的職責是把需要人類注意力的決策摘要送到 SRE 戰情室;完整時間線、工具輸出、重試原因、provider fallback 與 audit 交給 AwoooP Run Monitor / Incident Timeline。
|
||||
|
||||
## 四種通知狀態
|
||||
|
||||
| 狀態 | 意義 | Telegram 行為 | 操作者動作 |
|
||||
| --- | --- | --- | --- |
|
||||
| AI 已自動修復 | 自動化已完成且驗證通過 | 更新原 incident 卡或回覆一次結果 | 檢查即可,不需批准 |
|
||||
| AI 建議待審批 | AI / 規則已提出可執行建議,但被 Trust / Risk gate 擋下 | 發一張 ACTION REQUIRED 主卡 | 批准、拒絕、靜默或看詳情 |
|
||||
| AI 無法安全修復 | NO_ACTION、INVALID_TARGET、LLM timeout、MCP 失敗或缺少安全動作 | 發人工接手摘要,不重複刷同一狀態 | 人工排查,或要求重診 |
|
||||
| 僅通知 | 心跳、報表、Code Review 完成、低風險治理資訊 | 彙總卡或摘要頻道 | 通常不需即時動作 |
|
||||
|
||||
## 專業化訊息規則
|
||||
|
||||
1. 同一個 `incident_id` 只應有一張主卡。後續狀態使用原卡回覆、編輯按鈕或 AwoooP timeline,不再每一步都新發卡。
|
||||
2. 主卡第一屏必須顯示「處置狀態」,先回答:AI 是否能修、是否已修、是否需要人工。
|
||||
3. 同一個 `incident_id` 的相同狀態更新,短時間內要去重。詳細重試與錯誤放到 timeline,不洗 Telegram。
|
||||
4. P0 / P1 escalation 可以另發升級卡,但內容必須是「目前影響、已嘗試、卡住原因、需要誰做什麼」,不可重貼所有底層 log。
|
||||
5. Code Review、Config Drift、報表、心跳不應和 incident 執行回覆混在同一種語義;它們可以在同一 SRE 群組,但必須以摘要卡與固定前綴區分。
|
||||
|
||||
## 與 AwoooP 的分工
|
||||
|
||||
| 介面 | 承載內容 |
|
||||
| --- | --- |
|
||||
| Telegram | 決策摘要、升級、人工批准入口 |
|
||||
| AwoooP Run Monitor | 非同步 Run、provider fallback、tool call、retry、latency |
|
||||
| Approval Queue | 所有等待批准的高風險動作 |
|
||||
| Incident Timeline | 事件完整歷程、AI 嘗試、失敗原因、KM / Playbook 回寫 |
|
||||
| MCP Audit | 工具執行、redaction、permission gate、credential 注入 |
|
||||
|
||||
## 本輪落地
|
||||
|
||||
- `TelegramMessage` 主卡新增「處置狀態」。
|
||||
- `append_incident_update()` 對同一 incident 的相同狀態做 5 分鐘 Redis 去重。
|
||||
- 既有 `詳情 / 重診 / 歷史` 按鈕保留,讓 Telegram 保持輕量,細節回到控制台。
|
||||
|
||||
## 後續建議
|
||||
|
||||
1. 將 Telegram 群組升級為 Forum topics 或固定 topic lane:`P0/P1 事故`、`人工審批`、`治理/報表`、`CI/Code Review`。
|
||||
2. AwoooP Approval Queue 顯示與 Telegram 相同的「處置狀態」欄位,避免前後端語義分裂。
|
||||
3. 將 auto-repair failure 的完整 stdout/stderr 改寫入 Run Timeline,只在 Telegram 顯示最短摘要與詳情連結。
|
||||
4. 對 firing 告警做 fingerprint 聚合:同一 alertname + target + namespace 在窗口內只更新卡片,不新增卡片。
|
||||
Reference in New Issue
Block a user