def _parse_debate_summary(reasoning: str) -> dict[str, str]:
    """Split a coordinator debate summary into its four labelled sections.

    The summary is expected to look like
    ``診斷:…;方案:…;安全審查:…;質疑:…``.  Each labelled segment is stored
    under one fixed key:

    * ``診斷``     -> ``"diagnosis"``
    * ``方案``     -> ``"plan"``
    * ``安全審查`` -> ``"review"``
    * ``質疑``     -> ``"critic"``

    Background: the TYPE-8M card used to fill diagnosis, system impact and
    probable cause from the same ``reasoning[:100]`` slice, so all three
    fields showed identical text.  Parsing the summary lets each field use
    a different section.

    Args:
        reasoning: Raw debate summary.  ``None`` or an empty string is
            accepted and yields four empty sections.

    Returns:
        A dict that always has exactly the keys ``diagnosis``, ``plan``,
        ``review`` and ``critic``; sections missing from the input are
        empty strings.  If a label occurs more than once, the last
        occurrence wins.
    """
    result: dict[str, str] = {"diagnosis": "", "plan": "", "review": "", "critic": ""}
    # The call site no longer guards on a falsy ``reasoning`` the way the old
    # ``reasoning[:100] if reasoning else ...`` expression did, so guard here.
    if not reasoning:
        return result

    labels = (
        ("診斷", "diagnosis"),
        ("方案", "plan"),
        ("安全審查", "review"),
        ("質疑", "critic"),
    )
    # Accept both ASCII and full-width punctuation after a label and between
    # segments; Chinese text may use either form.
    colons = (":", ":")

    current = None  # key of the section most recently opened
    for raw in reasoning.replace(";", ";").split(";"):
        part = raw.strip()
        if not part:
            continue
        for label, key in labels:
            width = len(label)
            if part.startswith(label) and part[width:width + 1] in colons:
                # Strip "<label><colon>" by its real length, not a magic number.
                result[key] = part[width + 1:]
                current = key
                break
        else:
            # Unlabelled segment: the previous section's text contained a
            # semicolon of its own.  Re-attach it instead of dropping it.
            # Text that precedes the first label is still ignored.
            if current is not None:
                result[current] += ";" + part
    return result
"(無診斷)" + _impact = _smt(_parsed.get("plan") or "", 150) + _cause = _smt(_parsed.get("critic") or _parsed.get("review") or "", 100) tg_result = await gateway.send_meta_alert( incident_id=incident.incident_id, approval_id=approval_id, alertname=_alertname, alert_category=_alert_category, - diagnosis=reasoning[:100] if reasoning else description[:100], + diagnosis=_diag, severity_level=risk_level, - system_impact=description[:150] if description else "", - probable_cause=reasoning[:100] if reasoning else "", + system_impact=_impact, + probable_cause=_cause, ) elif _alert_category == "secops": # TYPE-5S:資安事件 — 隔離/封鎖審核卡,發到個人 DM (ADR-075 Step-5) @@ -349,11 +377,15 @@ async def _push_decision_to_telegram( ) else: # TYPE-2 / TYPE-3 / TYPE-4 都走 send_approval_card(按鈕組合由 alert_category 決定) + # 2026-04-17 ogt + Claude Sonnet 4.6: 傳入 requires_human_approval + 智能截斷 + # 根因:① requires_human_approval=True 時動態按鈕卡缺少 [批准][拒絕] → 死卡 + # ② [:500] 在括號中間切斷 → 幽靈截斷「質疑:無(通」 + _requires_human = bool(proposal_data.get("requires_human_review", False)) tg_result = await gateway.send_approval_card( approval_id=approval_id, risk_level=risk_level, resource_name=target[:50], - root_cause=reasoning[:500] if reasoning else description[:500], + root_cause=_smt(reasoning, 500) if reasoning else _smt(description, 500), suggested_action=action[:120] if action else (description[:120] if description else "待分析"), estimated_downtime="5-15 min", primary_responsibility="INFRA", @@ -371,6 +403,7 @@ async def _push_decision_to_telegram( alert_category=_alert_category, notification_type=_notification_type, playbook_name=_playbook_name, + requires_human_approval=_requires_human, ) # 2026-04-09 Claude Sonnet 4.6: 存 message_id → 後續狀態更新在原訊息延續 diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index ce3a754e..039a12c4 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -65,6 +65,31 @@ logger = 
def _has_unclosed_bracket(text: str) -> bool:
    """Return True when *text* opens more brackets of some kind than it closes."""
    pairs = (("(", ")"), ("(", ")"), ("[", "]"), ("【", "】"), ("「", "」"), ("{", "}"))
    return any(text.count(opener) > text.count(closer) for opener, closer in pairs)


def _smart_truncate(text: str, limit: int, suffix: str = "…[截斷]") -> str:
    """Shorten *text* to about *limit* characters, cutting at a sentence boundary.

    Background: a plain ``[:N]`` slice could cut in the middle of a
    parenthesised remark and leave a dangling fragment on the Telegram card.

    Strategy, applied only when ``len(text) > limit``:

    1. Boundary characters are tried in priority order: sentence enders,
       then semicolons, then commas, then a space.
    2. A boundary only counts if it lies inside the first ``limit``
       characters and at or beyond ``limit // 2``, so the result never
       shrinks to a tiny stub.
    3. For a given boundary character the rightmost occurrence is
       preferred, but occurrences that would leave a bracket unclosed are
       skipped in favour of an earlier one.
    4. If every candidate leaves a bracket open, the rightmost occurrence of
       the highest-priority boundary is used anyway.
    5. If no boundary qualifies, the text is cut at ``limit``.

    Args:
        text: Text to shorten.  A falsy value yields ``""``.
        limit: Maximum number of characters kept from *text*.  Negative
            values are treated as ``0``.
        suffix: Marker appended whenever the text is actually shortened.

    Returns:
        *text* unchanged when it already fits; otherwise the kept prefix
        followed by *suffix*.  The suffix is not counted against *limit*,
        so the result can be up to ``len(suffix)`` characters longer.
    """
    if not text:
        return ""
    limit = max(limit, 0)
    if len(text) <= limit:
        return text

    min_pos = limit // 2
    # ASCII form first, then its full-width twin, so ASCII-only text is cut
    # exactly as before while full-width punctuation is recognised as well.
    boundaries = ("。", "!", "!", "?", "?", ";", ";", ",", ",", "、", " ")

    fallback_cut = None  # cut point used if no bracket-balanced cut exists
    for boundary in boundaries:
        pos = text.rfind(boundary, 0, limit)
        if pos < min_pos:  # also covers "not found" (-1)
            continue
        if fallback_cut is None:
            fallback_cut = pos + len(boundary)
        # Walk leftwards through this boundary's occurrences until the kept
        # prefix has no dangling open bracket.
        while pos >= min_pos:
            cut = pos + len(boundary)
            if not _has_unclosed_bracket(text[:cut]):
                return text[:cut] + suffix
            pos = text.rfind(boundary, 0, pos)

    if fallback_cut is not None:
        return text[:fallback_cut] + suffix
    # No usable boundary at all: hard cut, still marked with the suffix.
    return text[:limit] + suffix
notification_type: str = "", # 2026-04-16 ogt + Claude Sonnet 4.6: 修復鏈路顯示 (ADR-076) playbook_name: str = "", + # 2026-04-17 ogt + Claude Sonnet 4.6: 強制在動態按鈕卡上加批准/拒絕行 + requires_human_approval: bool = False, ) -> dict: """ 推送待簽核卡片到 Telegram (v7.0 含 SignOz 整合) @@ -1627,6 +1664,7 @@ class TelegramGateway: incident_id=incident_id, alert_category=alert_category, notification_type=notification_type, + requires_human_approval=requires_human_approval, ) # 發送訊息 diff --git a/verify_telegram_ui.py b/verify_telegram_ui.py new file mode 100644 index 00000000..1908de86 --- /dev/null +++ b/verify_telegram_ui.py @@ -0,0 +1,167 @@ +""" +verify_telegram_ui.py — Telegram UI 修復驗證腳本 +================================================ +注入 800 字極端字串,驗證: +1. _smart_truncate 在句子邊界截斷,不破壞括號 +2. _parse_debate_summary 正確拆分各欄位(不重複) +3. TYPE-3 requires_human_approval=True → 含批准/拒絕按鈕 + +2026-04-17 ogt + Claude Sonnet 4.6 (ADR-075 UI 修復驗證) +""" +import sys +sys.path.insert(0, "apps/api") + +# ─── 測試 1:smart_truncate ────────────────────────────────────────────────── + +from src.services.telegram_gateway import _smart_truncate + +# 800 字多層括號測試字串 +LONG_REASONING = ( + "診斷:根據告警訊號分析,發現 MoWoooWorkDown 事件導致服務下線," + "可能是由於 deployment 配置錯誤或是 pod 問題引起的(信心 90%,系統正常);" + "方案:kubectl rollout restart deployment/awoooi-api -n awoooi-prod" + "(blast_radius=25,rollback_cost=5,降級風險極低);" + "安全審查:approve(blast_radius 符合安全閾值 ≤50,靜態規則通過,系統正常);" + "質疑:無(通過審查,所有指標在正常範圍內,無需人工干預,建議自動執行)" + "額外備注:此次分析基於最近 15 分鐘的 Prometheus 指標窗口," + "包含 CPU 使用率、記憶體壓力、網路 I/O 三個維度的複合評估(樣本數 N=1440)。" + "補充說明:若下次相同告警在 30 分鐘內再次出現,建議升級至 P1 並通知值班主管。" +) + +print("=" * 60) +print("TEST 1: _smart_truncate") +print("=" * 60) +print(f"原始長度: {len(LONG_REASONING)} 字") +print() + +for limit in [100, 200, 300, 500]: + result = _smart_truncate(LONG_REASONING, limit) + # 驗證括號平衡 + open_p = result.count("(") + close_p = result.count(")") + bracket_ok = open_p == close_p + print(f"limit={limit}: len={len(result)} 括號平衡={bracket_ok} ((={open_p}, )={close_p})") + 
def simulate_keyboard(dynamic_buttons: list, requires_human_approval: bool) -> list:
    """Build a stand-in inline keyboard for the verification script.

    Args:
        dynamic_buttons: ``(label, callback_data)`` pairs for the
            category-specific buttons.  An empty list selects the plain
            approve / reject / silence layout.
        requires_human_approval: When dynamic buttons are present, also add
            an approve / reject row.

    Returns:
        A list of rows, each a list of ``{"text", "callback_data"}`` dicts.
        With dynamic buttons: those buttons three per row, then the optional
        approve / reject row, then a detail / ignore row.  Without them: a
        single approve / reject / silence row.
    """

    def button(label, payload):
        return {"text": label, "callback_data": payload}

    silence_payload = "silence-nonce-xxx"
    approve = button("✅ 批准", "approve-nonce-xxx")
    reject = button("❌ 拒絕", "reject-nonce-xxx")

    # No category buttons: classic three-button card.
    if not dynamic_buttons:
        return [[approve, reject, button("🔕 靜默", silence_payload)]]

    category = [button(label, payload) for label, payload in dynamic_buttons]
    keyboard = []
    for start in range(0, len(category), 3):  # at most three buttons per row
        keyboard.append(category[start:start + 3])

    if requires_human_approval:
        keyboard.append([approve, reject])

    keyboard.append([
        button("📋 詳情", "detail:INC-001"),
        button("🔕 忽略", silence_payload),
    ])
    return keyboard