From cd928852771e1e1dfa03e0c746101bbb740318c2 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 11 Jun 2026 15:06:54 +0800 Subject: [PATCH] fix(api): add manual handoff package for no-action alerts --- apps/api/src/services/telegram_gateway.py | 181 +++++++++++++++++- .../test_telegram_ai_automation_block.py | 27 +++ .../tests/test_telegram_message_templates.py | 11 +- 3 files changed, 207 insertions(+), 12 deletions(-) diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index 1d62b39c..85071c7f 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -222,6 +222,85 @@ def _format_operator_outcome_lines(outcome: dict[str, object] | None) -> list[st ] +def _needs_manual_handoff_package( + *, + suggested_action: str | None = None, + verdict: str | None = None, +) -> bool: + """Return true when Telegram must show a concrete manual handoff package.""" + + action_text = str(suggested_action or "") + verdict_text = str(verdict or "").lower() + return ( + is_no_action_approval_action(action_text) + or "repair_candidate_missing" in action_text.lower() + or verdict_text.startswith("manual_required") + or verdict_text in { + "observed_not_executed", + "received_only", + "approval_expired_manual_review", + } + ) + + +def _manual_evidence_hint(resource_name: str, alert_category: str) -> str: + """Human-readable evidence target without prescribing a write action.""" + + resource = resource_name.lower() + category = alert_category.lower() + if "node-exporter" in resource: + return "node_exporter target up、scrape error、host CPU/RAM/disk、service log 摘要" + if category in {"host", "host_resource", "infrastructure"}: + return "host metrics、service 狀態、journal 摘要、最近部署/維護紀錄" + if category in {"k8s", "kubernetes", "k8s_workload"}: + return "pod events、rollout 狀態、recent logs、readiness / liveness probe" + if category in {"database", "db"}: + return "連線數、慢查詢、lock、磁碟與 replication / backup 狀態" + if category in {"backup", "backup_failure"}: + return "最近 backup run、失敗 repo、exit code、offsite verifier 與 retry window" + if category in {"external_site", "network"}: + return "HTTP 狀態、DNS/TLS、blackbox probe、上游 / CDN / Nginx log 摘要" + return "來源事件、fingerprint recurrence、metrics、logs、最近變更與相關 run" + + +def _format_manual_handoff_package_lines( + *, + incident_id: str, + resource_name: str, + alert_category: str = "", + suggested_action: str | None = None, + verdict: str | None = None, + compact: bool = False, +) -> list[str]: + """Build a safe manual handoff package for no-action / degraded alerts. + + This is deliberately advisory: it does not create runtime authorization and + does not prescribe a destructive command. The goal is to turn "manual + review" into a concrete evidence and repair-candidate checklist. + """ + + if not _needs_manual_handoff_package( + suggested_action=suggested_action, + verdict=verdict, + ): + return [] + + evidence_hint = _manual_evidence_hint(resource_name, alert_category) + incident_ref = incident_id or "--" + lines = [ + "", + "🧰 人工處置包", + "├ 狀態:AI 尚未產生安全可執行修復,不能直接批准執行", + f"├ 1. 開 Runs / 真相鏈確認 {html.escape(incident_ref)} 仍在 firing 或 recurrence", + f"├ 2. 補證據:{html.escape(evidence_hint)}", + "├ 3. 在 AwoooP 建立修復候選:命令、風險、rollback、verifier、owner", + "└ 4. 修復後回寫:execution result、verifier、KM / PlayBook trust", + ] + if not compact: + lines.append("按鈕:處置包 看完整證據,重診 重新收集,Runs 追蹤狀態") + return lines + + def _format_remediation_history_lines(history: dict[str, object] | None) -> list[str]: if not history or int(history.get("total") or 0) <= 0: return [] @@ -1900,6 +1979,10 @@ class TelegramMessage: def _automation_mode(self) -> str: text = f"{self.root_cause} {self.suggested_action}".lower() + if is_no_action_approval_action(self.suggested_action): + if "repair_candidate_missing" in text: + return "repair_candidate_missing_manual_handoff" + return "manual_handoff_required" if "超時" in text or "timeout" in text: return "llm_timeout_manual_gate" if self.confidence > 0 and self.suggested_action and self.suggested_action != "待分析": @@ -1953,6 +2036,10 @@ class TelegramMessage: return "🟠 AI 補救試跑證據查詢失敗,需人工判斷" if verdict == "approval_required": return "🟡 需要審批後才會執行" + if mode == "repair_candidate_missing_manual_handoff": + return "🟠 缺少可執行修復候選,已產生人工處置包" + if mode == "manual_handoff_required": + return "🟠 未自動修復,已產生人工處置包" if verdict.startswith("manual_required"): return "🟠 未自動修復,需人工判斷" @@ -2021,6 +2108,20 @@ class TelegramMessage: f"└ Flow:{flow}\n" ) + def _format_manual_handoff_package_block(self) -> str: + quality = self.automation_quality or {} + verdict = str(quality.get("verdict") or self._automation_mode()) + lines = _format_manual_handoff_package_lines( + incident_id=self.incident_id or self.approval_id, + resource_name=self.resource_name, + alert_category=self.alert_category, + suggested_action=self.suggested_action, + verdict=verdict, + ) + if not lines: + return "" + return "\n".join(lines) + "\n" + def _format_flow_progress_block(self) -> str: """Operator-facing state of where the alert is in the automation loop.""" quality = self.automation_quality or {} @@ -2029,7 +2130,8 @@ class TelegramMessage: action_upper = (self.suggested_action or "").upper() is_noop = ( - "NO_ACTION" in action_upper + is_no_action_approval_action(self.suggested_action) + or "NO_ACTION" in action_upper or action_upper.startswith("OBSERVE") or action_upper.startswith("INVESTIGATE") or not action_upper.strip() @@ -2153,6 +2255,11 @@ class TelegramMessage: safe_action = html.escape(self.suggested_action) safe_downtime = html.escape(self.estimated_downtime) safe_automation_summary = html.escape(self._automation_status_summary()) + action_heading = ( + "🧭 修復候選狀態" + if is_no_action_approval_action(self.suggested_action) + else "⚡ 建議修復動作" + ) # 2026-03-29 ogt: AI Token/Cost 顯示 ai_cost_display = "" @@ -2245,6 +2352,7 @@ class TelegramMessage: flow_progress_block = self._format_flow_progress_block() operator_outcome_block = self._format_operator_outcome_block() automation_block = self._format_automation_block() + manual_handoff_block = self._format_manual_handoff_package_block() # ADR-075 TYPE-3 格式組裝 message = ( @@ -2258,13 +2366,14 @@ class TelegramMessage: f"{flow_progress_block}\n" f"{operator_outcome_block}" f"{automation_block}" + f"{manual_handoff_block}" f"\n" f"🧠 AI 深度診斷\n" f"├─ 分析:{safe_root_cause}\n" f"├─ 責任:{resp_display}\n" f"└─ {ai_source}\n" f"\n" - f"⚡ 建議修復動作\n" + f"{action_heading}\n" f"{playbook_line}" f"{safe_action}\n" ) @@ -3680,13 +3789,21 @@ class TelegramGateway: if not approval_buttons_enabled: info_row: list[dict] = [] + secondary_row: list[dict] = [] if incident_id: info_row.extend([ - {"text": "📋 詳情", "callback_data": f"detail:{incident_id}"}, - {"text": "📊 歷史", "callback_data": f"history:{incident_id}"}, + {"text": "🧰 處置包", "callback_data": f"detail:{incident_id}"}, + {"text": "🔄 重診", "callback_data": f"reanalyze:{incident_id}"}, ]) - info_row.append({"text": "🔕 靜默", "callback_data": silence_nonce}) + secondary_row.extend([ + {"text": "📊 歷史", "callback_data": f"history:{incident_id}"}, + {"text": "🔕 靜默", "callback_data": silence_nonce}, + ]) + else: + info_row.append({"text": "🔕 靜默", "callback_data": silence_nonce}) buttons: list[list[dict]] = [info_row] + if secondary_row: + buttons.append(secondary_row) awooop_row = _awooop_truth_chain_button_row(incident_id) if awooop_row: buttons.append(awooop_row) @@ -7087,6 +7204,37 @@ class TelegramGateway: truth_chain=truth_chain, remediation_history=remediation_history, ) + quality = ( + truth_chain.get("automation_quality") + if isinstance(truth_chain.get("automation_quality"), dict) + else {} + ) + reconciliation = ( + truth_chain.get("reconciliation") + if isinstance(truth_chain.get("reconciliation"), dict) + else {} + ) + reconciliation_facts = ( + reconciliation.get("facts") + if isinstance(reconciliation.get("facts"), dict) + else {} + ) + latest_action = str( + reconciliation_facts.get("latest_approval_action") or "" + ) + detail_resource = ( + ", ".join(str(s) for s in incident.affected_services[:2]) + if incident.affected_services + else incident_id + ) + lines += _format_manual_handoff_package_lines( + incident_id=incident_id, + resource_name=detail_resource, + alert_category="", + suggested_action=latest_action, + verdict=str(quality.get("verdict") or ""), + compact=True, + ) lines += _format_km_stale_completion_lines(km_completion_summary) lines += _format_remediation_history_lines(remediation_history) gateway_summary = ( @@ -8722,7 +8870,11 @@ class TelegramGateway: if action == "approve": status_emoji = "✅" status_text = f"已批准 by {_html.escape(username)}" - if approval_action is not None and is_no_action_approval_action(approval_action): + no_action_approval = ( + approval_action is not None + and is_no_action_approval_action(approval_action) + ) + if no_action_approval: status_emoji = "🟠" suffix = "已記錄;此卡沒有可執行修復,等待補修復候選" else: @@ -8731,16 +8883,25 @@ class TelegramGateway: status_emoji = "❌" status_text = f"已拒絕 by {_html.escape(username)}" suffix = "" + no_action_approval = False status_line = f"{status_emoji} {status_text} {suffix}".strip() if orig_msg_id: try: # 1. 移除批准/拒絕按鈕(只保留資訊按鈕列) - info_buttons = [[ - {"text": "📋 詳情", "callback_data": f"detail:{incident_id}"}, - {"text": "📊 歷史", "callback_data": f"history:{incident_id}"}, - ]] + if no_action_approval: + info_buttons = [[ + {"text": "🧰 處置包", "callback_data": f"detail:{incident_id}"}, + {"text": "🔄 重診", "callback_data": f"reanalyze:{incident_id}"}, + ], [ + {"text": "📊 歷史", "callback_data": f"history:{incident_id}"}, + ]] + else: + info_buttons = [[ + {"text": "📋 詳情", "callback_data": f"detail:{incident_id}"}, + {"text": "📊 歷史", "callback_data": f"history:{incident_id}"}, + ]] awooop_row = _awooop_truth_chain_button_row(incident_id) if awooop_row: info_buttons.append(awooop_row) diff --git a/apps/api/tests/test_telegram_ai_automation_block.py b/apps/api/tests/test_telegram_ai_automation_block.py index 30f5296d..0b780ee2 100644 --- a/apps/api/tests/test_telegram_ai_automation_block.py +++ b/apps/api/tests/test_telegram_ai_automation_block.py @@ -28,6 +28,33 @@ def test_action_required_card_exposes_ai_automation_on_fallback() -> None: assert "執行:no_action_or_observe" in body +def test_repair_candidate_missing_card_exposes_manual_handoff_package() -> None: + message = TelegramMessage( + status_emoji="ℹ️", + risk_level="LOW", + resource_name="node-exporter-188", + root_cause="AI 選擇不執行修復,需人工判斷是否接手", + suggested_action="NO_ACTION - REPAIR_CANDIDATE_MISSING: LLM 分析失敗,尚未產生安全可執行修復指令", + estimated_downtime="unknown", + approval_id="test-approval-id", + incident_id="INC-20260611-34BBF5", + primary_responsibility="INFRA", + confidence=0.0, + alert_category="host_resource", + ) + + body = message.format() + + assert "缺少可執行修復候選,已產生人工處置包" in body + assert "Mode:repair_candidate_missing_manual_handoff" in body + assert "人工處置包" in body + assert "補證據:node_exporter target up" in body + assert "AwoooP 建立修復候選" in body + assert "按鈕:處置包" in body + assert "修復候選狀態" in body + assert "等待人工批准" not in body + + def test_nemotron_card_exposes_same_ai_automation_chain() -> None: message = TelegramMessage( status_emoji="🚨", diff --git a/apps/api/tests/test_telegram_message_templates.py b/apps/api/tests/test_telegram_message_templates.py index a6c23c17..6c60c503 100644 --- a/apps/api/tests/test_telegram_message_templates.py +++ b/apps/api/tests/test_telegram_message_templates.py @@ -762,6 +762,8 @@ async def test_build_inline_keyboard_hides_approval_for_no_action() -> None: assert "✅ 批准" not in button_texts assert "❌ 拒絕" not in button_texts + assert "🧰 處置包" in button_texts + assert "🔄 重診" in button_texts assert "🔕 靜默" in button_texts assert { "text": "🧭 Runs", @@ -1170,7 +1172,7 @@ class TestTelegramMessageFormat: assert "AI 已提出修復建議,等待人工批准" in result def test_telegram_message_no_action_marks_manual_judgement(self): - """NO_ACTION 卡片必須一眼看得出需要人工判斷。""" + """NO_ACTION 卡片必須一眼看得出需要人工處置包。""" msg = TelegramMessage( status_emoji="ℹ️", risk_level="LOW", @@ -1184,7 +1186,12 @@ class TestTelegramMessageFormat: result = msg.format() assert "處置狀態" in result - assert "AI 無可安全執行動作,需人工判斷" in result + assert "未自動修復,已產生人工處置包" in result + assert "人工處置包" in result + assert "補證據:node_exporter target up" in result + assert "AwoooP 建立修復候選" in result + assert "execution result、verifier、KM / PlayBook trust" in result + assert "等待人工批准" not in result def test_telegram_message_diagnosis_state_is_not_auto_repair(self): """SSH 只讀診斷 lane 不得被顯示成自動修復。"""