fix(api): add manual handoff package for no-action alerts

2026-06-11 15:06:54 +08:00
parent af50509853
commit cd92885277
3 changed files with 207 additions and 12 deletions
--- a/apps/api/src/services/telegram_gateway.py
+++ b/apps/api/src/services/telegram_gateway.py
@@ -222,6 +222,85 @@ def _format_operator_outcome_lines(outcome: dict[str, object] | None) -> list[st
    ]


+def _needs_manual_handoff_package(
+    *,
+    suggested_action: str | None = None,
+    verdict: str | None = None,
+) -> bool:
+    """Tell whether the card must carry the concrete manual handoff checklist."""
+    raw_action = str(suggested_action or "")
+    if is_no_action_approval_action(raw_action):
+        return True
+    if "repair_candidate_missing" in raw_action.lower():
+        return True
+    state = str(verdict or "").lower()
+    if state.startswith("manual_required"):
+        return True
+    return state in {
+        "observed_not_executed",
+        "received_only",
+        "approval_expired_manual_review",
+    }
+
+
+def _manual_evidence_hint(resource_name: str, alert_category: str) -> str:
+    """Return which evidence to gather; never prescribes a write action."""
+    # None-safe; a "node-exporter" resource name outranks any category match.
+    resource = (resource_name or "").lower()
+    category = (alert_category or "").lower()
+    if "node-exporter" in resource:
+        return "node_exporter target up、scrape error、host CPU/RAM/disk、service log 摘要"
+    if category in {"host", "host_resource", "infrastructure"}:
+        return "host metrics、service 狀態、journal 摘要、最近部署/維護紀錄"
+    if category in {"k8s", "kubernetes", "k8s_workload"}:
+        return "pod events、rollout 狀態、recent logs、readiness / liveness probe"
+    if category in {"database", "db"}:
+        return "連線數、慢查詢、lock、磁碟與 replication / backup 狀態"
+    if category in {"backup", "backup_failure"}:
+        return "最近 backup run、失敗 repo、exit code、offsite verifier 與 retry window"
+    if category in {"external_site", "network"}:
+        return "HTTP 狀態、DNS/TLS、blackbox probe、上游 / CDN / Nginx log 摘要"
+    return "來源事件、fingerprint recurrence、metrics、logs、最近變更與相關 run"
+
+
+def _format_manual_handoff_package_lines(
+    *,
+    incident_id: str,
+    resource_name: str,
+    alert_category: str = "",
+    suggested_action: str | None = None,
+    verdict: str | None = None,
+    compact: bool = False,
+) -> list[str]:
+    """Render the advisory manual-handoff checklist as Telegram HTML lines.
+
+    Returns [] when the action/verdict pair does not call for a handoff.
+    The checklist only points at evidence and a repair-candidate workflow;
+    it grants no runtime authorization and names no command to run.
+    ``compact=True`` drops the trailing button legend.
+    """
+    if not _needs_manual_handoff_package(
+        suggested_action=suggested_action, verdict=verdict
+    ):
+        return []
+
+    # Escape once up front; both values are interpolated into HTML below.
+    safe_ref = html.escape(incident_id or "--")
+    safe_hint = html.escape(_manual_evidence_hint(resource_name, alert_category))
+    button_legend = "按鈕：<b>處置包</b> 看完整證據，<b>重診</b> 重新收集，<b>Runs</b> 追蹤狀態"
+    checklist = [
+        "",
+        "🧰 <b>人工處置包</b>",
+        "├ 狀態：AI 尚未產生安全可執行修復，不能直接批准執行",
+        f"├ 1. 開 Runs / 真相鏈確認 <code>{safe_ref}</code> 仍在 firing 或 recurrence",
+        f"├ 2. 補證據：{safe_hint}",
+        "├ 3. 在 AwoooP 建立修復候選：命令、風險、rollback、verifier、owner",
+        "└ 4. 修復後回寫：execution result、verifier、KM / PlayBook trust",
+    ]
+    # Only the non-compact rendering gets the button legend line.
+    return checklist if compact else [*checklist, button_legend]
+
+
 def _format_remediation_history_lines(history: dict[str, object] | None) -> list[str]:
    if not history or int(history.get("total") or 0) <= 0:
        return []
@@ -1900,6 +1979,10 @@ class TelegramMessage:

    def _automation_mode(self) -> str:
        text = f"{self.root_cause} {self.suggested_action}".lower()
+        if is_no_action_approval_action(self.suggested_action):
+            if "repair_candidate_missing" in text:
+                return "repair_candidate_missing_manual_handoff"
+            return "manual_handoff_required"
        if "超時" in text or "timeout" in text:
            return "llm_timeout_manual_gate"
        if self.confidence > 0 and self.suggested_action and self.suggested_action != "待分析":
@@ -1953,6 +2036,10 @@ class TelegramMessage:
            return "🟠 AI 補救試跑證據查詢失敗，需人工判斷"
        if verdict == "approval_required":
            return "🟡 需要審批後才會執行"
+        if mode == "repair_candidate_missing_manual_handoff":
+            return "🟠 缺少可執行修復候選，已產生人工處置包"
+        if mode == "manual_handoff_required":
+            return "🟠 未自動修復，已產生人工處置包"
        if verdict.startswith("manual_required"):
            return "🟠 未自動修復，需人工判斷"

@@ -2021,6 +2108,20 @@ class TelegramMessage:
            f"└ Flow：<code>{flow}</code>\n"
        )

+    def _format_manual_handoff_package_block(self) -> str:
+        """Return the handoff checklist as one newline-terminated string, or ""."""
+        # Prefer the recorded quality verdict; otherwise use the derived mode.
+        recorded = (self.automation_quality or {}).get("verdict")
+        package = _format_manual_handoff_package_lines(
+            incident_id=self.incident_id or self.approval_id,
+            resource_name=self.resource_name,
+            alert_category=self.alert_category,
+            suggested_action=self.suggested_action,
+            verdict=str(recorded or self._automation_mode()),
+        )
+        # An empty list means no handoff is needed, so emit nothing at all.
+        return "\n".join(package) + "\n" if package else ""
+
    def _format_flow_progress_block(self) -> str:
        """Operator-facing state of where the alert is in the automation loop."""
        quality = self.automation_quality or {}
@@ -2029,7 +2130,8 @@ class TelegramMessage:

        action_upper = (self.suggested_action or "").upper()
        is_noop = (
-            "NO_ACTION" in action_upper
+            is_no_action_approval_action(self.suggested_action)
+            or "NO_ACTION" in action_upper
            or action_upper.startswith("OBSERVE")
            or action_upper.startswith("INVESTIGATE")
            or not action_upper.strip()
@@ -2153,6 +2255,11 @@ class TelegramMessage:
        safe_action = html.escape(self.suggested_action)
        safe_downtime = html.escape(self.estimated_downtime)
        safe_automation_summary = html.escape(self._automation_status_summary())
+        action_heading = (
+            "🧭 <b>修復候選狀態</b>"
+            if is_no_action_approval_action(self.suggested_action)
+            else "⚡ <b>建議修復動作</b>"
+        )

        # 2026-03-29 ogt: AI Token/Cost 顯示
        ai_cost_display = ""
@@ -2245,6 +2352,7 @@ class TelegramMessage:
        flow_progress_block = self._format_flow_progress_block()
        operator_outcome_block = self._format_operator_outcome_block()
        automation_block = self._format_automation_block()
+        manual_handoff_block = self._format_manual_handoff_package_block()

        # ADR-075 TYPE-3 格式組裝
        message = (
@@ -2258,13 +2366,14 @@ class TelegramMessage:
            f"{flow_progress_block}\n"
            f"{operator_outcome_block}"
            f"{automation_block}"
+            f"{manual_handoff_block}"
            f"\n"
            f"🧠 <b>AI 深度診斷</b>\n"
            f"├─ 分析：{safe_root_cause}\n"
            f"├─ 責任：{resp_display}\n"
            f"└─ {ai_source}\n"
            f"\n"
-            f"⚡ <b>建議修復動作</b>\n"
+            f"{action_heading}\n"
            f"{playbook_line}"
            f"<code>{safe_action}</code>\n"
        )
@@ -3680,13 +3789,21 @@ class TelegramGateway:

        if not approval_buttons_enabled:
            info_row: list[dict] = []
+            secondary_row: list[dict] = []
            if incident_id:
                info_row.extend([
-                    {"text": "📋 詳情", "callback_data": f"detail:{incident_id}"},
-                    {"text": "📊 歷史", "callback_data": f"history:{incident_id}"},
+                    {"text": "🧰 處置包", "callback_data": f"detail:{incident_id}"},
+                    {"text": "🔄 重診", "callback_data": f"reanalyze:{incident_id}"},
                ])
-            info_row.append({"text": "🔕 靜默", "callback_data": silence_nonce})
+                secondary_row.extend([
+                    {"text": "📊 歷史", "callback_data": f"history:{incident_id}"},
+                    {"text": "🔕 靜默", "callback_data": silence_nonce},
+                ])
+            else:
+                info_row.append({"text": "🔕 靜默", "callback_data": silence_nonce})
            buttons: list[list[dict]] = [info_row]
+            if secondary_row:
+                buttons.append(secondary_row)
            awooop_row = _awooop_truth_chain_button_row(incident_id)
            if awooop_row:
                buttons.append(awooop_row)
@@ -7087,6 +7204,37 @@ class TelegramGateway:
                    truth_chain=truth_chain,
                    remediation_history=remediation_history,
                )
+                quality = (
+                    truth_chain.get("automation_quality")
+                    if isinstance(truth_chain.get("automation_quality"), dict)
+                    else {}
+                )
+                reconciliation = (
+                    truth_chain.get("reconciliation")
+                    if isinstance(truth_chain.get("reconciliation"), dict)
+                    else {}
+                )
+                reconciliation_facts = (
+                    reconciliation.get("facts")
+                    if isinstance(reconciliation.get("facts"), dict)
+                    else {}
+                )
+                latest_action = str(
+                    reconciliation_facts.get("latest_approval_action") or ""
+                )
+                detail_resource = (
+                    ", ".join(str(s) for s in incident.affected_services[:2])
+                    if incident.affected_services
+                    else incident_id
+                )
+                lines += _format_manual_handoff_package_lines(
+                    incident_id=incident_id,
+                    resource_name=detail_resource,
+                    alert_category="",
+                    suggested_action=latest_action,
+                    verdict=str(quality.get("verdict") or ""),
+                    compact=True,
+                )
                lines += _format_km_stale_completion_lines(km_completion_summary)
                lines += _format_remediation_history_lines(remediation_history)
                gateway_summary = (
@@ -8722,7 +8870,11 @@ class TelegramGateway:
        if action == "approve":
            status_emoji = "✅"
            status_text = f"<b>已批准</b> by {_html.escape(username)}"
-            if approval_action is not None and is_no_action_approval_action(approval_action):
+            no_action_approval = (
+                approval_action is not None
+                and is_no_action_approval_action(approval_action)
+            )
+            if no_action_approval:
                status_emoji = "🟠"
                suffix = "已記錄；此卡沒有可執行修復，等待補修復候選"
            else:
@@ -8731,16 +8883,25 @@ class TelegramGateway:
            status_emoji = "❌"
            status_text = f"<b>已拒絕</b> by {_html.escape(username)}"
            suffix = ""
+            no_action_approval = False

        status_line = f"{status_emoji} {status_text}　{suffix}".strip()

        if orig_msg_id:
            try:
                # 1. 移除批准/拒絕按鈕（只保留資訊按鈕列）
-                info_buttons = [[
-                    {"text": "📋 詳情", "callback_data": f"detail:{incident_id}"},
-                    {"text": "📊 歷史", "callback_data": f"history:{incident_id}"},
-                ]]
+                if no_action_approval:
+                    info_buttons = [[
+                        {"text": "🧰 處置包", "callback_data": f"detail:{incident_id}"},
+                        {"text": "🔄 重診", "callback_data": f"reanalyze:{incident_id}"},
+                    ], [
+                        {"text": "📊 歷史", "callback_data": f"history:{incident_id}"},
+                    ]]
+                else:
+                    info_buttons = [[
+                        {"text": "📋 詳情", "callback_data": f"detail:{incident_id}"},
+                        {"text": "📊 歷史", "callback_data": f"history:{incident_id}"},
+                    ]]
                awooop_row = _awooop_truth_chain_button_row(incident_id)
                if awooop_row:
                    info_buttons.append(awooop_row)
--- a/apps/api/tests/test_telegram_ai_automation_block.py
+++ b/apps/api/tests/test_telegram_ai_automation_block.py
@@ -28,6 +28,33 @@ def test_action_required_card_exposes_ai_automation_on_fallback() -> None:
    assert "執行：<code>no_action_or_observe</code>" in body


+def test_repair_candidate_missing_card_exposes_manual_handoff_package() -> None:
+    message = TelegramMessage(
+        status_emoji="ℹ️",
+        risk_level="LOW",
+        resource_name="node-exporter-188",
+        root_cause="AI 選擇不執行修復，需人工判斷是否接手",
+        suggested_action="NO_ACTION - REPAIR_CANDIDATE_MISSING: LLM 分析失敗，尚未產生安全可執行修復指令",
+        estimated_downtime="unknown",
+        approval_id="test-approval-id",
+        incident_id="INC-20260611-34BBF5",
+        primary_responsibility="INFRA",
+        confidence=0.0,
+        alert_category="host_resource",
+    )
+    # format() builds the card body that every assertion below inspects.
+    body = message.format()
+    # Status summary, mode, checklist steps and action heading must all appear.
+    assert "缺少可執行修復候選，已產生人工處置包" in body
+    assert "Mode：<code>repair_candidate_missing_manual_handoff</code>" in body
+    assert "人工處置包" in body
+    assert "補證據：node_exporter target up" in body
+    assert "AwoooP 建立修復候選" in body
+    assert "按鈕：<b>處置包</b>" in body
+    assert "修復候選狀態" in body
+    assert "等待人工批准" not in body
+
+
 def test_nemotron_card_exposes_same_ai_automation_chain() -> None:
    message = TelegramMessage(
        status_emoji="🚨",
--- a/apps/api/tests/test_telegram_message_templates.py
+++ b/apps/api/tests/test_telegram_message_templates.py
@@ -762,6 +762,8 @@ async def test_build_inline_keyboard_hides_approval_for_no_action() -> None:

    assert "✅ 批准" not in button_texts
    assert "❌ 拒絕" not in button_texts
+    assert "🧰 處置包" in button_texts
+    assert "🔄 重診" in button_texts
    assert "🔕 靜默" in button_texts
    assert {
        "text": "🧭 Runs",
@@ -1170,7 +1172,7 @@ class TestTelegramMessageFormat:
        assert "AI 已提出修復建議，等待人工批准" in result

    def test_telegram_message_no_action_marks_manual_judgement(self):
-        """NO_ACTION 卡片必須一眼看得出需要人工判斷。"""
+        """NO_ACTION 卡片必須一眼看得出需要人工處置包。"""
        msg = TelegramMessage(
            status_emoji="ℹ️",
            risk_level="LOW",
@@ -1184,7 +1186,12 @@ class TestTelegramMessageFormat:
        result = msg.format()

        assert "處置狀態" in result
-        assert "AI 無可安全執行動作，需人工判斷" in result
+        assert "未自動修復，已產生人工處置包" in result
+        assert "人工處置包" in result
+        assert "補證據：node_exporter target up" in result
+        assert "AwoooP 建立修復候選" in result
+        assert "execution result、verifier、KM / PlayBook trust" in result
+        assert "等待人工批准" not in result

    def test_telegram_message_diagnosis_state_is_not_auto_repair(self):
        """SSH 只讀診斷 lane 不得被顯示成自動修復。"""