fix(api): add manual handoff package for no-action alerts
Some checks failed
CD Pipeline / tests (push) Successful in 1m32s
Code Review / ai-code-review (push) Successful in 14s
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled

This commit is contained in:
Your Name
2026-06-11 15:06:54 +08:00
parent af50509853
commit cd92885277
3 changed files with 207 additions and 12 deletions

View File

@@ -222,6 +222,85 @@ def _format_operator_outcome_lines(outcome: dict[str, object] | None) -> list[st
]
def _needs_manual_handoff_package(
    *,
    suggested_action: str | None = None,
    verdict: str | None = None,
) -> bool:
    """Decide whether a card must carry the concrete manual handoff package.

    The package is required when any of the following holds:

    * ``is_no_action_approval_action`` accepts the suggested action;
    * the suggested action mentions ``repair_candidate_missing``
      (case-insensitive);
    * the verdict starts with ``manual_required`` (case-insensitive);
    * the verdict is one of the "seen but not acted on" verdicts listed below.

    Args:
        suggested_action: Action text proposed for the alert; ``None`` is
            treated as an empty string.
        verdict: Automation verdict; ``None`` is treated as an empty string.

    Returns:
        A truthy value when the handoff package must be shown, falsy otherwise.
    """
    action = str(suggested_action or "")
    verdict_key = str(verdict or "").lower()
    handoff_verdicts = (
        "observed_not_executed",
        "received_only",
        "approval_expired_manual_review",
    )

    # Action-based triggers are checked first; the verdict is only consulted
    # when the action itself does not call for a handoff.
    action_signal = (
        is_no_action_approval_action(action)
        or "repair_candidate_missing" in action.lower()
    )
    if action_signal:
        return action_signal

    return verdict_key.startswith("manual_required") or verdict_key in handoff_verdicts
def _manual_evidence_hint(resource_name: str, alert_category: str) -> str:
"""Human-readable evidence target without prescribing a write action."""
resource = resource_name.lower()
category = alert_category.lower()
if "node-exporter" in resource:
return "node_exporter target up、scrape error、host CPU/RAM/disk、service log 摘要"
if category in {"host", "host_resource", "infrastructure"}:
return "host metrics、service 狀態、journal 摘要、最近部署/維護紀錄"
if category in {"k8s", "kubernetes", "k8s_workload"}:
return "pod events、rollout 狀態、recent logs、readiness / liveness probe"
if category in {"database", "db"}:
return "連線數、慢查詢、lock、磁碟與 replication / backup 狀態"
if category in {"backup", "backup_failure"}:
return "最近 backup run、失敗 repo、exit code、offsite verifier 與 retry window"
if category in {"external_site", "network"}:
return "HTTP 狀態、DNS/TLS、blackbox probe、上游 / CDN / Nginx log 摘要"
return "來源事件、fingerprint recurrence、metrics、logs、最近變更與相關 run"
def _format_manual_handoff_package_lines(
    *,
    incident_id: str,
    resource_name: str,
    alert_category: str = "",
    suggested_action: str | None = None,
    verdict: str | None = None,
    compact: bool = False,
) -> list[str]:
    """Render the manual handoff checklist for no-action / degraded alerts.

    The output is advisory text only: this function just builds HTML-formatted
    lines that turn "manual review" into a concrete evidence and
    repair-candidate checklist. It contains no executable command.

    Args:
        incident_id: Incident reference shown in step 1; ``"--"`` is shown
            when it is empty.
        resource_name: Affected resource, used to pick the evidence hint.
        alert_category: Alert category, used to pick the evidence hint.
        suggested_action: Proposed action text, used to decide whether the
            package is needed at all.
        verdict: Automation verdict, used for the same decision.
        compact: When true, the trailing button-legend line is omitted.

    Returns:
        The package lines (starting with a blank separator line), or an empty
        list when no handoff package is needed.
    """
    package_needed = _needs_manual_handoff_package(
        suggested_action=suggested_action,
        verdict=verdict,
    )
    if not package_needed:
        return []

    # Both interpolated values are escaped because the lines contain HTML tags.
    safe_hint = html.escape(_manual_evidence_hint(resource_name, alert_category))
    safe_incident = html.escape(incident_id or "--")

    checklist = [
        "",
        "🧰 <b>人工處置包</b>",
        "├ 狀態AI 尚未產生安全可執行修復,不能直接批准執行",
        f"├ 1. 開 Runs / 真相鏈確認 <code>{safe_incident}</code> 仍在 firing 或 recurrence",
        f"├ 2. 補證據:{safe_hint}",
        "├ 3. 在 AwoooP 建立修復候選命令、風險、rollback、verifier、owner",
        "└ 4. 修復後回寫execution result、verifier、KM / PlayBook trust",
    ]
    if compact:
        return checklist

    # The full (non-compact) variant also explains what each button does.
    button_legend = "按鈕:<b>處置包</b> 看完整證據,<b>重診</b> 重新收集,<b>Runs</b> 追蹤狀態"
    return [*checklist, button_legend]
def _format_remediation_history_lines(history: dict[str, object] | None) -> list[str]:
if not history or int(history.get("total") or 0) <= 0:
return []
@@ -1900,6 +1979,10 @@ class TelegramMessage:
def _automation_mode(self) -> str:
text = f"{self.root_cause} {self.suggested_action}".lower()
if is_no_action_approval_action(self.suggested_action):
if "repair_candidate_missing" in text:
return "repair_candidate_missing_manual_handoff"
return "manual_handoff_required"
if "超時" in text or "timeout" in text:
return "llm_timeout_manual_gate"
if self.confidence > 0 and self.suggested_action and self.suggested_action != "待分析":
@@ -1953,6 +2036,10 @@ class TelegramMessage:
return "🟠 AI 補救試跑證據查詢失敗,需人工判斷"
if verdict == "approval_required":
return "🟡 需要審批後才會執行"
if mode == "repair_candidate_missing_manual_handoff":
return "🟠 缺少可執行修復候選,已產生人工處置包"
if mode == "manual_handoff_required":
return "🟠 未自動修復,已產生人工處置包"
if verdict.startswith("manual_required"):
return "🟠 未自動修復,需人工判斷"
@@ -2021,6 +2108,20 @@ class TelegramMessage:
f"└ Flow<code>{flow}</code>\n"
)
def _format_manual_handoff_package_block(self) -> str:
    """Render the manual handoff package as one newline-terminated text block.

    The verdict comes from ``automation_quality["verdict"]`` when present and
    truthy; otherwise the value of ``_automation_mode()`` is used. The
    approval id stands in for the incident id when the latter is empty.

    Returns:
        The joined package lines followed by a newline, or ``""`` when no
        package lines are produced.
    """
    reported_verdict = (self.automation_quality or {}).get("verdict")
    # Fall back to the derived automation mode only when no verdict was reported.
    effective_verdict = str(reported_verdict or self._automation_mode())

    package_lines = _format_manual_handoff_package_lines(
        incident_id=self.incident_id or self.approval_id,
        resource_name=self.resource_name,
        alert_category=self.alert_category,
        suggested_action=self.suggested_action,
        verdict=effective_verdict,
    )
    return "\n".join(package_lines) + "\n" if package_lines else ""
def _format_flow_progress_block(self) -> str:
"""Operator-facing state of where the alert is in the automation loop."""
quality = self.automation_quality or {}
@@ -2029,7 +2130,8 @@ class TelegramMessage:
action_upper = (self.suggested_action or "").upper()
is_noop = (
"NO_ACTION" in action_upper
is_no_action_approval_action(self.suggested_action)
or "NO_ACTION" in action_upper
or action_upper.startswith("OBSERVE")
or action_upper.startswith("INVESTIGATE")
or not action_upper.strip()
@@ -2153,6 +2255,11 @@ class TelegramMessage:
safe_action = html.escape(self.suggested_action)
safe_downtime = html.escape(self.estimated_downtime)
safe_automation_summary = html.escape(self._automation_status_summary())
action_heading = (
"🧭 <b>修復候選狀態</b>"
if is_no_action_approval_action(self.suggested_action)
else "⚡ <b>建議修復動作</b>"
)
# 2026-03-29 ogt: AI Token/Cost 顯示
ai_cost_display = ""
@@ -2245,6 +2352,7 @@ class TelegramMessage:
flow_progress_block = self._format_flow_progress_block()
operator_outcome_block = self._format_operator_outcome_block()
automation_block = self._format_automation_block()
manual_handoff_block = self._format_manual_handoff_package_block()
# ADR-075 TYPE-3 格式組裝
message = (
@@ -2258,13 +2366,14 @@ class TelegramMessage:
f"{flow_progress_block}\n"
f"{operator_outcome_block}"
f"{automation_block}"
f"{manual_handoff_block}"
f"\n"
f"🧠 <b>AI 深度診斷</b>\n"
f"├─ 分析:{safe_root_cause}\n"
f"├─ 責任:{resp_display}\n"
f"└─ {ai_source}\n"
f"\n"
f"⚡ <b>建議修復動作</b>\n"
f"{action_heading}\n"
f"{playbook_line}"
f"<code>{safe_action}</code>\n"
)
@@ -3680,13 +3789,21 @@ class TelegramGateway:
if not approval_buttons_enabled:
info_row: list[dict] = []
secondary_row: list[dict] = []
if incident_id:
info_row.extend([
{"text": "📋 詳情", "callback_data": f"detail:{incident_id}"},
{"text": "📊 歷史", "callback_data": f"history:{incident_id}"},
{"text": "🧰 處置包", "callback_data": f"detail:{incident_id}"},
{"text": "🔄 重診", "callback_data": f"reanalyze:{incident_id}"},
])
info_row.append({"text": "🔕 靜默", "callback_data": silence_nonce})
secondary_row.extend([
{"text": "📊 歷史", "callback_data": f"history:{incident_id}"},
{"text": "🔕 靜默", "callback_data": silence_nonce},
])
else:
info_row.append({"text": "🔕 靜默", "callback_data": silence_nonce})
buttons: list[list[dict]] = [info_row]
if secondary_row:
buttons.append(secondary_row)
awooop_row = _awooop_truth_chain_button_row(incident_id)
if awooop_row:
buttons.append(awooop_row)
@@ -7087,6 +7204,37 @@ class TelegramGateway:
truth_chain=truth_chain,
remediation_history=remediation_history,
)
quality = (
truth_chain.get("automation_quality")
if isinstance(truth_chain.get("automation_quality"), dict)
else {}
)
reconciliation = (
truth_chain.get("reconciliation")
if isinstance(truth_chain.get("reconciliation"), dict)
else {}
)
reconciliation_facts = (
reconciliation.get("facts")
if isinstance(reconciliation.get("facts"), dict)
else {}
)
latest_action = str(
reconciliation_facts.get("latest_approval_action") or ""
)
detail_resource = (
", ".join(str(s) for s in incident.affected_services[:2])
if incident.affected_services
else incident_id
)
lines += _format_manual_handoff_package_lines(
incident_id=incident_id,
resource_name=detail_resource,
alert_category="",
suggested_action=latest_action,
verdict=str(quality.get("verdict") or ""),
compact=True,
)
lines += _format_km_stale_completion_lines(km_completion_summary)
lines += _format_remediation_history_lines(remediation_history)
gateway_summary = (
@@ -8722,7 +8870,11 @@ class TelegramGateway:
if action == "approve":
status_emoji = ""
status_text = f"<b>已批准</b> by {_html.escape(username)}"
if approval_action is not None and is_no_action_approval_action(approval_action):
no_action_approval = (
approval_action is not None
and is_no_action_approval_action(approval_action)
)
if no_action_approval:
status_emoji = "🟠"
suffix = "已記錄;此卡沒有可執行修復,等待補修復候選"
else:
@@ -8731,16 +8883,25 @@ class TelegramGateway:
status_emoji = ""
status_text = f"<b>已拒絕</b> by {_html.escape(username)}"
suffix = ""
no_action_approval = False
status_line = f"{status_emoji} {status_text} {suffix}".strip()
if orig_msg_id:
try:
# 1. 移除批准/拒絕按鈕(只保留資訊按鈕列)
info_buttons = [[
{"text": "📋 詳情", "callback_data": f"detail:{incident_id}"},
{"text": "📊 歷史", "callback_data": f"history:{incident_id}"},
]]
if no_action_approval:
info_buttons = [[
{"text": "🧰 處置包", "callback_data": f"detail:{incident_id}"},
{"text": "🔄 重診", "callback_data": f"reanalyze:{incident_id}"},
], [
{"text": "📊 歷史", "callback_data": f"history:{incident_id}"},
]]
else:
info_buttons = [[
{"text": "📋 詳情", "callback_data": f"detail:{incident_id}"},
{"text": "📊 歷史", "callback_data": f"history:{incident_id}"},
]]
awooop_row = _awooop_truth_chain_button_row(incident_id)
if awooop_row:
info_buttons.append(awooop_row)

View File

@@ -28,6 +28,33 @@ def test_action_required_card_exposes_ai_automation_on_fallback() -> None:
assert "執行:<code>no_action_or_observe</code>" in body
def test_repair_candidate_missing_card_exposes_manual_handoff_package() -> None:
    """A REPAIR_CANDIDATE_MISSING no-action card must render the handoff package."""
    card_fields = {
        "status_emoji": "",
        "risk_level": "LOW",
        "resource_name": "node-exporter-188",
        "root_cause": "AI 選擇不執行修復,需人工判斷是否接手",
        "suggested_action": "NO_ACTION - REPAIR_CANDIDATE_MISSING: LLM 分析失敗,尚未產生安全可執行修復指令",
        "estimated_downtime": "unknown",
        "approval_id": "test-approval-id",
        "incident_id": "INC-20260611-34BBF5",
        "primary_responsibility": "INFRA",
        "confidence": 0.0,
        "alert_category": "host_resource",
    }
    body = TelegramMessage(**card_fields).format()

    # Every fragment below must appear somewhere in the rendered card.
    expected_fragments = (
        "缺少可執行修復候選,已產生人工處置包",
        "Mode<code>repair_candidate_missing_manual_handoff</code>",
        "人工處置包",
        "補證據node_exporter target up",
        "AwoooP 建立修復候選",
        "按鈕:<b>處置包</b>",
        "修復候選狀態",
    )
    for fragment in expected_fragments:
        assert fragment in body

    # A no-action card must not claim that it is waiting for approval.
    assert "等待人工批准" not in body
def test_nemotron_card_exposes_same_ai_automation_chain() -> None:
message = TelegramMessage(
status_emoji="🚨",

View File

@@ -762,6 +762,8 @@ async def test_build_inline_keyboard_hides_approval_for_no_action() -> None:
assert "✅ 批准" not in button_texts
assert "❌ 拒絕" not in button_texts
assert "🧰 處置包" in button_texts
assert "🔄 重診" in button_texts
assert "🔕 靜默" in button_texts
assert {
"text": "🧭 Runs",
@@ -1170,7 +1172,7 @@ class TestTelegramMessageFormat:
assert "AI 已提出修復建議,等待人工批准" in result
def test_telegram_message_no_action_marks_manual_judgement(self):
"""NO_ACTION 卡片必須一眼看得出需要人工判斷"""
"""NO_ACTION 卡片必須一眼看得出需要人工處置包"""
msg = TelegramMessage(
status_emoji="",
risk_level="LOW",
@@ -1184,7 +1186,12 @@ class TestTelegramMessageFormat:
result = msg.format()
assert "處置狀態" in result
assert "AI 無可安全執行動作,需人工判斷" in result
assert "未自動修復,已產生人工處置包" in result
assert "人工處置包" in result
assert "補證據node_exporter target up" in result
assert "AwoooP 建立修復候選" in result
assert "execution result、verifier、KM / PlayBook trust" in result
assert "等待人工批准" not in result
def test_telegram_message_diagnosis_state_is_not_auto_repair(self):
"""SSH 只讀診斷 lane 不得被顯示成自動修復。"""