diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index 0cd08f8b..4a98f2e4 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -595,6 +595,10 @@ async def _push_to_telegram_background( fingerprint: str = "", # P2.4 中間態清理 2026-04-24 ogt + Claude Sonnet 4.6 placeholder_message_id: int | None = None, + # 2026-06-11 Codex: 修復候選阻擋時,把下一步與草案欄位直接帶到 Telegram 卡片。 + repair_candidate_blocker_summary: str = "", + repair_candidate_next_step: str = "", + repair_candidate_required_fields: list[str] | None = None, ) -> None: """ 背景任務: 推送待簽核卡片到 Telegram (v7.0 含 SignOz 整合) @@ -688,6 +692,9 @@ async def _push_to_telegram_background( # ADR-075 斷點 B 修復: 傳入分類以啟用動態按鈕 alert_category=alert_category, notification_type=notification_type, + repair_candidate_blocker_summary=repair_candidate_blocker_summary, + repair_candidate_next_step=repair_candidate_next_step, + repair_candidate_required_fields=repair_candidate_required_fields, ) logger.info( @@ -2316,9 +2323,17 @@ async def _process_new_alert_background( repair_candidate_result.metadata.get("repair_candidate_blocker_summary") or ", ".join(blockers) ) + next_step = str( + repair_candidate_result.metadata.get("repair_candidate_next_step") + or "建立人工處置包並補 PlayBook 草案欄位;完成 owner review 後再重跑候選生成。" + ) fallback_create = ApprovalRequestCreate( action=f"NO_ACTION - REPAIR_CANDIDATE_MISSING: {blocker_text}", - description=f"[LLM Failed] {message}\n修復候選阻擋:{blocker_text}", + description=( + f"[LLM Failed] {message}\n" + f"修復候選阻擋:{blocker_text}\n" + f"下一步:{next_step}" + ), risk_level=RiskLevel.LOW, blast_radius=BlastRadius( affected_pods=1, @@ -2331,6 +2346,11 @@ async def _process_new_alert_background( name="MCP/PlayBook candidate gate", passed=False, message=blocker_text[:240], + ), + DryRunCheck( + name="Repair PlayBook draft package", + passed=False, + message=next_step[:240], ) ], requested_by="OpenClaw (fallback candidate gate)", @@ -2338,7 +2358,9 @@ async def 
def _build_draft_package(
    self,
    *,
    blockers: list[str],
    playbook: "Playbook | None",
    evidence: "EvidenceSnapshot | None",
) -> dict[str, Any]:
    """Describe the concrete owner-review package needed to unblock repair.

    The package is a handoff contract only. It must not be interpreted as
    approval to mutate runtime state or auto-create an approved PlayBook.

    Args:
        blockers: Blocker codes recorded for the failed repair candidate.
            ``None`` or an empty list selects the generic owner-review lane.
        playbook: Matched PlayBook, if any. Only ``playbook_id`` and ``name``
            are copied into the package.
        evidence: Evidence snapshot, if any. Only a truthy ``snapshot_id``
            is copied into the package.

    Returns:
        A ``repair_candidate_draft_package_v1`` dict holding the selected
        ``lane`` and ``next_step``, the PlayBook / evidence references, the
        draft fields that must be filled in, and the operations that stay
        blocked.
    """

    # Priority-ordered rules: the FIRST rule that shares a blocker with the
    # input decides the lane, so rows must stay in this order.
    lane_rules = (
        (
            frozenset({"incident_not_found"}),
            "restore_truth_chain_before_repair",
            "先修復 incident / approval 真相鏈綁定,再重跑 MCP evidence 與 PlayBook 匹配。",
        ),
        (
            frozenset({"mcp_evidence_missing"}),
            "rerun_mcp_evidence_collection",
            "先按重診收集 MCP evidence;成功後再建立服務專屬 PlayBook 草案,"
            "禁止只憑通用規則批准修復。",
        ),
        (
            frozenset(
                {
                    "playbook_not_matched",
                    "playbook_not_found",
                    "playbook_generic_fallback_not_repair",
                }
            ),
            "create_service_specific_repair_playbook",
            "建立專屬 PlayBook 草案:綁定 alertname / target selector,補 MCP evidence refs、"
            "修復命令、rollback、verifier plan 與 owner review;通用兜底不可執行。",
        ),
        (
            frozenset({"playbook_observe_only"}),
            "promote_diagnostic_to_repair_playbook",
            "把診斷命令保留為 MCP evidence collector;另建獨立修復步驟、rollback "
            "與 verifier,經 owner review 後才可進入批准。",
        ),
        (
            frozenset({"playbook_command_not_safely_routable"}),
            "route_command_through_safe_mcp_or_ansible",
            "將命令改走 allowlisted MCP / Ansible route,補 blast radius、rollback "
            "與 verifier plan,再送 owner review。",
        ),
        (
            frozenset({"playbook_not_approved", "playbook_trust_below_gate"}),
            "owner_review_playbook_trust_gate",
            "由 owner review PlayBook 狀態與 trust score;補成功/失敗證據後才可進入修復候選。",
        ),
    )

    # Tolerate a missing blocker list instead of raising TypeError on set(None).
    blocker_set = set(blockers or ())

    # Generic lane used when no rule matches (unknown or absent blockers).
    lane = "repair_candidate_owner_review"
    next_step = "建立人工處置包並補 PlayBook 草案欄位;完成 owner review 後再重跑候選生成。"
    for triggers, rule_lane, rule_next_step in lane_rules:
        if not blocker_set.isdisjoint(triggers):
            lane, next_step = rule_lane, rule_next_step
            break

    # An empty or missing snapshot id is reported as None, never as "".
    evidence_ref = None
    if evidence and evidence.snapshot_id:
        evidence_ref = evidence.snapshot_id

    return {
        "schema_version": "repair_candidate_draft_package_v1",
        "status": "draft_required",
        "lane": lane,
        "next_step": next_step,
        "matched_playbook_id": playbook.playbook_id if playbook else None,
        "matched_playbook_name": playbook.name if playbook else None,
        "evidence_snapshot_id": evidence_ref,
        "required_fields": [
            "alertname",
            "target_selector",
            "mcp_evidence_refs",
            "repair_command",
            "rollback_command",
            "verifier_plan",
            "owner_review",
        ],
        "blocked_operations": [
            "auto_execute",
            "approve_no_action_as_repair",
            "generic_fallback_repair",
        ],
    }
修復後回寫:execution result、verifier、KM / PlayBook trust", ] + insert_at = 3 + if repair_candidate_blocker_summary: + lines.insert( + insert_at, + f"├ 阻擋:{html.escape(str(repair_candidate_blocker_summary)[:260])}", + ) + insert_at += 1 + if repair_candidate_next_step: + lines.insert( + insert_at, + f"├ 下一步:{html.escape(str(repair_candidate_next_step)[:360])}", + ) + insert_at += 1 + if required_fields: + field_text = ", ".join(required_fields[:7]) + lines.insert( + insert_at, + f"├ PlayBook 草案欄位:{html.escape(field_text)}", + ) if not compact: lines.append("按鈕:處置包 看完整證據,重診 重新收集,Runs 追蹤狀態") return lines @@ -1938,6 +1965,9 @@ class TelegramMessage: automation_state: str = "" # diagnosis_collected_manual_required / diagnosis_failed_manual_required automation_quality: dict | None = None # truth-chain automation_quality 摘要 remediation_summary: dict | None = None # ADR-100 read-only dry-run history 摘要 + repair_candidate_blocker_summary: str = "" # 修復候選阻擋原因摘要 + repair_candidate_next_step: str = "" # 修復候選阻擋後的下一步 + repair_candidate_required_fields: list[str] | None = None # PlayBook 草案必填欄位 # ========================================================================== # Phase 22: Nemotron 協作欄位 (ADR-044) @@ -2117,6 +2147,9 @@ class TelegramMessage: alert_category=self.alert_category, suggested_action=self.suggested_action, verdict=verdict, + repair_candidate_blocker_summary=self.repair_candidate_blocker_summary, + repair_candidate_next_step=self.repair_candidate_next_step, + repair_candidate_required_fields=self.repair_candidate_required_fields, ) if not lines: return "" @@ -4126,6 +4159,10 @@ class TelegramGateway: # 2026-04-16 ogt + Claude Sonnet 4.6: 修復鏈路顯示 (ADR-076) playbook_name: str = "", automation_state: str = "", + # 2026-06-11 Codex: no-action 修復候選阻擋時的人工處置包欄位。 + repair_candidate_blocker_summary: str = "", + repair_candidate_next_step: str = "", + repair_candidate_required_fields: list[str] | None = None, ) -> dict: """ 推送待簽核卡片到 Telegram (v7.0 含 SignOz 整合) @@ -4232,6 +4269,9 
@pytest.mark.asyncio
async def test_candidate_blocked_observe_only_prompts_repair_playbook_draft() -> None:
    """A diagnostics-only PlayBook is blocked and routed to the promote lane.

    The matched PlayBook's single step is an SSH command that only inspects
    the host, and the auto-repair stub reports no write route. The result
    must carry the ``playbook_observe_only`` blocker and a draft package
    asking for a separate repair PlayBook.
    """

    class NoRouteAutoRepairService:
        """Stub whose route preview always answers False."""

        def preview_write_ssh_mcp_route(self, incident, command):
            return False

    incident = _incident()
    diagnostic_playbook = _playbook(
        "ssh 192.168.0.188 'uptime; ps aux --sort=-%cpu | head -20; docker stats --no-stream'",
        risk_level=PlaybookRiskLevel.LOW,
    )
    first_step = diagnostic_playbook.repair_steps[0]
    first_step.action_type = ActionType.SSH_COMMAND

    service = RepairCandidateService(
        incident_service=FakeIncidentService(),
        investigator=FakeInvestigator(_evidence(incident.incident_id)),
        playbook_repository=FakePlaybookRepository(diagnostic_playbook),
        auto_repair_service=FakeAutoRepairService(),
    )
    # Swap in the no-route stub after construction, as the scenario requires.
    service._auto_repair = NoRouteAutoRepairService()

    result = await service.build_from_incident(
        incident=incident,
        alertname="NodeExporterDown",
        target_resource="node-exporter-188",
        namespace="awoooi-prod",
        message="node exporter is down",
        fallback_action="NO_ACTION - REPAIR_CANDIDATE_MISSING",
        matched_playbook_id="PB-REPAIR-001",
        severity="medium",
    )

    draft_package = result.metadata["repair_candidate_draft_package"]
    next_step_text = result.metadata["repair_candidate_next_step"]

    assert result.candidate_found is False
    assert "playbook_observe_only" in result.blockers
    assert draft_package["lane"] == "promote_diagnostic_to_repair_playbook"
    assert "診斷命令保留為 MCP evidence collector" in next_step_text
"建立專屬 PlayBook 草案" in body + assert "PlayBook 草案欄位" in body + assert "repair_command" in body assert "補證據:node_exporter target up" in body assert "AwoooP 建立修復候選" in body assert "按鈕:處置包" in body