From d48fcfbde6d481436c135639043fc29d23f6b5f9 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 27 Jun 2026 13:17:04 +0800 Subject: [PATCH] fix(api): enqueue repair candidates for ansible check mode --- .../services/awooop_ansible_audit_service.py | 9 +- .../src/services/repair_candidate_service.py | 127 +++++++++++++++++- .../tests/test_awooop_truth_chain_service.py | 39 ++++++ .../tests/test_repair_candidate_service.py | 40 ++++++ docs/LOGBOOK.md | 21 +++ 5 files changed, 231 insertions(+), 5 deletions(-) diff --git a/apps/api/src/services/awooop_ansible_audit_service.py b/apps/api/src/services/awooop_ansible_audit_service.py index 405968f6..b748fd76 100644 --- a/apps/api/src/services/awooop_ansible_audit_service.py +++ b/apps/api/src/services/awooop_ansible_audit_service.py @@ -542,10 +542,15 @@ def build_ansible_decision_audit_payload( or "" )[:240], } + controlled_queue = decision_path == "repair_candidate_controlled_queue" output_payload = { "not_used_reason": not_used_reason, - "decision_effect": "audit_only", - "next_required_step": "wire approval_execution to Ansible check-mode before apply", + "decision_effect": "check_mode_queue_ready" if controlled_queue else "audit_only", + "next_required_step": ( + "awooop_ansible_check_mode_worker_claims_candidate" + if controlled_queue + else "wire approval_execution to Ansible check-mode before apply" + ), } return { "operation_type": "ansible_candidate_matched", diff --git a/apps/api/src/services/repair_candidate_service.py b/apps/api/src/services/repair_candidate_service.py index 6d479918..8e56b4dd 100644 --- a/apps/api/src/services/repair_candidate_service.py +++ b/apps/api/src/services/repair_candidate_service.py @@ -31,6 +31,10 @@ from src.models.playbook import RiskLevel as PlaybookRiskLevel from src.repositories.playbook_repository import get_playbook_repository from src.services.action_parser import ActionKind, parse_kubectl_action from src.services.auto_repair_service import AutoRepairService +from src.services.awooop_ansible_audit_service import ( + build_ansible_decision_audit_payload, + record_ansible_decision_audit, +) from src.services.awooop_deeplinks import work_item_url from src.services.evidence_snapshot import EvidenceSnapshot from src.services.incident_service import get_incident_service @@ -71,11 +75,15 @@ class RepairCandidateService: investigator: Any | None = None, playbook_repository: Any | None = None, auto_repair_service: AutoRepairService | None = None, + ansible_audit_builder: Any | None = None, + ansible_audit_recorder: Any | None = None, ) -> None: self._incident_service = incident_service or get_incident_service() self._investigator = investigator or get_pre_decision_investigator() self._playbook_repository = playbook_repository or get_playbook_repository() self._auto_repair = auto_repair_service or AutoRepairService() + self._ansible_audit_builder = ansible_audit_builder or build_ansible_decision_audit_payload + self._ansible_audit_recorder = ansible_audit_recorder or record_ansible_decision_audit async def build_from_incident_id( self, @@ -139,7 +147,7 @@ class RepairCandidateService: ) if not playbook_id: blockers.append("playbook_not_matched") - return self._blocked_result( + return await self._blocked_result_with_controlled_handoff( blockers=blockers, metadata=metadata, evidence=evidence, @@ -148,12 +156,13 @@ class RepairCandidateService: alertname=alertname, target_resource=target_resource, namespace=namespace, + severity=severity, ) playbook = await self._playbook_repository.get_by_id(playbook_id) if not playbook: blockers.append("playbook_not_found") - return self._blocked_result( + return await self._blocked_result_with_controlled_handoff( blockers=blockers, metadata=metadata, evidence=evidence, @@ -162,6 +171,7 @@ class RepairCandidateService: alertname=alertname, target_resource=target_resource, namespace=namespace, + severity=severity, ) metadata["playbook_trust"] = { @@ -181,7 +191,7 @@ class RepairCandidateService: step, step_blockers = self._select_executable_step(incident, playbook) blockers.extend(step_blockers) if blockers or step is None: - return self._blocked_result( + return await self._blocked_result_with_controlled_handoff( blockers=blockers, metadata=metadata, evidence=evidence, @@ -191,6 +201,7 @@ class RepairCandidateService: alertname=alertname, target_resource=target_resource, namespace=namespace, + severity=severity, ) metadata["repair_candidate"] = { @@ -255,6 +266,116 @@ class RepairCandidateService: metadata=metadata, ) + async def _blocked_result_with_controlled_handoff( + self, + *, + blockers: list[str], + metadata: dict[str, Any], + fallback_action: str, + evidence: EvidenceSnapshot | None = None, + playbook: Playbook | None = None, + incident: Incident | None = None, + alertname: str = "", + target_resource: str = "", + namespace: str = "", + severity: str | None = None, + ) -> RepairCandidateResult: + result = self._blocked_result( + blockers=blockers, + metadata=metadata, + evidence=evidence, + playbook=playbook, + fallback_action=fallback_action, + incident=incident, + alertname=alertname, + target_resource=target_resource, + namespace=namespace, + ) + await self._maybe_record_controlled_executor_handoff( + result=result, + incident=incident, + severity=severity, + ) + return result + + async def _maybe_record_controlled_executor_handoff( + self, + *, + result: RepairCandidateResult, + incident: Incident | None, + severity: str | None = None, + ) -> None: + if incident is None or not result.metadata.get("repair_candidate_draft_ready"): + return + draft_package = result.metadata.get("repair_candidate_draft_package") + if not isinstance(draft_package, dict): + return + promotion_contract = draft_package.get("candidate_promotion_contract") + if not isinstance(promotion_contract, dict): + return + + proposal_data = { + "source": "repair_candidate_controlled_queue", + "risk_level": severity or getattr(getattr(incident, "severity", None), "value", ""), + "action": promotion_contract.get("repair_command_template") or "", + "matched_playbook_id": draft_package.get("matched_playbook_id"), + "repair_candidate_status": result.metadata.get("repair_candidate_status"), + "route_id": promotion_contract.get("route_id"), + "controlled_playbook_queue": True, + } + payload = self._ansible_audit_builder( + incident=incident, + proposal_data=proposal_data, + decision_path="repair_candidate_controlled_queue", + not_used_reason=( + "repair candidate controlled queue ready; " + "Ansible check-mode worker should claim candidate" + ), + ) + if payload is None: + handoff = { + "schema_version": "repair_candidate_controlled_executor_handoff_v1", + "status": "no_ansible_catalog_candidate", + "operation_type": "ansible_candidate_matched", + "candidate_count": 0, + "check_mode_worker": "awooop_ansible_check_mode_worker", + "check_mode_queued": False, + "runtime_execution_authorized": False, + } + else: + written = await self._ansible_audit_recorder( + incident=incident, + proposal_data=proposal_data, + decision_path="repair_candidate_controlled_queue", + not_used_reason=( + "repair candidate controlled queue ready; " + "Ansible check-mode worker should claim candidate" + ), + ) + candidates = payload.get("input", {}).get("executor_candidates") + candidate_count = len(candidates) if isinstance(candidates, list) else 0 + handoff = { + "schema_version": "repair_candidate_controlled_executor_handoff_v1", + "status": ( + "ansible_candidate_matched_queued" + if written + else "ansible_candidate_matched_existing_or_write_skipped" + ), + "operation_type": payload.get("operation_type"), + "decision_effect": payload.get("output", {}).get("decision_effect"), + "next_required_step": payload.get("output", {}).get("next_required_step"), + "candidate_count": candidate_count, + "check_mode_worker": "awooop_ansible_check_mode_worker", + "check_mode_queued": True, + "runtime_execution_authorized": False, + } + + result.metadata["controlled_executor_handoff"] = handoff + draft_package["controlled_executor_handoff"] = handoff + work_item = draft_package.get("awooop_work_item") + if isinstance(work_item, dict): + work_item["controlled_executor_handoff"] = handoff + async def _collect_evidence(self, incident: Incident) -> EvidenceSnapshot | None: try: return await self._investigator.investigate(incident) diff --git a/apps/api/tests/test_awooop_truth_chain_service.py b/apps/api/tests/test_awooop_truth_chain_service.py index 63996400..821bcd26 100644 --- a/apps/api/tests/test_awooop_truth_chain_service.py +++ b/apps/api/tests/test_awooop_truth_chain_service.py @@ -1217,6 +1217,45 @@ def test_ansible_decision_audit_payload_is_dry_run_only() -> None: assert payload["dry_run_result"]["check_mode_executed"] is False +def test_ansible_decision_audit_payload_marks_repair_candidate_queue_claimable() -> None: + incident = SimpleNamespace( + incident_id="INC-NODE-188", + project_id="awoooi", + alert_category="infrastructure", + notification_type="TYPE-3", + severity=SimpleNamespace(value="P3"), + affected_services=["node-exporter-188"], + signals=[ + SimpleNamespace( + alert_name="NodeExporterDown", + labels={"alertname": "NodeExporterDown", "instance": "node-exporter-188"}, + annotations={}, + ) + ], + ) + + payload = build_ansible_decision_audit_payload( + incident=incident, + proposal_data={ + "source": "repair_candidate_controlled_queue", + "risk_level": "medium", + "action": "systemctl restart node-exporter-188", + }, + decision_path="repair_candidate_controlled_queue", + not_used_reason="repair candidate controlled queue ready", + ) + + assert payload is not None + assert payload["operation_type"] == "ansible_candidate_matched" + assert payload["status"] == "dry_run" + assert payload["output"]["decision_effect"] == "check_mode_queue_ready" + assert payload["output"]["next_required_step"] == ( + "awooop_ansible_check_mode_worker_claims_candidate" + ) + assert payload["input"]["executor_candidates"] + assert payload["dry_run_result"]["check_mode_executed"] is False + + def test_ansible_decision_audit_payload_exposes_check_mode_safety_flags() -> None: incident = SimpleNamespace( incident_id="INC-MOMO", diff --git a/apps/api/tests/test_repair_candidate_service.py b/apps/api/tests/test_repair_candidate_service.py index aebd85f4..59d8ab86 100644 --- a/apps/api/tests/test_repair_candidate_service.py +++ b/apps/api/tests/test_repair_candidate_service.py @@ -298,11 +298,37 @@ async def test_candidate_blocked_observe_only_prompts_repair_playbook_draft() -> risk_level=PlaybookRiskLevel.LOW, ) playbook.repair_steps[0].action_type = ActionType.SSH_COMMAND + audit_calls: list[tuple[str, dict]] = [] + + def fake_ansible_audit_builder(**kwargs): + audit_calls.append(("builder", kwargs)) + return { + "operation_type": "ansible_candidate_matched", + "input": { + "executor_candidates": [ + { + "catalog_id": "ansible:188-ai-web", + "playbook_path": "infra/ansible/playbooks/188-ai-web.yml", + } + ] + }, + "output": { + "decision_effect": "check_mode_queue_ready", + "next_required_step": "awooop_ansible_check_mode_worker_claims_candidate", + }, + } + + async def fake_ansible_audit_recorder(**kwargs): + audit_calls.append(("recorder", kwargs)) + return True + service = RepairCandidateService( incident_service=FakeIncidentService(), investigator=FakeInvestigator(_evidence(incident.incident_id)), playbook_repository=FakePlaybookRepository(playbook), auto_repair_service=FakeAutoRepairService(), + ansible_audit_builder=fake_ansible_audit_builder, + ansible_audit_recorder=fake_ansible_audit_recorder, ) service._auto_repair = type( "NoRouteAutoRepairService", @@ -383,12 +409,26 @@ async def test_candidate_blocked_observe_only_prompts_repair_playbook_draft() -> assert result.metadata["repair_candidate_promotion_contract"] == promotion_contract assert "promotion=11/11" in result.metadata["repair_candidate_promotion_summary"] assert "runtime=false" in result.metadata["repair_candidate_promotion_summary"] + handoff = result.metadata["controlled_executor_handoff"] + assert handoff["schema_version"] == "repair_candidate_controlled_executor_handoff_v1" + assert handoff["status"] == "ansible_candidate_matched_queued" + assert handoff["operation_type"] == "ansible_candidate_matched" + assert handoff["decision_effect"] == "check_mode_queue_ready" + assert handoff["next_required_step"] == "awooop_ansible_check_mode_worker_claims_candidate" + assert handoff["candidate_count"] == 1 + assert handoff["check_mode_worker"] == "awooop_ansible_check_mode_worker" + assert handoff["check_mode_queued"] is True + assert handoff["runtime_execution_authorized"] is False + assert [name for name, _ in audit_calls] == ["builder", "recorder"] + assert audit_calls[0][1]["decision_path"] == "repair_candidate_controlled_queue" + assert audit_calls[1][1]["decision_path"] == "repair_candidate_controlled_queue" work_item = draft_package["awooop_work_item"] assert work_item["status"] == "controlled_playbook_queue_ready" assert work_item["next_action"] == "queue_check_mode_then_controlled_apply" assert work_item["owner_review_required"] is False assert work_item["controlled_playbook_queue"] is True assert work_item["runtime_execution_authorized"] is False + assert work_item["controlled_executor_handoff"] == handoff assert work_item["candidate_promotion_contract"]["route_id"] == ( "host_service_route_after_owner_review" ) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index b0432249..1ceda9ae 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,24 @@ +## 2026-06-27|D1M 修復候選接 executor worker:受控佇列寫入 Ansible check-mode receipt + +**背景**:D1L 已把 Telegram / AwoooP 修復候選從「人工處置 / owner review」改為 `controlled_playbook_queue_ready`,但盤點後確認真正的 executor worker 只會 claim `automation_operation_log.operation_type='ansible_candidate_matched'`;修復候選草案若沒有同步寫出這個 row,頁面上看似進佇列,實際 worker 沒有任務可跑。 + +**完成內容**: +- `RepairCandidateService` 新增 controlled executor handoff:當 `repair_candidate_draft_ready=true` 時,會用 `record_ansible_decision_audit()` 建立 / 確認 `ansible_candidate_matched` dry-run row,讓 `awooop_ansible_check_mode_worker` 可 claim。 +- D1M handoff metadata 會回寫到 `repair_candidate_draft_package` 與 AwoooP work item:`schema_version=repair_candidate_controlled_executor_handoff_v1`、`operation_type=ansible_candidate_matched`、`check_mode_worker=awooop_ansible_check_mode_worker`、`check_mode_queued=true`、`runtime_execution_authorized=false`。 +- `build_ansible_decision_audit_payload()` 針對 `decision_path=repair_candidate_controlled_queue` 改為 `decision_effect=check_mode_queue_ready`,`next_required_step=awooop_ansible_check_mode_worker_claims_candidate`;舊 manual approval audit 仍維持 `audit_only`。 +- 單元測試新增 fake audit builder / recorder,避免 unit test 連 DB,同時驗證 `builder -> recorder` 皆走 `repair_candidate_controlled_queue`。 + +**驗證結果**: +- `py_compile`:`repair_candidate_service.py`、`awooop_ansible_audit_service.py`、`awooop_ansible_check_mode_service.py` 通過。 +- `pytest apps/api/tests/test_repair_candidate_service.py apps/api/tests/test_awooop_truth_chain_service.py -q`:`62 passed`。 +- Telegram / operator / AwoooP status-chain 回歸:`test_telegram_ai_automation_block.py`、`test_telegram_webhook_execution_handoff.py`、`test_telegram_message_templates.py`、`test_operator_outcome.py`、`test_awooop_operator_timeline_labels.py`:`169 passed`。 +- `git diff --check`:通過。 + +**完成度 / 邊界**: +- 修復候選 → executor worker 可 claim receipt:本地 `100%`。 +- 真實 check-mode worker 執行、controlled apply、post-apply verifier、KM / PlayBook trust writeback、Telegram 實發:尚未在本段執行,仍需下一段接 live worker receipt。 +- 本段沒有 SSH、沒有 Ansible apply、沒有 host write、沒有 restart / reboot / firewall / DB destructive action、沒有 Telegram 實發、沒有讀 secret value。 + ## 2026-06-27|D1L 正式站驗證:修復候選批准後進 AI 受控自動化佇列 **背景**:D1L 已把 Telegram / AwoooP 修復候選從「批准後仍卡人工 / owner review」改為「AI 受控自動化佇列」。本段補正式部署與頁面 smoke,避免把本地測試誤當正式站完成。