fix(api): enqueue repair candidates for ansible check mode
Some checks failed
Code Review / ai-code-review (push) Successful in 14s
CD Pipeline / tests (push) Successful in 1m40s
CD Pipeline / build-and-deploy (push) Successful in 6m32s
CD Pipeline / post-deploy-checks (push) Successful in 1m38s
Ansible / Reboot Recovery Contract / validate (push) Has been cancelled

This commit is contained in:
Your Name
2026-06-27 13:17:04 +08:00
parent 121e5b8861
commit d48fcfbde6
5 changed files with 231 additions and 5 deletions

View File

@@ -542,10 +542,15 @@ def build_ansible_decision_audit_payload(
or ""
)[:240],
}
controlled_queue = decision_path == "repair_candidate_controlled_queue"
output_payload = {
"not_used_reason": not_used_reason,
"decision_effect": "audit_only",
"next_required_step": "wire approval_execution to Ansible check-mode before apply",
"decision_effect": "check_mode_queue_ready" if controlled_queue else "audit_only",
"next_required_step": (
"awooop_ansible_check_mode_worker_claims_candidate"
if controlled_queue
else "wire approval_execution to Ansible check-mode before apply"
),
}
return {
"operation_type": "ansible_candidate_matched",

View File

@@ -31,6 +31,10 @@ from src.models.playbook import RiskLevel as PlaybookRiskLevel
from src.repositories.playbook_repository import get_playbook_repository
from src.services.action_parser import ActionKind, parse_kubectl_action
from src.services.auto_repair_service import AutoRepairService
from src.services.awooop_ansible_audit_service import (
build_ansible_decision_audit_payload,
record_ansible_decision_audit,
)
from src.services.awooop_deeplinks import work_item_url
from src.services.evidence_snapshot import EvidenceSnapshot
from src.services.incident_service import get_incident_service
@@ -71,11 +75,15 @@ class RepairCandidateService:
investigator: Any | None = None,
playbook_repository: Any | None = None,
auto_repair_service: AutoRepairService | None = None,
ansible_audit_builder: Any | None = None,
ansible_audit_recorder: Any | None = None,
) -> None:
self._incident_service = incident_service or get_incident_service()
self._investigator = investigator or get_pre_decision_investigator()
self._playbook_repository = playbook_repository or get_playbook_repository()
self._auto_repair = auto_repair_service or AutoRepairService()
self._ansible_audit_builder = ansible_audit_builder or build_ansible_decision_audit_payload
self._ansible_audit_recorder = ansible_audit_recorder or record_ansible_decision_audit
async def build_from_incident_id(
self,
@@ -139,7 +147,7 @@ class RepairCandidateService:
)
if not playbook_id:
blockers.append("playbook_not_matched")
return self._blocked_result(
return await self._blocked_result_with_controlled_handoff(
blockers=blockers,
metadata=metadata,
evidence=evidence,
@@ -148,12 +156,13 @@ class RepairCandidateService:
alertname=alertname,
target_resource=target_resource,
namespace=namespace,
severity=severity,
)
playbook = await self._playbook_repository.get_by_id(playbook_id)
if not playbook:
blockers.append("playbook_not_found")
return self._blocked_result(
return await self._blocked_result_with_controlled_handoff(
blockers=blockers,
metadata=metadata,
evidence=evidence,
@@ -162,6 +171,7 @@ class RepairCandidateService:
alertname=alertname,
target_resource=target_resource,
namespace=namespace,
severity=severity,
)
metadata["playbook_trust"] = {
@@ -181,7 +191,7 @@ class RepairCandidateService:
step, step_blockers = self._select_executable_step(incident, playbook)
blockers.extend(step_blockers)
if blockers or step is None:
return self._blocked_result(
return await self._blocked_result_with_controlled_handoff(
blockers=blockers,
metadata=metadata,
evidence=evidence,
@@ -191,6 +201,7 @@ class RepairCandidateService:
alertname=alertname,
target_resource=target_resource,
namespace=namespace,
severity=severity,
)
metadata["repair_candidate"] = {
@@ -255,6 +266,116 @@ class RepairCandidateService:
metadata=metadata,
)
async def _blocked_result_with_controlled_handoff(
    self,
    *,
    blockers: list[str],
    metadata: dict[str, Any],
    fallback_action: str,
    evidence: EvidenceSnapshot | None = None,
    playbook: Playbook | None = None,
    incident: Incident | None = None,
    alertname: str = "",
    target_resource: str = "",
    namespace: str = "",
    severity: str | None = None,
) -> RepairCandidateResult:
    """Build a blocked result, then attempt the controlled executor handoff.

    Every argument except ``severity`` is forwarded unchanged to
    ``self._blocked_result``. ``severity`` is only handed to
    ``self._maybe_record_controlled_executor_handoff`` together with the
    freshly built result and the incident.

    Returns:
        The object returned by ``self._blocked_result``, after the handoff
        coroutine has been awaited on it.
    """
    # Collected in one mapping so the forwarded argument set is visible
    # at a glance; `severity` is intentionally absent here.
    blocked_kwargs = {
        "blockers": blockers,
        "metadata": metadata,
        "evidence": evidence,
        "playbook": playbook,
        "fallback_action": fallback_action,
        "incident": incident,
        "alertname": alertname,
        "target_resource": target_resource,
        "namespace": namespace,
    }
    blocked = self._blocked_result(**blocked_kwargs)
    await self._maybe_record_controlled_executor_handoff(
        result=blocked,
        incident=incident,
        severity=severity,
    )
    return blocked
async def _maybe_record_controlled_executor_handoff(
    self,
    *,
    result: RepairCandidateResult,
    incident: Incident | None,
    severity: str | None = None,
) -> None:
    """Record an ``ansible_candidate_matched`` audit for a draft-ready candidate.

    Does nothing unless ``incident`` is given, ``result.metadata`` has a truthy
    ``repair_candidate_draft_ready`` and the draft package plus its
    ``candidate_promotion_contract`` are both dicts.

    Otherwise the injected audit builder is called. If it returns ``None`` a
    ``no_ansible_catalog_candidate`` handoff is produced. If it returns a
    payload, the injected audit recorder is awaited with the same arguments and
    the handoff reports whether the recorder returned a truthy value.

    The handoff dict is written in place to
    ``result.metadata["controlled_executor_handoff"]``, to the draft package,
    and to the draft package's ``awooop_work_item`` when that is a dict.

    Args:
        result: Blocked candidate result whose ``metadata`` is read and mutated.
        incident: Incident forwarded to the audit builder/recorder.
        severity: Preferred ``risk_level``; falls back to
            ``incident.severity.value`` (or ``""``) when falsy.
    """
    if incident is None or not result.metadata.get("repair_candidate_draft_ready"):
        return
    draft_package = result.metadata.get("repair_candidate_draft_package")
    if not isinstance(draft_package, dict):
        return
    promotion_contract = draft_package.get("candidate_promotion_contract")
    if not isinstance(promotion_contract, dict):
        return

    decision_path = "repair_candidate_controlled_queue"
    proposal_data = {
        "source": decision_path,
        "risk_level": severity or getattr(getattr(incident, "severity", None), "value", ""),
        "action": promotion_contract.get("repair_command_template") or "",
        "matched_playbook_id": draft_package.get("matched_playbook_id"),
        "repair_candidate_status": result.metadata.get("repair_candidate_status"),
        "route_id": promotion_contract.get("route_id"),
        "controlled_playbook_queue": True,
    }
    # Builder and recorder must see exactly the same arguments; keeping them
    # in one mapping prevents the two calls from drifting apart.
    audit_kwargs = {
        "incident": incident,
        "proposal_data": proposal_data,
        "decision_path": decision_path,
        "not_used_reason": (
            "repair candidate controlled queue ready; "
            "Ansible check-mode worker should claim candidate"
        ),
    }
    payload = self._ansible_audit_builder(**audit_kwargs)

    if payload is None:
        handoff = {
            "schema_version": "repair_candidate_controlled_executor_handoff_v1",
            "status": "no_ansible_catalog_candidate",
            "operation_type": "ansible_candidate_matched",
            "candidate_count": 0,
            "check_mode_worker": "awooop_ansible_check_mode_worker",
            "check_mode_queued": False,
            "runtime_execution_authorized": False,
        }
    else:
        written = await self._ansible_audit_recorder(**audit_kwargs)
        # A payload may carry `input`/`output` set to None or another
        # non-dict value. Fall back to empty sections rather than raising
        # after the recorder has already run, which would lose the handoff.
        input_section = payload.get("input")
        if not isinstance(input_section, dict):
            input_section = {}
        output_section = payload.get("output")
        if not isinstance(output_section, dict):
            output_section = {}
        candidates = input_section.get("executor_candidates")
        candidate_count = len(candidates) if isinstance(candidates, list) else 0
        handoff = {
            "schema_version": "repair_candidate_controlled_executor_handoff_v1",
            "status": (
                "ansible_candidate_matched_queued"
                if written
                else "ansible_candidate_matched_existing_or_write_skipped"
            ),
            "operation_type": payload.get("operation_type"),
            "decision_effect": output_section.get("decision_effect"),
            "next_required_step": output_section.get("next_required_step"),
            "candidate_count": candidate_count,
            "check_mode_worker": "awooop_ansible_check_mode_worker",
            "check_mode_queued": True,
            "runtime_execution_authorized": False,
        }

    # The same dict object is shared by all three locations.
    result.metadata["controlled_executor_handoff"] = handoff
    draft_package["controlled_executor_handoff"] = handoff
    work_item = draft_package.get("awooop_work_item")
    if isinstance(work_item, dict):
        work_item["controlled_executor_handoff"] = handoff
async def _collect_evidence(self, incident: Incident) -> EvidenceSnapshot | None:
try:
return await self._investigator.investigate(incident)

View File

@@ -1217,6 +1217,45 @@ def test_ansible_decision_audit_payload_is_dry_run_only() -> None:
assert payload["dry_run_result"]["check_mode_executed"] is False
def test_ansible_decision_audit_payload_marks_repair_candidate_queue_claimable() -> None:
    """The controlled-queue decision path yields a claimable check-mode payload.

    Builds a payload for a NodeExporterDown incident with
    ``decision_path="repair_candidate_controlled_queue"`` and checks that it is
    a dry-run ``ansible_candidate_matched`` operation whose output advertises
    the check-mode queue, while check mode itself has not been executed.
    """
    node_exporter_signal = SimpleNamespace(
        alert_name="NodeExporterDown",
        labels={"alertname": "NodeExporterDown", "instance": "node-exporter-188"},
        annotations={},
    )
    incident = SimpleNamespace(
        incident_id="INC-NODE-188",
        project_id="awoooi",
        alert_category="infrastructure",
        notification_type="TYPE-3",
        severity=SimpleNamespace(value="P3"),
        affected_services=["node-exporter-188"],
        signals=[node_exporter_signal],
    )
    proposal = {
        "source": "repair_candidate_controlled_queue",
        "risk_level": "medium",
        "action": "systemctl restart node-exporter-188",
    }

    payload = build_ansible_decision_audit_payload(
        incident=incident,
        proposal_data=proposal,
        decision_path="repair_candidate_controlled_queue",
        not_used_reason="repair candidate controlled queue ready",
    )

    assert payload is not None
    assert payload["operation_type"] == "ansible_candidate_matched"
    assert payload["status"] == "dry_run"

    output = payload["output"]
    assert output["decision_effect"] == "check_mode_queue_ready"
    assert output["next_required_step"] == (
        "awooop_ansible_check_mode_worker_claims_candidate"
    )

    # At least one executor candidate must be offered, but nothing has run yet.
    assert payload["input"]["executor_candidates"]
    assert payload["dry_run_result"]["check_mode_executed"] is False
def test_ansible_decision_audit_payload_exposes_check_mode_safety_flags() -> None:
incident = SimpleNamespace(
incident_id="INC-MOMO",

View File

@@ -298,11 +298,37 @@ async def test_candidate_blocked_observe_only_prompts_repair_playbook_draft() ->
risk_level=PlaybookRiskLevel.LOW,
)
playbook.repair_steps[0].action_type = ActionType.SSH_COMMAND
audit_calls: list[tuple[str, dict]] = []
def fake_ansible_audit_builder(**kwargs):
audit_calls.append(("builder", kwargs))
return {
"operation_type": "ansible_candidate_matched",
"input": {
"executor_candidates": [
{
"catalog_id": "ansible:188-ai-web",
"playbook_path": "infra/ansible/playbooks/188-ai-web.yml",
}
]
},
"output": {
"decision_effect": "check_mode_queue_ready",
"next_required_step": "awooop_ansible_check_mode_worker_claims_candidate",
},
}
async def fake_ansible_audit_recorder(**kwargs):
audit_calls.append(("recorder", kwargs))
return True
service = RepairCandidateService(
incident_service=FakeIncidentService(),
investigator=FakeInvestigator(_evidence(incident.incident_id)),
playbook_repository=FakePlaybookRepository(playbook),
auto_repair_service=FakeAutoRepairService(),
ansible_audit_builder=fake_ansible_audit_builder,
ansible_audit_recorder=fake_ansible_audit_recorder,
)
service._auto_repair = type(
"NoRouteAutoRepairService",
@@ -383,12 +409,26 @@ async def test_candidate_blocked_observe_only_prompts_repair_playbook_draft() ->
assert result.metadata["repair_candidate_promotion_contract"] == promotion_contract
assert "promotion=11/11" in result.metadata["repair_candidate_promotion_summary"]
assert "runtime=false" in result.metadata["repair_candidate_promotion_summary"]
handoff = result.metadata["controlled_executor_handoff"]
assert handoff["schema_version"] == "repair_candidate_controlled_executor_handoff_v1"
assert handoff["status"] == "ansible_candidate_matched_queued"
assert handoff["operation_type"] == "ansible_candidate_matched"
assert handoff["decision_effect"] == "check_mode_queue_ready"
assert handoff["next_required_step"] == "awooop_ansible_check_mode_worker_claims_candidate"
assert handoff["candidate_count"] == 1
assert handoff["check_mode_worker"] == "awooop_ansible_check_mode_worker"
assert handoff["check_mode_queued"] is True
assert handoff["runtime_execution_authorized"] is False
assert [name for name, _ in audit_calls] == ["builder", "recorder"]
assert audit_calls[0][1]["decision_path"] == "repair_candidate_controlled_queue"
assert audit_calls[1][1]["decision_path"] == "repair_candidate_controlled_queue"
work_item = draft_package["awooop_work_item"]
assert work_item["status"] == "controlled_playbook_queue_ready"
assert work_item["next_action"] == "queue_check_mode_then_controlled_apply"
assert work_item["owner_review_required"] is False
assert work_item["controlled_playbook_queue"] is True
assert work_item["runtime_execution_authorized"] is False
assert work_item["controlled_executor_handoff"] == handoff
assert work_item["candidate_promotion_contract"]["route_id"] == (
"host_service_route_after_owner_review"
)

View File

@@ -1,3 +1,24 @@
## 2026-06-27|D1M 修復候選接 executor worker:受控佇列寫入 Ansible check-mode receipt
**背景**:D1L 已把 Telegram / AwoooP 修復候選從「人工處置 / owner review」改為 `controlled_playbook_queue_ready`,但盤點後確認真正的 executor worker 只會 claim `automation_operation_log.operation_type='ansible_candidate_matched'`;修復候選草案若沒有同步寫出這個 row,頁面上看似進佇列,實際 worker 沒有任務可跑。
**完成內容**
- `RepairCandidateService` 新增 controlled executor handoff:`repair_candidate_draft_ready=true` 時,會用 `record_ansible_decision_audit()` 建立 / 確認 `ansible_candidate_matched` dry-run row,`awooop_ansible_check_mode_worker` 可 claim。
- D1M handoff metadata 會回寫到 `repair_candidate_draft_package` 與 AwoooP work item:`schema_version=repair_candidate_controlled_executor_handoff_v1`、`operation_type=ansible_candidate_matched`、`check_mode_worker=awooop_ansible_check_mode_worker`、`check_mode_queued=true`、`runtime_execution_authorized=false`。
- `build_ansible_decision_audit_payload()` 針對 `decision_path=repair_candidate_controlled_queue` 改為 `decision_effect=check_mode_queue_ready`、`next_required_step=awooop_ansible_check_mode_worker_claims_candidate`;舊 manual approval audit 仍維持 `audit_only`。
- 單元測試新增 fake audit builder / recorder,避免 unit test 連 DB,同時驗證 `builder -> recorder` 皆走 `repair_candidate_controlled_queue`。
**驗證結果**
- `py_compile`:`repair_candidate_service.py`、`awooop_ansible_audit_service.py`、`awooop_ansible_check_mode_service.py` 通過。
- `pytest apps/api/tests/test_repair_candidate_service.py apps/api/tests/test_awooop_truth_chain_service.py -q`:`62 passed`。
- Telegram / operator / AwoooP status-chain 回歸:`test_telegram_ai_automation_block.py`、`test_telegram_webhook_execution_handoff.py`、`test_telegram_message_templates.py`、`test_operator_outcome.py`、`test_awooop_operator_timeline_labels.py`:`169 passed`。
- `git diff --check`:通過。
**完成度 / 邊界**
- 修復候選 → executor worker 可 claim receipt:本地 `100%`。
- 真實 check-mode worker 執行、controlled apply、post-apply verifier、KM / PlayBook trust writeback、Telegram 實發:尚未在本段執行,仍需下一段接 live worker receipt。
- 本段沒有 SSH、沒有 Ansible apply、沒有 host write、沒有 restart / reboot / firewall / DB destructive action、沒有 Telegram 實發、沒有讀 secret value。
## 2026-06-27|D1L 正式站驗證:修復候選批准後進 AI 受控自動化佇列
**背景**:D1L 已把 Telegram / AwoooP 修復候選從「批准後仍卡人工 / owner review」改為「AI 受控自動化佇列」。本段補正式部署與頁面 smoke,避免把本地測試誤當正式站完成。