fix(api): enqueue repair candidates for ansible check mode
Some checks failed
Code Review / ai-code-review (push) Successful in 14s
CD Pipeline / tests (push) Successful in 1m40s
CD Pipeline / build-and-deploy (push) Successful in 6m32s
CD Pipeline / post-deploy-checks (push) Successful in 1m38s
Ansible / Reboot Recovery Contract / validate (push) Has been cancelled
Some checks failed
Code Review / ai-code-review (push) Successful in 14s
CD Pipeline / tests (push) Successful in 1m40s
CD Pipeline / build-and-deploy (push) Successful in 6m32s
CD Pipeline / post-deploy-checks (push) Successful in 1m38s
Ansible / Reboot Recovery Contract / validate (push) Has been cancelled
This commit is contained in:
@@ -542,10 +542,15 @@ def build_ansible_decision_audit_payload(
|
||||
or ""
|
||||
)[:240],
|
||||
}
|
||||
controlled_queue = decision_path == "repair_candidate_controlled_queue"
|
||||
output_payload = {
|
||||
"not_used_reason": not_used_reason,
|
||||
"decision_effect": "audit_only",
|
||||
"next_required_step": "wire approval_execution to Ansible check-mode before apply",
|
||||
"decision_effect": "check_mode_queue_ready" if controlled_queue else "audit_only",
|
||||
"next_required_step": (
|
||||
"awooop_ansible_check_mode_worker_claims_candidate"
|
||||
if controlled_queue
|
||||
else "wire approval_execution to Ansible check-mode before apply"
|
||||
),
|
||||
}
|
||||
return {
|
||||
"operation_type": "ansible_candidate_matched",
|
||||
|
||||
@@ -31,6 +31,10 @@ from src.models.playbook import RiskLevel as PlaybookRiskLevel
|
||||
from src.repositories.playbook_repository import get_playbook_repository
|
||||
from src.services.action_parser import ActionKind, parse_kubectl_action
|
||||
from src.services.auto_repair_service import AutoRepairService
|
||||
from src.services.awooop_ansible_audit_service import (
|
||||
build_ansible_decision_audit_payload,
|
||||
record_ansible_decision_audit,
|
||||
)
|
||||
from src.services.awooop_deeplinks import work_item_url
|
||||
from src.services.evidence_snapshot import EvidenceSnapshot
|
||||
from src.services.incident_service import get_incident_service
|
||||
@@ -71,11 +75,15 @@ class RepairCandidateService:
|
||||
investigator: Any | None = None,
|
||||
playbook_repository: Any | None = None,
|
||||
auto_repair_service: AutoRepairService | None = None,
|
||||
ansible_audit_builder: Any | None = None,
|
||||
ansible_audit_recorder: Any | None = None,
|
||||
) -> None:
|
||||
self._incident_service = incident_service or get_incident_service()
|
||||
self._investigator = investigator or get_pre_decision_investigator()
|
||||
self._playbook_repository = playbook_repository or get_playbook_repository()
|
||||
self._auto_repair = auto_repair_service or AutoRepairService()
|
||||
self._ansible_audit_builder = ansible_audit_builder or build_ansible_decision_audit_payload
|
||||
self._ansible_audit_recorder = ansible_audit_recorder or record_ansible_decision_audit
|
||||
|
||||
async def build_from_incident_id(
|
||||
self,
|
||||
@@ -139,7 +147,7 @@ class RepairCandidateService:
|
||||
)
|
||||
if not playbook_id:
|
||||
blockers.append("playbook_not_matched")
|
||||
return self._blocked_result(
|
||||
return await self._blocked_result_with_controlled_handoff(
|
||||
blockers=blockers,
|
||||
metadata=metadata,
|
||||
evidence=evidence,
|
||||
@@ -148,12 +156,13 @@ class RepairCandidateService:
|
||||
alertname=alertname,
|
||||
target_resource=target_resource,
|
||||
namespace=namespace,
|
||||
severity=severity,
|
||||
)
|
||||
|
||||
playbook = await self._playbook_repository.get_by_id(playbook_id)
|
||||
if not playbook:
|
||||
blockers.append("playbook_not_found")
|
||||
return self._blocked_result(
|
||||
return await self._blocked_result_with_controlled_handoff(
|
||||
blockers=blockers,
|
||||
metadata=metadata,
|
||||
evidence=evidence,
|
||||
@@ -162,6 +171,7 @@ class RepairCandidateService:
|
||||
alertname=alertname,
|
||||
target_resource=target_resource,
|
||||
namespace=namespace,
|
||||
severity=severity,
|
||||
)
|
||||
|
||||
metadata["playbook_trust"] = {
|
||||
@@ -181,7 +191,7 @@ class RepairCandidateService:
|
||||
step, step_blockers = self._select_executable_step(incident, playbook)
|
||||
blockers.extend(step_blockers)
|
||||
if blockers or step is None:
|
||||
return self._blocked_result(
|
||||
return await self._blocked_result_with_controlled_handoff(
|
||||
blockers=blockers,
|
||||
metadata=metadata,
|
||||
evidence=evidence,
|
||||
@@ -191,6 +201,7 @@ class RepairCandidateService:
|
||||
alertname=alertname,
|
||||
target_resource=target_resource,
|
||||
namespace=namespace,
|
||||
severity=severity,
|
||||
)
|
||||
|
||||
metadata["repair_candidate"] = {
|
||||
@@ -255,6 +266,116 @@ class RepairCandidateService:
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
async def _blocked_result_with_controlled_handoff(
    self,
    *,
    blockers: list[str],
    metadata: dict[str, Any],
    fallback_action: str,
    evidence: EvidenceSnapshot | None = None,
    playbook: Playbook | None = None,
    incident: Incident | None = None,
    alertname: str = "",
    target_resource: str = "",
    namespace: str = "",
    severity: str | None = None,
) -> RepairCandidateResult:
    """Build a blocked result and then attempt the controlled-executor handoff.

    Async wrapper around ``_blocked_result``: the blocked result is created
    first, then ``_maybe_record_controlled_executor_handoff`` is awaited with
    that result before it is returned.

    Args:
        blockers: Blocker codes forwarded to ``_blocked_result``.
        metadata: Metadata mapping forwarded to ``_blocked_result``.
        fallback_action: Fallback action forwarded to ``_blocked_result``.
        evidence: Optional evidence snapshot forwarded to ``_blocked_result``.
        playbook: Optional playbook forwarded to ``_blocked_result``.
        incident: Optional incident; forwarded to both ``_blocked_result``
            and the handoff step.
        alertname: Forwarded to ``_blocked_result``.
        target_resource: Forwarded to ``_blocked_result``.
        namespace: Forwarded to ``_blocked_result``.
        severity: Only handed to the handoff step; it is not passed to
            ``_blocked_result``.

    Returns:
        The result object produced by ``_blocked_result``.
    """
    # Everything except ``severity`` goes to the synchronous builder.
    blocked_kwargs = {
        "blockers": blockers,
        "metadata": metadata,
        "evidence": evidence,
        "playbook": playbook,
        "fallback_action": fallback_action,
        "incident": incident,
        "alertname": alertname,
        "target_resource": target_resource,
        "namespace": namespace,
    }
    blocked = self._blocked_result(**blocked_kwargs)

    # The handoff step decides for itself whether there is anything to record.
    await self._maybe_record_controlled_executor_handoff(
        result=blocked,
        incident=incident,
        severity=severity,
    )
    return blocked
|
||||
|
||||
async def _maybe_record_controlled_executor_handoff(
    self,
    *,
    result: RepairCandidateResult,
    incident: Incident | None,
    severity: str | None = None,
) -> None:
    """Attach a controlled-executor handoff record to a draft-ready result.

    Returns without doing anything unless all of these hold: ``incident`` is
    not ``None``, ``result.metadata["repair_candidate_draft_ready"]`` is
    truthy, ``result.metadata["repair_candidate_draft_package"]`` is a dict,
    and that package's ``candidate_promotion_contract`` is a dict.

    Otherwise the injected audit builder is called with
    ``decision_path="repair_candidate_controlled_queue"``. When it returns
    ``None`` a ``no_ansible_catalog_candidate`` handoff is produced and the
    recorder is not called. When it returns a payload, the injected audit
    recorder is awaited with the same arguments and the handoff status
    reflects whether the recorder returned a truthy value.

    The same handoff dict is stored under ``"controlled_executor_handoff"``
    in ``result.metadata``, in the draft package, and (when it is a dict) in
    the package's ``awooop_work_item``.

    Args:
        result: Blocked repair-candidate result whose metadata is inspected
            and mutated in place.
        incident: Incident passed to the audit builder/recorder; ``None``
            disables the handoff.
        severity: Preferred ``risk_level`` for the proposal; when falsy,
            ``incident.severity.value`` is used if present, else ``""``.
    """
    if incident is None:
        return
    meta = result.metadata
    if not meta.get("repair_candidate_draft_ready"):
        return

    draft_package = meta.get("repair_candidate_draft_package")
    if not isinstance(draft_package, dict):
        return
    contract = draft_package.get("candidate_promotion_contract")
    if not isinstance(contract, dict):
        return

    incident_severity = getattr(incident, "severity", None)
    proposal_data = {
        "source": "repair_candidate_controlled_queue",
        "risk_level": severity or getattr(incident_severity, "value", ""),
        "action": contract.get("repair_command_template") or "",
        "matched_playbook_id": draft_package.get("matched_playbook_id"),
        "repair_candidate_status": meta.get("repair_candidate_status"),
        "route_id": contract.get("route_id"),
        "controlled_playbook_queue": True,
    }

    # Builder and recorder receive exactly the same keyword arguments.
    audit_kwargs = {
        "incident": incident,
        "proposal_data": proposal_data,
        "decision_path": "repair_candidate_controlled_queue",
        "not_used_reason": (
            "repair candidate controlled queue ready; "
            "Ansible check-mode worker should claim candidate"
        ),
    }
    payload = self._ansible_audit_builder(**audit_kwargs)

    if payload is None:
        # No payload from the builder: nothing is recorded or queued.
        handoff = {
            "schema_version": "repair_candidate_controlled_executor_handoff_v1",
            "status": "no_ansible_catalog_candidate",
            "operation_type": "ansible_candidate_matched",
            "candidate_count": 0,
            "check_mode_worker": "awooop_ansible_check_mode_worker",
            "check_mode_queued": False,
            "runtime_execution_authorized": False,
        }
    else:
        written = await self._ansible_audit_recorder(**audit_kwargs)

        executor_candidates = payload.get("input", {}).get("executor_candidates")
        if isinstance(executor_candidates, list):
            candidate_count = len(executor_candidates)
        else:
            candidate_count = 0

        if written:
            status = "ansible_candidate_matched_queued"
        else:
            status = "ansible_candidate_matched_existing_or_write_skipped"

        output = payload.get("output", {})
        # NOTE(review): check_mode_queued is True even when the recorder
        # returned a falsy value; presumably a falsy return means the row
        # already exists -- confirm against the recorder's contract.
        handoff = {
            "schema_version": "repair_candidate_controlled_executor_handoff_v1",
            "status": status,
            "operation_type": payload.get("operation_type"),
            "decision_effect": output.get("decision_effect"),
            "next_required_step": output.get("next_required_step"),
            "candidate_count": candidate_count,
            "check_mode_worker": "awooop_ansible_check_mode_worker",
            "check_mode_queued": True,
            "runtime_execution_authorized": False,
        }

    # One shared dict instance is published to every consumer location.
    meta["controlled_executor_handoff"] = handoff
    draft_package["controlled_executor_handoff"] = handoff
    work_item = draft_package.get("awooop_work_item")
    if isinstance(work_item, dict):
        work_item["controlled_executor_handoff"] = handoff
|
||||
|
||||
async def _collect_evidence(self, incident: Incident) -> EvidenceSnapshot | None:
|
||||
try:
|
||||
return await self._investigator.investigate(incident)
|
||||
|
||||
@@ -1217,6 +1217,45 @@ def test_ansible_decision_audit_payload_is_dry_run_only() -> None:
|
||||
assert payload["dry_run_result"]["check_mode_executed"] is False
|
||||
|
||||
|
||||
def test_ansible_decision_audit_payload_marks_repair_candidate_queue_claimable() -> None:
    """The controlled-queue decision path yields a check-mode-claimable payload.

    Builds an audit payload for a ``NodeExporterDown`` incident with
    ``decision_path="repair_candidate_controlled_queue"`` and checks that the
    payload is a dry run, advertises ``check_mode_queue_ready`` with the
    worker-claim next step, carries executor candidates, and reports that
    check mode has not been executed.
    """
    controlled_queue = "repair_candidate_controlled_queue"

    node_exporter_signal = SimpleNamespace(
        alert_name="NodeExporterDown",
        labels={"alertname": "NodeExporterDown", "instance": "node-exporter-188"},
        annotations={},
    )
    incident = SimpleNamespace(
        incident_id="INC-NODE-188",
        project_id="awoooi",
        alert_category="infrastructure",
        notification_type="TYPE-3",
        severity=SimpleNamespace(value="P3"),
        affected_services=["node-exporter-188"],
        signals=[node_exporter_signal],
    )
    proposal = {
        "source": controlled_queue,
        "risk_level": "medium",
        "action": "systemctl restart node-exporter-188",
    }

    payload = build_ansible_decision_audit_payload(
        incident=incident,
        proposal_data=proposal,
        decision_path=controlled_queue,
        not_used_reason="repair candidate controlled queue ready",
    )

    assert payload is not None
    assert payload["operation_type"] == "ansible_candidate_matched"
    assert payload["status"] == "dry_run"

    output = payload["output"]
    assert output["decision_effect"] == "check_mode_queue_ready"
    assert output["next_required_step"] == (
        "awooop_ansible_check_mode_worker_claims_candidate"
    )

    assert payload["input"]["executor_candidates"]
    assert payload["dry_run_result"]["check_mode_executed"] is False
|
||||
|
||||
|
||||
def test_ansible_decision_audit_payload_exposes_check_mode_safety_flags() -> None:
|
||||
incident = SimpleNamespace(
|
||||
incident_id="INC-MOMO",
|
||||
|
||||
@@ -298,11 +298,37 @@ async def test_candidate_blocked_observe_only_prompts_repair_playbook_draft() ->
|
||||
risk_level=PlaybookRiskLevel.LOW,
|
||||
)
|
||||
playbook.repair_steps[0].action_type = ActionType.SSH_COMMAND
|
||||
audit_calls: list[tuple[str, dict]] = []
|
||||
|
||||
def fake_ansible_audit_builder(**kwargs):
|
||||
audit_calls.append(("builder", kwargs))
|
||||
return {
|
||||
"operation_type": "ansible_candidate_matched",
|
||||
"input": {
|
||||
"executor_candidates": [
|
||||
{
|
||||
"catalog_id": "ansible:188-ai-web",
|
||||
"playbook_path": "infra/ansible/playbooks/188-ai-web.yml",
|
||||
}
|
||||
]
|
||||
},
|
||||
"output": {
|
||||
"decision_effect": "check_mode_queue_ready",
|
||||
"next_required_step": "awooop_ansible_check_mode_worker_claims_candidate",
|
||||
},
|
||||
}
|
||||
|
||||
async def fake_ansible_audit_recorder(**kwargs):
|
||||
audit_calls.append(("recorder", kwargs))
|
||||
return True
|
||||
|
||||
service = RepairCandidateService(
|
||||
incident_service=FakeIncidentService(),
|
||||
investigator=FakeInvestigator(_evidence(incident.incident_id)),
|
||||
playbook_repository=FakePlaybookRepository(playbook),
|
||||
auto_repair_service=FakeAutoRepairService(),
|
||||
ansible_audit_builder=fake_ansible_audit_builder,
|
||||
ansible_audit_recorder=fake_ansible_audit_recorder,
|
||||
)
|
||||
service._auto_repair = type(
|
||||
"NoRouteAutoRepairService",
|
||||
@@ -383,12 +409,26 @@ async def test_candidate_blocked_observe_only_prompts_repair_playbook_draft() ->
|
||||
assert result.metadata["repair_candidate_promotion_contract"] == promotion_contract
|
||||
assert "promotion=11/11" in result.metadata["repair_candidate_promotion_summary"]
|
||||
assert "runtime=false" in result.metadata["repair_candidate_promotion_summary"]
|
||||
handoff = result.metadata["controlled_executor_handoff"]
|
||||
assert handoff["schema_version"] == "repair_candidate_controlled_executor_handoff_v1"
|
||||
assert handoff["status"] == "ansible_candidate_matched_queued"
|
||||
assert handoff["operation_type"] == "ansible_candidate_matched"
|
||||
assert handoff["decision_effect"] == "check_mode_queue_ready"
|
||||
assert handoff["next_required_step"] == "awooop_ansible_check_mode_worker_claims_candidate"
|
||||
assert handoff["candidate_count"] == 1
|
||||
assert handoff["check_mode_worker"] == "awooop_ansible_check_mode_worker"
|
||||
assert handoff["check_mode_queued"] is True
|
||||
assert handoff["runtime_execution_authorized"] is False
|
||||
assert [name for name, _ in audit_calls] == ["builder", "recorder"]
|
||||
assert audit_calls[0][1]["decision_path"] == "repair_candidate_controlled_queue"
|
||||
assert audit_calls[1][1]["decision_path"] == "repair_candidate_controlled_queue"
|
||||
work_item = draft_package["awooop_work_item"]
|
||||
assert work_item["status"] == "controlled_playbook_queue_ready"
|
||||
assert work_item["next_action"] == "queue_check_mode_then_controlled_apply"
|
||||
assert work_item["owner_review_required"] is False
|
||||
assert work_item["controlled_playbook_queue"] is True
|
||||
assert work_item["runtime_execution_authorized"] is False
|
||||
assert work_item["controlled_executor_handoff"] == handoff
|
||||
assert work_item["candidate_promotion_contract"]["route_id"] == (
|
||||
"host_service_route_after_owner_review"
|
||||
)
|
||||
|
||||
@@ -1,3 +1,24 @@
|
||||
## 2026-06-27|D1M 修復候選接 executor worker:受控佇列寫入 Ansible check-mode receipt
|
||||
|
||||
**背景**:D1L 已把 Telegram / AwoooP 修復候選從「人工處置 / owner review」改為 `controlled_playbook_queue_ready`,但盤點後確認真正的 executor worker 只會 claim `automation_operation_log.operation_type='ansible_candidate_matched'`;修復候選草案若沒有同步寫出這個 row,頁面上看似進佇列,實際 worker 沒有任務可跑。
|
||||
|
||||
**完成內容**:
|
||||
- `RepairCandidateService` 新增 controlled executor handoff:當 `repair_candidate_draft_ready=true` 時,會用 `record_ansible_decision_audit()` 建立 / 確認 `ansible_candidate_matched` dry-run row,讓 `awooop_ansible_check_mode_worker` 可 claim。
|
||||
- D1M handoff metadata 會回寫到 `repair_candidate_draft_package` 與 AwoooP work item:`schema_version=repair_candidate_controlled_executor_handoff_v1`、`operation_type=ansible_candidate_matched`、`check_mode_worker=awooop_ansible_check_mode_worker`、`check_mode_queued=true`、`runtime_execution_authorized=false`。
|
||||
- `build_ansible_decision_audit_payload()` 針對 `decision_path=repair_candidate_controlled_queue` 改為 `decision_effect=check_mode_queue_ready`,`next_required_step=awooop_ansible_check_mode_worker_claims_candidate`;舊 manual approval audit 仍維持 `audit_only`。
|
||||
- 單元測試新增 fake audit builder / recorder,避免 unit test 連 DB,同時驗證 `builder -> recorder` 皆走 `repair_candidate_controlled_queue`。
|
||||
|
||||
**驗證結果**:
|
||||
- `py_compile`:`repair_candidate_service.py`、`awooop_ansible_audit_service.py`、`awooop_ansible_check_mode_service.py` 通過。
|
||||
- `pytest apps/api/tests/test_repair_candidate_service.py apps/api/tests/test_awooop_truth_chain_service.py -q`:`62 passed`。
|
||||
- Telegram / operator / AwoooP status-chain 回歸:`test_telegram_ai_automation_block.py`、`test_telegram_webhook_execution_handoff.py`、`test_telegram_message_templates.py`、`test_operator_outcome.py`、`test_awooop_operator_timeline_labels.py`:`169 passed`。
|
||||
- `git diff --check`:通過。
|
||||
|
||||
**完成度 / 邊界**:
|
||||
- 修復候選 → executor worker 可 claim receipt:本地 `100%`。
|
||||
- 真實 check-mode worker 執行、controlled apply、post-apply verifier、KM / PlayBook trust writeback、Telegram 實發:尚未在本段執行,仍需下一段接 live worker receipt。
|
||||
- 本段沒有 SSH、沒有 Ansible apply、沒有 host write、沒有 restart / reboot / firewall / DB destructive action、沒有 Telegram 實發、沒有讀 secret value。
|
||||
|
||||
## 2026-06-27|D1L 正式站驗證:修復候選批准後進 AI 受控自動化佇列
|
||||
|
||||
**背景**:D1L 已把 Telegram / AwoooP 修復候選從「批准後仍卡人工 / owner review」改為「AI 受控自動化佇列」。本段補正式部署與頁面 smoke,避免把本地測試誤當正式站完成。
|
||||
|
||||
Reference in New Issue
Block a user