feat(agents): classify runtime apply repair readback
This commit is contained in:
@@ -230,6 +230,155 @@ def _latest_flow_closure(
|
||||
}
|
||||
|
||||
|
||||
def _latest_failure_classification(
    *,
    operation_latest_rows: Iterable[Mapping[str, Any] | Any],
    verifier_latest_rows: Iterable[Mapping[str, Any] | Any],
    latest_flow_closure: Mapping[str, Any],
) -> dict[str, Any]:
    """Classify the first controlled-apply row without exposing command output.

    The first row of ``operation_latest_rows`` whose ``operation_type`` is
    ``"ansible_apply_executed"`` is treated as the latest apply.
    NOTE(review): this presumes callers pass rows newest-first -- confirm.

    Args:
        operation_latest_rows: Operation rows; each is normalised through
            ``_row_mapping`` before use.
        verifier_latest_rows: Verifier rows, normalised the same way; the
            first one whose ``apply_op_id`` equals the apply's ``op_id`` is
            used.
        latest_flow_closure: Closure summary; the ``closed``,
            ``has_post_apply_verifier``, ``has_km_writeback`` and
            ``has_telegram_receipt`` keys are read and each counts as set only
            when it is exactly ``True``.

    Returns:
        An ``ai_agent_executor_failure_classification_v1`` dict. When no apply
        row is present it carries only ``classification``, ``action``, an
        empty ``target_selector`` and a minimal ``evidence`` section;
        otherwise it also carries the full evidence and ``safe_next_steps``.
    """
    # Both inputs are normalised eagerly, before any early return.
    apply_candidates = [_row_mapping(item) for item in operation_latest_rows]
    verifier_candidates = [_row_mapping(item) for item in verifier_latest_rows]

    apply_row = None
    for candidate in apply_candidates:
        if str(candidate.get("operation_type") or "") == "ansible_apply_executed":
            apply_row = candidate
            break

    if apply_row is None:
        return {
            "schema_version": "ai_agent_executor_failure_classification_v1",
            "classification": "no_controlled_apply_observed",
            "action": "wait_for_controlled_apply_receipt",
            "target_selector": {},
            "evidence": {
                "latest_flow_closed": False,
                "output_tail_in_readback": False,
                "unredacted_output_required": False,
            },
        }

    apply_op_id = str(apply_row.get("op_id") or "")
    incident_id = str(apply_row.get("incident_id") or "")
    returncode = _int_value(apply_row.get("returncode"))

    # Without an apply op id there is nothing to correlate on, so no verifier
    # row is ever selected.
    verifier_row = {}
    if apply_op_id:
        for candidate in verifier_candidates:
            if str(candidate.get("apply_op_id") or "") == apply_op_id:
                verifier_row = candidate
                break

    verification_result = str(verifier_row.get("verification_result") or "").lower()
    flow_closed = latest_flow_closure.get("closed") is True

    if returncode == 0:
        # An absent verifier result ("") is accepted alongside "success".
        if verification_result in {"success", ""} and flow_closed:
            classification, action = (
                "latest_controlled_apply_closed_success",
                "keep_receipt_chain_closed",
            )
        else:
            classification, action = (
                "controlled_apply_success_receipt_gap",
                "backfill_missing_verifier_km_or_telegram_receipt",
            )
    elif flow_closed:
        classification, action = (
            "closed_failed_apply_requires_ai_repair",
            "queue_check_mode_replay_and_playbook_repair_candidate",
        )
    else:
        classification, action = (
            "failed_apply_receipt_gap_requires_backfill_then_repair",
            "backfill_missing_receipts_then_queue_repair_candidate",
        )

    target_selector = {
        "incident_id": incident_id or None,
        "apply_op_id": apply_op_id or None,
        "parent_op_id": apply_row.get("parent_op_id"),
        "catalog_id": apply_row.get("catalog_id"),
        "playbook_path": apply_row.get("playbook_path"),
        "execution_mode": apply_row.get("execution_mode"),
    }
    evidence = {
        "operation_status": apply_row.get("status"),
        # The raw stored value is reported, not the parsed integer.
        "returncode": apply_row.get("returncode"),
        "verification_result": verification_result or None,
        "latest_flow_closed": flow_closed,
        "has_post_apply_verifier": latest_flow_closure.get("has_post_apply_verifier") is True,
        "has_km_writeback": latest_flow_closure.get("has_km_writeback") is True,
        "has_telegram_receipt": latest_flow_closure.get("has_telegram_receipt") is True,
        "output_tail_in_readback": False,
        "unredacted_output_required": False,
    }
    return {
        "schema_version": "ai_agent_executor_failure_classification_v1",
        "classification": classification,
        "action": action,
        "target_selector": target_selector,
        "evidence": evidence,
        "safe_next_steps": [
            "run_no_write_check_mode_replay",
            "extract_sanitized_failed_task_summary",
            "write_km_playbook_repair_candidate",
            "retry_controlled_apply_only_after_check_mode_passes",
        ],
    }
|
||||
|
||||
|
||||
def _controlled_retry_package(classification: Mapping[str, Any]) -> dict[str, Any]:
    """Build the next no-write repair package from the public failure classification.

    Args:
        classification: Failure-classification payload. Only its
            ``"classification"`` label and ``"target_selector"`` mapping are
            read.

    Returns:
        An ``ai_agent_controlled_retry_package_v1`` dict. Repair is required
        only when the label is one of the two failed-apply classifications;
        in that case ``status`` is ``"ready_for_no_write_check_mode_replay"``
        and ``package_id`` is ``"ansible_retry:<first 8 chars of apply_op_id>"``
        (``None`` when no apply op id is available). For every other label the
        package reports ``"not_required_for_latest_apply"``. The gate flags
        ``controlled_apply_retry_allowed_now``, ``opens_legacy_runner`` and
        ``destructive_rollback_allowed`` are always ``False``.
    """
    selector = classification.get("target_selector")
    if not isinstance(selector, Mapping):
        # A missing or malformed selector degrades to "no target".
        selector = {}
    apply_op_id = str(selector.get("apply_op_id") or "")

    label = classification.get("classification")
    # Checking the type first keeps an unhashable label (e.g. a list) from
    # raising TypeError in the set lookup; such input means "no repair".
    repair_required = isinstance(label, str) and label in {
        "closed_failed_apply_requires_ai_repair",
        "failed_apply_receipt_gap_requires_backfill_then_repair",
    }

    if repair_required and apply_op_id:
        package_id = f"ansible_retry:{apply_op_id[:8]}"
    else:
        package_id = None

    return {
        "schema_version": "ai_agent_controlled_retry_package_v1",
        "package_id": package_id,
        "status": (
            "ready_for_no_write_check_mode_replay"
            if repair_required
            else "not_required_for_latest_apply"
        ),
        # Copy so callers cannot mutate the classification through the package.
        "target_selector": dict(selector),
        "source_of_truth": {
            "catalog_id": selector.get("catalog_id"),
            "playbook_path": selector.get("playbook_path"),
            "source_diff_required_before_retry": True,
            "failed_task_summary_required": True,
        },
        "preflight": {
            "no_write_check_mode_replay_required": repair_required,
            "reuse_parent_check_mode_op_id": selector.get("parent_op_id"),
            "unredacted_output_required": False,
            "secret_value_collection_allowed": False,
        },
        "apply_gate": {
            "controlled_apply_retry_allowed_now": False,
            "opens_legacy_runner": False,
            "requires_check_mode_success_before_apply": repair_required,
        },
        "rollback": {
            "rollback_candidate_required": repair_required,
            "destructive_rollback_allowed": False,
            "rollback_plan_source": "playbook_repair_candidate_after_failed_task_summary",
        },
        "post_apply": {
            "post_apply_verifier_required": repair_required,
            "km_playbook_trust_writeback_required": repair_required,
            "telegram_receipt_required": repair_required,
        },
        "next_ai_action": (
            "run_no_write_check_mode_replay"
            if repair_required
            else "keep_latest_apply_receipts"
        ),
    }
|
||||
|
||||
|
||||
def classify_deploy_control_plane_observation(
|
||||
*,
|
||||
run_status: str,
|
||||
@@ -446,6 +595,12 @@ def build_runtime_receipt_readback_from_rows(
|
||||
km_latest_rows=km_latest,
|
||||
telegram_latest_rows=telegram_latest,
|
||||
)
|
||||
latest_failure = _latest_failure_classification(
|
||||
operation_latest_rows=operation_latest,
|
||||
verifier_latest_rows=verifier_latest,
|
||||
latest_flow_closure=latest_closure,
|
||||
)
|
||||
retry_package = _controlled_retry_package(latest_failure)
|
||||
apply_summary = operation_summary.get("ansible_apply_executed") or {}
|
||||
readback = {
|
||||
"schema_version": _LIVE_READBACK_SCHEMA_VERSION,
|
||||
@@ -529,6 +684,8 @@ def build_runtime_receipt_readback_from_rows(
|
||||
),
|
||||
},
|
||||
"latest_flow_closure": latest_closure,
|
||||
"latest_failure_classification": latest_failure,
|
||||
"controlled_retry_package": retry_package,
|
||||
}
|
||||
if error_type:
|
||||
readback["error"] = {
|
||||
@@ -562,6 +719,23 @@ def _attach_runtime_receipt_readback(
|
||||
if (readback.get("latest_flow_closure") or {}).get("closed") is True
|
||||
else 0
|
||||
),
|
||||
"live_executor_latest_apply_repair_required_count": (
|
||||
1
|
||||
if (
|
||||
(readback.get("latest_failure_classification") or {}).get("classification")
|
||||
in {
|
||||
"closed_failed_apply_requires_ai_repair",
|
||||
"failed_apply_receipt_gap_requires_backfill_then_repair",
|
||||
}
|
||||
)
|
||||
else 0
|
||||
),
|
||||
"live_executor_retry_package_ready_count": (
|
||||
1
|
||||
if (readback.get("controlled_retry_package") or {}).get("status")
|
||||
== "ready_for_no_write_check_mode_replay"
|
||||
else 0
|
||||
),
|
||||
})
|
||||
return payload
|
||||
|
||||
|
||||
@@ -219,3 +219,128 @@ def test_runtime_receipt_readback_summarizes_live_executor_closure_rows():
|
||||
"closed": True,
|
||||
"missing": [],
|
||||
}
|
||||
assert readback["latest_failure_classification"]["classification"] == (
|
||||
"latest_controlled_apply_closed_success"
|
||||
)
|
||||
assert readback["controlled_retry_package"]["status"] == "not_required_for_latest_apply"
|
||||
|
||||
|
||||
def test_runtime_receipt_readback_classifies_closed_failed_apply_as_ai_repair():
    """A failed apply with verifier, KM and Telegram rows is routed to AI repair.

    The fixture supplies one failed ``ansible_apply_executed`` operation
    (returncode ``"2"``) together with a matching failed verifier row, a KM
    row and a sent Telegram row. The readback must classify it as
    ``closed_failed_apply_requires_ai_repair`` and offer a retry package that
    is ready for a no-write check-mode replay.
    """
    apply_op_id = "94925d5e-6fdc-49c3-90e8-f0a0d57a6a58"
    incident_id = "INC-20260628-A40A9A"
    parent_op_id = "8b555f41-e81f-4d8e-956b-fb20d358db63"
    catalog_id = "ansible:188-ai-web"
    playbook_path = "infra/ansible/playbooks/188-ai-web.yml"

    operation_row = {
        "op_id": apply_op_id,
        "parent_op_id": parent_op_id,
        "operation_type": "ansible_apply_executed",
        "status": "failed",
        "actor": "ansible_controlled_apply_worker",
        "incident_id": incident_id,
        "catalog_id": catalog_id,
        "playbook_path": playbook_path,
        "execution_mode": "controlled_apply",
        "returncode": "2",
        "duration_ms": 4797,
    }
    verifier_row = {
        "id": "evidence-1",
        "incident_id": incident_id,
        "verification_result": "failed",
        "apply_op_id": apply_op_id,
        "catalog_id": catalog_id,
        "playbook_path": playbook_path,
        "returncode": "2",
    }
    km_row = {
        "id": "km-1",
        "title": "AI 自動修復沉澱:INC-20260628-A40A9A",
        "related_incident_id": incident_id,
        "related_playbook_id": catalog_id,
        "path_type": "ansible_apply_receipt:94925d5e",
        "status": "REVIEW",
        "created_by": "ai_agent_ansible_worker",
    }
    telegram_row = {
        "message_id": "telegram-row-1",
        "run_id": "telegram-run-1",
        "message_type": "final",
        "send_status": "sent",
        "provider_message_id": "32016",
        "incident_id": incident_id,
        "action": "controlled_apply_result",
    }

    readback = build_runtime_receipt_readback_from_rows(
        project_id="awoooi",
        db_read_status="ok",
        operation_count_rows=[
            {
                "operation_type": "ansible_apply_executed",
                "status": "failed",
                "total": 1,
                "recent": 1,
            },
        ],
        operation_latest_rows=[operation_row],
        verifier_count_rows=[
            {"verification_result": "failed", "total": 1, "recent": 1},
        ],
        verifier_latest_rows=[verifier_row],
        km_count_rows=[
            {"status": "REVIEW", "total": 1, "recent": 1},
        ],
        km_latest_rows=[km_row],
        telegram_count_rows=[
            {"send_status": "sent", "total": 1, "recent": 1},
        ],
        telegram_latest_rows=[telegram_row],
    )

    # Failure classification: label, action, selector, evidence, next steps.
    classification = readback["latest_failure_classification"]
    assert classification["classification"] == "closed_failed_apply_requires_ai_repair"
    assert classification["action"] == "queue_check_mode_replay_and_playbook_repair_candidate"
    assert classification["target_selector"] == {
        "incident_id": incident_id,
        "apply_op_id": apply_op_id,
        "parent_op_id": parent_op_id,
        "catalog_id": catalog_id,
        "playbook_path": playbook_path,
        "execution_mode": "controlled_apply",
    }
    evidence = classification["evidence"]
    assert evidence["returncode"] == "2"
    assert evidence["verification_result"] == "failed"
    assert evidence["latest_flow_closed"] is True
    assert evidence["output_tail_in_readback"] is False
    assert evidence["unredacted_output_required"] is False
    assert classification["safe_next_steps"] == [
        "run_no_write_check_mode_replay",
        "extract_sanitized_failed_task_summary",
        "write_km_playbook_repair_candidate",
        "retry_controlled_apply_only_after_check_mode_passes",
    ]

    # Retry package: replay-ready, but the apply gate itself stays shut.
    retry = readback["controlled_retry_package"]
    assert retry["package_id"] == "ansible_retry:94925d5e"
    assert retry["status"] == "ready_for_no_write_check_mode_replay"
    assert retry["source_of_truth"] == {
        "catalog_id": catalog_id,
        "playbook_path": playbook_path,
        "source_diff_required_before_retry": True,
        "failed_task_summary_required": True,
    }
    assert retry["preflight"]["no_write_check_mode_replay_required"] is True
    assert retry["preflight"]["reuse_parent_check_mode_op_id"] == parent_op_id
    assert retry["apply_gate"]["controlled_apply_retry_allowed_now"] is False
    assert retry["apply_gate"]["requires_check_mode_success_before_apply"] is True
    assert retry["rollback"]["destructive_rollback_allowed"] is False
    assert retry["post_apply"] == {
        "post_apply_verifier_required": True,
        "km_playbook_trust_writeback_required": True,
        "telegram_receipt_required": True,
    }
    assert retry["next_ai_action"] == "run_no_write_check_mode_replay"
|
||||
|
||||
Reference in New Issue
Block a user