feat(agents): classify runtime apply repair readback

2026-06-28 19:30:27 +08:00
parent d5c9d467c1
commit 688bfd7740
2 changed files with 299 additions and 0 deletions
--- a/apps/api/src/services/ai_agent_autonomous_runtime_control.py
+++ b/apps/api/src/services/ai_agent_autonomous_runtime_control.py
@@ -230,6 +230,155 @@ def _latest_flow_closure(
    }


+def _latest_failure_classification(
+    *,
+    operation_latest_rows: Iterable[Mapping[str, Any] | Any],
+    verifier_latest_rows: Iterable[Mapping[str, Any] | Any],
+    latest_flow_closure: Mapping[str, Any],
+) -> dict[str, Any]:
+    """Classify the outcome of the latest controlled apply for the readback.
+
+    The first ``ansible_apply_executed`` row found in ``operation_latest_rows``
+    is taken as the latest apply (this presumes the caller passes rows newest
+    first -- TODO confirm).  The verifier row whose ``apply_op_id`` equals that
+    apply's ``op_id`` supplies the verification result, if there is one.
+
+    Args:
+        operation_latest_rows: Operation rows; each is normalised with
+            ``_row_mapping`` before use.
+        verifier_latest_rows: Verifier rows, normalised the same way.
+        latest_flow_closure: Closure summary; only ``closed`` and the three
+            ``has_*`` receipt flags are read, and each counts only when it is
+            literally ``True``.
+
+    Returns:
+        A dict with ``schema_version``, ``classification``, ``action``,
+        ``target_selector`` and ``evidence``; ``safe_next_steps`` is added
+        whenever an apply row was found.  The ``output_tail_in_readback`` and
+        ``unredacted_output_required`` evidence flags are always ``False``.
+    """
+
+    # Normalise both inputs up front so every row is converted exactly once.
+    normalised_operations = [_row_mapping(raw) for raw in operation_latest_rows]
+    normalised_verifiers = [_row_mapping(raw) for raw in verifier_latest_rows]
+
+    latest_apply = None
+    for candidate in normalised_operations:
+        if str(candidate.get("operation_type") or "") == "ansible_apply_executed":
+            latest_apply = candidate
+            break
+
+    if latest_apply is None:
+        # Nothing to classify yet: report the neutral "waiting" state.
+        return {
+            "schema_version": "ai_agent_executor_failure_classification_v1",
+            "classification": "no_controlled_apply_observed",
+            "action": "wait_for_controlled_apply_receipt",
+            "target_selector": {},
+            "evidence": {
+                "latest_flow_closed": False,
+                "output_tail_in_readback": False,
+                "unredacted_output_required": False,
+            },
+        }
+
+    apply_op_id = str(latest_apply.get("op_id") or "")
+    incident_id = str(latest_apply.get("incident_id") or "")
+    apply_succeeded = _int_value(latest_apply.get("returncode")) == 0
+
+    # Without an op id there is nothing to correlate, so no verifier is matched.
+    verifier = {}
+    if apply_op_id:
+        for candidate in normalised_verifiers:
+            if str(candidate.get("apply_op_id") or "") == apply_op_id:
+                verifier = candidate
+                break
+
+    verification_result = str(verifier.get("verification_result") or "").lower()
+    latest_flow_closed = latest_flow_closure.get("closed") is True
+
+    if apply_succeeded:
+        # An absent verifier result ("") is tolerated on the success path.
+        if latest_flow_closed and verification_result in {"success", ""}:
+            classification = "latest_controlled_apply_closed_success"
+            action = "keep_receipt_chain_closed"
+        else:
+            # NOTE(review): a zero return code with a non-success verifier
+            # result also ends up here, even when the flow is closed --
+            # confirm that reporting it as a receipt gap is intended.
+            classification = "controlled_apply_success_receipt_gap"
+            action = "backfill_missing_verifier_km_or_telegram_receipt"
+    elif latest_flow_closed:
+        classification = "closed_failed_apply_requires_ai_repair"
+        action = "queue_check_mode_replay_and_playbook_repair_candidate"
+    else:
+        classification = "failed_apply_receipt_gap_requires_backfill_then_repair"
+        action = "backfill_missing_receipts_then_queue_repair_candidate"
+
+    target_selector = {
+        "incident_id": incident_id or None,
+        "apply_op_id": apply_op_id or None,
+        "parent_op_id": latest_apply.get("parent_op_id"),
+        "catalog_id": latest_apply.get("catalog_id"),
+        "playbook_path": latest_apply.get("playbook_path"),
+        "execution_mode": latest_apply.get("execution_mode"),
+    }
+    receipt_flags = {
+        flag: latest_flow_closure.get(flag) is True
+        for flag in (
+            "has_post_apply_verifier",
+            "has_km_writeback",
+            "has_telegram_receipt",
+        )
+    }
+    evidence = {
+        "operation_status": latest_apply.get("status"),
+        # The raw return code is echoed as stored, not the normalised integer.
+        "returncode": latest_apply.get("returncode"),
+        "verification_result": verification_result or None,
+        "latest_flow_closed": latest_flow_closed,
+        **receipt_flags,
+        "output_tail_in_readback": False,
+        "unredacted_output_required": False,
+    }
+
+    return {
+        "schema_version": "ai_agent_executor_failure_classification_v1",
+        "classification": classification,
+        "action": action,
+        "target_selector": target_selector,
+        "evidence": evidence,
+        "safe_next_steps": [
+            "run_no_write_check_mode_replay",
+            "extract_sanitized_failed_task_summary",
+            "write_km_playbook_repair_candidate",
+            "retry_controlled_apply_only_after_check_mode_passes",
+        ],
+    }
+
+
+def _controlled_retry_package(classification: Mapping[str, Any]) -> dict[str, Any]:
+    """Describe the no-write retry package that follows a failure classification.
+
+    Repair is considered required only when ``classification["classification"]``
+    is one of the two failed-apply values.  Every ``*_required`` flag that is
+    not hard-coded mirrors that decision.  ``controlled_apply_retry_allowed_now``,
+    ``opens_legacy_runner``, ``destructive_rollback_allowed``,
+    ``unredacted_output_required`` and ``secret_value_collection_allowed`` are
+    always ``False``.
+
+    Args:
+        classification: The classification dict; ``target_selector`` is used
+            only when it is a mapping, otherwise an empty selector is assumed.
+
+    Returns:
+        A dict with schema version ``ai_agent_controlled_retry_package_v1``.
+        ``package_id`` is ``ansible_retry:<first 8 chars of apply_op_id>`` when
+        repair is required and an apply op id is present, else ``None``.
+    """
+
+    selector = classification.get("target_selector")
+    if not isinstance(selector, Mapping):
+        selector = {}
+
+    apply_op_id = str(selector.get("apply_op_id") or "")
+    repair_required = classification.get("classification") in {
+        "closed_failed_apply_requires_ai_repair",
+        "failed_apply_receipt_gap_requires_backfill_then_repair",
+    }
+
+    if repair_required:
+        # NOTE(review): the status is "ready" even when no apply op id is
+        # available and package_id is therefore None -- confirm this is wanted.
+        package_id = f"ansible_retry:{apply_op_id[:8]}" if apply_op_id else None
+        status = "ready_for_no_write_check_mode_replay"
+        next_ai_action = "run_no_write_check_mode_replay"
+    else:
+        package_id = None
+        status = "not_required_for_latest_apply"
+        next_ai_action = "keep_latest_apply_receipts"
+
+    source_of_truth = {
+        "catalog_id": selector.get("catalog_id"),
+        "playbook_path": selector.get("playbook_path"),
+        "source_diff_required_before_retry": True,
+        "failed_task_summary_required": True,
+    }
+    preflight = {
+        "no_write_check_mode_replay_required": repair_required,
+        "reuse_parent_check_mode_op_id": selector.get("parent_op_id"),
+        "unredacted_output_required": False,
+        "secret_value_collection_allowed": False,
+    }
+    apply_gate = {
+        "controlled_apply_retry_allowed_now": False,
+        "opens_legacy_runner": False,
+        "requires_check_mode_success_before_apply": repair_required,
+    }
+    rollback = {
+        "rollback_candidate_required": repair_required,
+        "destructive_rollback_allowed": False,
+        "rollback_plan_source": "playbook_repair_candidate_after_failed_task_summary",
+    }
+    post_apply = {
+        "post_apply_verifier_required": repair_required,
+        "km_playbook_trust_writeback_required": repair_required,
+        "telegram_receipt_required": repair_required,
+    }
+
+    return {
+        "schema_version": "ai_agent_controlled_retry_package_v1",
+        "package_id": package_id,
+        "status": status,
+        # Copy so callers cannot mutate the classification through the package.
+        "target_selector": dict(selector),
+        "source_of_truth": source_of_truth,
+        "preflight": preflight,
+        "apply_gate": apply_gate,
+        "rollback": rollback,
+        "post_apply": post_apply,
+        "next_ai_action": next_ai_action,
+    }
+
+
 def classify_deploy_control_plane_observation(
    *,
    run_status: str,
@@ -446,6 +595,12 @@ def build_runtime_receipt_readback_from_rows(
        km_latest_rows=km_latest,
        telegram_latest_rows=telegram_latest,
    )
+    latest_failure = _latest_failure_classification(
+        operation_latest_rows=operation_latest,
+        verifier_latest_rows=verifier_latest,
+        latest_flow_closure=latest_closure,
+    )
+    retry_package = _controlled_retry_package(latest_failure)
    apply_summary = operation_summary.get("ansible_apply_executed") or {}
    readback = {
        "schema_version": _LIVE_READBACK_SCHEMA_VERSION,
@@ -529,6 +684,8 @@ def build_runtime_receipt_readback_from_rows(
            ),
        },
        "latest_flow_closure": latest_closure,
+        "latest_failure_classification": latest_failure,
+        "controlled_retry_package": retry_package,
    }
    if error_type:
        readback["error"] = {
@@ -562,6 +719,23 @@ def _attach_runtime_receipt_readback(
            if (readback.get("latest_flow_closure") or {}).get("closed") is True
            else 0
        ),
+        "live_executor_latest_apply_repair_required_count": (
+            1
+            if (
+                (readback.get("latest_failure_classification") or {}).get("classification")
+                in {
+                    "closed_failed_apply_requires_ai_repair",
+                    "failed_apply_receipt_gap_requires_backfill_then_repair",
+                }
+            )
+            else 0
+        ),
+        "live_executor_retry_package_ready_count": (
+            1
+            if (readback.get("controlled_retry_package") or {}).get("status")
+            == "ready_for_no_write_check_mode_replay"
+            else 0
+        ),
    })
    return payload

--- a/apps/api/tests/test_ai_agent_autonomous_runtime_control.py
+++ b/apps/api/tests/test_ai_agent_autonomous_runtime_control.py
@@ -219,3 +219,128 @@ def test_runtime_receipt_readback_summarizes_live_executor_closure_rows():
        "closed": True,
        "missing": [],
    }
+    assert readback["latest_failure_classification"]["classification"] == (
+        "latest_controlled_apply_closed_success"
+    )
+    assert readback["controlled_retry_package"]["status"] == "not_required_for_latest_apply"
+
+
+def test_runtime_receipt_readback_classifies_closed_failed_apply_as_ai_repair():
+    """A failed apply with verifier, KM and Telegram rows is flagged for AI repair."""
+
+    apply_op_id = "94925d5e-6fdc-49c3-90e8-f0a0d57a6a58"
+    parent_op_id = "8b555f41-e81f-4d8e-956b-fb20d358db63"
+    incident_id = "INC-20260628-A40A9A"
+    catalog_id = "ansible:188-ai-web"
+    playbook_path = "infra/ansible/playbooks/188-ai-web.yml"
+
+    # Fixture rows: one failed apply plus the matching receipts for the incident.
+    failed_apply_row = {
+        "op_id": apply_op_id,
+        "parent_op_id": parent_op_id,
+        "operation_type": "ansible_apply_executed",
+        "status": "failed",
+        "actor": "ansible_controlled_apply_worker",
+        "incident_id": incident_id,
+        "catalog_id": catalog_id,
+        "playbook_path": playbook_path,
+        "execution_mode": "controlled_apply",
+        "returncode": "2",
+        "duration_ms": 4797,
+    }
+    verifier_row = {
+        "id": "evidence-1",
+        "incident_id": incident_id,
+        "verification_result": "failed",
+        "apply_op_id": apply_op_id,
+        "catalog_id": catalog_id,
+        "playbook_path": playbook_path,
+        "returncode": "2",
+    }
+    km_row = {
+        "id": "km-1",
+        "title": "AI 自動修復沉澱：INC-20260628-A40A9A",
+        "related_incident_id": incident_id,
+        "related_playbook_id": catalog_id,
+        "path_type": "ansible_apply_receipt:94925d5e",
+        "status": "REVIEW",
+        "created_by": "ai_agent_ansible_worker",
+    }
+    telegram_row = {
+        "message_id": "telegram-row-1",
+        "run_id": "telegram-run-1",
+        "message_type": "final",
+        "send_status": "sent",
+        "provider_message_id": "32016",
+        "incident_id": incident_id,
+        "action": "controlled_apply_result",
+    }
+
+    readback = build_runtime_receipt_readback_from_rows(
+        project_id="awoooi",
+        db_read_status="ok",
+        operation_count_rows=[
+            {
+                "operation_type": "ansible_apply_executed",
+                "status": "failed",
+                "total": 1,
+                "recent": 1,
+            },
+        ],
+        operation_latest_rows=[failed_apply_row],
+        verifier_count_rows=[
+            {"verification_result": "failed", "total": 1, "recent": 1},
+        ],
+        verifier_latest_rows=[verifier_row],
+        km_count_rows=[
+            {"status": "REVIEW", "total": 1, "recent": 1},
+        ],
+        km_latest_rows=[km_row],
+        telegram_count_rows=[
+            {"send_status": "sent", "total": 1, "recent": 1},
+        ],
+        telegram_latest_rows=[telegram_row],
+    )
+
+    # The classification must name the failed apply and request AI repair.
+    classification = readback["latest_failure_classification"]
+    assert classification["classification"] == "closed_failed_apply_requires_ai_repair"
+    assert classification["action"] == "queue_check_mode_replay_and_playbook_repair_candidate"
+    assert classification["target_selector"] == {
+        "incident_id": incident_id,
+        "apply_op_id": apply_op_id,
+        "parent_op_id": parent_op_id,
+        "catalog_id": catalog_id,
+        "playbook_path": playbook_path,
+        "execution_mode": "controlled_apply",
+    }
+
+    evidence = classification["evidence"]
+    assert evidence["returncode"] == "2"
+    assert evidence["verification_result"] == "failed"
+    assert evidence["latest_flow_closed"] is True
+    assert evidence["output_tail_in_readback"] is False
+    assert evidence["unredacted_output_required"] is False
+    assert classification["safe_next_steps"] == [
+        "run_no_write_check_mode_replay",
+        "extract_sanitized_failed_task_summary",
+        "write_km_playbook_repair_candidate",
+        "retry_controlled_apply_only_after_check_mode_passes",
+    ]
+
+    # The retry package is ready for a no-write replay but never allows apply now.
+    retry = readback["controlled_retry_package"]
+    assert retry["package_id"] == "ansible_retry:94925d5e"
+    assert retry["status"] == "ready_for_no_write_check_mode_replay"
+    assert retry["source_of_truth"] == {
+        "catalog_id": catalog_id,
+        "playbook_path": playbook_path,
+        "source_diff_required_before_retry": True,
+        "failed_task_summary_required": True,
+    }
+
+    preflight = retry["preflight"]
+    assert preflight["no_write_check_mode_replay_required"] is True
+    assert preflight["reuse_parent_check_mode_op_id"] == parent_op_id
+
+    apply_gate = retry["apply_gate"]
+    assert apply_gate["controlled_apply_retry_allowed_now"] is False
+    assert apply_gate["requires_check_mode_success_before_apply"] is True
+    assert retry["rollback"]["destructive_rollback_allowed"] is False
+    assert retry["post_apply"] == {
+        "post_apply_verifier_required": True,
+        "km_playbook_trust_writeback_required": True,
+        "telegram_receipt_required": True,
+    }
+    assert retry["next_ai_action"] == "run_no_write_check_mode_replay"