feat(awooop): close autonomous learning loop readback
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 2m47s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-06-29 16:52:07 +08:00
parent 8397178525
commit 9ca6eec2ee
7 changed files with 479 additions and 5 deletions

View File

@@ -38,6 +38,7 @@ _EXECUTOR_OPERATION_TYPES = (
"ansible_candidate_matched",
"ansible_check_mode_executed",
"ansible_apply_executed",
"ansible_learning_writeback_recorded",
"ansible_rollback_executed",
"ansible_execution_skipped",
)
@@ -781,11 +782,206 @@ def _build_agent_decision_wiring(
}
def _learning_loop_stage(
*,
stage_id: str,
display_name: str,
evidence_sources: list[str],
total: int,
recent: int,
required_for_learning_loop: bool,
writes_runtime_state: bool,
next_action_if_missing: str,
) -> dict[str, Any]:
present = total > 0
return {
"stage_id": stage_id,
"display_name": display_name,
"evidence_sources": evidence_sources,
"present": present,
"total": max(0, total),
"recent": max(0, recent),
"required_for_learning_loop": required_for_learning_loop,
"writes_runtime_state": writes_runtime_state,
"next_action_if_missing": None if present else next_action_if_missing,
}
def _build_learning_loop_readback(
    *,
    operation_summary: Mapping[str, Any],
    verifier_summary: Mapping[str, Any],
    km_summary: Mapping[str, Any],
    playbook_trust_summary: Mapping[str, Any],
    log_integration_taxonomy: Mapping[str, Any],
    agent_decision_wiring: Mapping[str, Any],
    latest_flow_closure: Mapping[str, Any],
    latest_failure_classification: Mapping[str, Any],
    controlled_retry_package: Mapping[str, Any],
    loop_ledger: Mapping[str, Any],
) -> dict[str, Any]:
    """Expose the verified execution to KM/PlayBook learning loop.

    Combines the already-computed summaries into seven required stages and
    reports which of them are still missing.

    Args:
        operation_summary: Summary queried for the
            ``ansible_learning_writeback_recorded`` operation type.
        verifier_summary: Summary backing the verified-execution stage.
        km_summary: Summary backing the KM writeback stage.
        playbook_trust_summary: Summary backing the PlayBook trust stage.
        log_integration_taxonomy: Readback whose ``rollups`` mapping supplies
            the learning-source-family and classified-event counters.
        agent_decision_wiring: Readback whose ``status`` must be
            ``"completed"`` for the next-decision stage.
        latest_flow_closure: Flags ``has_post_apply_verifier`` and
            ``has_km_writeback`` gate the verifier and KM stages.
        latest_failure_classification: Mapping whose ``classification`` drives
            the repair-feedback stage.
        controlled_retry_package: Mapping whose ``schema_version`` must be
            ``"ai_agent_controlled_retry_package_v1"`` for repair feedback.
        loop_ledger: Mapping whose ``closed`` flag must be ``True`` for the
            next-decision stage.

    Returns:
        A dict with ``schema_version``, ``status`` (``"completed"`` when no
        required stage is missing, else ``"in_progress"``), the ``stages``
        list, ``missing_required_stage_ids``, static ``public_safety`` flags
        and numeric ``rollups``.
    """
    taxonomy_rollups = log_integration_taxonomy.get("rollups")
    if not isinstance(taxonomy_rollups, Mapping):
        taxonomy_rollups = {}
    learning_source_family_count = _int_value(
        taxonomy_rollups.get("learning_source_family_count")
    )
    classified_event_total = _int_value(taxonomy_rollups.get("classified_event_total"))
    recent_classified_event_total = _int_value(
        taxonomy_rollups.get("recent_classified_event_total")
    )

    # NOTE(review): _trace_total/_trace_recent are defined elsewhere in the
    # file; presumably they return total/recent counters for a summary
    # (optionally for one operation type) — confirm against their definitions.
    verifier_total = _trace_total(verifier_summary)
    verifier_recent = _trace_recent(verifier_summary)
    km_total = _trace_total(km_summary)
    km_recent = _trace_recent(km_summary)
    learning_writeback_total = _trace_total(
        operation_summary,
        "ansible_learning_writeback_recorded",
    )
    learning_writeback_recent = _trace_recent(
        operation_summary,
        "ansible_learning_writeback_recorded",
    )
    trust_total = _trace_total(playbook_trust_summary)
    trust_recent = _trace_recent(playbook_trust_summary)

    # Gates that switch a stage off regardless of historical counters. The
    # same gate is applied to both total and recent so a stage never reports
    # "not present" alongside a positive recent count.
    has_post_apply_verifier = latest_flow_closure.get("has_post_apply_verifier") is True
    has_km_writeback = latest_flow_closure.get("has_km_writeback") is True
    similar_case_sources_active = learning_source_family_count > 0

    # Normalise to a string first: a missing/None classification must count as
    # "not classified", whereas ``None not in {"", ...}`` would be True.
    classification = str(latest_failure_classification.get("classification") or "")
    repair_feedback_ready = (
        classification not in {"", "no_controlled_apply_observed"}
        and controlled_retry_package.get("schema_version")
        == "ai_agent_controlled_retry_package_v1"
    )
    next_decision_ready = (
        agent_decision_wiring.get("status") == "completed"
        and loop_ledger.get("closed") is True
    )

    stages = [
        _learning_loop_stage(
            stage_id="verified_execution_outcome",
            display_name="Verified execution outcome available",
            evidence_sources=["incident_evidence.post_execution_state"],
            total=verifier_total if has_post_apply_verifier else 0,
            recent=verifier_recent if has_post_apply_verifier else 0,
            required_for_learning_loop=True,
            writes_runtime_state=True,
            next_action_if_missing="run_post_apply_verifier_and_attach_apply_op_id",
        ),
        _learning_loop_stage(
            stage_id="km_learning_writeback",
            display_name="KM learning writeback recorded",
            evidence_sources=["knowledge_entries"],
            total=km_total if has_km_writeback else 0,
            recent=km_recent if has_km_writeback else 0,
            required_for_learning_loop=True,
            writes_runtime_state=True,
            next_action_if_missing="write_verified_execution_summary_to_km",
        ),
        _learning_loop_stage(
            stage_id="learning_repair_record",
            display_name="Learning repository repair result recorded",
            evidence_sources=[
                "automation_operation_log:ansible_learning_writeback_recorded",
                "learning_repository",
            ],
            total=learning_writeback_total,
            recent=learning_writeback_recent,
            required_for_learning_loop=True,
            writes_runtime_state=True,
            next_action_if_missing="record_learning_repair_result_after_verifier",
        ),
        _learning_loop_stage(
            stage_id="playbook_trust_delta",
            display_name="PlayBook trust signal available",
            evidence_sources=["playbooks"],
            total=trust_total,
            recent=trust_recent,
            required_for_learning_loop=True,
            writes_runtime_state=True,
            next_action_if_missing="write_playbook_trust_delta_after_verifier",
        ),
        _learning_loop_stage(
            stage_id="similar_case_context",
            display_name="Similar-case context sources active",
            evidence_sources=["log_integration_taxonomy", "knowledge_entries", "playbooks"],
            total=classified_event_total if similar_case_sources_active else 0,
            recent=recent_classified_event_total if similar_case_sources_active else 0,
            required_for_learning_loop=True,
            writes_runtime_state=False,
            next_action_if_missing="activate_learning_source_families_for_similar_case_retrieval",
        ),
        _learning_loop_stage(
            stage_id="repair_candidate_feedback",
            display_name="Repair or no-repair feedback classified",
            evidence_sources=["latest_failure_classification", "controlled_retry_package"],
            total=1 if repair_feedback_ready else 0,
            recent=1 if repair_feedback_ready else 0,
            required_for_learning_loop=True,
            writes_runtime_state=False,
            next_action_if_missing="classify_latest_apply_result_and_prepare_retry_package",
        ),
        _learning_loop_stage(
            stage_id="next_decision_context",
            display_name="Next decision can consume learned context",
            evidence_sources=["agent_decision_wiring", "autonomous_execution_loop_ledger"],
            total=1 if next_decision_ready else 0,
            recent=1 if next_decision_ready else 0,
            required_for_learning_loop=True,
            writes_runtime_state=False,
            next_action_if_missing="complete_decision_wiring_and_execution_loop_before_learning_release",
        ),
    ]

    required_stages = [
        stage for stage in stages if stage["required_for_learning_loop"] is True
    ]
    missing_required = [
        str(stage["stage_id"])
        for stage in required_stages
        if stage["present"] is not True
    ]
    present_required_count = sum(
        1 for stage in required_stages if stage["present"] is True
    )

    return {
        "schema_version": "ai_agent_learning_loop_readback_v1",
        "status": "completed" if not missing_required else "in_progress",
        "stages": stages,
        "missing_required_stage_ids": missing_required,
        "public_safety": {
            "stores_raw_logs": False,
            "stores_secret_values": False,
            "stores_unredacted_telegram_payload": False,
            "executes_on_read": False,
            "critical_break_glass_still_required": True,
        },
        "rollups": {
            "stage_count": len(stages),
            "required_stage_count": len(required_stages),
            "required_stage_present_count": present_required_count,
            "required_stage_missing_count": len(missing_required),
            "verified_execution_total": verifier_total,
            "km_writeback_total": km_total,
            "learning_writeback_total": learning_writeback_total,
            "learning_writeback_recent": learning_writeback_recent,
            "playbook_trust_total": trust_total,
            "learning_source_family_count": learning_source_family_count,
            "similar_case_source_total": classified_event_total,
            "repair_feedback_ready_count": 1 if repair_feedback_ready else 0,
            "next_decision_ready_count": 1 if next_decision_ready else 0,
        },
    }
def _build_work_item_progress(
*,
trace_ledger: Mapping[str, Any],
log_integration_taxonomy: Mapping[str, Any],
agent_decision_wiring: Mapping[str, Any],
learning_loop: Mapping[str, Any],
db_read_status: str,
) -> dict[str, Any]:
"""Build ordered work items that the UI and agent can keep advancing."""
@@ -810,6 +1006,15 @@ def _build_work_item_progress(
and agent_decision_wiring.get("schema_version") == "ai_agent_decision_wiring_readback_v1"
and decision_wiring_missing == 0
)
learning_rollups = learning_loop.get("rollups")
if not isinstance(learning_rollups, Mapping):
learning_rollups = {}
learning_loop_missing = _int_value(learning_rollups.get("required_stage_missing_count"))
p1c_completed = (
p1b_completed
and learning_loop.get("schema_version") == "ai_agent_learning_loop_readback_v1"
and learning_loop_missing == 0
)
deployed_readback_complete = (
db_read_status == "ok"
and trace_ledger.get("schema_version") == "ai_agent_autonomous_trace_ledger_v1"
@@ -873,8 +1078,9 @@ def _build_work_item_progress(
"work_item_id": "P1-C-learning-loop",
"priority": "P1-C",
"title": "KM / PlayBook trust learning loop",
"status": "pending",
"status": "completed" if p1c_completed else "in_progress" if p1b_completed else "pending",
"exit_criteria": "verified execution updates KM entries, trust delta, similar-case clusters, and repair candidates",
"remaining_learning_loop_stage_count": learning_loop_missing,
},
{
"work_item_id": "P1-D-alert-noise-reduction",
@@ -1753,10 +1959,23 @@ def build_runtime_receipt_readback_from_rows(
loop_ledger=loop_ledger,
latest_flow_closure=latest_closure,
)
learning_loop = _build_learning_loop_readback(
operation_summary=operation_summary,
verifier_summary=verifier_summary,
km_summary=km_summary,
playbook_trust_summary=playbook_trust_summary,
log_integration_taxonomy=log_integration_taxonomy,
agent_decision_wiring=agent_decision_wiring,
latest_flow_closure=latest_closure,
latest_failure_classification=latest_failure,
controlled_retry_package=retry_package,
loop_ledger=loop_ledger,
)
work_item_progress = _build_work_item_progress(
trace_ledger=trace_ledger,
log_integration_taxonomy=log_integration_taxonomy,
agent_decision_wiring=agent_decision_wiring,
learning_loop=learning_loop,
db_read_status=db_read_status,
)
apply_summary = operation_summary.get("ansible_apply_executed") or {}
@@ -1879,6 +2098,7 @@ def build_runtime_receipt_readback_from_rows(
"trace_ledger": trace_ledger,
"log_integration_taxonomy": log_integration_taxonomy,
"agent_decision_wiring": agent_decision_wiring,
"learning_loop": learning_loop,
"work_item_progress": work_item_progress,
}
if error_type:
@@ -2002,6 +2222,31 @@ def _attach_runtime_receipt_readback(
if (readback.get("agent_decision_wiring") or {}).get("status") == "completed"
else 0
),
"live_learning_loop_stage_count": _int_value(
((readback.get("learning_loop") or {}).get("rollups") or {}).get(
"stage_count"
)
),
"live_learning_loop_required_present_count": _int_value(
((readback.get("learning_loop") or {}).get("rollups") or {}).get(
"required_stage_present_count"
)
),
"live_learning_loop_required_missing_count": _int_value(
((readback.get("learning_loop") or {}).get("rollups") or {}).get(
"required_stage_missing_count"
)
),
"live_learning_loop_complete_count": (
1
if (readback.get("learning_loop") or {}).get("status") == "completed"
else 0
),
"live_learning_loop_similar_case_source_count": _int_value(
((readback.get("learning_loop") or {}).get("rollups") or {}).get(
"similar_case_source_total"
)
),
"live_work_item_count": _int_value(
((readback.get("work_item_progress") or {}).get("rollups") or {}).get(
"work_item_count"
@@ -2421,6 +2666,7 @@ _RUNTIME_OPERATION_COUNTS_SQL = """
'ansible_candidate_matched',
'ansible_check_mode_executed',
'ansible_apply_executed',
'ansible_learning_writeback_recorded',
'ansible_rollback_executed',
'ansible_execution_skipped'
)
@@ -2452,6 +2698,7 @@ _RUNTIME_OPERATION_LATEST_SQL = """
'ansible_candidate_matched',
'ansible_check_mode_executed',
'ansible_apply_executed',
'ansible_learning_writeback_recorded',
'ansible_rollback_executed',
'ansible_execution_skipped'
)

View File

@@ -659,6 +659,109 @@ def _post_apply_action_label(claim: AnsibleCheckModeClaim, *, apply_op_id: str)
)
async def _record_learning_writeback_receipt(
    claim: AnsibleCheckModeClaim,
    result: AnsibleRunResult,
    *,
    apply_op_id: str,
    verification_result: str,
    action_label: str,
    project_id: str,
) -> bool:
    """Persist the post-verifier learning receipt after LearningService accepts it.

    Steps:
      1. Skip if a receipt for ``apply_op_id`` already exists, so the learning
         service is not fed the same outcome twice.
      2. Report the verification result to the learning service.
      3. Insert an ``ansible_learning_writeback_recorded`` row whose
         ``parent_op_id`` is the apply operation.

    Args:
        claim: Claim providing ``incident_id``, ``catalog_id`` and
            ``apply_playbook_path``.
        result: Run result providing ``returncode`` and ``timed_out``.
        apply_op_id: Id of the apply operation; cast to ``uuid`` in SQL.
        verification_result: Outcome label; ``"success"`` marks the receipt
            output as successful.
        action_label: Action description forwarded to the learning service.
        project_id: Project whose database context is opened.

    Returns:
        ``True`` only when a new receipt row was inserted. ``False`` when a
        receipt already exists, or when the existence check, the learning
        service call, or the insert fails (failures are logged, not raised).
    """
    matched_playbook_id = str(claim.catalog_id or "")[:36] or None

    # The NOT EXISTS guard on the INSERT below only deduplicates the receipt
    # row. Without this pre-check, every re-run for the same apply op would
    # call the learning service again before the guard is reached.
    # NOTE(review): assumes record_verification_result is not idempotent per
    # apply op — confirm against LearningService.
    try:
        async with get_db_context(project_id) as db:
            existing = await db.execute(
                text("""
                    SELECT 1
                    FROM automation_operation_log existing
                    WHERE existing.operation_type = 'ansible_learning_writeback_recorded'
                      AND existing.parent_op_id = CAST(:parent_op_id AS uuid)
                    LIMIT 1
                """),
                {"parent_op_id": apply_op_id},
            )
            already_recorded = existing.scalar() is not None
    except Exception as exc:
        # Best-effort: if we cannot tell whether a receipt exists, do not risk
        # a duplicate learning update; a later run can retry.
        logger.warning(
            "ansible_learning_writeback_receipt_failed",
            incident_id=claim.incident_id,
            catalog_id=claim.catalog_id,
            apply_op_id=apply_op_id,
            stage="precheck",
            error=str(exc),
        )
        return False
    if already_recorded:
        return False

    try:
        from src.services.learning_service import get_learning_service

        await get_learning_service().record_verification_result(
            incident_id=claim.incident_id,
            action_taken=action_label,
            verification_result=verification_result,
            matched_playbook_id=matched_playbook_id,
        )
    except Exception as exc:
        logger.warning(
            "ansible_post_apply_trust_learning_writeback_failed",
            incident_id=claim.incident_id,
            catalog_id=claim.catalog_id,
            apply_op_id=apply_op_id,
            error=str(exc),
        )
        return False

    try:
        input_payload = {
            "schema_version": "ansible_learning_writeback_receipt_v1",
            "incident_id": claim.incident_id,
            "catalog_id": claim.catalog_id,
            "playbook_path": claim.apply_playbook_path,
            "apply_op_id": apply_op_id,
            "verification_result": verification_result,
            "matched_playbook_id": matched_playbook_id,
            "learning_repository": "repair_result",
            "playbook_trust_update_attempted": matched_playbook_id is not None,
            "stores_raw_logs": False,
            "stores_secret_values": False,
        }
        output_payload = {
            "learning_recorded": True,
            "success": verification_result == "success",
            "returncode": result.returncode,
            "timed_out": result.timed_out,
        }
        async with get_db_context(project_id) as db:
            # The NOT EXISTS guard stays in place to cover concurrent writers
            # that pass the pre-check at the same time.
            inserted = await db.execute(
                text("""
                    INSERT INTO automation_operation_log (
                        operation_type, actor, status, incident_id,
                        input, output, dry_run_result,
                        parent_op_id, tags
                    )
                    SELECT
                        'ansible_learning_writeback_recorded',
                        'ansible_controlled_apply_worker',
                        'success',
                        :incident_db_id,
                        CAST(:input AS jsonb),
                        CAST(:output AS jsonb),
                        '{}'::jsonb,
                        CAST(:parent_op_id AS uuid),
                        :tags
                    WHERE NOT EXISTS (
                        SELECT 1
                        FROM automation_operation_log existing
                        WHERE existing.operation_type = 'ansible_learning_writeback_recorded'
                          AND existing.parent_op_id = CAST(:parent_op_id AS uuid)
                    )
                    RETURNING op_id
                """),
                {
                    "incident_db_id": _automation_operation_log_incident_id(claim.incident_id),
                    "input": json.dumps(input_payload, ensure_ascii=False),
                    "output": json.dumps(output_payload, ensure_ascii=False),
                    "parent_op_id": apply_op_id,
                    "tags": [
                        "ansible",
                        "controlled_apply",
                        "learning_writeback",
                        "playbook_trust",
                        "ai_agent_auto_execution",
                    ],
                },
            )
            # RETURNING yields a row only when the INSERT actually happened.
            return inserted.scalar() is not None
    except Exception as exc:
        logger.warning(
            "ansible_learning_writeback_receipt_failed",
            incident_id=claim.incident_id,
            catalog_id=claim.catalog_id,
            apply_op_id=apply_op_id,
            error=str(exc),
        )
        return False
async def _record_post_apply_verifier_and_learning(
claim: AnsibleCheckModeClaim,
result: AnsibleRunResult,
@@ -690,7 +793,7 @@ async def _record_post_apply_verifier_and_learning(
f"incident={claim.incident_id}; catalog={claim.catalog_id}; "
f"result={verification_result}; returncode={result.returncode}; apply_op={apply_op_id}"
)
status = {"verification": False, "learning": False}
status = {"verification": False, "learning": False, "trust_learning": False}
try:
async with get_db_context(project_id) as db:
@@ -802,6 +905,14 @@ async def _record_post_apply_verifier_and_learning(
apply_op_id=apply_op_id,
error=str(exc),
)
status["trust_learning"] = await _record_learning_writeback_receipt(
claim,
result,
apply_op_id=apply_op_id,
verification_result=verification_result,
action_label=action_label,
project_id=project_id,
)
return status
@@ -853,6 +964,7 @@ async def backfill_missing_auto_repair_execution_receipts_once(
"written": 0,
"verification_written": 0,
"learning_written": 0,
"trust_learning_written": 0,
"skipped": 0,
"error": None,
}
@@ -898,9 +1010,15 @@ async def backfill_missing_auto_repair_execution_receipts_once(
WHERE km.related_incident_id = coalesce(
apply.incident_id::text,
apply.input ->> 'incident_id'
)
)
AND km.path_type = 'ansible_apply_receipt:' || left(apply.op_id::text, 8)
)
OR NOT EXISTS (
SELECT 1
FROM automation_operation_log learning
WHERE learning.operation_type = 'ansible_learning_writeback_recorded'
AND learning.parent_op_id = apply.op_id
)
)
ORDER BY apply.created_at DESC
LIMIT :limit
@@ -934,6 +1052,8 @@ async def backfill_missing_auto_repair_execution_receipts_once(
stats["verification_written"] += 1
if writeback.get("learning"):
stats["learning_written"] += 1
if writeback.get("trust_learning"):
stats["trust_learning_written"] += 1
except Exception as exc:
stats["error"] = f"{type(exc).__name__}: {exc}"[:500]
logger.warning("ansible_auto_repair_execution_receipt_backfill_failed", **stats)

View File

@@ -68,6 +68,13 @@ def test_ai_agent_autonomous_runtime_control_exposes_reports_and_executor_receip
assert data["rollups"]["direct_bot_api_allowed_count"] == 0
assert data["rollups"]["legacy_policy_overridden_count"] >= 4
assert data["runtime_receipt_readback"]["db_read_status"] == "not_queried"
assert data["runtime_receipt_readback"]["learning_loop"]["status"] == "in_progress"
assert (
data["runtime_receipt_readback"]["learning_loop"]["rollups"][
"required_stage_missing_count"
]
== 7
)
def test_ai_agent_autonomous_runtime_control_exposes_internal_control_loop():
@@ -161,6 +168,12 @@ def test_runtime_receipt_readback_summarizes_live_executor_closure_rows():
"total": 1,
"recent": 1,
},
{
"operation_type": "ansible_learning_writeback_recorded",
"status": "success",
"total": 1,
"recent": 1,
},
],
operation_latest_rows=[
{
@@ -201,6 +214,18 @@ def test_runtime_receipt_readback_summarizes_live_executor_closure_rows():
"returncode": "0",
"duration_ms": 7727,
},
{
"op_id": "learning-op",
"parent_op_id": apply_op_id,
"operation_type": "ansible_learning_writeback_recorded",
"status": "success",
"actor": "ansible_controlled_apply_worker",
"incident_id": incident_id,
"catalog_id": "ansible:188-momo-backup-user",
"playbook_path": "infra/ansible/playbooks/188-momo-backup-user.yml",
"execution_mode": "learning_writeback",
"returncode": "0",
},
],
auto_repair_count_rows=[
{"result_status": "success", "total": 1, "recent": 1},
@@ -416,6 +441,31 @@ def test_runtime_receipt_readback_summarizes_live_executor_closure_rows():
assert decision_wiring["rollups"]["check_mode_total"] == 1
assert decision_wiring["rollups"]["controlled_apply_total"] == 1
assert decision_wiring["rollups"]["required_stage_missing_count"] == 0
learning_loop = readback["learning_loop"]
assert learning_loop["schema_version"] == "ai_agent_learning_loop_readback_v1"
assert learning_loop["status"] == "completed"
assert learning_loop["missing_required_stage_ids"] == []
assert {
stage["stage_id"]
for stage in learning_loop["stages"]
if stage["required_for_learning_loop"]
} == {
"verified_execution_outcome",
"km_learning_writeback",
"learning_repair_record",
"playbook_trust_delta",
"similar_case_context",
"repair_candidate_feedback",
"next_decision_context",
}
assert learning_loop["rollups"]["required_stage_present_count"] == 7
assert learning_loop["rollups"]["required_stage_missing_count"] == 0
assert learning_loop["rollups"]["learning_writeback_total"] == 1
assert learning_loop["rollups"]["learning_writeback_recent"] == 1
assert learning_loop["rollups"]["similar_case_source_total"] > 0
assert learning_loop["rollups"]["repair_feedback_ready_count"] == 1
assert learning_loop["public_safety"]["stores_secret_values"] is False
assert learning_loop["public_safety"]["executes_on_read"] is False
progress = readback["work_item_progress"]
assert progress["schema_version"] == "ai_agent_automation_work_item_progress_v1"
ordered_ids = [item["work_item_id"] for item in progress["ordered_items"]]
@@ -435,10 +485,12 @@ def test_runtime_receipt_readback_summarizes_live_executor_closure_rows():
assert progress["ordered_items"][4]["status"] == "completed"
assert progress["ordered_items"][5]["status"] == "completed"
assert progress["ordered_items"][6]["status"] == "completed"
assert progress["ordered_items"][7]["status"] == "completed"
assert progress["ordered_items"][7]["remaining_learning_loop_stage_count"] == 0
assert progress["source_family_items"]
assert {item["status"] for item in progress["source_family_items"]} == {"completed"}
assert progress["rollups"]["source_family_work_item_count"] == 10
assert progress["rollups"]["pending_count"] >= 4
assert progress["rollups"]["pending_count"] >= 3
def test_runtime_receipt_readback_classifies_closed_failed_apply_as_ai_repair():

View File

@@ -21,6 +21,7 @@ from src.services.awooop_ansible_check_mode_service import (
_post_apply_km_path_type,
_post_apply_verification_result,
_record_auto_repair_execution_receipt,
_record_learning_writeback_receipt,
_send_controlled_apply_telegram_receipt,
build_ansible_apply_command,
build_ansible_check_mode_claim_input,
@@ -1613,9 +1614,20 @@ def test_ansible_apply_receipt_backfill_includes_verifier_and_km_gaps() -> None:
assert "evidence.post_execution_state ->> 'apply_op_id' = apply.op_id::text" in source
assert "FROM knowledge_entries km" in source
assert "km.path_type = 'ansible_apply_receipt:' || left(apply.op_id::text, 8)" in source
assert "ansible_learning_writeback_recorded" in source
assert "_record_post_apply_verifier_and_learning" in source
def test_ansible_learning_writeback_receipt_records_learning_service_call() -> None:
    """Check the receipt writer's source mentions the learning call and safety flags."""
    receipt_source = inspect.getsource(_record_learning_writeback_receipt)
    required_fragments = (
        "record_verification_result",
        "ansible_learning_writeback_recorded",
        "learning_repository",
        "stores_raw_logs",
        "stores_secret_values",
    )
    for fragment in required_fragments:
        assert fragment in receipt_source
def test_ansible_live_controlled_apply_sends_telegram_receipt_but_backfill_does_not() -> None:
live_source = inspect.getsource(run_controlled_apply_for_claim)
backfill_source = inspect.getsource(backfill_missing_auto_repair_execution_receipts_once)