feat(awooop): close autonomous learning loop readback
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 2m47s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-06-29 16:52:07 +08:00
parent 8397178525
commit 9ca6eec2ee
7 changed files with 479 additions and 5 deletions

View File

@@ -38,6 +38,7 @@ _EXECUTOR_OPERATION_TYPES = (
"ansible_candidate_matched",
"ansible_check_mode_executed",
"ansible_apply_executed",
"ansible_learning_writeback_recorded",
"ansible_rollback_executed",
"ansible_execution_skipped",
)
@@ -781,11 +782,206 @@ def _build_agent_decision_wiring(
}
def _learning_loop_stage(
*,
stage_id: str,
display_name: str,
evidence_sources: list[str],
total: int,
recent: int,
required_for_learning_loop: bool,
writes_runtime_state: bool,
next_action_if_missing: str,
) -> dict[str, Any]:
present = total > 0
return {
"stage_id": stage_id,
"display_name": display_name,
"evidence_sources": evidence_sources,
"present": present,
"total": max(0, total),
"recent": max(0, recent),
"required_for_learning_loop": required_for_learning_loop,
"writes_runtime_state": writes_runtime_state,
"next_action_if_missing": None if present else next_action_if_missing,
}
def _build_learning_loop_readback(
    *,
    operation_summary: Mapping[str, Any],
    verifier_summary: Mapping[str, Any],
    km_summary: Mapping[str, Any],
    playbook_trust_summary: Mapping[str, Any],
    log_integration_taxonomy: Mapping[str, Any],
    agent_decision_wiring: Mapping[str, Any],
    latest_flow_closure: Mapping[str, Any],
    latest_failure_classification: Mapping[str, Any],
    controlled_retry_package: Mapping[str, Any],
    loop_ledger: Mapping[str, Any],
) -> dict[str, Any]:
    """Expose the verified execution to KM/PlayBook learning loop.

    Builds seven required stages and reports which of them still lack
    evidence.

    Args:
        operation_summary: Summary read for the
            ``ansible_learning_writeback_recorded`` operation counts.
        verifier_summary: Summary read for post-apply verifier counts.
        km_summary: Summary read for KM writeback counts.
        playbook_trust_summary: Summary read for PlayBook trust counts.
        log_integration_taxonomy: Readback whose ``rollups`` mapping
            supplies ``learning_source_family_count``,
            ``classified_event_total`` and ``recent_classified_event_total``.
        agent_decision_wiring: Readback whose ``status`` must be
            ``"completed"`` for the next-decision stage.
        latest_flow_closure: Flags ``has_post_apply_verifier`` and
            ``has_km_writeback`` gate the verifier and KM stages.
        latest_failure_classification: Mapping whose ``classification``
            feeds the repair-feedback stage.
        controlled_retry_package: Mapping whose ``schema_version`` must be
            ``"ai_agent_controlled_retry_package_v1"``.
        loop_ledger: Mapping whose ``closed`` flag must be ``True`` for the
            next-decision stage.

    Returns:
        A dict with ``schema_version``, ``status`` (``"completed"`` when no
        required stage is missing, otherwise ``"in_progress"``), the stage
        list, the missing stage ids, static ``public_safety`` flags and
        numeric ``rollups``.
    """
    taxonomy_rollups = log_integration_taxonomy.get("rollups")
    if not isinstance(taxonomy_rollups, Mapping):
        taxonomy_rollups = {}
    learning_source_family_count = _int_value(
        taxonomy_rollups.get("learning_source_family_count")
    )
    classified_event_total = _int_value(taxonomy_rollups.get("classified_event_total"))
    recent_classified_event_total = _int_value(
        taxonomy_rollups.get("recent_classified_event_total")
    )

    verifier_total = _trace_total(verifier_summary)
    verifier_recent = _trace_recent(verifier_summary)
    km_total = _trace_total(km_summary)
    km_recent = _trace_recent(km_summary)
    learning_writeback_total = _trace_total(
        operation_summary,
        "ansible_learning_writeback_recorded",
    )
    learning_writeback_recent = _trace_recent(
        operation_summary,
        "ansible_learning_writeback_recorded",
    )
    trust_total = _trace_total(playbook_trust_summary)
    trust_recent = _trace_recent(playbook_trust_summary)

    # A missing classification comes back as None, and None is "not in" the
    # placeholder set, so it must be rejected explicitly. Requiring a str
    # also avoids a TypeError on unhashable values.
    classification = latest_failure_classification.get("classification")
    repair_feedback_ready = bool(
        isinstance(classification, str)
        and classification.strip() not in {"", "no_controlled_apply_observed"}
        and controlled_retry_package.get("schema_version")
        == "ai_agent_controlled_retry_package_v1"
    )
    next_decision_ready = bool(
        agent_decision_wiring.get("status") == "completed"
        and loop_ledger.get("closed") is True
    )

    # Gate "recent" with the same condition as "total" so a stage that is
    # reported as missing never shows recent > total.
    has_verifier = latest_flow_closure.get("has_post_apply_verifier") is True
    has_km_writeback = latest_flow_closure.get("has_km_writeback") is True
    has_learning_sources = learning_source_family_count > 0

    stages = [
        _learning_loop_stage(
            stage_id="verified_execution_outcome",
            display_name="Verified execution outcome available",
            evidence_sources=["incident_evidence.post_execution_state"],
            total=verifier_total if has_verifier else 0,
            recent=verifier_recent if has_verifier else 0,
            required_for_learning_loop=True,
            writes_runtime_state=True,
            next_action_if_missing="run_post_apply_verifier_and_attach_apply_op_id",
        ),
        _learning_loop_stage(
            stage_id="km_learning_writeback",
            display_name="KM learning writeback recorded",
            evidence_sources=["knowledge_entries"],
            total=km_total if has_km_writeback else 0,
            recent=km_recent if has_km_writeback else 0,
            required_for_learning_loop=True,
            writes_runtime_state=True,
            next_action_if_missing="write_verified_execution_summary_to_km",
        ),
        _learning_loop_stage(
            stage_id="learning_repair_record",
            display_name="Learning repository repair result recorded",
            evidence_sources=[
                "automation_operation_log:ansible_learning_writeback_recorded",
                "learning_repository",
            ],
            total=learning_writeback_total,
            recent=learning_writeback_recent,
            required_for_learning_loop=True,
            writes_runtime_state=True,
            next_action_if_missing="record_learning_repair_result_after_verifier",
        ),
        _learning_loop_stage(
            stage_id="playbook_trust_delta",
            display_name="PlayBook trust signal available",
            evidence_sources=["playbooks"],
            total=trust_total,
            recent=trust_recent,
            required_for_learning_loop=True,
            writes_runtime_state=True,
            next_action_if_missing="write_playbook_trust_delta_after_verifier",
        ),
        _learning_loop_stage(
            stage_id="similar_case_context",
            display_name="Similar-case context sources active",
            evidence_sources=["log_integration_taxonomy", "knowledge_entries", "playbooks"],
            total=classified_event_total if has_learning_sources else 0,
            recent=recent_classified_event_total if has_learning_sources else 0,
            required_for_learning_loop=True,
            writes_runtime_state=False,
            next_action_if_missing="activate_learning_source_families_for_similar_case_retrieval",
        ),
        _learning_loop_stage(
            stage_id="repair_candidate_feedback",
            display_name="Repair or no-repair feedback classified",
            evidence_sources=["latest_failure_classification", "controlled_retry_package"],
            total=1 if repair_feedback_ready else 0,
            recent=1 if repair_feedback_ready else 0,
            required_for_learning_loop=True,
            writes_runtime_state=False,
            next_action_if_missing="classify_latest_apply_result_and_prepare_retry_package",
        ),
        _learning_loop_stage(
            stage_id="next_decision_context",
            display_name="Next decision can consume learned context",
            evidence_sources=["agent_decision_wiring", "autonomous_execution_loop_ledger"],
            total=1 if next_decision_ready else 0,
            recent=1 if next_decision_ready else 0,
            required_for_learning_loop=True,
            writes_runtime_state=False,
            next_action_if_missing="complete_decision_wiring_and_execution_loop_before_learning_release",
        ),
    ]

    required_stages = [
        stage for stage in stages if stage["required_for_learning_loop"] is True
    ]
    missing_required = [
        str(stage["stage_id"])
        for stage in required_stages
        if stage["present"] is not True
    ]
    present_required_count = sum(
        1 for stage in required_stages if stage["present"] is True
    )

    return {
        "schema_version": "ai_agent_learning_loop_readback_v1",
        "status": "completed" if not missing_required else "in_progress",
        "stages": stages,
        "missing_required_stage_ids": missing_required,
        "public_safety": {
            "stores_raw_logs": False,
            "stores_secret_values": False,
            "stores_unredacted_telegram_payload": False,
            "executes_on_read": False,
            "critical_break_glass_still_required": True,
        },
        # Rollup totals are the raw summary counts, not the per-stage values
        # gated by the latest flow closure.
        "rollups": {
            "stage_count": len(stages),
            "required_stage_count": len(required_stages),
            "required_stage_present_count": present_required_count,
            "required_stage_missing_count": len(missing_required),
            "verified_execution_total": verifier_total,
            "km_writeback_total": km_total,
            "learning_writeback_total": learning_writeback_total,
            "learning_writeback_recent": learning_writeback_recent,
            "playbook_trust_total": trust_total,
            "learning_source_family_count": learning_source_family_count,
            "similar_case_source_total": classified_event_total,
            "repair_feedback_ready_count": 1 if repair_feedback_ready else 0,
            "next_decision_ready_count": 1 if next_decision_ready else 0,
        },
    }
def _build_work_item_progress(
*,
trace_ledger: Mapping[str, Any],
log_integration_taxonomy: Mapping[str, Any],
agent_decision_wiring: Mapping[str, Any],
learning_loop: Mapping[str, Any],
db_read_status: str,
) -> dict[str, Any]:
"""Build ordered work items that the UI and agent can keep advancing."""
@@ -810,6 +1006,15 @@ def _build_work_item_progress(
and agent_decision_wiring.get("schema_version") == "ai_agent_decision_wiring_readback_v1"
and decision_wiring_missing == 0
)
learning_rollups = learning_loop.get("rollups")
if not isinstance(learning_rollups, Mapping):
learning_rollups = {}
learning_loop_missing = _int_value(learning_rollups.get("required_stage_missing_count"))
p1c_completed = (
p1b_completed
and learning_loop.get("schema_version") == "ai_agent_learning_loop_readback_v1"
and learning_loop_missing == 0
)
deployed_readback_complete = (
db_read_status == "ok"
and trace_ledger.get("schema_version") == "ai_agent_autonomous_trace_ledger_v1"
@@ -873,8 +1078,9 @@ def _build_work_item_progress(
"work_item_id": "P1-C-learning-loop",
"priority": "P1-C",
"title": "KM / PlayBook trust learning loop",
"status": "pending",
"status": "completed" if p1c_completed else "in_progress" if p1b_completed else "pending",
"exit_criteria": "verified execution updates KM entries, trust delta, similar-case clusters, and repair candidates",
"remaining_learning_loop_stage_count": learning_loop_missing,
},
{
"work_item_id": "P1-D-alert-noise-reduction",
@@ -1753,10 +1959,23 @@ def build_runtime_receipt_readback_from_rows(
loop_ledger=loop_ledger,
latest_flow_closure=latest_closure,
)
learning_loop = _build_learning_loop_readback(
operation_summary=operation_summary,
verifier_summary=verifier_summary,
km_summary=km_summary,
playbook_trust_summary=playbook_trust_summary,
log_integration_taxonomy=log_integration_taxonomy,
agent_decision_wiring=agent_decision_wiring,
latest_flow_closure=latest_closure,
latest_failure_classification=latest_failure,
controlled_retry_package=retry_package,
loop_ledger=loop_ledger,
)
work_item_progress = _build_work_item_progress(
trace_ledger=trace_ledger,
log_integration_taxonomy=log_integration_taxonomy,
agent_decision_wiring=agent_decision_wiring,
learning_loop=learning_loop,
db_read_status=db_read_status,
)
apply_summary = operation_summary.get("ansible_apply_executed") or {}
@@ -1879,6 +2098,7 @@ def build_runtime_receipt_readback_from_rows(
"trace_ledger": trace_ledger,
"log_integration_taxonomy": log_integration_taxonomy,
"agent_decision_wiring": agent_decision_wiring,
"learning_loop": learning_loop,
"work_item_progress": work_item_progress,
}
if error_type:
@@ -2002,6 +2222,31 @@ def _attach_runtime_receipt_readback(
if (readback.get("agent_decision_wiring") or {}).get("status") == "completed"
else 0
),
"live_learning_loop_stage_count": _int_value(
((readback.get("learning_loop") or {}).get("rollups") or {}).get(
"stage_count"
)
),
"live_learning_loop_required_present_count": _int_value(
((readback.get("learning_loop") or {}).get("rollups") or {}).get(
"required_stage_present_count"
)
),
"live_learning_loop_required_missing_count": _int_value(
((readback.get("learning_loop") or {}).get("rollups") or {}).get(
"required_stage_missing_count"
)
),
"live_learning_loop_complete_count": (
1
if (readback.get("learning_loop") or {}).get("status") == "completed"
else 0
),
"live_learning_loop_similar_case_source_count": _int_value(
((readback.get("learning_loop") or {}).get("rollups") or {}).get(
"similar_case_source_total"
)
),
"live_work_item_count": _int_value(
((readback.get("work_item_progress") or {}).get("rollups") or {}).get(
"work_item_count"
@@ -2421,6 +2666,7 @@ _RUNTIME_OPERATION_COUNTS_SQL = """
'ansible_candidate_matched',
'ansible_check_mode_executed',
'ansible_apply_executed',
'ansible_learning_writeback_recorded',
'ansible_rollback_executed',
'ansible_execution_skipped'
)
@@ -2452,6 +2698,7 @@ _RUNTIME_OPERATION_LATEST_SQL = """
'ansible_candidate_matched',
'ansible_check_mode_executed',
'ansible_apply_executed',
'ansible_learning_writeback_recorded',
'ansible_rollback_executed',
'ansible_execution_skipped'
)

View File

@@ -659,6 +659,109 @@ def _post_apply_action_label(claim: AnsibleCheckModeClaim, *, apply_op_id: str)
)
async def _record_learning_writeback_receipt(
    claim: AnsibleCheckModeClaim,
    result: AnsibleRunResult,
    *,
    apply_op_id: str,
    verification_result: str,
    action_label: str,
    project_id: str,
) -> bool:
    """Record the learning writeback and persist its receipt row.

    First hands the verification outcome to the learning service. Only when
    that call returns without raising is an
    ``ansible_learning_writeback_recorded`` row inserted into
    ``automation_operation_log``. The insert is skipped by its
    ``NOT EXISTS`` guard when a row of that type already has the same
    ``parent_op_id``.

    Args:
        claim: Supplies ``incident_id``, ``catalog_id`` and
            ``apply_playbook_path``.
        result: Supplies ``returncode`` and ``timed_out`` for the receipt.
        apply_op_id: Apply operation id, stored as the receipt's parent.
        verification_result: Verifier outcome; ``"success"`` marks the
            receipt output as successful.
        action_label: Passed to the learning service as ``action_taken``.
        project_id: Passed to ``get_db_context``.

    Returns:
        ``True`` only when the insert returned an ``op_id``. ``False`` when
        either step raised (a warning is logged) or, presumably, when the
        guard suppressed the insert because a receipt already exists.
    """
    # catalog_id is cut to 36 characters; an empty id becomes None.
    playbook_ref = str(claim.catalog_id or "")[:36] or None

    # Step 1: the learning service must accept the result first.
    try:
        from src.services.learning_service import get_learning_service

        learning_service = get_learning_service()
        await learning_service.record_verification_result(
            incident_id=claim.incident_id,
            action_taken=action_label,
            verification_result=verification_result,
            matched_playbook_id=playbook_ref,
        )
    except Exception as exc:
        logger.warning(
            "ansible_post_apply_trust_learning_writeback_failed",
            incident_id=claim.incident_id,
            catalog_id=claim.catalog_id,
            apply_op_id=apply_op_id,
            error=str(exc),
        )
        return False

    # Step 2: persist the receipt. Payloads are built inside the try so any
    # failure while assembling them is reported the same way as a DB error.
    try:
        receipt_input = {
            "schema_version": "ansible_learning_writeback_receipt_v1",
            "incident_id": claim.incident_id,
            "catalog_id": claim.catalog_id,
            "playbook_path": claim.apply_playbook_path,
            "apply_op_id": apply_op_id,
            "verification_result": verification_result,
            "matched_playbook_id": playbook_ref,
            "learning_repository": "repair_result",
            "playbook_trust_update_attempted": playbook_ref is not None,
            "stores_raw_logs": False,
            "stores_secret_values": False,
        }
        receipt_output = {
            "learning_recorded": True,
            "success": verification_result == "success",
            "returncode": result.returncode,
            "timed_out": result.timed_out,
        }
        async with get_db_context(project_id) as db:
            statement = text("""
INSERT INTO automation_operation_log (
operation_type, actor, status, incident_id,
input, output, dry_run_result,
parent_op_id, tags
)
SELECT
'ansible_learning_writeback_recorded',
'ansible_controlled_apply_worker',
'success',
:incident_db_id,
CAST(:input AS jsonb),
CAST(:output AS jsonb),
'{}'::jsonb,
CAST(:parent_op_id AS uuid),
:tags
WHERE NOT EXISTS (
SELECT 1
FROM automation_operation_log existing
WHERE existing.operation_type = 'ansible_learning_writeback_recorded'
AND existing.parent_op_id = CAST(:parent_op_id AS uuid)
)
RETURNING op_id
""")
            bind_params = {
                "incident_db_id": _automation_operation_log_incident_id(claim.incident_id),
                "input": json.dumps(receipt_input, ensure_ascii=False),
                "output": json.dumps(receipt_output, ensure_ascii=False),
                "parent_op_id": apply_op_id,
                "tags": [
                    "ansible",
                    "controlled_apply",
                    "learning_writeback",
                    "playbook_trust",
                    "ai_agent_auto_execution",
                ],
            }
            inserted = await db.execute(statement, bind_params)
            new_op_id = inserted.scalar()
            return new_op_id is not None
    except Exception as exc:
        logger.warning(
            "ansible_learning_writeback_receipt_failed",
            incident_id=claim.incident_id,
            catalog_id=claim.catalog_id,
            apply_op_id=apply_op_id,
            error=str(exc),
        )
        return False
async def _record_post_apply_verifier_and_learning(
claim: AnsibleCheckModeClaim,
result: AnsibleRunResult,
@@ -690,7 +793,7 @@ async def _record_post_apply_verifier_and_learning(
f"incident={claim.incident_id}; catalog={claim.catalog_id}; "
f"result={verification_result}; returncode={result.returncode}; apply_op={apply_op_id}"
)
status = {"verification": False, "learning": False}
status = {"verification": False, "learning": False, "trust_learning": False}
try:
async with get_db_context(project_id) as db:
@@ -802,6 +905,14 @@ async def _record_post_apply_verifier_and_learning(
apply_op_id=apply_op_id,
error=str(exc),
)
status["trust_learning"] = await _record_learning_writeback_receipt(
claim,
result,
apply_op_id=apply_op_id,
verification_result=verification_result,
action_label=action_label,
project_id=project_id,
)
return status
@@ -853,6 +964,7 @@ async def backfill_missing_auto_repair_execution_receipts_once(
"written": 0,
"verification_written": 0,
"learning_written": 0,
"trust_learning_written": 0,
"skipped": 0,
"error": None,
}
@@ -898,9 +1010,15 @@ async def backfill_missing_auto_repair_execution_receipts_once(
WHERE km.related_incident_id = coalesce(
apply.incident_id::text,
apply.input ->> 'incident_id'
)
)
AND km.path_type = 'ansible_apply_receipt:' || left(apply.op_id::text, 8)
)
OR NOT EXISTS (
SELECT 1
FROM automation_operation_log learning
WHERE learning.operation_type = 'ansible_learning_writeback_recorded'
AND learning.parent_op_id = apply.op_id
)
)
ORDER BY apply.created_at DESC
LIMIT :limit
@@ -934,6 +1052,8 @@ async def backfill_missing_auto_repair_execution_receipts_once(
stats["verification_written"] += 1
if writeback.get("learning"):
stats["learning_written"] += 1
if writeback.get("trust_learning"):
stats["trust_learning_written"] += 1
except Exception as exc:
stats["error"] = f"{type(exc).__name__}: {exc}"[:500]
logger.warning("ansible_auto_repair_execution_receipt_backfill_failed", **stats)