feat(awooop): project ai route repair work item
All checks were successful
CD Pipeline / tests (push) Successful in 1m22s
Code Review / ai-code-review (push) Successful in 11s
CD Pipeline / build-and-deploy (push) Successful in 3m30s
CD Pipeline / post-deploy-checks (push) Successful in 1m20s

This commit is contained in:
Your Name
2026-05-25 14:42:57 +08:00
parent e5cd01c9cb
commit 63b4c3453f
6 changed files with 296 additions and 1 deletions

View File

@@ -1007,7 +1007,7 @@ def _ai_route_repair_evidence_item(
observed_state = _as_dict(payload.get("observed_state"))
side_effects = _ai_route_repair_side_effects(payload.get("side_effects"))
return {
evidence = {
"schema_version": (
payload.get("schema_version")
or envelope.get("schema_version")
@@ -1037,6 +1037,12 @@ def _ai_route_repair_evidence_item(
"provider_ts": row.get("provider_ts"),
"received_at": row.get("received_at"),
}
evidence["work_item"] = _ai_route_repair_work_item(evidence)
evidence["playbook_recommendation"] = _ai_route_repair_playbook_recommendation(
evidence
)
evidence["owner_action"] = _ai_route_repair_owner_action(evidence)
return evidence
def _ai_route_repair_side_effects(value: Any) -> dict[str, bool | None]:
@@ -1078,6 +1084,101 @@ def _source_ref_count(envelope: Any) -> int:
return total
def _ai_route_repair_work_item(evidence: Mapping[str, Any]) -> dict[str, Any]:
    """Project AI route repair evidence into a read-only work item.

    The work item is ``open`` (and flagged ``needs_human``) when the evidence
    lists at least one access blocker; otherwise it is ``watching``.

    Args:
        evidence: Repair evidence mapping. Only ``target_resource`` and
            ``access_blockers`` are read.

    Returns:
        A new dict using the ``awooop_ai_route_repair_work_item_v1`` schema.
        ``target_resource`` is ``None`` whenever no usable target is present
        (missing, empty, or whitespace-only); ``work_item_id`` then uses the
        ``unknown`` suffix.
    """
    # Strip *before* applying any fallback so that missing, empty, and
    # whitespace-only targets are all treated identically. Defaulting to
    # "unknown" first made a missing target report the literal "unknown"
    # while a blank one reported None.
    target = str(evidence.get("target_resource") or "").strip()
    has_blockers = bool(_as_string_list(evidence.get("access_blockers")))
    return {
        "schema_version": "awooop_ai_route_repair_work_item_v1",
        "work_item_id": f"ai-route-repair:{target or 'unknown'}",
        "status": "open" if has_blockers else "watching",
        "kind": "ai_route_primary_lane_repair",
        "next_step": (
            "restore_primary_ollama_lane_access"
            if has_blockers
            else "continue_route_monitoring"
        ),
        "reason": (
            "primary_lane_unavailable" if has_blockers else "primary_lane_observed"
        ),
        "needs_human": has_blockers,
        "owner": "cloud_sre_operator",
        "target_resource": target or None,
        "target_href": "/awooop/runs",
        "decision_effect": "none",
        "safety_level": "read_only_work_item_projection",
        # Constant flags emitted with every projection.
        "writes_incident_state": False,
        "writes_auto_repair_result": False,
        "writes_runtime_route": False,
    }
def _ai_route_repair_playbook_recommendation(
    evidence: Mapping[str, Any],
) -> dict[str, Any]:
    """Build the primary-lane recovery playbook candidate for *evidence*.

    Steps are selected from ``access_blockers`` and ``live_probe`` and are
    emitted in a fixed order; the final route-status verification step is
    always included.

    Args:
        evidence: Repair evidence mapping. Only ``access_blockers`` and
            ``live_probe`` are read.

    Returns:
        A dict using the ``awooop_ai_route_playbook_recommendation_v1``
        schema whose ``steps`` entries each carry ``step``, ``scope`` and
        ``mode``.
    """
    blocker_names = set(_as_string_list(evidence.get("access_blockers")))
    probe = _as_dict(evidence.get("live_probe"))

    def _refused(probe_key: str) -> bool:
        # True when the live probe recorded a refused connection for the key.
        return probe.get(probe_key) == "connection_refused"

    # (include?, step, scope, mode) rows, listed in output order.
    candidates = (
        (
            any(name.startswith("gcloud_") for name in blocker_names),
            "verify_cloud_control_plane_access",
            "gcp_compute_read",
            "manual_or_approved",
        ),
        (
            "gcp_a_ssh_refused" in blocker_names or _refused("gcp_a_direct_22"),
            "restore_gcp_a_os_access",
            "gcp_serial_console_or_os_login",
            "manual_or_approved",
        ),
        (
            "gcp_a_ollama_11434_refused" in blocker_names
            or _refused("gcp_a_direct_11434"),
            "restore_ollama_service_on_gcp_a",
            "systemd_ollama",
            "manual_or_approved",
        ),
        (
            probe.get("proxy_110_11435") == "http_502",
            "verify_110_proxy_after_gcp_a_recovery",
            "nginx_proxy_readback",
            "read_only_verification",
        ),
        (
            True,  # the closing verification step is unconditional
            "verify_ai_route_status_returns_primary",
            "awooop_ai_route_status",
            "read_only_verification",
        ),
    )
    steps: list[dict[str, Any]] = [
        {"step": step, "scope": scope, "mode": mode}
        for include, step, scope, mode in candidates
        if include
    ]

    recommendation: dict[str, Any] = {
        "schema_version": "awooop_ai_route_playbook_recommendation_v1",
        "playbook_id": "ai_route_primary_lane_recovery",
        "status": "candidate_from_live_evidence",
        "safe_to_auto_execute": False,
        "requires_approval": True,
        "decision_effect": "none",
        "steps": steps,
    }
    return recommendation
def _ai_route_repair_owner_action(evidence: Mapping[str, Any]) -> dict[str, Any]:
    """Summarise ownership and next step for an AI route repair evidence item.

    Reads the ``work_item`` and ``playbook_recommendation`` entries of
    *evidence* and combines them with fixed agent/owner assignments.

    Args:
        evidence: Repair evidence mapping. Only ``work_item`` and
            ``playbook_recommendation`` are read.

    Returns:
        A dict using the ``awooop_ai_route_owner_action_v1`` schema.
    """
    item = _as_dict(evidence.get("work_item"))
    recommendation = _as_dict(evidence.get("playbook_recommendation"))

    # Fall back to neutral values when the work item lacks these fields.
    next_step = item.get("next_step") or "continue_route_monitoring"
    blocking_reason = item.get("reason") or "unknown"

    # NOTE(review): automation_state is a constant here, so it also reads
    # "blocked..." when the work item is merely being watched - confirm
    # that this is intended.
    owner_action: dict[str, Any] = {
        "schema_version": "awooop_ai_route_owner_action_v1",
        "lead_agent": "Hermes",
        "supporting_agents": ["OpenClaw", "ElephantAlpha"],
        "human_owner": "Cloud/SRE owner",
        "automation_state": "blocked_by_external_cloud_or_os_access",
        "next_step": next_step,
        "playbook_id": recommendation.get("playbook_id"),
        "safe_to_auto_repair": False,
        "blocking_reason": blocking_reason,
    }
    return owner_action
def _ai_route_lane_state(
*,
policy_order: list[dict[str, Any]],

View File

@@ -1613,11 +1613,13 @@ def test_ai_route_repair_evidence_item_summarizes_operator_safe_fields() -> None
"lane_mode": "degraded_failover",
},
"live_probe": {
"gcp_a_direct_22": "connection_refused",
"gcp_a_direct_11434": "connection_refused",
"gcp_b_direct_11434": "http_200",
},
"access_blockers": [
"gcloud_compute_instances_get_missing",
"gcp_a_ssh_refused",
"gcp_a_ollama_11434_refused",
],
"side_effects": {
@@ -1640,6 +1642,7 @@ def test_ai_route_repair_evidence_item_summarizes_operator_safe_fields() -> None
assert item["source_ref_count"] == 4
assert item["access_blockers"] == [
"gcloud_compute_instances_get_missing",
"gcp_a_ssh_refused",
"gcp_a_ollama_11434_refused",
]
assert item["live_probe"]["gcp_a_direct_11434"] == "connection_refused"
@@ -1649,6 +1652,35 @@ def test_ai_route_repair_evidence_item_summarizes_operator_safe_fields() -> None
"approval_created": False,
"runtime_route_changed": False,
}
assert item["work_item"] == {
"schema_version": "awooop_ai_route_repair_work_item_v1",
"work_item_id": "ai-route-repair:ollama_gcp_a",
"status": "open",
"kind": "ai_route_primary_lane_repair",
"next_step": "restore_primary_ollama_lane_access",
"reason": "primary_lane_unavailable",
"needs_human": True,
"owner": "cloud_sre_operator",
"target_resource": "ollama_gcp_a",
"target_href": "/awooop/runs",
"decision_effect": "none",
"safety_level": "read_only_work_item_projection",
"writes_incident_state": False,
"writes_auto_repair_result": False,
"writes_runtime_route": False,
}
assert item["playbook_recommendation"]["playbook_id"] == (
"ai_route_primary_lane_recovery"
)
assert item["playbook_recommendation"]["safe_to_auto_execute"] is False
assert [step["step"] for step in item["playbook_recommendation"]["steps"]] == [
"verify_cloud_control_plane_access",
"restore_gcp_a_os_access",
"restore_ollama_service_on_gcp_a",
"verify_ai_route_status_returns_primary",
]
assert item["owner_action"]["lead_agent"] == "Hermes"
assert item["owner_action"]["safe_to_auto_repair"] is False
def test_ai_route_lane_state_marks_degraded_failover() -> None:

View File

@@ -1961,6 +1961,9 @@
"recurrenceWorkItems": {
"title": "Recurring alert work item / ticket entry"
},
"aiRouteRepairWorkItem": {
"title": "AI Provider primary-lane repair work item"
},
"configDriftFsm": {
"title": "Config Drift fingerprint state machine"
},
@@ -1996,6 +1999,7 @@
"sourceDossier": "Inbound alerts must show received / incident_linked / source refs",
"autoRepair": "Requires auto_repair, verification_result=success, and KM writeback",
"recurrenceWorkItems": "Completed-without-repair, failed repair, and manual gate groups must become trackable work items",
"aiRouteRepairWorkItem": "Provider lane degradation must expose evidence, owner, PlayBook candidate, and auto-repair safety",
"configDriftFsm": "The same drift fingerprint must expose recurrence, PR, zero diff, handoff, and next step",
"remediationQueue": "Every degraded / failed / timeout row must map to replay, reverify, ticket, or manual review",
"telegramCallbacks": "Detail and history buttons cannot depend only on Redis TTL or stale snapshots",
@@ -2016,6 +2020,13 @@
"recurrenceSourceReviewRecorded": "Source reviews recorded: {count}",
"recurrenceSourceApplied": "Source matches applied: {count}",
"recurrenceEmpty": "No open recurring-alert work item in the recent window",
"aiRouteRepairWorkItem": "AI route: {lane}; current {selected}; target {target}; {blockers} blockers",
"aiRouteRepairWorkItemId": "Work item: {id}",
"aiRouteRepairSkipped": "Skipped: {skipped}",
"aiRouteRepairOwner": "Owner: {owner}; lead agent: {lead}",
"aiRouteRepairPlaybook": "PlayBook: {playbook}; {steps} steps",
"aiRouteRepairSafety": "Safe auto-repair: {safe}",
"aiRouteRepairUnavailable": "AI route repair evidence has not returned yet",
"driftFingerprint": "Config Drift: {state}; {count}x in 12h",
"driftFingerprintUnavailable": "Config Drift fingerprint state API has not responded",
"driftFingerprintId": "Fingerprint: {fingerprint}; Report: {report}",

View File

@@ -1962,6 +1962,9 @@
"recurrenceWorkItems": {
"title": "重複告警工作項 / Ticket 入口"
},
"aiRouteRepairWorkItem": {
"title": "AI Provider primary lane 修復工作項"
},
"configDriftFsm": {
"title": "Config Drift fingerprint 狀態機"
},
@@ -1997,6 +2000,7 @@
"sourceDossier": "入站告警必須能查到 received / incident_linked / source refs",
"autoRepair": "必須同時有 auto_repair、verification_result=success 與 KM 回寫",
"recurrenceWorkItems": "Run 完成無修復、修復失敗與人工閘門必須進入可追蹤工作項",
"aiRouteRepairWorkItem": "Provider lane 降級時必須顯示 evidence、owner、PlayBook 候選與是否可自動修復",
"configDriftFsm": "同一 drift fingerprint 必須顯示重複、PR、零 diff、交接與下一步",
"remediationQueue": "每筆 degraded / failed / timeout 都必須映射到重跑、重驗、Ticket 或人工檢查",
"telegramCallbacks": "按下詳情與歷史不能再只依賴 Redis TTL 或舊快照",
@@ -2017,6 +2021,13 @@
"recurrenceSourceReviewRecorded": "來源審核已寫入歷史:{count}",
"recurrenceSourceApplied": "來源配對已套用:{count}",
"recurrenceEmpty": "近期重複告警尚無待處理工作項",
"aiRouteRepairWorkItem": "AI route:{lane};目前 {selected};目標 {target};阻塞 {blockers} 項",
"aiRouteRepairWorkItemId": "Work item:{id}",
"aiRouteRepairSkipped": "已跳過:{skipped}",
"aiRouteRepairOwner": "Owner:{owner};主責 Agent:{lead}",
"aiRouteRepairPlaybook": "PlayBook:{playbook};步驟 {steps}",
"aiRouteRepairSafety": "可安全自動修復:{safe}",
"aiRouteRepairUnavailable": "AI route repair evidence 尚未回傳",
"driftFingerprint": "Config Drift:{state};12h 內 {count} 次",
"driftFingerprintUnavailable": "Config Drift fingerprint state API 尚未回應",
"driftFingerprintId": "Fingerprint:{fingerprint};Report:{report}",

View File

@@ -834,6 +834,49 @@ type CallbackRepliesWorkItemResponse = {
per_page: number;
};
/**
 * Shape of the `repair_evidence` object carried on `AiRouteStatusResponse`,
 * limited to the fields declared here. Every field is optional and most are
 * nullable, so readers must null-check before use.
 */
type AiRouteRepairEvidence = {
target_resource?: string | null;
// Blocker identifiers; only the list length is needed for a blocker count.
access_blockers?: string[];
source_ref_count?: number | null;
// Work item projection attached to the evidence.
// NOTE(review): presumably a subset of the backend work item payload — confirm.
work_item?: {
work_item_id?: string | null;
status?: string | null;
next_step?: string | null;
reason?: string | null;
owner?: string | null;
target_href?: string | null;
needs_human?: boolean | null;
} | null;
// Playbook candidate attached to the evidence, including its ordered steps.
playbook_recommendation?: {
playbook_id?: string | null;
status?: string | null;
safe_to_auto_execute?: boolean | null;
requires_approval?: boolean | null;
steps?: Array<{ step?: string | null; scope?: string | null; mode?: string | null }>;
} | null;
// Ownership summary: lead/supporting agents, human owner and auto-repair flag.
owner_action?: {
lead_agent?: string | null;
supporting_agents?: string[];
human_owner?: string | null;
automation_state?: string | null;
next_step?: string | null;
playbook_id?: string | null;
safe_to_auto_repair?: boolean | null;
blocking_reason?: string | null;
} | null;
};
/**
 * Response shape used when fetching the AI route status endpoint
 * (`/api/v1/platform/ai-route-status`). All fields are optional, so a
 * partial payload still type-checks.
 */
type AiRouteStatusResponse = {
lane_mode?: string | null;
selected_provider?: string | null;
// Lanes that were skipped; each entry may carry the provider's name.
skipped_lanes?: Array<{ provider_name?: string | null }>;
operator_action?: {
action?: string | null;
human_required?: boolean | null;
} | null;
// Repair evidence with its work item / playbook / owner projections, if any.
repair_evidence?: AiRouteRepairEvidence | null;
};
type Telemetry = {
quality: AutomationQualitySummary | null;
governanceEvents: GovernanceEventsResponse | null;
@@ -852,6 +895,7 @@ type Telemetry = {
driftFingerprintState: DriftFingerprintState | null;
callbackReplies: CallbackRepliesWorkItemResponse | null;
statusChain: AwoooPStatusChain | null;
aiRouteStatus: AiRouteStatusResponse | null;
};
type WorkItem = {
@@ -1599,6 +1643,16 @@ function buildWorkItems(
latestCallbackOwnerReview?.km_stale_completion_summary ?? null;
const latestCallbackWorkItem = latestCallbackSummary?.work_item ?? null;
const latestCallbackTriage = latestCallbackWorkItem?.triage ?? null;
const aiRoute = telemetry.aiRouteStatus;
const aiRouteRepairEvidence = aiRoute?.repair_evidence ?? null;
const aiRouteWorkItem = aiRouteRepairEvidence?.work_item ?? null;
const aiRoutePlaybook = aiRouteRepairEvidence?.playbook_recommendation ?? null;
const aiRouteOwnerAction = aiRouteRepairEvidence?.owner_action ?? null;
const aiRouteBlockers = aiRouteRepairEvidence?.access_blockers ?? [];
const aiRouteSkipped = aiRoute?.skipped_lanes
?.map((lane) => lane.provider_name)
.filter(Boolean)
.join(" -> ");
const remediationQueue = telemetry.slo?.adr100?.verification_coverage?.remediation_queue;
const remediationTotal = remediationQueue?.total ?? 0;
const remediationReadyForAi = remediationQueue?.ready_for_ai ?? 0;
@@ -1708,6 +1762,46 @@ function buildWorkItems(
? `/awooop/work-items?project_id=${encodeURIComponent(telemetry.eventRecurrence?.project_id ?? "awoooi")}&work_item_id=${encodeURIComponent(latestRecurrenceOpenItem.work_item.work_item_id)}${latestRecurrenceOpenItem.work_item.incident_id ? `&incident_id=${encodeURIComponent(latestRecurrenceOpenItem.work_item.incident_id)}` : ""}`
: "/awooop/runs",
},
{
id: "aiRouteRepairWorkItem",
phase: "T178",
status: aiRouteWorkItem?.status === "open"
? "blocked"
: aiRoute
? "watching"
: "blocked",
surfaceKey: "runs",
source: "/api/v1/platform/ai-route-status + ai_route_repair",
gateKey: "aiRouteRepairWorkItem",
evidence: t("evidence.aiRouteRepairWorkItem", {
lane: aiRoute?.lane_mode ?? "--",
selected: aiRoute?.selected_provider ?? "--",
target: aiRouteRepairEvidence?.target_resource ?? "--",
blockers: aiRouteBlockers.length,
}),
evidenceDetails: aiRouteRepairEvidence
? [
t("evidence.aiRouteRepairWorkItemId", {
id: aiRouteWorkItem?.work_item_id ?? "--",
}),
t("evidence.aiRouteRepairSkipped", {
skipped: aiRouteSkipped || "--",
}),
t("evidence.aiRouteRepairOwner", {
owner: aiRouteOwnerAction?.human_owner ?? aiRouteWorkItem?.owner ?? "--",
lead: aiRouteOwnerAction?.lead_agent ?? "--",
}),
t("evidence.aiRouteRepairPlaybook", {
playbook: aiRoutePlaybook?.playbook_id ?? "--",
steps: aiRoutePlaybook?.steps?.length ?? 0,
}),
t("evidence.aiRouteRepairSafety", {
safe: String(aiRouteOwnerAction?.safe_to_auto_repair ?? false),
}),
]
: [t("evidence.aiRouteRepairUnavailable")],
href: aiRouteWorkItem?.target_href ?? "/awooop/runs",
},
{
id: "configDriftFsm",
phase: "T64",
@@ -4594,6 +4688,7 @@ export default function AwoooPWorkItemsPage() {
driftFingerprintState: null,
callbackReplies: null,
statusChain: null,
aiRouteStatus: null,
});
const [loading, setLoading] = useState(true);
const [lastUpdated, setLastUpdated] = useState<Date | null>(null);
@@ -4617,6 +4712,7 @@ export default function AwoooPWorkItemsPage() {
const remediationHistoryUrl = `${API_BASE}/api/v1/ai/slo/remediation/history?limit=80`;
const driftFingerprintUrl = `${API_BASE}/api/v1/drift/fingerprints/state?namespace=awoooi-prod`;
const callbackRepliesUrl = `${API_BASE}/api/v1/platform/runs/callback-replies?project_id=${encodedProjectId}&per_page=100`;
const aiRouteStatusUrl = `${API_BASE}/api/v1/platform/ai-route-status?workload_type=deep_rca`;
const [
quality,
@@ -4635,6 +4731,7 @@ export default function AwoooPWorkItemsPage() {
remediationHistory,
driftFingerprintState,
callbackReplies,
aiRouteStatus,
] = await Promise.all([
fetchJson<AutomationQualitySummary>(qualityUrl, 15000),
fetchJson<GovernanceEventsResponse>(governanceEventsUrl),
@@ -4652,6 +4749,7 @@ export default function AwoooPWorkItemsPage() {
fetchJson<RemediationHistoryResponse>(remediationHistoryUrl),
fetchJson<DriftFingerprintState>(driftFingerprintUrl, 12000),
fetchJson<CallbackRepliesWorkItemResponse>(callbackRepliesUrl, 12000),
fetchJson<AiRouteStatusResponse>(aiRouteStatusUrl, 12000),
]);
const statusChainIncidentId = selectStatusChainIncidentId(
@@ -4687,6 +4785,7 @@ export default function AwoooPWorkItemsPage() {
driftFingerprintState,
callbackReplies,
statusChain,
aiRouteStatus,
});
setLastUpdated(new Date());
setLoading(false);

View File

@@ -1,3 +1,44 @@
## 2026-05-25T178 AI route repair work item / PlayBook 候選
**背景**
- T177 已讓 `ai_route_repair / repair_diagnosis` 顯示在 `/api/v1/platform/ai-route-status` 與 AwoooP Runs,但 operator 仍需要更清楚知道這件事是否形成工作項、由誰接、PlayBook 建議是什麼、是否可安全自動修復。
- 本段仍不修 GCP-A、不改 route、不建立 Incident / Telegram / Approval,只把既有 DB repair evidence 轉成 read-only work item projection,維持低噪音與安全邊界。
**本次修復**
- `repair_evidence` 新增:
  - `work_item`:`ai-route-repair:<target>`,目前 `ollama_gcp_a` 為 open / needs_human=true。
  - `playbook_recommendation`:`ai_route_primary_lane_recovery`,依 live blockers 組出 GCP control plane、OS access、Ollama service、110 proxy、route status verification 等步驟;`safe_to_auto_execute=false`、`requires_approval=true`。
  - `owner_action`:主責 Hermes,OpenClaw / ElephantAlpha 協作;human owner 為 Cloud/SRE owner;狀態為 blocked by external cloud/OS access。
- AwoooP Work Items 頁新增 T178「AI Provider primary lane 修復工作項」,讀 `/api/v1/platform/ai-route-status` 顯示 lane、selected provider、target、blocker 數、work item id、owner、PlayBook 與安全自動修復判斷。
- 此 work item 是 read-model projection,不寫 incident state、不寫 auto-repair result、不變更 runtime route。
**本地驗證**
```text
python3 -m py_compile apps/api/src/services/platform_operator_service.py apps/api/src/api/v1/platform/operator_runs.py -> pass
jq empty apps/web/messages/zh-TW.json apps/web/messages/en.json -> pass
git diff --check -> pass
ruff check platform_operator_service.py + targeted tests --ignore B008 -> pass
pytest targeted ai-route status/evidence tests -> 6 passed
pnpm --dir apps/web exec tsc --noEmit --tsBuildInfoFile /tmp/awoooi-t178-tsconfig.tsbuildinfo -> pass
```
**目前整體進度**
- AwoooP 告警可觀測鏈:約 99.32%。
- 低風險自動修復閉環:約 95.8%。
- 前端 AI 自動化管理介面同步:約 97.6%。
- Telegram 詳情 / 歷史可解釋性:約 95.5%。
- Callback evidence / DB replayability:約 96.0%。
- MCP / 自建 MCP 可見性:約 88%。
- Sentry / SigNoz source correlation visibility:約 88%。
- Ansible / PlayBook decision visibility:約 85.2%。
- KM owner-review / completion governance:約 84%。
- AI Provider lane 健康與可見性:約 92%(GCP-A runtime 尚未修復;但 repair evidence / work item / PlayBook 候選已可見)。
- 完整 AI 自動化管理產品化:約 95.2%。
## 2026-05-25T177 AI route repair evidence API / 前端投影
**背景**