diff --git a/apps/api/src/services/platform_operator_service.py b/apps/api/src/services/platform_operator_service.py index 27ea408c..c4046a12 100644 --- a/apps/api/src/services/platform_operator_service.py +++ b/apps/api/src/services/platform_operator_service.py @@ -1007,7 +1007,7 @@ def _ai_route_repair_evidence_item( observed_state = _as_dict(payload.get("observed_state")) side_effects = _ai_route_repair_side_effects(payload.get("side_effects")) - return { + evidence = { "schema_version": ( payload.get("schema_version") or envelope.get("schema_version") @@ -1037,6 +1037,12 @@ def _ai_route_repair_evidence_item( "provider_ts": row.get("provider_ts"), "received_at": row.get("received_at"), } + evidence["work_item"] = _ai_route_repair_work_item(evidence) + evidence["playbook_recommendation"] = _ai_route_repair_playbook_recommendation( + evidence + ) + evidence["owner_action"] = _ai_route_repair_owner_action(evidence) + return evidence def _ai_route_repair_side_effects(value: Any) -> dict[str, bool | None]: @@ -1078,6 +1084,101 @@ def _source_ref_count(envelope: Any) -> int: return total +def _ai_route_repair_work_item(evidence: Mapping[str, Any]) -> dict[str, Any]: + target = str(evidence.get("target_resource") or "unknown").strip() + blockers = _as_string_list(evidence.get("access_blockers")) + open_item = bool(blockers) + work_item_id = f"ai-route-repair:{target or 'unknown'}" + return { + "schema_version": "awooop_ai_route_repair_work_item_v1", + "work_item_id": work_item_id, + "status": "open" if open_item else "watching", + "kind": "ai_route_primary_lane_repair", + "next_step": ( + "restore_primary_ollama_lane_access" + if open_item + else "continue_route_monitoring" + ), + "reason": "primary_lane_unavailable" if open_item else "primary_lane_observed", + "needs_human": open_item, + "owner": "cloud_sre_operator", + "target_resource": target or None, + "target_href": "/awooop/runs", + "decision_effect": "none", + "safety_level": "read_only_work_item_projection", 
+ "writes_incident_state": False, + "writes_auto_repair_result": False, + "writes_runtime_route": False, + } + + +def _ai_route_repair_playbook_recommendation( + evidence: Mapping[str, Any], +) -> dict[str, Any]: + blockers = set(_as_string_list(evidence.get("access_blockers"))) + live_probe = _as_dict(evidence.get("live_probe")) + steps: list[dict[str, Any]] = [] + if any(blocker.startswith("gcloud_") for blocker in blockers): + steps.append({ + "step": "verify_cloud_control_plane_access", + "scope": "gcp_compute_read", + "mode": "manual_or_approved", + }) + if "gcp_a_ssh_refused" in blockers or ( + live_probe.get("gcp_a_direct_22") == "connection_refused" + ): + steps.append({ + "step": "restore_gcp_a_os_access", + "scope": "gcp_serial_console_or_os_login", + "mode": "manual_or_approved", + }) + if "gcp_a_ollama_11434_refused" in blockers or ( + live_probe.get("gcp_a_direct_11434") == "connection_refused" + ): + steps.append({ + "step": "restore_ollama_service_on_gcp_a", + "scope": "systemd_ollama", + "mode": "manual_or_approved", + }) + if live_probe.get("proxy_110_11435") == "http_502": + steps.append({ + "step": "verify_110_proxy_after_gcp_a_recovery", + "scope": "nginx_proxy_readback", + "mode": "read_only_verification", + }) + steps.append({ + "step": "verify_ai_route_status_returns_primary", + "scope": "awooop_ai_route_status", + "mode": "read_only_verification", + }) + + return { + "schema_version": "awooop_ai_route_playbook_recommendation_v1", + "playbook_id": "ai_route_primary_lane_recovery", + "status": "candidate_from_live_evidence", + "safe_to_auto_execute": False, + "requires_approval": True, + "decision_effect": "none", + "steps": steps, + } + + +def _ai_route_repair_owner_action(evidence: Mapping[str, Any]) -> dict[str, Any]: + work_item = _as_dict(evidence.get("work_item")) + playbook = _as_dict(evidence.get("playbook_recommendation")) + return { + "schema_version": "awooop_ai_route_owner_action_v1", + "lead_agent": "Hermes", + 
"supporting_agents": ["OpenClaw", "ElephantAlpha"], + "human_owner": "Cloud/SRE owner", + "automation_state": "blocked_by_external_cloud_or_os_access", + "next_step": work_item.get("next_step") or "continue_route_monitoring", + "playbook_id": playbook.get("playbook_id"), + "safe_to_auto_repair": False, + "blocking_reason": work_item.get("reason") or "unknown", + } + + def _ai_route_lane_state( *, policy_order: list[dict[str, Any]], diff --git a/apps/api/tests/test_awooop_operator_timeline_labels.py b/apps/api/tests/test_awooop_operator_timeline_labels.py index ab4eda3b..e85adcdb 100644 --- a/apps/api/tests/test_awooop_operator_timeline_labels.py +++ b/apps/api/tests/test_awooop_operator_timeline_labels.py @@ -1613,11 +1613,13 @@ def test_ai_route_repair_evidence_item_summarizes_operator_safe_fields() -> None "lane_mode": "degraded_failover", }, "live_probe": { + "gcp_a_direct_22": "connection_refused", "gcp_a_direct_11434": "connection_refused", "gcp_b_direct_11434": "http_200", }, "access_blockers": [ "gcloud_compute_instances_get_missing", + "gcp_a_ssh_refused", "gcp_a_ollama_11434_refused", ], "side_effects": { @@ -1640,6 +1642,7 @@ def test_ai_route_repair_evidence_item_summarizes_operator_safe_fields() -> None assert item["source_ref_count"] == 4 assert item["access_blockers"] == [ "gcloud_compute_instances_get_missing", + "gcp_a_ssh_refused", "gcp_a_ollama_11434_refused", ] assert item["live_probe"]["gcp_a_direct_11434"] == "connection_refused" @@ -1649,6 +1652,35 @@ def test_ai_route_repair_evidence_item_summarizes_operator_safe_fields() -> None "approval_created": False, "runtime_route_changed": False, } + assert item["work_item"] == { + "schema_version": "awooop_ai_route_repair_work_item_v1", + "work_item_id": "ai-route-repair:ollama_gcp_a", + "status": "open", + "kind": "ai_route_primary_lane_repair", + "next_step": "restore_primary_ollama_lane_access", + "reason": "primary_lane_unavailable", + "needs_human": True, + "owner": "cloud_sre_operator", + 
"target_resource": "ollama_gcp_a", + "target_href": "/awooop/runs", + "decision_effect": "none", + "safety_level": "read_only_work_item_projection", + "writes_incident_state": False, + "writes_auto_repair_result": False, + "writes_runtime_route": False, + } + assert item["playbook_recommendation"]["playbook_id"] == ( + "ai_route_primary_lane_recovery" + ) + assert item["playbook_recommendation"]["safe_to_auto_execute"] is False + assert [step["step"] for step in item["playbook_recommendation"]["steps"]] == [ + "verify_cloud_control_plane_access", + "restore_gcp_a_os_access", + "restore_ollama_service_on_gcp_a", + "verify_ai_route_status_returns_primary", + ] + assert item["owner_action"]["lead_agent"] == "Hermes" + assert item["owner_action"]["safe_to_auto_repair"] is False def test_ai_route_lane_state_marks_degraded_failover() -> None: diff --git a/apps/web/messages/en.json b/apps/web/messages/en.json index 704d4c9f..3a57a710 100644 --- a/apps/web/messages/en.json +++ b/apps/web/messages/en.json @@ -1961,6 +1961,9 @@ "recurrenceWorkItems": { "title": "Recurring alert work item / ticket entry" }, + "aiRouteRepairWorkItem": { + "title": "AI Provider primary-lane repair work item" + }, "configDriftFsm": { "title": "Config Drift fingerprint state machine" }, @@ -1996,6 +1999,7 @@ "sourceDossier": "Inbound alerts must show received / incident_linked / source refs", "autoRepair": "Requires auto_repair, verification_result=success, and KM writeback", "recurrenceWorkItems": "Completed-without-repair, failed repair, and manual gate groups must become trackable work items", + "aiRouteRepairWorkItem": "Provider lane degradation must expose evidence, owner, PlayBook candidate, and auto-repair safety", "configDriftFsm": "The same drift fingerprint must expose recurrence, PR, zero diff, handoff, and next step", "remediationQueue": "Every degraded / failed / timeout row must map to replay, reverify, ticket, or manual review", "telegramCallbacks": "Detail and history buttons 
cannot depend only on Redis TTL or stale snapshots", @@ -2016,6 +2020,13 @@ "recurrenceSourceReviewRecorded": "Source reviews recorded: {count}", "recurrenceSourceApplied": "Source matches applied: {count}", "recurrenceEmpty": "No open recurring-alert work item in the recent window", + "aiRouteRepairWorkItem": "AI route: {lane}; current {selected}; target {target}; {blockers} blockers", + "aiRouteRepairWorkItemId": "Work item: {id}", + "aiRouteRepairSkipped": "Skipped: {skipped}", + "aiRouteRepairOwner": "Owner: {owner}; lead agent: {lead}", + "aiRouteRepairPlaybook": "PlayBook: {playbook}; {steps} steps", + "aiRouteRepairSafety": "Safe auto-repair: {safe}", + "aiRouteRepairUnavailable": "AI route repair evidence has not returned yet", "driftFingerprint": "Config Drift: {state}; {count}x in 12h", "driftFingerprintUnavailable": "Config Drift fingerprint state API has not responded", "driftFingerprintId": "Fingerprint: {fingerprint}; Report: {report}", diff --git a/apps/web/messages/zh-TW.json b/apps/web/messages/zh-TW.json index 8a0499ed..0286d9eb 100644 --- a/apps/web/messages/zh-TW.json +++ b/apps/web/messages/zh-TW.json @@ -1962,6 +1962,9 @@ "recurrenceWorkItems": { "title": "重複告警工作項 / Ticket 入口" }, + "aiRouteRepairWorkItem": { + "title": "AI Provider primary lane 修復工作項" + }, "configDriftFsm": { "title": "Config Drift fingerprint 狀態機" }, @@ -1997,6 +2000,7 @@ "sourceDossier": "入站告警必須能查到 received / incident_linked / source refs", "autoRepair": "必須同時有 auto_repair、verification_result=success 與 KM 回寫", "recurrenceWorkItems": "Run 完成無修復、修復失敗與人工閘門必須進入可追蹤工作項", + "aiRouteRepairWorkItem": "Provider lane 降級時必須顯示 evidence、owner、PlayBook 候選與是否可自動修復", "configDriftFsm": "同一 drift fingerprint 必須顯示重複、PR、零 diff、交接與下一步", "remediationQueue": "每筆 degraded / failed / timeout 都必須映射到重跑、重驗、Ticket 或人工檢查", "telegramCallbacks": "按下詳情與歷史不能再只依賴 Redis TTL 或舊快照", @@ -2017,6 +2021,13 @@ "recurrenceSourceReviewRecorded": "來源審核已寫入歷史:{count}", "recurrenceSourceApplied": "來源配對已套用:{count}", 
"recurrenceEmpty": "近期重複告警尚無待處理工作項", + "aiRouteRepairWorkItem": "AI route:{lane};目前 {selected};目標 {target};阻塞 {blockers} 項", + "aiRouteRepairWorkItemId": "Work item:{id}", + "aiRouteRepairSkipped": "已跳過:{skipped}", + "aiRouteRepairOwner": "Owner:{owner};主責 Agent:{lead}", + "aiRouteRepairPlaybook": "PlayBook:{playbook};步驟 {steps}", + "aiRouteRepairSafety": "可安全自動修復:{safe}", + "aiRouteRepairUnavailable": "AI route repair evidence 尚未回傳", "driftFingerprint": "Config Drift:{state};12h 內 {count} 次", "driftFingerprintUnavailable": "Config Drift fingerprint state API 尚未回應", "driftFingerprintId": "Fingerprint:{fingerprint};Report:{report}", diff --git a/apps/web/src/app/[locale]/awooop/work-items/page.tsx b/apps/web/src/app/[locale]/awooop/work-items/page.tsx index de158959..7072083c 100644 --- a/apps/web/src/app/[locale]/awooop/work-items/page.tsx +++ b/apps/web/src/app/[locale]/awooop/work-items/page.tsx @@ -834,6 +834,49 @@ type CallbackRepliesWorkItemResponse = { per_page: number; }; +type AiRouteRepairEvidence = { + target_resource?: string | null; + access_blockers?: string[]; + source_ref_count?: number | null; + work_item?: { + work_item_id?: string | null; + status?: string | null; + next_step?: string | null; + reason?: string | null; + owner?: string | null; + target_href?: string | null; + needs_human?: boolean | null; + } | null; + playbook_recommendation?: { + playbook_id?: string | null; + status?: string | null; + safe_to_auto_execute?: boolean | null; + requires_approval?: boolean | null; + steps?: Array<{ step?: string | null; scope?: string | null; mode?: string | null }>; + } | null; + owner_action?: { + lead_agent?: string | null; + supporting_agents?: string[]; + human_owner?: string | null; + automation_state?: string | null; + next_step?: string | null; + playbook_id?: string | null; + safe_to_auto_repair?: boolean | null; + blocking_reason?: string | null; + } | null; +}; + +type AiRouteStatusResponse = { + lane_mode?: string | null; + 
selected_provider?: string | null; + skipped_lanes?: Array<{ provider_name?: string | null }>; + operator_action?: { + action?: string | null; + human_required?: boolean | null; + } | null; + repair_evidence?: AiRouteRepairEvidence | null; +}; + type Telemetry = { quality: AutomationQualitySummary | null; governanceEvents: GovernanceEventsResponse | null; @@ -852,6 +895,7 @@ type Telemetry = { driftFingerprintState: DriftFingerprintState | null; callbackReplies: CallbackRepliesWorkItemResponse | null; statusChain: AwoooPStatusChain | null; + aiRouteStatus: AiRouteStatusResponse | null; }; type WorkItem = { @@ -1599,6 +1643,16 @@ function buildWorkItems( latestCallbackOwnerReview?.km_stale_completion_summary ?? null; const latestCallbackWorkItem = latestCallbackSummary?.work_item ?? null; const latestCallbackTriage = latestCallbackWorkItem?.triage ?? null; + const aiRoute = telemetry.aiRouteStatus; + const aiRouteRepairEvidence = aiRoute?.repair_evidence ?? null; + const aiRouteWorkItem = aiRouteRepairEvidence?.work_item ?? null; + const aiRoutePlaybook = aiRouteRepairEvidence?.playbook_recommendation ?? null; + const aiRouteOwnerAction = aiRouteRepairEvidence?.owner_action ?? null; + const aiRouteBlockers = aiRouteRepairEvidence?.access_blockers ?? []; + const aiRouteSkipped = aiRoute?.skipped_lanes + ?.map((lane) => lane.provider_name) + .filter(Boolean) + .join(" -> "); const remediationQueue = telemetry.slo?.adr100?.verification_coverage?.remediation_queue; const remediationTotal = remediationQueue?.total ?? 0; const remediationReadyForAi = remediationQueue?.ready_for_ai ?? 0; @@ -1708,6 +1762,46 @@ function buildWorkItems( ? `/awooop/work-items?project_id=${encodeURIComponent(telemetry.eventRecurrence?.project_id ?? "awoooi")}&work_item_id=${encodeURIComponent(latestRecurrenceOpenItem.work_item.work_item_id)}${latestRecurrenceOpenItem.work_item.incident_id ? 
`&incident_id=${encodeURIComponent(latestRecurrenceOpenItem.work_item.incident_id)}` : ""}` : "/awooop/runs", }, + { + id: "aiRouteRepairWorkItem", + phase: "T178", + status: aiRouteWorkItem?.status === "open" + ? "blocked" + : aiRoute + ? "watching" + : "blocked", + surfaceKey: "runs", + source: "/api/v1/platform/ai-route-status + ai_route_repair", + gateKey: "aiRouteRepairWorkItem", + evidence: t("evidence.aiRouteRepairWorkItem", { + lane: aiRoute?.lane_mode ?? "--", + selected: aiRoute?.selected_provider ?? "--", + target: aiRouteRepairEvidence?.target_resource ?? "--", + blockers: aiRouteBlockers.length, + }), + evidenceDetails: aiRouteRepairEvidence + ? [ + t("evidence.aiRouteRepairWorkItemId", { + id: aiRouteWorkItem?.work_item_id ?? "--", + }), + t("evidence.aiRouteRepairSkipped", { + skipped: aiRouteSkipped || "--", + }), + t("evidence.aiRouteRepairOwner", { + owner: aiRouteOwnerAction?.human_owner ?? aiRouteWorkItem?.owner ?? "--", + lead: aiRouteOwnerAction?.lead_agent ?? "--", + }), + t("evidence.aiRouteRepairPlaybook", { + playbook: aiRoutePlaybook?.playbook_id ?? "--", + steps: aiRoutePlaybook?.steps?.length ?? 0, + }), + t("evidence.aiRouteRepairSafety", { + safe: String(aiRouteOwnerAction?.safe_to_auto_repair ?? false), + }), + ] + : [t("evidence.aiRouteRepairUnavailable")], + href: aiRouteWorkItem?.target_href ?? 
"/awooop/runs", + }, { id: "configDriftFsm", phase: "T64", @@ -4594,6 +4688,7 @@ export default function AwoooPWorkItemsPage() { driftFingerprintState: null, callbackReplies: null, statusChain: null, + aiRouteStatus: null, }); const [loading, setLoading] = useState(true); const [lastUpdated, setLastUpdated] = useState(null); @@ -4617,6 +4712,7 @@ export default function AwoooPWorkItemsPage() { const remediationHistoryUrl = `${API_BASE}/api/v1/ai/slo/remediation/history?limit=80`; const driftFingerprintUrl = `${API_BASE}/api/v1/drift/fingerprints/state?namespace=awoooi-prod`; const callbackRepliesUrl = `${API_BASE}/api/v1/platform/runs/callback-replies?project_id=${encodedProjectId}&per_page=100`; + const aiRouteStatusUrl = `${API_BASE}/api/v1/platform/ai-route-status?workload_type=deep_rca`; const [ quality, @@ -4635,6 +4731,7 @@ export default function AwoooPWorkItemsPage() { remediationHistory, driftFingerprintState, callbackReplies, + aiRouteStatus, ] = await Promise.all([ fetchJson(qualityUrl, 15000), fetchJson(governanceEventsUrl), @@ -4652,6 +4749,7 @@ export default function AwoooPWorkItemsPage() { fetchJson(remediationHistoryUrl), fetchJson(driftFingerprintUrl, 12000), fetchJson(callbackRepliesUrl, 12000), + fetchJson(aiRouteStatusUrl, 12000), ]); const statusChainIncidentId = selectStatusChainIncidentId( @@ -4687,6 +4785,7 @@ export default function AwoooPWorkItemsPage() { driftFingerprintState, callbackReplies, statusChain, + aiRouteStatus, }); setLastUpdated(new Date()); setLoading(false); diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 68f68be5..2ad4ad62 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,44 @@ +## 2026-05-25|T178 AI route repair work item / PlayBook 候選 + +**背景**: + +- T177 已讓 `ai_route_repair / repair_diagnosis` 顯示在 `/api/v1/platform/ai-route-status` 與 AwoooP Runs,但 operator 仍需要更清楚知道:這件事是否形成工作項、由誰接、PlayBook 建議是什麼、是否可安全自動修復。 +- 本段仍不修 GCP-A、不改 route、不建立 Incident / Telegram / Approval;只把既有 DB repair evidence 轉成 read-only 
work item projection,維持低噪音與安全邊界。 + +**本次修復**: + +- `repair_evidence` 新增: + - `work_item`:`ai-route-repair:<target_resource>`,目前 `ollama_gcp_a` 為 open / needs_human=true。 + - `playbook_recommendation`:`ai_route_primary_lane_recovery`,依 live blockers 組出 GCP control plane、OS access、Ollama service、110 proxy、route status verification 等步驟;`safe_to_auto_execute=false`、`requires_approval=true`。 + - `owner_action`:主責 Hermes,OpenClaw / ElephantAlpha 協作,human owner 為 Cloud/SRE owner;狀態為 blocked by external cloud/OS access。 +- AwoooP Work Items 頁新增 T178「AI Provider primary lane 修復工作項」,讀 `/api/v1/platform/ai-route-status` 顯示 lane、selected provider、target、blocker 數、work item id、owner、PlayBook 與安全自動修復判斷。 +- 此 work item 是 read-model projection,不寫 incident state、不寫 auto-repair result、不變更 runtime route。 + +**本地驗證**: + +```text +python3 -m py_compile apps/api/src/services/platform_operator_service.py apps/api/src/api/v1/platform/operator_runs.py -> pass +jq empty apps/web/messages/zh-TW.json apps/web/messages/en.json -> pass +git diff --check -> pass +ruff check platform_operator_service.py + targeted tests --ignore B008 -> pass +pytest targeted ai-route status/evidence tests -> 6 passed +pnpm --dir apps/web exec tsc --noEmit --tsBuildInfoFile /tmp/awoooi-t178-tsconfig.tsbuildinfo -> pass +``` + +**目前整體進度**: + +- AwoooP 告警可觀測鏈:約 99.32%。 +- 低風險自動修復閉環:約 95.8%。 +- 前端 AI 自動化管理介面同步:約 97.6%。 +- Telegram 詳情 / 歷史可解釋性:約 95.5%。 +- Callback evidence / DB replayability:約 96.0%。 +- MCP / 自建 MCP 可見性:約 88%。 +- Sentry / SigNoz source correlation visibility:約 88%。 +- Ansible / PlayBook decision visibility:約 85.2%。 +- KM owner-review / completion governance:約 84%。 +- AI Provider lane 健康與可見性:約 92%(GCP-A runtime 尚未修復;但 repair evidence / work item / PlayBook 候選已可見)。 +- 完整 AI 自動化管理產品化:約 95.2%。 + ## 2026-05-25|T177 AI route repair evidence API / 前端投影 **背景**: