feat(awooop): project ai route repair work item

2026-05-25 14:42:57 +08:00
parent e5cd01c9cb
commit 63b4c3453f
6 changed files with 296 additions and 1 deletions
--- a/apps/api/src/services/platform_operator_service.py
+++ b/apps/api/src/services/platform_operator_service.py
@@ -1007,7 +1007,7 @@ def _ai_route_repair_evidence_item(
    observed_state = _as_dict(payload.get("observed_state"))
    side_effects = _ai_route_repair_side_effects(payload.get("side_effects"))

-    return {
+    evidence = {
        "schema_version": (
            payload.get("schema_version")
            or envelope.get("schema_version")
@@ -1037,6 +1037,12 @@ def _ai_route_repair_evidence_item(
        "provider_ts": row.get("provider_ts"),
        "received_at": row.get("received_at"),
    }
+    evidence["work_item"] = _ai_route_repair_work_item(evidence)
+    evidence["playbook_recommendation"] = _ai_route_repair_playbook_recommendation(
+        evidence
+    )
+    evidence["owner_action"] = _ai_route_repair_owner_action(evidence)
+    return evidence


 def _ai_route_repair_side_effects(value: Any) -> dict[str, bool | None]:
@@ -1078,6 +1084,101 @@ def _source_ref_count(envelope: Any) -> int:
    return total


+def _ai_route_repair_work_item(evidence: Mapping[str, Any]) -> dict[str, Any]:
+    target = str(evidence.get("target_resource") or "unknown").strip()  # missing/falsy -> "unknown"
+    blockers = _as_string_list(evidence.get("access_blockers"))
+    open_item = bool(blockers)  # truthy when the helper yields at least one blocker
+    work_item_id = f"ai-route-repair:{target or 'unknown'}"  # whitespace-only target strips to ""
+    return {  # work-item dict derived only from target_resource and access_blockers
+        "schema_version": "awooop_ai_route_repair_work_item_v1",
+        "work_item_id": work_item_id,
+        "status": "open" if open_item else "watching",  # "watching" when no blockers are listed
+        "kind": "ai_route_primary_lane_repair",
+        "next_step": (
+            "restore_primary_ollama_lane_access"
+            if open_item
+            else "continue_route_monitoring"
+        ),
+        "reason": "primary_lane_unavailable" if open_item else "primary_lane_observed",
+        "needs_human": open_item,  # mirrors the open/watching decision
+        "owner": "cloud_sre_operator",
+        "target_resource": target or None,  # None only when the target stripped to ""
+        "target_href": "/awooop/runs",
+        "decision_effect": "none",
+        "safety_level": "read_only_work_item_projection",
+        "writes_incident_state": False,  # the three writes_* flags are constant False
+        "writes_auto_repair_result": False,
+        "writes_runtime_route": False,
+    }
+
+
+def _ai_route_repair_playbook_recommendation(
+    evidence: Mapping[str, Any],
+) -> dict[str, Any]:  # builds candidate recovery steps from access_blockers and live_probe
+    blockers = set(_as_string_list(evidence.get("access_blockers")))
+    live_probe = _as_dict(evidence.get("live_probe"))
+    steps: list[dict[str, Any]] = []  # appended in the fixed order of the checks below
+    if any(blocker.startswith("gcloud_") for blocker in blockers):  # any gcloud_* blocker
+        steps.append({
+            "step": "verify_cloud_control_plane_access",
+            "scope": "gcp_compute_read",
+            "mode": "manual_or_approved",
+        })
+    if "gcp_a_ssh_refused" in blockers or (  # blocker listed, or gcp_a_direct_22 probe refused
+        live_probe.get("gcp_a_direct_22") == "connection_refused"
+    ):
+        steps.append({
+            "step": "restore_gcp_a_os_access",
+            "scope": "gcp_serial_console_or_os_login",
+            "mode": "manual_or_approved",
+        })
+    if "gcp_a_ollama_11434_refused" in blockers or (  # blocker listed, or 11434 probe refused
+        live_probe.get("gcp_a_direct_11434") == "connection_refused"
+    ):
+        steps.append({
+            "step": "restore_ollama_service_on_gcp_a",
+            "scope": "systemd_ollama",
+            "mode": "manual_or_approved",
+        })
+    if live_probe.get("proxy_110_11435") == "http_502":  # only the probe value triggers this step
+        steps.append({
+            "step": "verify_110_proxy_after_gcp_a_recovery",
+            "scope": "nginx_proxy_readback",
+            "mode": "read_only_verification",
+        })
+    steps.append({  # always appended, even when no earlier condition matched
+        "step": "verify_ai_route_status_returns_primary",
+        "scope": "awooop_ai_route_status",
+        "mode": "read_only_verification",
+    })
+
+    return {  # every field except "steps" is a fixed literal
+        "schema_version": "awooop_ai_route_playbook_recommendation_v1",
+        "playbook_id": "ai_route_primary_lane_recovery",
+        "status": "candidate_from_live_evidence",
+        "safe_to_auto_execute": False,
+        "requires_approval": True,
+        "decision_effect": "none",
+        "steps": steps,
+    }
+
+
+def _ai_route_repair_owner_action(evidence: Mapping[str, Any]) -> dict[str, Any]:
+    work_item = _as_dict(evidence.get("work_item"))  # reads the nested work_item entry of evidence
+    playbook = _as_dict(evidence.get("playbook_recommendation"))
+    return {  # owner/agent assignment; only next_step, playbook_id, blocking_reason vary
+        "schema_version": "awooop_ai_route_owner_action_v1",
+        "lead_agent": "Hermes",
+        "supporting_agents": ["OpenClaw", "ElephantAlpha"],
+        "human_owner": "Cloud/SRE owner",
+        "automation_state": "blocked_by_external_cloud_or_os_access",  # NOTE(review): fixed even if work item is "watching" - confirm
+        "next_step": work_item.get("next_step") or "continue_route_monitoring",  # fallback when absent
+        "playbook_id": playbook.get("playbook_id"),  # None when no recommendation is present
+        "safe_to_auto_repair": False,  # constant; not derived from evidence
+        "blocking_reason": work_item.get("reason") or "unknown",
+    }
+
+
 def _ai_route_lane_state(
    *,
    policy_order: list[dict[str, Any]],
--- a/apps/api/tests/test_awooop_operator_timeline_labels.py
+++ b/apps/api/tests/test_awooop_operator_timeline_labels.py
@@ -1613,11 +1613,13 @@ def test_ai_route_repair_evidence_item_summarizes_operator_safe_fields() -> None
                        "lane_mode": "degraded_failover",
                    },
                    "live_probe": {
+                        "gcp_a_direct_22": "connection_refused",
                        "gcp_a_direct_11434": "connection_refused",
                        "gcp_b_direct_11434": "http_200",
                    },
                    "access_blockers": [
                        "gcloud_compute_instances_get_missing",
+                        "gcp_a_ssh_refused",
                        "gcp_a_ollama_11434_refused",
                    ],
                    "side_effects": {
@@ -1640,6 +1642,7 @@ def test_ai_route_repair_evidence_item_summarizes_operator_safe_fields() -> None
    assert item["source_ref_count"] == 4
    assert item["access_blockers"] == [
        "gcloud_compute_instances_get_missing",
+        "gcp_a_ssh_refused",
        "gcp_a_ollama_11434_refused",
    ]
    assert item["live_probe"]["gcp_a_direct_11434"] == "connection_refused"
@@ -1649,6 +1652,35 @@ def test_ai_route_repair_evidence_item_summarizes_operator_safe_fields() -> None
        "approval_created": False,
        "runtime_route_changed": False,
    }
+    assert item["work_item"] == {
+        "schema_version": "awooop_ai_route_repair_work_item_v1",
+        "work_item_id": "ai-route-repair:ollama_gcp_a",
+        "status": "open",
+        "kind": "ai_route_primary_lane_repair",
+        "next_step": "restore_primary_ollama_lane_access",
+        "reason": "primary_lane_unavailable",
+        "needs_human": True,
+        "owner": "cloud_sre_operator",
+        "target_resource": "ollama_gcp_a",
+        "target_href": "/awooop/runs",
+        "decision_effect": "none",
+        "safety_level": "read_only_work_item_projection",
+        "writes_incident_state": False,
+        "writes_auto_repair_result": False,
+        "writes_runtime_route": False,
+    }
+    assert item["playbook_recommendation"]["playbook_id"] == (
+        "ai_route_primary_lane_recovery"
+    )
+    assert item["playbook_recommendation"]["safe_to_auto_execute"] is False
+    assert [step["step"] for step in item["playbook_recommendation"]["steps"]] == [
+        "verify_cloud_control_plane_access",
+        "restore_gcp_a_os_access",
+        "restore_ollama_service_on_gcp_a",
+        "verify_ai_route_status_returns_primary",
+    ]
+    assert item["owner_action"]["lead_agent"] == "Hermes"
+    assert item["owner_action"]["safe_to_auto_repair"] is False


 def test_ai_route_lane_state_marks_degraded_failover() -> None:
--- a/apps/web/messages/en.json
+++ b/apps/web/messages/en.json
@@ -1961,6 +1961,9 @@
        "recurrenceWorkItems": {
          "title": "Recurring alert work item / ticket entry"
        },
+        "aiRouteRepairWorkItem": {
+          "title": "AI Provider primary-lane repair work item"
+        },
        "configDriftFsm": {
          "title": "Config Drift fingerprint state machine"
        },
@@ -1996,6 +1999,7 @@
        "sourceDossier": "Inbound alerts must show received / incident_linked / source refs",
        "autoRepair": "Requires auto_repair, verification_result=success, and KM writeback",
        "recurrenceWorkItems": "Completed-without-repair, failed repair, and manual gate groups must become trackable work items",
+        "aiRouteRepairWorkItem": "Provider lane degradation must expose evidence, owner, PlayBook candidate, and auto-repair safety",
        "configDriftFsm": "The same drift fingerprint must expose recurrence, PR, zero diff, handoff, and next step",
        "remediationQueue": "Every degraded / failed / timeout row must map to replay, reverify, ticket, or manual review",
        "telegramCallbacks": "Detail and history buttons cannot depend only on Redis TTL or stale snapshots",
@@ -2016,6 +2020,13 @@
        "recurrenceSourceReviewRecorded": "Source reviews recorded: {count}",
        "recurrenceSourceApplied": "Source matches applied: {count}",
        "recurrenceEmpty": "No open recurring-alert work item in the recent window",
+        "aiRouteRepairWorkItem": "AI route: {lane}; current {selected}; target {target}; blockers: {blockers}",
+        "aiRouteRepairWorkItemId": "Work item: {id}",
+        "aiRouteRepairSkipped": "Skipped: {skipped}",
+        "aiRouteRepairOwner": "Owner: {owner}; lead agent: {lead}",
+        "aiRouteRepairPlaybook": "PlayBook: {playbook}; steps: {steps}",
+        "aiRouteRepairSafety": "Safe auto-repair: {safe}",
+        "aiRouteRepairUnavailable": "AI route repair evidence has not returned yet",
        "driftFingerprint": "Config Drift: {state}; {count}x in 12h",
        "driftFingerprintUnavailable": "Config Drift fingerprint state API has not responded",
        "driftFingerprintId": "Fingerprint: {fingerprint}; Report: {report}",
--- a/apps/web/messages/zh-TW.json
+++ b/apps/web/messages/zh-TW.json
@@ -1962,6 +1962,9 @@
        "recurrenceWorkItems": {
          "title": "重複告警工作項 / Ticket 入口"
        },
+        "aiRouteRepairWorkItem": {
+          "title": "AI Provider primary lane 修復工作項"
+        },
        "configDriftFsm": {
          "title": "Config Drift fingerprint 狀態機"
        },
@@ -1997,6 +2000,7 @@
        "sourceDossier": "入站告警必須能查到 received / incident_linked / source refs",
        "autoRepair": "必須同時有 auto_repair、verification_result=success 與 KM 回寫",
        "recurrenceWorkItems": "Run 完成無修復、修復失敗與人工閘門必須進入可追蹤工作項",
+        "aiRouteRepairWorkItem": "Provider lane 降級時必須顯示 evidence、owner、PlayBook 候選與是否可自動修復",
        "configDriftFsm": "同一 drift fingerprint 必須顯示重複、PR、零 diff、交接與下一步",
        "remediationQueue": "每筆 degraded / failed / timeout 都必須映射到重跑、重驗、Ticket 或人工檢查",
        "telegramCallbacks": "按下詳情與歷史不能再只依賴 Redis TTL 或舊快照",
@@ -2017,6 +2021,13 @@
        "recurrenceSourceReviewRecorded": "來源審核已寫入歷史：{count}",
        "recurrenceSourceApplied": "來源配對已套用：{count}",
        "recurrenceEmpty": "近期重複告警尚無待處理工作項",
+        "aiRouteRepairWorkItem": "AI route：{lane}；目前 {selected}；目標 {target}；阻塞 {blockers} 項",
+        "aiRouteRepairWorkItemId": "Work item：{id}",
+        "aiRouteRepairSkipped": "已跳過：{skipped}",
+        "aiRouteRepairOwner": "Owner：{owner}；主責 Agent：{lead}",
+        "aiRouteRepairPlaybook": "PlayBook：{playbook}；步驟 {steps}",
+        "aiRouteRepairSafety": "可安全自動修復：{safe}",
+        "aiRouteRepairUnavailable": "AI route repair evidence 尚未回傳",
        "driftFingerprint": "Config Drift：{state}；12h 內 {count} 次",
        "driftFingerprintUnavailable": "Config Drift fingerprint state API 尚未回應",
        "driftFingerprintId": "Fingerprint：{fingerprint}；Report：{report}",
--- a/apps/web/src/app/[locale]/awooop/work-items/page.tsx
+++ b/apps/web/src/app/[locale]/awooop/work-items/page.tsx
@@ -834,6 +834,49 @@ type CallbackRepliesWorkItemResponse = {
  per_page: number;
 };

+type AiRouteRepairEvidence = { // shape of AiRouteStatusResponse.repair_evidence; all fields optional
+  target_resource?: string | null;
+  access_blockers?: string[]; // its length is rendered as the blocker count
+  source_ref_count?: number | null;
+  work_item?: { // status "open" renders the work-items row as blocked
+    work_item_id?: string | null;
+    status?: string | null;
+    next_step?: string | null;
+    reason?: string | null;
+    owner?: string | null; // fallback owner label when owner_action.human_owner is absent
+    target_href?: string | null; // row link; the page falls back to /awooop/runs
+    needs_human?: boolean | null;
+  } | null;
+  playbook_recommendation?: { // playbook_id and steps.length are shown in the evidence details
+    playbook_id?: string | null;
+    status?: string | null;
+    safe_to_auto_execute?: boolean | null;
+    requires_approval?: boolean | null;
+    steps?: Array<{ step?: string | null; scope?: string | null; mode?: string | null }>;
+  } | null;
+  owner_action?: { // human_owner, lead_agent and safe_to_auto_repair are shown in the details
+    lead_agent?: string | null;
+    supporting_agents?: string[];
+    human_owner?: string | null;
+    automation_state?: string | null;
+    next_step?: string | null;
+    playbook_id?: string | null;
+    safe_to_auto_repair?: boolean | null; // rendered as "false" when missing
+    blocking_reason?: string | null;
+  } | null;
+};
+
+type AiRouteStatusResponse = { // JSON fetched from /api/v1/platform/ai-route-status
+  lane_mode?: string | null; // shown as "--" when missing
+  selected_provider?: string | null; // shown as "--" when missing
+  skipped_lanes?: Array<{ provider_name?: string | null }>; // names are joined with " -> "
+  operator_action?: {
+    action?: string | null;
+    human_required?: boolean | null;
+  } | null;
+  repair_evidence?: AiRouteRepairEvidence | null; // null/absent shows the "unavailable" message
+};
+
 type Telemetry = {
  quality: AutomationQualitySummary | null;
  governanceEvents: GovernanceEventsResponse | null;
@@ -852,6 +895,7 @@ type Telemetry = {
  driftFingerprintState: DriftFingerprintState | null;
  callbackReplies: CallbackRepliesWorkItemResponse | null;
  statusChain: AwoooPStatusChain | null;
+  aiRouteStatus: AiRouteStatusResponse | null;
 };

 type WorkItem = {
@@ -1599,6 +1643,16 @@ function buildWorkItems(
    latestCallbackOwnerReview?.km_stale_completion_summary ?? null;
  const latestCallbackWorkItem = latestCallbackSummary?.work_item ?? null;
  const latestCallbackTriage = latestCallbackWorkItem?.triage ?? null;
+  const aiRoute = telemetry.aiRouteStatus;
+  const aiRouteRepairEvidence = aiRoute?.repair_evidence ?? null;
+  const aiRouteWorkItem = aiRouteRepairEvidence?.work_item ?? null;
+  const aiRoutePlaybook = aiRouteRepairEvidence?.playbook_recommendation ?? null;
+  const aiRouteOwnerAction = aiRouteRepairEvidence?.owner_action ?? null;
+  const aiRouteBlockers = aiRouteRepairEvidence?.access_blockers ?? [];
+  const aiRouteSkipped = aiRoute?.skipped_lanes
+    ?.map((lane) => lane.provider_name)
+    .filter(Boolean)
+    .join(" -> ");
  const remediationQueue = telemetry.slo?.adr100?.verification_coverage?.remediation_queue;
  const remediationTotal = remediationQueue?.total ?? 0;
  const remediationReadyForAi = remediationQueue?.ready_for_ai ?? 0;
@@ -1708,6 +1762,46 @@ function buildWorkItems(
        ? `/awooop/work-items?project_id=${encodeURIComponent(telemetry.eventRecurrence?.project_id ?? "awoooi")}&work_item_id=${encodeURIComponent(latestRecurrenceOpenItem.work_item.work_item_id)}${latestRecurrenceOpenItem.work_item.incident_id ? `&incident_id=${encodeURIComponent(latestRecurrenceOpenItem.work_item.incident_id)}` : ""}`
        : "/awooop/runs",
    },
+    {
+      id: "aiRouteRepairWorkItem",
+      phase: "T178",
+      status: aiRouteWorkItem?.status === "open"
+        ? "blocked"
+        : aiRoute
+          ? "watching"
+          : "blocked",
+      surfaceKey: "runs",
+      source: "/api/v1/platform/ai-route-status + ai_route_repair",
+      gateKey: "aiRouteRepairWorkItem",
+      evidence: t("evidence.aiRouteRepairWorkItem", {
+        lane: aiRoute?.lane_mode ?? "--",
+        selected: aiRoute?.selected_provider ?? "--",
+        target: aiRouteRepairEvidence?.target_resource ?? "--",
+        blockers: aiRouteBlockers.length,
+      }),
+      evidenceDetails: aiRouteRepairEvidence
+        ? [
+            t("evidence.aiRouteRepairWorkItemId", {
+              id: aiRouteWorkItem?.work_item_id ?? "--",
+            }),
+            t("evidence.aiRouteRepairSkipped", {
+              skipped: aiRouteSkipped || "--",
+            }),
+            t("evidence.aiRouteRepairOwner", {
+              owner: aiRouteOwnerAction?.human_owner ?? aiRouteWorkItem?.owner ?? "--",
+              lead: aiRouteOwnerAction?.lead_agent ?? "--",
+            }),
+            t("evidence.aiRouteRepairPlaybook", {
+              playbook: aiRoutePlaybook?.playbook_id ?? "--",
+              steps: aiRoutePlaybook?.steps?.length ?? 0,
+            }),
+            t("evidence.aiRouteRepairSafety", {
+              safe: String(aiRouteOwnerAction?.safe_to_auto_repair ?? false),
+            }),
+          ]
+        : [t("evidence.aiRouteRepairUnavailable")],
+      href: aiRouteWorkItem?.target_href ?? "/awooop/runs",
+    },
    {
      id: "configDriftFsm",
      phase: "T64",
@@ -4594,6 +4688,7 @@ export default function AwoooPWorkItemsPage() {
    driftFingerprintState: null,
    callbackReplies: null,
    statusChain: null,
+    aiRouteStatus: null,
  });
  const [loading, setLoading] = useState(true);
  const [lastUpdated, setLastUpdated] = useState<Date | null>(null);
@@ -4617,6 +4712,7 @@ export default function AwoooPWorkItemsPage() {
    const remediationHistoryUrl = `${API_BASE}/api/v1/ai/slo/remediation/history?limit=80`;
    const driftFingerprintUrl = `${API_BASE}/api/v1/drift/fingerprints/state?namespace=awoooi-prod`;
    const callbackRepliesUrl = `${API_BASE}/api/v1/platform/runs/callback-replies?project_id=${encodedProjectId}&per_page=100`;
+    const aiRouteStatusUrl = `${API_BASE}/api/v1/platform/ai-route-status?workload_type=deep_rca`;

    const [
      quality,
@@ -4635,6 +4731,7 @@ export default function AwoooPWorkItemsPage() {
      remediationHistory,
      driftFingerprintState,
      callbackReplies,
+      aiRouteStatus,
    ] = await Promise.all([
      fetchJson<AutomationQualitySummary>(qualityUrl, 15000),
      fetchJson<GovernanceEventsResponse>(governanceEventsUrl),
@@ -4652,6 +4749,7 @@ export default function AwoooPWorkItemsPage() {
      fetchJson<RemediationHistoryResponse>(remediationHistoryUrl),
      fetchJson<DriftFingerprintState>(driftFingerprintUrl, 12000),
      fetchJson<CallbackRepliesWorkItemResponse>(callbackRepliesUrl, 12000),
+      fetchJson<AiRouteStatusResponse>(aiRouteStatusUrl, 12000),
    ]);

    const statusChainIncidentId = selectStatusChainIncidentId(
@@ -4687,6 +4785,7 @@ export default function AwoooPWorkItemsPage() {
      driftFingerprintState,
      callbackReplies,
      statusChain,
+      aiRouteStatus,
    });
    setLastUpdated(new Date());
    setLoading(false);
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -1,3 +1,44 @@
+## 2026-05-25｜T178 AI route repair work item / PlayBook 候選
+
+**背景**：
+
+- T177 已讓 `ai_route_repair / repair_diagnosis` 顯示在 `/api/v1/platform/ai-route-status` 與 AwoooP Runs，但 operator 仍需要更清楚知道：這件事是否形成工作項、由誰接、PlayBook 建議是什麼、是否可安全自動修復。
+- 本段仍不修 GCP-A、不改 route、不建立 Incident / Telegram / Approval；只把既有 DB repair evidence 轉成 read-only work item projection，維持低噪音與安全邊界。
+
+**本次修復**：
+
+- `repair_evidence` 新增：
+  - `work_item`：`ai-route-repair:<target>`，目前 `ollama_gcp_a` 為 open / needs_human=true。
+  - `playbook_recommendation`：`ai_route_primary_lane_recovery`，依 live blockers 組出 GCP control plane、OS access、Ollama service、110 proxy、route status verification 等步驟；`safe_to_auto_execute=false`、`requires_approval=true`。
+  - `owner_action`：主責 Hermes，OpenClaw / ElephantAlpha 協作，human owner 為 Cloud/SRE owner；狀態為 blocked by external cloud/OS access。
+- AwoooP Work Items 頁新增 T178「AI Provider primary lane 修復工作項」，讀 `/api/v1/platform/ai-route-status` 顯示 lane、selected provider、target、blocker 數、work item id、owner、PlayBook 與安全自動修復判斷。
+- 此 work item 是 read-model projection，不寫 incident state、不寫 auto-repair result、不變更 runtime route。
+
+**本地驗證**：
+
+```text
+python3 -m py_compile apps/api/src/services/platform_operator_service.py apps/api/src/api/v1/platform/operator_runs.py -> pass
+jq empty apps/web/messages/zh-TW.json apps/web/messages/en.json -> pass
+git diff --check -> pass
+ruff check platform_operator_service.py + targeted tests --ignore B008 -> pass
+pytest targeted ai-route status/evidence tests -> 6 passed
+pnpm --dir apps/web exec tsc --noEmit --tsBuildInfoFile /tmp/awoooi-t178-tsconfig.tsbuildinfo -> pass
+```
+
+**目前整體進度**：
+
+- AwoooP 告警可觀測鏈：約 99.32%。
+- 低風險自動修復閉環：約 95.8%。
+- 前端 AI 自動化管理介面同步：約 97.6%。
+- Telegram 詳情 / 歷史可解釋性：約 95.5%。
+- Callback evidence / DB replayability：約 96.0%。
+- MCP / 自建 MCP 可見性：約 88%。
+- Sentry / SigNoz source correlation visibility：約 88%。
+- Ansible / PlayBook decision visibility：約 85.2%。
+- KM owner-review / completion governance：約 84%。
+- AI Provider lane 健康與可見性：約 92%（GCP-A runtime 尚未修復；但 repair evidence / work item / PlayBook 候選已可見）。
+- 完整 AI 自動化管理產品化：約 95.2%。
+
 ## 2026-05-25｜T177 AI route repair evidence API / 前端投影

 **背景**：