From 67296746c06eea142fdcad8428fb90562480f433 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 25 May 2026 14:21:25 +0800 Subject: [PATCH] feat(awooop): surface ai route repair evidence --- apps/api/src/api/v1/platform/operator_runs.py | 1 + .../src/services/platform_operator_service.py | 213 +++++++++++++++++- .../test_awooop_operator_timeline_labels.py | 98 ++++++++ apps/web/messages/en.json | 45 +++- apps/web/messages/zh-TW.json | 45 +++- .../web/src/app/[locale]/awooop/runs/page.tsx | 188 ++++++++++++++++ .../dashboard/automation-evidence-card.tsx | 16 +- docs/LOGBOOK.md | 43 ++++ 8 files changed, 642 insertions(+), 7 deletions(-) diff --git a/apps/api/src/api/v1/platform/operator_runs.py b/apps/api/src/api/v1/platform/operator_runs.py index 4fed4506..088aa835 100644 --- a/apps/api/src/api/v1/platform/operator_runs.py +++ b/apps/api/src/api/v1/platform/operator_runs.py @@ -151,6 +151,7 @@ class AiRouteStatusResponse(BaseModel): active_lane: dict[str, Any] | None = None skipped_lanes: list[dict[str, Any]] = Field(default_factory=list) operator_action: dict[str, Any] | None = None + repair_evidence: dict[str, Any] | None = None checked_at: datetime diff --git a/apps/api/src/services/platform_operator_service.py b/apps/api/src/services/platform_operator_service.py index 67562f65..27ea408c 100644 --- a/apps/api/src/services/platform_operator_service.py +++ b/apps/api/src/services/platform_operator_service.py @@ -99,6 +99,8 @@ _CICD_STATUS_FILTERS = {"running", "success", "failed", "pending"} _CICD_STAGE_RE = re.compile(r"^[a-z0-9_:-]{1,64}$", re.IGNORECASE) _AI_ROUTE_STATUS_SCHEMA_VERSION = "awooop_ai_route_status_v1" _AI_ROUTE_WORKLOADS = set(get_args(OllamaWorkloadType)) +_AI_ROUTE_REPAIR_EVIDENCE_PROVIDER = "ai_route_repair" +_AI_ROUTE_REPAIR_EVIDENCE_STAGE = "repair_diagnosis" _SOURCE_CORRELATION_SCHEMA_VERSION = "source_provider_correlation_v1" _SOURCE_CORRELATION_PROVIDERS = ("sentry", "signoz") _SOURCE_CORRELATION_EVENT_LIMIT = 200 @@ -642,7 +644,7 @@ async def get_ai_route_status( selected_provider=route.primary.provider_name, health=health, )) - return response + return await _ai_route_response_with_repair_evidence(response) def _validate_ai_route_workload(workload_type: str | None) -> OllamaWorkloadType: @@ -696,7 +698,7 @@ async def _ai_route_lightweight_status_from_policy( route_reason=route_reason, error=str(exc), ) - return _ai_route_unavailable_status( + response = _ai_route_unavailable_status( workload=workload, policy_order=policy_order, checked_at=checked_at, @@ -704,6 +706,7 @@ async def _ai_route_lightweight_status_from_policy( route_error=route_error, route_source="ollama_failover_manager", ) + return await _ai_route_response_with_repair_evidence(response) health_by_provider = { endpoint.provider_name: _ai_route_health_item(report) @@ -741,7 +744,7 @@ async def _ai_route_lightweight_status_from_policy( selected_provider="gemini", health=health_by_provider, )) - return response + return await _ai_route_response_with_repair_evidence(response) selected = endpoints[selected_index] model = get_settings().OLLAMA_HEALTH_CHECK_MODEL @@ -783,7 +786,7 @@ async def _ai_route_lightweight_status_from_policy( selected_provider=selected.provider_name, health=health_by_provider, )) - return response + return await _ai_route_response_with_repair_evidence(response) async def _ai_route_probe_connectivity( @@ -873,6 +876,208 @@ def _ai_route_unavailable_status( return response +async def _ai_route_response_with_repair_evidence( + response: dict[str, Any], +) -> dict[str, Any]: + """Attach latest read-only repair dossier evidence when a lane is degraded.""" + response["repair_evidence"] = None + if response.get("lane_mode") not in { + "degraded_failover", + "cloud_fallback", + "unavailable", + }: + return response + + target_provider = _ai_route_repair_evidence_target(response) + response["repair_evidence"] = await _latest_ai_route_repair_evidence( + target_provider=target_provider, + ) + return response + + +def _ai_route_repair_evidence_target(response: Mapping[str, Any]) -> str | None: + skipped_lanes = response.get("skipped_lanes") + if isinstance(skipped_lanes, list): + for lane in skipped_lanes: + if not isinstance(lane, dict): + continue + provider_name = str(lane.get("provider_name") or "").strip() + if provider_name and lane.get("action_required") is True: + return provider_name + for lane in skipped_lanes: + if isinstance(lane, dict): + provider_name = str(lane.get("provider_name") or "").strip() + if provider_name: + return provider_name + policy_order = response.get("policy_order") + if isinstance(policy_order, list): + for item in policy_order: + if not isinstance(item, dict): + continue + if item.get("runtime") == "ollama": + provider_name = str(item.get("provider_name") or "").strip() + if provider_name: + return provider_name + return None + + +async def _latest_ai_route_repair_evidence( + *, + project_id: str = "awoooi", + target_provider: str | None = None, +) -> dict[str, Any] | None: + """Fetch the newest AI route repair diagnosis stored in AwoooP event DB.""" + params: dict[str, Any] = { + "project_id": project_id, + "provider": _AI_ROUTE_REPAIR_EVIDENCE_PROVIDER, + "stage": _AI_ROUTE_REPAIR_EVIDENCE_STAGE, + } + target_clause = "" + if target_provider: + target_clause = """ + AND COALESCE( + NULLIF(source_envelope #>> '{log_correlation,target_resource}', ''), + NULLIF(source_envelope #>> '{extra,payload,target_resource}', '') + ) = :target_provider + """ + params["target_provider"] = target_provider + + try: + item = await _fetch_latest_ai_route_repair_evidence( + params=params, + target_clause=target_clause, + ) + if item is None and target_provider: + params.pop("target_provider", None) + item = await _fetch_latest_ai_route_repair_evidence( + params=params, + target_clause="", + ) + return item + except Exception as exc: + logger.warning( + "ai_route_repair_evidence_fetch_failed", + project_id=project_id, + target_provider=target_provider, + error=str(exc), + ) + return None + + +async def _fetch_latest_ai_route_repair_evidence( + *, + params: dict[str, Any], + target_clause: str, +) -> dict[str, Any] | None: + sql = text(f""" + SELECT + event_id, + run_id, + provider_event_id, + source_envelope, + provider_ts, + received_at + FROM awooop_conversation_event + WHERE project_id = :project_id + AND LOWER(COALESCE( + NULLIF(source_envelope->>'provider', ''), + NULLIF(split_part(provider_event_id, ':', 1), ''), + channel_type + )) = :provider + AND LOWER(COALESCE(NULLIF(source_envelope->>'stage', ''), 'received')) = :stage + {target_clause} + ORDER BY received_at DESC, event_id DESC + LIMIT 1 + """) + async with get_db_context("awoooi") as db: + result = await db.execute(sql, params) + row = result.mappings().first() + return _ai_route_repair_evidence_item(row) if row else None + + +def _ai_route_repair_evidence_item( + row: Mapping[str, Any], +) -> dict[str, Any]: + """Project route-repair source envelopes into a compact operator-safe view.""" + envelope = _as_dict(row.get("source_envelope")) + extra = _as_dict(envelope.get("extra")) + payload = _as_dict(extra.get("payload")) + log_correlation = _as_dict(envelope.get("log_correlation")) + live_probe = _as_dict(payload.get("live_probe")) + observed_state = _as_dict(payload.get("observed_state")) + side_effects = _ai_route_repair_side_effects(payload.get("side_effects")) + + return { + "schema_version": ( + payload.get("schema_version") + or envelope.get("schema_version") + or "ai_route_repair_evidence_projection_v1" + ), + "provider": ( + envelope.get("provider") + or str(row.get("provider_event_id") or "").split(":", 1)[0] + ), + "stage": envelope.get("stage") or _AI_ROUTE_REPAIR_EVIDENCE_STAGE, + "provider_event_id": row.get("provider_event_id"), + "conversation_event_id": _string_or_none(row.get("event_id")), + "run_id": _string_or_none(row.get("run_id")), + "alertname": log_correlation.get("alertname"), + "severity": log_correlation.get("severity"), + "fingerprint": log_correlation.get("fingerprint"), + "target_resource": ( + log_correlation.get("target_resource") + or payload.get("target_resource") + or observed_state.get("target_resource") + ), + "observed_state": observed_state, + "live_probe": live_probe, + "access_blockers": _as_string_list(payload.get("access_blockers")), + "side_effects": side_effects, + "source_ref_count": _source_ref_count(envelope), + "provider_ts": row.get("provider_ts"), + "received_at": row.get("received_at"), + } + + +def _ai_route_repair_side_effects(value: Any) -> dict[str, bool | None]: + raw = _as_dict(value) + return { + "incident_created": _bool_or_none(raw.get("incident_created")), + "telegram_sent": _bool_or_none(raw.get("telegram_sent")), + "approval_created": _bool_or_none(raw.get("approval_created")), + "runtime_route_changed": _bool_or_none(raw.get("runtime_route_changed")), + } + + +def _as_string_list(value: Any) -> list[str]: + if isinstance(value, list): + return [str(item) for item in value if str(item or "").strip()] + if value not in (None, ""): + return [str(value)] + return [] + + +def _string_or_none(value: Any) -> str | None: + if value in (None, ""): + return None + return str(value) + + +def _bool_or_none(value: Any) -> bool | None: + return value if isinstance(value, bool) else None + + +def _source_ref_count(envelope: Any) -> int: + source_refs = _as_dict(_as_dict(envelope).get("source_refs")) + total = 0 + for value in source_refs.values(): + if isinstance(value, list): + total += len([item for item in value if str(item or "").strip()]) + elif value not in (None, ""): + total += 1 + return total + + def _ai_route_lane_state( *, policy_order: list[dict[str, Any]], diff --git a/apps/api/tests/test_awooop_operator_timeline_labels.py b/apps/api/tests/test_awooop_operator_timeline_labels.py index 77455926..ab4eda3b 100644 --- a/apps/api/tests/test_awooop_operator_timeline_labels.py +++ b/apps/api/tests/test_awooop_operator_timeline_labels.py @@ -21,6 +21,7 @@ from src.services.platform_operator_service import ( _ai_route_health_map, _ai_route_lane_state, _ai_route_policy_order, + _ai_route_repair_evidence_item, _build_awooop_status_chain, _callback_reply_event_item, _callback_reply_summary_matches_status, @@ -1562,6 +1563,12 @@ def test_ai_route_status_response_preserves_route_fields() -> None: "action": "monitor", "reason": "primary_lane_active", }, + "repair_evidence": { + "provider": "ai_route_repair", + "stage": "repair_diagnosis", + "target_resource": "ollama_gcp_a", + "access_blockers": ["gcloud_compute_instances_get_missing"], + }, "checked_at": datetime(2026, 5, 19, 12, 0, 0), }) @@ -1569,6 +1576,79 @@ def test_ai_route_status_response_preserves_route_fields() -> None: assert dumped["policy_order"][-1]["provider_name"] == "gemini" assert dumped["selected_provider"] == "ollama_gcp_a" assert dumped["lane_mode"] == "primary" + assert dumped["repair_evidence"]["target_resource"] == "ollama_gcp_a" + + +def test_ai_route_repair_evidence_item_summarizes_operator_safe_fields() -> None: + row = { + "event_id": UUID("dff309f0-f159-4537-8f58-47714ce94dca"), + "run_id": UUID("ca67ebcc-a24f-53e7-9505-2db15d855ecc"), + "provider_event_id": ( + "ai_route_repair:repair_diagnosis:" + "gcp-a-primary-lane-down-20260525T060415Z" + ), + "provider_ts": datetime(2026, 5, 25, 6, 4, 15), + "received_at": datetime(2026, 5, 25, 6, 5, 3), + "source_envelope": { + "provider": "ai_route_repair", + "stage": "repair_diagnosis", + "log_correlation": { + "alertname": "GcpAPrimaryLaneDown", + "severity": "warning", + "target_resource": "ollama_gcp_a", + "fingerprint": "ai-route-gcp-a-primary-down", + }, + "source_refs": { + "alert_ids": ["gcp-a-primary-lane-down"], + "signoz_alerts": ["signoz:gcp-a"], + "sentry_issue_ids": [], + "fingerprints": ["ai-route-gcp-a-primary-down"], + "run_ids": ["ca67ebcc-a24f-53e7-9505-2db15d855ecc"], + }, + "extra": { + "payload": { + "schema_version": "ai_route_repair_diagnosis_v1", + "observed_state": { + "target_resource": "ollama_gcp_a", + "lane_mode": "degraded_failover", + }, + "live_probe": { + "gcp_a_direct_11434": "connection_refused", + "gcp_b_direct_11434": "http_200", + }, + "access_blockers": [ + "gcloud_compute_instances_get_missing", + "gcp_a_ollama_11434_refused", + ], + "side_effects": { + "incident_created": False, + "telegram_sent": False, + "approval_created": False, + "runtime_route_changed": False, + }, + }, + }, + }, + } + + item = _ai_route_repair_evidence_item(row) + + assert item["provider"] == "ai_route_repair" + assert item["stage"] == "repair_diagnosis" + assert item["target_resource"] == "ollama_gcp_a" + assert item["run_id"] == "ca67ebcc-a24f-53e7-9505-2db15d855ecc" + assert item["source_ref_count"] == 4 + assert item["access_blockers"] == [ + "gcloud_compute_instances_get_missing", + "gcp_a_ollama_11434_refused", + ] + assert item["live_probe"]["gcp_a_direct_11434"] == "connection_refused" + assert item["side_effects"] == { + "incident_created": False, + "telegram_sent": False, + "approval_created": False, + "runtime_route_changed": False, + } def test_ai_route_lane_state_marks_degraded_failover() -> None: @@ -1669,6 +1749,15 @@ async def test_ai_route_status_times_out_before_slow_provider_checks(monkeypatch fake_connectivity, ) + async def no_repair_evidence(**_kwargs): + return None + + monkeypatch.setattr( + platform_operator_service, + "_latest_ai_route_repair_evidence", + no_repair_evidence, + ) + response = await platform_operator_service.get_ai_route_status("deep_rca") assert response["route_reason"] == ( @@ -1722,6 +1811,15 @@ async def test_ai_route_status_lightweight_fallback_keeps_gemini_policy_only( fake_offline_connectivity, ) + async def no_repair_evidence(**_kwargs): + return None + + monkeypatch.setattr( + platform_operator_service, + "_latest_ai_route_repair_evidence", + no_repair_evidence, + ) + response = await platform_operator_service.get_ai_route_status("deep_rca") assert response["selected_provider"] == "gemini" diff --git a/apps/web/messages/en.json b/apps/web/messages/en.json index 94a009d9..b1ae67bf 100644 --- a/apps/web/messages/en.json +++ b/apps/web/messages/en.json @@ -261,6 +261,7 @@ "modelRoute": "Model route", "routeDetail": "{model}; current {selected}; {primary}={primaryStatus}; fallback {fallback}", "routeLaneDetail": "{mode}; skipped {skipped}", + "routeRepairDetail": "Repair evidence: {target}, blockers {blockers}, {sourceRefs} source refs", "routeReasonSeparator": "; ", "routeReason": "Reason: {reason}", "routeErrorDetail": "Route check failed: {error}", @@ -2924,7 +2925,49 @@ "inspect_ai_router": "Inspect AI Router / provider status", "unknown": "Confirm next action" }, - "degradedSummary": "Current handoff is {active}; skipped {skipped}; next action: {action}" + "degradedSummary": "Current handoff is {active}; skipped {skipped}; next action: {action}", + "repairEvidence": { + "title": "Latest repair diagnosis evidence", + "meta": "Event info", + "target": "Target: {target}", + "run": "Run: {run}", + "receivedAt": "Stored: {time}", + "sourceRefs": "{count} source refs", + "blockerTitle": "Current blockers", + "sideEffectTitle": "Side-effect check", + "sideEffectSeparator": ": ", + "emptyValue": "--", + "values": { + "yes": "yes", + "no": "no", + "unknown": "not reported" + }, + "sideEffects": { + "incident_created": "Incident created", + "telegram_sent": "Telegram sent", + "approval_created": "Approval created", + "runtime_route_changed": "Runtime route changed" + }, + "blockers": { + "gcloud_compute_instances_get_missing": "Missing GCP instance get permission", + "gcloud_compute_instances_list_missing": "Missing GCP instance list permission", + "gcloud_projects_get_iam_policy_missing": "Missing GCP IAM read permission", + "gcp_a_ssh_refused": "GCP-A SSH refused", + "gcp_a_ollama_11434_refused": "GCP-A Ollama 11434 refused", + "proxy_110_11435_http_502": "110 proxy 11435 returned 502", + "unknown": "{blocker}" + }, + "probes": { + "gcp_a_ping": "GCP-A ping", + "gcp_a_ssh_22": "GCP-A SSH 22", + "gcp_a_direct_11434": "GCP-A 11434", + "gcp_b_direct_11434": "GCP-B 11434", + "proxy_110_11435": "110 proxy 11435", + "proxy_110_11436": "110 proxy 11436", + "proxy_110_11437": "110 proxy 11437", + "unknown": "{probe}" + } + } } }, "incidentEvidence": { diff --git a/apps/web/messages/zh-TW.json b/apps/web/messages/zh-TW.json index 711b95c8..447725ba 100644 --- a/apps/web/messages/zh-TW.json +++ b/apps/web/messages/zh-TW.json @@ -262,6 +262,7 @@ "modelRoute": "模型路由", "routeDetail": "{model};目前 {selected};{primary}={primaryStatus};備援 {fallback}", "routeLaneDetail": "{mode};已跳過 {skipped}", + "routeRepairDetail": "修復證據:{target},阻塞 {blockers},來源 {sourceRefs} 筆", "routeReasonSeparator": ";", "routeReason": "原因:{reason}", "routeErrorDetail": "路由檢查失敗:{error}", @@ -2925,7 +2926,49 @@ "inspect_ai_router": "需檢查 AI Router / provider 狀態", "unknown": "待確認下一步" }, - "degradedSummary": "目前由 {active} 接手;已跳過 {skipped};下一步:{action}" + "degradedSummary": "目前由 {active} 接手;已跳過 {skipped};下一步:{action}", + "repairEvidence": { + "title": "最新修復診斷證據", + "meta": "事件資訊", + "target": "目標:{target}", + "run": "Run:{run}", + "receivedAt": "入庫:{time}", + "sourceRefs": "來源證據 {count} 筆", + "blockerTitle": "目前阻塞", + "sideEffectTitle": "副作用檢查", + "sideEffectSeparator": ":", + "emptyValue": "--", + "values": { + "yes": "有", + "no": "無", + "unknown": "未回報" + }, + "sideEffects": { + "incident_created": "建立 Incident", + "telegram_sent": "送出 Telegram", + "approval_created": "建立簽核", + "runtime_route_changed": "變更 runtime route" + }, + "blockers": { + "gcloud_compute_instances_get_missing": "缺 GCP instance get 權限", + "gcloud_compute_instances_list_missing": "缺 GCP instance list 權限", + "gcloud_projects_get_iam_policy_missing": "缺 GCP IAM 讀取權限", + "gcp_a_ssh_refused": "GCP-A SSH 拒絕連線", + "gcp_a_ollama_11434_refused": "GCP-A Ollama 11434 拒絕", + "proxy_110_11435_http_502": "110 proxy 11435 回 502", + "unknown": "{blocker}" + }, + "probes": { + "gcp_a_ping": "GCP-A ping", + "gcp_a_ssh_22": "GCP-A SSH 22", + "gcp_a_direct_11434": "GCP-A 11434", + "gcp_b_direct_11434": "GCP-B 11434", + "proxy_110_11435": "110 proxy 11435", + "proxy_110_11436": "110 proxy 11436", + "proxy_110_11437": "110 proxy 11437", + "unknown": "{probe}" + } + } } }, "incidentEvidence": { diff --git a/apps/web/src/app/[locale]/awooop/runs/page.tsx b/apps/web/src/app/[locale]/awooop/runs/page.tsx index 937903de..2c20c427 100644 --- a/apps/web/src/app/[locale]/awooop/runs/page.tsx +++ b/apps/web/src/app/[locale]/awooop/runs/page.tsx @@ -445,6 +445,30 @@ interface AiRouteOperatorAction { reason?: string | null; } +interface AiRouteRepairEvidence { + provider?: string | null; + stage?: string | null; + provider_event_id?: string | null; + conversation_event_id?: string | null; + run_id?: string | null; + alertname?: string | null; + severity?: string | null; + fingerprint?: string | null; + target_resource?: string | null; + observed_state?: Record; + live_probe?: Record; + access_blockers?: string[]; + side_effects?: { + incident_created?: boolean | null; + telegram_sent?: boolean | null; + approval_created?: boolean | null; + runtime_route_changed?: boolean | null; + }; + source_ref_count?: number | null; + provider_ts?: string | null; + received_at?: string | null; +} + interface AiRouteStatusResponse { schema_version: string; workload_type: string; @@ -461,6 +485,7 @@ interface AiRouteStatusResponse { active_lane?: AiRouteLaneItem | null; skipped_lanes?: AiRouteLaneItem[]; operator_action?: AiRouteOperatorAction | null; + repair_evidence?: AiRouteRepairEvidence | null; checked_at: string; } @@ -2050,6 +2075,49 @@ function aiRouteOperatorActionLabelKey(action?: string | null) { return "operatorActions.unknown"; } +const AI_ROUTE_REPAIR_BLOCKER_KEYS = new Set([ + "gcloud_compute_instances_get_missing", + "gcloud_compute_instances_list_missing", + "gcloud_projects_get_iam_policy_missing", + "gcp_a_ssh_refused", + "gcp_a_ollama_11434_refused", + "proxy_110_11435_http_502", +]); + +const AI_ROUTE_REPAIR_PROBE_KEYS = new Set([ + "gcp_a_ping", + "gcp_a_ssh_22", + "gcp_a_direct_11434", + "gcp_b_direct_11434", + "proxy_110_11435", + "proxy_110_11436", + "proxy_110_11437", +]); + +const AI_ROUTE_REPAIR_SIDE_EFFECT_KEYS = [ + "incident_created", + "telegram_sent", + "approval_created", + "runtime_route_changed", +] as const; + +function aiRouteRepairBlockerLabelKey(blocker?: string | null) { + return blocker && AI_ROUTE_REPAIR_BLOCKER_KEYS.has(blocker) + ? `repairEvidence.blockers.${blocker}` + : "repairEvidence.blockers.unknown"; +} + +function aiRouteRepairProbeLabelKey(probe?: string | null) { + return probe && AI_ROUTE_REPAIR_PROBE_KEYS.has(probe) + ? `repairEvidence.probes.${probe}` + : "repairEvidence.probes.unknown"; +} + +function compactEvidenceId(value?: string | null) { + if (!value) return "--"; + return value.length > 18 ? `${value.slice(0, 12)}...` : value; +} + function AiRouteStatusPanel({ status, error, @@ -2064,6 +2132,9 @@ function AiRouteStatusPanel({ const laneMode = status?.lane_mode ?? null; const laneModeKey = aiRouteLaneModeLabelKey(laneMode); const operatorActionKey = aiRouteOperatorActionLabelKey(status?.operator_action?.action); + const repairEvidence = status?.repair_evidence ?? null; + const repairBlockers = repairEvidence?.access_blockers?.filter(Boolean).slice(0, 4) ?? []; + const repairProbes = Object.entries(repairEvidence?.live_probe ?? {}).slice(0, 6); const skippedLanes = status?.skipped_lanes ?? []; const skippedProviderSet = new Set( skippedLanes @@ -2076,6 +2147,12 @@ function AiRouteStatusPanel({ minute: "2-digit", }) : "--"; + const repairEvidenceAt = repairEvidence?.received_at + ? new Date(repairEvidence.received_at).toLocaleTimeString("zh-TW", { + hour: "2-digit", + minute: "2-digit", + }) + : "--"; return (
@@ -2132,6 +2209,117 @@ function AiRouteStatusPanel({ )} + {repairEvidence && ( +
+
+
+
+ + {t("repairEvidence.sourceRefs", { + count: repairEvidence.source_ref_count ?? 0, + })} + +
+ +
+
+

+ {t("repairEvidence.meta")} +

+
+

+ {t("repairEvidence.target", { + target: repairEvidence.target_resource ?? "--", + })} +

+

+ {t("repairEvidence.run", { + run: compactEvidenceId(repairEvidence.run_id), + })} +

+

+ {t("repairEvidence.receivedAt", { + time: repairEvidenceAt, + })} +

+
+
+ +
+

+ {t("repairEvidence.blockerTitle")} +

+
+ {repairBlockers.length > 0 ? repairBlockers.map((blocker) => ( + + {AI_ROUTE_REPAIR_BLOCKER_KEYS.has(blocker) + ? t(aiRouteRepairBlockerLabelKey(blocker) as never) + : t("repairEvidence.blockers.unknown", { blocker })} + + )) : ( + + {t("repairEvidence.emptyValue")} + + )} +
+
+ +
+

+ {t("repairEvidence.sideEffectTitle")} +

+
+ {AI_ROUTE_REPAIR_SIDE_EFFECT_KEYS.map((key) => { + const value = repairEvidence.side_effects?.[key]; + const valueKey = value === true + ? "repairEvidence.values.yes" + : value === false + ? "repairEvidence.values.no" + : "repairEvidence.values.unknown"; + return ( +

+ {t(`repairEvidence.sideEffects.${key}` as never)} + {t("repairEvidence.sideEffectSeparator")} + {t(valueKey as never)} +

+ ); + })} +
+
+
+ + {repairProbes.length > 0 && ( +
+ {repairProbes.map(([probe, value]) => ( +
+

+ {AI_ROUTE_REPAIR_PROBE_KEYS.has(probe) + ? t(aiRouteRepairProbeLabelKey(probe) as never) + : t("repairEvidence.probes.unknown", { probe })} +

+

+ {String(value ?? "--")} +

+
+ ))} +
+ )} +
+ )} +

{t("fields.workload")}

diff --git a/apps/web/src/components/dashboard/automation-evidence-card.tsx b/apps/web/src/components/dashboard/automation-evidence-card.tsx index bd8b54e2..73903593 100644 --- a/apps/web/src/components/dashboard/automation-evidence-card.tsx +++ b/apps/web/src/components/dashboard/automation-evidence-card.tsx @@ -126,6 +126,11 @@ interface AiRouteStatusResponse { action?: string | null human_required?: boolean } | null + repair_evidence?: { + target_resource?: string | null + access_blockers?: string[] + source_ref_count?: number | null + } | null } interface EvidenceSnapshot { @@ -394,8 +399,17 @@ export function AutomationEvidenceCard() { skipped: skippedLanes || '--', }) : null + const repairEvidence = route?.repair_evidence ?? null + const repairBlockers = repairEvidence?.access_blockers?.slice(0, 2).join(', ') + const repairDetail = repairEvidence + ? t('routeRepairDetail', { + target: repairEvidence.target_resource ?? '--', + blockers: repairBlockers || '--', + sourceRefs: repairEvidence.source_ref_count ?? 0, + }) + : null const routeDetail = route?.route_reason && !route.route_error - ? `${routeSummary}${laneDetail ? t('routeReasonSeparator') + laneDetail : ''}${t('routeReasonSeparator')}${t('routeReason', { reason: route.route_reason })}` + ? `${routeSummary}${laneDetail ? t('routeReasonSeparator') + laneDetail : ''}${repairDetail ? t('routeReasonSeparator') + repairDetail : ''}${t('routeReasonSeparator')}${t('routeReason', { reason: route.route_reason })}` : routeSummary return { diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index d8a2bfcd..5d69e775 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,46 @@ +## 2026-05-25|T177 AI route repair evidence API / 前端投影 + +**背景**: + +- T176 已把 GCP-A primary lane down 的 live probe 與阻塞原因寫入 `awooop_conversation_event`,但 operator 在 AwoooP Runs / Dashboard 仍只能看到 `lane_mode=degraded_failover` 與 `repair_skipped_primary_lane`,無法直接看出「診斷證據在哪、阻塞是什麼、是否有副作用」。 +- 本輪目標不是改 AI Provider 路由,也不是自動重啟 GCP-A;只把既有 AwoooP DB 證據做白名單投影,讓前端可以解釋 GCP-A 為何被跳過。 + +**本次修復**: + +- `/api/v1/platform/ai-route-status` 新增 `repair_evidence` 欄位;當 `lane_mode` 為 `degraded_failover` / `cloud_fallback` / `unavailable` 時,從最新 `ai_route_repair / repair_diagnosis` source envelope 取回: + - `provider_event_id`、`conversation_event_id`、`run_id` + - `target_resource`、`severity`、`fingerprint` + - `live_probe`、`access_blockers` + - `side_effects`(incident / Telegram / approval / runtime route 是否被建立或變更) + - `source_ref_count` +- AwoooP Runs 的 AI Provider panel 新增「最新修復診斷證據」區塊,顯示阻塞點、探測結果、來源證據數與副作用檢查。 +- Dashboard Automation Evidence card 也會把 route degraded 摘要補上 repair evidence target / blockers / source refs。 +- 前端文案已補 `zh-TW` / `en` i18n;UI 沒有新增 emoji,使用既有 Lucide icon。 + +**本地驗證**: + +```text +python3 -m py_compile apps/api/src/services/platform_operator_service.py apps/api/src/api/v1/platform/operator_runs.py -> pass +jq empty apps/web/messages/zh-TW.json apps/web/messages/en.json -> pass +git diff --check -> pass +pytest targeted ai-route status/evidence tests -> 6 passed +pnpm --dir apps/web exec tsc --noEmit --tsBuildInfoFile /tmp/awoooi-t177-tsconfig.tsbuildinfo -> pass +``` + +**目前整體進度**: + +- AwoooP 告警可觀測鏈:約 99.3%。 +- 低風險自動修復閉環:約 95.7%。 +- 前端 AI 自動化管理介面同步:約 97.3%。 +- Telegram 詳情 / 歷史可解釋性:約 95.5%。 +- Callback evidence / DB replayability:約 96.0%。 +- MCP / 自建 MCP 可見性:約 88%。 +- Sentry / SigNoz source correlation visibility:約 88%。 +- Ansible / PlayBook decision visibility:約 84.8%。 +- KM owner-review / completion governance:約 84%。 +- AI Provider lane 健康與可見性:約 91%(GCP-A runtime 尚未修復;但 repair diagnosis 已能在 API / 前端呈現)。 +- 完整 AI 自動化管理產品化:約 95.0%。 + ## 2026-05-25|T176 GCP-A primary lane repair evidence 入庫 **背景**: