feat(awooop): surface ai route repair evidence
All checks were successful
CD Pipeline / tests (push) Successful in 1m24s
Code Review / ai-code-review (push) Successful in 12s
CD Pipeline / build-and-deploy (push) Successful in 3m53s
CD Pipeline / post-deploy-checks (push) Successful in 1m48s

This commit is contained in:
Your Name
2026-05-25 14:21:25 +08:00
parent e570d9f6a9
commit 67296746c0
8 changed files with 642 additions and 7 deletions

View File

@@ -151,6 +151,7 @@ class AiRouteStatusResponse(BaseModel):
active_lane: dict[str, Any] | None = None
skipped_lanes: list[dict[str, Any]] = Field(default_factory=list)
operator_action: dict[str, Any] | None = None
repair_evidence: dict[str, Any] | None = None
checked_at: datetime

View File

@@ -99,6 +99,8 @@ _CICD_STATUS_FILTERS = {"running", "success", "failed", "pending"}
_CICD_STAGE_RE = re.compile(r"^[a-z0-9_:-]{1,64}$", re.IGNORECASE)
_AI_ROUTE_STATUS_SCHEMA_VERSION = "awooop_ai_route_status_v1"
_AI_ROUTE_WORKLOADS = set(get_args(OllamaWorkloadType))
_AI_ROUTE_REPAIR_EVIDENCE_PROVIDER = "ai_route_repair"
_AI_ROUTE_REPAIR_EVIDENCE_STAGE = "repair_diagnosis"
_SOURCE_CORRELATION_SCHEMA_VERSION = "source_provider_correlation_v1"
_SOURCE_CORRELATION_PROVIDERS = ("sentry", "signoz")
_SOURCE_CORRELATION_EVENT_LIMIT = 200
@@ -642,7 +644,7 @@ async def get_ai_route_status(
selected_provider=route.primary.provider_name,
health=health,
))
return response
return await _ai_route_response_with_repair_evidence(response)
def _validate_ai_route_workload(workload_type: str | None) -> OllamaWorkloadType:
@@ -696,7 +698,7 @@ async def _ai_route_lightweight_status_from_policy(
route_reason=route_reason,
error=str(exc),
)
return _ai_route_unavailable_status(
response = _ai_route_unavailable_status(
workload=workload,
policy_order=policy_order,
checked_at=checked_at,
@@ -704,6 +706,7 @@ async def _ai_route_lightweight_status_from_policy(
route_error=route_error,
route_source="ollama_failover_manager",
)
return await _ai_route_response_with_repair_evidence(response)
health_by_provider = {
endpoint.provider_name: _ai_route_health_item(report)
@@ -741,7 +744,7 @@ async def _ai_route_lightweight_status_from_policy(
selected_provider="gemini",
health=health_by_provider,
))
return response
return await _ai_route_response_with_repair_evidence(response)
selected = endpoints[selected_index]
model = get_settings().OLLAMA_HEALTH_CHECK_MODEL
@@ -783,7 +786,7 @@ async def _ai_route_lightweight_status_from_policy(
selected_provider=selected.provider_name,
health=health_by_provider,
))
return response
return await _ai_route_response_with_repair_evidence(response)
async def _ai_route_probe_connectivity(
@@ -873,6 +876,208 @@ def _ai_route_unavailable_status(
return response
async def _ai_route_response_with_repair_evidence(
    response: dict[str, Any],
) -> dict[str, Any]:
    """Attach the newest read-only repair diagnosis to a degraded route status.

    ``response`` is mutated in place and returned. Its ``repair_evidence`` key
    is always set: it stays ``None`` unless ``lane_mode`` is one of
    ``degraded_failover``, ``cloud_fallback`` or ``unavailable``, in which case
    it receives whatever the evidence lookup returns (possibly ``None``).
    """
    degraded_modes = {"degraded_failover", "cloud_fallback", "unavailable"}
    response["repair_evidence"] = None
    if response.get("lane_mode") in degraded_modes:
        response["repair_evidence"] = await _latest_ai_route_repair_evidence(
            target_provider=_ai_route_repair_evidence_target(response),
        )
    return response
def _ai_route_repair_evidence_target(response: Mapping[str, Any]) -> str | None:
skipped_lanes = response.get("skipped_lanes")
if isinstance(skipped_lanes, list):
for lane in skipped_lanes:
if not isinstance(lane, dict):
continue
provider_name = str(lane.get("provider_name") or "").strip()
if provider_name and lane.get("action_required") is True:
return provider_name
for lane in skipped_lanes:
if isinstance(lane, dict):
provider_name = str(lane.get("provider_name") or "").strip()
if provider_name:
return provider_name
policy_order = response.get("policy_order")
if isinstance(policy_order, list):
for item in policy_order:
if not isinstance(item, dict):
continue
if item.get("runtime") == "ollama":
provider_name = str(item.get("provider_name") or "").strip()
if provider_name:
return provider_name
return None
async def _latest_ai_route_repair_evidence(
*,
project_id: str = "awoooi",
target_provider: str | None = None,
) -> dict[str, Any] | None:
"""Fetch the newest AI route repair diagnosis stored in AwoooP event DB.

Looks up the latest ``ai_route_repair`` / ``repair_diagnosis`` event for
``project_id``. When ``target_provider`` is given, the lookup is first
restricted to events whose target resource equals it; if that finds
nothing, the lookup is repeated without the restriction.

Returns the projected evidence item, or ``None`` when no event exists or
the lookup raised (failures are logged as a warning, never propagated).
"""
# Values travel as bind parameters; the optional SQL fragment below is a
# fixed literal that only references the :target_provider bind.
params: dict[str, Any] = {
"project_id": project_id,
"provider": _AI_ROUTE_REPAIR_EVIDENCE_PROVIDER,
"stage": _AI_ROUTE_REPAIR_EVIDENCE_STAGE,
}
target_clause = ""
if target_provider:
# The target resource is read from log_correlation first, then from the
# payload; empty strings are treated as missing.
target_clause = """
AND COALESCE(
NULLIF(source_envelope #>> '{log_correlation,target_resource}', ''),
NULLIF(source_envelope #>> '{extra,payload,target_resource}', '')
) = :target_provider
"""
params["target_provider"] = target_provider
try:
item = await _fetch_latest_ai_route_repair_evidence(
params=params,
target_clause=target_clause,
)
# Nothing recorded for this provider: fall back to the newest diagnosis
# for any target, dropping the bind the unfiltered query does not use.
if item is None and target_provider:
params.pop("target_provider", None)
item = await _fetch_latest_ai_route_repair_evidence(
params=params,
target_clause="",
)
return item
# Swallow lookup failures: log them and report "no evidence" instead of
# raising to the caller.
except Exception as exc:
logger.warning(
"ai_route_repair_evidence_fetch_failed",
project_id=project_id,
target_provider=target_provider,
error=str(exc),
)
return None
async def _fetch_latest_ai_route_repair_evidence(
*,
params: dict[str, Any],
target_clause: str,
) -> dict[str, Any] | None:
"""Run the single-row evidence query and project the row, if any.

``params`` supplies the ``:project_id``, ``:provider`` and ``:stage`` binds,
plus any bind referenced by ``target_clause``. ``target_clause`` is spliced
into the SQL text verbatim, so it must be a trusted, code-defined fragment
and never caller-supplied text.

Returns the projected item for the newest matching row (ordered by
``received_at`` then ``event_id``, both descending), or ``None`` when no
row matches.
"""
# The row's provider is taken from the envelope, else from the part of
# provider_event_id before the first ':', else from channel_type; a missing
# stage is treated as 'received'. Both column values are lower-cased before
# being compared with the binds.
sql = text(f"""
SELECT
event_id,
run_id,
provider_event_id,
source_envelope,
provider_ts,
received_at
FROM awooop_conversation_event
WHERE project_id = :project_id
AND LOWER(COALESCE(
NULLIF(source_envelope->>'provider', ''),
NULLIF(split_part(provider_event_id, ':', 1), ''),
channel_type
)) = :provider
AND LOWER(COALESCE(NULLIF(source_envelope->>'stage', ''), 'received')) = :stage
{target_clause}
ORDER BY received_at DESC, event_id DESC
LIMIT 1
""")
async with get_db_context("awoooi") as db:
result = await db.execute(sql, params)
row = result.mappings().first()
return _ai_route_repair_evidence_item(row) if row else None
def _ai_route_repair_evidence_item(
    row: Mapping[str, Any],
) -> dict[str, Any]:
    """Project a route-repair event row into a compact, operator-safe view.

    Only a fixed set of fields is copied out of the row's ``source_envelope``
    (its ``log_correlation`` section and ``extra.payload``); anything else in
    the envelope is left behind. Missing sections are treated as empty.
    """
    envelope = _as_dict(row.get("source_envelope"))
    payload = _as_dict(_as_dict(envelope.get("extra")).get("payload"))
    correlation = _as_dict(envelope.get("log_correlation"))
    observed = _as_dict(payload.get("observed_state"))
    provider_event_id = row.get("provider_event_id")

    # Prefer the payload's own schema tag, then the envelope's, then ours.
    schema_version = (
        payload.get("schema_version")
        or envelope.get("schema_version")
        or "ai_route_repair_evidence_projection_v1"
    )
    # Without an explicit provider, use the event-id prefix before ':'.
    provider = (
        envelope.get("provider")
        or str(provider_event_id or "").split(":", 1)[0]
    )
    target_resource = (
        correlation.get("target_resource")
        or payload.get("target_resource")
        or observed.get("target_resource")
    )

    item: dict[str, Any] = {}
    item["schema_version"] = schema_version
    item["provider"] = provider
    item["stage"] = envelope.get("stage") or _AI_ROUTE_REPAIR_EVIDENCE_STAGE
    item["provider_event_id"] = provider_event_id
    item["conversation_event_id"] = _string_or_none(row.get("event_id"))
    item["run_id"] = _string_or_none(row.get("run_id"))
    for field in ("alertname", "severity", "fingerprint"):
        item[field] = correlation.get(field)
    item["target_resource"] = target_resource
    item["observed_state"] = observed
    item["live_probe"] = _as_dict(payload.get("live_probe"))
    item["access_blockers"] = _as_string_list(payload.get("access_blockers"))
    item["side_effects"] = _ai_route_repair_side_effects(
        payload.get("side_effects")
    )
    item["source_ref_count"] = _source_ref_count(envelope)
    item["provider_ts"] = row.get("provider_ts")
    item["received_at"] = row.get("received_at")
    return item
def _ai_route_repair_side_effects(value: Any) -> dict[str, bool | None]:
    """Reduce a side-effects payload to its four tracked boolean flags.

    Each flag is kept only when it is a real ``bool``; anything else,
    including a missing key, is reported as ``None``.
    """
    reported = _as_dict(value)
    tracked = (
        "incident_created",
        "telegram_sent",
        "approval_created",
        "runtime_route_changed",
    )
    return {flag: _bool_or_none(reported.get(flag)) for flag in tracked}
def _as_string_list(value: Any) -> list[str]:
if isinstance(value, list):
return [str(item) for item in value if str(item or "").strip()]
if value not in (None, ""):
return [str(value)]
return []
def _string_or_none(value: Any) -> str | None:
if value in (None, ""):
return None
return str(value)
def _bool_or_none(value: Any) -> bool | None:
return value if isinstance(value, bool) else None
def _source_ref_count(envelope: Any) -> int:
    """Count the non-blank references under the envelope's ``source_refs``.

    A list value contributes one per truthy, non-whitespace item; any other
    value contributes one unless it is ``None`` or ``""``.
    """
    refs = _as_dict(_as_dict(envelope).get("source_refs"))

    def weight(entry: Any) -> int:
        if isinstance(entry, list):
            return sum(1 for ref in entry if str(ref or "").strip())
        return 0 if entry in (None, "") else 1

    return sum(weight(entry) for entry in refs.values())
def _ai_route_lane_state(
*,
policy_order: list[dict[str, Any]],

View File

@@ -21,6 +21,7 @@ from src.services.platform_operator_service import (
_ai_route_health_map,
_ai_route_lane_state,
_ai_route_policy_order,
_ai_route_repair_evidence_item,
_build_awooop_status_chain,
_callback_reply_event_item,
_callback_reply_summary_matches_status,
@@ -1562,6 +1563,12 @@ def test_ai_route_status_response_preserves_route_fields() -> None:
"action": "monitor",
"reason": "primary_lane_active",
},
"repair_evidence": {
"provider": "ai_route_repair",
"stage": "repair_diagnosis",
"target_resource": "ollama_gcp_a",
"access_blockers": ["gcloud_compute_instances_get_missing"],
},
"checked_at": datetime(2026, 5, 19, 12, 0, 0),
})
@@ -1569,6 +1576,79 @@ def test_ai_route_status_response_preserves_route_fields() -> None:
assert dumped["policy_order"][-1]["provider_name"] == "gemini"
assert dumped["selected_provider"] == "ollama_gcp_a"
assert dumped["lane_mode"] == "primary"
assert dumped["repair_evidence"]["target_resource"] == "ollama_gcp_a"
def test_ai_route_repair_evidence_item_summarizes_operator_safe_fields() -> None:
    """The projection keeps only the whitelisted fields of a stored diagnosis."""
    run_id = "ca67ebcc-a24f-53e7-9505-2db15d855ecc"
    blockers = [
        "gcloud_compute_instances_get_missing",
        "gcp_a_ollama_11434_refused",
    ]
    no_side_effects = {
        "incident_created": False,
        "telegram_sent": False,
        "approval_created": False,
        "runtime_route_changed": False,
    }
    payload = {
        "schema_version": "ai_route_repair_diagnosis_v1",
        "observed_state": {
            "target_resource": "ollama_gcp_a",
            "lane_mode": "degraded_failover",
        },
        "live_probe": {
            "gcp_a_direct_11434": "connection_refused",
            "gcp_b_direct_11434": "http_200",
        },
        # Copies keep the expected values independent of the input objects.
        "access_blockers": list(blockers),
        "side_effects": dict(no_side_effects),
    }
    envelope = {
        "provider": "ai_route_repair",
        "stage": "repair_diagnosis",
        "log_correlation": {
            "alertname": "GcpAPrimaryLaneDown",
            "severity": "warning",
            "target_resource": "ollama_gcp_a",
            "fingerprint": "ai-route-gcp-a-primary-down",
        },
        "source_refs": {
            "alert_ids": ["gcp-a-primary-lane-down"],
            "signoz_alerts": ["signoz:gcp-a"],
            "sentry_issue_ids": [],
            "fingerprints": ["ai-route-gcp-a-primary-down"],
            "run_ids": [run_id],
        },
        "extra": {"payload": payload},
    }
    row = {
        "event_id": UUID("dff309f0-f159-4537-8f58-47714ce94dca"),
        "run_id": UUID(run_id),
        "provider_event_id": (
            "ai_route_repair:repair_diagnosis:"
            "gcp-a-primary-lane-down-20260525T060415Z"
        ),
        "provider_ts": datetime(2026, 5, 25, 6, 4, 15),
        "received_at": datetime(2026, 5, 25, 6, 5, 3),
        "source_envelope": envelope,
    }

    item = _ai_route_repair_evidence_item(row)

    assert item["provider"] == "ai_route_repair"
    assert item["stage"] == "repair_diagnosis"
    assert item["target_resource"] == "ollama_gcp_a"
    assert item["run_id"] == run_id
    # Four non-empty reference lists with one entry each; the empty one adds 0.
    assert item["source_ref_count"] == 4
    assert item["access_blockers"] == blockers
    assert item["live_probe"]["gcp_a_direct_11434"] == "connection_refused"
    assert item["side_effects"] == no_side_effects
def test_ai_route_lane_state_marks_degraded_failover() -> None:
@@ -1669,6 +1749,15 @@ async def test_ai_route_status_times_out_before_slow_provider_checks(monkeypatch
fake_connectivity,
)
async def no_repair_evidence(**_kwargs):
return None
monkeypatch.setattr(
platform_operator_service,
"_latest_ai_route_repair_evidence",
no_repair_evidence,
)
response = await platform_operator_service.get_ai_route_status("deep_rca")
assert response["route_reason"] == (
@@ -1722,6 +1811,15 @@ async def test_ai_route_status_lightweight_fallback_keeps_gemini_policy_only(
fake_offline_connectivity,
)
async def no_repair_evidence(**_kwargs):
return None
monkeypatch.setattr(
platform_operator_service,
"_latest_ai_route_repair_evidence",
no_repair_evidence,
)
response = await platform_operator_service.get_ai_route_status("deep_rca")
assert response["selected_provider"] == "gemini"

View File

@@ -261,6 +261,7 @@
"modelRoute": "Model route",
"routeDetail": "{model}; current {selected}; {primary}={primaryStatus}; fallback {fallback}",
"routeLaneDetail": "{mode}; skipped {skipped}",
"routeRepairDetail": "Repair evidence: {target}, blockers {blockers}, {sourceRefs} source refs",
"routeReasonSeparator": "; ",
"routeReason": "Reason: {reason}",
"routeErrorDetail": "Route check failed: {error}",
@@ -2924,7 +2925,49 @@
"inspect_ai_router": "Inspect AI Router / provider status",
"unknown": "Confirm next action"
},
"degradedSummary": "Current handoff is {active}; skipped {skipped}; next action: {action}"
"degradedSummary": "Current handoff is {active}; skipped {skipped}; next action: {action}",
"repairEvidence": {
"title": "Latest repair diagnosis evidence",
"meta": "Event info",
"target": "Target: {target}",
"run": "Run: {run}",
"receivedAt": "Stored: {time}",
"sourceRefs": "{count} source refs",
"blockerTitle": "Current blockers",
"sideEffectTitle": "Side-effect check",
"sideEffectSeparator": ": ",
"emptyValue": "--",
"values": {
"yes": "yes",
"no": "no",
"unknown": "not reported"
},
"sideEffects": {
"incident_created": "Incident created",
"telegram_sent": "Telegram sent",
"approval_created": "Approval created",
"runtime_route_changed": "Runtime route changed"
},
"blockers": {
"gcloud_compute_instances_get_missing": "Missing GCP instance get permission",
"gcloud_compute_instances_list_missing": "Missing GCP instance list permission",
"gcloud_projects_get_iam_policy_missing": "Missing GCP IAM read permission",
"gcp_a_ssh_refused": "GCP-A SSH refused",
"gcp_a_ollama_11434_refused": "GCP-A Ollama 11434 refused",
"proxy_110_11435_http_502": "110 proxy 11435 returned 502",
"unknown": "{blocker}"
},
"probes": {
"gcp_a_ping": "GCP-A ping",
"gcp_a_ssh_22": "GCP-A SSH 22",
"gcp_a_direct_11434": "GCP-A 11434",
"gcp_b_direct_11434": "GCP-B 11434",
"proxy_110_11435": "110 proxy 11435",
"proxy_110_11436": "110 proxy 11436",
"proxy_110_11437": "110 proxy 11437",
"unknown": "{probe}"
}
}
}
},
"incidentEvidence": {

View File

@@ -262,6 +262,7 @@
"modelRoute": "模型路由",
"routeDetail": "{model};目前 {selected};{primary}={primaryStatus};備援 {fallback}",
"routeLaneDetail": "{mode};已跳過 {skipped}",
"routeRepairDetail": "修復證據:{target},阻塞 {blockers},來源 {sourceRefs} 筆",
"routeReasonSeparator": ";",
"routeReason": "原因:{reason}",
"routeErrorDetail": "路由檢查失敗:{error}",
@@ -2925,7 +2926,49 @@
"inspect_ai_router": "需檢查 AI Router / provider 狀態",
"unknown": "待確認下一步"
},
"degradedSummary": "目前由 {active} 接手;已跳過 {skipped};下一步:{action}"
"degradedSummary": "目前由 {active} 接手;已跳過 {skipped};下一步:{action}",
"repairEvidence": {
"title": "最新修復診斷證據",
"meta": "事件資訊",
"target": "目標:{target}",
"run": "Run:{run}",
"receivedAt": "入庫:{time}",
"sourceRefs": "來源證據 {count} 筆",
"blockerTitle": "目前阻塞",
"sideEffectTitle": "副作用檢查",
"sideEffectSeparator": ":",
"emptyValue": "--",
"values": {
"yes": "有",
"no": "無",
"unknown": "未回報"
},
"sideEffects": {
"incident_created": "建立 Incident",
"telegram_sent": "送出 Telegram",
"approval_created": "建立簽核",
"runtime_route_changed": "變更 runtime route"
},
"blockers": {
"gcloud_compute_instances_get_missing": "缺 GCP instance get 權限",
"gcloud_compute_instances_list_missing": "缺 GCP instance list 權限",
"gcloud_projects_get_iam_policy_missing": "缺 GCP IAM 讀取權限",
"gcp_a_ssh_refused": "GCP-A SSH 拒絕連線",
"gcp_a_ollama_11434_refused": "GCP-A Ollama 11434 拒絕",
"proxy_110_11435_http_502": "110 proxy 11435 回 502",
"unknown": "{blocker}"
},
"probes": {
"gcp_a_ping": "GCP-A ping",
"gcp_a_ssh_22": "GCP-A SSH 22",
"gcp_a_direct_11434": "GCP-A 11434",
"gcp_b_direct_11434": "GCP-B 11434",
"proxy_110_11435": "110 proxy 11435",
"proxy_110_11436": "110 proxy 11436",
"proxy_110_11437": "110 proxy 11437",
"unknown": "{probe}"
}
}
}
},
"incidentEvidence": {

View File

@@ -445,6 +445,30 @@ interface AiRouteOperatorAction {
reason?: string | null;
}
/**
 * Latest AI route repair diagnosis as carried in the `repair_evidence` field
 * of the AI route status response. Every field is optional, and most may be
 * null, so consumers must supply their own fallbacks.
 */
interface AiRouteRepairEvidence {
provider?: string | null;
stage?: string | null;
// Identifiers of the stored event the evidence was read from.
provider_event_id?: string | null;
conversation_event_id?: string | null;
run_id?: string | null;
alertname?: string | null;
severity?: string | null;
fingerprint?: string | null;
target_resource?: string | null;
observed_state?: Record<string, unknown>;
// Probe name -> observed result; values are untyped and rendered via String().
live_probe?: Record<string, unknown>;
access_blockers?: string[];
// Each flag is true/false when reported; null or absent is shown as "not reported".
side_effects?: {
incident_created?: boolean | null;
telegram_sent?: boolean | null;
approval_created?: boolean | null;
runtime_route_changed?: boolean | null;
};
source_ref_count?: number | null;
provider_ts?: string | null;
received_at?: string | null;
}
interface AiRouteStatusResponse {
schema_version: string;
workload_type: string;
@@ -461,6 +485,7 @@ interface AiRouteStatusResponse {
active_lane?: AiRouteLaneItem | null;
skipped_lanes?: AiRouteLaneItem[];
operator_action?: AiRouteOperatorAction | null;
repair_evidence?: AiRouteRepairEvidence | null;
checked_at: string;
}
@@ -2050,6 +2075,49 @@ function aiRouteOperatorActionLabelKey(action?: string | null) {
return "operatorActions.unknown";
}
// Blocker codes that have a dedicated label under `repairEvidence.blockers.*`;
// any other code is displayed through the `unknown` passthrough entry.
const AI_ROUTE_REPAIR_BLOCKER_KEYS = new Set([
"gcloud_compute_instances_get_missing",
"gcloud_compute_instances_list_missing",
"gcloud_projects_get_iam_policy_missing",
"gcp_a_ssh_refused",
"gcp_a_ollama_11434_refused",
"proxy_110_11435_http_502",
]);
// Probe names that have a dedicated label under `repairEvidence.probes.*`;
// any other name is displayed through the `unknown` passthrough entry.
const AI_ROUTE_REPAIR_PROBE_KEYS = new Set([
"gcp_a_ping",
"gcp_a_ssh_22",
"gcp_a_direct_11434",
"gcp_b_direct_11434",
"proxy_110_11435",
"proxy_110_11436",
"proxy_110_11437",
]);
// Side-effect flags in display order; each has a label under
// `repairEvidence.sideEffects.*`.
const AI_ROUTE_REPAIR_SIDE_EFFECT_KEYS = [
"incident_created",
"telegram_sent",
"approval_created",
"runtime_route_changed",
] as const;
/** Map a blocker code to its message key, or to the `unknown` passthrough key. */
function aiRouteRepairBlockerLabelKey(blocker?: string | null) {
  if (blocker && AI_ROUTE_REPAIR_BLOCKER_KEYS.has(blocker)) {
    return `repairEvidence.blockers.${blocker}`;
  }
  return "repairEvidence.blockers.unknown";
}
/** Map a probe name to its message key, or to the `unknown` passthrough key. */
function aiRouteRepairProbeLabelKey(probe?: string | null) {
  if (probe && AI_ROUTE_REPAIR_PROBE_KEYS.has(probe)) {
    return `repairEvidence.probes.${probe}`;
  }
  return "repairEvidence.probes.unknown";
}
/**
 * Shorten an identifier for display: ids longer than 18 characters are cut to
 * their first 12 characters plus "...", and a missing id becomes "--".
 */
function compactEvidenceId(value?: string | null) {
  if (!value) {
    return "--";
  }
  if (value.length <= 18) {
    return value;
  }
  return `${value.slice(0, 12)}...`;
}
function AiRouteStatusPanel({
status,
error,
@@ -2064,6 +2132,9 @@ function AiRouteStatusPanel({
const laneMode = status?.lane_mode ?? null;
const laneModeKey = aiRouteLaneModeLabelKey(laneMode);
const operatorActionKey = aiRouteOperatorActionLabelKey(status?.operator_action?.action);
const repairEvidence = status?.repair_evidence ?? null;
const repairBlockers = repairEvidence?.access_blockers?.filter(Boolean).slice(0, 4) ?? [];
const repairProbes = Object.entries(repairEvidence?.live_probe ?? {}).slice(0, 6);
const skippedLanes = status?.skipped_lanes ?? [];
const skippedProviderSet = new Set(
skippedLanes
@@ -2076,6 +2147,12 @@ function AiRouteStatusPanel({
minute: "2-digit",
})
: "--";
const repairEvidenceAt = repairEvidence?.received_at
? new Date(repairEvidence.received_at).toLocaleTimeString("zh-TW", {
hour: "2-digit",
minute: "2-digit",
})
: "--";
return (
<section className="border border-[#e0ddd4] bg-white">
@@ -2132,6 +2209,117 @@ function AiRouteStatusPanel({
</div>
)}
{repairEvidence && (
<div className="border-b border-[#e0ddd4] bg-[#fbfcfb] px-4 py-4">
<div className="flex flex-wrap items-start justify-between gap-3">
<div className="flex min-w-0 items-center gap-2">
<SearchCheck className="h-4 w-4 shrink-0 text-[#17602a]" aria-hidden="true" />
<div className="min-w-0">
<p className="text-sm font-semibold text-[#141413]">
{t("repairEvidence.title")}
</p>
<p className="mt-1 truncate font-mono text-xs text-[#77736a]">
{repairEvidence.provider_event_id ?? "--"}
</p>
</div>
</div>
<span className="border border-[#cbdccf] bg-[#f0faf2] px-2 py-0.5 text-xs font-semibold text-[#17602a]">
{t("repairEvidence.sourceRefs", {
count: repairEvidence.source_ref_count ?? 0,
})}
</span>
</div>
<div className="mt-4 grid gap-3 lg:grid-cols-3">
<div className="border border-[#e0ddd4] bg-white px-3 py-3">
<p className="text-xs font-semibold text-[#77736a]">
{t("repairEvidence.meta")}
</p>
<div className="mt-2 space-y-1 text-xs leading-5 text-[#5f5b52]">
<p>
{t("repairEvidence.target", {
target: repairEvidence.target_resource ?? "--",
})}
</p>
<p>
{t("repairEvidence.run", {
run: compactEvidenceId(repairEvidence.run_id),
})}
</p>
<p>
{t("repairEvidence.receivedAt", {
time: repairEvidenceAt,
})}
</p>
</div>
</div>
<div className="border border-[#e0ddd4] bg-white px-3 py-3">
<p className="text-xs font-semibold text-[#77736a]">
{t("repairEvidence.blockerTitle")}
</p>
<div className="mt-2 flex flex-wrap gap-1.5">
{repairBlockers.length > 0 ? repairBlockers.map((blocker) => (
<span
key={blocker}
className="border border-[#d9b36f] bg-[#fff7e8] px-2 py-1 text-xs font-medium text-[#6d4707]"
title={blocker}
>
{AI_ROUTE_REPAIR_BLOCKER_KEYS.has(blocker)
? t(aiRouteRepairBlockerLabelKey(blocker) as never)
: t("repairEvidence.blockers.unknown", { blocker })}
</span>
)) : (
<span className="text-xs text-[#77736a]">
{t("repairEvidence.emptyValue")}
</span>
)}
</div>
</div>
<div className="border border-[#e0ddd4] bg-white px-3 py-3">
<p className="text-xs font-semibold text-[#77736a]">
{t("repairEvidence.sideEffectTitle")}
</p>
<div className="mt-2 space-y-1 text-xs leading-5 text-[#5f5b52]">
{AI_ROUTE_REPAIR_SIDE_EFFECT_KEYS.map((key) => {
const value = repairEvidence.side_effects?.[key];
const valueKey = value === true
? "repairEvidence.values.yes"
: value === false
? "repairEvidence.values.no"
: "repairEvidence.values.unknown";
return (
<p key={key}>
{t(`repairEvidence.sideEffects.${key}` as never)}
{t("repairEvidence.sideEffectSeparator")}
{t(valueKey as never)}
</p>
);
})}
</div>
</div>
</div>
{repairProbes.length > 0 && (
<div className="mt-3 grid gap-px bg-[#e0ddd4] md:grid-cols-2 xl:grid-cols-3">
{repairProbes.map(([probe, value]) => (
<div key={probe} className="bg-white px-3 py-2 text-xs leading-5">
<p className="font-semibold text-[#141413]">
{AI_ROUTE_REPAIR_PROBE_KEYS.has(probe)
? t(aiRouteRepairProbeLabelKey(probe) as never)
: t("repairEvidence.probes.unknown", { probe })}
</p>
<p className="mt-1 truncate font-mono text-[#77736a]" title={String(value ?? "--")}>
{String(value ?? "--")}
</p>
</div>
))}
</div>
)}
</div>
)}
<div className="grid gap-px bg-[#e0ddd4] md:grid-cols-4">
<div className="bg-white px-4 py-3">
<p className="text-xs font-semibold text-[#77736a]">{t("fields.workload")}</p>

View File

@@ -126,6 +126,11 @@ interface AiRouteStatusResponse {
action?: string | null
human_required?: boolean
} | null
repair_evidence?: {
target_resource?: string | null
access_blockers?: string[]
source_ref_count?: number | null
} | null
}
interface EvidenceSnapshot {
@@ -394,8 +399,17 @@ export function AutomationEvidenceCard() {
skipped: skippedLanes || '--',
})
: null
const repairEvidence = route?.repair_evidence ?? null
const repairBlockers = repairEvidence?.access_blockers?.slice(0, 2).join(', ')
const repairDetail = repairEvidence
? t('routeRepairDetail', {
target: repairEvidence.target_resource ?? '--',
blockers: repairBlockers || '--',
sourceRefs: repairEvidence.source_ref_count ?? 0,
})
: null
const routeDetail = route?.route_reason && !route.route_error
? `${routeSummary}${laneDetail ? t('routeReasonSeparator') + laneDetail : ''}${t('routeReasonSeparator')}${t('routeReason', { reason: route.route_reason })}`
? `${routeSummary}${laneDetail ? t('routeReasonSeparator') + laneDetail : ''}${repairDetail ? t('routeReasonSeparator') + repairDetail : ''}${t('routeReasonSeparator')}${t('routeReason', { reason: route.route_reason })}`
: routeSummary
return {

View File

@@ -1,3 +1,46 @@
## 2026-05-25T177 AI route repair evidence API / 前端投影
**背景**
- T176 已把 GCP-A primary lane down 的 live probe 與阻塞原因寫入 `awooop_conversation_event`,但 operator 在 AwoooP Runs / Dashboard 仍只能看到 `lane_mode=degraded_failover`、`repair_skipped_primary_lane`,無法直接看出「診斷證據在哪、阻塞是什麼、是否有副作用」。
- 本輪目標不是改 AI Provider 路由,也不是自動重啟 GCP-A,只把既有 AwoooP DB 證據做白名單投影,讓前端可以解釋 GCP-A 為何被跳過。
**本次修復**
- `/api/v1/platform/ai-route-status` 新增 `repair_evidence` 欄位;當 `lane_mode` 為 `degraded_failover` / `cloud_fallback` / `unavailable` 時,從最新 `ai_route_repair / repair_diagnosis` source envelope 取回:
- `provider_event_id`、`conversation_event_id`、`run_id`
- `target_resource`、`severity`、`fingerprint`
- `live_probe`、`access_blockers`
- `side_effects`(incident / Telegram / approval / runtime route 是否被建立或變更)
- `source_ref_count`
- AwoooP Runs 的 AI Provider panel 新增「最新修復診斷證據」區塊,顯示阻塞點、探測結果、來源證據數與副作用檢查。
- Dashboard Automation Evidence card 也會把 route degraded 摘要補上 repair evidence target / blockers / source refs。
- 前端文案已補 `zh-TW` / `en` i18n;UI 沒有新增 emoji,使用既有 Lucide icon。
**本地驗證**
```text
python3 -m py_compile apps/api/src/services/platform_operator_service.py apps/api/src/api/v1/platform/operator_runs.py -> pass
jq empty apps/web/messages/zh-TW.json apps/web/messages/en.json -> pass
git diff --check -> pass
pytest targeted ai-route status/evidence tests -> 6 passed
pnpm --dir apps/web exec tsc --noEmit --tsBuildInfoFile /tmp/awoooi-t177-tsconfig.tsbuildinfo -> pass
```
**目前整體進度**
- AwoooP 告警可觀測鏈:約 99.3%。
- 低風險自動修復閉環:約 95.7%。
- 前端 AI 自動化管理介面同步:約 97.3%。
- Telegram 詳情 / 歷史可解釋性:約 95.5%。
- Callback evidence / DB replayability:約 96.0%。
- MCP / 自建 MCP 可見性:約 88%。
- Sentry / SigNoz source correlation visibility:約 88%。
- Ansible / PlayBook decision visibility:約 84.8%。
- KM owner-review / completion governance:約 84%。
- AI Provider lane 健康與可見性:約 91%(GCP-A runtime 尚未修復;但 repair diagnosis 已能在 API / 前端呈現)。
- 完整 AI 自動化管理產品化:約 95.0%。
## 2026-05-25T176 GCP-A primary lane repair evidence 入庫
**背景**